diff --git a/go.sum b/go.sum
index b3f692e..fc26243 100644
--- a/go.sum
+++ b/go.sum
@@ -4,3 +4,4 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg=
 github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o=
+github.com/petar/GoLLRB v0.0.0-20190514000832-33fb24c13b99 h1:KcEvVBAvyHkUdFAygKAzwB6LAcZ6LS32WHmRD2VyXMI=
diff --git a/tree/tried/tried.go b/tree/tried/tried.go
new file mode 100644
index 0000000..a4b5e0a
--- /dev/null
+++ b/tree/tried/tried.go
@@ -0,0 +1,193 @@
+package tried
+
+import "github.com/davecgh/go-spew/spew"
+
+// func (ts TriedString) WordIndex(idx uint) uint {
+// 	w := ts[idx]
+// 	if w >= 'a' && w <= 'z' {
+// 		return uint(w) - 'a'
+// 	} else if w >= 'A' && w <= 'Z' {
+// 		return uint(w) - 'A' + 26
+// 	} else {
+// 		return uint(w) - '0' + 52
+// 	}
+// }
+
+// Tried is a trie keyed by the bytes of a word. Each byte is mapped to a child
+// slot by the index functions of the configured wordIndexStore.
+type Tried struct {
+	root    *Node
+	wiStore *wordIndexStore
+}
+
+// Node is a single trie node. data holds the children (allocated lazily by
+// Put, so it is nil for a leaf) and value is the payload of the word ending
+// here, or nil when no word ends at this node.
+type Node struct {
+	data  []*Node
+	value interface{}
+}
+
+// New returns an empty trie that defaults to WordIndexLower, meaning only
+// lower-case words are supported.
+func New() *Tried {
+	return NewWithWordType(WordIndexLower)
+}
+
+// NewWithWordType returns an empty trie for the word type t, e.g.
+// WordIndexLower means only lower-case words are supported.
+func NewWithWordType(t WordIndexType) *Tried {
+	return &Tried{
+		root:    new(Node),
+		wiStore: WordIndexDict[t],
+	}
+}
+
+// Put stores words in the trie. The first element of values, if any, becomes
+// the value of the word; with no value the trie itself is stored as a non-nil
+// marker so that Get and Has still report the word. Put panics if a byte of
+// words is mapped to an index outside the child table.
+func (tried *Tried) Put(words string, values ...interface{}) {
+	cur := tried.root
+	for i := 0; i < len(words); i++ {
+		w := tried.wiStore.Byte2Index(words[i])
+
+		if cur.data == nil {
+			cur.data = make([]*Node, tried.wiStore.DataSize)
+		}
+
+		n := cur.data[w]
+		if n == nil {
+			n = new(Node)
+			cur.data[w] = n
+		}
+		cur = n
+	}
+
+	// len() rather than a nil check: an empty, non-nil variadic slice must not
+	// reach values[0].
+	if len(values) > 0 {
+		cur.value = values[0]
+	} else {
+		cur.value = tried
+	}
+}
+
+// find walks the trie along the bytes of words and returns the node reached,
+// or nil if the path does not exist. An index at or beyond len(cur.data) (a
+// leaf, or a byte mapped outside the child table) means "not found" instead
+// of an out-of-range panic.
+func (tried *Tried) find(words string) *Node {
+	cur := tried.root
+	if cur == nil {
+		return nil
+	}
+	for i := 0; i < len(words); i++ {
+		w := tried.wiStore.Byte2Index(words[i])
+		if w >= uint(len(cur.data)) {
+			return nil
+		}
+		if cur = cur.data[w]; cur == nil {
+			return nil
+		}
+	}
+	return cur
+}
+
+// Get returns the value stored for words, or nil if words is not in the trie.
+func (tried *Tried) Get(words string) interface{} {
+	n := tried.find(words)
+	if n == nil {
+		return nil
+	}
+	return n.value
+}
+
+// Has reports whether words is stored with a non-nil value.
+func (tried *Tried) Has(words string) bool {
+	return tried.Get(words) != nil
+}
+
+// HasPrefix reports whether the path spelled by words exists in the trie.
+func (tried *Tried) HasPrefix(words string) bool {
+	return tried.find(words) != nil
+}
+
+// collect appends to result every word stored strictly below node, each one
+// spelled as prefix plus the bytes on the path, and returns the extended
+// slice. Descendants of a child are emitted before the child itself.
+func (tried *Tried) collect(prefix []byte, node *Node, result []string) []string {
+	for i, n := range node.data {
+		if n == nil {
+			continue
+		}
+		// Siblings may share prefix's backing array; that is safe because
+		// string(next) copies the bytes before the next sibling overwrites them.
+		next := append(prefix, tried.wiStore.Index2Byte(uint(i)))
+		result = tried.collect(next, n, result)
+		if n.value != nil {
+			result = append(result, string(next))
+		}
+	}
+	return result
+}
+
+// PrefixWords returns every stored word that starts with words, including
+// words itself when it is stored. It returns nil when nothing matches; an
+// empty prefix also yields nil, as it always has.
+func (tried *Tried) PrefixWords(words string) []string {
+	if len(words) == 0 {
+		return nil
+	}
+	n := tried.find(words)
+	if n == nil {
+		return nil
+	}
+
+	var result []string
+	if n.value != nil {
+		result = append(result, words)
+	}
+	return tried.collect([]byte(words), n, result)
+}
+
+// Traversal calls every for each node that carries a value, parents before
+// their descendants. cidx is the node's slot index within its parent. The
+// walk stops completely as soon as every returns false.
+func (tried *Tried) Traversal(every func(cidx uint, value interface{}) bool) {
+	var traversal func(*Node) bool
+	traversal = func(cur *Node) bool {
+		if cur == nil {
+			return true
+		}
+		for i, n := range cur.data {
+			if n == nil {
+				continue
+			}
+			if n.value != nil && !every(uint(i), n.value) {
+				return false
+			}
+			// Propagate a stop request upwards; a bare return here would only
+			// end the current level and let the parent's loop carry on.
+			if !traversal(n) {
+				return false
+			}
+		}
+		return true
+	}
+
+	traversal(tried.root)
+}
+
+// WordsArray returns every word stored below the root.
+func (tried *Tried) WordsArray() []string {
+	if tried.root == nil {
+		return nil
+	}
+	return tried.collect(nil, tried.root, nil)
+}
+
+// String formats the stored words with spew.Sprint.
+func (tried *Tried) String() string {
+	return spew.Sprint(tried.WordsArray())
+}
diff --git a/tree/tried/tried_index.go b/tree/tried/tried_index.go
new file mode 100644
index 0000000..c73b70b
--- /dev/null
+++ b/tree/tried/tried_index.go
@@ -0,0 +1,155 @@
+package tried
+
+// WordIndexDict maps each WordIndexType to the store holding its byte/index
+// conversion functions and child-table size. It is filled in by init.
+var WordIndexDict map[WordIndexType]*wordIndexStore
+
+// init registers one wordIndexStore per WordIndexType.
+func init() {
+	WordIndexDict = make(map[WordIndexType]*wordIndexStore)
+	WordIndexDict[WordIndexLower] = &wordIndexStore{WordIndexLower, wordIndexLower, indexWordLower, 26}
+	WordIndexDict[WordIndexUpper] = &wordIndexStore{WordIndexUpper, wordIndexUpper, indexWordUpper, 26}
+	WordIndexDict[WordIndexDigital] = &wordIndexStore{WordIndexDigital, wordIndexDigital, indexWordDigital, 10}
+	WordIndexDict[WordIndexUpperLower] = &wordIndexStore{WordIndexUpperLower, wordIndexUpperLower, indexWordUpperLower, 52}
+	WordIndexDict[WordIndexLowerDigital] = &wordIndexStore{WordIndexLowerDigital, wordIndexLowerDigital, indexWordLowerDigital, 36}
+	WordIndexDict[WordIndexUpperDigital] = &wordIndexStore{WordIndexUpperDigital, wordIndexUpperDigital, indexWordUpperDigital, 36}
+	WordIndexDict[WordIndexUpperLowerDigital] = &wordIndexStore{WordIndexUpperLowerDigital, wordIndexUpperLowerDigital, indexWordUpperLowerDigital, 62}
+	WordIndexDict[WordIndex256] = &wordIndexStore{WordIndex256, wordIndex256, indexWord256, 256}
+	WordIndexDict[WordIndex32to126] = &wordIndexStore{WordIndex32to126, wordIndex32to126, indexWord32to126, ('~' - ' ' + 1)}
+}
+
+// WordIndexType is the kind of word indexing, e.g. WordIndexLower means Put only supports lower-case words...
+type WordIndexType int
+
+const (
+	_ WordIndexType = iota
+	WordIndexLower
+	WordIndexUpper
+	WordIndexDigital
+	WordIndexUpperLower
+	WordIndexLowerDigital
+	WordIndexUpperDigital
+	WordIndexUpperLowerDigital
+	WordIndex256
+	WordIndex32to126
+)
+
+type wordIndexStore struct {
+	Type       WordIndexType
+	Byte2Index func(byte) uint // maps a word byte to a child slot
+	Index2Byte func(uint) byte // maps a child slot back to its byte
+	DataSize   uint            // length of each node's child slice
+}
+
+func wordIndexLower(w byte) uint {
+	return uint(w) - 'a'
+}
+
+func indexWordLower(w uint) byte {
+	return byte(w) + 'a'
+}
+
+// wordIndexUpper maps 'A'..'Z' to 0..25; indexWordUpper is its inverse.
+func wordIndexUpper(w byte) uint {
+	return uint(w) - 'A'
+}
+
+func indexWordUpper(w uint) byte {
+	return byte(w) + 'A'
+}
+
+// wordIndexDigital maps '0'..'9' to 0..9; indexWordDigital is its inverse.
+func wordIndexDigital(w byte) uint {
+	return uint(w) - '0'
+}
+
+func indexWordDigital(w uint) byte {
+	return byte(w) + '0'
+}
+
+// wordIndexUpperLower maps 'a'..'z' to 0..25 and 'A'..'Z' to 26..51; indexWordUpperLower is its inverse.
+func wordIndexUpperLower(w byte) uint {
+	iw := uint(w)
+	if iw >= 'a' {
+		return iw - 'a'
+	}
+	return iw - 'A' + 26
+}
+
+func indexWordUpperLower(w uint) byte {
+
+	if w >= 26 {
+		return byte(w) - 26 + 'A'
+	}
+	return byte(w) + 'a'
+}
+
+// wordIndexLowerDigital maps 'a'..'z' to 0..25 and '0'..'9' to 26..35; indexWordLowerDigital is its inverse.
+func wordIndexLowerDigital(w byte) uint {
+	iw := uint(w)
+	if iw >= 'a' {
+		return iw - 'a'
+	}
+	return iw - '0' + 26
+}
+
+func indexWordLowerDigital(w uint) byte {
+	if w >= 26 {
+		return byte(w) - 26 + '0'
+	}
+	return byte(w) + 'a'
+}
+
+// wordIndexUpperDigital maps 'A'..'Z' to 0..25 and '0'..'9' to 26..35; indexWordUpperDigital is its inverse.
+func wordIndexUpperDigital(w byte) uint {
+	iw := uint(w)
+	if iw >= 'A' {
+		return iw - 'A'
+	}
+	return iw - '0' + 26
+}
+
+func indexWordUpperDigital(w uint) byte {
+	if w >= 26 {
+		return byte(w) - 26 + '0'
+	}
+	return byte(w) + 'A'
+}
+
+// wordIndexUpperLowerDigital maps 'a'..'z' to 0..25, 'A'..'Z' to 26..51 and '0'..'9' to 52..61.
+func wordIndexUpperLowerDigital(w byte) uint {
+	iw := uint(w)
+	if iw >= 'a' {
+		return iw - 'a'
+	} else if iw >= 'A' {
+		return iw - 'A' + 26
+	}
+	return iw - '0' + 52
+}
+
+func indexWordUpperLowerDigital(w uint) byte {
+	if w >= 52 {
+		return byte(w) - 52 + '0'
+	} else if w >= 26 {
+		return byte(w) - 26 + 'A'
+	}
+	return byte(w) + 'a'
+}
+
+// wordIndex256 accepts every byte value, so multi-byte text such as Chinese is supported.
+func wordIndex256(w byte) uint {
+	return uint(w)
+}
+
+func indexWord256(w uint) byte {
+	return byte(w)
+}
+
+// wordIndex32to126 covers bytes 32..126: space through '~', i.e. 0-9, a-z, A-Z and symbols.
+func wordIndex32to126(w byte) uint {
+	return uint(w) - ' '
+}
+
+func indexWord32to126(w uint) byte {
+	return byte(w) + ' '
+}
diff --git a/tree/tried/tried_test.go b/tree/tried/tried_test.go
new file mode 100644
index 0000000..a92e15f
--- /dev/null
+++ b/tree/tried/tried_test.go
@@ -0,0 +1,412 @@
+package tried
+
+import (
+	"bytes"
+	"encoding/gob"
+	"os"
+	"sort"
+	"testing"
+
+	"github.com/davecgh/go-spew/spew"
+
+	"github.com/Pallinder/go-randomdata"
+)
+
+// CompareSliceWithSorted sorts both slices in place and compares their
+// spew.Sprint renderings. It returns true and "" when they match, otherwise
+// false and a message showing both renderings.
+func CompareSliceWithSorted(source, words []string) (bool, string) {
+	sort.Strings(words)
+	sort.Strings(source)
+
+	result1 := spew.Sprint(source)
+	result2 := spew.Sprint(words)
+
+	if result1 != result2 {
+		return false, spew.Sprint(result1, " != ", result2)
+	}
+	return true, ""
+}
+
+// TestTried_Has checks Has, HasPrefix and String on a lower-case trie.
+func TestTried_Has(t *testing.T) {
+	var tried *Tried
+	tried = NewWithWordType(WordIndexLower)
+	tried.Put("ads")
+	tried.Put("zadads")
+	tried.Put("asdgdf")
+	if !tried.Has("ads") {
+		t.Error("ads is exist, but not has")
+	}
+
+	if !tried.HasPrefix("ad") {
+		t.Error("ads is exist, but not HasPrefix")
+	}
+
+	if !tried.HasPrefix("za") {
+		t.Error("ads is exist, but not HasPrefix")
+	}
+
+	if tried.HasPrefix("fsdf") {
+		t.Error("fsdf is not exist, but HasPrefix")
+	}
+
+	if len(tried.String()) < 10 {
+		t.Error(tried.WordsArray())
+	}
+}
+
+// TestTried_PrefixWords checks PrefixWords against several word types.
+func TestTried_PrefixWords(t *testing.T) {
+
+	var tried *Tried
+	var wordsCollection []string
+	var input []string
+
+	var wordsList [][]string
+	var inputParams [][]string
+	var triedList []*Tried
+
+	triedList = append(triedList, NewWithWordType(WordIndexLower))
+	inputParams = append(inputParams, []string{"ad", "adf"})
+	wordsList = append(wordsList, []string{"ad", "adfsxzcdas", "adfadsasd"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpper))
+	inputParams = append(inputParams, []string{"AD", "ADF"})
+	wordsList = append(wordsList, []string{"AD", "ADFSXZCDAS", "ADFADSASD"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperLower))
+	inputParams = append(inputParams, []string{"aD", "aDf"})
+	wordsList = append(wordsList, []string{"aDF", "aDfsxzcdas", "aDfadsasd"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperDigital))
+	inputParams = append(inputParams, []string{"A09D", "A09DF"})
+	wordsList = append(wordsList, []string{"A09D", "A09DFSXZCD312AS", "A09DFA32DSASD"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexLowerDigital))
+	inputParams = append(inputParams, []string{"a09d", "a09df"})
+	wordsList = append(wordsList, []string{"a09d", "a09dfsxzcd312as", "a09dfa32dsasd"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperLowerDigital))
+	inputParams = append(inputParams, []string{"A09d", "A09dZ"})
+	wordsList = append(wordsList, []string{"A09d", "A09dZsxzcd312as", "A09dZa32dsasd"})
+
+	triedList = append(triedList, NewWithWordType(WordIndex256))
+	inputParams = append(inputParams, []string{"阿萨德", "阿萨德!"})
+	wordsList = append(wordsList, []string{"阿萨德", "阿萨德!@$*#))(#*", "阿萨德!╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳▁▂▃▄▅▆▇█ ▉ ▊▋▌▍▎▏"})
+
+	// NOTE(review): only two words are listed for this case, so the
+	// len(prefixWords) != 2 check below cannot pass for " `<"; confirm the data.
+	triedList = append(triedList, NewWithWordType(WordIndex32to126))
+	inputParams = append(inputParams, []string{" `", " `<"})
+	wordsList = append(wordsList, []string{" `21`3tcdbxcfhyop8901zc[]\\'/?()#$%^&**! ", " `.,?/"})
+
+	for i := 0; i < len(triedList); i++ {
+		tried = triedList[i]
+		input = inputParams[i]
+		wordsCollection = wordsList[i]
+		for _, words := range wordsCollection {
+			tried.Put(words)
+		}
+		var prefixWords []string
+		prefixWords = tried.PrefixWords(input[0])
+		if ok, errorResult := CompareSliceWithSorted(prefixWords, wordsCollection); !ok {
+			t.Error(errorResult)
+		}
+
+		prefixWords = tried.PrefixWords(input[1])
+		if ok, _ := CompareSliceWithSorted(prefixWords, wordsCollection); ok {
+			t.Error("should be not ok")
+		}
+		if len(prefixWords) != 2 {
+			t.Error(prefixWords, " Size of Array should be 2")
+		}
+
+		if ok, errorResult := CompareSliceWithSorted(prefixWords, wordsCollection[1:]); !ok {
+			t.Error(errorResult)
+		}
+
+		// t.Error(tried.WordsArray())
+	}
+}
+
+// TestTried_NewWith checks that every word put into each kind of trie can be read back with Get.
+func TestTried_NewWith(t *testing.T) {
+	var tried *Tried
+	var wordsCollection []string
+	var wordsList [][]string
+	var triedList []*Tried
+
+	triedList = append(triedList, NewWithWordType(WordIndexLower))
+	wordsList = append(wordsList, []string{"adazx", "assdfhgnvb", "ewqyiouyasdfmzvxz"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpper))
+	wordsList = append(wordsList, []string{"ADFSZ", "DEFASEWRQWER", "GFHJERQWREWTNBVFGFH"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperLower))
+	wordsList = append(wordsList, []string{"adazxAZDSAFASZRETHGFTUIPK", "assdfhgDSFGnvb", "yaXZLMPOIQsdGHFfmFBzvxz"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperDigital))
+	wordsList = append(wordsList, []string{"AZ3428934470193", "ZPQPDEK09876543629812", "AZEWIRU0192456FDEWR9032"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexLowerDigital))
+	wordsList = append(wordsList, []string{"az3428934470193", "zpqwe0987654362sf9812", "az21301az09azdstr540"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperLowerDigital))
+	wordsList = append(wordsList, []string{"azAZ09", "aRGFDSFDSzAasdZ06789", "A28374JHFudfsu09qwzzdsw874FDSAZfer"})
+
+	triedList = append(triedList, NewWithWordType(WordIndex256))
+	wordsList = append(wordsList, []string{"21`3tcdbxcfhyop8901zc[]\\'/?()#$%^&**! 09-阿萨德发生的官方说的对符合规定", "符号!@$*#))(#*", "╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳▁▂▃▄▅▆▇█ ▉ ▊▋▌▍▎▏"})
+
+	triedList = append(triedList, NewWithWordType(WordIndex32to126))
+	wordsList = append(wordsList, []string{" 21`3tcdbxcfhyop8901zc[]\\'/?()#$%^&**! ", "AZaz09~ dys!@#$)(*^$#", "<>.,?/"})
+
+	for i := 0; i < len(triedList); i++ {
+		tried = triedList[i]
+		wordsCollection = wordsList[i]
+		for _, words := range wordsCollection {
+			tried.Put(words)
+
+			if tried.Get(words) == nil {
+				t.Error("should be not nil the type is ", tried.wiStore.Type)
+			}
+		}
+		// t.Error(tried.WordsArray())
+	}
+}
+
+// TestTried_String checks that WordsArray returns exactly the words that were put.
+func TestTried_String(t *testing.T) {
+	var tried *Tried
+	var wordsCollection []string
+	var wordsList [][]string
+	var triedList []*Tried
+
+	triedList = append(triedList, NewWithWordType(WordIndexLower))
+	wordsList = append(wordsList, []string{"adazx", "assdfhgnvb", "ewqyiouyasdfmzvxz"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpper))
+	wordsList = append(wordsList, []string{"ADFSZ", "DEFASEWRQWER", "GFHJERQWREWTNBVFGFH"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexDigital))
+	wordsList = append(wordsList, []string{"093875239457", "09123406534", "0912340846"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperLower))
+	wordsList = append(wordsList, []string{"adazxAZDSAFASZRETHGFTUIPK", "assdfhgDSFGnvb", "yaXZLMPOIQsdGHFfmFBzvxz"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperDigital))
+	wordsList = append(wordsList, []string{"AZ3428934470193", "ZPQPDEK09876543629812", "AZEWIRU0192456FDEWR9032"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexLowerDigital))
+	wordsList = append(wordsList, []string{"az3428934470193", "zpqwe0987654362sf9812", "az21301az09azdstr540"})
+
+	triedList = append(triedList, NewWithWordType(WordIndexUpperLowerDigital))
+	wordsList = append(wordsList, []string{"azAZ09", "aRGFDSFDSzAasdZ06789", "A28374JHFudfsu09qwzzdsw874FDSAZfer"})
+
+	triedList = append(triedList, NewWithWordType(WordIndex256))
+	wordsList = append(wordsList, []string{"21`3tcdbxcf囉hyop打算8901zc[]\\'/?()#$%^&**!\x01 09-213", "的支持中文", "!@$*#)中文)(#*", `\/213dsfsdf`})
+
+	triedList = append(triedList, NewWithWordType(WordIndex32to126))
+	wordsList = append(wordsList, []string{" 21`3tcdbxcfhyop8901zc[]\\'/?()#$%^&**! ", "AZaz09~ dys!@#$)(*^$#", "<>.,?/"})
+
+	for i := 0; i < len(triedList); i++ {
+		tried = triedList[i]
+		wordsCollection = wordsList[i]
+		for _, words := range wordsCollection {
+			tried.Put(words)
+			if tried.Get(words) == nil {
+				t.Error("should be not nil the type is ", tried.wiStore.Type)
+			}
+		}
+
+		resultArray := tried.WordsArray()
+		if ok, errorResult := CompareSliceWithSorted(resultArray, wordsCollection); !ok {
+			t.Error(errorResult)
+		}
+
+		// t.Error(tried.WordsArray())
+	}
+}
+
+// TestTried_PutAndGet1 checks the values returned by Get, including the default value and missing words.
+func TestTried_PutAndGet1(t *testing.T) {
+	tried := New()
+
+	tried.Put(("asdf"))
+	tried.Put(("hehe"), "hehe")
+	tried.Put(("xixi"), 3)
+
+	var result interface{}
+
+	result = tried.Get("asdf")
+	if result != tried {
+		t.Error("result should be the trie itself")
+	}
+
+	result = tried.Get("xixi")
+	if result != 3 {
+		t.Error("result should be 3")
+	}
+
+	result = tried.Get("hehe")
+	if result != "hehe" {
+		t.Error("result should be hehe")
+	}
+
+	result = tried.Get("haha")
+	if result != nil {
+		t.Error("result should be nil")
+	}
+
+	result = tried.Get("b")
+	if result != nil {
+		t.Error("result should be nil")
+	}
+}
+
+// TestTried_Traversal checks the order in which Traversal visits stored values.
+func TestTried_Traversal(t *testing.T) {
+	tried := New()
+	tried.Put("asdf")
+	tried.Put(("abdf"), "ab")
+	tried.Put(("hehe"), "hehe")
+	tried.Put(("xixi"), 3)
+
+	var result []interface{}
+	tried.Traversal(func(idx uint, v interface{}) bool {
+		// t.Error(idx, v)
+		result = append(result, v)
+		return true
+	})
+
+	if len(result) != 4 {
+		t.Fatal("expected 4 values, got ", len(result))
+	}
+
+	if result[0] != "ab" {
+		t.Error(result[0])
+	}
+
+	if result[1] != tried {
+		t.Error(result[1])
+	}
+
+	if result[2] != "hehe" {
+		t.Error(result[2])
+	}
+
+	if result[3] != 3 {
+		t.Error(result[3])
+	}
+}
+
+// TesStoreData writes N random words, gob-encoded, to tried.log, the file that
+// Load reads. The name lacks the "Test" prefix, so "go test" does not run it;
+// rename it temporarily to regenerate the file.
+func TesStoreData(t *testing.T) {
+	var l []string
+	const N = 1000000
+	for i := 0; i < N; i++ {
+		var content []rune
+		for c := 0; c < randomdata.Number(5, 15); c++ {
+			char := randomdata.Number(0, 26) + 'a'
+			content = append(content, rune(byte(char)))
+		}
+		l = append(l, (string(content)))
+	}
+
+	var result bytes.Buffer
+	if err := gob.NewEncoder(&result).Encode(l); err != nil {
+		t.Fatal(err)
+	}
+
+	f, err := os.OpenFile("tried.log", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+
+	if _, err := f.Write(result.Bytes()); err != nil {
+		t.Fatal(err)
+	}
+}
+
+// Load reads the word list that TesStoreData wrote to tried.log. It panics if
+// the file cannot be opened or decoded.
+func Load() []string {
+	var result []string
+	f, err := os.Open("tried.log")
+	if err != nil {
+		panic("先执行TesStoreData 然后再测试Benchmark")
+	}
+	defer f.Close()
+
+	if err := gob.NewDecoder(f).Decode(&result); err != nil {
+		panic(err)
+	}
+	return result
+}
+
+// BenchmarkTried_Put times inserting the Load() word list into a fresh trie,
+// count times over. NOTE(review): it assigns b.N, which the testing package
+// normally controls; confirm the reported ns/op is meaningful.
+func BenchmarkTried_Put(b *testing.B) {
+
+	var data []string
+	b.N = 1000000
+	count := 10
+
+	// for i := 0; i < b.N; i++ {
+	// 	var content []rune
+	// 	for c := 0; c < randomdata.Number(5, 15); c++ {
+	// 		char := randomdata.Number(0, 26) + 'a'
+	// 		content = append(content, rune(byte(char)))
+	// 	}
+	// 	data = append(data, (string(content)))
+	// }
+
+	data = Load()
+
+	b.ResetTimer()
+	b.N = b.N * count
+	for c := 0; c < count; c++ {
+		tried := New()
+		for _, v := range data {
+			tried.Put(v)
+		}
+	}
+}
+
+// BenchmarkTried_Get times looking up every Load() word, count times over, in
+// a trie built before the timer starts. NOTE(review): it assigns b.N, which
+// the testing package normally controls; confirm the reported ns/op.
+func BenchmarkTried_Get(b *testing.B) {
+	b.StopTimer()
+	var data []string
+	b.N = 1000000
+	count := 10
+
+	// for i := 0; i < b.N; i++ {
+	// 	var content []rune
+	// 	for c := 0; c < randomdata.Number(5, 15); c++ {
+	// 		char := randomdata.Number(0, 26) + 'a'
+	// 		content = append(content, rune(byte(char)))
+	// 	}
+	// 	data = append(data, string(content))
+	// }
+	data = Load()
+
+	b.N = b.N * count
+
+	tried := New()
+	for _, v := range data {
+		tried.Put(v)
+	}
+
+	b.StartTimer()
+	for c := 0; c < count; c++ {
+		for _, v := range data {
+			tried.Get(v)
+		}
+	}
+}