From 5a7a4f2c9201e76f91260eb94357393abbabf945 Mon Sep 17 00:00:00 2001 From: huangsimin Date: Mon, 19 Aug 2019 19:03:58 +0800 Subject: [PATCH] =?UTF-8?q?TODO:=20wordIndexUpperLower=20=E5=8E=9F?= =?UTF-8?q?=E5=9B=A0:=20=E8=BE=B9=E7=95=8C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tree/tried/tried.go | 41 +++++----- tree/tried/tried_index.go | 152 ++++++++++++++++++++++++++++++++++++++ tree/tried/tried_test.go | 99 ++++++++++++++++++++----- 3 files changed, 251 insertions(+), 41 deletions(-) create mode 100644 tree/tried/tried_index.go diff --git a/tree/tried/tried.go b/tree/tried/tried.go index fb93ebc..abfe0b8 100644 --- a/tree/tried/tried.go +++ b/tree/tried/tried.go @@ -1,15 +1,5 @@ package tried -type TriedString string - -func (ts TriedString) Size() uint { - return uint(len(ts)) -} - -func (ts TriedString) WordIndex(idx uint) uint { - return uint(ts[idx]) - 'a' -} - // func (ts TriedString) WordIndex(idx uint) uint { // w := ts[idx] // if w >= 'a' && w <= 'z' { @@ -21,14 +11,9 @@ func (ts TriedString) WordIndex(idx uint) uint { // } // } -type ObjectIndex interface { - WordIndex(idx uint) uint - Size() uint -} - type Tried struct { - root *Node - datasize uint + root *Node + wiStore *wordIndexStore } type Node struct { @@ -36,25 +21,34 @@ type Node struct { value interface{} } +// New 默认 WordIndexLower 意味着只支持小写 func New() *Tried { tried := &Tried{} tried.root = new(Node) - tried.datasize = 62 + + tried.wiStore = WordIndexDict[WordIndexLower] return tried } -func (tried *Tried) wordIndex(w byte) uint { - return uint(w) - 'a' +// NewWithWordType 选择单词的类型 WordIndexLower 意味着只支持小写 +func NewWithWordType(t WordIndexType) *Tried { + tried := &Tried{} + tried.root = new(Node) + + tried.wiStore = WordIndexDict[t] + + return tried } func (tried *Tried) Put(words string, values ...interface{}) { cur := tried.root var n *Node + for i := 0; i < len(words); i++ { - w := tried.wordIndex(words[i]) + w := 
tried.wiStore.Byte2Index(words[i]) if cur.data == nil { - cur.data = make([]*Node, tried.datasize) + cur.data = make([]*Node, tried.wiStore.DataSize) } if n = cur.data[w]; n == nil { @@ -81,8 +75,9 @@ func (tried *Tried) Put(words string, values ...interface{}) { func (tried *Tried) Get(words string) interface{} { cur := tried.root var n *Node + for i := 0; i < len(words); i++ { - w := tried.wordIndex(words[i]) //TODO: 升级Index 函数 + w := tried.wiStore.Byte2Index(words[i]) //TODO: 升级Index 函数 if n = cur.data[w]; n == nil { return nil } diff --git a/tree/tried/tried_index.go b/tree/tried/tried_index.go new file mode 100644 index 0000000..00c2a80 --- /dev/null +++ b/tree/tried/tried_index.go @@ -0,0 +1,152 @@ +package tried + +var WordIndexDict map[WordIndexType]*wordIndexStore + +func init() { + WordIndexDict = make(map[WordIndexType]*wordIndexStore) + WordIndexDict[WordIndexLower] = &wordIndexStore{WordIndexLower, wordIndexLower, indexWordLower, 26} + WordIndexDict[WordIndexUpper] = &wordIndexStore{WordIndexUpper, wordIndexUpper, indexWordUpper, 26} + WordIndexDict[WordIndexDigital] = &wordIndexStore{WordIndexDigital, wordIndexDigital, indexWordDigital, 10} + WordIndexDict[WordIndexUpperLower] = &wordIndexStore{WordIndexUpperLower, wordIndexUpperLower, indexWordUpperLower, 52} + WordIndexDict[WordIndexLowerDigital] = &wordIndexStore{WordIndexLowerDigital, wordIndexLowerDigital, indexWordLowerDigital, 36} + WordIndexDict[WordIndexUpperDigital] = &wordIndexStore{WordIndexUpperDigital, wordIndexUpperDigital, indexWordUpperDigital, 36} + WordIndexDict[WordIndexUpperLowerDigital] = &wordIndexStore{WordIndexUpperLowerDigital, wordIndexUpperLowerDigital, indexWordUpperLowerDigital, 62} + WordIndexDict[WordIndex256] = &wordIndexStore{WordIndex256, wordIndex256, indexWord256, 256} + WordIndexDict[WordIndex32to126] = &wordIndexStore{WordIndex32to126, wordIndex32to126, indexWord32to126, ('~' - ' ' + 1)} +} + +// WordIndexType 单词统计的类型 eg. WordIndexLower 意味Put的单词只支持小写... 
// WordIndexType identifies which byte alphabet a trie accepts. Each value
// corresponds to one pair of byte<->index mapping functions below.
type WordIndexType int

// The zero value is skipped, so the first usable type has value 1.
const (
	_ WordIndexType = iota
	// WordIndexLower accepts 'a'-'z'.
	WordIndexLower
	// WordIndexUpper accepts 'A'-'Z'.
	WordIndexUpper
	// WordIndexDigital accepts '0'-'9'.
	WordIndexDigital
	// WordIndexUpperLower accepts 'a'-'z' and 'A'-'Z'.
	WordIndexUpperLower
	// WordIndexLowerDigital accepts 'a'-'z' and '0'-'9'.
	WordIndexLowerDigital
	// WordIndexUpperDigital accepts 'A'-'Z' and '0'-'9'.
	WordIndexUpperDigital
	// WordIndexUpperLowerDigital accepts 'a'-'z', 'A'-'Z' and '0'-'9'.
	WordIndexUpperLowerDigital
	// WordIndex256 accepts every byte value.
	WordIndex256
	// WordIndex32to126 accepts the bytes from ' ' through '~'.
	WordIndex32to126
)

// wordIndexStore bundles one alphabet's mapping functions with the number of
// child slots a node needs for that alphabet.
type wordIndexStore struct {
	// Type is the alphabet this store describes.
	Type WordIndexType
	// Byte2Index converts an input byte to its child-slot index.
	Byte2Index func(byte) uint
	// Index2Byte is the inverse of Byte2Index.
	Index2Byte func(uint) byte
	// DataSize is the number of distinct indices (alphabet size).
	DataSize uint
}

// None of the mapping functions below validate their input. Passing a byte
// (or index) outside the documented alphabet yields a meaningless result:
// unsigned subtraction wraps around, and some stray bytes collide with valid
// indices.

// wordIndexLower maps 'a'-'z' to 0-25.
func wordIndexLower(w byte) uint {
	return uint(w) - 'a'
}

// indexWordLower maps 0-25 back to 'a'-'z'.
func indexWordLower(w uint) byte {
	return byte(w) + 'a'
}

// wordIndexUpper maps 'A'-'Z' to 0-25.
func wordIndexUpper(w byte) uint {
	return uint(w) - 'A'
}

// indexWordUpper maps 0-25 back to 'A'-'Z'.
func indexWordUpper(w uint) byte {
	return byte(w) + 'A'
}

// wordIndexDigital maps '0'-'9' to 0-9.
func wordIndexDigital(w byte) uint {
	return uint(w) - '0'
}

// indexWordDigital maps 0-9 back to '0'-'9'.
func indexWordDigital(w uint) byte {
	return byte(w) + '0'
}

// wordIndexUpperLower maps 'a'-'z' to 0-25 and 'A'-'Z' to 26-51.
func wordIndexUpperLower(w byte) uint {
	iw := uint(w)
	// >= (not >) so that 'a' itself lands in the lowercase range.
	if iw >= 'a' {
		return iw - 'a'
	}
	return iw - 'A' + 26
}

// indexWordUpperLower maps 0-25 back to 'a'-'z' and 26-51 back to 'A'-'Z'.
func indexWordUpperLower(w uint) byte {
	if w >= 26 {
		// Remove the offset added by wordIndexUpperLower.
		return byte(w-26) + 'A'
	}
	return byte(w) + 'a'
}

// wordIndexLowerDigital maps 'a'-'z' to 0-25 and '0'-'9' to 26-35.
func wordIndexLowerDigital(w byte) uint {
	iw := uint(w)
	// >= (not >) so that 'a' itself lands in the letter range.
	if iw >= 'a' {
		return iw - 'a'
	}
	return iw - '0' + 26
}

// indexWordLowerDigital maps 0-25 back to 'a'-'z' and 26-35 back to '0'-'9'.
func indexWordLowerDigital(w uint) byte {
	if w >= 26 {
		// Remove the offset added by wordIndexLowerDigital.
		return byte(w-26) + '0'
	}
	return byte(w) + 'a'
}

// wordIndexUpperDigital maps 'A'-'Z' to 0-25 and '0'-'9' to 26-35.
func wordIndexUpperDigital(w byte) uint {
	iw := uint(w)
	// >= (not >) so that 'A' itself lands in the letter range.
	if iw >= 'A' {
		return iw - 'A'
	}
	return iw - '0' + 26
}

// indexWordUpperDigital maps 0-25 back to 'A'-'Z' and 26-35 back to '0'-'9'.
func indexWordUpperDigital(w uint) byte {
	if w >= 26 {
		// Remove the offset added by wordIndexUpperDigital.
		return byte(w-26) + '0'
	}
	// Letters in this alphabet are uppercase.
	return byte(w) + 'A'
}

// wordIndexUpperLowerDigital maps 'a'-'z' to 0-25, 'A'-'Z' to 26-51 and
// '0'-'9' to 52-61.
func wordIndexUpperLowerDigital(w byte) uint {
	iw := uint(w)
	// Ranges are tested from the highest byte values down; >= keeps the
	// first letter of each range in its own range.
	switch {
	case iw >= 'a':
		return iw - 'a'
	case iw >= 'A':
		return iw - 'A' + 26
	default:
		return iw - '0' + 52
	}
}

// indexWordUpperLowerDigital is the inverse of wordIndexUpperLowerDigital:
// 0-25 -> 'a'-'z', 26-51 -> 'A'-'Z', 52-61 -> '0'-'9'.
func indexWordUpperLowerDigital(w uint) byte {
	switch {
	case w >= 52:
		return byte(w-52) + '0'
	case w >= 26:
		return byte(w-26) + 'A'
	default:
		return byte(w) + 'a'
	}
}

// wordIndex256 uses the byte value itself as the index (0-255).
func wordIndex256(w byte) uint {
	return uint(w)
}

// indexWord256 converts an index in 0-255 back to the byte of the same value.
func indexWord256(w uint) byte {
	return byte(w)
}
wordIndex32to126 空格-~ 0-9 a-z A-Z 符号等 +func wordIndex32to126(w byte) uint { + return uint(w) - ' ' +} + +func indexWord32to126(w uint) byte { + return byte(w) + ' ' +} diff --git a/tree/tried/tried_test.go b/tree/tried/tried_test.go index 1713c87..422c18c 100644 --- a/tree/tried/tried_test.go +++ b/tree/tried/tried_test.go @@ -1,11 +1,44 @@ package tried import ( + "bytes" + "encoding/gob" + "os" "testing" "github.com/Pallinder/go-randomdata" ) +func TestTried_NewWith(t *testing.T) { + tried := NewWithWordType(WordIndex32to126) + words := "~ 23fd " + tried.Put(words) + if tried.Get(words) == nil { + t.Error("should be not nil") + } + + tried = NewWithWordType(WordIndexLower) + words = "az" + tried.Put(words) + if tried.Get(words) == nil { + t.Error("should be not nil") + } + + tried = NewWithWordType(WordIndexUpper) + words = "AZ" + tried.Put(words) + if tried.Get(words) == nil { + t.Error("should be not nil") + } + + tried = NewWithWordType(WordIndexUpperLower) + words = "AZazsdfsd" + tried.Put(words) + if tried.Get(words) == nil { + t.Error("should be not nil") + } +} + func TestTried_PutAndGet1(t *testing.T) { tried := New() @@ -72,20 +105,49 @@ func TestTried_Traversal(t *testing.T) { } } +func TesStoreData(t *testing.T) { + var l []string + const N = 1000000 + for i := 0; i < N; i++ { + var content []rune + for c := 0; c < randomdata.Number(5, 15); c++ { + char := randomdata.Number(0, 26) + 'a' + content = append(content, rune(byte(char))) + } + l = append(l, (string(content))) + } + + var result bytes.Buffer + encoder := gob.NewEncoder(&result) + encoder.Encode(l) + lbytes := result.Bytes() + f, _ := os.OpenFile("tried.log", os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0666) + f.Write(lbytes) +} + +func Load() []string { + var result []string + f, _ := os.Open("tried.log") + gob.NewDecoder(f).Decode(&result) + return result +} + func BenchmarkTried_Put(b *testing.B) { var data []string b.N = 1000000 count := 10 - for i := 0; i < b.N; i++ { - var content []rune - for c 
:= 0; c < randomdata.Number(5, 15); c++ { - char := randomdata.Number(0, 26) + 'a' - content = append(content, rune(byte(char))) - } - data = append(data, (string(content))) - } + // for i := 0; i < b.N; i++ { + // var content []rune + // for c := 0; c < randomdata.Number(5, 15); c++ { + // char := randomdata.Number(0, 26) + 'a' + // content = append(content, rune(byte(char))) + // } + // data = append(data, (string(content))) + // } + + data = Load() b.ResetTimer() b.N = b.N * count @@ -98,19 +160,20 @@ func BenchmarkTried_Put(b *testing.B) { } func BenchmarkTried_Get(b *testing.B) { - + b.StopTimer() var data []string b.N = 1000000 count := 10 - for i := 0; i < b.N; i++ { - var content []rune - for c := 0; c < randomdata.Number(5, 15); c++ { - char := randomdata.Number(0, 26) + 'a' - content = append(content, rune(byte(char))) - } - data = append(data, string(content)) - } + // for i := 0; i < b.N; i++ { + // var content []rune + // for c := 0; c < randomdata.Number(5, 15); c++ { + // char := randomdata.Number(0, 26) + 'a' + // content = append(content, rune(byte(char))) + // } + // data = append(data, string(content)) + // } + data = Load() b.N = b.N * count @@ -119,7 +182,7 @@ func BenchmarkTried_Get(b *testing.B) { tried.Put(v) } - b.ResetTimer() + b.StartTimer() for c := 0; c < count; c++ { for _, v := range data { tried.Get(v)