Skip to content

Commit ca79c84

Browse files
author
willzhen
committed
Migreate code
1 parent 44f9820 commit ca79c84

File tree

7 files changed

+318
-0
lines changed

7 files changed

+318
-0
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@
1919

2020
# Go workspace file
2121
go.work
22+
go.sum

aho_automaton.go

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
package text
2+
3+
import "github.com/eapache/queue"
4+
5+
// TrieNode is a single node of the Aho-Corasick automaton.
type TrieNode struct {
	// value is the rune on the edge leading into this node; the root
	// created by BuildAcTrie carries the placeholder 'r'.
	value rune
	// next maps a rune to the child reached by consuming that rune.
	next map[rune]*TrieNode
	// fail is the failure link followed when no child matches the current
	// rune. BuildAcTrie leaves it nil on the root only.
	fail *TrieNode
	// emit is the pattern that ends at this node, or "" when none does.
	emit string
}
12+
13+
func newNode(ch rune) (node *TrieNode) {
14+
node = new(TrieNode)
15+
node.value = ch
16+
node.next = map[rune]*TrieNode{}
17+
return node
18+
}
19+
20+
// AcTrie ac自动机匹配字符串算法
21+
type AcTrie struct {
22+
root *TrieNode
23+
}
24+
25+
// Search 返回匹配的字符串
26+
func (ac *AcTrie) Search(s string) (list []string, index []int) {
27+
node := ac.root
28+
for i, c := range []rune(s) {
29+
matched := true
30+
for {
31+
_, ok := node.next[c]
32+
if ok {
33+
break
34+
}
35+
if node.fail == nil {
36+
matched = false
37+
node = ac.root
38+
break
39+
}
40+
node = node.fail
41+
}
42+
if !matched {
43+
continue
44+
}
45+
node = node.next[c]
46+
p := node
47+
for p != nil {
48+
if p.emit != "" {
49+
list = append(list, p.emit)
50+
index = append(index, i+1)
51+
}
52+
p = p.fail
53+
}
54+
}
55+
return list, index
56+
}
57+
58+
//BuildAcTrie 构建一个 ac 自动机
59+
func BuildAcTrie(words []string) (acTrie *AcTrie) {
60+
acTrie = new(AcTrie)
61+
acTrie.root = newNode(rune('r'))
62+
for _, word := range words {
63+
node := acTrie.root
64+
for _, ch := range []rune(word) {
65+
if _, ok := node.next[ch]; !ok {
66+
node.next[ch] = newNode(ch)
67+
}
68+
node = node.next[ch]
69+
}
70+
node.emit = word
71+
}
72+
queue := queue.New()
73+
queue.Add([]*TrieNode{acTrie.root, nil})
74+
for queue.Length() > 0 {
75+
nodeParent := queue.Remove().([]*TrieNode)
76+
curr, parent := nodeParent[0], nodeParent[1]
77+
for _, sub := range curr.next {
78+
queue.Add([]*TrieNode{sub, curr})
79+
}
80+
if parent == nil {
81+
continue
82+
}
83+
if parent == acTrie.root {
84+
curr.fail = acTrie.root
85+
} else {
86+
fail := parent.fail
87+
for fail != nil {
88+
_, ok := fail.next[curr.value]
89+
if ok {
90+
break
91+
}
92+
fail = fail.fail
93+
}
94+
if fail != nil {
95+
curr.fail = fail.next[curr.value]
96+
} else {
97+
curr.fail = acTrie.root
98+
}
99+
}
100+
}
101+
return acTrie
102+
}

edit_distance.go

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
package text
2+
3+
// min returns the smaller of a and b.
func min(a, b int) int {
	if a < b {
		return a
	}
	return b
}
9+
10+
// max returns the larger of a and b.
func max(a, b int) int {
	if a > b {
		return a
	}
	return b
}
16+
17+
// Levenshtein returns the edit distance between word1 and word2: the minimum
// number of single-rune insertions, deletions and substitutions needed to
// turn one into the other.
//
// Only one row of the dynamic-programming table is kept, so the extra memory
// used is len(word2)+1 ints.
func Levenshtein(word1, word2 []rune) int {
	// row[j] starts as the distance between the empty prefix of word1 and
	// the first j runes of word2.
	row := make([]int, len(word2)+1)
	for j := range row {
		row[j] = j
	}
	for i := range word1 {
		// diag is the previous row's value one column to the left of the
		// cell being computed; for the first column that is i.
		diag := i
		row[0] = i + 1
		for j := range word2 {
			cost := diag
			if word1[i] != word2[j] {
				// Substitution (diag), insertion (row[j]) or deletion
				// (row[j+1]), whichever is cheapest, plus one edit.
				cost = 1 + min(diag, min(row[j], row[j+1]))
			}
			diag = row[j+1]
			row[j+1] = cost
		}
	}
	// Also correct for empty inputs: the result is then the other length.
	return row[len(word2)]
}
50+
51+
// TextSim 计算文本的相识度
52+
func TextSim(str1, str2 string) float32 {
53+
// 需要把 string 转换成 rune
54+
s1 := []rune(str1)
55+
s2 := []rune(str2)
56+
if len(s1) == 0 && len(s2) == 0 {
57+
return 1.0
58+
}
59+
n := Levenshtein(s1, s2)
60+
maxn := max(len(s1), len(s2))
61+
l1 := 1.0 - float32(n)/float32(maxn)
62+
return l1
63+
}

go.mod

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
module github.com/memory-overflow/go-text-algorithm
2+
3+
go 1.16
4+
5+
require github.com/eapache/queue v1.1.0

readme.md

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
- [text 模块](#text-模块)
2+
- [SliceSame](#slicesame)
3+
- [Aho-Corasick automaton](#aho-corasick-automaton)
4+
- [计算文本编辑距离](#计算文本编辑距离)
5+
- [计算文本相似度](#计算文本相似度)
6+
7+
# text 模块
8+
golang 里面的 strings 库已经有了很多丰富的字符串处理功能,但是都是偏向于基础处理。
9+
10+
text模块提供了一些字符串处理相关的算法能力。
11+
12+
## SliceSame
13+
- SliceSame——判断两个字符串数组是否相同(忽略元素顺序)。
14+
15+
example: [TestSliceSmae](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L29)
16+
```go
17+
import (
18+
"testing"
19+
20+
text "github.com/memory-overflow/go-text-algorithm"
21+
)
22+
23+
func TestSliceSmae(t *testing.T) {
24+
a := []string{"3", "2", "1"}
25+
same := text.SliceSame(a, a)
26+
t.Logf("is same: %v", same)
27+
// test can not change order of a
28+
t.Log(a)
29+
}
30+
```
31+
32+
## Aho-Corasick automaton
33+
ac 自动机是一种多模式串的匹配算法。
34+
35+
一个常见的例子就是给出 n 个单词,再给出一段包含 m 个字符的文章,让你找出有多少个单词在文章里出现过。
36+
37+
比较容易想到的做法是,调用 n 次 `strings.Contains(s, xxx)`。假设 n 个单词平均长度为 k, 这样处理的算法时间复杂度为 O(n * k * m)。而使用 ac 自动机可以加速上述过程,整体算法时间复杂度只需要 O(n*k + m)。
38+
39+
example: [TestActrie](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L9)
40+
```go
41+
import (
42+
"testing"
43+
44+
text "github.com/memory-overflow/go-text-algorithm"
45+
)
46+
47+
func TestActrie(t *testing.T) {
48+
// 在字符串 "哈哈哈哈23434dfgdd" 中找出所有 "哈哈哈", "234","dfg" 出现的位置。
49+
// 使用模式串构建一个 ac 自动机
50+
ac := text.BuildAcTrie([]string{"哈哈哈", "234", "dfg"})
51+
// 匹配母串
52+
list, index := ac.Search("哈哈哈哈23434dfgdd")
53+
for i, l := range list {
54+
t.Log(l, index[i])
55+
}
56+
}
57+
```
58+
59+
## 计算文本编辑距离
60+
编辑距离(Edit Distance):是一个度量两个字符序列之间差异的字符串度量标准,两个单词之间的编辑距离是将一个单词转换为另一个单词所需的单字符编辑(插入、删除或替换)的最小数量。一般来说,编辑距离越小,两个串的相似度越大。
61+
62+
example: [TestLevenshtein](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L24)
63+
```go
64+
import (
65+
"testing"
66+
67+
text "github.com/memory-overflow/go-text-algorithm"
68+
)
69+
70+
func TestLevenshtein(t *testing.T) {
71+
dist := text.Levenshtein([]rune("编辑距离测试"), []rune("测试一下距离"))
72+
t.Logf("dist: %d", dist)
73+
}
74+
```
75+
76+
## 计算文本相似度
77+
通过编辑距离,计算两个文本的相似度。
78+
79+
example: [TestTextSim](https://github.com/memory-overflow/go-common-library/blob/main/text/text_test.go#L17)
80+
```go
81+
import (
82+
"testing"
83+
84+
text "github.com/memory-overflow/go-text-algorithm"
85+
)
86+
87+
func TestTextSim(t *testing.T) {
88+
sim := text.TextSim("编辑距离测试", "测试一下距离")
89+
t.Logf("sim: %f", sim)
90+
}
91+
```

slice_same.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
package text
2+
3+
import "sort"
4+
5+
// SliceSame reports whether a and b contain the same strings with the same
// multiplicities, ignoring order. Neither input slice is modified.
func SliceSame(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	// copy transfers only min(len(dst), len(src)) elements, so the
	// destinations must be allocated at full length before copying.
	sortedA := make([]string, len(a))
	sortedB := make([]string, len(b))
	copy(sortedA, a)
	copy(sortedB, b)
	// Sort the private copies so the callers' slices keep their order.
	sort.Strings(sortedA)
	sort.Strings(sortedB)
	for i := range sortedA {
		if sortedA[i] != sortedB[i] {
			return false
		}
	}
	return true
}

text_test.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
package text_test
2+
3+
import (
4+
"testing"
5+
6+
"github.com/memory-overflow/go-text-algorithm"
7+
)
8+
9+
func TestActrie(t *testing.T) {
10+
ac := text.BuildAcTrie([]string{"哈哈哈", "234", "dfg"})
11+
list, index := ac.Search("哈哈哈哈23434dfgdd")
12+
for i, l := range list {
13+
t.Log(l, index[i])
14+
}
15+
}
16+
17+
func TestTextSim(t *testing.T) {
18+
sim := text.TextSim("编辑距离测试", "测试一下距离")
19+
if sim != 0 {
20+
t.Error("Failed")
21+
}
22+
}
23+
24+
func TestLevenshtein(t *testing.T) {
25+
dist := text.Levenshtein([]rune("编辑距离测试"), []rune("测试一下距离"))
26+
t.Logf("dist: %d", dist)
27+
}
28+
29+
func TestSliceSmae(t *testing.T) {
30+
a := []string{"3", "2", "1"}
31+
same := text.SliceSame(a, a)
32+
t.Logf("is same: %v", same)
33+
// test can not change order of a
34+
t.Log(a)
35+
}

0 commit comments

Comments
 (0)