-
Notifications
You must be signed in to change notification settings - Fork 65
/
trigrams.go
79 lines (67 loc) · 1.47 KB
/
trigrams.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
package whatlanggo
import (
"sort"
"strings"
"unicode"
)
// trigram pairs a three-rune sequence with the number of times it was counted.
type trigram struct {
	trigram string // text of the trigram
	count   int    // number of occurrences recorded for it
}
// toTrigramChar normalizes a rune for trigram extraction: any rune that
// isStopChar reports as a stop character (per the original note, punctuation
// and digits) is replaced by a space; every other rune is returned unchanged.
func toTrigramChar(ch rune) rune {
	if !isStopChar(ch) {
		return ch
	}
	return ' '
}
// getTrigramsWithPositions maps every trigram found in text to its rank.
// Rank 0 is the most frequent trigram; trigrams with equal counts are ranked
// in descending lexical order of their text.
func getTrigramsWithPositions(text string) map[string]int {
	occurrences := count(text)

	// Flatten the counter map into a slice so it can be ordered.
	ranked := make([]trigram, 0, len(occurrences))
	for text3, n := range occurrences {
		ranked = append(ranked, trigram{trigram: text3, count: n})
	}

	// Order from highest to lowest rank. Trigram strings are unique, so the
	// comparison never reports two distinct elements as equal and the
	// resulting order is fully determined.
	sort.SliceStable(ranked, func(a, b int) bool {
		left, right := ranked[a], ranked[b]
		if left.count != right.count {
			return left.count > right.count
		}
		return strings.Compare(left.trigram, right.trigram) > 0
	})

	// The index in the ordered slice is the trigram's position.
	positions := make(map[string]int, len(ranked))
	for pos, tg := range ranked {
		positions[tg.trigram] = pos
	}
	return positions
}
// count returns the number of occurrences of each trigram in text.
//
// Every rune is first passed through toTrigramChar and then lower-cased. The
// resulting sequence is treated as if it were preceded and followed by a
// single space, and a window of three consecutive runes is slid across it.
// A window is skipped when its middle rune is a space and at least one of its
// neighbours is a space too, so runs of separators do not produce trigrams.
// Empty input yields an empty, non-nil map.
func count(text string) map[string]int {
	counts := map[string]int{}

	// prev2 and prev1 are the two runes preceding the one being fed in.
	// Starting with both set to a space supplies the leading padding; the
	// first window (two spaces plus the first rune) is always skipped.
	prev2, prev1 := ' ', ' '

	feed := func(cur rune) {
		// Same test as !(prev1 == ' ' && (prev2 == ' ' || cur == ' ')).
		if prev1 != ' ' || (prev2 != ' ' && cur != ' ') {
			// Indexing a missing key yields 0, so ++ covers first sight too.
			counts[string([]rune{prev2, prev1, cur})]++
		}
		prev2, prev1 = prev1, cur
	}

	for _, r := range text {
		feed(unicode.ToLower(toTrigramChar(r)))
	}
	feed(' ') // trailing padding closes the final trigram

	return counts
}