-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstr.go
114 lines (103 loc) · 2.69 KB
/
str.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// String searching algorithms
package main
import (
"regexp"
"slices"
"strings"
"sync"
)
var (
	// Bigrams is the global bigram index read by searchSubstringBigram: each
	// two-character key maps to a slice of item indices. makeBigrams returns
	// a map of this shape, but no live code visible here assigns it, so it is
	// presumably populated elsewhere at startup -- verify against callers.
	Bigrams map[string][]int
	// bigramOnce is not used by any live code visible here; presumably it was
	// intended to guard a one-time construction of Bigrams -- confirm before
	// removing.
	bigramOnce sync.Once
)
// makeBigrams builds a bigram index over items. For every ordered pair of
// characters drawn from the lowercase letters a-z plus the space character
// (27 * 27 = 729 keys) it stores the result of searchSubstring(items, pair),
// i.e. the indices of the items containing that pair.
func makeBigrams(items []string) map[string][]int {
	const alphabet = "abcdefghijklmnopqrstuvwxyz "

	index := make(map[string][]int)
	for _, first := range alphabet {
		for _, second := range alphabet {
			key := string([]rune{first, second})
			// Never search for the same pair twice.
			if _, seen := index[key]; seen {
				continue
			}
			index[key] = searchSubstring(items, key)
		}
	}
	return index
}
// searchSubstring returns, in ascending order, the indices of every item that
// contains target. Normalisation is applied: both the item and the target are
// lowercased with strings.ToLower before being compared with
// strings.Contains, so matching is case-insensitive.
//
// An empty target short-circuits to intRange(len(items)) (presumably every
// index -- intRange is defined elsewhere).
//
// The result is grown with append and then clipped, so it does not keep a
// len(items)-sized backing array alive. This matters because results are
// stored long-term in caches (one entry per bigram, one per typed query), and
// clipping also ensures a caller's append can never write into spare capacity
// shared with a cached slice. A search with no matches returns an empty,
// non-nil slice.
func searchSubstring(items []string, target string) []int {
	if target == "" {
		return intRange(len(items))
	}

	// Lowercase the needle once; only the haystack changes per iteration.
	targetLower := strings.ToLower(target)

	matchIdxs := []int{}
	for j, rel := range items {
		if strings.Contains(strings.ToLower(rel), targetLower) {
			matchIdxs = append(matchIdxs, j)
		}
	}
	return slices.Clip(matchIdxs)
}
// searchSubstringCache is a memoising wrapper around searchSubstring. If
// inputCache already holds an entry for target, that entry is returned as-is;
// otherwise the search is performed, stored under target, and returned.
//
// The cache key is the target exactly as given, so targets differing only in
// case are cached separately. A nil inputCache is tolerated: the search still
// runs, but nothing is stored (writing to a nil map would panic).
func searchSubstringCache(items []string, target string, inputCache map[string][]int) []int { // {{{
	if matches, ok := inputCache[target]; ok {
		return matches
	}
	matches := searchSubstring(items, target)
	if inputCache != nil {
		inputCache[target] = matches
	}
	return matches
} // }}}
// searchSubstringBigram returns the indices of the items that (probably)
// contain target, using the precomputed global Bigrams index rather than
// scanning every item.
//
// In real world usage, this is an 8x speedup over searchSubstring.
//
// The caveat: this function relies on the global Bigrams map, which is
// relatively expensive (729 bigrams * 37 k items = 1.5 s), and calculated only
// once for the lifetime of the program.
// Note that because searches will be inherently fuzzy, false positives may be
// returned: the intersection only proves that every bigram of the target
// occurs somewhere in an item, not that they occur consecutively.
//
// Dispatch, in order:
//   - Targets shorter than two bytes are passed to searchSubstring.
//   - Targets containing "." are treated as a case-insensitive regular
//     expression and matched against every item. If the pattern does not
//     compile (e.g. the user is halfway through typing it), the target is
//     searched for literally instead of panicking.
//   - Otherwise the target is lowercased (the index keys are lowercase) and
//     the index entries of each consecutive byte pair are intersected. If any
//     pair is absent from the index -- the index has not been built, or the
//     pair contains a byte outside a-z and space -- the index cannot answer
//     the query, so searchSubstring is used instead.
//
// For a two-byte target the returned slice is the index entry itself, so
// callers should treat the result as read-only.
func searchSubstringBigram(items []string, target string) []int {
	// if we are generating the bigrams here, it is already too late; user
	// input will usually be faster than 1.5s
	//
	// bigramOnce.Do(func() {
	// 	go func() {
	// 		t := time.Now()
	// 		Bigrams = makeBigrams(items)
	// 		log.Println("bigram construction took", time.Since(t).Seconds())
	// 	}()
	// })

	if len(target) < 2 {
		return searchSubstring(items, target)
	}

	if strings.Contains(target, ".") {
		// this might look like a really crappy impl, but it doesn't
		// feel that slow
		r, err := regexp.Compile("(?i)" + target)
		if err != nil {
			// Invalid pattern from user input: degrade to a literal search.
			return searchSubstring(items, target)
		}
		matches := []int{}
		for i, x := range items {
			if r.MatchString(x) {
				matches = append(matches, i)
			}
		}
		return matches
	}

	lowered := strings.ToLower(target)
	if len(lowered) < 2 {
		// Lowercasing can shorten the byte length of some non-ASCII input.
		return searchSubstring(items, target)
	}

	var idxs []int
	for i := 0; i+2 <= len(lowered); i++ {
		found, ok := Bigrams[lowered[i:i+2]]
		if !ok {
			// Pair not covered by the index; only a full scan is correct.
			return searchSubstring(items, target)
		}
		if len(found) == 0 {
			// No item contains this pair, so no item contains the target.
			return []int{}
		}
		if i == 0 {
			idxs = found
			continue
		}
		idxs = intersect(idxs, found)
	}
	slices.Sort(idxs)
	return idxs
}