Skip to content

Commit 61fc238

Browse files
committed
sift4
1 parent 92b1210 commit 61fc238

File tree

5 files changed

+315
-1
lines changed

5 files changed

+315
-1
lines changed

.github/workflows/test.yml

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
name: Test
2+
3+
on:
4+
push:
5+
branches: [master]
6+
pull_request:
7+
branches: [master]
8+
9+
permissions: read-all
10+
11+
jobs:
12+
build:
13+
name: Test
14+
runs-on: ubuntu-latest
15+
steps:
16+
- name: code
17+
uses: actions/checkout@v4
18+
19+
- name: go
20+
uses: actions/setup-go@v5
21+
with:
22+
go-version: ^1.24
23+
24+
- name: test
25+
run: go test -coverprofile=coverage.out -covermode=atomic -cover ./...
26+
27+
- name: fuzz
28+
run: go test -list . | grep Fuzz | xargs -P 8 -I {} go test -fuzz {} -fuzztime 30s .
29+
30+
- name: Upload coverage to Codecov
31+
uses: codecov/codecov-action@v5
32+
with:
33+
token: ${{ secrets.CODECOV_TOKEN }}
34+
files: coverage.out

README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1,24 @@
1-
# sift4
1+
SIFT4 — fast approximate string distance algorithm
2+
3+
* zero memory copy
4+
* 100% test coverage
5+
6+
```bash
7+
$ go test -benchmem -bench .
8+
goos: darwin
9+
goarch: arm64
10+
pkg: github.com/ndx-technologies/sift4
11+
cpu: Apple M3 Max
12+
BenchmarkSIFT4Distance/empty-16 956523358 1.109 ns/op 0 B/op 0 allocs/op
13+
BenchmarkSIFT4Distance/one_empty-16 1000000000 1.093 ns/op 0 B/op 0 allocs/op
14+
BenchmarkSIFT4Distance/equal-16 562000238 2.137 ns/op 0 B/op 0 allocs/op
15+
BenchmarkSIFT4Distance/different-16 16788264 71.80 ns/op 48 B/op 2 allocs/op
16+
BenchmarkSIFT4Distance/long_different-16 1488578 809.1 ns/op 24 B/op 1 allocs/op
17+
BenchmarkSIFT4Distance/buffer/empty-16 1000000000 1.122 ns/op 0 B/op 0 allocs/op
18+
BenchmarkSIFT4Distance/buffer/one_empty-16 1000000000 1.116 ns/op 0 B/op 0 allocs/op
19+
BenchmarkSIFT4Distance/buffer/equal-16 552020344 2.179 ns/op 0 B/op 0 allocs/op
20+
BenchmarkSIFT4Distance/buffer/different-16 29124799 41.24 ns/op 0 B/op 0 allocs/op
21+
BenchmarkSIFT4Distance/buffer/long_different-16 1520445 789.8 ns/op 0 B/op 0 allocs/op
22+
PASS
23+
ok github.com/ndx-technologies/sift4 11.848s
24+
```

go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module github.com/ndx-technologies/sift4
2+
3+
go 1.24.0

sfit4_test.go

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
package sift4_test
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
"testing"
7+
8+
"github.com/ndx-technologies/sift4"
9+
)
10+
11+
func ExampleDistance() {
12+
d := sift4.Distance("kitten", "sitting", 100, 5, nil)
13+
fmt.Print(d)
14+
// Output: 3
15+
}
16+
17+
func ExampleDistance_buffer() {
18+
var b sift4.Buffer
19+
d := sift4.Distance("kitten", "sitting", 100, 5, &b)
20+
fmt.Print(d)
21+
// Output: 3
22+
}
23+
24+
func TestDistance(t *testing.T) {
25+
tests := []struct {
26+
s1 string
27+
s2 string
28+
maxOffset int
29+
maxDistance int
30+
distance int
31+
}{
32+
{"kitten", "sitting", 100, 5, 3},
33+
{"book", "back", 100, 5, 2},
34+
{"", "abc", 100, 5, 3},
35+
{"abc", "", 100, 5, 3},
36+
{"", "", 100, 5, 0},
37+
{"a", "a", 100, 5, 0},
38+
{"a", "b", 100, 5, 1},
39+
{"ab", "abc", 100, 5, 1},
40+
{"abc", "ab", 100, 5, 1},
41+
{"abc", "def", 100, 5, 3},
42+
{"hello", "helo", 100, 5, 1},
43+
{"world", "word", 100, 5, 1},
44+
{"halooooxo", "hbloooogo", 100, 5, 6},
45+
46+
// early exit not reached
47+
{"distance", "difference", 100, 5, 6},
48+
{"abcdef", "xyz", 100, 2, 3},
49+
{"abcdefabcdefabcdefabcdefabcdefabcdef", "xyz", 100, 2, 3},
50+
51+
// transposition
52+
{"abc", "acb", 100, 5, 1}, // Damerau–Levenshtein distance, transposition of adjacent characters is one operation
53+
{"ab", "ba", 100, 5, 1},
54+
{"abcd", "badc", 100, 5, 2}, // two transpositions
55+
{"abc", "acb", 100, 5, 1},
56+
{"aab", "baa", 100, 5, 1}, // covers cursor adjustment when s1[c1] == s2[c2+i]
57+
{"abcd", "cdab", 100, 5, 2}, // transposition test with cyclic shift
58+
{"01", "11", 100, 5, 1},
59+
{"00010", "000010", 100, 5, 2},
60+
}
61+
62+
for _, tc := range tests {
63+
t.Run("", func(t *testing.T) {
64+
var buf sift4.Buffer
65+
d := sift4.Distance(tc.s1, tc.s2, tc.maxOffset, tc.maxDistance, &buf)
66+
if d != tc.distance {
67+
t.Error(tc, d)
68+
}
69+
})
70+
}
71+
}
72+
73+
// Benchmark_______________________________________ is an intentionally empty benchmark.
// NOTE(review): presumably it exists only to print a separator row in `go test -bench` output — confirm before removing.
func Benchmark_______________________________________(b *testing.B) {}
74+
75+
func BenchmarkSIFT4Distance(b *testing.B) {
76+
testCases := []struct {
77+
name string
78+
s1 string
79+
s2 string
80+
}{
81+
{"empty", "", ""},
82+
{"one empty", "hello", ""},
83+
{"equal", "kitten", "kitten"},
84+
{"different", "kitten", "sitting"},
85+
{"long different", strings.Repeat("a", 256), strings.Repeat("b", 256)},
86+
}
87+
for _, tc := range testCases {
88+
b.Run(tc.name, func(b *testing.B) {
89+
for b.Loop() {
90+
sift4.Distance(tc.s1, tc.s2, 100, 5, nil)
91+
}
92+
})
93+
}
94+
95+
b.Run("buffer", func(b *testing.B) {
96+
for _, tc := range testCases {
97+
b.Run(tc.name, func(b *testing.B) {
98+
var buffer sift4.Buffer
99+
for b.Loop() {
100+
sift4.Distance(tc.s1, tc.s2, 100, 5, &buffer)
101+
}
102+
})
103+
}
104+
})
105+
}
106+
107+
func FuzzSIFT4Distance(f *testing.F) {
108+
f.Add("", "")
109+
f.Add("hello", "")
110+
f.Add("", "world")
111+
f.Add("kitten", "sitting")
112+
113+
f.Fuzz(func(t *testing.T, s1, s2 string) {
114+
d := sift4.Distance(s1, s2, 100, 5, nil)
115+
116+
if d < 0 {
117+
t.Error("d < 0")
118+
}
119+
if s1 == s2 && d != 0 {
120+
t.Error(d)
121+
}
122+
if s1 == "" && d != len(s2) {
123+
t.Error(d)
124+
}
125+
if s2 == "" && d != len(s1) {
126+
t.Error(d)
127+
}
128+
})
129+
}

sift4.go

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
package sift4
2+
3+
import "slices"
4+
5+
// Buffer holds the scratch storage used while computing a distance, so that
// repeated calls can reuse it. The zero value is ready to use.
type Buffer struct {
	offset []offset // matches recorded so far
}

// offset records one matched pair of cursor positions.
type offset struct {
	c1, c2 int  // cursor positions in the first and second string
	trans  bool // whether this match has been counted as a transposition
}
16+
17+
// Distance is fast O(N) approximate string distance algorithm.
18+
// If pointer to buffer is provided, then it will be reused for storing temporary structures.
19+
// - maxOffset is the number of characters to search for matching letters
20+
// - maxDistance is the distance at which the algorithm should stop computing the value and just exit (the strings are too different anyway)
21+
//
22+
// https://siderite.dev/blog/super-fast-and-accurate-string-distance.html
23+
func Distance(s1, s2 string, maxOffset, maxDistance int, buffer *Buffer) int {
24+
if len(s1) == 0 {
25+
return len(s2)
26+
}
27+
if len(s2) == 0 {
28+
return len(s1)
29+
}
30+
if s1 == s2 {
31+
return 0
32+
}
33+
34+
var (
35+
c1, c2 int // cursor for string 1 and 2
36+
lcss int // largest common subsequence
37+
localCS int // local common substring
38+
trans int // number of transpositions
39+
)
40+
41+
if buffer == nil {
42+
buffer = &Buffer{}
43+
}
44+
buffer.offset = buffer.offset[:0]
45+
46+
for (c1 < len(s1)) && (c2 < len(s2)) {
47+
if s1[c1] == s2[c2] {
48+
isTrans := false
49+
50+
localCS++
51+
52+
for i := 0; i < len(buffer.offset); {
53+
ofs := buffer.offset[i]
54+
if c1 <= ofs.c1 || c2 <= ofs.c2 {
55+
isTrans = abs(c2-c1) >= abs(ofs.c2-ofs.c1)
56+
if isTrans {
57+
trans++
58+
} else if !ofs.trans {
59+
ofs.trans = true
60+
trans++
61+
}
62+
break
63+
} else {
64+
if c1 > ofs.c2 && c2 > ofs.c1 {
65+
buffer.offset = slices.Delete(buffer.offset, i, i+1)
66+
} else {
67+
i++
68+
}
69+
}
70+
}
71+
72+
buffer.offset = append(buffer.offset, offset{c1, c2, isTrans})
73+
} else {
74+
lcss += localCS
75+
localCS = 0
76+
77+
if c1 != c2 {
78+
c1 = min(c1, c2) // using min allows the computation of transpositions
79+
c2 = c1
80+
}
81+
82+
// if matching characters are found, remove 1 from both cursors (they get incremented at the end of the loop)
83+
// so that we can have only one code block handling matches
84+
for i := 0; i < maxOffset && (c1+i < len(s1) || c2+i < len(s2)); i++ {
85+
if c1+i < len(s1) && s1[c1+i] == s2[c2] {
86+
c1 += i - 1
87+
c2--
88+
break
89+
}
90+
if c2+i < len(s2) && s1[c1] == s2[c2+i] {
91+
c1--
92+
c2 += i - 1
93+
break
94+
}
95+
}
96+
}
97+
98+
c1++
99+
c2++
100+
101+
if maxDistance > 0 {
102+
if d := max(c1, c2) - lcss + trans; d > maxDistance {
103+
return d
104+
}
105+
}
106+
107+
// this covers the case where the last match is on the last token in list, so that it can compute transpositions correctly
108+
if c1 >= len(s1) || c2 >= len(s2) {
109+
lcss += localCS
110+
localCS = 0
111+
c1 = min(c1, c2)
112+
c2 = c1
113+
}
114+
}
115+
116+
lcss += localCS
117+
return max(len(s1), len(s2)) - lcss + trans // add the cost of transpositions to the final result
118+
}
119+
120+
// abs returns the absolute value of x.
func abs[T int](x T) T {
	if x >= 0 {
		return x
	}
	return -x
}

0 commit comments

Comments
 (0)