Skip to content

Commit b818d1e

Browse files
authored
Merge pull request #211 from ashvardanian/main-golang
Initial GoLang Support
2 parents a8a74ca + 4fbdd0b commit b818d1e

File tree

9 files changed

+527
-65
lines changed

9 files changed

+527
-65
lines changed

.gitattributes

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# GitHub's Linguist doesn't properly classify many languages
2+
# https://github.com/github-linguist/linguist/blob/main/docs/overrides.md
3+
*.h linguist-language=C
4+
*.c linguist-language=C
5+
*.hpp linguist-language=C++
6+
*.cpp linguist-language=C++

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
build/
33
build_debug/
44
build_release/
5+
build_go/
6+
build_golang/
57
build_artifacts*
68

79
# Yes, everyone loves keeping this file in the history.

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@
8686
"Needleman",
8787
"newfunc",
8888
"NOARGS",
89+
"nocallback",
90+
"noescape",
8991
"noexcept",
9092
"NOMINMAX",
9193
"NOTIMPLEMENTED",

CONTRIBUTING.md

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,43 @@ cargo package --list --allow-dirty
471471

472472
If you want to run benchmarks against third-party implementations, check out the [`ashvardanian/memchr_vs_stringzilla`](https://github.com/ashvardanian/memchr_vs_stringzilla/) repository.
473473

474+
## Contributing in GoLang
475+
476+
First, precompile the C library:
477+
478+
```bash
479+
cmake -D STRINGZILLA_BUILD_SHARED=1 -D STRINGZILLA_BUILD_TEST=0 -D STRINGZILLA_BUILD_BENCHMARK=0 -B build_golang
480+
cmake --build build_golang
481+
```
482+
483+
Then, navigate to the GoLang module root directory and run the tests from there:
484+
485+
```bash
486+
cd golang
487+
CGO_CFLAGS="-I$(pwd)/../include" \
488+
CGO_LDFLAGS="-L$(pwd)/../build_golang -lstringzilla_shared" \
489+
LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
490+
go test
491+
```
492+
493+
To benchmark:
494+
495+
```bash
496+
cd golang
497+
CGO_CFLAGS="-I$(pwd)/../include" \
498+
CGO_LDFLAGS="-L$(pwd)/../build_golang -lstringzilla_shared" \
499+
LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
500+
go run ../scripts/bench.go --input ../leipzig1M.txt
501+
```
502+
503+
Alternatively:
504+
505+
```bash
506+
export GO111MODULE="off"
507+
go run scripts/test.go
508+
go run scripts/bench.go
509+
```
510+
474511
## General Recommendations
475512

476513
### Operations Not Worth Optimizing

golang/go.mod

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
module github.com/ashvardanian/stringzilla/golang
2+
3+
go 1.24

golang/lib.go

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
// StringZilla is a SIMD-accelerated string library for modern CPUs, written in C 99,
2+
// and using AVX2, AVX512, Arm NEON, and SVE intrinsics to accelerate processing.
3+
//
4+
// The GoLang binding is intended to provide a simple interface to a precompiled
5+
// shared library, available on GitHub: https://github.com/ashvardanian/stringzilla
6+
//
7+
// It requires Go 1.24 or newer to leverage the `cGo` `noescape` and `nocallback`
8+
// directives. Without those the latency of calling C functions from Go is too high
9+
// to be useful for string processing.
10+
//
11+
// Unlike the native Go `strings` package, StringZilla primarily targets byte-level
12+
// binary data processing, with less emphasis on UTF-8 and locale-specific tasks.
13+
package sz
14+
15+
// #cgo CFLAGS: -O3
16+
// #cgo LDFLAGS: -L. -L/usr/local/lib -lstringzilla_shared
17+
// #cgo noescape sz_find
18+
// #cgo nocallback sz_find
19+
// #cgo noescape sz_find_byte
20+
// #cgo nocallback sz_find_byte
21+
// #cgo noescape sz_rfind
22+
// #cgo nocallback sz_rfind
23+
// #cgo noescape sz_rfind_byte
24+
// #cgo nocallback sz_rfind_byte
25+
// #cgo noescape sz_find_char_from
26+
// #cgo nocallback sz_find_char_from
27+
// #cgo noescape sz_rfind_char_from
28+
// #cgo nocallback sz_rfind_char_from
29+
// #define SZ_DYNAMIC_DISPATCH 1
30+
// #include <stringzilla/stringzilla.h>
31+
import "C"
32+
import "unsafe"
33+
34+
// Contains reports whether `substr` is within `str`.
35+
// https://pkg.go.dev/strings#Contains
36+
func Contains(str string, substr string) bool {
37+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
38+
strLen := len(str)
39+
substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
40+
substrLen := len(substr)
41+
matchPtr := unsafe.Pointer(C.sz_find(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
42+
return matchPtr != nil
43+
}
44+
45+
// Index returns the index of the first instance of `substr` in `str`, or -1 if `substr` is not present.
46+
// https://pkg.go.dev/strings#Index
47+
func Index(str string, substr string) int64 {
48+
substrLen := len(substr)
49+
if substrLen == 0 {
50+
return 0
51+
}
52+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
53+
strLen := len(str)
54+
substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
55+
matchPtr := unsafe.Pointer(C.sz_find(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
56+
if matchPtr == nil {
57+
return -1
58+
}
59+
return int64(uintptr(matchPtr) - uintptr(unsafe.Pointer(strPtr)))
60+
}
61+
62+
// Index returns the index of the last instance of `substr` in `str`, or -1 if `substr` is not present.
63+
// https://pkg.go.dev/strings#LastIndex
64+
func LastIndex(str string, substr string) int64 {
65+
substrLen := len(substr)
66+
strLen := int64(len(str))
67+
if substrLen == 0 {
68+
return strLen
69+
}
70+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
71+
substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
72+
matchPtr := unsafe.Pointer(C.sz_rfind(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
73+
if matchPtr == nil {
74+
return -1
75+
}
76+
return int64(uintptr(matchPtr) - uintptr(unsafe.Pointer(strPtr)))
77+
}
78+
79+
// Index returns the index of the first instance of a byte in `str`, or -1 if a byte is not present.
80+
// https://pkg.go.dev/strings#IndexByte
81+
func IndexByte(str string, c byte) int64 {
82+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
83+
strLen := len(str)
84+
cPtr := (*C.char)(unsafe.Pointer(&c))
85+
matchPtr := unsafe.Pointer(C.sz_find_byte(strPtr, C.ulong(strLen), cPtr))
86+
if matchPtr == nil {
87+
return -1
88+
}
89+
return int64(uintptr(matchPtr) - uintptr(unsafe.Pointer(strPtr)))
90+
}
91+
92+
// Index returns the index of the last instance of a byte in `str`, or -1 if a byte is not present.
93+
// https://pkg.go.dev/strings#LastIndexByte
94+
func LastIndexByte(str string, c byte) int64 {
95+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
96+
strLen := len(str)
97+
cPtr := (*C.char)(unsafe.Pointer(&c))
98+
matchPtr := unsafe.Pointer(C.sz_rfind_byte(strPtr, C.ulong(strLen), cPtr))
99+
if matchPtr == nil {
100+
return -1
101+
}
102+
return int64(uintptr(matchPtr) - uintptr(unsafe.Pointer(strPtr)))
103+
}
104+
105+
// Index returns the index of the first instance of any byte from `substr` in `str`, or -1 if none are present.
106+
// https://pkg.go.dev/strings#IndexAny
107+
func IndexAny(str string, substr string) int64 {
108+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
109+
strLen := len(str)
110+
substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
111+
substrLen := len(substr)
112+
matchPtr := unsafe.Pointer(C.sz_find_char_from(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
113+
if matchPtr == nil {
114+
return -1
115+
}
116+
return int64(uintptr(matchPtr) - uintptr(unsafe.Pointer(strPtr)))
117+
}
118+
119+
// Index returns the index of the last instance of any byte from `substr` in `str`, or -1 if none are present.
120+
// https://pkg.go.dev/strings#LastIndexAny
121+
func LastIndexAny(str string, substr string) int64 {
122+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
123+
strLen := len(str)
124+
substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
125+
substrLen := len(substr)
126+
matchPtr := unsafe.Pointer(C.sz_rfind_char_from(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
127+
if matchPtr == nil {
128+
return -1
129+
}
130+
return int64(uintptr(matchPtr) - uintptr(unsafe.Pointer(strPtr)))
131+
}
132+
133+
// Count returns the number of overlapping or non-overlapping instances of `substr` in `str`.
134+
// If `substr` is an empty string, returns 1 + the length of the `str`.
135+
// https://pkg.go.dev/strings#Count
136+
func Count(str string, substr string, overlap bool) int64 {
137+
strPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(str)))
138+
strLen := int64(len(str))
139+
substrPtr := (*C.char)(unsafe.Pointer(unsafe.StringData(substr)))
140+
substrLen := int64(len(substr))
141+
142+
if strLen == 0 || strLen < substrLen {
143+
return 0
144+
}
145+
if substrLen == 0 {
146+
return 1 + strLen
147+
}
148+
149+
count := int64(0)
150+
if overlap == true {
151+
for strLen > 0 {
152+
matchPtr := unsafe.Pointer(C.sz_find(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
153+
if matchPtr == nil {
154+
break
155+
}
156+
count += 1
157+
strLen -= (1 + int64(uintptr(matchPtr)-uintptr(unsafe.Pointer(strPtr))))
158+
strPtr = (*C.char)(unsafe.Add(matchPtr, 1))
159+
}
160+
} else {
161+
for strLen > 0 {
162+
matchPtr := unsafe.Pointer(C.sz_find(strPtr, C.ulong(strLen), substrPtr, C.ulong(substrLen)))
163+
if matchPtr == nil {
164+
break
165+
}
166+
count += 1
167+
strLen -= (substrLen + int64(uintptr(matchPtr)-uintptr(unsafe.Pointer(strPtr))))
168+
strPtr = (*C.char)(unsafe.Add(matchPtr, substrLen))
169+
}
170+
}
171+
172+
return count
173+
}

0 commit comments

Comments
 (0)