Skip to content

Commit 4fbdd0b

Browse files
committed
Improve: Extended GoLang benchmarks
1 parent 7085d92 commit 4fbdd0b

File tree

3 files changed

+125
-50
lines changed

3 files changed

+125
-50
lines changed

CONTRIBUTING.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,16 @@ LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
490490
go test
491491
```
492492

493+
To benchmark:
494+
495+
```bash
496+
cd golang
497+
CGO_CFLAGS="-I$(pwd)/../include" \
498+
CGO_LDFLAGS="-L$(pwd)/../build_golang -lstringzilla_shared" \
499+
LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
500+
go run ../scripts/bench.go --input ../leipzig1M.txt
501+
```
502+
493503
Alternatively:
494504

495505
```bash

golang/lib.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,16 @@ package sz
1515
// #cgo CFLAGS: -O3
1616
// #cgo LDFLAGS: -L. -L/usr/local/lib -lstringzilla_shared
1717
// #cgo noescape sz_find
18-
// #cgo noescape sz_find_byte
19-
// #cgo noescape sz_rfind
20-
// #cgo noescape sz_rfind_byte
21-
// #cgo noescape sz_find_char_from
22-
// #cgo noescape sz_rfind_char_from
2318
// #cgo nocallback sz_find
19+
// #cgo noescape sz_find_byte
2420
// #cgo nocallback sz_find_byte
21+
// #cgo noescape sz_rfind
2522
// #cgo nocallback sz_rfind
23+
// #cgo noescape sz_rfind_byte
2624
// #cgo nocallback sz_rfind_byte
25+
// #cgo noescape sz_find_char_from
2726
// #cgo nocallback sz_find_char_from
27+
// #cgo noescape sz_rfind_char_from
2828
// #cgo nocallback sz_rfind_char_from
2929
// #define SZ_DYNAMIC_DISPATCH 1
3030
// #include <stringzilla/stringzilla.h>

scripts/bench.go

Lines changed: 110 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,135 @@
11
package main
22

33
import (
4+
"flag"
45
"fmt"
6+
"math/rand"
7+
"os"
58
"strings"
9+
"testing"
610
"time"
711

8-
sz "../go/stringzilla"
12+
sz "github.com/ashvardanian/stringzilla/golang"
913
)
1014

11-
func main() {
15+
var sink any //? Global sink to defeat dead-code elimination
1216

13-
str := strings.Repeat("0123456789", 10000) + "something"
14-
pat := "some"
17+
// runBenchmark times `f` with `testing.Benchmark`, storing every result in `sink`, and prints the outcome next to `name`.
18+
func runBenchmark[T any](name string, f func() T) {
19+
benchResult := testing.Benchmark(func(b *testing.B) {
20+
for i := 0; i < b.N; i++ {
21+
sink = f()
22+
}
23+
})
24+
fmt.Printf("%-30s: %s\n", name, benchResult.String())
25+
}
1526

16-
fmt.Println("Contains")
17-
t := time.Now()
18-
for i := 0; i < 1; i++ {
19-
strings.Contains(str, pat)
20-
}
21-
fmt.Println(" ", time.Since(t), "\tstrings.Contains")
27+
func main() {
2228

23-
t = time.Now()
24-
for i := 0; i < 1; i++ {
25-
sz.Contains(str, pat)
26-
}
27-
fmt.Println(" ", time.Since(t), "\tsz.Contains")
29+
// Define command-line flags.
30+
inputPath := flag.String("input", "", "Path to input file for benchmarking. (Required)")
31+
seedInt := flag.Int64("seed", 0, "Seed for the random number generator. If 0, the current time is used.")
32+
splitMode := flag.String("split", "tokens", "How to split input file: 'tokens' (default) or 'lines'.")
33+
flag.Parse()
2834

29-
fmt.Println("Index")
30-
t = time.Now()
31-
for i := 0; i < 1; i++ {
32-
strings.Index(str, pat)
35+
// Ensure input file is provided.
36+
if *inputPath == "" {
37+
fmt.Fprintln(os.Stderr, "Error: input file must be specified using the -input flag.")
38+
flag.Usage()
39+
os.Exit(1)
3340
}
34-
fmt.Println(" ", time.Since(t), "\tstrings.Index")
3541

36-
t = time.Now()
37-
for i := 0; i < 1; i++ {
38-
sz.Index(str, pat)
42+
// Read input data from file.
43+
bytes, err := os.ReadFile(*inputPath)
44+
if err != nil {
45+
fmt.Fprintf(os.Stderr, "Error reading input file: %v\n", err)
46+
os.Exit(1)
3947
}
40-
fmt.Println(" ", time.Since(t), "\tsz.Index")
48+
data := string(bytes)
49+
fmt.Printf("Benchmarking on `%s` with seed %d.\n", *inputPath, *seedInt)
50+
fmt.Printf("Total input length: %d\n", len(data))
4151

42-
fmt.Println("IndexAny")
43-
t = time.Now()
44-
for i := 0; i < 1; i++ {
45-
strings.IndexAny(str, pat)
52+
// Split the data into items based on the chosen mode.
53+
var items []string
54+
switch *splitMode {
55+
case "lines":
56+
rawLines := strings.Split(data, "\n")
57+
// Filter out empty lines.
58+
for _, line := range rawLines {
59+
if line != "" {
60+
items = append(items, line)
61+
}
62+
}
63+
if len(items) == 0 {
64+
items = []string{"default"}
65+
}
66+
// Print line statistics.
67+
totalLen := 0
68+
for _, line := range items {
69+
totalLen += len(line)
70+
}
71+
fmt.Printf("Total lines: %d\n", len(items))
72+
fmt.Printf("Average line length: %.2f\n", float64(totalLen)/float64(len(items)))
73+
default: // "tokens" or any other value defaults to token mode.
74+
items = strings.Fields(data)
75+
if len(items) == 0 {
76+
items = []string{"default"}
77+
}
78+
fmt.Printf("Total tokens: %d\n", len(items))
79+
fmt.Printf("Average token length: %.2f\n", float64(len(data))/float64(len(items)))
4680
}
47-
fmt.Println(" ", time.Since(t), "\tstrings.IndexAny")
4881

49-
t = time.Now()
50-
for i := 0; i < 1; i++ {
51-
sz.IndexAny(str, pat)
82+
// In Go, a string is represented as a (length, data) pair. If you pass a string around,
83+
// Go will copy the length and the pointer but not the data pointed to.
84+
// That is problematic for our benchmark: every item still points into the memory of `data`,
85+
// so a substring search could degenerate into checking whether a pointer falls within a range.
86+
// To avoid that, let's copy strings to `[]byte` and back to force a new allocation.
87+
for i, item := range items {
88+
items[i] = string([]byte(item))
5289
}
53-
fmt.Println(" ", time.Since(t), "\tsz.IndexAny")
5490

55-
str = strings.Repeat("0123456789", 100000) + "something"
56-
pat = "123456789"
57-
fmt.Println("Count")
58-
t = time.Now()
59-
for i := 0; i < 1; i++ {
60-
strings.Count(str, pat)
91+
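// Create a seeded random number generator. Runs are reproducible only with a non-zero `-seed`: a zero seed is replaced by the current time here, after the seed was already printed above, so that line shows 0.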
92+
if *seedInt == 0 {
93+
*seedInt = time.Now().UnixNano()
6194
}
62-
fmt.Println(" ", time.Since(t), "\tstrings.Count")
63-
64-
t = time.Now()
65-
for i := 0; i < 1; i++ {
66-
sz.Count(str, pat, false)
95+
generator := rand.New(rand.NewSource(*seedInt))
96+
randomItem := func() string {
97+
return items[generator.Intn(len(items))]
6798
}
68-
fmt.Println(" ", time.Since(t), "\tsz.Count")
6999

100+
fmt.Println("Running benchmark using `testing.Benchmark`.")
101+
102+
runBenchmark("strings.Contains", func() bool {
103+
return strings.Contains(data, randomItem())
104+
})
105+
runBenchmark("sz.Contains", func() bool {
106+
return sz.Contains(data, randomItem())
107+
})
108+
runBenchmark("strings.Index", func() int {
109+
return strings.Index(data, randomItem())
110+
})
111+
runBenchmark("sz.Index", func() int64 {
112+
return sz.Index(data, randomItem())
113+
})
114+
runBenchmark("strings.LastIndex", func() int {
115+
return strings.LastIndex(data, randomItem())
116+
})
117+
runBenchmark("sz.LastIndex", func() int64 {
118+
return sz.LastIndex(data, randomItem())
119+
})
120+
runBenchmark("strings.IndexAny", func() int {
121+
return strings.IndexAny(randomItem(), "*^")
122+
})
123+
runBenchmark("sz.IndexAny", func() int64 {
124+
return sz.IndexAny(randomItem(), "*^")
125+
})
126+
runBenchmark("strings.Count", func() int {
127+
return strings.Count(data, randomItem())
128+
})
129+
runBenchmark("sz.Count (non-overlap)", func() int64 {
130+
return sz.Count(data, randomItem(), false)
131+
})
132+
runBenchmark("sz.Count (overlap)", func() int64 {
133+
return sz.Count(data, randomItem(), true)
134+
})
70135
}

0 commit comments

Comments
 (0)