Improve: Extended GoLang benchmarks

ashvardanian · ashvardanian · commit 4fbdd0bc2645 · 2025-02-23T16:24:54.000Z
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -490,6 +490,16 @@ LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
 go test
 ```
 
+To benchmark:
+
+```bash
+cd golang
+CGO_CFLAGS="-I$(pwd)/../include" \
+CGO_LDFLAGS="-L$(pwd)/../build_golang -lstringzilla_shared" \
+LD_LIBRARY_PATH="$(pwd)/../build_golang:$LD_LIBRARY_PATH" \
+go run ../scripts/bench.go --input ../leipzig1M.txt
+```
+
 Alternatively:
 
 ```bash
diff --git a/golang/lib.go b/golang/lib.go
@@ -15,16 +15,16 @@ package sz
 // #cgo CFLAGS: -O3
 // #cgo LDFLAGS: -L. -L/usr/local/lib -lstringzilla_shared
 // #cgo noescape sz_find
-// #cgo noescape sz_find_byte
-// #cgo noescape sz_rfind
-// #cgo noescape sz_rfind_byte
-// #cgo noescape sz_find_char_from
-// #cgo noescape sz_rfind_char_from
 // #cgo nocallback sz_find
+// #cgo noescape sz_find_byte
 // #cgo nocallback sz_find_byte
+// #cgo noescape sz_rfind
 // #cgo nocallback sz_rfind
+// #cgo noescape sz_rfind_byte
 // #cgo nocallback sz_rfind_byte
+// #cgo noescape sz_find_char_from
 // #cgo nocallback sz_find_char_from
+// #cgo noescape sz_rfind_char_from
 // #cgo nocallback sz_rfind_char_from
 // #define SZ_DYNAMIC_DISPATCH 1
 // #include <stringzilla/stringzilla.h>
diff --git a/scripts/bench.go b/scripts/bench.go
@@ -1,70 +1,135 @@
 package main
 
 import (
+	"flag"
 	"fmt"
+	"math/rand"
+	"os"
 	"strings"
+	"testing"
 	"time"
 
-	sz "../go/stringzilla"
+	sz "github.com/ashvardanian/stringzilla/golang"
 )
 
-func main() {
+var sink any //? Global sink to defeat dead-code elimination
 
-	str := strings.Repeat("0123456789", 10000) + "something"
-	pat := "some"
+// runBenchmark times `f` with `testing.Benchmark`, storing each result in `sink`, and prints the outcome labeled with `name`.
+func runBenchmark[T any](name string, f func() T) {
+	benchResult := testing.Benchmark(func(b *testing.B) {
+		for i := 0; i < b.N; i++ {
+			sink = f()
+		}
+	})
+	fmt.Printf("%-30s: %s\n", name, benchResult.String())
+}
 
-	fmt.Println("Contains")
-	t := time.Now()
-	for i := 0; i < 1; i++ {
-		strings.Contains(str, pat)
-	}
-	fmt.Println("  ", time.Since(t), "\tstrings.Contains")
+func main() {
 
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		sz.Contains(str, pat)
-	}
-	fmt.Println("  ", time.Since(t), "\tsz.Contains")
+	// Define command-line flags.
+	inputPath := flag.String("input", "", "Path to input file for benchmarking. (Required)")
+	seedInt := flag.Int64("seed", 0, "Seed for the random number generator. If 0, the current time is used.")
+	splitMode := flag.String("split", "tokens", "How to split input file: 'tokens' (default) or 'lines'.")
+	flag.Parse()
 
-	fmt.Println("Index")
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		strings.Index(str, pat)
+	// Ensure input file is provided.
+	if *inputPath == "" {
+		fmt.Fprintln(os.Stderr, "Error: input file must be specified using the -input flag.")
+		flag.Usage()
+		os.Exit(1)
 	}
-	fmt.Println("  ", time.Since(t), "\tstrings.Index")
 
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		sz.Index(str, pat)
+	// Read input data from file.
+	bytes, err := os.ReadFile(*inputPath)
+	if err != nil {
+		fmt.Fprintf(os.Stderr, "Error reading input file: %v\n", err)
+		os.Exit(1)
 	}
-	fmt.Println("  ", time.Since(t), "\tsz.Index")
+	data := string(bytes)
+	fmt.Printf("Benchmarking on `%s` with seed %d.\n", *inputPath, *seedInt)
+	fmt.Printf("Total input length: %d\n", len(data))
 
-	fmt.Println("IndexAny")
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		strings.IndexAny(str, pat)
+	// Split the data into items based on the chosen mode.
+	var items []string
+	switch *splitMode {
+	case "lines":
+		rawLines := strings.Split(data, "\n")
+		// Filter out empty lines.
+		for _, line := range rawLines {
+			if line != "" {
+				items = append(items, line)
+			}
+		}
+		if len(items) == 0 {
+			items = []string{"default"}
+		}
+		// Print line statistics.
+		totalLen := 0
+		for _, line := range items {
+			totalLen += len(line)
+		}
+		fmt.Printf("Total lines: %d\n", len(items))
+		fmt.Printf("Average line length: %.2f\n", float64(totalLen)/float64(len(items)))
+	default: // "tokens" or any other value defaults to token mode.
+		items = strings.Fields(data)
+		if len(items) == 0 {
+			items = []string{"default"}
+		}
+		fmt.Printf("Total tokens: %d\n", len(items))
+		fmt.Printf("Average token length: %.2f\n", float64(len(data))/float64(len(items)))
 	}
-	fmt.Println("  ", time.Since(t), "\tstrings.IndexAny")
 
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		sz.IndexAny(str, pat)
+	// In Go, a string is a (pointer, length) pair. Passing a string around copies the
+	// pointer and the length, but not the bytes they refer to, so the items split out
+	// above can still point into the memory of `data`. That is a problem for our benchmark:
+	// a substring search could then reduce to checking whether a pointer falls in a range.
+	// To avoid that, copy every item to `[]byte` and back to force a new allocation.
+	for i, item := range items {
+		items[i] = string([]byte(item))
 	}
-	fmt.Println("  ", time.Since(t), "\tsz.IndexAny")
 
-	str = strings.Repeat("0123456789", 100000) + "something"
-	pat = "123456789"
-	fmt.Println("Count")
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		strings.Count(str, pat)
+	// Create a seeded random number generator; a zero seed is replaced with the current time.
+	if *seedInt == 0 {
+		*seedInt = time.Now().UnixNano()
 	}
-	fmt.Println("  ", time.Since(t), "\tstrings.Count")
-
-	t = time.Now()
-	for i := 0; i < 1; i++ {
-		sz.Count(str, pat, false)
+	generator := rand.New(rand.NewSource(*seedInt))
+	randomItem := func() string {
+		return items[generator.Intn(len(items))]
 	}
-	fmt.Println("  ", time.Since(t), "\tsz.Count")
 
+	fmt.Println("Running benchmark using `testing.Benchmark`.")
+
+	runBenchmark("strings.Contains", func() bool {
+		return strings.Contains(data, randomItem())
+	})
+	runBenchmark("sz.Contains", func() bool {
+		return sz.Contains(data, randomItem())
+	})
+	runBenchmark("strings.Index", func() int {
+		return strings.Index(data, randomItem())
+	})
+	runBenchmark("sz.Index", func() int64 {
+		return sz.Index(data, randomItem())
+	})
+	runBenchmark("strings.LastIndex", func() int {
+		return strings.LastIndex(data, randomItem())
+	})
+	runBenchmark("sz.LastIndex", func() int64 {
+		return sz.LastIndex(data, randomItem())
+	})
+	runBenchmark("strings.IndexAny", func() int {
+		return strings.IndexAny(randomItem(), "*^")
+	})
+	runBenchmark("sz.IndexAny", func() int64 {
+		return sz.IndexAny(randomItem(), "*^")
+	})
+	runBenchmark("strings.Count", func() int {
+		return strings.Count(data, randomItem())
+	})
+	runBenchmark("sz.Count (non-overlap)", func() int64 {
+		return sz.Count(data, randomItem(), false)
+	})
+	runBenchmark("sz.Count (overlap)", func() int64 {
+		return sz.Count(data, randomItem(), true)
+	})
 }