|
1 | 1 | package main
|
2 | 2 |
|
3 | 3 | import (
|
| 4 | + "flag" |
4 | 5 | "fmt"
|
| 6 | + "math/rand" |
| 7 | + "os" |
5 | 8 | "strings"
|
| 9 | + "testing" |
6 | 10 | "time"
|
7 | 11 |
|
8 |
| - sz "../go/stringzilla" |
| 12 | + sz "github.com/ashvardanian/stringzilla/golang" |
9 | 13 | )
|
10 | 14 |
|
11 |
| -func main() { |
| 15 | +var sink any //? Global sink to defeat dead-code elimination |
12 | 16 |
|
13 |
| - str := strings.Repeat("0123456789", 10000) + "something" |
14 |
| - pat := "some" |
| 17 | +// Repeats a certain function `f` multiple times and prints the benchmark results. |
| 18 | +func runBenchmark[T any](name string, f func() T) { |
| 19 | + benchResult := testing.Benchmark(func(b *testing.B) { |
| 20 | + for i := 0; i < b.N; i++ { |
| 21 | + sink = f() |
| 22 | + } |
| 23 | + }) |
| 24 | + fmt.Printf("%-30s: %s\n", name, benchResult.String()) |
| 25 | +} |
15 | 26 |
|
16 |
| - fmt.Println("Contains") |
17 |
| - t := time.Now() |
18 |
| - for i := 0; i < 1; i++ { |
19 |
| - strings.Contains(str, pat) |
20 |
| - } |
21 |
| - fmt.Println(" ", time.Since(t), "\tstrings.Contains") |
| 27 | +func main() { |
22 | 28 |
|
23 |
| - t = time.Now() |
24 |
| - for i := 0; i < 1; i++ { |
25 |
| - sz.Contains(str, pat) |
26 |
| - } |
27 |
| - fmt.Println(" ", time.Since(t), "\tsz.Contains") |
| 29 | + // Define command-line flags. |
| 30 | + inputPath := flag.String("input", "", "Path to input file for benchmarking. (Required)") |
| 31 | + seedInt := flag.Int64("seed", 0, "Seed for the random number generator. If 0, the current time is used.") |
| 32 | + splitMode := flag.String("split", "tokens", "How to split input file: 'tokens' (default) or 'lines'.") |
| 33 | + flag.Parse() |
28 | 34 |
|
29 |
| - fmt.Println("Index") |
30 |
| - t = time.Now() |
31 |
| - for i := 0; i < 1; i++ { |
32 |
| - strings.Index(str, pat) |
| 35 | + // Ensure input file is provided. |
| 36 | + if *inputPath == "" { |
| 37 | + fmt.Fprintln(os.Stderr, "Error: input file must be specified using the -input flag.") |
| 38 | + flag.Usage() |
| 39 | + os.Exit(1) |
33 | 40 | }
|
34 |
| - fmt.Println(" ", time.Since(t), "\tstrings.Index") |
35 | 41 |
|
36 |
| - t = time.Now() |
37 |
| - for i := 0; i < 1; i++ { |
38 |
| - sz.Index(str, pat) |
| 42 | + // Read input data from file. |
| 43 | + bytes, err := os.ReadFile(*inputPath) |
| 44 | + if err != nil { |
| 45 | + fmt.Fprintf(os.Stderr, "Error reading input file: %v\n", err) |
| 46 | + os.Exit(1) |
39 | 47 | }
|
40 |
| - fmt.Println(" ", time.Since(t), "\tsz.Index") |
| 48 | + data := string(bytes) |
| 49 | + fmt.Printf("Benchmarking on `%s` with seed %d.\n", *inputPath, *seedInt) |
| 50 | + fmt.Printf("Total input length: %d\n", len(data)) |
41 | 51 |
|
42 |
| - fmt.Println("IndexAny") |
43 |
| - t = time.Now() |
44 |
| - for i := 0; i < 1; i++ { |
45 |
| - strings.IndexAny(str, pat) |
| 52 | + // Split the data into items based on the chosen mode. |
| 53 | + var items []string |
| 54 | + switch *splitMode { |
| 55 | + case "lines": |
| 56 | + rawLines := strings.Split(data, "\n") |
| 57 | + // Filter out empty lines. |
| 58 | + for _, line := range rawLines { |
| 59 | + if line != "" { |
| 60 | + items = append(items, line) |
| 61 | + } |
| 62 | + } |
| 63 | + if len(items) == 0 { |
| 64 | + items = []string{"default"} |
| 65 | + } |
| 66 | + // Print line statistics. |
| 67 | + totalLen := 0 |
| 68 | + for _, line := range items { |
| 69 | + totalLen += len(line) |
| 70 | + } |
| 71 | + fmt.Printf("Total lines: %d\n", len(items)) |
| 72 | + fmt.Printf("Average line length: %.2f\n", float64(totalLen)/float64(len(items))) |
| 73 | + default: // "tokens" or any other value defaults to token mode. |
| 74 | + items = strings.Fields(data) |
| 75 | + if len(items) == 0 { |
| 76 | + items = []string{"default"} |
| 77 | + } |
| 78 | + fmt.Printf("Total tokens: %d\n", len(items)) |
| 79 | + fmt.Printf("Average token length: %.2f\n", float64(len(data))/float64(len(items))) |
46 | 80 | }
|
47 |
| - fmt.Println(" ", time.Since(t), "\tstrings.IndexAny") |
48 | 81 |
|
49 |
| - t = time.Now() |
50 |
| - for i := 0; i < 1; i++ { |
51 |
| - sz.IndexAny(str, pat) |
| 82 | + // In Go, a string is represented as a (length, data) pair. If you pass a string around, |
| 83 | + // Go will copy the length and the pointer but not the data pointed to. |
| 84 | + // It's problematic for our benchmark as it makes substring operations meaningless - |
| 85 | + // just comparing if a pointer falls in the range. |
| 86 | + // To avoid that, let's copy strings to `[]byte` and back to force a new allocation. |
| 87 | + for i, item := range items { |
| 88 | + items[i] = string([]byte(item)) |
52 | 89 | }
|
53 |
| - fmt.Println(" ", time.Since(t), "\tsz.IndexAny") |
54 | 90 |
|
55 |
| - str = strings.Repeat("0123456789", 100000) + "something" |
56 |
| - pat = "123456789" |
57 |
| - fmt.Println("Count") |
58 |
| - t = time.Now() |
59 |
| - for i := 0; i < 1; i++ { |
60 |
| - strings.Count(str, pat) |
| 91 | + // Create a seeded reproducible random number generator. |
| 92 | + if *seedInt == 0 { |
| 93 | + *seedInt = time.Now().UnixNano() |
61 | 94 | }
|
62 |
| - fmt.Println(" ", time.Since(t), "\tstrings.Count") |
63 |
| - |
64 |
| - t = time.Now() |
65 |
| - for i := 0; i < 1; i++ { |
66 |
| - sz.Count(str, pat, false) |
| 95 | + generator := rand.New(rand.NewSource(*seedInt)) |
| 96 | + randomItem := func() string { |
| 97 | + return items[generator.Intn(len(items))] |
67 | 98 | }
|
68 |
| - fmt.Println(" ", time.Since(t), "\tsz.Count") |
69 | 99 |
|
| 100 | + fmt.Println("Running benchmark using `testing.Benchmark`.") |
| 101 | + |
| 102 | + runBenchmark("strings.Contains", func() bool { |
| 103 | + return strings.Contains(data, randomItem()) |
| 104 | + }) |
| 105 | + runBenchmark("sz.Contains", func() bool { |
| 106 | + return sz.Contains(data, randomItem()) |
| 107 | + }) |
| 108 | + runBenchmark("strings.Index", func() int { |
| 109 | + return strings.Index(data, randomItem()) |
| 110 | + }) |
| 111 | + runBenchmark("sz.Index", func() int64 { |
| 112 | + return sz.Index(data, randomItem()) |
| 113 | + }) |
| 114 | + runBenchmark("strings.LastIndex", func() int { |
| 115 | + return strings.LastIndex(data, randomItem()) |
| 116 | + }) |
| 117 | + runBenchmark("sz.LastIndex", func() int64 { |
| 118 | + return sz.LastIndex(data, randomItem()) |
| 119 | + }) |
| 120 | + runBenchmark("strings.IndexAny", func() int { |
| 121 | + return strings.IndexAny(randomItem(), "*^") |
| 122 | + }) |
| 123 | + runBenchmark("sz.IndexAny", func() int64 { |
| 124 | + return sz.IndexAny(randomItem(), "*^") |
| 125 | + }) |
| 126 | + runBenchmark("strings.Count", func() int { |
| 127 | + return strings.Count(data, randomItem()) |
| 128 | + }) |
| 129 | + runBenchmark("sz.Count (non-overlap)", func() int64 { |
| 130 | + return sz.Count(data, randomItem(), false) |
| 131 | + }) |
| 132 | + runBenchmark("sz.Count (overlap)", func() int64 { |
| 133 | + return sz.Count(data, randomItem(), true) |
| 134 | + }) |
70 | 135 | }
|
0 commit comments