Skip to content

Commit b08ef8c

Browse files
authored
Merge branch 'main' into main
2 parents 303c981 + 80666ca commit b08ef8c

File tree

5 files changed

+91
-46
lines changed

5 files changed

+91
-46
lines changed

Cargo.lock

Lines changed: 11 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ default-run = "kirby"
1010
argparse = "0.2.2"
1111
aws_lambda_events = "0.16.0"
1212
enum-map = { version = "0.4.1", features = ["serde"] }
13-
flate2 = { version = "1.0", features = ["zlib"], default-features = false }
13+
flate2 = { version = "1.0", features = ["zlib-rs"], default-features = false }
1414
lambda_runtime = { version = "0.13.0", features = ["tracing"] }
1515
lazy_static = "1.1.0"
1616
log = "0.4.5"

README.md

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,22 +14,29 @@ This is... very good. For comparison, a Python script that used AWS Glue to do s
1414

1515
Then Rust got more optimized and Apple released the M1, and it got still faster. Finally, I found the [profile-guided optimization](https://doc.rust-lang.org/rustc/profile-guided-optimization.html) docs, and it improved even more than I thought possible.
1616

17+
Most recently, it also turned out there was [a highly contended mutex around the regular expressions](https://github.com/rubytogether/kirby/pull/37) and that bought the multi-core version something like 40-60% more speed.
18+
1719
### Wait, _how_ fast?
1820

19-
~525 records/second/cpu in Python on Apache Spark in AWS Glue
20-
~14,000 records/second/cpu in Ruby on a 2018 Intel MacBook Pro
21-
~353,000 records/second/cpu in Rust on a 2018 Intel MacBook Pro
22-
~550,000 records/second/cpu in Rust on a 2021 M1 MacBook Pro
23-
~638,000 records/second/cpu in Rust on M1 with profile-guided optimization
21+
~525 records/second/cpu in Python on Apache Spark in AWS Glue
22+
~14,000 records/second/cpu in Ruby on a 2018 Intel MacBook Pro
23+
~353,000 records/second/cpu in Rust on a 2018 Intel MacBook Pro
24+
~550,000 records/second/cpu in Rust on a 2021 M1 MacBook Pro
25+
~638,000 records/second/cpu in Rust on a 2021 M1 with PGO
26+
~935,500 records/second/cpu in Rust on a 2025 M4 Max MacBook Pro
27+
~983,500 records/second/cpu in Rust on a 2025 M4 Max with PGO
28+
~1,240,000 records/second/cpu in Rust on a 2024 Ryzen 9 9950X with PGO
2429

2530
### Are you kidding me?
2631

27-
No. The latest version (which I am now benchmarking without also running `cargo build` 🤦🏻‍♂️) can parse records really, really fast.
32+
No. The latest version can parse records really, really fast.
2833

29-
~4,200 records/second in Python with 8 worker instances on AWS Glue
30-
~1,085,000 records/second in Rust with rayon on an 8-core Intel MacBook Pro
31-
~3,195,000 records/second in Rust with rayon on a 10-core M1 MacBook Pro
32-
~3,583,000 records/second in Rust with rayon on M1 with profile-guided optimization
34+
~4,200 records/second in Python with 8 worker instances on AWS Glue
35+
~1,085,000 records/second in Rust with rayon on an 8-core Intel MacBook Pro
36+
~3,195,000 records/second in Rust with rayon on a 10-core M1 MacBook Pro
37+
~3,583,000 records/second in Rust with rayon on a 2021 M1 with PGO
38+
~10,789,000 records/second in Rust with rayon on a 16-core M4 Max with PGO
39+
~22,559,000 records/second in Rust with rayon on a 32-thread Ryzen 9 9950X with PGO
3340

3441
### What does it calculate?
3542

bin/bench

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ if [[ -n "${PGO:-}" ]]; then
2222

2323
# STEP 1: Build the instrumented binaries
2424
RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data -Cllvm-args=-vp-counters-per-site=4" \
25-
cargo build --release
25+
cargo build --release --bin kirby
2626

2727
# STEP 2: Run the instrumented binaries with some typical data
28-
ls logs/* | xargs -L1 target/release/kirby
28+
ls logs/*.gz | xargs -I{} target/release/kirby "{}"
2929

3030
# STEP 3: Merge the `.profraw` files into a `.profdata` file
3131
$LLVM_PROFDATA merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data

src/lib.rs

Lines changed: 60 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,7 @@ pub fn print_unknown_user_agents(path: &str, opts: &Options) {
132132
});
133133
}
134134

135-
pub fn count_line(ctx: &user_agent::ParseCtx, times: &mut TimeMap, line: String) {
135+
pub fn count_line(ctx: &user_agent::ParseCtx, times: &mut TimeMap, line: &str) {
136136
let r: request::Request = serde_json::from_str(&line).unwrap();
137137

138138
if duplicate_request(&r) {
@@ -157,13 +157,26 @@ pub fn count_line(ctx: &user_agent::ParseCtx, times: &mut TimeMap, line: String)
157157
}
158158
}
159159

160-
pub fn stream_stats(stream: Box<dyn BufRead>, opts: &Options) -> TimeMap {
160+
pub fn stream_stats<'a>(mut stream: Box<dyn BufRead + 'a>, opts: &Options) -> TimeMap {
161161
let mut times = TimeMap::default();
162162
let mut lineno = 0;
163163

164164
let ctx = user_agent::ParseCtx::new();
165+
let mut line = String::with_capacity(1024*1024);
166+
167+
loop {
168+
line.clear();
169+
match stream.read_line(&mut line) {
170+
Ok(0) => break,
171+
Ok(_) => {}
172+
Err(e) => {
173+
if opts.verbose {
174+
eprintln!("Failed to read line:\n {}", e);
175+
}
176+
continue;
177+
}
178+
}
165179

166-
stream.lines().for_each(|line| {
167180
if opts.verbose {
168181
lineno += 1;
169182
if lineno % 100_000 == 0 {
@@ -172,17 +185,8 @@ pub fn stream_stats(stream: Box<dyn BufRead>, opts: &Options) -> TimeMap {
172185
}
173186
}
174187

175-
match line {
176-
Ok(l) => {
177-
count_line(&ctx, &mut times, l);
178-
}
179-
Err(e) => {
180-
if opts.verbose {
181-
eprintln!("Failed to read line:\n {}", e);
182-
}
183-
}
184-
}
185-
});
188+
count_line(&ctx, &mut times, line.as_str());
189+
}
186190

187191
if opts.verbose {
188192
println!();
@@ -196,3 +200,45 @@ pub fn file_stats(path: &str, opts: &Options) -> TimeMap {
196200
let file_stream = file::reader(path, opts);
197201
stream_stats(file_stream, opts)
198202
}
203+
204+
205+
#[cfg(test)]
206+
mod tests {
207+
extern crate test;
208+
209+
use super::*;
210+
use std::fs::File;
211+
use std::io::BufReader;
212+
use test::Bencher;
213+
214+
#[test]
215+
fn test_stream_stats() {
216+
let file = File::open("test/sample_500.log").unwrap();
217+
let reader = BufReader::new(file);
218+
let opts = Options {
219+
verbose: false,
220+
unknown: false,
221+
paths: vec![],
222+
};
223+
let times = stream_stats(Box::new(reader), &opts);
224+
assert_eq!(times.len(), 45);
225+
}
226+
227+
#[bench]
228+
fn bench_stream_stats_sample_500(b: &mut Bencher) {
229+
let mut logs = Vec::new();
230+
File::open("test/sample_500.log")
231+
.unwrap()
232+
.read_to_end(&mut logs)
233+
.unwrap();
234+
let opts = Options {
235+
verbose: false,
236+
unknown: false,
237+
paths: vec![],
238+
};
239+
b.iter(|| {
240+
let reader = Box::new(BufReader::new(logs.as_slice()));
241+
stream_stats(reader, &opts);
242+
});
243+
}
244+
}

0 commit comments

Comments
 (0)