From 906144acbd805796c60b9e051461ec35f015044d Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Tue, 8 Feb 2022 21:23:53 -0800
Subject: [PATCH 01/14] add runtime dataset fetch and parse in-place

---
 .gitignore                     |   5 -
 benchmarks/.gitignore          |   3 +
 benchmarks/Cargo.toml          |   4 +
 benchmarks/benches/datasets.rs | 186 +++++++++++++++++++++
 benchmarks/benches/lib.rs      | 288 +++++++++++++--------------------
 benchmarks/build.rs            |  80 ---------
 6 files changed, 308 insertions(+), 258 deletions(-)
 create mode 100644 benchmarks/.gitignore
 create mode 100644 benchmarks/benches/datasets.rs
diff --git a/.gitignore b/.gitignore
index 75d4b9447..4fffb2f89 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,2 @@
 /target
 /Cargo.lock
-
-# This is generated by the benchmarks crate build script, do not version with git.
-/benchmarks/benches/datasets_paths.rs
-/benchmarks/target
-/benchmarks/Cargo.lock
diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
new file mode 100644
index 000000000..7adbbb807
--- /dev/null
+++ b/benchmarks/.gitignore
@@ -0,0 +1,3 @@
+/target
+/Cargo.lock
+/real-roaring-datasets
\ No newline at end of file
diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 924794ecb..a6a5326d7 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -10,6 +10,10 @@ publish = false
 roaring = { path = ".." }
 
 [dev-dependencies]
+once_cell = "1.9.0"
+git2 = "0.13.25"
+zip = "0.5.13"
+indicatif = "0.16.2"
 criterion = { version = "0.3", features = ["html_reports"] }
 quickcheck = "0.9"
 quickcheck_macros = "0.9"
diff --git a/benchmarks/benches/datasets.rs b/benchmarks/benches/datasets.rs
new file mode 100644
index 000000000..c814c23ab
--- /dev/null
+++ b/benchmarks/benches/datasets.rs
@@ -0,0 +1,186 @@
+use std::env;
+use std::fs::File;
+use std::io::BufReader;
+use std::path::{Path, PathBuf};
+
+use git2::FetchOptions;
+use once_cell::sync::OnceCell as SyncOnceCell;
+
+use roaring::RoaringBitmap;
+
+static INSTANCE: SyncOnceCell<Vec<Dataset>> = SyncOnceCell::new();
+
+pub struct Datasets;
+
+pub struct DatasetsIter {
+    iter: std::slice::Iter<'static, Dataset>,
+}
+
+impl Iterator for DatasetsIter {
+    type Item = &'static Dataset;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next()
+    }
+}
+
+impl IntoIterator for Datasets {
+    type Item = &'static Dataset;
+    type IntoIter = DatasetsIter;
+
+    fn into_iter(self) -> Self::IntoIter {
+        DatasetsIter {
+            iter: INSTANCE
+                .get_or_init(|| {
+                    init_datasets().and_then(parse_datasets).expect("a collection of datasets")
+                })
+                .iter(),
+        }
+    }
+}
+
+pub struct Dataset {
+    pub name: String,
+    pub bitmaps: Vec<RoaringBitmap>,
+}
+
+fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> {
+    let out_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?;
+
+    let out_path = Path::new(&out_dir);
+    let repo_path = out_path.join("real-roaring-datasets");
+
+    // Setup progress callbacks
+
+    let pb_cell = once_cell::unsync::OnceCell::new();
+    let mut cb = git2::RemoteCallbacks::new();
+
+    cb.transfer_progress(|progress| {
+        let pb = pb_cell.get_or_init(|| {
+            indicatif::ProgressBar::new(progress.total_objects() as u64)
+                .with_style(
+                    indicatif::ProgressStyle::default_bar()
+                        .template(&format!(
+                            "{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}",
+                            progress.total_objects()
+                        ))
+                        .progress_chars("#> "),
+                )
+                .with_prefix("    ")
+                .with_message("Recieving objects")
+        });
+
+        pb.set_position((progress.local_objects() + progress.received_objects()) as u64);
+        true
+    });
+
+    let mut fetch_opts = FetchOptions::new();
+    fetch_opts.remote_callbacks(cb);
+
+    // Clone the datasets on first use; otherwise fetch origin/master and hard-reset to it
+
+    if !Path::new(&repo_path).exists() {
+        git2::build::RepoBuilder::new()
+            .fetch_options(fetch_opts)
+            .clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?;
+    } else {
+        let repo = git2::Repository::open(&repo_path)?;
+        repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?;
+
+        let head = repo.head()?.peel_to_commit()?;
+        let origin_master_head = repo
+            .find_branch("origin/master", git2::BranchType::Remote)?
+            .into_reference()
+            .peel_to_commit()?;
+
+        if head.id() != origin_master_head.id() {
+            repo.reset(origin_master_head.as_object(), git2::ResetType::Hard, None)?;
+        }
+    }
+
+    if let Some(pb) = pb_cell.get() {
+        pb.finish()
+    }
+
+    Ok(repo_path)
+}
+
+fn parse_datasets<P: AsRef<Path>>(path: P) -> Result<Vec<Dataset>, Box<dyn std::error::Error>> {
+    const DATASET_FILENAME_WHITELIST: &[&str] = &[
+        "census-income.zip",
+        "census-income_srt.zip",
+        "census1881.zip",
+        "census1881_srt.zip",
+        "weather_sept_85.zip",
+        "weather_sept_85_srt.zip",
+        "wikileaks-noquotes.zip",
+        "wikileaks-noquotes_srt.zip",
+    ];
+
+    use indicatif::{ProgressBar, ProgressStyle};
+    use std::io::BufRead;
+    use zip::ZipArchive;
+
+    let dir = path.as_ref().read_dir()?;
+
+    let mut datasets = Vec::new();
+
+    // Future work: Reuse this buffer to parse croaring bitmaps for comparison
+    let mut numbers = Vec::new();
+
+    for dir_entry_result in dir {
+        let dir_entry = dir_entry_result?;
+        let metadata = dir_entry.metadata()?;
+        let file_name = dir_entry.file_name();
+        // TODO: don't panic on a non-UTF-8 file name; return an error instead
+        let file_name_str = file_name.to_str().expect("utf-8 filename");
+
+        if metadata.is_file() && DATASET_FILENAME_WHITELIST.contains(&file_name_str) {
+            let file = File::open(dir_entry.path())?;
+            let name = file_name_str.split_at(file_name_str.len() - ".zip".len()).0.to_string();
+
+            let mut zip = ZipArchive::new(file)?;
+
+            let mut total_size = 0;
+            for i in 0..zip.len() {
+                let file = zip.by_index(i)?;
+                total_size += file.size();
+            }
+
+            let pb = ProgressBar::new(total_size)
+                .with_style(
+                    ProgressStyle::default_bar()
+                        .template("    {prefix:.green} [{bar}] {msg}")
+                        .progress_chars("#> "),
+                )
+                .with_prefix("Parsing")
+                .with_message(name.clone());
+
+            let mut bitmaps = Vec::with_capacity(zip.len());
+            for i in 0..zip.len() {
+                let file = zip.by_index(i)?;
+                let size = file.size();
+                let buf = BufReader::new(file);
+
+                for bytes in buf.split(b',') {
+                    let bytes = bytes?;
+                    let str = String::from_utf8(bytes)?;
+                    let n = str.trim().parse::<u32>()?;
+                    numbers.push(n);
+                }
+
+                let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?;
+                numbers.clear();
+                bitmaps.push(bitmap);
+
+                pb.set_position(pb.position() + size);
+            }
+
+            pb.finish();
+            datasets.push(Dataset { name, bitmaps });
+        }
+    }
+    datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name));
+    println!();
+    Ok(datasets)
+}
diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs
index 25e52e2c7..bff39f76f 100644
--- a/benchmarks/benches/lib.rs
+++ b/benchmarks/benches/lib.rs
@@ -1,14 +1,16 @@
-mod datasets_paths;
-
 use std::cmp::Reverse;
-use std::convert::TryInto;
 use std::num::ParseIntError;
 use std::path::{Path, PathBuf};
 use std::{fs, io};
 
-use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
+
 use roaring::RoaringBitmap;
 
+use crate::datasets::Datasets;
+
+mod datasets;
+
 fn create(c: &mut Criterion) {
     c.bench_function("create", |b| {
         b.iter(|| {
@@ -96,34 +98,24 @@ fn len(c: &mut Criterion) {
 }
 
 fn rank(c: &mut Criterion) {
-    let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT;
-    let parsed_numbers = parse_dir_files(files).unwrap();
-
-    // Cache len to prevent len calculation from effecting benchmark
-    let bitmaps: Vec<_> = parsed_numbers
-        .into_iter()
-        .map(|(_, r)| {
-            r.map(|iter| {
-                let bitmap = RoaringBitmap::from_sorted_iter(iter).unwrap();
-                let len: u32 = bitmap.len().try_into().expect("len <= u32::MAX");
-                (bitmap, len)
-            })
-            .unwrap()
-        })
-        .collect();
-
-    // Rank all multiples of 100 < bitmap.len()
-    // Mupliplier chosen arbitrarily, but should be sure not to rank many values > len()
-    // Doing so would degenerate into benchmarking len()
-    c.bench_function("rank", |b| {
-        b.iter(|| {
-            for (bitmap, len) in bitmaps.iter() {
-                for i in (0..*len).step_by(100) {
-                    black_box(bitmap.rank(i));
+    let mut group = c.benchmark_group("rank");
+    for dataset in Datasets {
+        let bitmaps =
+            dataset.bitmaps.iter().map(|bitmap| (bitmap, bitmap.len() as u32)).collect::<Vec<_>>();
+
+        // Rank all multiples of 100 < bitmap.len()
+        // Multiplier chosen arbitrarily, but should be sure not to rank many values > len()
+        // Doing so would degenerate into benchmarking len()
+        group.bench_function(BenchmarkId::new("rank", &dataset.name), |b| {
+            b.iter(|| {
+                for (bitmap, len) in bitmaps.iter() {
+                    for i in (0..*len).step_by(100) {
+                        black_box(bitmap.rank(i));
+                    }
                 }
-            }
+            });
         });
-    });
+    }
 }
 
 fn and(c: &mut Criterion) {
@@ -277,66 +269,18 @@ fn insert_range_bitmap(c: &mut Criterion) {
 }
 
 fn iter(c: &mut Criterion) {
-    c.bench_function("iter bitmap 1..10_000", |b| {
-        let bitmap: RoaringBitmap = (1..10_000).collect();
-        b.iter(|| {
-            bitmap.iter().for_each(|i| {
-                black_box(i);
-            });
-        });
-    });
-
-    c.bench_function("iter bitmap sparse", |b| {
-        let bitmap: RoaringBitmap = (0..1 << 16).step_by(61).collect();
-        b.iter(|| {
-            bitmap.iter().for_each(|i| {
-                black_box(i);
+    let mut group = c.benchmark_group("iter");
+    for dataset in Datasets {
+        group.bench_function(BenchmarkId::new("iter", &dataset.name), |b| {
+            b.iter(|| {
+                dataset.bitmaps.iter().flat_map(|bitmap| bitmap.iter()).for_each(|i| {
+                    black_box(i);
+                });
             });
         });
-    });
-
-    c.bench_function("iter bitmap dense", |b| {
-        let bitmap: RoaringBitmap = (0..1 << 16).step_by(2).collect();
-        b.iter(|| {
-            bitmap.iter().for_each(|i| {
-                black_box(i);
-            });
-        });
-    });
-
-    c.bench_function("iter bitmap minimal", |b| {
-        let bitmap: RoaringBitmap = (0..4096).collect();
-        b.iter(|| {
-            bitmap.iter().for_each(|i| {
-                black_box(i);
-            });
-        });
-    });
-
-    c.bench_function("iter bitmap full", |b| {
-        let bitmap: RoaringBitmap = (0..1 << 16).collect();
-        b.iter(|| {
-            bitmap.iter().for_each(|i| {
-                black_box(i);
-            });
-        });
-    });
-
-    c.bench_function("iter parsed", |b| {
-        let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT;
-        let parsed_numbers = parse_dir_files(files).unwrap();
-
-        let bitmaps: Vec<_> = parsed_numbers
-            .into_iter()
-            .map(|(_, r)| r.map(|iter| RoaringBitmap::from_sorted_iter(iter).unwrap()).unwrap())
-            .collect();
+    }
 
-        b.iter(|| {
-            bitmaps.iter().flat_map(|bitmap| bitmap.iter()).for_each(|i| {
-                black_box(i);
-            });
-        });
-    });
+    group.finish();
 }
 
 fn is_empty(c: &mut Criterion) {
@@ -417,112 +361,110 @@ fn parse_dir_files<A: AsRef<Path>>(
 }
 
 fn from_sorted_iter(c: &mut Criterion) {
-    let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT;
-    let parsed_numbers = parse_dir_files(files).unwrap();
+    let mut group = c.benchmark_group("from_sorted_iter");
+
+    for dataset in Datasets {
+        let dataset_numbers = dataset
+            .bitmaps
+            .iter()
+            .map(|bitmap| bitmap.iter().collect::<Vec<_>>())
+            .collect::<Vec<_>>();
+
+        group.bench_function(BenchmarkId::new("from_sorted_iter", &dataset.name), |b| {
+            b.iter(|| {
+                for bitmap_numbers in &dataset_numbers {
+                    RoaringBitmap::from_sorted_iter(bitmap_numbers.iter().copied()).unwrap();
+                }
+            })
+        });
+    }
 
-    c.bench_function("from_sorted_iter", |b| {
-        b.iter(|| {
-            for (_, numbers) in &parsed_numbers {
-                let numbers = numbers.as_ref().unwrap();
-                RoaringBitmap::from_sorted_iter(numbers.iter().copied()).unwrap();
-            }
-        })
-    });
+    group.finish();
 }
 
 fn successive_and(c: &mut Criterion) {
-    let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT;
-    let parsed_numbers = parse_dir_files(files).unwrap();
-
-    let mut bitmaps: Vec<_> = parsed_numbers
-        .into_iter()
-        .map(|(_, r)| r.map(|iter| RoaringBitmap::from_sorted_iter(iter).unwrap()).unwrap())
-        .collect();
-
-    // biggest bitmaps first.
-    bitmaps.sort_unstable_by_key(|b| Reverse(b.len()));
-
     let mut group = c.benchmark_group("Successive And");
 
-    group.bench_function("Successive And Assign Ref", |b| {
-        b.iter_batched(
-            || bitmaps.clone(),
-            |bitmaps| {
-                let mut iter = bitmaps.into_iter();
-                let mut first = iter.next().unwrap().clone();
-                for bitmap in iter {
-                    first &= bitmap;
-                }
-            },
-            BatchSize::LargeInput,
-        );
-    });
+    for dataset in Datasets {
+        // biggest bitmaps first.
+        let mut sorted_bitmaps = dataset.bitmaps.clone();
+        sorted_bitmaps.sort_unstable_by_key(|b| Reverse(b.len()));
 
-    group.bench_function("Successive And Assign Owned", |b| {
-        b.iter_batched(
-            || bitmaps.clone(),
-            |bitmaps| {
-                black_box(bitmaps.into_iter().reduce(|a, b| a & b).unwrap());
-            },
-            BatchSize::LargeInput,
-        );
-    });
+        group.bench_function(BenchmarkId::new("Successive And Assign Ref", &dataset.name), |b| {
+            b.iter_batched(
+                || sorted_bitmaps.clone(),
+                |bitmaps| {
+                    let mut iter = bitmaps.into_iter();
+                    let mut first = iter.next().unwrap().clone();
+                    for bitmap in iter {
+                        first &= bitmap;
+                    }
+                },
+                BatchSize::LargeInput,
+            );
+        });
 
-    group.bench_function("Successive And Ref Ref", |b| {
-        b.iter_batched(
-            || bitmaps.clone(),
-            |bitmaps| {
-                let mut iter = bitmaps.iter();
-                let first = iter.next().unwrap().clone();
-                black_box(iter.fold(first, |acc, x| (&acc) & x));
-            },
-            BatchSize::LargeInput,
-        );
-    });
+        group.bench_function(BenchmarkId::new("Successive And Assign Owned", &dataset.name), |b| {
+            b.iter_batched(
+                || sorted_bitmaps.clone(),
+                |bitmaps| {
+                    black_box(bitmaps.into_iter().reduce(|a, b| a & b).unwrap());
+                },
+                BatchSize::LargeInput,
+            );
+        });
+
+        group.bench_function(BenchmarkId::new("Successive And Ref Ref", &dataset.name), |b| {
+            b.iter_batched(
+                || sorted_bitmaps.clone(),
+                |bitmaps| {
+                    let mut iter = bitmaps.iter();
+                    let first = iter.next().unwrap().clone();
+                    black_box(iter.fold(first, |acc, x| (&acc) & x));
+                },
+                BatchSize::LargeInput,
+            );
+        });
+    }
 
     group.finish();
 }
 
 fn successive_or(c: &mut Criterion) {
-    let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT;
-    let parsed_numbers = parse_dir_files(files).unwrap();
-
-    let bitmaps: Vec<_> = parsed_numbers
-        .into_iter()
-        .map(|(_, r)| r.map(|iter| RoaringBitmap::from_sorted_iter(iter).unwrap()).unwrap())
-        .collect();
-
     let mut group = c.benchmark_group("Successive Or");
-    group.bench_function("Successive Or Assign Ref", |b| {
-        b.iter(|| {
-            let mut output = RoaringBitmap::new();
-            for bitmap in &bitmaps {
-                output |= bitmap;
-            }
-        });
-    });
 
-    group.bench_function("Successive Or Assign Owned", |b| {
-        b.iter_batched(
-            || bitmaps.clone(),
-            |bitmaps: Vec<RoaringBitmap>| {
+    for dataset in Datasets {
+        group.bench_function(BenchmarkId::new("Successive Or Assign Ref", &dataset.name), |b| {
+            b.iter(|| {
                 let mut output = RoaringBitmap::new();
-                for bitmap in bitmaps {
+                for bitmap in &dataset.bitmaps {
                     output |= bitmap;
                 }
-            },
-            BatchSize::LargeInput,
-        );
-    });
+            });
+        });
 
-    group.bench_function("Successive Or Ref Ref", |b| {
-        b.iter(|| {
-            let mut output = RoaringBitmap::new();
-            for bitmap in &bitmaps {
-                output = (&output) | bitmap;
-            }
+        group.bench_function(BenchmarkId::new("Successive Or Assign Owned", &dataset.name), |b| {
+            b.iter_batched(
+                || dataset.bitmaps.clone(),
+                |bitmaps: Vec<RoaringBitmap>| {
+                    let mut output = RoaringBitmap::new();
+                    for bitmap in bitmaps {
+                        output |= bitmap;
+                    }
+                },
+                BatchSize::LargeInput,
+            );
         });
-    });
+
+        group.bench_function(BenchmarkId::new("Successive Or Ref Ref", &dataset.name), |b| {
+            b.iter(|| {
+                let mut output = RoaringBitmap::new();
+                for bitmap in &dataset.bitmaps {
+                    output = (&output) | bitmap;
+                }
+            });
+        });
+    }
 
     group.finish();
 }
diff --git a/benchmarks/build.rs b/benchmarks/build.rs
index 4622b3c9c..8b1378917 100644
--- a/benchmarks/build.rs
+++ b/benchmarks/build.rs
@@ -1,81 +1 @@
-use std::fs::File;
-use std::io::{Cursor, Read, Seek, Write};
-use std::path::{Path, PathBuf};
-use std::{env, fs};
 
-use bytes::Bytes;
-use convert_case::{Case, Casing};
-use reqwest::{blocking::get, IntoUrl};
-use zip::read::ZipArchive;
-
-const BASE_URL: &str = "https://github.com/RoaringBitmap/real-roaring-datasets/raw/master/";
-
-// const DATASET_CENSUS_INCOME: &str = "census-income";
-// const DATASET_CENSUS_INCOME_SRT: &str = "census-income_srt";
-// const DATASET_CENSUS1881: &str = "census1881";
-// const DATASET_CENSUS1881_SRT: &str = "census1881_srt";
-// const DATASET_DIMENSION_003: &str = "dimension_003";
-// const DATASET_DIMENSION_008: &str = "dimension_008";
-// const DATASET_DIMENSION_033: &str = "dimension_033";
-// const DATASET_USCENSUS2000: &str = "uscensus2000";
-// const DATASET_WEATHER_SEPT_85: &str = "weather_sept_85";
-// const DATASET_WEATHER_SEPT_85_SRT: &str = "weather_sept_85_srt";
-// const DATASET_WIKILEAKS_NOQUOTES: &str = "wikileaks-noquotes";
-const DATASET_WIKILEAKS_NOQUOTES_SRT: &str = "wikileaks-noquotes_srt";
-
-// const DATASETS: &[&str] = &[
-//     DATASET_CENSUS_INCOME,
-//     DATASET_CENSUS_INCOME_SRT,
-//     DATASET_CENSUS1881,
-//     DATASET_CENSUS1881_SRT,
-//     DATASET_DIMENSION_003,
-//     DATASET_DIMENSION_008,
-//     DATASET_DIMENSION_033,
-//     DATASET_USCENSUS2000,
-//     DATASET_WEATHER_SEPT_85,
-//     DATASET_WEATHER_SEPT_85_SRT,
-//     DATASET_WIKILEAKS_NOQUOTES,
-//     DATASET_WIKILEAKS_NOQUOTES_SRT,
-// ];
-
-fn main() -> anyhow::Result<()> {
-    let out_dir = PathBuf::from(env::var("OUT_DIR")?);
-
-    let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches");
-    let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?;
-
-    writeln!(
-        &mut manifest_paths_file,
-        "// This file is generated by the build script.\n// Do not modify by hand, use the build.rs file.\n"
-    )?;
-
-    #[allow(clippy::single_element_loop)]
-    for dataset in &[DATASET_WIKILEAKS_NOQUOTES_SRT] {
-        let out_path = out_dir.join(dataset);
-        let url = format!("{}/{}.zip", BASE_URL, dataset);
-        let bytes = download_dataset(url)?;
-        unzip_in_folder(bytes, &out_path)?;
-
-        writeln!(
-            &mut manifest_paths_file,
-            r#"pub const {}: &str = {:?};"#,
-            dataset.to_case(Case::ScreamingSnake),
-            out_path.display(),
-        )?;
-    }
-
-    Ok(())
-}
-
-fn download_dataset<U: IntoUrl>(url: U) -> anyhow::Result<Cursor<Bytes>> {
-    let bytes = get(url)?.bytes()?;
-    Ok(Cursor::new(bytes))
-}
-
-fn unzip_in_folder<R: Read + Seek, P: AsRef<Path>>(bytes: R, path: P) -> anyhow::Result<()> {
-    let path = path.as_ref();
-    fs::create_dir_all(path).unwrap();
-    let mut zip = ZipArchive::new(bytes)?;
-    zip.extract(path)?;
-    Ok(())
-}

From 01bf5b624471182ffebe001293e45aad87353d3e Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Tue, 8 Feb 2022 21:56:08 -0800
Subject: [PATCH 02/14] remove build-deps and build.rs

---
 benchmarks/Cargo.toml     |  7 -------
 benchmarks/benches/lib.rs | 14 --------------
 benchmarks/build.rs       |  1 -
 3 files changed, 22 deletions(-)
 delete mode 100644 benchmarks/build.rs

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index a6a5326d7..e82fb9f87 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -18,13 +18,6 @@ criterion = { version = "0.3", features = ["html_reports"] }
 quickcheck = "0.9"
 quickcheck_macros = "0.9"
 
-[build-dependencies]
-anyhow = "1.0"
-bytes = "1.0"
-convert_case = "0.4"
-reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
-zip = "0.5.12"
-
 [features]
 simd = ["roaring/simd"]
 
diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs
index bff39f76f..468bf3ebc 100644
--- a/benchmarks/benches/lib.rs
+++ b/benchmarks/benches/lib.rs
@@ -346,20 +346,6 @@ fn serialized_size(c: &mut Criterion) {
     });
 }
 
-fn extract_integers<A: AsRef<str>>(content: A) -> Result<Vec<u32>, ParseIntError> {
-    content.as_ref().split(',').map(|s| s.trim().parse()).collect()
-}
-
-// Parse every file into a vector of integer.
-fn parse_dir_files<A: AsRef<Path>>(
-    files: A,
-) -> io::Result<Vec<(PathBuf, Result<Vec<u32>, ParseIntError>)>> {
-    fs::read_dir(files)?
-        .map(|r| r.and_then(|e| fs::read_to_string(e.path()).map(|r| (e.path(), r))))
-        .map(|r| r.map(|(p, c)| (p, extract_integers(c))))
-        .collect()
-}
-
 fn from_sorted_iter(c: &mut Criterion) {
     let mut group = c.benchmark_group("from_sorted_iter");
 
diff --git a/benchmarks/build.rs b/benchmarks/build.rs
deleted file mode 100644
index 8b1378917..000000000
--- a/benchmarks/build.rs
+++ /dev/null
@@ -1 +0,0 @@
-

From 6050c684e3afef6aa1e100c387a21ec80523a131 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 02:05:04 -0800
Subject: [PATCH 03/14] cleanup benchmark dependencies

---
 benchmarks/Cargo.toml | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index e82fb9f87..6c5ea887a 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -10,13 +10,11 @@ publish = false
 roaring = { path = ".." }
 
 [dev-dependencies]
-once_cell = "1.9.0"
-git2 = "0.13.25"
-zip = "0.5.13"
-indicatif = "0.16.2"
+once_cell = "1.9"
+git2 = { version = "0.13", default-features = false }
+zip = { version = "0.5", default-features = false, features = ["deflate"] }
+indicatif = "0.16"
 criterion = { version = "0.3", features = ["html_reports"] }
-quickcheck = "0.9"
-quickcheck_macros = "0.9"
 
 [features]
 simd = ["roaring/simd"]

From 1776f50fc06a1fb0e4285cafa5d7ef8900094545 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 02:05:21 -0800
Subject: [PATCH 04/14] fix benchmark warnings

---
 benchmarks/benches/datasets_paths.rs | 4 ++++
 benchmarks/benches/lib.rs            | 5 +----
 2 files changed, 5 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/benches/datasets_paths.rs

diff --git a/benchmarks/benches/datasets_paths.rs b/benchmarks/benches/datasets_paths.rs
new file mode 100644
index 000000000..ca4f3a280
--- /dev/null
+++ b/benchmarks/benches/datasets_paths.rs
@@ -0,0 +1,4 @@
+// This file is generated by the build script.
+// Do not modify by hand, use the build.rs file.
+
+pub const WIKILEAKS_NOQUOTES_SRT: &str = "/Users/joel.pedraza/src/roaring-rs-saik0/benchmarks/target/debug/build/benchmarks-4eb3dbdf81f7da70/out/wikileaks-noquotes_srt";
diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs
index 468bf3ebc..a87ab9c6f 100644
--- a/benchmarks/benches/lib.rs
+++ b/benchmarks/benches/lib.rs
@@ -1,7 +1,4 @@
 use std::cmp::Reverse;
-use std::num::ParseIntError;
-use std::path::{Path, PathBuf};
-use std::{fs, io};
 
 use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion};
 
@@ -381,7 +378,7 @@ fn successive_and(c: &mut Criterion) {
                 || sorted_bitmaps.clone(),
                 |bitmaps| {
                     let mut iter = bitmaps.into_iter();
-                    let mut first = iter.next().unwrap().clone();
+                    let mut first = iter.next().unwrap();
                     for bitmap in iter {
                         first &= bitmap;
                     }

From a6648fe07caa972480ccbbaa6ea2fc9dbb81bbb7 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 02:06:46 -0800
Subject: [PATCH 05/14] remove datasets_paths.rs file

---
 benchmarks/benches/datasets_paths.rs | 4 ----
 1 file changed, 4 deletions(-)
 delete mode 100644 benchmarks/benches/datasets_paths.rs

diff --git a/benchmarks/benches/datasets_paths.rs b/benchmarks/benches/datasets_paths.rs
deleted file mode 100644
index ca4f3a280..000000000
--- a/benchmarks/benches/datasets_paths.rs
+++ /dev/null
@@ -1,4 +0,0 @@
-// This file is generated by the build script.
-// Do not modify by hand, use the build.rs file.
-
-pub const WIKILEAKS_NOQUOTES_SRT: &str = "/Users/joel.pedraza/src/roaring-rs-saik0/benchmarks/target/debug/build/benchmarks-4eb3dbdf81f7da70/out/wikileaks-noquotes_srt";

From 9e4ffa7663551b7525ed2d5768d1831740b78d44 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 03:12:32 -0800
Subject: [PATCH 06/14] add trailing newline to benchmark gitignore

---
 benchmarks/.gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore
index 7adbbb807..4c312bb62 100644
--- a/benchmarks/.gitignore
+++ b/benchmarks/.gitignore
@@ -1,3 +1,3 @@
 /target
 /Cargo.lock
-/real-roaring-datasets
\ No newline at end of file
+/real-roaring-datasets

From 9eacae1387a86a1899c1ef78a22842b94de63dc3 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 03:13:23 -0800
Subject: [PATCH 07/14] remove benchmark src dir

---
 benchmarks/src/lib.rs | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 benchmarks/src/lib.rs

diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs
deleted file mode 100644
index 829adafa0..000000000
--- a/benchmarks/src/lib.rs
+++ /dev/null
@@ -1,5 +0,0 @@
-//! This library is only used to isolate the benchmarks
-//! from the original roaring library.
-//!
-//! It does not include interesting functions for roaring library
-//! users only for roaring contributors.

From b51b93c61bd5656d28a2b26d86a14efefa1728ca Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 05:07:25 -0800
Subject: [PATCH 08/14] add benchmark offline mode; used by CI

---
 .github/workflows/test.yml     | 75 ++++++++++++++++++++++------------
 benchmarks/benches/datasets.rs | 17 ++++++++
 2 files changed, 65 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index aa9e62002..7d64dca1f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -19,66 +19,87 @@ jobs:
           - beta
           - nightly
           - 1.56.1
+    env:
+      RUSTFLAGS: "-C target-cpu=native -C opt-level=3"
+      ROARINGRS_BENCH_OFFLINE: "true"
 
     steps:
-      - uses: actions/checkout@v2
+      - name: Checkout roaring-rs
+        uses: actions/checkout@v2
 
-      - uses: actions-rs/toolchain@v1
+      - name: Checkout benchmark datasets
+        uses: actions/checkout@v2
+        with:
+          repository: "RoaringBitmap/real-roaring-datasets"
+          path: "benchmarks/real-roaring-datasets"
+
+      - name: Initialize rust toolchain
+        uses: actions-rs/toolchain@v1
         with:
           profile: minimal
           toolchain: ${{ matrix.rust }}
           override: true
           components: rustfmt, clippy
 
-      - uses: actions-rs/cargo@v1
+      - name: Build
+        uses: actions-rs/cargo@v1
         with:
           command: build
+          args: --all-targets
 
-      - uses: actions-rs/cargo@v1
+      - name: Build benchmarks
+        uses: actions-rs/cargo@v1
         with:
-          command: test
+          command: build
+          args: --manifest-path benchmarks/Cargo.toml --all-targets
 
-      - uses: actions-rs/cargo@v1
+      - name: Check
+        uses: actions-rs/cargo@v1
         with:
-          command: test
-          args: --benches --manifest-path benchmarks/Cargo.toml
+          command: clippy
+          args: --all-targets -- -D warnings
 
-      - uses: actions-rs/cargo@v1
+      - name: Check benchmarks
+        uses: actions-rs/cargo@v1
+        with:
+          command: clippy
+          args: --manifest-path benchmarks/Cargo.toml --all-targets -- -D warnings
+
+      - name: Check formatting
+        uses: actions-rs/cargo@v1
         with:
           command: fmt
           args: -- --check
 
-      - uses: actions-rs/cargo@v1
+      - name: Check benchmark formatting
+        uses: actions-rs/cargo@v1
         with:
           command: fmt
           args: --manifest-path benchmarks/Cargo.toml -- --check
 
-      - uses: actions-rs/cargo@v1
+      - name: Test
+        uses: actions-rs/cargo@v1
         with:
-          command: clippy
-          args: --all-targets -- -D warnings
-  simd:
-    name: SIMD Feature
-    runs-on: ubuntu-latest
-
-    steps:
-      - uses: actions/checkout@v2
+          command: test
 
-      - uses: actions-rs/toolchain@v1
+      - name: Test benchmarks
+        uses: actions-rs/cargo@v1
         with:
-          profile: minimal
-          toolchain: nightly
-          override: true
-          components: rustfmt, clippy
+          command: test
+          args: --manifest-path benchmarks/Cargo.toml --benches
 
-      - uses: actions-rs/cargo@v1
+      - name: SIMD test
+        if: matrix.rust == 'nightly'
+        uses: actions-rs/cargo@v1
         with:
           toolchain: nightly
           command: test
           args: --features "simd"
 
-      - uses: actions-rs/cargo@v1
+      - name: SIMD test benchmarks
+        if: matrix.rust == 'nightly'
+        uses: actions-rs/cargo@v1
         with:
           toolchain: nightly
           command: test
-          args: --features "simd" --benches --manifest-path benchmarks/Cargo.toml
\ No newline at end of file
+          args: --manifest-path benchmarks/Cargo.toml --features "simd"
\ No newline at end of file
diff --git a/benchmarks/benches/datasets.rs b/benchmarks/benches/datasets.rs
index c814c23ab..392ae211b 100644
--- a/benchmarks/benches/datasets.rs
+++ b/benchmarks/benches/datasets.rs
@@ -50,6 +50,23 @@ fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> {
     let out_path = Path::new(&out_dir);
     let repo_path = out_path.join("real-roaring-datasets");
 
+    // Offline mode: return `repo_path` early if ROARINGRS_BENCH_OFFLINE parses as `true`
+
+    let offline = env::var("ROARINGRS_BENCH_OFFLINE");
+    match offline {
+        Ok(value) => {
+            if value.parse::<bool>()? {
+                return Ok(repo_path);
+            }
+        }
+        Err(ref err) => match err {
+            env::VarError::NotPresent => (),
+            _ => {
+                offline?;
+            }
+        },
+    };
+
     // Setup progress callbacks
 
     let pb_cell = once_cell::unsync::OnceCell::new();

From 4758c511212d81f208a0644c65c189f3e2996761 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 05:11:12 -0800
Subject: [PATCH 09/14] add trailing newline to test.yml

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 7d64dca1f..99a07e479 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -102,4 +102,4 @@ jobs:
         with:
           toolchain: nightly
           command: test
-          args: --manifest-path benchmarks/Cargo.toml --features "simd"
\ No newline at end of file
+          args: --manifest-path benchmarks/Cargo.toml --features "simd"

From 0a3bbd05cf448b313a918df86fa9e3094537914b Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 05:34:34 -0800
Subject: [PATCH 10/14] add fetch step to ci

---
 .github/workflows/test.yml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 99a07e479..c98b732a5 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -41,6 +41,18 @@ jobs:
           override: true
           components: rustfmt, clippy
 
+      - name: Fetch
+        uses: actions-rs/cargo@v1
+        with:
+          command: fetch
+          args: --all-targets
+
+      - name: Fetch benchmarks
+        uses: actions-rs/cargo@v1
+        with:
+          command: fetch
+          args: --manifest-path benchmarks/Cargo.toml --all-targets
+
       - name: Build
         uses: actions-rs/cargo@v1
         with:

From 173f63c9c3e0869118d229a589cbb4b5846ef13f Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 05:35:57 -0800
Subject: [PATCH 11/14] remove all-targets arg from fetch step

---
 .github/workflows/test.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c98b732a5..08e67b65b 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -45,13 +45,12 @@ jobs:
         uses: actions-rs/cargo@v1
         with:
           command: fetch
-          args: --all-targets
 
       - name: Fetch benchmarks
         uses: actions-rs/cargo@v1
         with:
           command: fetch
-          args: --manifest-path benchmarks/Cargo.toml --all-targets
+          args: --manifest-path benchmarks/Cargo.toml
 
       - name: Build
         uses: actions-rs/cargo@v1

From b9afd1f59ad451bafc262892de91168d1277962c Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 05:37:14 -0800
Subject: [PATCH 12/14] add vendored ssl to git deps

---
 benchmarks/Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml
index 6c5ea887a..5fa81514e 100644
--- a/benchmarks/Cargo.toml
+++ b/benchmarks/Cargo.toml
@@ -11,7 +11,7 @@ roaring = { path = ".." }
 
 [dev-dependencies]
 once_cell = "1.9"
-git2 = { version = "0.13", default-features = false }
+git2 = { version = "0.13", default-features = false, features = ["vendored-openssl"] }
 zip = { version = "0.5", default-features = false, features = ["deflate"] }
 indicatif = "0.16"
 criterion = { version = "0.3", features = ["html_reports"] }

From 0fdc45527e6a43ec600c3c82d79735eceb84d856 Mon Sep 17 00:00:00 2001
From: saik0 <github@saik0.net>
Date: Wed, 9 Feb 2022 06:30:08 -0800
Subject: [PATCH 13/14] add --benches flag to simd benchmark tests

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 08e67b65b..8302716d0 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -113,4 +113,4 @@ jobs:
         with:
           toolchain: nightly
           command: test
-          args: --manifest-path benchmarks/Cargo.toml --features "simd"
+          args: --manifest-path benchmarks/Cargo.toml --features "simd" --benches

From 6dbbd8002a65af971ddb17122da2fe579f09b1d4 Mon Sep 17 00:00:00 2001
From: Joel Pedraza <github@saik0.net>
Date: Wed, 9 Feb 2022 06:32:21 -0800
Subject: [PATCH 14/14] Update benchmarks/benches/datasets.rs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault <renault.cle@gmail.com>
---
 benchmarks/benches/datasets.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/benches/datasets.rs b/benchmarks/benches/datasets.rs
index 392ae211b..3dd6d9a67 100644
--- a/benchmarks/benches/datasets.rs
+++ b/benchmarks/benches/datasets.rs
@@ -84,7 +84,7 @@ fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> {
                         .progress_chars("#> "),
                 )
                 .with_prefix("    ")
-                .with_message("Recieving objects")
+                .with_message("Receiving objects")
         });
 
         pb.set_position((progress.local_objects() + progress.received_objects()) as u64);