From 906144acbd805796c60b9e051461ec35f015044d Mon Sep 17 00:00:00 2001 From: saik0 Date: Tue, 8 Feb 2022 21:23:53 -0800 Subject: [PATCH 01/14] add runtime dataset fetch and parse in-place --- .gitignore | 5 - benchmarks/.gitignore | 3 + benchmarks/Cargo.toml | 4 + benchmarks/benches/datasets.rs | 186 +++++++++++++++++++++ benchmarks/benches/lib.rs | 288 +++++++++++++-------------------- benchmarks/build.rs | 80 --------- 6 files changed, 308 insertions(+), 258 deletions(-) create mode 100644 benchmarks/.gitignore create mode 100644 benchmarks/benches/datasets.rs diff --git a/.gitignore b/.gitignore index 75d4b9447..4fffb2f89 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,2 @@ /target /Cargo.lock - -# This is generated by the benchmarks crate build script, do not version with git. -/benchmarks/benches/datasets_paths.rs -/benchmarks/target -/benchmarks/Cargo.lock diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore new file mode 100644 index 000000000..7adbbb807 --- /dev/null +++ b/benchmarks/.gitignore @@ -0,0 +1,3 @@ +/target +/Cargo.lock +/real-roaring-datasets \ No newline at end of file diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 924794ecb..a6a5326d7 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -10,6 +10,10 @@ publish = false roaring = { path = ".." } [dev-dependencies] +once_cell = "1.9.0" +git2 = "0.13.25" +zip = "0.5.13" +indicatif = "0.16.2" criterion = { version = "0.3", features = ["html_reports"] } quickcheck = "0.9" quickcheck_macros = "0.9" diff --git a/benchmarks/benches/datasets.rs b/benchmarks/benches/datasets.rs new file mode 100644 index 000000000..c814c23ab --- /dev/null +++ b/benchmarks/benches/datasets.rs @@ -0,0 +1,186 @@ +use std::env; +use std::fs::File; +use std::io::BufReader; +use std::path::{Path, PathBuf}; + +use git2::FetchOptions; +use once_cell::sync::OnceCell as SyncOnceCell; + +use roaring::RoaringBitmap; + +static INSTANCE: SyncOnceCell> = SyncOnceCell::new(); + +pub struct Datasets; + +pub struct DatasetsIter { + iter: std::slice::Iter<'static, Dataset>, +} + +impl Iterator for DatasetsIter { + type Item = &'static Dataset; + + fn next(&mut self) -> Option { + self.iter.next() + } +} + +impl IntoIterator for Datasets { + type Item = &'static Dataset; + type IntoIter = DatasetsIter; + + fn into_iter(self) -> Self::IntoIter { + DatasetsIter { + iter: INSTANCE + .get_or_init(|| { + init_datasets().and_then(parse_datasets).expect("a collection of datasets") + }) + .iter(), + } + } +} + +pub struct Dataset { + pub name: String, + pub bitmaps: Vec, +} + +fn init_datasets() -> Result> { + let out_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?; + + let out_path = Path::new(&out_dir); + let repo_path = out_path.join("real-roaring-datasets"); + + // Setup progress callbacks + + let pb_cell = once_cell::unsync::OnceCell::new(); + let mut cb = git2::RemoteCallbacks::new(); + + cb.transfer_progress(|progress| { + let pb = pb_cell.get_or_init(|| { + indicatif::ProgressBar::new(progress.total_objects() as u64) + .with_style( + indicatif::ProgressStyle::default_bar() + .template(&format!( + "{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}", + progress.total_objects() + )) + .progress_chars("#> "), + ) + .with_prefix(" ") + .with_message("Recieving objects") + }); + + pb.set_position((progress.local_objects() + progress.received_objects()) as u64); + true + }); + + let mut fetch_opts = FetchOptions::new(); + fetch_opts.remote_callbacks(cb); + + // Do update + + if !Path::new(&repo_path).exists() { + git2::build::RepoBuilder::new() + .fetch_options(fetch_opts) + .clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?; + } else { + let repo = git2::Repository::open(&repo_path)?; + repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?; + + let head = repo.head()?.peel_to_commit()?; + let origin_master_head = repo + .find_branch("origin/master", git2::BranchType::Remote)? + .into_reference() + .peel_to_commit()?; + + if head.id() != origin_master_head.id() { + repo.reset(origin_master_head.as_object(), git2::ResetType::Hard, None)?; + } + } + + if let Some(pb) = pb_cell.get() { + pb.finish() + } + + Ok(repo_path) +} + +fn parse_datasets>(path: P) -> Result, Box> { + const DATASET_FILENAME_WHITELIST: &[&str] = &[ + "census-income.zip", + "census-income_srt.zip", + "census1881.zip", + "census1881_srt.zip", + "weather_sept_85.zip", + "weather_sept_85_srt.zip", + "wikileaks-noquotes.zip", + "wikileaks-noquotes_srt.zip", + ]; + + use indicatif::{ProgressBar, ProgressStyle}; + use std::io::BufRead; + use zip::ZipArchive; + + let dir = path.as_ref().read_dir()?; + + let mut datasets = Vec::new(); + + // Future work: Reuse this buffer to parse croaring bitmaps for comparison + let mut numbers = Vec::new(); + + for dir_entry_result in dir { + let dir_entry = dir_entry_result?; + let metadata = dir_entry.metadata()?; + let file_name = dir_entry.file_name(); + // TODO dont panic + let file_name_str = file_name.to_str().expect("utf-8 filename"); + + if metadata.is_file() && DATASET_FILENAME_WHITELIST.contains(&file_name_str) { + let file = File::open(dir_entry.path())?; + let name = file_name_str.split_at(file_name_str.len() - ".zip".len()).0.to_string(); + + let mut zip = ZipArchive::new(file)?; + + let mut total_size = 0; + for i in 0..zip.len() { + let file = zip.by_index(i)?; + total_size += file.size(); + } + + let pb = ProgressBar::new(total_size) + .with_style( + ProgressStyle::default_bar() + .template(" {prefix:.green} [{bar}] {msg}") + .progress_chars("#> "), + ) + .with_prefix("Parsing") + .with_message(name.clone()); + + let mut bitmaps = Vec::with_capacity(zip.len()); + for i in 0..zip.len() { + let file = zip.by_index(i)?; + let size = file.size(); + let buf = BufReader::new(file); + + for bytes in buf.split(b',') { + let bytes = bytes?; + let str = String::from_utf8(bytes)?; + let n = str.trim().parse::()?; + numbers.push(n); + } + + let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?; + numbers.clear(); + bitmaps.push(bitmap); + + pb.set_position(pb.position() + size); + } + + pb.finish(); + datasets.push(Dataset { name, bitmaps }); + } + } + datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name)); + println!(); + Ok(datasets) +} diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index 25e52e2c7..bff39f76f 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -1,14 +1,16 @@ -mod datasets_paths; - use std::cmp::Reverse; -use std::convert::TryInto; use std::num::ParseIntError; use std::path::{Path, PathBuf}; use std::{fs, io}; -use criterion::{black_box, criterion_group, criterion_main, BatchSize, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; + use roaring::RoaringBitmap; +use crate::datasets::Datasets; + +mod datasets; + fn create(c: &mut Criterion) { c.bench_function("create", |b| { b.iter(|| { @@ -96,34 +98,24 @@ fn len(c: &mut Criterion) { } fn rank(c: &mut Criterion) { - let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT; - let parsed_numbers = parse_dir_files(files).unwrap(); - - // Cache len to prevent len calculation from effecting benchmark - let bitmaps: Vec<_> = parsed_numbers - .into_iter() - .map(|(_, r)| { - r.map(|iter| { - let bitmap = RoaringBitmap::from_sorted_iter(iter).unwrap(); - let len: u32 = bitmap.len().try_into().expect("len <= u32::MAX"); - (bitmap, len) - }) - .unwrap() - }) - .collect(); - - // Rank all multiples of 100 < bitmap.len() - // Mupliplier chosen arbitrarily, but should be sure not to rank many values > len() - // Doing so would degenerate into benchmarking len() - c.bench_function("rank", |b| { - b.iter(|| { - for (bitmap, len) in bitmaps.iter() { - for i in (0..*len).step_by(100) { - black_box(bitmap.rank(i)); + let mut group = c.benchmark_group("rank"); + for dataset in Datasets { + let bitmaps = + dataset.bitmaps.iter().map(|bitmap| (bitmap, bitmap.len() as u32)).collect::>(); + + // Rank all multiples of 100 < bitmap.len() + // Mupliplier chosen arbitrarily, but should be sure not to rank many values > len() + // Doing so would degenerate into benchmarking len() + group.bench_function(BenchmarkId::new("rank", &dataset.name), |b| { + b.iter(|| { + for (bitmap, len) in bitmaps.iter() { + for i in (0..*len).step_by(100) { + black_box(bitmap.rank(i)); + } } - } + }); }); - }); + } } fn and(c: &mut Criterion) { @@ -277,66 +269,18 @@ fn insert_range_bitmap(c: &mut Criterion) { } fn iter(c: &mut Criterion) { - c.bench_function("iter bitmap 1..10_000", |b| { - let bitmap: RoaringBitmap = (1..10_000).collect(); - b.iter(|| { - bitmap.iter().for_each(|i| { - black_box(i); - }); - }); - }); - - c.bench_function("iter bitmap sparse", |b| { - let bitmap: RoaringBitmap = (0..1 << 16).step_by(61).collect(); - b.iter(|| { - bitmap.iter().for_each(|i| { - black_box(i); + let mut group = c.benchmark_group("iter"); + for dataset in Datasets { + group.bench_function(BenchmarkId::new("iter", &dataset.name), |b| { + b.iter(|| { + dataset.bitmaps.iter().flat_map(|bitmap| bitmap.iter()).for_each(|i| { + black_box(i); + }); }); }); - }); - - c.bench_function("iter bitmap dense", |b| { - let bitmap: RoaringBitmap = (0..1 << 16).step_by(2).collect(); - b.iter(|| { - bitmap.iter().for_each(|i| { - black_box(i); - }); - }); - }); - - c.bench_function("iter bitmap minimal", |b| { - let bitmap: RoaringBitmap = (0..4096).collect(); - b.iter(|| { - bitmap.iter().for_each(|i| { - black_box(i); - }); - }); - }); - - c.bench_function("iter bitmap full", |b| { - let bitmap: RoaringBitmap = (0..1 << 16).collect(); - b.iter(|| { - bitmap.iter().for_each(|i| { - black_box(i); - }); - }); - }); - - c.bench_function("iter parsed", |b| { - let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT; - let parsed_numbers = parse_dir_files(files).unwrap(); - - let bitmaps: Vec<_> = parsed_numbers - .into_iter() - .map(|(_, r)| r.map(|iter| RoaringBitmap::from_sorted_iter(iter).unwrap()).unwrap()) - .collect(); + } - b.iter(|| { - bitmaps.iter().flat_map(|bitmap| bitmap.iter()).for_each(|i| { - black_box(i); - }); - }); - }); + group.finish(); } fn is_empty(c: &mut Criterion) { @@ -417,112 +361,110 @@ fn parse_dir_files>( } fn from_sorted_iter(c: &mut Criterion) { - let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT; - let parsed_numbers = parse_dir_files(files).unwrap(); + let mut group = c.benchmark_group("from_sorted_iter"); + + for dataset in Datasets { + let dataset_numbers = dataset + .bitmaps + .iter() + .map(|bitmap| bitmap.iter().collect::>()) + .collect::>(); + + group.bench_function(BenchmarkId::new("from_sorted_iter", &dataset.name), |b| { + b.iter(|| { + for bitmap_numbers in &dataset_numbers { + RoaringBitmap::from_sorted_iter(bitmap_numbers.iter().copied()).unwrap(); + } + }) + }); + } - c.bench_function("from_sorted_iter", |b| { - b.iter(|| { - for (_, numbers) in &parsed_numbers { - let numbers = numbers.as_ref().unwrap(); - RoaringBitmap::from_sorted_iter(numbers.iter().copied()).unwrap(); - } - }) - }); + group.finish(); } fn successive_and(c: &mut Criterion) { - let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT; - let parsed_numbers = parse_dir_files(files).unwrap(); - - let mut bitmaps: Vec<_> = parsed_numbers - .into_iter() - .map(|(_, r)| r.map(|iter| RoaringBitmap::from_sorted_iter(iter).unwrap()).unwrap()) - .collect(); - - // biggest bitmaps first. - bitmaps.sort_unstable_by_key(|b| Reverse(b.len())); - let mut group = c.benchmark_group("Successive And"); - group.bench_function("Successive And Assign Ref", |b| { - b.iter_batched( - || bitmaps.clone(), - |bitmaps| { - let mut iter = bitmaps.into_iter(); - let mut first = iter.next().unwrap().clone(); - for bitmap in iter { - first &= bitmap; - } - }, - BatchSize::LargeInput, - ); - }); + for dataset in Datasets { + // biggest bitmaps first. + let mut sorted_bitmaps = dataset.bitmaps.clone(); + sorted_bitmaps.sort_unstable_by_key(|b| Reverse(b.len())); - group.bench_function("Successive And Assign Owned", |b| { - b.iter_batched( - || bitmaps.clone(), - |bitmaps| { - black_box(bitmaps.into_iter().reduce(|a, b| a & b).unwrap()); - }, - BatchSize::LargeInput, - ); - }); + group.bench_function(BenchmarkId::new("Successive And Assign Ref", &dataset.name), |b| { + b.iter_batched( + || sorted_bitmaps.clone(), + |bitmaps| { + let mut iter = bitmaps.into_iter(); + let mut first = iter.next().unwrap().clone(); + for bitmap in iter { + first &= bitmap; + } + }, + BatchSize::LargeInput, + ); + }); - group.bench_function("Successive And Ref Ref", |b| { - b.iter_batched( - || bitmaps.clone(), - |bitmaps| { - let mut iter = bitmaps.iter(); - let first = iter.next().unwrap().clone(); - black_box(iter.fold(first, |acc, x| (&acc) & x)); - }, - BatchSize::LargeInput, - ); - }); + group.bench_function(BenchmarkId::new("Successive And Assign Owned", &dataset.name), |b| { + b.iter_batched( + || sorted_bitmaps.clone(), + |bitmaps| { + black_box(bitmaps.into_iter().reduce(|a, b| a & b).unwrap()); + }, + BatchSize::LargeInput, + ); + }); + + group.bench_function(BenchmarkId::new("Successive And Ref Ref", &dataset.name), |b| { + b.iter_batched( + || sorted_bitmaps.clone(), + |bitmaps| { + let mut iter = bitmaps.iter(); + let first = iter.next().unwrap().clone(); + black_box(iter.fold(first, |acc, x| (&acc) & x)); + }, + BatchSize::LargeInput, + ); + }); + } group.finish(); } fn successive_or(c: &mut Criterion) { - let files = self::datasets_paths::WIKILEAKS_NOQUOTES_SRT; - let parsed_numbers = parse_dir_files(files).unwrap(); - - let bitmaps: Vec<_> = parsed_numbers - .into_iter() - .map(|(_, r)| r.map(|iter| RoaringBitmap::from_sorted_iter(iter).unwrap()).unwrap()) - .collect(); - let mut group = c.benchmark_group("Successive Or"); - group.bench_function("Successive Or Assign Ref", |b| { - b.iter(|| { - let mut output = RoaringBitmap::new(); - for bitmap in &bitmaps { - output |= bitmap; - } - }); - }); - group.bench_function("Successive Or Assign Owned", |b| { - b.iter_batched( - || bitmaps.clone(), - |bitmaps: Vec| { + for dataset in Datasets { + group.bench_function(BenchmarkId::new("Successive Or Assign Ref", &dataset.name), |b| { + b.iter(|| { let mut output = RoaringBitmap::new(); - for bitmap in bitmaps { + for bitmap in &dataset.bitmaps { output |= bitmap; } - }, - BatchSize::LargeInput, - ); - }); + }); + }); - group.bench_function("Successive Or Ref Ref", |b| { - b.iter(|| { - let mut output = RoaringBitmap::new(); - for bitmap in &bitmaps { - output = (&output) | bitmap; - } + group.bench_function(BenchmarkId::new("Successive Or Assign Owned", &dataset.name), |b| { + b.iter_batched( + || dataset.bitmaps.clone(), + |bitmaps: Vec| { + let mut output = RoaringBitmap::new(); + for bitmap in bitmaps { + output |= bitmap; + } + }, + BatchSize::LargeInput, + ); }); - }); + + group.bench_function(BenchmarkId::new("Successive Or Ref Ref", &dataset.name), |b| { + b.iter(|| { + let mut output = RoaringBitmap::new(); + for bitmap in &dataset.bitmaps { + output = (&output) | bitmap; + } + }); + }); + } group.finish(); } diff --git a/benchmarks/build.rs b/benchmarks/build.rs index 4622b3c9c..8b1378917 100644 --- a/benchmarks/build.rs +++ b/benchmarks/build.rs @@ -1,81 +1 @@ -use std::fs::File; -use std::io::{Cursor, Read, Seek, Write}; -use std::path::{Path, PathBuf}; -use std::{env, fs}; -use bytes::Bytes; -use convert_case::{Case, Casing}; -use reqwest::{blocking::get, IntoUrl}; -use zip::read::ZipArchive; - -const BASE_URL: &str = "https://github.com/RoaringBitmap/real-roaring-datasets/raw/master/"; - -// const DATASET_CENSUS_INCOME: &str = "census-income"; -// const DATASET_CENSUS_INCOME_SRT: &str = "census-income_srt"; -// const DATASET_CENSUS1881: &str = "census1881"; -// const DATASET_CENSUS1881_SRT: &str = "census1881_srt"; -// const DATASET_DIMENSION_003: &str = "dimension_003"; -// const DATASET_DIMENSION_008: &str = "dimension_008"; -// const DATASET_DIMENSION_033: &str = "dimension_033"; -// const DATASET_USCENSUS2000: &str = "uscensus2000"; -// const DATASET_WEATHER_SEPT_85: &str = "weather_sept_85"; -// const DATASET_WEATHER_SEPT_85_SRT: &str = "weather_sept_85_srt"; -// const DATASET_WIKILEAKS_NOQUOTES: &str = "wikileaks-noquotes"; -const DATASET_WIKILEAKS_NOQUOTES_SRT: &str = "wikileaks-noquotes_srt"; - -// const DATASETS: &[&str] = &[ -// DATASET_CENSUS_INCOME, -// DATASET_CENSUS_INCOME_SRT, -// DATASET_CENSUS1881, -// DATASET_CENSUS1881_SRT, -// DATASET_DIMENSION_003, -// DATASET_DIMENSION_008, -// DATASET_DIMENSION_033, -// DATASET_USCENSUS2000, -// DATASET_WEATHER_SEPT_85, -// DATASET_WEATHER_SEPT_85_SRT, -// DATASET_WIKILEAKS_NOQUOTES, -// DATASET_WIKILEAKS_NOQUOTES_SRT, -// ]; - -fn main() -> anyhow::Result<()> { - let out_dir = PathBuf::from(env::var("OUT_DIR")?); - - let benches_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?).join("benches"); - let mut manifest_paths_file = File::create(benches_dir.join("datasets_paths.rs"))?; - - writeln!( - &mut manifest_paths_file, - "// This file is generated by the build script.\n// Do not modify by hand, use the build.rs file.\n" - )?; - - #[allow(clippy::single_element_loop)] - for dataset in &[DATASET_WIKILEAKS_NOQUOTES_SRT] { - let out_path = out_dir.join(dataset); - let url = format!("{}/{}.zip", BASE_URL, dataset); - let bytes = download_dataset(url)?; - unzip_in_folder(bytes, &out_path)?; - - writeln!( - &mut manifest_paths_file, - r#"pub const {}: &str = {:?};"#, - dataset.to_case(Case::ScreamingSnake), - out_path.display(), - )?; - } - - Ok(()) -} - -fn download_dataset(url: U) -> anyhow::Result> { - let bytes = get(url)?.bytes()?; - Ok(Cursor::new(bytes)) -} - -fn unzip_in_folder>(bytes: R, path: P) -> anyhow::Result<()> { - let path = path.as_ref(); - fs::create_dir_all(path).unwrap(); - let mut zip = ZipArchive::new(bytes)?; - zip.extract(path)?; - Ok(()) -} From 01bf5b624471182ffebe001293e45aad87353d3e Mon Sep 17 00:00:00 2001 From: saik0 Date: Tue, 8 Feb 2022 21:56:08 -0800 Subject: [PATCH 02/14] remove build-deps and build.rs --- benchmarks/Cargo.toml | 7 ------- benchmarks/benches/lib.rs | 14 -------------- benchmarks/build.rs | 1 - 3 files changed, 22 deletions(-) delete mode 100644 benchmarks/build.rs diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index a6a5326d7..e82fb9f87 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -18,13 +18,6 @@ criterion = { version = "0.3", features = ["html_reports"] } quickcheck = "0.9" quickcheck_macros = "0.9" -[build-dependencies] -anyhow = "1.0" -bytes = "1.0" -convert_case = "0.4" -reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false } -zip = "0.5.12" - [features] simd = ["roaring/simd"] diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index bff39f76f..468bf3ebc 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -346,20 +346,6 @@ fn serialized_size(c: &mut Criterion) { }); } -fn extract_integers>(content: A) -> Result, ParseIntError> { - content.as_ref().split(',').map(|s| s.trim().parse()).collect() -} - -// Parse every file into a vector of integer. -fn parse_dir_files>( - files: A, -) -> io::Result, ParseIntError>)>> { - fs::read_dir(files)? - .map(|r| r.and_then(|e| fs::read_to_string(e.path()).map(|r| (e.path(), r)))) - .map(|r| r.map(|(p, c)| (p, extract_integers(c)))) - .collect() -} - fn from_sorted_iter(c: &mut Criterion) { let mut group = c.benchmark_group("from_sorted_iter"); diff --git a/benchmarks/build.rs b/benchmarks/build.rs deleted file mode 100644 index 8b1378917..000000000 --- a/benchmarks/build.rs +++ /dev/null @@ -1 +0,0 @@ - From 6050c684e3afef6aa1e100c387a21ec80523a131 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 02:05:04 -0800 Subject: [PATCH 03/14] cleanup benchmark dependencies --- benchmarks/Cargo.toml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index e82fb9f87..6c5ea887a 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -10,13 +10,11 @@ publish = false roaring = { path = ".." } [dev-dependencies] -once_cell = "1.9.0" -git2 = "0.13.25" -zip = "0.5.13" -indicatif = "0.16.2" +once_cell = "1.9" +git2 = { version = "0.13", default-features = false } +zip = { version = "0.5", default-features = false, features = ["deflate"] } +indicatif = "0.16" criterion = { version = "0.3", features = ["html_reports"] } -quickcheck = "0.9" -quickcheck_macros = "0.9" [features] simd = ["roaring/simd"] From 1776f50fc06a1fb0e4285cafa5d7ef8900094545 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 02:05:21 -0800 Subject: [PATCH 04/14] fix benchmark warnings --- benchmarks/benches/datasets_paths.rs | 4 ++++ benchmarks/benches/lib.rs | 5 +---- 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 benchmarks/benches/datasets_paths.rs diff --git a/benchmarks/benches/datasets_paths.rs b/benchmarks/benches/datasets_paths.rs new file mode 100644 index 000000000..ca4f3a280 --- /dev/null +++ b/benchmarks/benches/datasets_paths.rs @@ -0,0 +1,4 @@ +// This file is generated by the build script. +// Do not modify by hand, use the build.rs file. + +pub const WIKILEAKS_NOQUOTES_SRT: &str = "/Users/joel.pedraza/src/roaring-rs-saik0/benchmarks/target/debug/build/benchmarks-4eb3dbdf81f7da70/out/wikileaks-noquotes_srt"; diff --git a/benchmarks/benches/lib.rs b/benchmarks/benches/lib.rs index 468bf3ebc..a87ab9c6f 100644 --- a/benchmarks/benches/lib.rs +++ b/benchmarks/benches/lib.rs @@ -1,7 +1,4 @@ use std::cmp::Reverse; -use std::num::ParseIntError; -use std::path::{Path, PathBuf}; -use std::{fs, io}; use criterion::{black_box, criterion_group, criterion_main, BatchSize, BenchmarkId, Criterion}; @@ -381,7 +378,7 @@ fn successive_and(c: &mut Criterion) { || sorted_bitmaps.clone(), |bitmaps| { let mut iter = bitmaps.into_iter(); - let mut first = iter.next().unwrap().clone(); + let mut first = iter.next().unwrap(); for bitmap in iter { first &= bitmap; } From a6648fe07caa972480ccbbaa6ea2fc9dbb81bbb7 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 02:06:46 -0800 Subject: [PATCH 05/14] remove datasets_paths.rs file --- benchmarks/benches/datasets_paths.rs | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 benchmarks/benches/datasets_paths.rs diff --git a/benchmarks/benches/datasets_paths.rs b/benchmarks/benches/datasets_paths.rs deleted file mode 100644 index ca4f3a280..000000000 --- a/benchmarks/benches/datasets_paths.rs +++ /dev/null @@ -1,4 +0,0 @@ -// This file is generated by the build script. -// Do not modify by hand, use the build.rs file. - -pub const WIKILEAKS_NOQUOTES_SRT: &str = "/Users/joel.pedraza/src/roaring-rs-saik0/benchmarks/target/debug/build/benchmarks-4eb3dbdf81f7da70/out/wikileaks-noquotes_srt"; From 9e4ffa7663551b7525ed2d5768d1831740b78d44 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 03:12:32 -0800 Subject: [PATCH 06/14] add trailing newline to benchmark gitignore --- benchmarks/.gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/.gitignore b/benchmarks/.gitignore index 7adbbb807..4c312bb62 100644 --- a/benchmarks/.gitignore +++ b/benchmarks/.gitignore @@ -1,3 +1,3 @@ /target /Cargo.lock -/real-roaring-datasets \ No newline at end of file +/real-roaring-datasets From 9eacae1387a86a1899c1ef78a22842b94de63dc3 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 03:13:23 -0800 Subject: [PATCH 07/14] remove benchmark src dir --- benchmarks/src/lib.rs | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 benchmarks/src/lib.rs diff --git a/benchmarks/src/lib.rs b/benchmarks/src/lib.rs deleted file mode 100644 index 829adafa0..000000000 --- a/benchmarks/src/lib.rs +++ /dev/null @@ -1,5 +0,0 @@ -//! This library is only used to isolate the benchmarks -//! from the original roaring library. -//! -//! It does not include interesting functions for roaring library -//! users only for roaring contributors. From b51b93c61bd5656d28a2b26d86a14efefa1728ca Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 05:07:25 -0800 Subject: [PATCH 08/14] add benchmark offline mode. use by CI. --- .github/workflows/test.yml | 75 ++++++++++++++++++++++------------ benchmarks/benches/datasets.rs | 17 ++++++++ 2 files changed, 65 insertions(+), 27 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index aa9e62002..7d64dca1f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,66 +19,87 @@ jobs: - beta - nightly - 1.56.1 + env: + RUSTFLAGS: "-C target-cpu=native -C opt-level=3" + ROARINGRS_BENCH_OFFLINE: "true" steps: - - uses: actions/checkout@v2 + - name: Checkout roaring-rs + uses: actions/checkout@v2 - - uses: actions-rs/toolchain@v1 + - name: Checkout benchmark datasets + uses: actions/checkout@v2 + with: + repository: "RoaringBitmap/real-roaring-datasets" + path: "benchmarks/real-roaring-datasets" + + - name: Initialize rust toolchain + uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust }} override: true components: rustfmt, clippy - - uses: actions-rs/cargo@v1 + - name: Build + uses: actions-rs/cargo@v1 with: command: build + args: --all-targets - - uses: actions-rs/cargo@v1 + - name: Build benchmarks + uses: actions-rs/cargo@v1 with: - command: test + command: build + args: --manifest-path benchmarks/Cargo.toml --all-targets - - uses: actions-rs/cargo@v1 + - name: Check + uses: actions-rs/cargo@v1 with: - command: test - args: --benches --manifest-path benchmarks/Cargo.toml + command: clippy + args: --all-targets -- -D warnings - - uses: actions-rs/cargo@v1 + - name: Check benchmarks + uses: actions-rs/cargo@v1 + with: + command: clippy + args: --manifest-path benchmarks/Cargo.toml --all-targets -- -D warnings + + - name: Check formatting + uses: actions-rs/cargo@v1 with: command: fmt args: -- --check - - uses: actions-rs/cargo@v1 + - name: Check benchmark formatting + uses: actions-rs/cargo@v1 with: command: fmt args: --manifest-path benchmarks/Cargo.toml -- --check - - uses: actions-rs/cargo@v1 + - name: Test + uses: actions-rs/cargo@v1 with: - command: clippy - args: --all-targets -- -D warnings - simd: - name: SIMD Feature - runs-on: ubuntu-latest - - steps: - - uses: actions/checkout@v2 + command: test - - uses: actions-rs/toolchain@v1 + - name: Test benchmarks + uses: actions-rs/cargo@v1 with: - profile: minimal - toolchain: nightly - override: true - components: rustfmt, clippy + command: test + args: --manifest-path benchmarks/Cargo.toml --benches - - uses: actions-rs/cargo@v1 + - name: SIMD test + if: matrix.rust == 'nightly' + uses: actions-rs/cargo@v1 with: toolchain: nightly command: test args: --features "simd" - - uses: actions-rs/cargo@v1 + - name: SIMD test benchmarks + if: matrix.rust == 'nightly' + uses: actions-rs/cargo@v1 with: toolchain: nightly command: test - args: --features "simd" --benches --manifest-path benchmarks/Cargo.toml \ No newline at end of file + args: --manifest-path benchmarks/Cargo.toml --features "simd" \ No newline at end of file diff --git a/benchmarks/benches/datasets.rs b/benchmarks/benches/datasets.rs index c814c23ab..392ae211b 100644 --- a/benchmarks/benches/datasets.rs +++ b/benchmarks/benches/datasets.rs @@ -50,6 +50,23 @@ fn init_datasets() -> Result> { let out_path = Path::new(&out_dir); let repo_path = out_path.join("real-roaring-datasets"); + // Check if in offline mode + + let offline = env::var("ROARINGRS_BENCH_OFFLINE"); + match offline { + Ok(value) => { + if value.parse::()? { + return Ok(repo_path); + } + } + Err(ref err) => match err { + env::VarError::NotPresent => (), + _ => { + offline?; + } + }, + }; + // Setup progress callbacks let pb_cell = once_cell::unsync::OnceCell::new(); From 4758c511212d81f208a0644c65c189f3e2996761 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 05:11:12 -0800 Subject: [PATCH 09/14] add trailing newline to test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 7d64dca1f..99a07e479 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -102,4 +102,4 @@ jobs: with: toolchain: nightly command: test - args: --manifest-path benchmarks/Cargo.toml --features "simd" \ No newline at end of file + args: --manifest-path benchmarks/Cargo.toml --features "simd" From 0a3bbd05cf448b313a918df86fa9e3094537914b Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 05:34:34 -0800 Subject: [PATCH 10/14] add fetch step to ci --- .github/workflows/test.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 99a07e479..c98b732a5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -41,6 +41,18 @@ jobs: override: true components: rustfmt, clippy + - name: Fetch + uses: actions-rs/cargo@v1 + with: + command: fetch + args: --all-targets + + - name: Fetch benchmarks + uses: actions-rs/cargo@v1 + with: + command: fetch + args: --manifest-path benchmarks/Cargo.toml --all-targets + - name: Build uses: actions-rs/cargo@v1 with: From 173f63c9c3e0869118d229a589cbb4b5846ef13f Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 05:35:57 -0800 Subject: [PATCH 11/14] remove all-targets arg from fetch step --- .github/workflows/test.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c98b732a5..08e67b65b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -45,13 +45,12 @@ jobs: uses: actions-rs/cargo@v1 with: command: fetch - args: --all-targets - name: Fetch benchmarks uses: actions-rs/cargo@v1 with: command: fetch - args: --manifest-path benchmarks/Cargo.toml --all-targets + args: --manifest-path benchmarks/Cargo.toml - name: Build uses: actions-rs/cargo@v1 From b9afd1f59ad451bafc262892de91168d1277962c Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 05:37:14 -0800 Subject: [PATCH 12/14] add vendored ssl to git deps --- benchmarks/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 6c5ea887a..5fa81514e 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -11,7 +11,7 @@ roaring = { path = ".." } [dev-dependencies] once_cell = "1.9" -git2 = { version = "0.13", default-features = false } +git2 = { version = "0.13", default-features = false, features = ["vendored-openssl"] } zip = { version = "0.5", default-features = false, features = ["deflate"] } indicatif = "0.16" criterion = { version = "0.3", features = ["html_reports"] } From 0fdc45527e6a43ec600c3c82d79735eceb84d856 Mon Sep 17 00:00:00 2001 From: saik0 Date: Wed, 9 Feb 2022 06:30:08 -0800 Subject: [PATCH 13/14] add --benches flag to simd benchmark tests --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 08e67b65b..8302716d0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -113,4 +113,4 @@ jobs: with: toolchain: nightly command: test - args: --manifest-path benchmarks/Cargo.toml --features "simd" + args: --manifest-path benchmarks/Cargo.toml --features "simd" --benches From 6dbbd8002a65af971ddb17122da2fe579f09b1d4 Mon Sep 17 00:00:00 2001 From: Joel Pedraza Date: Wed, 9 Feb 2022 06:32:21 -0800 Subject: [PATCH 14/14] Update benchmarks/benches/datasets.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- benchmarks/benches/datasets.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benches/datasets.rs b/benchmarks/benches/datasets.rs index 392ae211b..3dd6d9a67 100644 --- a/benchmarks/benches/datasets.rs +++ b/benchmarks/benches/datasets.rs @@ -84,7 +84,7 @@ fn init_datasets() -> Result> { .progress_chars("#> "), ) .with_prefix(" ") - .with_message("Recieving objects") + .with_message("Receiving objects") }); pb.set_position((progress.local_objects() + progress.received_objects()) as u64);