Skip to content

Commit

Permalink
Merge #186
Browse files Browse the repository at this point in the history
186: add runtime dataset fetch and parse in-place r=Kerollmops a=saik0

Closes #129
Closes #171
Closes #185

Here's my go at fetching the datasets at runtime

 * Datasets are lazily fetched the first time they're needed (or updated, if local `HEAD != origin/master`).
 * The zip files are parsed in place on every benchmark run, to keep the on-disk size down.
 * The parsing is also lazy, and happens at most once.
 * This PR updates any benchmarks that were already using limited data from `wikileaks-noquotes` to use all the datasets.
 * A fast follow PR will update all the benchmarks.

`@Kerollmops` Third time's the charm?

Co-authored-by: saik0 <[email protected]>
Co-authored-by: Joel Pedraza <[email protected]>
  • Loading branch information
bors[bot] and saik0 authored Feb 9, 2022
2 parents 60ec7d2 + dbfe350 commit 31ed4ca
Show file tree
Hide file tree
Showing 8 changed files with 384 additions and 317 deletions.
86 changes: 59 additions & 27 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,66 +19,98 @@ jobs:
- beta
- nightly
- 1.56.1
env:
RUSTFLAGS: "-C target-cpu=native -C opt-level=3"
ROARINGRS_BENCH_OFFLINE: "true"

steps:
- uses: actions/checkout@v2
- name: Checkout roaring-rs
uses: actions/checkout@v2

- uses: actions-rs/toolchain@v1
- name: Checkout benchmark datasets
uses: actions/checkout@v2
with:
repository: "RoaringBitmap/real-roaring-datasets"
path: "benchmarks/real-roaring-datasets"

- name: Initialize rust toolchain
uses: actions-rs/toolchain@v1
with:
profile: minimal
toolchain: ${{ matrix.rust }}
override: true
components: rustfmt, clippy

- uses: actions-rs/cargo@v1
- name: Fetch
uses: actions-rs/cargo@v1
with:
command: fetch

- name: Fetch benchmarks
uses: actions-rs/cargo@v1
with:
command: fetch
args: --manifest-path benchmarks/Cargo.toml

- name: Build
uses: actions-rs/cargo@v1
with:
command: build
args: --all-targets

- uses: actions-rs/cargo@v1
- name: Build benchmarks
uses: actions-rs/cargo@v1
with:
command: test
command: build
args: --manifest-path benchmarks/Cargo.toml --all-targets

- uses: actions-rs/cargo@v1
- name: Check
uses: actions-rs/cargo@v1
with:
command: test
args: --benches --manifest-path benchmarks/Cargo.toml
command: clippy
args: --all-targets -- -D warnings

- uses: actions-rs/cargo@v1
- name: Check benchmarks
uses: actions-rs/cargo@v1
with:
command: clippy
args: --manifest-path benchmarks/Cargo.toml --all-targets -- -D warnings

- name: Check formatting
uses: actions-rs/cargo@v1
with:
command: fmt
args: -- --check

- uses: actions-rs/cargo@v1
- name: Check benchmark formatting
uses: actions-rs/cargo@v1
with:
command: fmt
args: --manifest-path benchmarks/Cargo.toml -- --check

- uses: actions-rs/cargo@v1
- name: Test
uses: actions-rs/cargo@v1
with:
command: clippy
args: --all-targets -- -D warnings
simd:
name: SIMD Feature
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v2
command: test

- uses: actions-rs/toolchain@v1
- name: Test benchmarks
uses: actions-rs/cargo@v1
with:
profile: minimal
toolchain: nightly
override: true
components: rustfmt, clippy
command: test
args: --manifest-path benchmarks/Cargo.toml --benches

- uses: actions-rs/cargo@v1
- name: SIMD test
if: matrix.rust == 'nightly'
uses: actions-rs/cargo@v1
with:
toolchain: nightly
command: test
args: --features "simd"

- uses: actions-rs/cargo@v1
- name: SIMD test benchmarks
if: matrix.rust == 'nightly'
uses: actions-rs/cargo@v1
with:
toolchain: nightly
command: test
args: --features "simd" --benches --manifest-path benchmarks/Cargo.toml
args: --manifest-path benchmarks/Cargo.toml --features "simd" --benches
5 changes: 0 additions & 5 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,2 @@
/target
/Cargo.lock

# This is generated by the benchmarks crate build script, do not version with git.
/benchmarks/benches/datasets_paths.rs
/benchmarks/target
/benchmarks/Cargo.lock
3 changes: 3 additions & 0 deletions benchmarks/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
/target
/Cargo.lock
/real-roaring-datasets
13 changes: 4 additions & 9 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,11 @@ publish = false
roaring = { path = ".." }

[dev-dependencies]
once_cell = "1.9"
git2 = { version = "0.13", default-features = false, features = ["vendored-openssl"] }
zip = { version = "0.5", default-features = false, features = ["deflate"] }
indicatif = "0.16"
criterion = { version = "0.3", features = ["html_reports"] }
quickcheck = "0.9"
quickcheck_macros = "0.9"

[build-dependencies]
anyhow = "1.0"
bytes = "1.0"
convert_case = "0.4"
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
zip = "0.5.12"

[features]
simd = ["roaring/simd"]
Expand Down
203 changes: 203 additions & 0 deletions benchmarks/benches/datasets.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
use std::env;
use std::fs::File;
use std::io::BufReader;
use std::path::{Path, PathBuf};

use git2::FetchOptions;
use once_cell::sync::OnceCell as SyncOnceCell;

use roaring::RoaringBitmap;

/// Process-wide cache of the parsed datasets.
///
/// Populated on first use by `Datasets::into_iter` (through `get_or_init`) and
/// shared by every later iteration.
static INSTANCE: SyncOnceCell<Vec<Dataset>> = SyncOnceCell::new();

/// Unit handle for the benchmark datasets.
///
/// Iterating over it (`for dataset in Datasets`) yields `&'static Dataset`
/// items backed by the lazily initialised `INSTANCE` cache.
pub struct Datasets;

/// Iterator returned by `Datasets::into_iter`.
pub struct DatasetsIter {
// Slice iterator over the `Vec<Dataset>` stored in `INSTANCE`.
iter: std::slice::Iter<'static, Dataset>,
}

/// Walks the cached datasets in the order they are stored.
impl Iterator for DatasetsIter {
    type Item = &'static Dataset;

    /// Delegates to the wrapped slice iterator.
    fn next(&mut self) -> Option<&'static Dataset> {
        let Self { iter } = self;
        iter.next()
    }
}

impl IntoIterator for Datasets {
    type Item = &'static Dataset;
    type IntoIter = DatasetsIter;

    /// Returns an iterator over the cached datasets, initialising the cache on first use.
    ///
    /// # Panics
    ///
    /// Panics with "a collection of datasets" if the first-use initialisation
    /// (`init_datasets` followed by `parse_datasets`) returns an error.
    fn into_iter(self) -> DatasetsIter {
        let datasets: &'static Vec<Dataset> = INSTANCE.get_or_init(|| {
            let repo_path = init_datasets();
            repo_path.and_then(parse_datasets).expect("a collection of datasets")
        });

        DatasetsIter { iter: datasets.iter() }
    }
}

/// The bitmaps parsed from a single dataset archive.
pub struct Dataset {
/// File name of the archive without its `.zip` suffix.
pub name: String,
/// One bitmap per archive entry, in archive index order.
pub bitmaps: Vec<RoaringBitmap>,
}

/// Resolves the local `real-roaring-datasets` checkout, cloning or updating it unless offline.
///
/// The checkout lives at `$CARGO_MANIFEST_DIR/real-roaring-datasets`.
///
/// * If `ROARINGRS_BENCH_OFFLINE` is set and parses to `true`, the path is returned
///   immediately, without any git operation and without checking that it exists.
/// * Otherwise, if the path does not exist, the datasets repository is cloned into it.
/// * Otherwise `master` is fetched from `origin` and, when `HEAD` differs from
///   `origin/master`, the checkout is hard-reset to `origin/master`.
///
/// # Errors
///
/// Returns an error if `CARGO_MANIFEST_DIR` is not set, if `ROARINGRS_BENCH_OFFLINE`
/// is set but is not valid unicode or does not parse as a `bool`, or if any git
/// operation fails.
fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> {
    let manifest_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?;
    let repo_path = Path::new(&manifest_dir).join("real-roaring-datasets");

    // Offline mode: hand back the expected location and skip every git operation.
    match env::var("ROARINGRS_BENCH_OFFLINE") {
        Ok(flag) => {
            if flag.parse::<bool>()? {
                return Ok(repo_path);
            }
        }
        // An unset variable simply means "online".
        Err(env::VarError::NotPresent) => {}
        Err(other) => return Err(other.into()),
    }

    // The progress bar is created lazily, on the first transfer callback, because the
    // total object count is only known once the transfer has started.
    let progress_bar = once_cell::unsync::OnceCell::new();
    let mut callbacks = git2::RemoteCallbacks::new();

    callbacks.transfer_progress(|stats| {
        let total = stats.total_objects();
        let bar = progress_bar.get_or_init(|| {
            let style = indicatif::ProgressStyle::default_bar()
                .template(&format!("{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}", total))
                .progress_chars("#> ");

            indicatif::ProgressBar::new(total as u64)
                .with_style(style)
                .with_prefix(" ")
                .with_message("Receiving objects")
        });

        let done = stats.local_objects() + stats.received_objects();
        bar.set_position(done as u64);
        // Returning `true` keeps the transfer going.
        true
    });

    let mut fetch_opts = FetchOptions::new();
    fetch_opts.remote_callbacks(callbacks);

    if repo_path.exists() {
        // Existing checkout: fetch and fast-forward (by hard reset) to origin/master.
        let repo = git2::Repository::open(&repo_path)?;
        repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?;

        let local_head = repo.head()?.peel_to_commit()?;
        let remote_head = repo
            .find_branch("origin/master", git2::BranchType::Remote)?
            .into_reference()
            .peel_to_commit()?;

        if local_head.id() != remote_head.id() {
            repo.reset(remote_head.as_object(), git2::ResetType::Hard, None)?;
        }
    } else {
        // NOTE(review): this relies on the unauthenticated `git://` transport; if the
        // host stops serving it, an `https://` URL is the likely replacement, which
        // presumably also needs git2's `https` cargo feature — confirm before changing.
        git2::build::RepoBuilder::new()
            .fetch_options(fetch_opts)
            .clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?;
    }

    // The bar only exists if at least one progress callback fired.
    if let Some(bar) = progress_bar.get() {
        bar.finish();
    }

    Ok(repo_path)
}

/// Parses every whitelisted dataset archive found directly inside `path`.
///
/// Each whitelisted `.zip` file becomes one [`Dataset`] named after the file without
/// its `.zip` suffix. Every entry of an archive is read as a comma-separated list of
/// `u32` values (surrounding whitespace is trimmed) and converted into one bitmap with
/// `RoaringBitmap::from_sorted_iter`. The returned datasets are sorted by name.
///
/// Directory entries that are not whitelisted, are not regular files, or whose names
/// are not valid UTF-8 are skipped.
///
/// # Errors
///
/// Returns an error if the directory or an archive cannot be read, if an entry holds
/// bytes that are not valid UTF-8 or a token that does not parse as `u32`, or if
/// `from_sorted_iter` rejects the parsed values.
fn parse_datasets<P: AsRef<Path>>(path: P) -> Result<Vec<Dataset>, Box<dyn std::error::Error>> {
    const DATASET_FILENAME_WHITELIST: &[&str] = &[
        "census-income.zip",
        "census-income_srt.zip",
        "census1881.zip",
        "census1881_srt.zip",
        "weather_sept_85.zip",
        "weather_sept_85_srt.zip",
        "wikileaks-noquotes.zip",
        "wikileaks-noquotes_srt.zip",
    ];

    use indicatif::{ProgressBar, ProgressStyle};
    use std::io::BufRead;
    use zip::ZipArchive;

    let dir = path.as_ref().read_dir()?;

    let mut datasets = Vec::new();

    // Future work: Reuse this buffer to parse croaring bitmaps for comparison
    let mut numbers = Vec::new();

    for dir_entry_result in dir {
        let dir_entry = dir_entry_result?;
        let file_name = dir_entry.file_name();

        // A name that is not valid UTF-8 can never match the (UTF-8) whitelist,
        // so skip it instead of panicking.
        let file_name_str = match file_name.to_str() {
            Some(valid) => valid,
            None => continue,
        };

        // Check the whitelist first so that only candidate entries are stat'ed.
        if !DATASET_FILENAME_WHITELIST.contains(&file_name_str) {
            continue;
        }
        if !dir_entry.metadata()?.is_file() {
            continue;
        }

        let name = file_name_str.strip_suffix(".zip").unwrap_or(file_name_str).to_string();
        let mut zip = ZipArchive::new(File::open(dir_entry.path())?)?;

        // First pass: sum the uncompressed entry sizes to size the progress bar.
        let mut total_size = 0;
        for i in 0..zip.len() {
            total_size += zip.by_index(i)?.size();
        }

        let pb = ProgressBar::new(total_size)
            .with_style(
                ProgressStyle::default_bar()
                    .template(" {prefix:.green} [{bar}] {msg}")
                    .progress_chars("#> "),
            )
            .with_prefix("Parsing")
            .with_message(name.clone());

        // Second pass: one bitmap per archive entry.
        let mut bitmaps = Vec::with_capacity(zip.len());
        for i in 0..zip.len() {
            let file = zip.by_index(i)?;
            let size = file.size();
            let reader = BufReader::new(file);

            for token in reader.split(b',') {
                let text = String::from_utf8(token?)?;
                numbers.push(text.trim().parse::<u32>()?);
            }

            let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?;
            // Keep the allocation for the next entry.
            numbers.clear();
            bitmaps.push(bitmap);

            pb.inc(size);
        }

        pb.finish();
        datasets.push(Dataset { name, bitmaps });
    }

    // Directory iteration order is unspecified; sort for a stable benchmark order.
    datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name));
    println!();
    Ok(datasets)
}
Loading

0 comments on commit 31ed4ca

Please sign in to comment.