-
Notifications
You must be signed in to change notification settings - Fork 93
add runtime dataset fetch and parse in-place #186
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
bors
merged 14 commits into
RoaringBitmap:master
from
saik0:bench-datasets-fetch-runtime
Feb 9, 2022
Merged
Changes from 8 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
d1fa477
add runtime dataset fetch and parse in-place
saik0 7461720
remove build-deps and build.rs
saik0 37c10a3
cleanup benchmark dependencies
saik0 7da34b7
fix benchmark warnings
saik0 1f6cc64
remove datasets_paths.rs file
saik0 6b420e0
add trailing newline to benchmark gitignore
saik0 e846983
remove benchmark src dir
saik0 684b4b2
add benchmark offline mode. use by CI.
saik0 eb3de40
add trailing newline to test.yml
saik0 a6e88f4
add fetch step to ci
saik0 3c63761
remove all-targets arg from fetch step
saik0 b99c445
add vendored ssl to git deps
saik0 68de418
add --benches flag to simd benchmark tests
saik0 dbfe350
Update benchmarks/benches/datasets.rs
saik0 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,2 @@ | ||
| /target | ||
| /Cargo.lock | ||
|
|
||
| # This is generated by the benchmarks crate build script, do not version with git. | ||
| /benchmarks/benches/datasets_paths.rs | ||
| /benchmarks/target | ||
| /benchmarks/Cargo.lock |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| /target | ||
| /Cargo.lock | ||
| /real-roaring-datasets |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,203 @@ | ||
| use std::env; | ||
| use std::fs::File; | ||
| use std::io::BufReader; | ||
| use std::path::{Path, PathBuf}; | ||
|
|
||
| use git2::FetchOptions; | ||
| use once_cell::sync::OnceCell as SyncOnceCell; | ||
|
|
||
| use roaring::RoaringBitmap; | ||
|
|
||
| static INSTANCE: SyncOnceCell<Vec<Dataset>> = SyncOnceCell::new(); | ||
|
|
||
| pub struct Datasets; | ||
|
|
||
| pub struct DatasetsIter { | ||
| iter: std::slice::Iter<'static, Dataset>, | ||
| } | ||
|
|
||
| impl Iterator for DatasetsIter { | ||
| type Item = &'static Dataset; | ||
|
|
||
| fn next(&mut self) -> Option<Self::Item> { | ||
| self.iter.next() | ||
| } | ||
| } | ||
|
|
||
| impl IntoIterator for Datasets { | ||
| type Item = &'static Dataset; | ||
| type IntoIter = DatasetsIter; | ||
|
|
||
| fn into_iter(self) -> Self::IntoIter { | ||
| DatasetsIter { | ||
| iter: INSTANCE | ||
| .get_or_init(|| { | ||
| init_datasets().and_then(parse_datasets).expect("a collection of datasets") | ||
| }) | ||
| .iter(), | ||
| } | ||
| } | ||
| } | ||
|
|
||
| pub struct Dataset { | ||
| pub name: String, | ||
| pub bitmaps: Vec<RoaringBitmap>, | ||
| } | ||
|
|
||
| fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> { | ||
| let out_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?; | ||
Kerollmops marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| let out_path = Path::new(&out_dir); | ||
| let repo_path = out_path.join("real-roaring-datasets"); | ||
Kerollmops marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| // Check if in offline mode | ||
|
|
||
| let offline = env::var("ROARINGRS_BENCH_OFFLINE"); | ||
| match offline { | ||
| Ok(value) => { | ||
| if value.parse::<bool>()? { | ||
| return Ok(repo_path); | ||
| } | ||
| } | ||
| Err(ref err) => match err { | ||
| env::VarError::NotPresent => (), | ||
| _ => { | ||
| offline?; | ||
| } | ||
| }, | ||
| }; | ||
|
|
||
| // Setup progress callbacks | ||
|
|
||
| let pb_cell = once_cell::unsync::OnceCell::new(); | ||
| let mut cb = git2::RemoteCallbacks::new(); | ||
|
|
||
| cb.transfer_progress(|progress| { | ||
| let pb = pb_cell.get_or_init(|| { | ||
| indicatif::ProgressBar::new(progress.total_objects() as u64) | ||
| .with_style( | ||
| indicatif::ProgressStyle::default_bar() | ||
| .template(&format!( | ||
| "{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}", | ||
| progress.total_objects() | ||
| )) | ||
| .progress_chars("#> "), | ||
| ) | ||
| .with_prefix(" ") | ||
| .with_message("Recieving objects") | ||
saik0 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| }); | ||
|
|
||
| pb.set_position((progress.local_objects() + progress.received_objects()) as u64); | ||
| true | ||
| }); | ||
|
|
||
| let mut fetch_opts = FetchOptions::new(); | ||
| fetch_opts.remote_callbacks(cb); | ||
|
|
||
| // Do update | ||
|
|
||
| if !Path::new(&repo_path).exists() { | ||
| git2::build::RepoBuilder::new() | ||
| .fetch_options(fetch_opts) | ||
| .clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?; | ||
| } else { | ||
| let repo = git2::Repository::open(&repo_path)?; | ||
| repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?; | ||
|
|
||
| let head = repo.head()?.peel_to_commit()?; | ||
| let origin_master_head = repo | ||
| .find_branch("origin/master", git2::BranchType::Remote)? | ||
| .into_reference() | ||
| .peel_to_commit()?; | ||
|
|
||
| if head.id() != origin_master_head.id() { | ||
| repo.reset(origin_master_head.as_object(), git2::ResetType::Hard, None)?; | ||
| } | ||
| } | ||
|
|
||
| if let Some(pb) = pb_cell.get() { | ||
| pb.finish() | ||
| } | ||
|
|
||
| Ok(repo_path) | ||
| } | ||
|
|
||
| fn parse_datasets<P: AsRef<Path>>(path: P) -> Result<Vec<Dataset>, Box<dyn std::error::Error>> { | ||
| const DATASET_FILENAME_WHITELIST: &[&str] = &[ | ||
| "census-income.zip", | ||
| "census-income_srt.zip", | ||
| "census1881.zip", | ||
| "census1881_srt.zip", | ||
Kerollmops marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| "weather_sept_85.zip", | ||
| "weather_sept_85_srt.zip", | ||
| "wikileaks-noquotes.zip", | ||
| "wikileaks-noquotes_srt.zip", | ||
| ]; | ||
|
|
||
| use indicatif::{ProgressBar, ProgressStyle}; | ||
| use std::io::BufRead; | ||
| use zip::ZipArchive; | ||
|
|
||
| let dir = path.as_ref().read_dir()?; | ||
|
|
||
| let mut datasets = Vec::new(); | ||
|
|
||
| // Future work: Reuse this buffer to parse croaring bitmaps for comparison | ||
| let mut numbers = Vec::new(); | ||
|
|
||
| for dir_entry_result in dir { | ||
| let dir_entry = dir_entry_result?; | ||
| let metadata = dir_entry.metadata()?; | ||
| let file_name = dir_entry.file_name(); | ||
| // TODO dont panic | ||
| let file_name_str = file_name.to_str().expect("utf-8 filename"); | ||
|
|
||
| if metadata.is_file() && DATASET_FILENAME_WHITELIST.contains(&file_name_str) { | ||
| let file = File::open(dir_entry.path())?; | ||
| let name = file_name_str.split_at(file_name_str.len() - ".zip".len()).0.to_string(); | ||
|
|
||
| let mut zip = ZipArchive::new(file)?; | ||
|
|
||
| let mut total_size = 0; | ||
| for i in 0..zip.len() { | ||
| let file = zip.by_index(i)?; | ||
| total_size += file.size(); | ||
| } | ||
|
|
||
| let pb = ProgressBar::new(total_size) | ||
| .with_style( | ||
| ProgressStyle::default_bar() | ||
| .template(" {prefix:.green} [{bar}] {msg}") | ||
| .progress_chars("#> "), | ||
| ) | ||
| .with_prefix("Parsing") | ||
| .with_message(name.clone()); | ||
|
|
||
| let mut bitmaps = Vec::with_capacity(zip.len()); | ||
| for i in 0..zip.len() { | ||
| let file = zip.by_index(i)?; | ||
| let size = file.size(); | ||
| let buf = BufReader::new(file); | ||
|
|
||
| for bytes in buf.split(b',') { | ||
| let bytes = bytes?; | ||
| let str = String::from_utf8(bytes)?; | ||
| let n = str.trim().parse::<u32>()?; | ||
| numbers.push(n); | ||
| } | ||
|
|
||
| let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?; | ||
| numbers.clear(); | ||
| bitmaps.push(bitmap); | ||
|
|
||
| pb.set_position(pb.position() + size); | ||
| } | ||
|
|
||
| pb.finish(); | ||
| datasets.push(Dataset { name, bitmaps }); | ||
| } | ||
| } | ||
| datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name)); | ||
| println!(); | ||
Kerollmops marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| Ok(datasets) | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.