Skip to content

Commit ba2eb04

Browse files
bors[bot]saik0
andauthored
186: add runtime dataset fetch and parse in-place r=Kerollmops a=saik0 Closes RoaringBitmap#129 Closes RoaringBitmap#171 Closes RoaringBitmap#185 Here's my go at fetching the datasets at runtime * Datasets are lazily fetched the first time they're needed (or updated, if local `HEAD != origin/master`). * The zip files are parsed in place on every benchmark run, to keep the on-disk size down. * The parsing is also lazy, and happens at most once. * This PR updates any benchmarks that were already using limited data from `wikileaks-noquotes` to use all the datasets. * A fast follow PR will update all the benchmarks. `@Kerollmops` Third time's the charm? Co-authored-by: saik0 <[email protected]> Co-authored-by: Joel Pedraza <[email protected]>
2 parents 9748254 + 6dbbd80 commit ba2eb04

File tree

8 files changed

+384
-317
lines changed

8 files changed

+384
-317
lines changed

.github/workflows/test.yml

Lines changed: 59 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -19,66 +19,98 @@ jobs:
1919
- beta
2020
- nightly
2121
- 1.56.1
22+
env:
23+
RUSTFLAGS: "-C target-cpu=native -C opt-level=3"
24+
ROARINGRS_BENCH_OFFLINE: "true"
2225

2326
steps:
24-
- uses: actions/checkout@v2
27+
- name: Checkout roaring-rs
28+
uses: actions/checkout@v2
2529

26-
- uses: actions-rs/toolchain@v1
30+
- name: Checkout benchmark datasets
31+
uses: actions/checkout@v2
32+
with:
33+
repository: "RoaringBitmap/real-roaring-datasets"
34+
path: "benchmarks/real-roaring-datasets"
35+
36+
- name: Initialize rust toolchain
37+
uses: actions-rs/toolchain@v1
2738
with:
2839
profile: minimal
2940
toolchain: ${{ matrix.rust }}
3041
override: true
3142
components: rustfmt, clippy
3243

33-
- uses: actions-rs/cargo@v1
44+
- name: Fetch
45+
uses: actions-rs/cargo@v1
46+
with:
47+
command: fetch
48+
49+
- name: Fetch benchmarks
50+
uses: actions-rs/cargo@v1
51+
with:
52+
command: fetch
53+
args: --manifest-path benchmarks/Cargo.toml
54+
55+
- name: Build
56+
uses: actions-rs/cargo@v1
3457
with:
3558
command: build
59+
args: --all-targets
3660

37-
- uses: actions-rs/cargo@v1
61+
- name: Build benchmarks
62+
uses: actions-rs/cargo@v1
3863
with:
39-
command: test
64+
command: build
65+
args: --manifest-path benchmarks/Cargo.toml --all-targets
4066

41-
- uses: actions-rs/cargo@v1
67+
- name: Check
68+
uses: actions-rs/cargo@v1
4269
with:
43-
command: test
44-
args: --benches --manifest-path benchmarks/Cargo.toml
70+
command: clippy
71+
args: --all-targets -- -D warnings
4572

46-
- uses: actions-rs/cargo@v1
73+
- name: Check benchmarks
74+
uses: actions-rs/cargo@v1
75+
with:
76+
command: clippy
77+
args: --manifest-path benchmarks/Cargo.toml --all-targets -- -D warnings
78+
79+
- name: Check formatting
80+
uses: actions-rs/cargo@v1
4781
with:
4882
command: fmt
4983
args: -- --check
5084

51-
- uses: actions-rs/cargo@v1
85+
- name: Check benchmark formatting
86+
uses: actions-rs/cargo@v1
5287
with:
5388
command: fmt
5489
args: --manifest-path benchmarks/Cargo.toml -- --check
5590

56-
- uses: actions-rs/cargo@v1
91+
- name: Test
92+
uses: actions-rs/cargo@v1
5793
with:
58-
command: clippy
59-
args: --all-targets -- -D warnings
60-
simd:
61-
name: SIMD Feature
62-
runs-on: ubuntu-latest
63-
64-
steps:
65-
- uses: actions/checkout@v2
94+
command: test
6695

67-
- uses: actions-rs/toolchain@v1
96+
- name: Test benchmarks
97+
uses: actions-rs/cargo@v1
6898
with:
69-
profile: minimal
70-
toolchain: nightly
71-
override: true
72-
components: rustfmt, clippy
99+
command: test
100+
args: --manifest-path benchmarks/Cargo.toml --benches
73101

74-
- uses: actions-rs/cargo@v1
102+
- name: SIMD test
103+
if: matrix.rust == 'nightly'
104+
uses: actions-rs/cargo@v1
75105
with:
76106
toolchain: nightly
77107
command: test
78108
args: --features "simd"
79109

80-
- uses: actions-rs/cargo@v1
110+
- name: SIMD test benchmarks
111+
if: matrix.rust == 'nightly'
112+
uses: actions-rs/cargo@v1
81113
with:
82114
toolchain: nightly
83115
command: test
84-
args: --features "simd" --benches --manifest-path benchmarks/Cargo.toml
116+
args: --manifest-path benchmarks/Cargo.toml --features "simd" --benches

.gitignore

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,2 @@
11
/target
22
/Cargo.lock
3-
4-
# This is generated by the benchmarks crate build script, do not version with git.
5-
/benchmarks/benches/datasets_paths.rs
6-
/benchmarks/target
7-
/benchmarks/Cargo.lock

benchmarks/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/target
2+
/Cargo.lock
3+
/real-roaring-datasets

benchmarks/Cargo.toml

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,11 @@ publish = false
1010
roaring = { path = ".." }
1111

1212
[dev-dependencies]
13+
once_cell = "1.9"
14+
git2 = { version = "0.13", default-features = false, features = ["vendored-openssl"] }
15+
zip = { version = "0.5", default-features = false, features = ["deflate"] }
16+
indicatif = "0.16"
1317
criterion = { version = "0.3", features = ["html_reports"] }
14-
quickcheck = "0.9"
15-
quickcheck_macros = "0.9"
16-
17-
[build-dependencies]
18-
anyhow = "1.0"
19-
bytes = "1.0"
20-
convert_case = "0.4"
21-
reqwest = { version = "0.11.3", features = ["blocking", "rustls-tls"], default-features = false }
22-
zip = "0.5.12"
2318

2419
[features]
2520
simd = ["roaring/simd"]

benchmarks/benches/datasets.rs

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
use std::env;
2+
use std::fs::File;
3+
use std::io::BufReader;
4+
use std::path::{Path, PathBuf};
5+
6+
use git2::FetchOptions;
7+
use once_cell::sync::OnceCell as SyncOnceCell;
8+
9+
use roaring::RoaringBitmap;
10+
11+
/// Process-wide cache of the parsed datasets; filled at most once, by the first
/// `Datasets::into_iter` call, and borrowed for `'static` afterwards.
static INSTANCE: SyncOnceCell<Vec<Dataset>> = SyncOnceCell::new();
12+
13+
/// Zero-sized handle to the benchmark datasets; iterate it (`for dataset in Datasets`)
/// to obtain `&'static Dataset` items. The datasets are loaded on first iteration.
pub struct Datasets;
14+
15+
pub struct DatasetsIter {
16+
iter: std::slice::Iter<'static, Dataset>,
17+
}
18+
19+
impl Iterator for DatasetsIter {
20+
type Item = &'static Dataset;
21+
22+
fn next(&mut self) -> Option<Self::Item> {
23+
self.iter.next()
24+
}
25+
}
26+
27+
impl IntoIterator for Datasets {
28+
type Item = &'static Dataset;
29+
type IntoIter = DatasetsIter;
30+
31+
fn into_iter(self) -> Self::IntoIter {
32+
DatasetsIter {
33+
iter: INSTANCE
34+
.get_or_init(|| {
35+
init_datasets().and_then(parse_datasets).expect("a collection of datasets")
36+
})
37+
.iter(),
38+
}
39+
}
40+
}
41+
42+
pub struct Dataset {
43+
pub name: String,
44+
pub bitmaps: Vec<RoaringBitmap>,
45+
}
46+
47+
fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> {
48+
let out_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?;
49+
50+
let out_path = Path::new(&out_dir);
51+
let repo_path = out_path.join("real-roaring-datasets");
52+
53+
// Check if in offline mode
54+
55+
let offline = env::var("ROARINGRS_BENCH_OFFLINE");
56+
match offline {
57+
Ok(value) => {
58+
if value.parse::<bool>()? {
59+
return Ok(repo_path);
60+
}
61+
}
62+
Err(ref err) => match err {
63+
env::VarError::NotPresent => (),
64+
_ => {
65+
offline?;
66+
}
67+
},
68+
};
69+
70+
// Setup progress callbacks
71+
72+
let pb_cell = once_cell::unsync::OnceCell::new();
73+
let mut cb = git2::RemoteCallbacks::new();
74+
75+
cb.transfer_progress(|progress| {
76+
let pb = pb_cell.get_or_init(|| {
77+
indicatif::ProgressBar::new(progress.total_objects() as u64)
78+
.with_style(
79+
indicatif::ProgressStyle::default_bar()
80+
.template(&format!(
81+
"{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}",
82+
progress.total_objects()
83+
))
84+
.progress_chars("#> "),
85+
)
86+
.with_prefix(" ")
87+
.with_message("Receiving objects")
88+
});
89+
90+
pb.set_position((progress.local_objects() + progress.received_objects()) as u64);
91+
true
92+
});
93+
94+
let mut fetch_opts = FetchOptions::new();
95+
fetch_opts.remote_callbacks(cb);
96+
97+
// Do update
98+
99+
if !Path::new(&repo_path).exists() {
100+
git2::build::RepoBuilder::new()
101+
.fetch_options(fetch_opts)
102+
.clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?;
103+
} else {
104+
let repo = git2::Repository::open(&repo_path)?;
105+
repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?;
106+
107+
let head = repo.head()?.peel_to_commit()?;
108+
let origin_master_head = repo
109+
.find_branch("origin/master", git2::BranchType::Remote)?
110+
.into_reference()
111+
.peel_to_commit()?;
112+
113+
if head.id() != origin_master_head.id() {
114+
repo.reset(origin_master_head.as_object(), git2::ResetType::Hard, None)?;
115+
}
116+
}
117+
118+
if let Some(pb) = pb_cell.get() {
119+
pb.finish()
120+
}
121+
122+
Ok(repo_path)
123+
}
124+
125+
fn parse_datasets<P: AsRef<Path>>(path: P) -> Result<Vec<Dataset>, Box<dyn std::error::Error>> {
126+
const DATASET_FILENAME_WHITELIST: &[&str] = &[
127+
"census-income.zip",
128+
"census-income_srt.zip",
129+
"census1881.zip",
130+
"census1881_srt.zip",
131+
"weather_sept_85.zip",
132+
"weather_sept_85_srt.zip",
133+
"wikileaks-noquotes.zip",
134+
"wikileaks-noquotes_srt.zip",
135+
];
136+
137+
use indicatif::{ProgressBar, ProgressStyle};
138+
use std::io::BufRead;
139+
use zip::ZipArchive;
140+
141+
let dir = path.as_ref().read_dir()?;
142+
143+
let mut datasets = Vec::new();
144+
145+
// Future work: Reuse this buffer to parse croaring bitmaps for comparison
146+
let mut numbers = Vec::new();
147+
148+
for dir_entry_result in dir {
149+
let dir_entry = dir_entry_result?;
150+
let metadata = dir_entry.metadata()?;
151+
let file_name = dir_entry.file_name();
152+
// TODO dont panic
153+
let file_name_str = file_name.to_str().expect("utf-8 filename");
154+
155+
if metadata.is_file() && DATASET_FILENAME_WHITELIST.contains(&file_name_str) {
156+
let file = File::open(dir_entry.path())?;
157+
let name = file_name_str.split_at(file_name_str.len() - ".zip".len()).0.to_string();
158+
159+
let mut zip = ZipArchive::new(file)?;
160+
161+
let mut total_size = 0;
162+
for i in 0..zip.len() {
163+
let file = zip.by_index(i)?;
164+
total_size += file.size();
165+
}
166+
167+
let pb = ProgressBar::new(total_size)
168+
.with_style(
169+
ProgressStyle::default_bar()
170+
.template(" {prefix:.green} [{bar}] {msg}")
171+
.progress_chars("#> "),
172+
)
173+
.with_prefix("Parsing")
174+
.with_message(name.clone());
175+
176+
let mut bitmaps = Vec::with_capacity(zip.len());
177+
for i in 0..zip.len() {
178+
let file = zip.by_index(i)?;
179+
let size = file.size();
180+
let buf = BufReader::new(file);
181+
182+
for bytes in buf.split(b',') {
183+
let bytes = bytes?;
184+
let str = String::from_utf8(bytes)?;
185+
let n = str.trim().parse::<u32>()?;
186+
numbers.push(n);
187+
}
188+
189+
let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?;
190+
numbers.clear();
191+
bitmaps.push(bitmap);
192+
193+
pb.set_position(pb.position() + size);
194+
}
195+
196+
pb.finish();
197+
datasets.push(Dataset { name, bitmaps });
198+
}
199+
}
200+
datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name));
201+
println!();
202+
Ok(datasets)
203+
}

0 commit comments

Comments
 (0)