|
| 1 | +use std::env; |
| 2 | +use std::fs::File; |
| 3 | +use std::io::BufReader; |
| 4 | +use std::path::{Path, PathBuf}; |
| 5 | + |
| 6 | +use git2::FetchOptions; |
| 7 | +use once_cell::sync::OnceCell as SyncOnceCell; |
| 8 | + |
| 9 | +use roaring::RoaringBitmap; |
| 10 | + |
| 11 | +static INSTANCE: SyncOnceCell<Vec<Dataset>> = SyncOnceCell::new(); |
| 12 | + |
| 13 | +pub struct Datasets; |
| 14 | + |
| 15 | +pub struct DatasetsIter { |
| 16 | + iter: std::slice::Iter<'static, Dataset>, |
| 17 | +} |
| 18 | + |
| 19 | +impl Iterator for DatasetsIter { |
| 20 | + type Item = &'static Dataset; |
| 21 | + |
| 22 | + fn next(&mut self) -> Option<Self::Item> { |
| 23 | + self.iter.next() |
| 24 | + } |
| 25 | +} |
| 26 | + |
| 27 | +impl IntoIterator for Datasets { |
| 28 | + type Item = &'static Dataset; |
| 29 | + type IntoIter = DatasetsIter; |
| 30 | + |
| 31 | + fn into_iter(self) -> Self::IntoIter { |
| 32 | + DatasetsIter { |
| 33 | + iter: INSTANCE |
| 34 | + .get_or_init(|| { |
| 35 | + init_datasets().and_then(parse_datasets).expect("a collection of datasets") |
| 36 | + }) |
| 37 | + .iter(), |
| 38 | + } |
| 39 | + } |
| 40 | +} |
| 41 | + |
| 42 | +pub struct Dataset { |
| 43 | + pub name: String, |
| 44 | + pub bitmaps: Vec<RoaringBitmap>, |
| 45 | +} |
| 46 | + |
| 47 | +fn init_datasets() -> Result<PathBuf, Box<dyn std::error::Error>> { |
| 48 | + let out_dir = env::var_os("CARGO_MANIFEST_DIR").ok_or(env::VarError::NotPresent)?; |
| 49 | + |
| 50 | + let out_path = Path::new(&out_dir); |
| 51 | + let repo_path = out_path.join("real-roaring-datasets"); |
| 52 | + |
| 53 | + // Check if in offline mode |
| 54 | + |
| 55 | + let offline = env::var("ROARINGRS_BENCH_OFFLINE"); |
| 56 | + match offline { |
| 57 | + Ok(value) => { |
| 58 | + if value.parse::<bool>()? { |
| 59 | + return Ok(repo_path); |
| 60 | + } |
| 61 | + } |
| 62 | + Err(ref err) => match err { |
| 63 | + env::VarError::NotPresent => (), |
| 64 | + _ => { |
| 65 | + offline?; |
| 66 | + } |
| 67 | + }, |
| 68 | + }; |
| 69 | + |
| 70 | + // Setup progress callbacks |
| 71 | + |
| 72 | + let pb_cell = once_cell::unsync::OnceCell::new(); |
| 73 | + let mut cb = git2::RemoteCallbacks::new(); |
| 74 | + |
| 75 | + cb.transfer_progress(|progress| { |
| 76 | + let pb = pb_cell.get_or_init(|| { |
| 77 | + indicatif::ProgressBar::new(progress.total_objects() as u64) |
| 78 | + .with_style( |
| 79 | + indicatif::ProgressStyle::default_bar() |
| 80 | + .template(&format!( |
| 81 | + "{{prefix}}{{msg:.cyan/blue}} [{{bar}}] {{pos}}/{}", |
| 82 | + progress.total_objects() |
| 83 | + )) |
| 84 | + .progress_chars("#> "), |
| 85 | + ) |
| 86 | + .with_prefix(" ") |
| 87 | + .with_message("Receiving objects") |
| 88 | + }); |
| 89 | + |
| 90 | + pb.set_position((progress.local_objects() + progress.received_objects()) as u64); |
| 91 | + true |
| 92 | + }); |
| 93 | + |
| 94 | + let mut fetch_opts = FetchOptions::new(); |
| 95 | + fetch_opts.remote_callbacks(cb); |
| 96 | + |
| 97 | + // Do update |
| 98 | + |
| 99 | + if !Path::new(&repo_path).exists() { |
| 100 | + git2::build::RepoBuilder::new() |
| 101 | + .fetch_options(fetch_opts) |
| 102 | + .clone("git://github.com/RoaringBitmap/real-roaring-datasets.git", &repo_path)?; |
| 103 | + } else { |
| 104 | + let repo = git2::Repository::open(&repo_path)?; |
| 105 | + repo.find_remote("origin")?.fetch(&["master"], Some(&mut fetch_opts), None)?; |
| 106 | + |
| 107 | + let head = repo.head()?.peel_to_commit()?; |
| 108 | + let origin_master_head = repo |
| 109 | + .find_branch("origin/master", git2::BranchType::Remote)? |
| 110 | + .into_reference() |
| 111 | + .peel_to_commit()?; |
| 112 | + |
| 113 | + if head.id() != origin_master_head.id() { |
| 114 | + repo.reset(origin_master_head.as_object(), git2::ResetType::Hard, None)?; |
| 115 | + } |
| 116 | + } |
| 117 | + |
| 118 | + if let Some(pb) = pb_cell.get() { |
| 119 | + pb.finish() |
| 120 | + } |
| 121 | + |
| 122 | + Ok(repo_path) |
| 123 | +} |
| 124 | + |
| 125 | +fn parse_datasets<P: AsRef<Path>>(path: P) -> Result<Vec<Dataset>, Box<dyn std::error::Error>> { |
| 126 | + const DATASET_FILENAME_WHITELIST: &[&str] = &[ |
| 127 | + "census-income.zip", |
| 128 | + "census-income_srt.zip", |
| 129 | + "census1881.zip", |
| 130 | + "census1881_srt.zip", |
| 131 | + "weather_sept_85.zip", |
| 132 | + "weather_sept_85_srt.zip", |
| 133 | + "wikileaks-noquotes.zip", |
| 134 | + "wikileaks-noquotes_srt.zip", |
| 135 | + ]; |
| 136 | + |
| 137 | + use indicatif::{ProgressBar, ProgressStyle}; |
| 138 | + use std::io::BufRead; |
| 139 | + use zip::ZipArchive; |
| 140 | + |
| 141 | + let dir = path.as_ref().read_dir()?; |
| 142 | + |
| 143 | + let mut datasets = Vec::new(); |
| 144 | + |
| 145 | + // Future work: Reuse this buffer to parse croaring bitmaps for comparison |
| 146 | + let mut numbers = Vec::new(); |
| 147 | + |
| 148 | + for dir_entry_result in dir { |
| 149 | + let dir_entry = dir_entry_result?; |
| 150 | + let metadata = dir_entry.metadata()?; |
| 151 | + let file_name = dir_entry.file_name(); |
| 152 | + // TODO dont panic |
| 153 | + let file_name_str = file_name.to_str().expect("utf-8 filename"); |
| 154 | + |
| 155 | + if metadata.is_file() && DATASET_FILENAME_WHITELIST.contains(&file_name_str) { |
| 156 | + let file = File::open(dir_entry.path())?; |
| 157 | + let name = file_name_str.split_at(file_name_str.len() - ".zip".len()).0.to_string(); |
| 158 | + |
| 159 | + let mut zip = ZipArchive::new(file)?; |
| 160 | + |
| 161 | + let mut total_size = 0; |
| 162 | + for i in 0..zip.len() { |
| 163 | + let file = zip.by_index(i)?; |
| 164 | + total_size += file.size(); |
| 165 | + } |
| 166 | + |
| 167 | + let pb = ProgressBar::new(total_size) |
| 168 | + .with_style( |
| 169 | + ProgressStyle::default_bar() |
| 170 | + .template(" {prefix:.green} [{bar}] {msg}") |
| 171 | + .progress_chars("#> "), |
| 172 | + ) |
| 173 | + .with_prefix("Parsing") |
| 174 | + .with_message(name.clone()); |
| 175 | + |
| 176 | + let mut bitmaps = Vec::with_capacity(zip.len()); |
| 177 | + for i in 0..zip.len() { |
| 178 | + let file = zip.by_index(i)?; |
| 179 | + let size = file.size(); |
| 180 | + let buf = BufReader::new(file); |
| 181 | + |
| 182 | + for bytes in buf.split(b',') { |
| 183 | + let bytes = bytes?; |
| 184 | + let str = String::from_utf8(bytes)?; |
| 185 | + let n = str.trim().parse::<u32>()?; |
| 186 | + numbers.push(n); |
| 187 | + } |
| 188 | + |
| 189 | + let bitmap = RoaringBitmap::from_sorted_iter(numbers.iter().copied())?; |
| 190 | + numbers.clear(); |
| 191 | + bitmaps.push(bitmap); |
| 192 | + |
| 193 | + pb.set_position(pb.position() + size); |
| 194 | + } |
| 195 | + |
| 196 | + pb.finish(); |
| 197 | + datasets.push(Dataset { name, bitmaps }); |
| 198 | + } |
| 199 | + } |
| 200 | + datasets.sort_unstable_by(|a, b| a.name.cmp(&b.name)); |
| 201 | + println!(); |
| 202 | + Ok(datasets) |
| 203 | +} |
0 commit comments