Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make features configurable #59

Merged
merged 8 commits into from
Mar 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Consider adding "--codegen=link-args=-Wl,--compress-debug-sections=zlib"

[target.x86_64-unknown-linux-gnu]
# SSE3 is required by simd-varint.
# POPCNT makes `count_ones` (which we use in geofilter and bitrank) more efficient.
rustflags = ["-C", "target-feature=+ssse3,+avx2,+popcnt"]

[target.x86_64-apple-darwin]
# SSE3 is required by simd-varint.
# POPCNT makes `count_ones` (which we use in geofilter and bitrank) more efficient.
rustflags = ["-C", "target-feature=+ssse3,+avx2,+popcnt"]

[target.aarch64-apple-darwin]
rustflags = ["-C", "target-feature=+neon"]
2 changes: 1 addition & 1 deletion crates/string-offsets/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "string-offsets"
authors = ["The blackbird team <[email protected]>"]
version = "0.1.0"
version = "0.2.0"
edition = "2021"
description = "Converts string offsets between UTF-8 bytes, UTF-16 code units, Unicode code points, and lines."
repository = "https://github.com/github/rust-gems"
Expand Down
30 changes: 24 additions & 6 deletions crates/string-offsets/benchmarks/performance.rs
Original file line number Diff line number Diff line change
@@ -1,20 +1,38 @@
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use rand::{rng, Rng};
use string_offsets::StringOffsets;
use string_offsets::{AllConfig, OnlyLines, StringOffsets};

fn construction_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("construction");
fn only_lines_construction_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("only_lines_construction");
for size in [1000, 10000, 100000] {
let mut rng = rng();
// Generate random ascii input.
let random_input: String = (0..size)
.map(|_| rng.random_range(32u8..128) as char)
.map(|_| rng.random_range(32u8..128u8) as char)
.collect();
group.throughput(criterion::Throughput::Bytes(random_input.len() as u64));
group.bench_with_input(
BenchmarkId::from_parameter(size),
&random_input,
|b, input| b.iter(|| black_box(StringOffsets::new(input))),
|b, input| b.iter(|| black_box(StringOffsets::<OnlyLines>::new(input))),
);
}
group.finish();
}

fn full_construction_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("full_construction");
for size in [1000, 10000, 100000] {
let mut rng = rng();
// Generate random ascii input.
let random_input: String = (0..size)
.map(|_| rng.random_range(32u8..128u8) as char)
.collect();
group.throughput(criterion::Throughput::Bytes(random_input.len() as u64));
group.bench_with_input(
BenchmarkId::from_parameter(size),
&random_input,
|b, input| b.iter(|| black_box(StringOffsets::<AllConfig>::new(input))),
);
}
group.finish();
Expand All @@ -23,6 +41,6 @@ fn construction_benchmark(c: &mut Criterion) {
criterion_group!(
name = benches;
config = Criterion::default();
targets = construction_benchmark
targets = only_lines_construction_benchmark, full_construction_benchmark
);
criterion_main!(benches);
70 changes: 33 additions & 37 deletions crates/string-offsets/src/bitrank.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,6 @@ struct Block {

impl Block {
/// Set a bit without updating `self.sub_blocks`.
///
/// This panics if the bit was already set, because that indicates that the original positions
/// list is invalid/had duplicates.
fn set(&mut self, index: usize) {
debug_assert!(index < BITS_PER_BLOCK);
let chunk_idx = index / BITS_PER_SUB_BLOCK;
Expand All @@ -52,11 +49,7 @@ impl Block {
self.bits[chunk_idx] |= mask;
}

/// The **total rank** of the block relative local index, and the index of the one
/// bit that establishes that rank (aka "select") **if** it occurs within that same
/// chunk, otherwise ['None']. The assumption is that if you would have to look back
/// through previous chunks it would actually be cheaper to do a lookup in the original
/// data structure that the bit vector was created from.
/// The **total rank** of the block relative local index.
fn rank(&self, local_idx: usize) -> usize {
let mut rank = self.rank as usize;
let sub_block = local_idx / BITS_PER_SUB_BLOCK;
Expand All @@ -65,11 +58,7 @@ impl Block {
let remainder = local_idx % BITS_PER_SUB_BLOCK;

let last_chunk = local_idx / BITS_PER_SUB_BLOCK;
let masked = if remainder == 0 {
0
} else {
self.bits[last_chunk] << (BITS_PER_SUB_BLOCK - remainder)
};
let masked = self.bits[last_chunk] & !(SubblockBits::MAX << remainder);
rank + masked.count_ones() as usize
}

Expand Down Expand Up @@ -176,42 +165,52 @@ mod tests {

/// Creates a `BitRank` containing the integers in `iter` (which should be strictly
/// increasing).
pub fn bitrank<I: IntoIterator<Item = usize>>(capacity: usize, iter: I) -> BitRank {
let mut builder = BitRankBuilder::with_capacity(capacity);
for position in iter {
builder.push(position);
pub fn bitrank<I>(iter: I) -> BitRank
where
I: IntoIterator<Item = usize>,
I::IntoIter: DoubleEndedIterator,
{
let mut iter = iter.into_iter().rev();
if let Some(last) = iter.next() {
let mut builder = BitRankBuilder::with_capacity(last + 1);
builder.push(last);
for position in iter {
builder.push(position);
}
builder.finish()
} else {
BitRank { blocks: vec![] }
}
builder.finish()
}

#[test]
fn test_rank_zero() {
let br = bitrank(1, [0]);
let br = bitrank([0]);
assert_eq!(br.rank(0), 0);
assert_eq!(br.rank(1), 1);
}

#[test]
fn test_empty() {
let br = bitrank(0, []);
let br = bitrank([]);
assert!(br.blocks.is_empty());
}

#[test]
fn test_index_out_of_bounds() {
let br = bitrank(BITS_PER_BLOCK, [BITS_PER_BLOCK - 1]);
let br = bitrank([BITS_PER_BLOCK - 1]);
assert_eq!(br.rank(BITS_PER_BLOCK), 1);
}

#[test]
#[should_panic]
fn test_duplicate_position() {
bitrank(91, [64, 66, 68, 68, 90]);
bitrank([64, 66, 68, 68, 90]);
}

#[test]
fn test_rank_exclusive() {
let br = bitrank(133, 0..132);
let br = bitrank(0..132);
assert_eq!(br.blocks.len(), 1);
assert_eq!(br.rank(64), 64);
assert_eq!(br.rank(132), 132);
Expand All @@ -221,37 +220,37 @@ mod tests {
fn test_rank() {
let mut positions: Vec<usize> = (0..132).collect();
positions.append(&mut vec![138usize, 140, 146]);
let br = bitrank(146, positions);
let br = bitrank(positions);
assert_eq!(br.rank(135), 132);

let br2 = bitrank(BITS_PER_BLOCK, 0..BITS_PER_BLOCK - 5);
let br2 = bitrank(0..BITS_PER_BLOCK - 5);
assert_eq!(br2.rank(169), 169);

let br3 = bitrank(BITS_PER_BLOCK + 6, 0..BITS_PER_BLOCK + 5);
let br3 = bitrank(0..BITS_PER_BLOCK + 5);
assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK);
}

#[test]
fn test_rank_idx() {
let mut positions: Vec<usize> = (0..132).collect();
positions.append(&mut vec![138usize, 140, 146]);
let br = bitrank(147, positions);
let br = bitrank(positions);
assert_eq!(br.rank(135), 132);

let bits2: Vec<usize> = (0..BITS_PER_BLOCK - 5).collect();
let br2 = bitrank(BITS_PER_BLOCK, bits2);
let br2 = bitrank(bits2);
assert_eq!(br2.rank(169), 169);

let bits3: Vec<usize> = (0..BITS_PER_BLOCK + 5).collect();
let br3 = bitrank(BITS_PER_BLOCK + 6, bits3);
let br3 = bitrank(bits3);
assert_eq!(br3.rank(BITS_PER_BLOCK), BITS_PER_BLOCK);

let bits4: Vec<usize> = vec![1, 1000, 7777, BITS_PER_BLOCK + 1];
let br4 = bitrank(BITS_PER_BLOCK + 1, bits4);
let br4 = bitrank(bits4);
assert_eq!(br4.rank(8000), 3);

let bits5: Vec<usize> = vec![1, 1000, 7777, BITS_PER_BLOCK + 1];
let br5 = bitrank(BITS_PER_BLOCK + 1, bits5);
let br5 = bitrank(bits5);
assert_eq!(br5.rank(BITS_PER_BLOCK), 3);
}

Expand All @@ -267,7 +266,7 @@ mod tests {
// This isn't strictly necessary, given that the bit would just be toggled again, but it
// ensures that we are meeting the contract.
random_bits.dedup();
let br = bitrank(1_000_000, random_bits.iter().copied());
let br = bitrank(random_bits.iter().copied());
let mut rank = 0;
for i in 0..random_bits.capacity() {
assert_eq!(br.rank(i), rank);
Expand All @@ -282,7 +281,7 @@ mod tests {
#[test]
fn test_rank_out_of_bounds() {
for i in 1..30 {
let br = bitrank(BITS_PER_BLOCK * i, [BITS_PER_BLOCK * i - 1]);
let br = bitrank([BITS_PER_BLOCK * i - 1]);
assert_eq!(br.max_rank(), 1);
assert_eq!(br.rank(BITS_PER_BLOCK * i - 1), 0);
for j in 0..10 {
Expand All @@ -293,10 +292,7 @@ mod tests {

#[test]
fn test_large_gap() {
let br = bitrank(
BITS_PER_BLOCK * 16,
(3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17),
);
let br = bitrank((3..4).chain(BITS_PER_BLOCK * 15..BITS_PER_BLOCK * 15 + 17));
for i in 1..15 {
assert_eq!(br.rank(BITS_PER_BLOCK * i), 1);
}
Expand Down
50 changes: 50 additions & 0 deletions crates/string-offsets/src/config.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
//! Configuration types for enabling/disabling features at compile time.
//!
//! Disabling features lets the compiler generate faster code, which can be important for certain use cases.
//! Certain implementations/conversion operations will only be available if the corresponding features were enabled.

/// Type-level boolean.
///
/// Implementors are marker types that expose a compile-time `bool` through
/// the associated constant [`Bool::VALUE`].
pub trait Bool {
/// The `bool` value this marker type stands for.
const VALUE: bool;
}
/// Type-level true: the [`Bool`] marker type whose `VALUE` is `true`.
pub struct True {}
/// Type-level false: the [`Bool`] marker type whose `VALUE` is `false`.
pub struct False {}
// Binds the marker type `True` to the constant `true`.
impl Bool for True {
const VALUE: bool = true;
}
// Binds the marker type `False` to the constant `false`.
impl Bool for False {
const VALUE: bool = false;
}

/// Configures which features should be enabled for a [`StringOffsets`] instance.
///
/// Each feature is switched on or off at the type level: the corresponding
/// associated type must implement [`Bool`], e.g. [`True`] or [`False`].
pub trait ConfigType {
/// Whether to enable character conversions.
type HasChars: Bool;
/// Whether to enable UTF-16 conversions.
type HasUtf16: Bool;
/// Whether to enable line conversions.
type HasLines: Bool;
/// Whether to enable whitespace checks.
type HasWhitespace: Bool;
}

/// Configuration type that enables all features.
///
/// Every associated type of [`ConfigType`] is set to [`True`].
pub struct AllConfig {}
impl ConfigType for AllConfig {
// Character, UTF-16, line and whitespace features are all switched on.
type HasChars = True;
type HasUtf16 = True;
type HasLines = True;
type HasWhitespace = True;
}

/// Configuration type that only enables line conversions.
///
/// `HasLines` is [`True`]; every other associated type of [`ConfigType`] is [`False`].
pub struct OnlyLines {}
impl ConfigType for OnlyLines {
type HasChars = False;
type HasUtf16 = False;
// The single feature left switched on in this configuration.
type HasLines = True;
type HasWhitespace = False;
}
Loading