Skip to content

Commit 3ff9de6

Browse files
Authored on Apr 16, 2024
Merge pull request #133 from Jonas-Heinrich/master
Modify benchmarks to compare against stdlib functions
2 parents baab923 + d6a6eb8 commit 3ff9de6

File tree

6 files changed

+154
-184
lines changed

6 files changed

+154
-184
lines changed
 

‎Cargo.toml

+3-4
Original file line numberDiff line numberDiff line change
[dev-dependencies]
quickcheck = "0.7"
# Criterion drives the benchmark harnesses declared below.
criterion = "0.5"

# Each benchmark opts out of the default libtest harness so Criterion
# can supply its own `main` via `criterion_main!`.
[[bench]]
name = "chars"
harness = false

[[bench]]
name = "words"
harness = false

[[bench]]
name = "word_bounds"
harness = false

‎benches/chars.rs

+60
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based
2+
//! `std::str::chars`.
3+
//!
4+
//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it
5+
//! does not consider the complexity of grapheme clusters. The question in this benchmark
6+
//! is how much slower full unicode handling is.
7+
8+
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
9+
use unicode_segmentation;
10+
11+
use std::fs;
12+
use unicode_segmentation::UnicodeSegmentation;
13+
14+
const FILES: &[&str] = &[
15+
"arabic",
16+
"english",
17+
"hindi",
18+
"japanese",
19+
"korean",
20+
"mandarin",
21+
"russian",
22+
"source_code",
23+
];
24+
25+
#[inline(always)]
26+
fn grapheme(text: &str) {
27+
for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
28+
black_box(c);
29+
}
30+
}
31+
32+
#[inline(always)]
33+
fn scalar(text: &str) {
34+
for c in black_box(&*text).chars() {
35+
black_box(c);
36+
}
37+
}
38+
39+
fn bench_all(c: &mut Criterion) {
40+
let mut group = c.benchmark_group("chars");
41+
42+
for file in FILES {
43+
group.bench_with_input(
44+
BenchmarkId::new("grapheme", file),
45+
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
46+
|b, content| b.iter(|| grapheme(content)),
47+
);
48+
}
49+
50+
for file in FILES {
51+
group.bench_with_input(
52+
BenchmarkId::new("scalar", file),
53+
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
54+
|b, content| b.iter(|| scalar(content)),
55+
);
56+
}
57+
}
58+
59+
criterion_group!(benches, bench_all);
60+
criterion_main!(benches);

‎benches/graphemes.rs

-63
This file was deleted.

‎benches/unicode_words.rs

-61
This file was deleted.

‎benches/word_bounds.rs

+32-56
Original file line numberDiff line numberDiff line change
@@ -1,61 +1,37 @@
1-
use criterion::{black_box, criterion_group, criterion_main, Criterion};
1+
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
22

33
use std::fs;
44
use unicode_segmentation::UnicodeSegmentation;
55

6-
fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
7-
let text = fs::read_to_string(path).unwrap();
8-
c.bench_function(&format!("word_bounds_{}", lang), |bench| {
9-
bench.iter(|| {
10-
for w in text.split_word_bounds() {
11-
black_box(w);
12-
}
13-
});
14-
});
15-
}
16-
17-
fn word_bounds_arabic(c: &mut Criterion) {
18-
word_bounds(c, "arabic", "benches/texts/arabic.txt");
19-
}
20-
21-
fn word_bounds_english(c: &mut Criterion) {
22-
word_bounds(c, "english", "benches/texts/english.txt");
23-
}
24-
25-
fn word_bounds_hindi(c: &mut Criterion) {
26-
word_bounds(c, "hindi", "benches/texts/hindi.txt");
27-
}
28-
29-
fn word_bounds_japanese(c: &mut Criterion) {
30-
word_bounds(c, "japanese", "benches/texts/japanese.txt");
31-
}
32-
33-
fn word_bounds_korean(c: &mut Criterion) {
34-
word_bounds(c, "korean", "benches/texts/korean.txt");
35-
}
36-
37-
fn word_bounds_mandarin(c: &mut Criterion) {
38-
word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
39-
}
40-
41-
fn word_bounds_russian(c: &mut Criterion) {
42-
word_bounds(c, "russian", "benches/texts/russian.txt");
43-
}
44-
45-
fn word_bounds_source_code(c: &mut Criterion) {
46-
word_bounds(c, "source_code", "benches/texts/source_code.txt");
47-
}
48-
49-
criterion_group!(
50-
benches,
51-
word_bounds_arabic,
52-
word_bounds_english,
53-
word_bounds_hindi,
54-
word_bounds_japanese,
55-
word_bounds_korean,
56-
word_bounds_mandarin,
57-
word_bounds_russian,
58-
word_bounds_source_code,
59-
);
60-
6+
const FILES: &[&str] = &[
7+
"arabic",
8+
"english",
9+
"hindi",
10+
"japanese",
11+
"korean",
12+
"mandarin",
13+
"russian",
14+
"source_code",
15+
];
16+
17+
#[inline(always)]
18+
fn grapheme(text: &str) {
19+
for w in text.split_word_bounds() {
20+
black_box(w);
21+
}
22+
}
23+
24+
fn bench_all(c: &mut Criterion) {
25+
let mut group = c.benchmark_group("word_bounds");
26+
27+
for file in FILES {
28+
group.bench_with_input(
29+
BenchmarkId::new("grapheme", file),
30+
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
31+
|b, content| b.iter(|| grapheme(content)),
32+
);
33+
}
34+
}
35+
36+
criterion_group!(benches, bench_all);
6137
criterion_main!(benches);

‎benches/words.rs

+59
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's UTF-8
2+
//! scalar-based `std::str::split_whitespace`.
3+
//!
4+
//! It is expected that `std::str::split_whitespace` is faster than
5+
//! `UnicodeSegmentation::unicode_words` since it does not consider the complexity of grapheme
6+
//! clusters. The question in this benchmark is how much slower full unicode handling is.
7+
8+
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
9+
10+
use std::fs;
11+
use unicode_segmentation::UnicodeSegmentation;
12+
13+
const FILES: &[&str] = &[
14+
"arabic",
15+
"english",
16+
"hindi",
17+
"japanese",
18+
"korean",
19+
"mandarin",
20+
"russian",
21+
"source_code",
22+
];
23+
24+
#[inline(always)]
25+
fn grapheme(text: &str) {
26+
for w in text.unicode_words() {
27+
black_box(w);
28+
}
29+
}
30+
31+
#[inline(always)]
32+
fn scalar(text: &str) {
33+
for w in text.split_whitespace() {
34+
black_box(w);
35+
}
36+
}
37+
38+
fn bench_all(c: &mut Criterion) {
39+
let mut group = c.benchmark_group("words");
40+
41+
for file in FILES {
42+
group.bench_with_input(
43+
BenchmarkId::new("grapheme", file),
44+
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
45+
|b, content| b.iter(|| grapheme(content)),
46+
);
47+
}
48+
49+
for file in FILES {
50+
group.bench_with_input(
51+
BenchmarkId::new("scalar", file),
52+
&fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
53+
|b, content| b.iter(|| scalar(content)),
54+
);
55+
}
56+
}
57+
58+
criterion_group!(benches, bench_all);
59+
criterion_main!(benches);

0 commit comments

Comments (0)
Please sign in to comment.