Commit 8ecf192

Merge pull request #29 from github/intern-tiktoken-data
Upgrade tiktoken-rs
2 parents 7d7cad4 + ed45357 commit 8ecf192

22 files changed: +325 -342 lines

Cargo.toml (+1)

```diff
@@ -3,6 +3,7 @@
 members = [
     "crates/*",
     "crates/bpe/benchmarks",
+    "crates/bpe/tests",
 ]
 resolver = "2"
```

crates/bpe-openai/Cargo.toml (+5 -5)

```diff
@@ -17,13 +17,13 @@ bpe = { version = "0.1.0", path = "../bpe" }
 either = "1.13"
 fancy-regex = "0.13"
 rmp-serde = "1"
-serde = { version = "1" }
 
 [dev-dependencies]
-tiktoken-rs = { version = "0.5" }
+tiktoken-rs = "0.6"
 
 [build-dependencies]
-bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken-rs"] }
+base64 = "0.22.1"
+bpe = { version = "0.1.0", path = "../bpe", features = ["tiktoken"] }
+flate2 = "1.0"
 rmp-serde = "1"
-tiktoken-rs = { version = "0.5" }
-serde = { version = "1" }
+serde = "1"
```

crates/bpe-openai/build.rs (+16 -30)

```diff
@@ -1,51 +1,37 @@
 use std::env;
 use std::fs::File;
+use std::io::Read;
 use std::path::PathBuf;
 
-use bpe::byte_pair_encoding::BytePairEncoding;
+use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};
 use serde::Serialize;
-use tiktoken_rs::CoreBPE;
 
 fn main() {
-    serialize_tokens(
-        "r50k",
-        &tiktoken_rs::r50k_base().expect("tiktoken initialization must not fail!"),
-        50256,
-        1,
-    );
-    serialize_tokens(
-        "p50k",
-        &tiktoken_rs::p50k_base().expect("tiktoken initialization must not fail!"),
-        50280,
-        1,
-    );
-    serialize_tokens(
-        "cl100k",
-        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
-        100256,
-        17846336922010275747,
-    );
-    serialize_tokens(
-        "cl100k",
-        &tiktoken_rs::cl100k_base().expect("tiktoken initialization must not fail!"),
-        100256,
+    serialize_tiktoken_bpe("r50k_base", include_bytes!("data/r50k_base.tiktoken.gz"), 1);
+    serialize_tiktoken_bpe("p50k_base", include_bytes!("data/p50k_base.tiktoken.gz"), 1);
+    serialize_tiktoken_bpe(
+        "cl100k_base",
+        include_bytes!("data/cl100k_base.tiktoken.gz"),
         17846336922010275747,
     );
-    serialize_tokens(
-        "o200k",
-        &tiktoken_rs::o200k_base().expect("tiktoken initialization must not fail!"),
-        199998,
+    serialize_tiktoken_bpe(
+        "o200k_base",
+        include_bytes!("data/o200k_base.tiktoken.gz"),
         17846336922010275747,
     );
     println!("cargo::rerun-if-changed=build.rs");
 }
 
-fn serialize_tokens(name: &str, bpe: &CoreBPE, num_tokens: usize, hash_factor: u64) {
+fn serialize_tiktoken_bpe(name: &str, data: &[u8], hash_factor: u64) {
+    let mut dec = flate2::read::GzDecoder::new(data);
+    let mut tiktoken = String::new();
+    dec.read_to_string(&mut tiktoken).expect("can decode data");
+    let tokens = read_tiktoken(&tiktoken).expect("can read data");
     let mut path = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR is set during build"));
     path.push(format!("bpe_{name}.dict"));
     let file = File::create(path).expect("can create output file");
     let mut serializer = rmp_serde::Serializer::new(file);
-    let bpe = BytePairEncoding::from_tiktoken(bpe, num_tokens, Some(hash_factor));
+    let bpe = BytePairEncoding::from_dictionary(tokens, Some(hash_factor));
    bpe.serialize(&mut serializer)
         .expect("serialization succeeds");
 }
```
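For context on what `read_tiktoken` consumes: a `.tiktoken` file is plain text with one `base64(token_bytes) rank` pair per line. The sketch below is a hypothetical parser for that format under that assumption; `parse_tiktoken` and its signature are invented for illustration and are not the crate's actual implementation.

```rust
use base64::prelude::{Engine, BASE64_STANDARD};

/// Hypothetical parser for the tiktoken dictionary format:
/// one "base64(token_bytes) rank" pair per non-empty line.
/// Returns the token byte strings ordered by rank.
fn parse_tiktoken(data: &str) -> Option<Vec<Vec<u8>>> {
    let mut tokens: Vec<(u32, Vec<u8>)> = data
        .lines()
        .filter(|line| !line.is_empty())
        .map(|line| {
            let (b64, rank) = line.split_once(' ')?;
            Some((rank.parse().ok()?, BASE64_STANDARD.decode(b64).ok()?))
        })
        .collect::<Option<_>>()?;
    // Ranks are the token ids; the dictionary is ordered by them.
    tokens.sort_by_key(|(rank, _)| *rank);
    Some(tokens.into_iter().map(|(_, bytes)| bytes).collect())
}
```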
Four binary files not shown (758 KB, 1.62 MB, 359 KB, and 359 KB): the gzipped `.tiktoken` dictionaries under `crates/bpe-openai/data/` that build.rs embeds for cl100k_base, o200k_base, p50k_base, and r50k_base.

crates/bpe-openai/src/lib.rs (+22 -23)

```diff
@@ -4,29 +4,29 @@ use bpe::byte_pair_encoding::BytePairEncoding;
 use either::Either;
 use fancy_regex::Regex;
 
-static BPE_R50K: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k.dict"));
+static BPE_R50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_r50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
     Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_P50K: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k.dict"));
+static BPE_P50K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_p50k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+";
     Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_CL100K: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k.dict"));
+static BPE_CL100K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_cl100k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+";
     Tokenizer::new(bpe, Some(pat)).expect("valid regex")
 });
 
-static BPE_O200K: LazyLock<Tokenizer> = LazyLock::new(|| {
-    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k.dict"));
+static BPE_O200K_BASE: LazyLock<Tokenizer> = LazyLock::new(|| {
+    let bytes = include_bytes!(concat!(env!("OUT_DIR"), "/bpe_o200k_base.dict"));
     let bpe = rmp_serde::from_slice(bytes).expect("valid bpe data");
     let pat = [
         "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
@@ -91,20 +91,20 @@ impl Tokenizer {
     }
 }
 
-pub fn r50k() -> &'static Tokenizer {
-    &BPE_R50K
+pub fn r50k_base() -> &'static Tokenizer {
+    &BPE_R50K_BASE
 }
 
-pub fn p50k() -> &'static Tokenizer {
-    &BPE_P50K
+pub fn p50k_base() -> &'static Tokenizer {
+    &BPE_P50K_BASE
 }
 
-pub fn cl100k() -> &'static Tokenizer {
-    &BPE_CL100K
+pub fn cl100k_base() -> &'static Tokenizer {
+    &BPE_CL100K_BASE
 }
 
-pub fn o200k() -> &'static Tokenizer {
-    &BPE_O200K
+pub fn o200k_base() -> &'static Tokenizer {
+    &BPE_O200K_BASE
 }
 
 #[cfg(test)]
@@ -115,22 +115,22 @@ mod tests {
 
     #[test]
     fn can_load_r50k() {
-        r50k().count("");
+        r50k_base().count("");
     }
 
     #[test]
     fn can_load_p50k() {
-        p50k().count("");
+        p50k_base().count("");
     }
 
     #[test]
     fn can_load_cl100k() {
-        cl100k().count("");
+        cl100k_base().count("");
     }
 
     #[test]
     fn can_load_o200k() {
-        o200k().count("");
+        o200k_base().count("");
     }
 
     /// Test demonstrating a case where input splitting makes a difference.
@@ -142,13 +142,12 @@ mod tests {
             .lock()
             .encode_ordinary(text)
             .into_iter()
-            .map(|i| i as u32)
             .collect();
 
-        let without_splitting = BPE_CL100K.bpe.encode_via_backtracking(input);
+        let without_splitting = BPE_CL100K_BASE.bpe.encode_via_backtracking(input);
         assert_ne!(without_splitting, expected);
 
-        let with_splitting: Vec<_> = BPE_CL100K.encode(text);
+        let with_splitting: Vec<_> = BPE_CL100K_BASE.encode(text);
         assert_eq!(with_splitting, expected);
     }
 }
```
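With these renames the accessors line up with tiktoken's dictionary names. A small usage sketch, under the assumption (supported by the tests above) that `count` returns a token count and `encode` returns the token ids:

```rust
fn main() {
    let tok = bpe_openai::cl100k_base();
    // `count` and `encode` as exercised by the tests in this diff.
    let n = tok.count("Hello, world!");
    let ids = tok.encode("Hello, world!");
    println!("{n} tokens: {ids:?}");
}
```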

crates/bpe/CONTRIBUTING.md (+39)

```diff
@@ -0,0 +1,39 @@
+# Contributing
+
+Here are specific details that are useful when you want to contribute to the BPE crates.
+Make sure to read the repository's [contribution guidelines][contributing] as well.
+
+## Project structure
+
+This project has a slightly unusual structure to resolve some dependency issues.
+
+- This directory contains `bpe`, the BPE code itself.
+- A sibling directory contains `bpe-openai`, which exposes tokenizers for OpenAI token sets, and depends on `bpe`.
+- Tests are located in the `tests` subdirectory, and benchmarks in the `benchmarks` subdirectory. Both of these are separate crates so they can depend on `bpe-openai` without causing a cyclic dependency.
+
+Only the `bpe` and `bpe-openai` crates are meant to be published. The other ones are for development use only.
+
+## Running benchmarks
+
+Change the working directory to the `benchmarks` directory:
+
+```sh
+cd benchmarks
+```
+
+Run the benchmark as follows (requires [cargo-criterion](https://crates.io/crates/cargo-criterion) to be installed):
+
+```sh
+cargo criterion
+```
+
+(Using `cargo bench` ignores the settings in `criterion.toml`!)
+Open the full report which should be located in `target/criterion/reports/index.html`.
+
+Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` to be installed):
+
+```sh
+script/copy-results
+```
+
+[contributing]: ../../CONTRIBUTING.md
```

crates/bpe/Cargo.toml (+7 -4)

```diff
@@ -14,16 +14,19 @@ bench = false
 
 [features]
 rand = ["dep:rand"]
-tiktoken-rs = ["dep:tiktoken-rs"]
+tiktoken = ["dep:base64"]
 
 [dependencies]
 aneubeck-daachorse = "1.1.1"
+base64 = { version = "0.22", optional = true }
 fnv = "1.0"
 itertools = "0.12"
 rand = { version = "0.8", optional = true }
-rmp-serde = "1"
 serde = { version = "1", features = ["derive"] }
-tiktoken-rs = { version = "0.5", optional = true }
 
 [dev-dependencies]
-bpe = { path = ".", features = ["rand", "tiktoken-rs"] }
+bpe = { path = "." }
+tiktoken-rs = "0.6"
+
+[package.metadata.docs.rs]
+all-features = true
```
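The `tiktoken` feature now gates only the lightweight dictionary parsing (hence `dep:base64`) rather than an optional `tiktoken-rs` dependency. A sketch of what a consumer with that feature enabled might do, using the two functions that build.rs calls above (passing `None` for the hash factor is an assumption; build.rs passes explicit values):

```rust
use bpe::byte_pair_encoding::{read_tiktoken, BytePairEncoding};

// Requires the `bpe` crate with its `tiktoken` feature enabled.
fn bpe_from_tiktoken(tiktoken_text: &str) -> BytePairEncoding {
    let tokens = read_tiktoken(tiktoken_text).expect("valid tiktoken data");
    BytePairEncoding::from_dictionary(tokens, None)
}
```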

crates/bpe/README.md (-23)

```diff
@@ -296,26 +296,3 @@ The performance of tiktoken shows a quadratic growth with the input size.
 The Huggingface encoder scales better, but becomes slower and slower compared to our implementation as input size increases.
 
 ![worst-case encoding runtime comparison](./images/performance-worstcase.svg)
-
-### Running the benchmarks
-
-Benchmarks are located in a separate crate in the `benchmarks` directory.
-
-```sh
-cd benchmarks
-```
-
-Run the benchmark as follows (required [cargo-criterion](https://crates.io/crates/cargo-criterion) installed):
-
-```sh
-cargo criterion
-```
-
-(Using `cargo bench` ignores the settings in `criterion.toml`!)
-Open the full report which should be located in `target/criterion/reports/index.html`.
-
-Update the figures in this repo as follows (requires `rsvg-convert` from `librsvg` installed):
-
-```sh
-script/copy-results
-```
```

crates/bpe/benchmarks/Cargo.toml (+3 -2)

```diff
@@ -18,9 +18,10 @@ path = "equivalence.rs"
 test = true
 
 [dependencies]
-bpe = { path = "../../bpe", features = ["rand", "tiktoken-rs"] }
+bpe = { path = "../../bpe" }
 bpe-openai = { path = "../../bpe-openai" }
+bpe-tests = { path = "../tests" }
 criterion = "0.5"
 rand = "0.8"
-tiktoken-rs = "0.5"
+tiktoken-rs = "0.6"
 tokenizers = { version = "0.20", features = ["http"] }
```

crates/bpe/benchmarks/equivalence.rs (+4 -4)

```diff
@@ -16,7 +16,7 @@ fn test_encoding_equivalence_without_pretokenization() {
     for input in inputs {
         let text = std::str::from_utf8(input).unwrap();
         let out = bpe.bpe.encode_via_backtracking(input);
-        let huggingface_out: Vec<_> = huggingface
+        let huggingface_out = huggingface
             .encode_fast(text, false)
             .unwrap()
             .get_ids()
@@ -52,10 +52,10 @@ fn test_encoding_equivalence_with_pretokenization() {
     for input in inputs {
         let text = std::str::from_utf8(input).unwrap();
         let out = bpe.encode(text);
-        let tiktoken_out: Vec<_> = tiktoken.encode_ordinary(text);
-        let tiktoken_out2: Vec<_> = tiktoken_out.iter().map(|i| *i as u32).collect();
+        let tiktoken_out = tiktoken.encode_ordinary(text);
+        let tiktoken_out2 = tiktoken_out.to_vec();
         let tiktoken_text = tiktoken.decode(tiktoken_out.clone()).unwrap();
-        let huggingface_out: Vec<_> = huggingface
+        let huggingface_out = huggingface
             .encode_fast(text, false)
             .unwrap()
             .get_ids()
```
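The dropped `as u32` conversions and the plain `to_vec()` here (and in the lib.rs test above) reflect the tiktoken-rs 0.6 upgrade, whose encoders appear to return `u32` ranks directly rather than `usize`, so the explicit type annotations and conversions are no longer needed.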

crates/bpe/benchmarks/lib.rs (+2 -2)

```diff
@@ -18,13 +18,13 @@ pub static TOKENIZERS: LazyLock<
     [
         (
             "cl100k",
-            bpe_openai::cl100k(),
+            bpe_openai::cl100k_base(),
             tiktoken_rs::cl100k_base().expect("tokenizer available"),
             HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4", None).expect("model available"),
         ),
         (
             "o200k",
-            bpe_openai::o200k(),
+            bpe_openai::o200k_base(),
             tiktoken_rs::o200k_base().expect("tokenizer available"),
             HuggingfaceTokenizer::from_pretrained("Xenova/gpt-4o", None).expect("model available"),
         ),
```
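The `TOKENIZERS` table pairs each of our tokenizers with its tiktoken-rs and Huggingface counterparts. A hypothetical consumer loop over it; `compare_token_counts` is invented for illustration, while the tuple layout and method calls follow this diff and equivalence.rs:

```rust
use bpe_benchmarks::TOKENIZERS;

// Hypothetical helper comparing token counts across the three implementations.
fn compare_token_counts(text: &str) {
    for (name, ours, tiktoken, huggingface) in TOKENIZERS.iter() {
        let a = ours.count(text);
        let b = tiktoken.encode_ordinary(text).len();
        let c = huggingface
            .encode_fast(text, false)
            .expect("encoding works")
            .get_ids()
            .len();
        println!("{name}: ours={a} tiktoken={b} huggingface={c}");
    }
}
```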

crates/bpe/benchmarks/performance.rs (+1 -1)

```diff
@@ -1,9 +1,9 @@
 use std::time::Duration;
 
 use bpe::appendable_encoder::AppendableEncoder;
-use bpe::byte_pair_encoding::create_test_bytes;
 use bpe::interval_encoding::IntervalEncoding;
 use bpe_benchmarks::*;
+use bpe_tests::create_test_bytes;
 use criterion::{
     criterion_group, criterion_main, AxisScale, BenchmarkId, Criterion, PlotConfiguration,
 };
```

crates/bpe/src/appendable_encoder.rs (-18)

```diff
@@ -87,21 +87,3 @@ impl<'a> AppendableEncoder<'a> {
         self.states.is_empty()
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use crate::byte_pair_encoding::{create_test_bytes, BPE_CL100K};
-
-    use super::AppendableEncoder;
-
-    #[test]
-    fn test_appendable_encoder() {
-        let bpe = &BPE_CL100K;
-        let mut enc = AppendableEncoder::new(bpe);
-        let input_string = create_test_bytes(bpe, 100);
-        for (i, c) in input_string.iter().enumerate() {
-            assert_eq!(enc.token_count(), bpe.count(&input_string[0..i]));
-            enc.push(*c);
-        }
-    }
-}
```
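This test is presumably not lost: `create_test_bytes` moved out of the `bpe` crate (see the new `bpe_tests::create_test_bytes` import in performance.rs above), so encoder tests like this one can now live in the separate `tests` crate described in the new CONTRIBUTING.md.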
