1
1
#[ cfg( test) ]
2
2
mod tests {
3
- use std:: time:: Instant ;
4
-
5
3
use itertools:: Itertools ;
6
4
use rand:: { thread_rng, Rng } ;
7
- use tiktoken_rs:: { cl100k_base_singleton, o200k_base_singleton } ;
5
+ use tiktoken_rs:: cl100k_base_singleton;
8
6
9
7
use bpe:: appendable_encoder:: AppendableEncoder ;
10
- use bpe:: byte_pair_encoding:: { create_test_string , BytePairEncoding } ;
8
+ use bpe:: byte_pair_encoding:: { create_test_bytes , BytePairEncoding } ;
11
9
use bpe:: interval_encoding:: IntervalEncoding ;
12
10
use bpe:: prependable_encoder:: PrependableEncoder ;
13
- use bpe_openai:: { cl100k_base, o200k_base } ;
11
+ use bpe_openai:: cl100k_base;
14
12
15
13
/// This test produces the output for the encoding example in the README.
16
14
#[ test]
@@ -72,93 +70,64 @@ mod tests {
72
70
fn test_appendable_encoder ( ) {
73
71
let bpe = & cl100k_base ( ) . bpe ;
74
72
let mut enc = AppendableEncoder :: new ( bpe) ;
75
- let input_string = create_test_string ( bpe, 100 ) ;
76
- for ( i, b) in input_string . as_bytes ( ) . iter ( ) . enumerate ( ) {
73
+ let input = create_test_bytes ( bpe, 100 ) ;
74
+ for ( i, b) in input . iter ( ) . enumerate ( ) {
77
75
enc. push ( * b) ;
78
- assert_eq ! (
79
- enc. token_count( ) ,
80
- bpe. count( & input_string. as_bytes( ) [ 0 ..i + 1 ] )
81
- ) ;
76
+ assert_eq ! ( enc. token_count( ) , bpe. count( & input[ 0 ..i + 1 ] ) ) ;
82
77
}
83
78
}
84
79
85
80
/// Encodes one fixed input with the tiktoken cl100k tokenizer and with each of
/// the three `bpe` encoders (backtracking, table, bitfield) and requires every
/// `bpe` result to equal the tiktoken result.
#[test]
fn test_correctness() {
    // This is quite a challenging test case...
    const RAW: &[u8] = &[
        125, 34, 10, 10, 46, 109, 107, 100, 105, 114, 115, 32, 102, 100, 115, 32, 97, 100, 105,
        112, 105, 115, 105, 99, 105, 110, 103, 105, 116, 121, 69, 110, 103, 105, 110, 101, 32,
        69, 67, 105, 114, 105, 101, 32, 111, 112, 116, 105, 109, 97, 108, 95, 68, 65, 32, 111,
        102, 102, 101, 110, 100,
    ];
    let input = std::str::from_utf8(RAW).unwrap();
    let bpe = &cl100k_base().bpe;

    // Reference tokenization produced by tiktoken.
    let expected = cl100k_base_singleton()
        .lock()
        .encode_ordinary(input)
        .into_iter()
        .collect_vec();

    // Each encoder is run and checked in turn, so a failure points at the
    // first encoder that disagrees with the reference.
    let bytes = input.as_bytes();
    let via_backtracking = bpe.encode_via_backtracking(bytes);
    assert_eq!(expected, via_backtracking);
    let via_table = bpe.encode_via_table(bytes);
    assert_eq!(expected, via_table);
    let via_bitfield = bpe.encode_via_bitfield(bytes);
    assert_eq!(expected, via_bitfield);
}
136
103
137
104
/// Generates several test inputs per size and checks that the backtracking,
/// bitfield and table encoders all return the same token sequence for each.
#[test]
fn test_bpe_equivalence() {
    let bpe = &cl100k_base().bpe;
    for size in [10, 1000, 10000] {
        // Eight independently generated inputs for every size.
        for _round in 0..8 {
            let input = create_test_bytes(bpe, size);
            let backtracking = bpe.encode_via_backtracking(&input);
            let bitfield = bpe.encode_via_bitfield(&input);
            // On mismatch the message reports both token counts.
            assert_eq!(
                backtracking,
                bitfield,
                "{} {}",
                backtracking.len(),
                bitfield.len()
            );
            let table = bpe.encode_via_table(&input);
            assert_eq!(
                backtracking,
                table,
                "{} {}",
                backtracking.len(),
                table.len()
            );
        }
    }
}
149
118
150
119
/// Builds an `IntervalEncoding` over one test input and, for 1000 randomly
/// chosen sub-ranges, checks that `count` on the range equals the number of
/// tokens the backtracking encoder produces for the same slice.
#[test]
fn test_interval_count() {
    let bpe = &cl100k_base().bpe;
    let input = create_test_bytes(bpe, 10000);
    let intervals = IntervalEncoding::new(bpe, &input);
    let mut rng = thread_rng();
    for _ in 0..1000 {
        // Draw two positions and order them so the range is never reversed.
        let first = rng.gen_range(0..input.len());
        let second = rng.gen_range(0..input.len());
        let (lo, hi) = if first <= second {
            (first, second)
        } else {
            (second, first)
        };
        let counted = intervals.count(lo..hi);
        let expected = bpe.encode_via_backtracking(&input[lo..hi]).len();
        assert_eq!(counted, expected);
    }
}
@@ -167,10 +136,10 @@ mod tests {
167
136
fn test_prependable_encoder ( ) {
168
137
let bpe = & cl100k_base ( ) . bpe ;
169
138
let mut enc = PrependableEncoder :: new ( bpe) ;
170
- let input_string = create_test_string ( bpe, 100 ) ;
171
- for ( i, b) in input_string . as_bytes ( ) . iter ( ) . enumerate ( ) . rev ( ) {
139
+ let input = create_test_bytes ( bpe, 100 ) ;
140
+ for ( i, b) in input . iter ( ) . enumerate ( ) . rev ( ) {
172
141
enc. push ( * b) ;
173
- assert_eq ! ( enc. token_count( ) , bpe. count( & input_string . as_bytes ( ) [ i..] ) ) ;
142
+ assert_eq ! ( enc. token_count( ) , bpe. count( & input [ i..] ) ) ;
174
143
}
175
144
}
176
145
}
0 commit comments