@@ -80,22 +80,41 @@ impl Tokenizer {
80
80
Ok ( Self { bpe, pre } )
81
81
}
82
82
83
+ /// Count the number of tokens produced when encoding the text. Applies pre-tokenization
84
+ /// before counting.
83
85
pub fn count ( & self , text : & str ) -> usize {
84
86
self . split ( text)
85
87
. map ( |piece| self . bpe . count ( piece. as_bytes ( ) ) )
86
88
. sum ( )
87
89
}
88
90
91
+ /// Returns the token count iff the total token count stays below the specified token_limit.
92
+ /// Otherwise, it returns none. This function can be faster than [`Self::count`]` when the
93
+ /// token limit is much smaller than the provided text. Applies pre-tokenization before counting.
94
+ pub fn count_till_limit ( & self , text : & str , token_limit : usize ) -> Option < usize > {
95
+ self . split ( text)
96
+ . try_fold ( token_limit, |token_limit, piece| {
97
+ self . bpe
98
+ . count_till_limit ( piece. as_bytes ( ) , token_limit)
99
+ . map ( |piece_count| token_limit - piece_count)
100
+ } )
101
+ }
102
+
103
+ /// Returns the tokens for the encoding of the given text. Applies pre-tokenization before
104
+ /// encoding.
89
105
pub fn encode ( & self , text : & str ) -> Vec < u32 > {
90
106
self . split ( text)
91
107
. flat_map ( |piece| self . bpe . encode_via_backtracking ( piece. as_bytes ( ) ) )
92
108
. collect ( )
93
109
}
94
-
110
+ /// Returns the text corresponding to the given encoding if it is valid UTF-8. Otherwise,
111
+ /// returns none.
95
112
pub fn decode ( & self , tokens : & [ u32 ] ) -> Option < String > {
96
113
String :: from_utf8 ( self . bpe . decode_tokens ( tokens) ) . ok ( )
97
114
}
98
115
116
+ /// Returns an iterator with the text pieces resulting from pre-tokenization. If this
117
+ /// tokenizer does not have pre-tokenization, the iterator returns the full text.
99
118
pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
100
119
match & self . pre {
101
120
Some ( pre) => Either :: Left ( pre. split ( text) ) ,
@@ -124,6 +143,7 @@ impl Pretokenizer {
124
143
Ok ( Self { pat, lookahead } )
125
144
}
126
145
146
+ /// Returns an iterator with the text pieces after splitting with the regular expression.
127
147
pub fn split < ' a > ( & ' a self , text : & ' a str ) -> impl Iterator < Item = & str > + ' a {
128
148
Splits {
129
149
pat : & self . pat ,
0 commit comments