@@ -46,16 +46,16 @@ def pythia_tokenizer():


 @pytest.fixture(scope="module")
-def gemma_tokenizer():
+def olmo_tokenizer():
     """
-    Load gemma-2-2b-it tokenizer for testing.
+    Load OLMo-3-7B-Instruct tokenizer for testing.

     Properties:
-    - BOS token: '<bos>' (ID: 2)
+    - BOS token: '<|endoftext|>' (ID: 100257)
     - DOES add BOS by default (add_bos_token=True in tokenizer)
     - Used to test tokenizers that add BOS by default
     """
-    tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")
+    tokenizer = AutoTokenizer.from_pretrained("allenai/OLMo-3-7B-Instruct")
     # Set pad token to avoid padding errors in batch encoding
     tokenizer.pad_token = tokenizer.eos_token
     tokenizer.padding_side = "right"
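
The fixture's docstring pins down two properties of the replacement tokenizer: the BOS token text/ID and BOS-by-default behavior. A minimal sketch for sanity-checking them locally, assuming network access, the `transformers` package, and that the docstring's claims actually hold for `allenai/OLMo-3-7B-Instruct`:

```python
from transformers import AutoTokenizer

# Sanity-check the properties the fixture's docstring claims (assumption:
# the model repo is reachable and its tokenizer adds BOS by default).
tok = AutoTokenizer.from_pretrained("allenai/OLMo-3-7B-Instruct")
ids = tok.encode("Hello")  # add_special_tokens=True is the encode() default

print(tok.bos_token, tok.bos_token_id)  # per the docstring: '<|endoftext|>', 100257
assert ids[0] == tok.bos_token_id  # holds only if BOS is added by default
```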
@@ -150,14 +150,14 @@ def test_both_none_returns_empty(self):
 class TestDefaultsToNone:
     """Test that add_bos_token defaults to None, allowing tokenizer defaults."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_none_uses_tokenizer_default(self, tokenizer_name, request):
         """
         HF: When add_bos_token=None, should respect tokenizer's default.

         Tests both tokenizer types:
         - Pythia: Doesn't add BOS by default
-        - Gemma: DOES add BOS by default
+        - OLMo: DOES add BOS by default
         """
         tokenizer = request.getfixturevalue(tokenizer_name)
         mock_hflm = create_hf_mock(tokenizer, add_bos_token=None)
@@ -166,14 +166,14 @@ def test_huggingface_none_uses_tokenizer_default(self, tokenizer_name, request):
         expected = tokenizer.encode("Hello")
         assert result == expected

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_vllm_none_uses_tokenizer_default(self, tokenizer_name, request):
         """
         vLLM: When add_bos_token=None, should respect tokenizer's default.

         Tests both tokenizer types:
         - Pythia: Doesn't add BOS by default
-        - Gemma: DOES add BOS by default
+        - OLMo: DOES add BOS by default
         """
         tokenizer = request.getfixturevalue(tokenizer_name)
         mock_vllm = create_vllm_mock(tokenizer, add_bos_token=None)
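
The contract these two tests pin down is small: `add_bos_token=None` means "defer to the tokenizer", an explicit value overrides it. A hypothetical sketch of that dispatch (not the harness's actual implementation, whose real code paths sit behind `create_hf_mock`/`create_vllm_mock`):

```python
# Hypothetical sketch of the None-defers-to-tokenizer contract under test.
def encode_with_bos_policy(tokenizer, text: str, add_bos_token=None) -> list[int]:
    if add_bos_token is None:
        # Defer to the tokenizer default: Pythia adds no BOS, OLMo does.
        return tokenizer.encode(text)
    if add_bos_token:
        # Explicitly requested: force a BOS regardless of the default.
        ids = tokenizer.encode(text, add_special_tokens=False)
        return [tokenizer.bos_token_id] + ids
    # Explicitly disabled: no special tokens at all.
    return tokenizer.encode(text, add_special_tokens=False)
```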
@@ -191,7 +191,7 @@ def test_vllm_none_uses_tokenizer_default(self, tokenizer_name, request):
 class TestNoDuplicateBos:
     """Test that BOS tokens are never duplicated when already present."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_detects_bos_in_single_string(self, tokenizer_name, request):
         """HF: Should detect BOS prefix and avoid duplication."""
         tokenizer = request.getfixturevalue(tokenizer_name)
@@ -215,7 +215,7 @@ def test_huggingface_detects_bos_in_single_string(self, tokenizer_name, request):
         # Should avoid duplication (fewer or equal tokens)
         assert input_ids.shape[1] <= without_detection.shape[1]

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_adds_bos_when_missing(self, tokenizer_name, request):
         """HF: Should add BOS when string doesn't have it (using add_special_tokens=True)"""
         tokenizer = request.getfixturevalue(tokenizer_name)
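
The duplication guard these tests exercise boils down to: if the raw string already begins with the BOS token text (as chat-template output typically does), skip special-token insertion. A hypothetical helper illustrating the idea; the code path actually tested is the real implementation inside the mocked models:

```python
# Hypothetical illustration of the duplication guard under test: strings that
# already carry a textual BOS prefix are encoded without special tokens.
def encode_no_duplicate_bos(tokenizer, text: str) -> list[int]:
    bos = tokenizer.bos_token
    if bos is not None and text.startswith(bos):
        return tokenizer.encode(text, add_special_tokens=False)
    return tokenizer.encode(text)  # tokenizer default may still add BOS here
```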
@@ -228,13 +228,13 @@ def test_huggingface_adds_bos_when_missing(self, tokenizer_name, request):

         assert input_ids.tolist() == expected.tolist()

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_follows_tokenizer_default(self, tokenizer_name, request):
         """
         HF: When add_bos_token is not set (None), follows tokenizer default.

         - Pythia: Doesn't add BOS by default
-        - Gemma: DOES add BOS by default
+        - OLMo: DOES add BOS by default
         """
         tokenizer = request.getfixturevalue(tokenizer_name)
         mock_hflm = create_hf_mock(tokenizer, add_bos_token=None)
@@ -244,7 +244,7 @@ def test_huggingface_follows_tokenizer_default(self, tokenizer_name, request):

         assert input_ids.tolist() == expected.tolist()

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_vllm_handles_mixed_batch(self, tokenizer_name, add_bos_token, request):
         """
@@ -284,7 +284,7 @@ def test_vllm_handles_mixed_batch(self, tokenizer_name, add_bos_token, request):
         for i, exp in enumerate(expected):
             assert result[i] == exp

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_vllm_preserves_order_in_mixed_batch(
         self, tokenizer_name, add_bos_token, request
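
The two vLLM batch tests check the same guard applied element-wise: BOS detection must be per string, and results must come back in input order even when BOS-prefixed and plain strings are interleaved. In terms of the hypothetical helper sketched above:

```python
# Element-wise application of the guard; the result order mirrors the input,
# which is what test_vllm_preserves_order_in_mixed_batch asserts.
def encode_batch(tokenizer, texts: list[str]) -> list[list[int]]:
    return [encode_no_duplicate_bos(tokenizer, t) for t in texts]
```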
@@ -328,7 +328,7 @@ def test_vllm_preserves_order_in_mixed_batch(
 class TestChatTemplateCompatibility:
     """Test that chat templates (which add BOS) work without duplication."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     def test_huggingface_chat_template_no_duplicate_bos(self, tokenizer_name, request):
         """
         HF: Chat template adds BOS, tokenizer should not add another.
@@ -352,7 +352,7 @@ def test_huggingface_chat_template_no_duplicate_bos(self, tokenizer_name, request):

         assert torch.equal(input_ids, expected)

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_vllm_mixed_chat_batch(self, tokenizer_name, add_bos_token, request):
         """
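
The chat-template tests rely on the standard `transformers` flow: render the template to text (the rendered string usually already contains BOS), then encode without adding special tokens again. A minimal sketch of that flow, assuming `tokenizer` is one of the fixtures above and its chat template emits a BOS token:

```python
# Minimal sketch of the chat-template path covered here (assumes the
# tokenizer ships a chat template that already emits a BOS token).
messages = [{"role": "user", "content": "Hello"}]
rendered = tokenizer.apply_chat_template(messages, tokenize=False)
input_ids = tokenizer.encode(rendered, add_special_tokens=False)  # no second BOS
```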
@@ -424,7 +424,7 @@ def test_huggingface_seq2seq_skips_causal_bos_logic(self, pythia_tokenizer):
 class TestLoglikelihoodBosHandling:
     """Test BOS handling in loglikelihood method."""

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_empty_context_continuation_with_bos(
         self, tokenizer_name, add_bos_token, request
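
The loglikelihood tests revolve around one convention: when the context is empty, a prefix token (typically BOS, falling back to EOS) stands in for it, so the continuation's first token still has something to condition on. A hedged sketch of that convention; the real request-splitting logic is what the captured requests verify:

```python
# Hedged sketch of the empty-context convention the tests below verify;
# the real splitting logic is exercised via the capture_and_return hook.
def split_for_loglikelihood(tokenizer, context: str, continuation: str):
    continuation_enc = tokenizer.encode(continuation, add_special_tokens=False)
    if context == "":
        # Empty context: substitute a prefix token (BOS if set, else EOS).
        prefix = (
            tokenizer.bos_token_id
            if tokenizer.bos_token_id is not None
            else tokenizer.eos_token_id
        )
        return [prefix], continuation_enc
    return tokenizer.encode(context), continuation_enc
```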
@@ -479,7 +479,7 @@ def capture_and_return(reqs, disable_tqdm=False):
         )
         assert continuation_enc == continuation_without_bos[1:]  # Skip the BOS token

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_empty_context_continuation_without_bos(
         self, tokenizer_name, add_bos_token, request
@@ -523,7 +523,7 @@ def capture_and_return(reqs, disable_tqdm=False):
         expected_continuation = tokenizer.encode(continuation, add_special_tokens=False)
         assert continuation_enc == expected_continuation

-    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "gemma_tokenizer"])
+    @pytest.mark.parametrize("tokenizer_name", ["pythia_tokenizer", "olmo_tokenizer"])
     @pytest.mark.parametrize("add_bos_token", [None, True])
     def test_context_with_bos_prefix(self, tokenizer_name, add_bos_token, request):
         """When context starts with BOS (e.g., from chat template), should not duplicate BOS."""