Skip to content

Commit e0dbec3

Browse files
MaxwellDeJong and rasbt authored
Fix encoding of multiple preceding spaces in BPE tokenizer. (#945)
* Fix encoding of multiple preceding spaces in BPE tokenizer. * Add test --------- Co-authored-by: rasbt <[email protected]>
1 parent 90e0f3c commit e0dbec3

File tree

2 files changed

+13
-2
lines changed

2 files changed

+13
-2
lines changed

ch02/05_bpe-from-scratch/bpe-from-scratch.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,9 +609,9 @@
609609
" else:\n",
610610
" word = m.group(2)\n",
611611
" if pending_spaces > 0:\n",
612-
" tokens.append(\"Ġ\" + word) # one leading space\n",
613612
" for _ in range(pending_spaces - 1):\n",
614613
" tokens.append(\"Ġ\") # remaining spaces as standalone\n",
614+
" tokens.append(\"Ġ\" + word) # one leading space\n",
615615
" pending_spaces = 0\n",
616616
" else:\n",
617617
" tokens.append(word)\n",

ch02/05_bpe-from-scratch/tests.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,4 +238,15 @@ def test_space_newline_space_patterns(imported_module, gpt2_files):
238238
"Hello\n world",
239239
]
240240
for s in samples:
241-
assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
241+
assert tok.encode(s) == tik.encode(s), f"Mismatch vs tiktoken: {repr(s)}"
242+
243+
244+
def test_multiple_leading_spaces_roundtrip(imported_module, gpt2_files):
245+
BPETokenizerSimple = getattr(imported_module, "BPETokenizerSimple", None)
246+
tok = BPETokenizerSimple()
247+
tok.load_vocab_and_merges_from_openai(
248+
vocab_path=gpt2_files["encoder.json"], bpe_merges_path=gpt2_files["vocab.bpe"]
249+
)
250+
251+
text = " Hello World."
252+
assert tok.decode(tok.encode(text)) == text

0 commit comments

Comments (0)