Skip to content

Commit f63f04d

Browse files
authored
Fix BPE bonus materials (#561)
* Fix BPE bonus materials
* fix bpe implementation
* update
* Add 'Hello, world. Is this-- a test?' test case
* update link to test file
* update path handling
* update path handling
* fix pytest paths
1 parent 96ca2fc commit f63f04d

File tree

5 files changed: +305 −85 lines changed

.github/workflows/basic-tests-linux-uv.yml

+6
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,9 @@ jobs:
6060
pytest --ruff --nbval ch02/01_main-chapter-code/dataloader.ipynb
6161
pytest --ruff --nbval ch03/01_main-chapter-code/multihead-attention.ipynb
6262
pytest --ruff --nbval ch02/04_bonus_dataloader-intuition/dataloader-intuition.ipynb
63+
64+
- name: Test Selected Bonus Materials
65+
shell: bash
66+
run: |
67+
source .venv/bin/activate
68+
pytest ch02/05_bpe-from-scratch/tests/tests.py

.gitignore

+7-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
12
# Configs and keys
23
ch05/07_gpt_to_llama/config.json
34
ch07/02_dataset-utilities/config.json
@@ -63,13 +64,16 @@ ch07/01_main-chapter-code/Smalltestmodel-sft-standalone.pth
6364
ch07/01_main-chapter-code/gpt2/
6465

6566
# Datasets
67+
the-verdict.txt
68+
6669
appendix-E/01_main-chapter-code/sms_spam_collection.zip
6770
appendix-E/01_main-chapter-code/sms_spam_collection
6871
appendix-E/01_main-chapter-code/train.csv
6972
appendix-E/01_main-chapter-code/test.csv
7073
appendix-E/01_main-chapter-code/validation.csv
7174

7275
ch02/01_main-chapter-code/number-data.txt
76+
ch02/05_bpe-from-scratch/the-verdict.txt
7377

7478
ch05/03_bonus_pretraining_on_gutenberg/gutenberg
7579
ch05/03_bonus_pretraining_on_gutenberg/gutenberg_preprocessed
@@ -107,7 +111,9 @@ ch02/05_bpe-from-scratch/bpe_merges.txt
107111
ch02/05_bpe-from-scratch/encoder.json
108112
ch02/05_bpe-from-scratch/vocab.bpe
109113
ch02/05_bpe-from-scratch/vocab.json
110-
114+
encoder.json
115+
vocab.bpe
116+
vocab.json
111117

112118
# Other
113119
ch0?/0?_user_interface/.chainlit/

ch02/02_bonus_bytepair-encoder/compare-bpe-tiktoken.ipynb

+22-14
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
"name": "stdout",
6868
"output_type": "stream",
6969
"text": [
70-
"tiktoken version: 0.7.0\n"
70+
"tiktoken version: 0.9.0\n"
7171
]
7272
}
7373
],
@@ -180,8 +180,8 @@
180180
"name": "stderr",
181181
"output_type": "stream",
182182
"text": [
183-
"Fetching encoder.json: 1.04Mit [00:00, 4.13Mit/s] \n",
184-
"Fetching vocab.bpe: 457kit [00:00, 2.56Mit/s] \n"
183+
"Fetching encoder.json: 1.04Mit [00:00, 3.69Mit/s] \n",
184+
"Fetching vocab.bpe: 457kit [00:00, 2.53Mit/s] \n"
185185
]
186186
}
187187
],
@@ -256,10 +256,18 @@
256256
"id": "e9077bf4-f91f-42ad-ab76-f3d89128510e",
257257
"metadata": {},
258258
"outputs": [
259+
{
260+
"name": "stderr",
261+
"output_type": "stream",
262+
"text": [
263+
"/Users/sebastian/Developer/LLMs-from-scratch/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
264+
" from .autonotebook import tqdm as notebook_tqdm\n"
265+
]
266+
},
259267
{
260268
"data": {
261269
"text/plain": [
262-
"'4.48.0'"
270+
"'4.49.0'"
263271
]
264272
},
265273
"execution_count": 12,
@@ -423,7 +431,7 @@
423431
"name": "stdout",
424432
"output_type": "stream",
425433
"text": [
426-
"[1544, 18798, 11, 995, 13, 1148, 256, 5303, 82, 438, 257, 1332, 30]\n"
434+
"[15496, 11, 995, 13, 1148, 428, 438, 257, 1332, 30]\n"
427435
]
428436
}
429437
],
@@ -451,7 +459,7 @@
451459
"metadata": {},
452460
"outputs": [],
453461
"source": [
454-
"with open('../01_main-chapter-code/the-verdict.txt', 'r', encoding='utf-8') as f:\n",
462+
"with open(\"../01_main-chapter-code/the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
455463
" raw_text = f.read()"
456464
]
457465
},
@@ -473,7 +481,7 @@
473481
"name": "stdout",
474482
"output_type": "stream",
475483
"text": [
476-
"3.39 ms ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
484+
"3.84 ms ± 9.83 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
477485
]
478486
}
479487
],
@@ -499,7 +507,7 @@
499507
"name": "stdout",
500508
"output_type": "stream",
501509
"text": [
502-
"1.08 ms ± 5.99 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
510+
"901 μs ± 6.27 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
503511
]
504512
}
505513
],
@@ -532,7 +540,7 @@
532540
"name": "stdout",
533541
"output_type": "stream",
534542
"text": [
535-
"10.2 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
543+
"11 ms ± 94.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
536544
]
537545
}
538546
],
@@ -550,7 +558,7 @@
550558
"name": "stdout",
551559
"output_type": "stream",
552560
"text": [
553-
"10 ms ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
561+
"10.8 ms ± 180 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
554562
]
555563
}
556564
],
@@ -575,7 +583,7 @@
575583
"name": "stdout",
576584
"output_type": "stream",
577585
"text": [
578-
"3.79 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
586+
"3.66 ms ± 3.67 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
579587
]
580588
}
581589
],
@@ -593,7 +601,7 @@
593601
"name": "stdout",
594602
"output_type": "stream",
595603
"text": [
596-
"3.83 ms ± 58.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
604+
"3.77 ms ± 49.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
597605
]
598606
}
599607
],
@@ -619,7 +627,7 @@
619627
"name": "stdout",
620628
"output_type": "stream",
621629
"text": [
622-
"1.59 ms ± 11.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
630+
"9.37 ms ± 50.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
623631
]
624632
}
625633
],
@@ -644,7 +652,7 @@
644652
"name": "python",
645653
"nbconvert_exporter": "python",
646654
"pygments_lexer": "ipython3",
647-
"version": "3.11.4"
655+
"version": "3.10.16"
648656
}
649657
},
650658
"nbformat": 4,

0 commit comments

Comments
 (0)