861 | 861 | "metadata": {},
862 | 862 | "source": [
863 | 863 | "- Next, let's initialize and train the BPE tokenizer with a vocabulary size of 1,000\n",
864 |     | - "- Note that the vocabulary size is already 255 by default due to the byte values discussed earlier, so we are only \"learning\" 745 vocabulary entries \n",
    | 864 | + "- Note that the vocabulary size is already 256 by default due to the byte values discussed earlier, so we are only \"learning\" 744 vocabulary entries (if we consider the `<|endoftext|>` special token and the `Ġ` whitespace token; so, that's 742 to be precise)\n",
865 | 865 | "- For comparison, the GPT-2 vocabulary is 50,257 tokens, the GPT-4 vocabulary is 100,256 tokens (`cl100k_base` in tiktoken), and GPT-4o uses 199,997 tokens (`o200k_base` in tiktoken); they all have much bigger training sets compared to our simple example text above"
866 | 866 | ]
867 | 867 | },
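
The vocabulary sizes quoted above can be looked up directly with `tiktoken`; here is a minimal sketch, assuming `tiktoken` is installed (note that `n_vocab` also counts registered special tokens, so the printed values can come out slightly larger than the mergeable-rank counts cited in the cell):

```python
import tiktoken  # pip install tiktoken

# Print the vocabulary sizes of the tokenizers mentioned above.
# n_vocab includes registered special tokens, so the numbers may be
# slightly larger than the mergeable-rank counts quoted in the text.
for name in ("gpt2", "cl100k_base", "o200k_base"):
    enc = tiktoken.get_encoding(name)
    print(f"{name}: {enc.n_vocab:,} tokens")
```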
908 | 908 | "id": "36c9da0f-8a18-41cd-91ea-9ccc2bb5febb",
|
909 | 909 | "metadata": {},
|
910 | 910 | "source": [
|
911 |
| - "- This vocabulary is created by merging 742 times (~ `1000 - len(range(0, 256))`)" |
| 911 | + "- This vocabulary is created by merging 742 times (`= 1000 - len(range(0, 256)) - len(special_tokens) - \"Ġ\" = 1000 - 256 - 1 - 1 = 742`)" |
912 | 912 | ]
|
913 | 913 | },
|
914 | 914 | {
|
|
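
The corrected arithmetic in the updated line can be spelled out as a quick sanity check; a minimal sketch, where `target_vocab_size` and `num_whitespace_tokens` are hypothetical names standing in for the notebook's settings:

```python
# Sanity check for the merge count: the target vocabulary of 1,000 entries
# already contains the 256 single-byte tokens, one special token, and the
# "Ġ" whitespace token, so only the remainder is learned via merges.
target_vocab_size = 1000
num_byte_tokens = len(range(0, 256))    # 256
special_tokens = ["<|endoftext|>"]      # 1
num_whitespace_tokens = 1               # the "Ġ" token

num_merges = (target_vocab_size - num_byte_tokens
              - len(special_tokens) - num_whitespace_tokens)
print(num_merges)  # 742
```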
975 | 975 | "name": "stdout",
|
976 | 976 | "output_type": "stream",
|
977 | 977 | "text": [
|
978 |
| - "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 256, 60, 124, 271, 683, 102, 116, 461, 116, 124, 62]\n" |
| 978 | + "[424, 256, 654, 531, 302, 311, 256, 296, 97, 465, 121, 595, 841, 116, 287, 466, 256, 326, 972, 46, 60, 124, 271, 683, 102, 116, 461, 116, 124, 62]\n" |
979 | 979 | ]
|
980 | 980 | }
|
981 | 981 | ],
|
982 | 982 | "source": [
|
983 |
| - "input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n", |
| 983 | + "input_text = \"Jack embraced beauty through art and life.<|endoftext|> \"\n", |
984 | 984 | "token_ids = tokenizer.encode(input_text)\n",
|
985 | 985 | "print(token_ids)"
|
986 | 986 | ]
|
|
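
The updated output above reflects the tighter input string: without `allowed_special`, the `<|endoftext|>` marker is tokenized like ordinary text, and characters that never participate in a learned merge keep their single-byte token IDs. A small illustrative check (assuming the byte-level base vocabulary described earlier, where IDs 0-255 correspond to the raw byte values):

```python
# "<", "|", and ">" are not part of any learned merge in this small vocabulary,
# so they fall back to their single-byte token IDs, which equal their byte
# values -- hence the 60, 124, and 62 near the end of the output above.
for ch in "<|>":
    print(repr(ch), ord(ch))
```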
1000 | 1000 | }
1001 | 1001 | ],
1002 | 1002 | "source": [
1003 |      | - "input_text = \"Jack embraced beauty through art and life. <|endoftext|> \"\n",
     | 1003 | + "input_text = \"Jack embraced beauty through art and life.<|endoftext|> \"\n",
1004 | 1004 | "token_ids = tokenizer.encode(input_text, allowed_special={\"<|endoftext|>\"})\n",
1005 | 1005 | "print(token_ids)"
1006 | 1006 | ]
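
The `allowed_special` argument mirrors the `allowed_special` parameter of `tiktoken`'s `encode`, where encoding a registered special token also has to be opted into explicitly; for comparison, a minimal sketch with the GPT-2 encoding from `tiktoken` (the resulting token IDs naturally differ from the custom tokenizer trained above):

```python
import tiktoken

enc = tiktoken.get_encoding("gpt2")
text = "Jack embraced beauty through art and life.<|endoftext|>"

# By default, tiktoken raises an error if the input contains a special token;
# passing allowed_special encodes "<|endoftext|>" as a single ID (50256 in GPT-2).
token_ids = enc.encode(text, allowed_special={"<|endoftext|>"})
print(token_ids)
```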
1015 | 1015 | "name": "stdout",
1016 | 1016 | "output_type": "stream",
1017 | 1017 | "text": [
1018 |      | - "Number of characters: 57\n",
     | 1018 | + "Number of characters: 56\n",
1019 | 1019 | "Number of token IDs: 21\n"
1020 | 1020 | ]
1021 | 1021 | }
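
The corrected character count also gives a rough sense of the compression this small tokenizer achieves on the example sentence; a quick back-of-the-envelope check using the counts printed above:

```python
# Roughly 2.7 characters per token ID for this short example
# (56 characters vs. 21 token IDs, per the corrected output above).
num_chars, num_token_ids = 56, 21
print(f"{num_chars / num_token_ids:.2f} characters per token")
```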