Skip to content

Commit cec445f

Browse files
tao-qian and rasbt authored
Minor readability improvement in dataloader.ipynb (#461)
* Minor readability improvement in dataloader.ipynb
  - The tokenizer and encoded_text variables at the root level are unused.
  - The default params for create_dataloader_v1 are confusing, especially for the default batch_size 4, which happens to be the same as the max_length.
* readability improvements

---------

Co-authored-by: rasbt <[email protected]>
1 parent 1b635f7 commit cec445f

File tree

1 file changed

+10
-7
lines changed

1 file changed

+10
-7
lines changed

ch02/01_main-chapter-code/dataloader.ipynb

+10 -7
Original file line number | Diff line number | Diff line change
@@ -103,8 +103,8 @@
103103
" return self.input_ids[idx], self.target_ids[idx]\n",
104104
"\n",
105105
"\n",
106-
"def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
107-
" stride=128, shuffle=True, drop_last=True, num_workers=0):\n",
106+
"def create_dataloader_v1(txt, batch_size, max_length, stride,\n",
107+
" shuffle=True, drop_last=True, num_workers=0):\n",
108108
" # Initialize the tokenizer\n",
109109
" tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
110110
"\n",
@@ -121,9 +121,6 @@
121121
"with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
122122
" raw_text = f.read()\n",
123123
"\n",
124-
"tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
125-
"encoded_text = tokenizer.encode(raw_text)\n",
126-
"\n",
127124
"vocab_size = 50257\n",
128125
"output_dim = 256\n",
129126
"context_length = 1024\n",
@@ -132,8 +129,14 @@
132129
"token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
133130
"pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n",
134131
"\n",
132+
"batch_size = 8\n",
135133
"max_length = 4\n",
136-
"dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)"
134+
"dataloader = create_dataloader_v1(\n",
135+
" raw_text,\n",
136+
" batch_size=batch_size,\n",
137+
" max_length=max_length,\n",
138+
" stride=max_length\n",
139+
")"
137140
]
138141
},
139142
{
@@ -189,7 +192,7 @@
189192
"name": "python",
190193
"nbconvert_exporter": "python",
191194
"pygments_lexer": "ipython3",
192-
"version": "3.10.6"
195+
"version": "3.11.4"
193196
}
194197
},
195198
"nbformat": 4,

0 commit comments

Comments
 (0)