Commit 4f16cea

Minor readability improvement in dataloader.ipynb

- The tokenizer and encoded_text variables at the root level are unused.
- The default parameters for create_dataloader_v1 are confusing, especially the default batch_size of 4, which happens to be the same as the max_length.

1 parent bb31de8 · commit 4f16cea
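For context, the dataloader built in this cell yields (batch_size, max_length) tensors of token IDs, so with the old defaults a call that only set max_length=4 would silently fall back to batch_size=4 and print 4x4 batches in which the two dimensions are indistinguishable. A minimal sketch of the difference, assuming the create_dataloader_v1 function and raw_text variable defined in the notebook:

# Old signature: batch_size silently falls back to its default of 4
dataloader = create_dataloader_v1(raw_text, max_length=4, stride=4)
inputs, targets = next(iter(dataloader))
print(inputs.shape)   # torch.Size([4, 4]) -- which 4 is the batch dimension?

# New signature: every value must be passed explicitly
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4)
inputs, targets = next(iter(dataloader))
print(inputs.shape)   # torch.Size([8, 4]) -- batch_size first, max_length second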

1 file changed: +3 -6 lines changed

ch02/01_main-chapter-code/dataloader.ipynb

@@ -103,8 +103,7 @@
 "        return self.input_ids[idx], self.target_ids[idx]\n",
 "\n",
 "\n",
-"def create_dataloader_v1(txt, batch_size=4, max_length=256, \n",
-"                         stride=128, shuffle=True, drop_last=True, num_workers=0):\n",
+"def create_dataloader_v1(txt, batch_size, max_length, stride, shuffle=True, drop_last=True, num_workers=0):\n",
 "    # Initialize the tokenizer\n",
 "    tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
 "\n",
@@ -121,9 +120,6 @@
 "with open(\"the-verdict.txt\", \"r\", encoding=\"utf-8\") as f:\n",
 "    raw_text = f.read()\n",
 "\n",
-"tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
-"encoded_text = tokenizer.encode(raw_text)\n",
-"\n",
 "vocab_size = 50257\n",
 "output_dim = 256\n",
 "context_length = 1024\n",
@@ -132,8 +128,9 @@
 "token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)\n",
 "pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)\n",
 "\n",
+"batch_size = 8\n",
 "max_length = 4\n",
-"dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=max_length, stride=max_length)"
+"dataloader = create_dataloader_v1(raw_text, batch_size=batch_size, max_length=max_length, stride=max_length)"
 ]
 },
 {
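Read together, the hunks leave the dataloader cell roughly as follows. The GPTDatasetV1 body is not part of this diff, so the sketch below reconstructs it from the chapter's sliding-window approach; treat it as an approximation rather than a verbatim copy of the notebook:

import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    # Sliding-window dataset: each sample is max_length token IDs,
    # and the target is the same window shifted right by one token.
    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)
        self.input_ids, self.target_ids = [], []
        for i in range(0, len(token_ids) - max_length, stride):
            self.input_ids.append(torch.tensor(token_ids[i:i + max_length]))
            self.target_ids.append(torch.tensor(token_ids[i + 1:i + max_length + 1]))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride,
                         shuffle=True, drop_last=True, num_workers=0):
    # Initialize the tokenizer and wrap the dataset in a PyTorch DataLoader
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle,
                      drop_last=drop_last, num_workers=num_workers)


# Usage after this commit: batch_size is spelled out instead of relying on a default
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

batch_size = 8
max_length = 4
dataloader = create_dataloader_v1(raw_text, batch_size=batch_size,
                                  max_length=max_length, stride=max_length)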

0 commit comments