
Commit bb31de8

casinca and rasbt authored
[minor] typo & comments (#441)
* typo & comment
  - safe -> save
  - commenting code: batch_size, seq_len = in_idx.shape
* comment
  - adding # NEW for assert num_heads % num_kv_groups == 0
* update memory wording

Co-authored-by: rasbt <[email protected]>
1 parent e95c898 commit bb31de8

File tree

3 files changed: +14 −14 lines changed

ch05/07_gpt_to_llama/converting-gpt-to-llama2.ipynb

+5-5
@@ -381,7 +381,7 @@
 "id": "qcD8LSHNhBRW"
 },
 "source": [
-"- Note that we also added a `dtype=cfg[\"dtype\"]` setting above, which will allow us to load the model directly in lower precision formats later to save memory (versus instantiating it in the original 32-bit precision format and then converting it)\n",
+"- Note that we also added a `dtype=cfg[\"dtype\"]` setting above, which will allow us to load the model directly in lower precision formats later to reduce memory usage (versus instantiating it in the original 32-bit precision format and then converting it)\n",
 "- We also set `bias=False` since Llama doesn't use any bias units"
 ]
 },
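The memory point in the note above can be checked directly: instantiating a layer in bfloat16 halves its parameter footprint compared to float32. A minimal standalone sketch (the 4096 dimension is just an illustrative size, not tied to a specific config):

import torch
import torch.nn as nn

# Create the same projection layer in bfloat16 vs. the default float32
layer_bf16 = nn.Linear(4096, 4096, bias=False, dtype=torch.bfloat16)
layer_fp32 = nn.Linear(4096, 4096, bias=False)

bytes_bf16 = sum(p.numel() * p.element_size() for p in layer_bf16.parameters())
bytes_fp32 = sum(p.numel() * p.element_size() for p in layer_fp32.parameters())
print(f"bfloat16: {bytes_bf16 / 1024**2:.0f} MB")  # ~32 MB
print(f"float32:  {bytes_fp32 / 1024**2:.0f} MB")  # ~64 MB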
@@ -648,7 +648,7 @@
 "\n",
 "mha(example_batch)\n",
 "\n",
-"del mha # delete to safe memory"
+"del mha # delete to free up memory"
 ]
 },
 {
@@ -781,7 +781,7 @@
 " self.out_head = nn.Linear(cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False, dtype=cfg[\"dtype\"])\n",
 "\n",
 " def forward(self, in_idx):\n",
-" batch_size, seq_len = in_idx.shape\n",
+" # batch_size, seq_len = in_idx.shape\n",
 " tok_embeds = self.tok_emb(in_idx)\n",
 " # pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
 " x = tok_embeds # + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
@@ -890,7 +890,7 @@
 " \"n_heads\": 32, # Number of attention heads\n",
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 11008, # NEW: Size of the intermediate dimension in FeedForward\n",
-" \"dtype\": torch.bfloat16 # NEW: Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16 # NEW: Lower-precision dtype to reduce memory usage\n",
 "}"
 ]
 },
@@ -1691,7 +1691,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.10.6"
+"version": "3.11.4"
 },
 "widgets": {
 "application/vnd.jupyter.widget-state+json": {

ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb

+7-7
@@ -481,7 +481,7 @@
 " ):\n",
 " super().__init__()\n",
 " assert d_out % num_heads == 0, \"d_out must be divisible by num_heads\"\n",
-" assert num_heads % num_kv_groups == 0, \"num_heads must be divisible by num_kv_groups\"\n",
+" assert num_heads % num_kv_groups == 0, \"num_heads must be divisible by num_kv_groups\" # NEW\n",
 "\n",
 " self.d_out = d_out\n",
 " self.num_heads = num_heads\n",
@@ -886,7 +886,7 @@
 " \"n_heads\": 32, # Number of attention heads\n",
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 11_008, # Size of the intermediate dimension in FeedForward\n",
-" \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16 # Lower-precision dtype to reduce memory usage\n",
 "}"
 ]
 },
@@ -909,7 +909,7 @@
 " \"n_kv_groups\": 8, # NEW: Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
 " \"rope_freq\": None, # NEW: Additional configuration for adjusting the RoPE frequencies\n",
-" \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16 # Lower-precision dtype to reduce memory usage\n",
 "}"
 ]
 },
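For context on the rope_base entry: the base is the "theta" in the standard RoPE inverse-frequency formula, and raising it from 10_000 to 500_000 slows the rotations of the low-frequency dimensions, which helps with longer contexts. A rough sketch of what the value feeds into (the head_dim of 128 is just an example, not a value from this diff):

import torch

head_dim = 128
theta_base = 500_000.0  # Llama 3 value; Llama 2 used 10_000.0

# Standard RoPE inverse frequencies, one per pair of embedding dimensions
inv_freq = 1.0 / (theta_base ** (torch.arange(0, head_dim, 2).float() / head_dim))
print(inv_freq.shape)  # torch.Size([64])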
@@ -2062,7 +2062,7 @@
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"rope_freq\": None, # Additional configuration for adjusting the RoPE frequencies\n",
-" \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16 # Lower-precision dtype to reduce memory usage\n",
 "}\n",
 "\n",
 "LLAMA31_CONFIG_8B = {\n",
@@ -2074,7 +2074,7 @@
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
-" \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
 " \"low_freq_factor\": 1.0,\n",
@@ -2448,7 +2448,7 @@
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
-" \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
 " \"low_freq_factor\": 1.0,\n",
@@ -2467,7 +2467,7 @@
 " \"hidden_dim\": 8192, # NEW: Almost half the size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
-" \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0, # NEW: Adjustment of the rescaling factor\n",
 " \"low_freq_factor\": 1.0,\n",

ch05/07_gpt_to_llama/standalone-llama32.ipynb

+2-2
@@ -438,7 +438,7 @@
 " \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 " \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
-" \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+" \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0,\n",
 " \"low_freq_factor\": 1.0,\n",
@@ -458,7 +458,7 @@
 "# \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 "# \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
 "# \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
-"# \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
+"# \"dtype\": torch.bfloat16, # Lower-precision dtype to reduce memory usage\n",
 "# \"rope_freq\": { # RoPE frequency scaling\n",
 "# \"factor\": 32.0,\n",
 "# \"low_freq_factor\": 1.0,\n",
