
Commit e8c2f96

d-kleine and rasbt authored
minor fixes: Llama 3.2 standalone (#420)
* minor fixes
* reformat rope base as float

---------

Co-authored-by: rasbt <[email protected]>
1 parent 1516de5 commit e8c2f96

File tree

2 files changed: +10 -10 lines changed

  ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb
  ch05/07_gpt_to_llama/standalone-llama32.ipynb

ch05/07_gpt_to_llama/converting-llama2-to-llama3.ipynb

+5 -5
@@ -907,7 +907,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # NEW: Larger size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # NEW: Key-Value groups for grouped-query attention\n",
-" \"rope_base\": 500_000, # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
+" \"rope_base\": 500_000.0, # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
 " \"rope_freq\": None, # NEW: Additional configuration for adjusting the RoPE frequencies\n",
 " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
 "}"
@@ -2060,7 +2060,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
-" \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+" \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"rope_freq\": None, # Additional configuration for adjusting the RoPE frequencies\n",
 " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
 "}\n",
@@ -2073,7 +2073,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
-" \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+" \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
@@ -2447,7 +2447,7 @@
 " \"n_layers\": 32, # Number of layers\n",
 " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
-" \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+" \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
 " \"factor\": 8.0,\n",
@@ -2466,7 +2466,7 @@
 " \"n_layers\": 16, # NEW: Half the number of layers\n",
 " \"hidden_dim\": 8192, # NEW: Almost half the size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
-" \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+" \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0, # NEW: Adjustment of the rescaling factor\n",

ch05/07_gpt_to_llama/standalone-llama32.ipynb

+5 -5
@@ -437,7 +437,7 @@
 " \"n_layers\": 16, # Number of layers\n",
 " \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
-" \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+" \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 " \"rope_freq\": { # RoPE frequency scaling\n",
 " \"factor\": 32.0,\n",
@@ -451,13 +451,13 @@
 "\n",
 "# LLAMA32_CONFIG = {\n",
 "# \"vocab_size\": 128_256, # Vocabulary size\n",
-"# \"context_length\": 131_000, # Context length\n",
+"# \"context_length\": 131_072, # Context length\n",
 "# \"emb_dim\": 3072, # Embedding dimension\n",
 "# \"n_heads\": 24, # Number of attention heads\n",
 "# \"n_layers\": 28, # Number of layers\n",
 "# \"hidden_dim\": 8192, # Size of the intermediate dimension in FeedForward\n",
 "# \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
-"# \"rope_base\": 500_000, # The base in RoPE's \"theta\"\n",
+"# \"rope_base\": 500_000.0, # The base in RoPE's \"theta\"\n",
 "# \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
 "# \"rope_freq\": { # RoPE frequency scaling\n",
 "# \"factor\": 32.0,\n",
@@ -697,7 +697,6 @@
 " def __init__(self, model_path):\n",
 " assert os.path.isfile(model_path), f\"Model file {model_path} not found\"\n",
 " mergeable_ranks = load_tiktoken_bpe(model_path)\n",
-" num_base_tokens = len(mergeable_ranks)\n",
 "\n",
 " self.special_tokens = {\n",
 " \"<|begin_of_text|>\": 128000,\n",
@@ -1013,7 +1012,8 @@
 "\n",
 "\n",
 "load_weights_into_llama(model, LLAMA32_CONFIG, combined_weights)\n",
-"model.to(device);"
+"model.to(device)\n",
+"del combined_weights # free up memory"
 ]
 },
 {
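
The last hunk drops the output-suppressing semicolon on model.to(device) (no longer needed once it is not the cell's final statement) and adds del combined_weights, so the merged checkpoint dictionary can be garbage-collected after its tensors have been loaded into the model. A self-contained sketch of that pattern follows; the dummy model and weight dict are assumptions for illustration, only the delete-after-loading idea mirrors the notebook:

import gc
import torch
import torch.nn as nn

model = nn.Linear(4, 4)              # stand-in for the Llama 3.2 model
combined_weights = {                 # stand-in for the merged checkpoint tensors
    "weight": torch.randn(4, 4),
    "bias": torch.randn(4),
}

# Copy the checkpoint tensors into the model's parameters
with torch.no_grad():
    model.weight.copy_(combined_weights["weight"])
    model.bias.copy_(combined_weights["bias"])

# After the copy, the dict only holds duplicate data; dropping the reference
# lets Python reclaim the memory (which can be several GB for a real checkpoint).
del combined_weights
gc.collect()  # optional: reclaim immediately instead of waiting for the GC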
