|
907 | 907 | " \"n_layers\": 32, # Number of layers\n",
|
908 | 908 | " \"hidden_dim\": 14_336, # NEW: Larger size of the intermediate dimension in FeedForward\n",
|
909 | 909 | " \"n_kv_groups\": 8, # NEW: Key-Value groups for grouped-query attention\n",
|
910 | | - "    \"rope_base\": 500_000,         # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
| 910 | + "    \"rope_base\": 500_000.0,       # NEW: The base in RoPE's \"theta\" was increased to 500_000\n",
911 | 911 | " \"rope_freq\": None, # NEW: Additional configuration for adjusting the RoPE frequencies\n",
|
912 | 912 | " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
|
913 | 913 | "}"
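The change in this hunk is purely a literal-type fix: `rope_base` becomes the float `500_000.0` instead of the int `500_000`. For context, here is a minimal sketch of where that base enters the standard RoPE computation (the helper name is hypothetical, not the notebook's exact code); passing a float keeps the exponentiation in floating point from the start:

```python
import torch

# Minimal sketch of how "rope_base" is used (hypothetical helper;
# the notebook's own RoPE function may differ in details).
def rope_inv_freq(head_dim, theta_base=500_000.0):
    # Standard RoPE inverse frequencies: theta_base ** (-2i / head_dim)
    exponents = torch.arange(0, head_dim, 2).float() / head_dim
    return 1.0 / (theta_base ** exponents)

inv_freq = rope_inv_freq(head_dim=128)
print(inv_freq.shape)  # torch.Size([64])
```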
|
|
2060 | 2060 | " \"n_layers\": 32, # Number of layers\n",
|
2061 | 2061 | " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
|
2062 | 2062 | " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
|
2063 | | - "    \"rope_base\": 500_000,         # The base in RoPE's \"theta\"\n",
| 2063 | + "    \"rope_base\": 500_000.0,       # The base in RoPE's \"theta\"\n",
2064 | 2064 | " \"rope_freq\": None, # Additional configuration for adjusting the RoPE frequencies\n",
|
2065 | 2065 | " \"dtype\": torch.bfloat16 # Lower-precision dtype to save memory\n",
|
2066 | 2066 | "}\n",
|
|
2073 | 2073 | " \"n_layers\": 32, # Number of layers\n",
|
2074 | 2074 | " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
|
2075 | 2075 | " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
|
2076 | | - "    \"rope_base\": 500_000,         # The base in RoPE's \"theta\"\n",
| 2076 | + "    \"rope_base\": 500_000.0,       # The base in RoPE's \"theta\"\n",
2077 | 2077 | " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
|
2078 | 2078 | " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
|
2079 | 2079 | " \"factor\": 8.0,\n",
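The `rope_freq` entry (this hunk is truncated after `"factor": 8.0`) enables Llama 3.1's RoPE frequency scaling: low-frequency (long-wavelength) components are divided by `factor`, high-frequency ones are left untouched, and the band in between is interpolated smoothly. Below is a sketch of that scheme; every parameter name other than `factor` is an assumption based on the published Llama 3.1 recipe, since those keys are not visible in this excerpt:

```python
import math
import torch

# Sketch of Llama-3.1-style RoPE frequency scaling. Parameter names other
# than `factor` are assumed from the published scheme, not from this hunk.
def scale_inv_freq(inv_freq, factor=8.0, low_freq_factor=1.0,
                   high_freq_factor=4.0, original_context_length=8192):
    wavelen = 2 * math.pi / inv_freq
    low_freq_wavelen = original_context_length / low_freq_factor
    high_freq_wavelen = original_context_length / high_freq_factor

    # Low-frequency (long-wavelength) components are slowed down by `factor`.
    scaled = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq)

    # Mid-band components are blended smoothly between scaled and unscaled.
    smooth = (original_context_length / wavelen - low_freq_factor) / (
        high_freq_factor - low_freq_factor
    )
    smoothed = (1 - smooth) * inv_freq / factor + smooth * inv_freq
    is_medium = (wavelen <= low_freq_wavelen) & (wavelen >= high_freq_wavelen)
    return torch.where(is_medium, smoothed, scaled)
```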
|
|
2447 | 2447 | " \"n_layers\": 32, # Number of layers\n",
|
2448 | 2448 | " \"hidden_dim\": 14_336, # Size of the intermediate dimension in FeedForward\n",
|
2449 | 2449 | " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
|
2450 | | - "    \"rope_base\": 500_000,         # The base in RoPE's \"theta\"\n",
| 2450 | + "    \"rope_base\": 500_000.0,       # The base in RoPE's \"theta\"\n",
2451 | 2451 | " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
|
2452 | 2452 | " \"rope_freq\": { # NEW: RoPE frequency scaling\n",
|
2453 | 2453 | " \"factor\": 8.0,\n",
|
|
2466 | 2466 | " \"n_layers\": 16, # NEW: Half the number of layers\n",
|
2467 | 2467 | " \"hidden_dim\": 8192, # NEW: Almost half the size of the intermediate dimension in FeedForward\n",
|
2468 | 2468 | " \"n_kv_groups\": 8, # Key-Value groups for grouped-query attention\n",
|
2469 | | - "    \"rope_base\": 500_000,         # The base in RoPE's \"theta\"\n",
| 2469 | + "    \"rope_base\": 500_000.0,       # The base in RoPE's \"theta\"\n",
2470 | 2470 | " \"dtype\": torch.bfloat16, # Lower-precision dtype to save memory\n",
|
2471 | 2471 | " \"rope_freq\": { # RoPE frequency scaling\n",
|
2472 | 2472 | " \"factor\": 32.0, # NEW: Adjustment of the rescaling factor\n",
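The Llama 3.2 hunk halves `n_layers` (32 to 16), shrinks `hidden_dim` (14_336 to 8192), and raises the RoPE rescaling `factor` to 32.0. As a rough back-of-the-envelope sketch of how the two structural changes reduce the weight count, the snippet below estimates transformer-block parameters from the config values shown; `emb_dim` and `n_heads` sit outside this excerpt, so the values used here are illustrative assumptions:

```python
# Rough per-model weight estimate (embeddings, norms, and lm_head omitted).
# emb_dim / n_heads are illustrative assumptions; they are not in this excerpt.
def approx_block_params(n_layers, emb_dim, hidden_dim, n_heads, n_kv_groups):
    head_dim = emb_dim // n_heads
    # Grouped-query attention: full-size Q and output projections, while K/V
    # are shared across query groups and only n_kv_groups * head_dim wide.
    attn = 2 * emb_dim * (n_heads * head_dim)        # W_query + out_proj
    attn += 2 * emb_dim * (n_kv_groups * head_dim)   # W_key + W_value
    ff = 3 * emb_dim * hidden_dim                    # SwiGLU: gate, up, down
    return n_layers * (attn + ff)

big = approx_block_params(32, 4096, 14_336, 32, 8)   # Llama-3.1-8B-like
small = approx_block_params(16, 2048, 8192, 32, 8)   # Llama-3.2-1B-like
print(f"{big / 1e9:.2f}B vs {small / 1e9:.2f}B block parameters")
```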
|
|