@@ -66,6 +66,7 @@ const char * llm_type_name(llm_type type) {
66
66
case MODEL_70B: return " 70B" ;
67
67
case MODEL_236B: return " 236B" ;
68
68
case MODEL_314B: return " 314B" ;
69
+ case MODEL_671B: return " 671B" ;
69
70
case MODEL_SMALL: return " 0.1B" ;
70
71
case MODEL_MEDIUM: return " 0.4B" ;
71
72
case MODEL_LARGE: return " 0.8B" ;
@@ -125,6 +126,14 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
125
126
}
126
127
}
127
128
129
+ static const char * llama_expert_gating_func_name (llama_expert_gating_func_type type) {
130
+ switch (type) {
131
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX: return " softmax" ;
132
+ case LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID: return " sigmoid" ;
133
+ default : return " unknown" ;
134
+ }
135
+ }
136
+
128
137
std::string llama_model_arch_name (const llama_model & model) {
129
138
return llm_arch_name (model.arch );
130
139
}
@@ -933,11 +942,19 @@ void llm_load_hparams(llama_model_loader & ml, llama_model & model) {
933
942
ml.get_key (LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp );
934
943
ml.get_key (LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared );
935
944
ml.get_key (LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale );
945
+ ml.get_key (LLM_KV_EXPERT_WEIGHTS_NORM, hparams.expert_weights_norm , false );
946
+ ml.get_key (LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func , false );
947
+ if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
948
+ // for compatibility with existing DeepSeek V2 and V2.5 GGUFs
949
+ // that have no expert_gating_func model parameter set
950
+ hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX;
951
+ }
936
952
ml.get_key (LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul );
937
953
938
954
switch (hparams.n_layer ) {
939
955
case 27 : model.type = e_model::MODEL_16B; break ;
940
956
case 60 : model.type = e_model::MODEL_236B; break ;
957
+ case 61 : model.type = e_model::MODEL_671B; break ;
941
958
default : model.type = e_model::MODEL_UNKNOWN;
942
959
}
943
960
} break ;
@@ -1259,6 +1276,10 @@ void llm_load_vocab(llama_model_loader & ml, llama_model & model) {
1259
1276
tokenizer_pre == " deepseek-coder" ) {
1260
1277
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER;
1261
1278
vocab.tokenizer_clean_spaces = false ;
1279
+ } else if (
1280
+ tokenizer_pre == " deepseek-v3" ) {
1281
+ vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM;
1282
+ vocab.tokenizer_clean_spaces = false ;
1262
1283
} else if (
1263
1284
tokenizer_pre == " falcon" ) {
1264
1285
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_FALCON;
@@ -1941,6 +1962,8 @@ void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
1941
1962
LLAMA_LOG_INFO (" %s: n_ff_exp = %d\n " , __func__, hparams.n_ff_exp );
1942
1963
LLAMA_LOG_INFO (" %s: n_expert_shared = %d\n " , __func__, hparams.n_expert_shared );
1943
1964
LLAMA_LOG_INFO (" %s: expert_weights_scale = %.1f\n " , __func__, hparams.expert_weights_scale );
1965
+ LLAMA_LOG_INFO (" %s: expert_weights_norm = %d\n " , __func__, hparams.expert_weights_norm );
1966
+ LLAMA_LOG_INFO (" %s: expert_gating_func = %s\n " , __func__, llama_expert_gating_func_name ((enum llama_expert_gating_func_type) hparams.expert_gating_func ));
1944
1967
LLAMA_LOG_INFO (" %s: rope_yarn_log_mul = %.4f\n " , __func__, hparams.rope_yarn_log_mul );
1945
1968
}
1946
1969
0 commit comments