From 8c024429cf0e3d33c071820d024bbadec73bf705 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 7 Apr 2025 15:28:28 +0100 Subject: [PATCH 01/13] Merged using squash to remove all noise commit messages --- convert_hf_to_gguf.py | 35 ++++- gguf-py/gguf/constants.py | 8 ++ gguf-py/gguf/gguf_writer.py | 6 + gguf-py/gguf/tensor_mapping.py | 8 ++ src/llama-arch.cpp | 23 +--- src/llama-arch.h | 4 + src/llama-context.cpp | 10 +- src/llama-graph.cpp | 162 +++++++++++++++++++++-- src/llama-graph.h | 23 +++- src/llama-hparams.h | 4 + src/llama-kv-cache.cpp | 2 +- src/llama-model.cpp | 235 ++++++++++++++++++++++----------- src/llama-model.h | 2 + 13 files changed, 408 insertions(+), 114 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cfe94deaf76ef..c21467f975d74 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4335,6 +4335,10 @@ def set_vocab(self): self._set_vocab_gpt2() def set_gguf_parameters(self): + + # note: deepseek2 using MLA converts into MQA (ie: GQA with 1 group) + self.hparams["num_key_value_heads"] = 1 + super().set_gguf_parameters() hparams = self.hparams @@ -4343,8 +4347,13 @@ def set_gguf_parameters(self): if "q_lora_rank" in hparams and hparams["q_lora_rank"] is not None: self.gguf_writer.add_q_lora_rank(hparams["q_lora_rank"]) self.gguf_writer.add_kv_lora_rank(hparams["kv_lora_rank"]) - self.gguf_writer.add_key_length(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) - self.gguf_writer.add_value_length(hparams["v_head_dim"]) + + # note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA + self.gguf_writer.add_key_length(hparams["kv_lora_rank"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length(hparams["kv_lora_rank"]) + self.gguf_writer.add_key_length_mla(hparams["qk_nope_head_dim"] + hparams["qk_rope_head_dim"]) + self.gguf_writer.add_value_length_mla(hparams["v_head_dim"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) self.gguf_writer.add_expert_count(hparams["n_routed_experts"]) self.gguf_writer.add_expert_shared_count(hparams["n_shared_experts"]) @@ -4413,6 +4422,28 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: return [] + # note: MLA with the absorption optimization, needs these two split and k_b_proj transposed + if name.endswith("kv_b_proj.weight"): + name_kb = name.replace("kv_b_proj", "k_b_proj") + name_vb = name.replace("kv_b_proj", "v_b_proj") + + n_head_kv = self.hparams["num_key_value_heads"] + v_head_dim = self.hparams["v_head_dim"] + qk_nope_head_dim = self.hparams["qk_nope_head_dim"] + + assert data_torch.shape[0] == n_head_kv * (v_head_dim + qk_nope_head_dim) + + kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) + k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) + k_b = k_b.transpose(1, 2) + k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) + v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) + + return [ + (self.map_tensor_name(name_kb), k_b), + (self.map_tensor_name(name_vb), v_b) + ] + return [(self.map_tensor_name(name), data_torch)] def prepare_tensors(self): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3a52cfd1e39ac..8de77e2baca92 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -138,6 +138,8 @@ class Attention: REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count" SLIDING_WINDOW = "{arch}.attention.sliding_window" SCALE = 
"{arch}.attention.scale" + KEY_LENGTH_MLA = "{arch}.attention.key_length_mla" + VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla" class Rope: DIMENSION_COUNT = "{arch}.rope.dimension_count" @@ -377,6 +379,8 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_B = auto() ATTN_KV_A_MQA = auto() ATTN_KV_B = auto() + ATTN_K_B = auto() + ATTN_V_B = auto() ATTN_Q_A_NORM = auto() ATTN_KV_A_NORM = auto() FFN_SUB_NORM = auto() @@ -581,6 +585,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b", MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa", MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b", + MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b", + MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b", MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm", MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm", MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm", @@ -1451,6 +1457,8 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q_B, MODEL_TENSOR.ATTN_KV_A_MQA, MODEL_TENSOR.ATTN_KV_B, + MODEL_TENSOR.ATTN_K_B, + MODEL_TENSOR.ATTN_V_B, MODEL_TENSOR.ATTN_Q_A_NORM, MODEL_TENSOR.ATTN_KV_A_NORM, MODEL_TENSOR.ATTN_OUT, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index af8b388dfaba5..f9f76ae157303 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -689,6 +689,12 @@ def add_key_length(self, length: int) -> None: def add_value_length(self, length: int) -> None: self.add_uint32(Keys.Attention.VALUE_LENGTH.format(arch=self.arch), length) + def add_key_length_mla(self, length: int) -> None: + self.add_uint32(Keys.Attention.KEY_LENGTH_MLA.format(arch=self.arch), length) + + def add_value_length_mla(self, length: int) -> None: + self.add_uint32(Keys.Attention.VALUE_LENGTH_MLA.format(arch=self.arch), length) + def add_max_alibi_bias(self, bias: float) -> None: self.add_float32(Keys.Attention.MAX_ALIBI_BIAS.format(arch=self.arch), bias) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 50bef12e3dbe7..4a0dee2c6cc1b 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -656,6 +656,14 @@ class TensorNameMap: "model.layers.{bid}.self_attn.kv_b_proj", # deepseek2 ), + MODEL_TENSOR.ATTN_K_B: ( + "model.layers.{bid}.self_attn.k_b_proj", # deepseek2 + ), + + MODEL_TENSOR.ATTN_V_B: ( + "model.layers.{bid}.self_attn.v_b_proj", # deepseek2 + ), + MODEL_TENSOR.ATTN_Q_A_NORM: ( "model.layers.{bid}.self_attn.q_a_layernorm", # deepseek2 ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 047782e7d0fc8..a76fe88e0ca79 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -135,6 +135,8 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, "%s.attention.relative_buckets_count" }, { LLM_KV_ATTENTION_SLIDING_WINDOW, "%s.attention.sliding_window" }, { LLM_KV_ATTENTION_SCALE, "%s.attention.scale" }, + { LLM_KV_ATTENTION_KEY_LENGTH_MLA, "%s.attention.key_length_mla" }, + { LLM_KV_ATTENTION_VALUE_LENGTH_MLA, "%s.attention.value_length_mla" }, { LLM_KV_ROPE_DIMENSION_COUNT, "%s.rope.dimension_count" }, { LLM_KV_ROPE_DIMENSION_SECTIONS, "%s.rope.dimension_sections" }, @@ -1030,6 +1032,8 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_ATTN_Q_B, "blk.%d.attn_q_b" }, { LLM_TENSOR_ATTN_KV_A_MQA, "blk.%d.attn_kv_a_mqa" }, { LLM_TENSOR_ATTN_KV_B, "blk.%d.attn_kv_b" }, + { LLM_TENSOR_ATTN_K_B, "blk.%d.attn_k_b" }, + { LLM_TENSOR_ATTN_V_B, "blk.%d.attn_v_b" }, { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_FFN_GATE, 
"blk.%d.ffn_gate" }, @@ -1471,23 +1475,8 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_QKV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_OUT, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_GATE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_UP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_DOWN_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_GATE_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_FFN_UP_SHEXP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_Q_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_KV_A_MQA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ATTN_KV_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_K_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ATTN_V_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_Q, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_DEC_ATTN_V, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 297cfa4dae571..588bd2b8ace5a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -139,6 +139,8 @@ enum llm_kv { LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, LLM_KV_ATTENTION_SLIDING_WINDOW, LLM_KV_ATTENTION_SCALE, + LLM_KV_ATTENTION_KEY_LENGTH_MLA, + LLM_KV_ATTENTION_VALUE_LENGTH_MLA, LLM_KV_ROPE_DIMENSION_COUNT, LLM_KV_ROPE_DIMENSION_SECTIONS, @@ -299,6 +301,8 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_B, LLM_TENSOR_ATTN_KV_A_MQA, LLM_TENSOR_ATTN_KV_B, + LLM_TENSOR_ATTN_K_B, + LLM_TENSOR_ATTN_V_B, LLM_TENSOR_ATTN_Q_A_NORM, LLM_TENSOR_ATTN_KV_A_NORM, LLM_TENSOR_ATTN_SUB_NORM, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4735e98ea040f..c2d0759cf16ea 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -10,6 +10,7 @@ #include #include #include +#include // // llama_context @@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift( const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_attn_factor = cparams.yarn_attn_factor; const auto & yarn_beta_fast = cparams.yarn_beta_fast; const auto & yarn_beta_slow = cparams.yarn_beta_slow; @@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift( const auto & n_rot = hparams.n_rot; const auto & rope_type = hparams.rope_type; + // See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly. + // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. + const float yarn_attn_factor_scaled = model.arch == LLM_ARCH_DEEPSEEK2 ? 
1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor; + ggml_tensor * tmp; if (ggml_is_quantized(cur->type)) { @@ -500,14 +504,14 @@ ggml_tensor * llama_context::build_rope_shift( tmp = ggml_rope_ext_inplace(ctx0, tmp, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow); tmp = ggml_cpy(ctx0, tmp, cur); } else { // we rotate only the first n_rot dimensions tmp = ggml_rope_ext_inplace(ctx0, cur, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow); } return tmp; diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index cec203df49268..6d3e4effbc969 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1130,6 +1130,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_tensor * v, ggml_tensor * kq_b, ggml_tensor * kq_mask, + ggml_tensor * v_mla, bool v_trans, float kq_scale) const { //const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); @@ -1141,11 +1142,18 @@ ggml_tensor * llm_graph_context::build_attn_mha( //const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; - const auto n_embd_head_v = v_trans ? v->ne[1] : v->ne[0]; + const auto n_embd = q->ne[0]; + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; - const auto n_tokens = q->ne[1]; - const auto n_head = q->ne[2]; - const auto n_kv = k->ne[1]; + const auto n_kv = k->ne[1]; + const auto n_head_kv = k->ne[2]; + + // note: for MLA with the absorption optimization, the final embedding size will be changed via v_mla + const auto n_embd_head_v = v_mla == nullptr ? v_trans ? 
v->ne[1] : v->ne[0] : v_mla->ne[1]; + + GGML_ASSERT(k->ne[0] == q->ne[0] && "K and Q embedding size mismatch"); + GGML_ASSERT(k->ne[2] == v->ne[2] && "K and V number of heads mismatch"); ggml_tensor * cur; @@ -1164,12 +1172,29 @@ ggml_tensor * llm_graph_context::build_attn_mha( cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); } else { + + // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply + if (n_head_kv == 1) { + q = ggml_view_2d(ctx0, q, + n_embd, n_tokens*n_head, + ggml_row_size(q->type, n_embd), + 0); + } + ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); // note: this op tends to require high floating point range // while for some models F16 is enough, for others it is not, so we default to F32 here ggml_mul_mat_set_prec(kq, GGML_PREC_F32); + if (n_head_kv == 1) { + kq = ggml_view_3d(ctx0, kq, + n_kv, n_tokens, n_head, + ggml_row_size(kq->type, n_kv), + ggml_row_size(kq->type, n_kv)*n_tokens, + 0); + } + if (arch == LLM_ARCH_GROK) { // need to do the following: // multiply by attn_output_multiplyer of 0.08838834764831845 @@ -1200,6 +1225,11 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq); + // for MLA with the absorption optimization, we need to "decompress" from MQA back to MHA + if (v_mla) { + kqv = ggml_mul_mat(ctx0, v_mla, kqv); + } + ggml_tensor * kqv_merged = ggml_permute(ctx0, kqv, 0, 2, 1, 3); cur = ggml_cont_2d(ctx0, kqv_merged, n_embd_head_v*n_head, n_tokens); @@ -1258,7 +1288,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); //cb(k, "v", il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale); + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, nullptr, false, kq_scale); cb(cur, "kqv_out", il); @@ -1397,7 +1427,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, 0); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_trans, kq_scale); + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, nullptr, v_trans, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1456,7 +1486,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); //cb(k, "v", il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, false, kq_scale); + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, nullptr, false, kq_scale); cb(cur, "kqv_out", il); @@ -1475,6 +1505,123 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } +// **************************************************************************************************************** +// *** THIS WILL BE REMOVED AFTER CODE REVIEW IS ACCPETED AND READY TO MERGE - IT'S JUST A COPY OF build_attn() *** +// **************************************************************************************************************** +ggml_tensor * llm_graph_context::build_attn_mla( + llm_graph_input_attn_kv_unified * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, + ggml_tensor * k_cur, + ggml_tensor * v_cur, + ggml_tensor * kq_b, + ggml_tensor * v_mla, + float kq_scale, + int il) const { + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); + + const llama_kv_cache_unified * 
kv_self = static_cast<const llama_kv_cache_unified *>(memory); + const auto & n_ctx = cparams.n_ctx; + + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + const auto n_tokens = q_cur->ne[2]; + + const bool v_trans = !cparams.flash_attn; + + // store to KV cache + { + GGML_ASSERT(!kv_self->recurrent); + + const auto kv_head = kv_self->head; + + GGML_ASSERT(kv_self->size == n_ctx); + + ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head); + //cb(k_cache_view, "k_cache_view", il); + + // note: storing RoPE-ed version of K in the KV cache + ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); + + v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens); + + ggml_tensor * v_cache_view = nullptr; + + if (!v_trans) { + v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head); + } else { + // note: the V cache is transposed when not using flash attention + v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa, + ( n_ctx)*ggml_element_size(kv_self->v_l[il]), + (kv_head)*ggml_element_size(kv_self->v_l[il])); + + v_cur = ggml_transpose(ctx0, v_cur); + } + //cb(v_cache_view, "v_cache_view", il); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); + } + + const bool is_swa = hparams.is_swa(il); + + const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); + + const auto n_kv = kv_self->n; + + const int64_t n_head_kv = hparams.n_head_kv(il); + + const auto & n_embd_head_k = hparams.n_embd_head_k; + const auto & n_embd_head_v = hparams.n_embd_head_v; + + ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); + //cb(q, "q", il); + + ggml_tensor * k = + ggml_view_3d(ctx0, kv_self->k_l[il], + n_embd_head_k, n_kv, n_head_kv, + ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), + 0); + //cb(k, "k", il); + + ggml_tensor * v = !v_trans ?
+ ggml_view_3d(ctx0, kv_self->v_l[il], + n_embd_head_v, n_kv, n_head_kv, + ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), + ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v), + 0) : + ggml_view_3d(ctx0, kv_self->v_l[il], + n_kv, n_embd_head_v, n_head_kv, + ggml_element_size(kv_self->v_l[il])*n_ctx, + ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, + 0); + + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale); + cb(cur, "kqv_out", il); + + if (wo) { + cur = build_lora_mm(wo, cur); + } + + if (wo_b) { + //cb(cur, "kqv_wo", il); + } + + if (wo_b) { + cur = ggml_add(ctx0, cur, wo_b); + } + + return cur; + +} + ggml_tensor * llm_graph_context::build_copy_mask_state( ggml_cgraph * gf, ggml_tensor * s, @@ -1625,4 +1772,3 @@ void llm_graph_context::build_pooling( ggml_build_forward_expand(gf, cur); } - diff --git a/src/llama-graph.h b/src/llama-graph.h index bdf19ed015e35..af1bc907a36e8 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -487,11 +487,12 @@ struct llm_graph_context { ggml_tensor * build_attn_mha( ggml_cgraph * gf, - ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q] - ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k] - ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false) + ggml_tensor * q, // [n_embd_head_q, n_tokens, n_head_q] + ggml_tensor * k, // [n_embd_head_k, n_tokens, n_head_k] + ggml_tensor * v, // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false) ggml_tensor * kq_b, ggml_tensor * kq_mask, + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] bool v_trans, float kq_scale) const; @@ -537,6 +538,22 @@ struct llm_graph_context { float kq_scale, int il) const; + // **************************************************************************************************************** + // *** THIS WILL BE REMOVED AFTER CODE REVIEW IS ACCPETED AND READY TO MERGE - IT'S JUST A COPY OF build_attn() *** + // **************************************************************************************************************** + ggml_tensor * build_attn_mla( + llm_graph_input_attn_kv_unified * inp, + ggml_cgraph * gf, + ggml_tensor * wo, + ggml_tensor * wo_b, + ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] + ggml_tensor * k_cur, // [n_embd_head_k, 1, n_tokens] + ggml_tensor * v_cur, // [n_embd_head_v, 1, n_tokens] + ggml_tensor * kq_b, + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] + float kq_scale, + int il) const; + // // recurrent // diff --git a/src/llama-hparams.h b/src/llama-hparams.h index bb17ba86dc2fb..2f6c5569479d5 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -43,6 +43,10 @@ struct llama_hparams { uint32_t n_expert_used = 0; uint32_t n_rel_attn_bkts = 0; + // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA + uint32_t n_embd_head_k_mla = 0; + uint32_t n_embd_head_v_mla = 0; + // for WavTokenizer struct llama_hparams_posnet posnet; struct llama_hparams_convnext convnext; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index dbf5f1187d9e5..7c9d46d8119b3 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -27,7 +27,7 @@ bool llama_kv_cache_unified::init( recurrent = llama_model_is_recurrent(&model); v_trans = !recurrent && !cparams.flash_attn; - can_shift = !recurrent && model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA + can_shift = !recurrent; LLAMA_LOG_INFO("%s: kv_size = %d, offload = %d, type_k = '%s', type_v = '%s', n_layer = %d, 
can_shift = %d\n", __func__, kv_size, offload, ggml_type_name(type_k), ggml_type_name(type_v), n_layer, can_shift); diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ca6e3ab2caeb1..0a47e6afb1fbf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1119,6 +1119,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q); } ml.get_key(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv); + ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH_MLA, hparams.n_embd_head_k_mla, false); + ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH_MLA, hparams.n_embd_head_v_mla, false); ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp); ml.get_key(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared); ml.get_key(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale); @@ -3038,8 +3040,14 @@ bool llama_model::load_tensors(llama_model_loader & ml) { { const bool is_lite = (hparams.n_layer == 27); + const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); + + // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA + const int64_t n_embd_head_k_mla = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k; + const int64_t n_embd_head_v_mla = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v; + const int64_t n_embd_head_qk_rope = hparams.n_rot; - const int64_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; + const int64_t n_embd_head_qk_nope = n_embd_head_k_mla - n_embd_head_qk_rope; const int64_t q_lora_rank = hparams.n_lora_q; const int64_t kv_lora_rank = hparams.n_lora_kv; @@ -3065,14 +3073,22 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (!is_lite) { layer.wq_a = create_tensor(tn(LLM_TENSOR_ATTN_Q_A, "weight", i), {n_embd, q_lora_rank}, 0); - layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k}, 0); + layer.wq_b = create_tensor(tn(LLM_TENSOR_ATTN_Q_B, "weight", i), {q_lora_rank, n_head * n_embd_head_k_mla}, 0); } else { - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_k_gqa}, 0); + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_head * n_embd_head_k_mla}, 0); } - layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + (n_embd_head_qk_rope)}, 0); - layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_head * ( n_embd_head_v), n_embd}, 0); + layer.wkv_a_mqa = create_tensor(tn(LLM_TENSOR_ATTN_KV_A_MQA, "weight", i), {n_embd, kv_lora_rank + n_embd_head_qk_rope}, 0); + + // note: only old legacy GGUF files will have the unsplit wkv_b tensor in + if (is_mla) { + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v_mla}, 0); + } else { + layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0); + } + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_head * n_embd_head_v_mla, n_embd}, 0); layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); @@ -4084,6 +4100,8 @@ void llama_model::print_info() const { LLAMA_LOG_INFO("%s: 
n_layer_dense_lead = %d\n", __func__, hparams.n_layer_dense_lead); LLAMA_LOG_INFO("%s: n_lora_q = %d\n", __func__, hparams.n_lora_q); LLAMA_LOG_INFO("%s: n_lora_kv = %d\n", __func__, hparams.n_lora_kv); + LLAMA_LOG_INFO("%s: n_embd_head_k_mla = %d\n", __func__, hparams.n_embd_head_k_mla); + LLAMA_LOG_INFO("%s: n_embd_head_v_mla = %d\n", __func__, hparams.n_embd_head_v_mla); LLAMA_LOG_INFO("%s: n_ff_exp = %d\n", __func__, hparams.n_ff_exp); LLAMA_LOG_INFO("%s: n_expert_shared = %d\n", __func__, hparams.n_expert_shared); LLAMA_LOG_INFO("%s: expert_weights_scale = %.1f\n", __func__, hparams.expert_weights_scale); @@ -9503,16 +9521,23 @@ struct llm_build_deepseek2 : public llm_graph_context { llm_build_deepseek2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) { bool is_lite = (hparams.n_layer == 27); + const bool is_mla = (hparams.n_embd_head_k_mla != 0 && hparams.n_embd_head_v_mla != 0); + + // note: these are the actual head sizes you get when treating as MHA or after "decompression" using wv_b for MLA + const int64_t n_embd_head_k = is_mla ? hparams.n_embd_head_k_mla : hparams.n_embd_head_k; + const int64_t n_embd_head_v = is_mla ? hparams.n_embd_head_v_mla : hparams.n_embd_head_v; + + const int64_t n_embd_head_qk_rope = hparams.n_rot; + const int64_t n_embd_head_qk_nope = n_embd_head_k - n_embd_head_qk_rope; + + const uint32_t kv_lora_rank = hparams.n_lora_kv; + // We have to pre-scale kq_scale and attn_factor to make the YaRN RoPE work correctly. // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation. const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale)); - const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(hparams.n_embd_head_k)); + const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k)); const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)); - const uint32_t n_embd_head_qk_rope = hparams.n_rot; - const uint32_t n_embd_head_qk_nope = hparams.n_embd_head_k - hparams.n_rot; - const uint32_t kv_lora_rank = hparams.n_lora_kv; - ggml_tensor * cur; ggml_tensor * inpL; @@ -9537,16 +9562,14 @@ struct llm_build_deepseek2 : public llm_graph_context { { ggml_tensor * q = NULL; if (!is_lite) { - // {n_embd, q_lora_rank} * {n_embd, n_tokens} -> {q_lora_rank, n_tokens} q = ggml_mul_mat(ctx0, model.layers[il].wq_a, cur); cb(q, "q", il); q = build_norm(q, - model.layers[il].attn_q_a_norm, NULL, + model.layers[il].attn_q_a_norm, nullptr, LLM_NORM_RMS, il); cb(q, "q", il); - // {q_lora_rank, n_head * hparams.n_embd_head_k} * {q_lora_rank, n_tokens} -> {n_head * hparams.n_embd_head_k, n_tokens} q = ggml_mul_mat(ctx0, model.layers[il].wq_b, q); cb(q, "q", il); } else { @@ -9554,96 +9577,148 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(q, "q", il); } - // split into {n_head * n_embd_head_qk_nope, n_tokens} - ggml_tensor * q_nope = ggml_view_3d(ctx0, q, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(q->type, hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + // split into {n_embd_head_qk_nope, n_head, n_tokens} + ggml_tensor * q_nope = ggml_view_3d(ctx0, q, + n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(q->type, n_embd_head_k), + ggml_row_size(q->type, n_embd_head_k) * n_head, 0); cb(q_nope, "q_nope", il); - // and {n_head * n_embd_head_qk_rope, n_tokens} - ggml_tensor * q_pe = ggml_view_3d(ctx0, q, n_embd_head_qk_rope, n_head, n_tokens, - ggml_row_size(q->type, 
hparams.n_embd_head_k), - ggml_row_size(q->type, hparams.n_embd_head_k * n_head), + // and {n_embd_head_qk_rope, n_head, n_tokens} + ggml_tensor * q_pe = ggml_view_3d(ctx0, q, + n_embd_head_qk_rope, n_head, n_tokens, + ggml_row_size(q->type, n_embd_head_k), + ggml_row_size(q->type, n_embd_head_k) * n_head, ggml_row_size(q->type, n_embd_head_qk_nope)); cb(q_pe, "q_pe", il); - // {n_embd, kv_lora_rank + n_embd_head_qk_rope} * {n_embd, n_tokens} -> {kv_lora_rank + n_embd_head_qk_rope, n_tokens} - ggml_tensor * kv_pe_compresseed = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); - cb(kv_pe_compresseed, "kv_pe_compresseed", il); + ggml_tensor * kv_cmpr_pe = ggml_mul_mat(ctx0, model.layers[il].wkv_a_mqa, cur); + cb(kv_cmpr_pe, "kv_cmpr_pe", il); // split into {kv_lora_rank, n_tokens} - ggml_tensor * kv_compressed = ggml_view_2d(ctx0, kv_pe_compresseed, kv_lora_rank, n_tokens, - kv_pe_compresseed->nb[1], + ggml_tensor * kv_cmpr = ggml_view_2d(ctx0, kv_cmpr_pe, + kv_lora_rank, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), 0); - cb(kv_compressed, "kv_compressed", il); + cb(kv_cmpr, "kv_cmpr", il); + + // and {n_embd_head_qk_rope, 1, n_tokens} + ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_cmpr_pe, + n_embd_head_qk_rope, 1, n_tokens, + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank + n_embd_head_qk_rope), + ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); + cb(k_pe, "k_pe", il); - // and {n_embd_head_qk_rope, n_tokens} - ggml_tensor * k_pe = ggml_view_3d(ctx0, kv_pe_compresseed, n_embd_head_qk_rope, 1, n_tokens, - kv_pe_compresseed->nb[1], - kv_pe_compresseed->nb[1], - ggml_row_size(kv_pe_compresseed->type, kv_lora_rank)); + // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + q_pe = ggml_cont(ctx0, q_pe); + q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); + cb(q_pe, "q_pe", il); + + // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + k_pe = ggml_cont(ctx0, k_pe); + k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor_scaled, beta_fast, beta_slow + ); cb(k_pe, "k_pe", il); // TODO: the CUDA backend used to not support non-cont. 
(RMS) norm, investigate removing ggml_cont - kv_compressed = ggml_cont(ctx0, kv_compressed); - kv_compressed = build_norm(kv_compressed, - model.layers[il].attn_kv_a_norm, NULL, + kv_cmpr = ggml_cont(ctx0, kv_cmpr); + kv_cmpr = build_norm(kv_cmpr, + model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); - cb(kv_compressed, "kv_compressed", il); + cb(kv_cmpr, "kv_cmpr", il); - // {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v)} * {kv_lora_rank, n_tokens} -> {n_head * (n_embd_head_qk_nope + n_embd_head_v), n_tokens} - ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_compressed); - cb(kv, "kv", il); + if (is_mla) { + ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, + n_embd_head_qk_nope, kv_lora_rank, n_head, + ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), + ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope) * kv_lora_rank, + 0); + cb(wk_b, "wk_b", il); - // split into {n_head * n_embd_head_qk_nope, n_tokens} - ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, n_embd_head_qk_nope, n_head, n_tokens, - ggml_row_size(kv->type, n_embd_head_qk_nope + hparams.n_embd_head_v), - ggml_row_size(kv->type, n_head * (n_embd_head_qk_nope + hparams.n_embd_head_v)), - 0); - cb(k_nope, "k_nope", il); + // {n_embd_head_qk_nope, n_tokens, n_head} + q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); + cb(q_nope, "q_nope_perm", il); - // and {n_head * n_embd_head_v, n_tokens} - ggml_tensor * v_states = ggml_view_3d(ctx0, kv, hparams.n_embd_head_v, n_head, n_tokens, - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)), - ggml_row_size(kv->type, (n_embd_head_qk_nope + hparams.n_embd_head_v)*n_head), - ggml_row_size(kv->type, (n_embd_head_qk_nope))); - cb(v_states, "v_states", il); + ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b, q_nope); + cb(q_nope_absorbed, "q_nope_absorbed", il); - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states", il); + // {n_embd_head_qk_rope, n_tokens, n_head} + q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); + cb(q_pe, "q_pe_perm", il); - v_states = ggml_view_2d(ctx0, v_states, hparams.n_embd_head_v * n_head, n_tokens, - ggml_row_size(kv->type, hparams.n_embd_head_v * n_head), - 0); - cb(v_states, "v_states", il); + // note: rope must go first for in-place context shifting in build_rope_shift() + ggml_tensor * q_states = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + cb(q_states, "q_states", il); - q_pe = ggml_cont(ctx0, q_pe); // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_rope_ext( - ctx0, q_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(q_pe, "q_pe", il); + // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} + q_states = ggml_permute(ctx0, q_states, 0, 2, 1, 3); + cb(q_states, "q_states_perm", il); - // shared RoPE key - k_pe = ggml_cont(ctx0, k_pe); // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this - k_pe = ggml_rope_ext( - ctx0, k_pe, inp_pos, nullptr, - n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - ext_factor, attn_factor_scaled, beta_fast, beta_slow - ); - cb(k_pe, "k_pe", il); + k_pe = ggml_view_2d(ctx0, k_pe, + n_embd_head_qk_rope, n_tokens, + ggml_row_size(k_pe->type, n_embd_head_qk_rope), + 0); + cb(k_pe, "k_pe_view", il); - ggml_tensor * q_states = ggml_concat(ctx0, q_nope, q_pe, 0); - cb(q_states, "q_states", il); + ggml_tensor * k_states = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + cb(k_states, "k_states", il); - ggml_tensor * k_states = ggml_concat(ctx0, k_nope, ggml_repeat(ctx0, k_pe, q_pe), 0); - cb(k_states, "k_states", il); + ggml_tensor * v_states = kv_cmpr; + cb(v_states, "v_states", il); - cur = build_attn(inp_attn, gf, - model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, kq_scale, il); + ggml_tensor * v_mla = ggml_view_3d(ctx0, model.layers[il].wv_b, + kv_lora_rank, n_embd_head_v, n_head, + ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), + ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank) * n_embd_head_v, + 0); + cb(v_mla, "v_mla", il); + + // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) + cur = build_attn_mla(inp_attn, gf, + model.layers[il].wo, NULL, + q_states, k_states, v_states, nullptr, v_mla, kq_scale, il); + } else { + ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); + cb(kv, "kv", il); + + // split into {n_embd_head_qk_nope, n_head, n_tokens} + ggml_tensor * k_nope = ggml_view_3d(ctx0, kv, + n_embd_head_qk_nope, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v), + ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, + 0); + cb(k_nope, "k_nope_view", il); + + // and {n_embd_head_v, n_head, n_tokens} + ggml_tensor * v_states = ggml_view_3d(ctx0, kv, + n_embd_head_v, n_head, n_tokens, + ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v), + ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, + ggml_row_size(kv->type, n_embd_head_qk_nope)); + cb(v_states, "v_states_view", il); + + v_states = ggml_cont(ctx0, v_states); + cb(v_states, "v_states_cont", il); + + // note: rope must go first for in-place context shifting in build_rope_shift() + ggml_tensor * q_states = ggml_concat(ctx0, q_pe, q_nope, 0); + cb(q_states, "q_states", il); + + ggml_tensor * k_states = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0); + cb(k_states, "k_states", il); + + // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) + cur = build_attn(inp_attn, gf, + model.layers[il].wo, NULL, + q_states, k_states, v_states, nullptr, kq_scale, il); + } } if (il == n_layer - 1) { diff --git a/src/llama-model.h b/src/llama-model.h index 91e6e8725acd2..77b4b0e1bc24e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -169,6 +169,8 @@ struct llama_layer { struct ggml_tensor * wq_b = nullptr; struct ggml_tensor * wkv_a_mqa = nullptr; struct ggml_tensor * wkv_b = nullptr; + struct ggml_tensor * wk_b = nullptr; + struct ggml_tensor * wv_b = nullptr; struct ggml_tensor * wq_cross = nullptr; struct ggml_tensor * wk_cross = nullptr; struct ggml_tensor * wv_cross = nullptr; From ddab5e43edf6e8e0053b89bcf3210fbb9301c197 Mon Sep 17 00:00:00 2001 From: juk Date: Mon, 7 Apr 2025 19:44:48 +0100 Subject: [PATCH 02/13] Force flash attention off for `LLM_ARCH_DEEPSEEK2` - embedding too large --- src/llama-context.cpp | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index c2d0759cf16ea..f6015bee746ec 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -2278,6 +2278,11 @@ llama_context * llama_init_from_model( params.flash_attn = false; } + if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) { + LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__); + params.flash_attn = false; + } + if (ggml_is_quantized(params.type_v) && !params.flash_attn) { LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__); return nullptr; From 2a4e1b25b0ba81312f3c7c24a5820fb316c6bc3a Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 18:52:05 +0100 Subject: [PATCH 03/13] Removed 3 conts (2x RoPE and 1x RMS-norm) --- src/llama-model.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0a47e6afb1fbf..e659e2cb0ea4b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9611,24 +9611,18 @@ struct llm_build_deepseek2 : public llm_graph_context { ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); cb(k_pe, "k_pe", il); - // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_cont(ctx0, q_pe); q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_pe, "q_pe", il); - // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - k_pe = ggml_cont(ctx0, k_pe); k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(k_pe, "k_pe", il); - // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont - kv_cmpr = ggml_cont(ctx0, kv_cmpr); kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); From 77fe59b402fc3975ffe8f8e3f09ea6b68582bb1c Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 18:54:40 +0100 Subject: [PATCH 04/13] Changed to use `` instead of `` --- src/llama-context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index f6015bee746ec..d3ef1cbdeb65e 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include // // llama_context From e2153236ce3c9958b5c3fae0f36ee4874ca16a03 Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 19:28:13 +0100 Subject: [PATCH 05/13] Reverted removal of the 3 conts --- src/llama-model.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e659e2cb0ea4b..0a47e6afb1fbf 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9611,18 +9611,24 @@ struct llm_build_deepseek2 : public llm_graph_context { ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); cb(k_pe, "k_pe", il); + // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this + q_pe = ggml_cont(ctx0, q_pe); q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_pe, "q_pe", il); + // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this + k_pe = ggml_cont(ctx0, k_pe); k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(k_pe, "k_pe", il); + // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont + kv_cmpr = ggml_cont(ctx0, kv_cmpr); kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); From 815f4f9ecf70af94c67da5744f2c6a70b5092b7d Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 19:32:19 +0100 Subject: [PATCH 06/13] Used `reshape` in `llm_graph_context::build_attn_mha()` --- src/llama-graph.cpp | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 6d3e4effbc969..1dade6f5d85d8 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1175,10 +1175,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply if (n_head_kv == 1) { - q = ggml_view_2d(ctx0, q, - n_embd, n_tokens*n_head, - ggml_row_size(q->type, n_embd), - 0); + q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head); } ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); @@ -1188,11 +1185,7 @@ ggml_tensor * llm_graph_context::build_attn_mha( ggml_mul_mat_set_prec(kq, GGML_PREC_F32); if (n_head_kv == 1) { - kq = ggml_view_3d(ctx0, kq, - n_kv, n_tokens, n_head, - ggml_row_size(kq->type, n_kv), - ggml_row_size(kq->type, n_kv)*n_tokens, - 0); + kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head); } if (arch == LLM_ARCH_GROK) { From 57788614a0a30e653e303388b0e8bdca839ac102 Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 19:35:43 +0100 Subject: [PATCH 07/13] Use `k_pe = ggml_reshape` --- src/llama-model.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 0a47e6afb1fbf..c68ea7356212e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -9661,11 +9661,8 @@ struct llm_build_deepseek2 : public llm_graph_context { q_states = ggml_permute(ctx0, q_states, 0, 2, 1, 3); cb(q_states, "q_states_perm", il); - k_pe = ggml_view_2d(ctx0, k_pe, - n_embd_head_qk_rope, n_tokens, - ggml_row_size(k_pe->type, n_embd_head_qk_rope), - 0); - cb(k_pe, "k_pe_view", il); + k_pe = ggml_reshape_2d(ctx0, k_pe, n_embd_head_qk_rope, n_tokens); + cb(k_pe, "k_pe_reshape", il); ggml_tensor * k_states = ggml_concat(ctx0, k_pe, kv_cmpr, 0); cb(k_states, "k_states", il); From 5d037ae935790a9357e046651d47062375b0a9f7 Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 20:19:46 +0100 Subject: [PATCH 08/13] Removed the 3 conts again --- src/llama-model.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 434c7a3915005..6937398667cbd 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -10125,24 +10125,18 @@ struct llm_build_deepseek2 : public llm_graph_context { ggml_row_size(kv_cmpr_pe->type, kv_lora_rank)); cb(k_pe, "k_pe", il); - // TODO: the CUDA backend used to not support non-cont. RoPE, investigate removing this - q_pe = ggml_cont(ctx0, q_pe); q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(q_pe, "q_pe", il); - // TODO: the CUDA backend used to not support non-cont. 
RoPE, investigate removing this - k_pe = ggml_cont(ctx0, k_pe); k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, ext_factor, attn_factor_scaled, beta_fast, beta_slow ); cb(k_pe, "k_pe", il); - // TODO: the CUDA backend used to not support non-cont. (RMS) norm, investigate removing ggml_cont - kv_cmpr = ggml_cont(ctx0, kv_cmpr); kv_cmpr = build_norm(kv_cmpr, model.layers[il].attn_kv_a_norm, nullptr, LLM_NORM_RMS, il); From 638b092d7a16a00b019c1e3e6f5a9031ab285b30 Mon Sep 17 00:00:00 2001 From: juk Date: Sat, 12 Apr 2025 20:26:24 +0100 Subject: [PATCH 09/13] Removed the 3D views of `wk_b` and `wv_b`, and just save and 3D in GGUF --- convert_hf_to_gguf.py | 2 -- src/llama-model.cpp | 22 ++++------------------ 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 43ecc53854dad..89522dee8b8ad 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -4523,8 +4523,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter kv_b = data_torch.view(n_head_kv, v_head_dim + qk_nope_head_dim, data_torch.shape[-1]) k_b, v_b = torch.split(kv_b, [qk_nope_head_dim, v_head_dim], dim=1) k_b = k_b.transpose(1, 2) - k_b = k_b.reshape(n_head_kv * data_torch.shape[-1], qk_nope_head_dim) - v_b = v_b.reshape(n_head_kv * v_head_dim, data_torch.shape[-1]) return [ (self.map_tensor_name(name_kb), k_b), diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6937398667cbd..9982bf95bc4a3 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3249,8 +3249,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { // note: only old legacy GGUF files will have the unsplit wkv_b tensor in if (is_mla) { - layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, n_head * kv_lora_rank}, 0); - layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_head * n_embd_head_v_mla}, 0); + layer.wk_b = create_tensor(tn(LLM_TENSOR_ATTN_K_B, "weight", i), {n_embd_head_qk_nope, kv_lora_rank, n_head}, 0); + layer.wv_b = create_tensor(tn(LLM_TENSOR_ATTN_V_B, "weight", i), {kv_lora_rank, n_embd_head_v_mla, n_head}, 0); } else { layer.wkv_b = create_tensor(tn(LLM_TENSOR_ATTN_KV_B, "weight", i), {kv_lora_rank, n_head * (n_embd_head_qk_nope + n_embd_head_v_mla)}, 0); } @@ -10143,18 +10143,11 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(kv_cmpr, "kv_cmpr", il); if (is_mla) { - ggml_tensor * wk_b = ggml_view_3d(ctx0, model.layers[il].wk_b, - n_embd_head_qk_nope, kv_lora_rank, n_head, - ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope), - ggml_row_size(model.layers[il].wk_b->type, n_embd_head_qk_nope) * kv_lora_rank, - 0); - cb(wk_b, "wk_b", il); - // {n_embd_head_qk_nope, n_tokens, n_head} q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope, "q_nope_perm", il); - ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, wk_b, q_nope); + ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope); cb(q_nope_absorbed, "q_nope_absorbed", il); // {n_embd_head_qk_rope, n_tokens, n_head} @@ -10178,17 +10171,10 @@ struct llm_build_deepseek2 : public llm_graph_context { ggml_tensor * v_states = kv_cmpr; cb(v_states, "v_states", il); - ggml_tensor * v_mla = ggml_view_3d(ctx0, model.layers[il].wv_b, - kv_lora_rank, n_embd_head_v, n_head, - ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank), - ggml_row_size(model.layers[il].wv_b->type, kv_lora_rank) * n_embd_head_v, - 0); - 
cb(v_mla, "v_mla", il); - // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) cur = build_attn_mla(inp_attn, gf, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, v_mla, kq_scale, il); + q_states, k_states, v_states, nullptr, model.layers[il].wv_b, kq_scale, il); } else { ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); cb(kv, "kv", il); From a5df71ec9c6725c6412f8341cc78661c8d0191e4 Mon Sep 17 00:00:00 2001 From: juk Date: Sun, 13 Apr 2025 12:40:31 +0100 Subject: [PATCH 10/13] Removed MQA optimisation from `build_attn_mha()` as no gains now --- src/llama-graph.cpp | 22 +++------------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index ab69f666a9834..d0dbbdd951bde 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1200,18 +1200,12 @@ ggml_tensor * llm_graph_context::build_attn_mha( //const auto & n_embd_head_k = hparams.n_embd_head_k; //const auto & n_embd_head_v = hparams.n_embd_head_v; - const auto n_embd = q->ne[0]; - const auto n_tokens = q->ne[1]; - const auto n_head = q->ne[2]; - - const auto n_kv = k->ne[1]; - const auto n_head_kv = k->ne[2]; - // note: for MLA with the absorption optimization, the final embedding size will be changed via v_mla const auto n_embd_head_v = v_mla == nullptr ? v_trans ? v->ne[1] : v->ne[0] : v_mla->ne[1]; - GGML_ASSERT(k->ne[0] == q->ne[0] && "K and Q embedding size mismatch"); - GGML_ASSERT(k->ne[2] == v->ne[2] && "K and V number of heads mismatch"); + const auto n_tokens = q->ne[1]; + const auto n_head = q->ne[2]; + const auto n_kv = k->ne[1]; ggml_tensor * cur; @@ -1239,22 +1233,12 @@ ggml_tensor * llm_graph_context::build_attn_mha( cur = ggml_reshape_2d(ctx0, cur, n_embd_head_v*n_head, n_tokens); } else { - - // for MQA (ie: GQA with 1 group) we don't need to use a batched matrix multiply - if (n_head_kv == 1) { - q = ggml_reshape_2d(ctx0, q, n_embd, n_tokens*n_head); - } - ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); // note: this op tends to require high floating point range // while for some models F16 is enough, for others it is not, so we default to F32 here ggml_mul_mat_set_prec(kq, GGML_PREC_F32); - if (n_head_kv == 1) { - kq = ggml_reshape_3d(ctx0, kq, n_kv, n_tokens, n_head); - } - if (arch == LLM_ARCH_GROK) { // need to do the following: // multiply by attn_output_multiplyer of 0.08838834764831845 From 925af997e8e029a3b184ffb478d3e9bdd35f787b Mon Sep 17 00:00:00 2001 From: juk Date: Sun, 13 Apr 2025 12:41:33 +0100 Subject: [PATCH 11/13] Simplified `is_mla` branch in `llm_build_deepseek2()` --- src/llama-model.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9982bf95bc4a3..8bb9bdc4a0470 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -10147,27 +10147,27 @@ struct llm_build_deepseek2 : public llm_graph_context { q_nope = ggml_permute(ctx0, q_nope, 0, 2, 1, 3); cb(q_nope, "q_nope_perm", il); + // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head} ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope); cb(q_nope_absorbed, "q_nope_absorbed", il); - // {n_embd_head_qk_rope, n_tokens, n_head} - q_pe = ggml_permute(ctx0, q_pe, 0, 2, 1, 3); - cb(q_pe, "q_pe_perm", il); + // {kv_lora_rank, n_head, n_tokens} + q_nope_absorbed = ggml_permute(ctx0, q_nope_absorbed, 0, 2, 1, 3); + cb(q_nope_absorbed, "q_nope_absorbed_perm", il); + // 
{n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // note: rope must go first for in-place context shifting in build_rope_shift() ggml_tensor * q_states = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); cb(q_states, "q_states", il); - // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} - q_states = ggml_permute(ctx0, q_states, 0, 2, 1, 3); - cb(q_states, "q_states_perm", il); - - k_pe = ggml_reshape_2d(ctx0, k_pe, n_embd_head_qk_rope, n_tokens); - cb(k_pe, "k_pe_reshape", il); + kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); + cb(kv_cmpr, "kv_cmpr_reshape", il); + // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} ggml_tensor * k_states = ggml_concat(ctx0, k_pe, kv_cmpr, 0); cb(k_states, "k_states", il); + // {kv_lora_rank, 1, n_tokens} ggml_tensor * v_states = kv_cmpr; cb(v_states, "v_states", il); From 36ce2353c33cfce7b1571e21c633b8bd071f0163 Mon Sep 17 00:00:00 2001 From: juk Date: Sun, 13 Apr 2025 13:15:28 +0100 Subject: [PATCH 12/13] Removed `build_attn_mla` and added `nullptr` to all `build_atnn` calls --- src/llama-graph.cpp | 126 ++-------------------------------------- src/llama-graph.h | 17 +----- src/llama-model.cpp | 138 ++++++++++++++++++++++---------------------- 3 files changed, 77 insertions(+), 204 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index d0dbbdd951bde..5d0222b981058 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1311,6 +1311,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, + ggml_tensor * v_mla, float kq_scale, int il) const { GGML_UNUSED(n_tokens); @@ -1332,7 +1333,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); //cb(k, "v", il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, nullptr, false, kq_scale); + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale); cb(cur, "kqv_out", il); @@ -1386,6 +1387,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, + ggml_tensor * v_mla, float kq_scale, int il) const { // these nodes are added to the graph together so that they are not reordered @@ -1471,7 +1473,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, 0); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, nullptr, v_trans, kq_scale); + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale); cb(cur, "kqv_out", il); if (wo) { @@ -1511,6 +1513,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * k_cur, ggml_tensor * v_cur, ggml_tensor * kq_b, + ggml_tensor * v_mla, float kq_scale, int il) const { // these nodes are added to the graph together so that they are not reordered @@ -1530,7 +1533,7 @@ ggml_tensor * llm_graph_context::build_attn( ggml_tensor * v = ggml_permute(ctx0, v_cur, 0, 2, 1, 3); //cb(k, "v", il); - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, nullptr, false, kq_scale); + ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, false, kq_scale); cb(cur, "kqv_out", il); @@ -1549,123 +1552,6 @@ ggml_tensor * llm_graph_context::build_attn( return cur; } -// **************************************************************************************************************** -// *** THIS WILL BE REMOVED AFTER CODE REVIEW IS ACCPETED AND READY TO MERGE - IT'S JUST A COPY OF build_attn() *** -// 
**************************************************************************************************************** -ggml_tensor * llm_graph_context::build_attn_mla( - llm_graph_input_attn_kv_unified * inp, - ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, - ggml_tensor * k_cur, - ggml_tensor * v_cur, - ggml_tensor * kq_b, - ggml_tensor * v_mla, - float kq_scale, - int il) const { - // these nodes are added to the graph together so that they are not reordered - // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(gf, q_cur); - ggml_build_forward_expand(gf, k_cur); - ggml_build_forward_expand(gf, v_cur); - - const llama_kv_cache_unified * kv_self = static_cast(memory); - const auto & n_ctx = cparams.n_ctx; - - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - const auto n_tokens = q_cur->ne[2]; - - const bool v_trans = !cparams.flash_attn; - - // store to KV cache - { - GGML_ASSERT(!kv_self->recurrent); - - const auto kv_head = kv_self->head; - - GGML_ASSERT(kv_self->size == n_ctx); - - ggml_tensor * k_cache_view = ggml_view_1d(ctx0, kv_self->k_l[il], n_tokens*n_embd_k_gqa, ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa)*kv_head); - //cb(k_cache_view, "k_cache_view", il); - - // note: storing RoPE-ed version of K in the KV cache - ggml_build_forward_expand(gf, ggml_cpy(ctx0, k_cur, k_cache_view)); - - v_cur = ggml_reshape_2d(ctx0, v_cur, n_embd_v_gqa, n_tokens); - - ggml_tensor * v_cache_view = nullptr; - - if (!v_trans) { - v_cache_view = ggml_view_1d(ctx0, kv_self->v_l[il], n_tokens*n_embd_v_gqa, ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa)*kv_head); - } else { - // note: the V cache is transposed when not using flash attention - v_cache_view = ggml_view_2d(ctx0, kv_self->v_l[il], n_tokens, n_embd_v_gqa, - ( n_ctx)*ggml_element_size(kv_self->v_l[il]), - (kv_head)*ggml_element_size(kv_self->v_l[il])); - - v_cur = ggml_transpose(ctx0, v_cur); - } - //cb(v_cache_view, "v_cache_view", il); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, v_cur, v_cache_view)); - } - - const bool is_swa = hparams.is_swa(il); - - const auto & kq_mask = is_swa ? inp->get_kq_mask_swa() : inp->get_kq_mask(); - - const auto n_kv = kv_self->n; - - const int64_t n_head_kv = hparams.n_head_kv(il); - - const auto & n_embd_head_k = hparams.n_embd_head_k; - const auto & n_embd_head_v = hparams.n_embd_head_v; - - ggml_tensor * q = ggml_permute(ctx0, q_cur, 0, 2, 1, 3); - //cb(q, "q", il); - - ggml_tensor * k = - ggml_view_3d(ctx0, kv_self->k_l[il], - n_embd_head_k, n_kv, n_head_kv, - ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self->k_l[il]->type, n_embd_head_k), - 0); - //cb(k, "k", il); - - ggml_tensor * v = !v_trans ? 
- ggml_view_3d(ctx0, kv_self->v_l[il], - n_embd_head_v, n_kv, n_head_kv, - ggml_row_size(kv_self->v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self->v_l[il]->type, n_embd_head_v), - 0) : - ggml_view_3d(ctx0, kv_self->v_l[il], - n_kv, n_embd_head_v, n_head_kv, - ggml_element_size(kv_self->v_l[il])*n_ctx, - ggml_element_size(kv_self->v_l[il])*n_ctx*n_embd_head_v, - 0); - - ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, v_trans, kq_scale); - cb(cur, "kqv_out", il); - - if (wo) { - cur = build_lora_mm(wo, cur); - } - - if (wo_b) { - //cb(cur, "kqv_wo", il); - } - - if (wo_b) { - cur = ggml_add(ctx0, cur, wo_b); - } - - return cur; - -} - ggml_tensor * llm_graph_context::build_copy_mask_state( ggml_cgraph * gf, ggml_tensor * s, diff --git a/src/llama-graph.h b/src/llama-graph.h index 5bea0404e82eb..d192dc1495787 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -525,6 +525,7 @@ struct llm_graph_context { ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * kq_b, + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; @@ -539,6 +540,7 @@ struct llm_graph_context { ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] ggml_tensor * kq_b, + ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, int il) const; @@ -552,21 +554,6 @@ struct llm_graph_context { ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] - ggml_tensor * kq_b, - float kq_scale, - int il) const; - - // **************************************************************************************************************** - // *** THIS WILL BE REMOVED AFTER CODE REVIEW IS ACCPETED AND READY TO MERGE - IT'S JUST A COPY OF build_attn() *** - // **************************************************************************************************************** - ggml_tensor * build_attn_mla( - llm_graph_input_attn_kv_unified * inp, - ggml_cgraph * gf, - ggml_tensor * wo, - ggml_tensor * wo_b, - ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens] - ggml_tensor * k_cur, // [n_embd_head_k, 1, n_tokens] - ggml_tensor * v_cur, // [n_embd_head_v, 1, n_tokens] ggml_tensor * kq_b, ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v] float kq_scale, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8bb9bdc4a0470..e97a50cb43867 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -4514,7 +4514,7 @@ struct llm_build_llama : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); cb(cur, "attn_out", il); } @@ -4727,7 +4727,7 @@ struct llm_build_deci : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1) { @@ -4869,7 +4869,7 @@ struct llm_build_baichuan : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -4984,7 +4984,7 @@ struct 
llm_build_xverse : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5109,7 +5109,7 @@ struct llm_build_falcon : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5239,7 +5239,7 @@ struct llm_build_grok : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1) { @@ -5390,7 +5390,7 @@ struct llm_build_dbrx : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5504,7 +5504,7 @@ struct llm_build_starcoder : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5603,7 +5603,7 @@ struct llm_build_refact : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -5757,7 +5757,7 @@ struct llm_build_bert : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "kqv_out", il); if (il == n_layer - 1 && pooling_type == LLAMA_POOLING_TYPE_NONE) { @@ -5874,7 +5874,7 @@ struct llm_build_bloom : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6015,7 +6015,7 @@ struct llm_build_mpt : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6161,7 +6161,7 @@ struct llm_build_stablelm : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6284,7 +6284,7 @@ struct llm_build_qwen : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6404,7 +6404,7 @@ struct llm_build_qwen2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 
1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6525,7 +6525,7 @@ struct llm_build_qwen2vl : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6652,7 +6652,7 @@ struct llm_build_qwen2moe : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6805,7 +6805,7 @@ struct llm_build_qwen3 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -6926,7 +6926,7 @@ struct llm_build_qwen3moe : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7066,7 +7066,7 @@ struct llm_build_phi2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1) { @@ -7195,7 +7195,7 @@ struct llm_build_phi3 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1) { @@ -7330,7 +7330,7 @@ struct llm_build_plamo : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } ggml_tensor * sa_out = cur; @@ -7437,7 +7437,7 @@ struct llm_build_gpt2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7553,7 +7553,7 @@ struct llm_build_codeshell : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7682,7 +7682,7 @@ struct llm_build_orion : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -7809,7 +7809,7 @@ struct llm_build_internlm2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8006,7 +8006,7 @@ struct llm_build_minicpm3 : public 
llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, kq_scale, il); + q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1) { @@ -8136,7 +8136,7 @@ struct llm_build_gemma : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } if (il == n_layer - 1) { @@ -8258,7 +8258,7 @@ struct llm_build_gemma2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); } cur = build_norm(cur, @@ -8399,7 +8399,7 @@ struct llm_build_gemma3 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, hparams.f_attention_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, hparams.f_attention_scale, il); } cur = build_norm(cur, @@ -8539,7 +8539,7 @@ struct llm_build_starcoder2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -8874,7 +8874,7 @@ struct llm_build_command_r : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9009,7 +9009,7 @@ struct llm_build_cohere2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9140,7 +9140,7 @@ struct llm_build_olmo : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, nullptr, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9260,7 +9260,7 @@ struct llm_build_olmo2 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } cur = build_norm(cur, @@ -9393,7 +9393,7 @@ struct llm_build_olmoe : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9526,7 +9526,7 @@ struct llm_build_openelm : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9640,7 +9640,7 @@ struct llm_build_gptneox : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9790,7 +9790,7 @@ struct llm_build_arctic : public llm_graph_context 
{ cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -9945,7 +9945,7 @@ struct llm_build_deepseek : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1) { @@ -10157,24 +10157,24 @@ struct llm_build_deepseek2 : public llm_graph_context { // {n_embd_head_qk_rope + kv_lora_rank, n_head, n_tokens} // note: rope must go first for in-place context shifting in build_rope_shift() - ggml_tensor * q_states = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); - cb(q_states, "q_states", il); + ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope_absorbed, 0); + cb(Qcur, "Qcur", il); kv_cmpr = ggml_reshape_3d(ctx0, kv_cmpr, kv_lora_rank, 1, n_tokens); cb(kv_cmpr, "kv_cmpr_reshape", il); // {n_embd_head_qk_rope + kv_lora_rank, 1, n_tokens} - ggml_tensor * k_states = ggml_concat(ctx0, k_pe, kv_cmpr, 0); - cb(k_states, "k_states", il); + ggml_tensor * Kcur = ggml_concat(ctx0, k_pe, kv_cmpr, 0); + cb(Kcur, "Kcur", il); // {kv_lora_rank, 1, n_tokens} - ggml_tensor * v_states = kv_cmpr; - cb(v_states, "v_states", il); + ggml_tensor * Vcur = kv_cmpr; + cb(Vcur, "Vcur", il); // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group) - cur = build_attn_mla(inp_attn, gf, + cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, model.layers[il].wv_b, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, model.layers[il].wv_b, kq_scale, il); } else { ggml_tensor * kv = ggml_mul_mat(ctx0, model.layers[il].wkv_b, kv_cmpr); cb(kv, "kv", il); @@ -10188,27 +10188,27 @@ struct llm_build_deepseek2 : public llm_graph_context { cb(k_nope, "k_nope_view", il); // and {n_embd_head_v, n_head, n_tokens} - ggml_tensor * v_states = ggml_view_3d(ctx0, kv, + ggml_tensor * Vcur = ggml_view_3d(ctx0, kv, n_embd_head_v, n_head, n_tokens, ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v), ggml_row_size(kv->type, n_embd_head_qk_nope + n_embd_head_v) * n_head, ggml_row_size(kv->type, n_embd_head_qk_nope)); - cb(v_states, "v_states_view", il); + cb(Vcur, "Vcur_view", il); - v_states = ggml_cont(ctx0, v_states); - cb(v_states, "v_states_cont", il); + Vcur = ggml_cont(ctx0, Vcur); + cb(Vcur, "Vcur_cont", il); // note: rope must go first for in-place context shifting in build_rope_shift() - ggml_tensor * q_states = ggml_concat(ctx0, q_pe, q_nope, 0); - cb(q_states, "q_states", il); + ggml_tensor * Qcur = ggml_concat(ctx0, q_pe, q_nope, 0); + cb(Qcur, "Qcur", il); - ggml_tensor * k_states = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0); - cb(k_states, "k_states", il); + ggml_tensor * Kcur = ggml_concat(ctx0, ggml_repeat(ctx0, k_pe, q_pe), k_nope, 0); + cb(Kcur, "Kcur", il); // note: MLA without the absorption optimization converts into MHA (ie: GQA with full n_head groups) cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, kq_scale, il); + Qcur, Kcur, Vcur, nullptr, nullptr, kq_scale, il); } } @@ -10375,7 +10375,7 @@ struct llm_build_bitnet : public llm_graph_context { cur = build_attn(inp_attn, gf, NULL, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cur = build_norm(cur, 
model.layers[il].attn_sub_norm, NULL, @@ -10498,7 +10498,7 @@ struct llm_build_t5_enc : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo_enc, nullptr, - Qcur, Kcur, Vcur, kq_b, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, kq_b, 1.0f, il); cb(cur, "kqv_out", il); } @@ -10604,7 +10604,7 @@ struct llm_build_t5_dec : public llm_graph_context { cur = build_attn(inp_attn_self, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, kq_b, 1.0f, il); + Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); cb(cur, "kqv_out", il); } @@ -10636,7 +10636,7 @@ struct llm_build_t5_dec : public llm_graph_context { cur = build_attn(inp_attn_cross, gf, model.layers[il].wo_cross, nullptr, - Qcur, Kcur, Vcur, nullptr, 1.0f, il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f, il); cb(cur, "kqv_out", il); //ggml_tensor * q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); @@ -10769,7 +10769,7 @@ struct llm_build_jais : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/float(n_embd_head), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/float(n_embd_head), il); } if (il == n_layer - 1) { @@ -10901,7 +10901,7 @@ struct llm_build_chatglm : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -11034,7 +11034,7 @@ struct llm_build_glm4 : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -11178,7 +11178,7 @@ struct llm_build_nemotron : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -11309,7 +11309,7 @@ struct llm_build_exaone : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); } if (il == n_layer - 1) { @@ -12211,7 +12211,7 @@ struct llm_build_chameleon : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, nullptr, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); if (hparams.swin_norm) { cur = build_norm(cur, @@ -12567,7 +12567,7 @@ struct llm_build_plm : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, - q_states, k_states, v_states, nullptr, kq_scale, il); + q_states, k_states, v_states, nullptr, nullptr, kq_scale, il); } if (il == n_layer - 1) { @@ -12690,7 +12690,7 @@ struct llm_build_bailingmoe : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo, model.layers[il].bo, - Qcur, Kcur, Vcur, nullptr, 1.0f/sqrtf(float(n_rot)), il); + Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_rot)), il); } if (il == n_layer - 1) { From a5742780b2fe787f54ad7db1c52425e19eb957e9 Mon Sep 17 00:00:00 2001 From: juk Date: Sun, 13 Apr 2025 13:25:41 +0100 Subject: [PATCH 13/13] Fixed call to `build_attn` in `llm_build_t5_enc` --- src/llama-model.cpp | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index e97a50cb43867..248c61748eaa8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -10498,7 +10498,7 @@ struct llm_build_t5_enc : public llm_graph_context { cur = build_attn(inp_attn, gf, model.layers[il].wo_enc, nullptr, - Qcur, Kcur, Vcur, nullptr, kq_b, 1.0f, il); + Qcur, Kcur, Vcur, kq_b, nullptr, 1.0f, il); cb(cur, "kqv_out", il); }
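Note on the MLA absorption path in llm_build_deepseek2 above: because every head attends over the same compressed KV stream, the cache only holds the kv_lora_rank latent (plus the RoPE'd k_pe part) per token, and build_attn now takes the optional v_mla tensor so the per-head value decompression (model.layers[il].wv_b) can be applied after attention inside build_attn_mha. The sketch below is only an illustration of why the absorbed path ("MQA with 1 group") and the naive decompressed path compute the same result; it is a standalone numpy toy with made-up shapes, where W_kb / W_vb / q_nope / kv_cmpr are stand-in names rather than the actual llama.cpp tensors, and the RoPE half of the head plus the real attention scale are omitted.

import numpy as np

rng = np.random.default_rng(0)

n_head       = 4   # toy sizes, not the DeepSeek2 hparams
kv_lora_rank = 8
qk_nope_dim  = 6
v_head_dim   = 5
n_tokens     = 3

# per-head decompression weights (stand-ins for the split k_b / v_b projections)
W_kb = rng.standard_normal((n_head, qk_nope_dim, kv_lora_rank))
W_vb = rng.standard_normal((n_head, v_head_dim,  kv_lora_rank))

q_nope  = rng.standard_normal((n_head, n_tokens, qk_nope_dim))   # no-RoPE part of Q
kv_cmpr = rng.standard_normal((n_tokens, kv_lora_rank))          # shared compressed KV

def softmax(x):
    e = np.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

scale = 1.0 / np.sqrt(qk_nope_dim)

# naive path: decompress K and V per head, then ordinary multi-head attention
k_full = np.einsum('hdr,tr->htd', W_kb, kv_cmpr)                  # [h, t, qk_nope_dim]
v_full = np.einsum('hvr,tr->htv', W_vb, kv_cmpr)                  # [h, t, v_head_dim]
attn   = softmax(np.einsum('hqd,hkd->hqk', q_nope, k_full) * scale)
out_naive = np.einsum('hqk,hkv->hqv', attn, v_full)

# absorbed path: fold W_kb into Q, attend over the shared latent, decompress last
q_abs   = np.einsum('hqd,hdr->hqr', q_nope, W_kb)                 # "q_nope_absorbed"
attn2   = softmax(np.einsum('hqr,kr->hqk', q_abs, kv_cmpr) * scale)
out_lat = np.einsum('hqk,kr->hqr', attn2, kv_cmpr)                # attention output in latent space
out_abs = np.einsum('hqr,hvr->hqv', out_lat, W_vb)                # v_mla-style decompression

assert np.allclose(out_naive, out_abs)
print("absorbed (MQA over latent) path == naive (decompressed MHA) path")

Since the scores and outputs match exactly, attending over the shared latent and decompressing with the per-head value weights afterwards is just a reordering of the matrix products, which is what lets the absorbed branch reuse the generic build_attn with v_mla instead of keeping the separate build_attn_mla copy removed in PATCH 12.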