@@ -1,4 +1,3 @@
-from typing import List
 from typing import Optional
 from typing import Tuple
 from typing import Union
@@ -10,17 +9,14 @@
 from transformers.cache_utils import HybridCache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.models.gemma3.modeling_gemma3 import Gemma3CausalLMOutputWithPast
-from transformers.utils import is_torchdynamo_compiling
 from transformers.utils import logging
-from transformers.utils.deprecation import deprecate_kwarg
 
 from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 
 logger = logging.get_logger(__name__)
 
 
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 def causal_forward(
     self,
     input_ids: torch.LongTensor = None,
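The dropped `@deprecate_kwarg` import and decorator were only a compatibility shim: they rerouted the old `num_logits_to_keep` kwarg to `logits_to_keep` until the rename completed in transformers 4.50. A minimal sketch of that pattern, assuming the usual rename-and-warn behavior (an illustration, not the transformers implementation):

```python
import functools
import warnings

def rename_kwarg(old_name, new_name, version):
    """Hypothetical stand-in for the removed shim: forward a renamed kwarg."""
    def decorator(fn):
        @functools.wraps(fn)
        def wrapper(*args, **kwargs):
            if old_name in kwargs:
                warnings.warn(
                    f"`{old_name}` is deprecated, use `{new_name}` (removal in v{version}).",
                    FutureWarning,
                )
                # Keep old callers working by forwarding to the new name
                kwargs.setdefault(new_name, kwargs.pop(old_name))
            return fn(*args, **kwargs)
        return wrapper
    return decorator
```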
@@ -139,14 +135,13 @@ def causal_forward(
     )
 
 
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 def multimodal_forward(
     self,
     input_ids: torch.LongTensor = None,
     pixel_values: torch.FloatTensor = None,
     attention_mask: Optional[torch.Tensor] = None,
     position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+    past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
     token_type_ids: Optional[torch.LongTensor] = None,
     cache_position: Optional[torch.LongTensor] = None,
     inputs_embeds: Optional[torch.FloatTensor] = None,
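The `List` to `list` change in `past_key_values` (and `Tuple` to `tuple` in the return annotation below) follows PEP 585: on Python 3.9+ the builtin containers are subscriptable, so the `typing` aliases are redundant. Both spellings describe the same type:

```python
from typing import List

# Pre-PEP 585 spelling and its builtin-generic equivalent (Python >= 3.9)
legacy: List[int] = [1, 2, 3]
modern: list[int] = [1, 2, 3]
```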
@@ -158,21 +153,12 @@ def multimodal_forward(
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
     **lm_kwargs,
-) -> Union[Tuple, Gemma3CausalLMOutputWithPast]:
+) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
     r"""
-    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-        config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
-
-    logits_to_keep (`int` or `torch.Tensor`, *optional*):
-        If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-        `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-        token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-        If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-        This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
+    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+        config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
 
     Example:
 
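The trimmed docstring drops the `logits_to_keep` description, but its semantics still drive the new slicing further down: an `int` keeps the last N positions, with `0` meaning all of them (since `slice(-0, None)` equals `slice(0, None)`), and a 1-D tensor selects explicit sequence indices for packed layouts. A quick illustration:

```python
import torch

hidden = torch.randn(2, 7, 16)  # (batch, seq_len, hidden_dim)

def keep(hidden, logits_to_keep):
    # Mirrors the slicing added in this diff
    idx = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    return hidden[:, idx, :]

print(keep(hidden, 0).shape)                     # (2, 7, 16): 0 keeps everything
print(keep(hidden, 1).shape)                     # (2, 1, 16): last token only
print(keep(hidden, torch.tensor([2, 5])).shape)  # (2, 2, 16): explicit indices
```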
@@ -181,111 +167,76 @@ def multimodal_forward(
     >>> import requests
     >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 
-    >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
-    >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")
-
-    >>> prompt = "answer en Where is the cow standing?"
-    >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
-    >>> image = Image.open(requests.get(url, stream=True).raw)
-
-    >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
-
+    >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
+    >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")
+
+    >>> messages = [
+    ...     {
+    ...         "role": "system",
+    ...         "content": [
+    ...             {"type": "text", "text": "You are a helpful assistant."}
+    ...         ]
+    ...     },
+    ...     {
+    ...         "role": "user", "content": [
+    ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
+    ...             {"type": "text", "text": "Where is the cat standing?"},
+    ...         ]
+    ...     },
+    ... ]
+
+    >>> inputs = processor.apply_chat_template(
+    ...     messages,
+    ...     tokenize=True,
+    ...     return_dict=True,
+    ...     return_tensors="pt",
+    ...     add_generation_prompt=True
+    ... )
     >>> # Generate
-    >>> generate_ids = model.generate(**inputs, max_length=30)
+    >>> generate_ids = model.generate(**inputs)
     >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "answer en Where is the cow standing?\nbeach"
-    ```"""
-
-    if (input_ids is None) ^ (inputs_embeds is not None):
-        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+    "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
+    ```
+    """
 
     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
     output_hidden_states = (
         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
     )
     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-    is_training = token_type_ids is not None and labels is not None
-
-    # Replace image id woth PAD if the image token if OOV, to avoid index-errors
-    if input_ids is not None and self.config.image_token_index >= self.vocab_size:
-        special_image_mask = input_ids == self.config.image_token_index
-        llm_input_ids = input_ids.clone()
-        llm_input_ids[special_image_mask] = 0
-    else:
-        llm_input_ids = input_ids
-
-    if inputs_embeds is None:
-        inputs_embeds = self.get_input_embeddings()(llm_input_ids)
-
-    if cache_position is None:
-        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-        cache_position = torch.arange(
-            past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-        )
-
-    if position_ids is None:
-        position_ids = cache_position.unsqueeze(0) + 1  # Gemma3 positions are 1-indexed
-
-    # Merge text and images
-    if pixel_values is not None:
-        image_features = self.get_image_features(pixel_values)
-
-        if input_ids is None:
-            special_image_mask = inputs_embeds == self.get_input_embeddings()(
-                torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device)
-            )
-        else:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
-            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
-
-        if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
-            image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
-            raise ValueError(
-                f"Number of images does not match number of special image tokens in the input text. "
-                f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
-                "tokens from image embeddings."
-            )
-        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
-
-    # mask out pad-token-ids in labels for BC
-    if labels is not None and self.pad_token_id in labels:
-        logger.warning_once(
-            "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
-            "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
-        )
-        labels = torch.where(input_ids == self.pad_token_id, self.config.ignore_index, labels)
-
-    causal_mask = self._update_causal_mask(
-        attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds, is_training
-    )
-    outputs = self.language_model.model(
-        attention_mask=causal_mask,
+    outputs = self.model(
+        input_ids=input_ids,
+        pixel_values=pixel_values,
+        token_type_ids=token_type_ids,
+        attention_mask=attention_mask,
         position_ids=position_ids,
         past_key_values=past_key_values,
         inputs_embeds=inputs_embeds,
         use_cache=use_cache,
+        labels=labels,
         output_attentions=output_attentions,
         output_hidden_states=output_hidden_states,
         return_dict=return_dict,
         cache_position=cache_position,
-        logits_to_keep=logits_to_keep,
         **lm_kwargs,
     )
 
     hidden_states = outputs[0]
+
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
     loss = None
     logits = None
-
     if skip_logits and labels is None:
         raise ValueError("skip_logits is True, but labels is None")
 
     if skip_logits is None:
         skip_logits = self.training and (labels is not None)
 
     if skip_logits:
-        shift_hidden_states = hidden_states[..., :-1, :]
+        shift_hidden_states = kept_hidden_states[..., :-1, :]
         shift_labels = labels[..., 1:]
 
         hidden_device = shift_hidden_states.device
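The large deletion above is the heart of this refactor: embedding lookup, image-feature merging, causal-mask construction, and label padding now live in `self.model` (the composed multimodal backbone in recent transformers releases), leaving this wrapper to handle only logits and loss. In the fused path, hidden states and labels are aligned shift-by-one before being handed to `LigerFusedLinearCrossEntropyLoss`:

```python
import torch

B, T, H = 2, 5, 8
hidden = torch.randn(B, T, H)
labels = torch.randint(0, 11, (B, T))

# Position t's hidden state predicts token t + 1, so drop the last
# hidden state and the first label before computing the loss.
shift_hidden = hidden[..., :-1, :]  # (B, T - 1, H)
shift_labels = labels[..., 1:]      # (B, T - 1)
```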
@@ -306,7 +257,7 @@ def multimodal_forward(
         lce = LigerFusedLinearCrossEntropyLoss()
         loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels)
     else:
-        logits = self.language_model.lm_head(hidden_states)
+        logits = self.lm_head(kept_hidden_states)
         if labels is not None:
             # Upcast to float if we need to compute the loss to avoid potential precision issues
             logits = logits.float()
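For intuition, a rough eager-mode equivalent of the fused call `lce(weight, shift_hidden_states, shift_labels)` is sketched below; the point of the Liger kernel is that the full `(N, V)` logits tensor on the first line is never materialized (assumed behavior based on the fused-linear-cross-entropy design, not a claim about kernel internals):

```python
import torch
import torch.nn.functional as F

def eager_linear_cross_entropy(weight, hidden, labels):
    # weight: (V, H) lm_head matrix, hidden: (N, H) flattened tokens, labels: (N,)
    logits = hidden @ weight.t()  # materializes (N, V), the cost the fusion avoids
    return F.cross_entropy(logits.float(), labels)  # -100 targets ignored by default
```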
@@ -327,6 +278,7 @@ def multimodal_forward(
             flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
             flat_labels = shift_labels.view(-1).to(shift_logits.device)
             loss = loss_fct(flat_logits, flat_labels)
+
     if not return_dict:
         output = (logits,) + outputs[1:]
         return (loss,) + output if loss is not None else output
@@ -337,5 +289,5 @@ def multimodal_forward(
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
-        image_hidden_states=image_features if pixel_values is not None else None,
+        image_hidden_states=outputs.image_hidden_states,
     )
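The final hunk is a consequence of the same delegation: `self.model` already merges the vision features and returns them, so the wrapper forwards `outputs.image_hidden_states` instead of recomputing `image_features` locally.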