Use unpack_batch function across multimodal wrappers (#886)

MaxFeucht · web-flow · commit 092bec95045d · 2025-09-12T16:15:29.000Z
* Move `unpack_batch` to `multimodal/utils` to use for `LiteLLMModel`

* lint
diff --git a/src/eva/multimodal/models/wrappers/huggingface.py b/src/eva/multimodal/models/wrappers/huggingface.py
@@ -13,6 +13,7 @@
 from eva.language.utils.text import messages as language_message_utils
 from eva.multimodal.models.typings import TextImageBatch
 from eva.multimodal.models.wrappers import base
+from eva.multimodal.utils.batch import unpack_batch
 from eva.multimodal.utils.text import messages as message_utils
 
 
@@ -72,7 +73,7 @@ def format_inputs(self, batch: TextImageBatch | TextBatch) -> Dict[str, torch.Te
                 "pixel_values": ...
             }
         """
-        message_batch, image_batch, _, _ = self._unpack_batch(batch)
+        message_batch, image_batch, _, _ = unpack_batch(batch)
         with_images = image_batch is not None
 
         message_batch = language_message_utils.batch_insert_system_message(
@@ -158,11 +159,6 @@ def load_processor(self) -> Callable:
             **self.processor_kwargs,
         )
 
-    def _unpack_batch(self, batch: TextImageBatch | TextBatch) -> tuple:
-        if isinstance(batch, TextImageBatch):
-            return batch.text, batch.image, batch.target, batch.metadata
-        return batch.text, None, batch.target, batch.metadata
-
     def _decode_output(self, output: torch.Tensor, instruction_length: int) -> List[str]:
         """Decode the model's batch output to text.
 
diff --git a/src/eva/multimodal/models/wrappers/litellm.py b/src/eva/multimodal/models/wrappers/litellm.py
@@ -10,6 +10,7 @@
 from eva.language.utils.text import messages as language_message_utils
 from eva.multimodal.models.typings import TextImageBatch
 from eva.multimodal.models.wrappers import base
+from eva.multimodal.utils.batch import unpack_batch
 from eva.multimodal.utils.text import messages as message_utils
 
 
@@ -43,7 +44,7 @@ def __init__(
 
     @override
     def format_inputs(self, batch: TextImageBatch) -> List[List[Dict[str, Any]]]:
-        message_batch, image_batch, _, _ = TextImageBatch(*batch)
+        message_batch, image_batch, _, _ = unpack_batch(batch)
 
         message_batch = language_message_utils.batch_insert_system_message(
             message_batch, self.system_message
diff --git a/src/eva/multimodal/utils/batch/__init__.py b/src/eva/multimodal/utils/batch/__init__.py
@@ -0,0 +1,5 @@
+"""Multimodal batch utilities API."""
+
+from eva.multimodal.utils.batch.unpack import unpack_batch
+
+__all__ = ["unpack_batch"]
diff --git a/src/eva/multimodal/utils/batch/unpack.py b/src/eva/multimodal/utils/batch/unpack.py
@@ -0,0 +1,11 @@
+"""Unpack batch utility function."""
+
+from eva.language.models.typings import TextBatch
+from eva.multimodal.models.typings import TextImageBatch
+
+
+def unpack_batch(batch: TextImageBatch | TextBatch) -> tuple:
+    """Unpacks a batch into (text, image, target, metadata); image is None for a TextBatch."""
+    if isinstance(batch, TextImageBatch):
+        return batch.text, batch.image, batch.target, batch.metadata
+    return batch.text, None, batch.target, batch.metadata