Make image input kwarg in multimodal processor configurable (#887)

MaxFeucht · web-flow · commit 9ee124de804c · 2025-09-16T08:04:04.000Z
* Make image alias in processor configurable

* Rename alias to key

* Add default arg to Patho-R1
diff --git a/src/eva/multimodal/models/networks/others.py b/src/eva/multimodal/models/networks/others.py
@@ -44,4 +44,5 @@ def __init__(
                 "max_pixels": 451584,  # 672*672
             },
             system_prompt=system_prompt,
+            image_key="images",
         )
diff --git a/src/eva/multimodal/models/wrappers/huggingface.py b/src/eva/multimodal/models/wrappers/huggingface.py
@@ -36,6 +36,7 @@ def __init__(
         system_prompt: str | None = None,
         processor_kwargs: Dict[str, Any] | None = None,
         generation_kwargs: Dict[str, Any] | None = None,
+        image_key: str = "image",
     ):
         """Initialize the HuggingFace model wrapper.
 
@@ -46,6 +47,7 @@ def __init__(
             system_prompt: System prompt to use.
             processor_kwargs: Additional processor arguments.
             generation_kwargs: Additional generation arguments.
+            image_key: Name of the processor keyword argument that receives the images.
         """
         super().__init__(system_prompt=system_prompt)
 
@@ -54,6 +56,7 @@ def __init__(
         self.base_model_class = model_class
         self.processor_kwargs = processor_kwargs or {}
         self.generation_kwargs = generation_kwargs or {}
+        self.image_key = image_key
 
         self.processor = self.load_processor()
         self.model = self.load_model()
@@ -106,7 +109,7 @@ def format_inputs(self, batch: TextImageBatch | TextBatch) -> Dict[str, torch.Te
         }
 
         if with_images:
-            processor_inputs["image"] = [[image] for image in image_batch]
+            processor_inputs[self.image_key] = [[image] for image in image_batch]
 
         return self.processor(**processor_inputs).to(self.model.device)  # type: ignore
 

Original file line number	Diff line number	Diff line change
`@@ -44,4 +44,5 @@ def __init__(`
`44`	`44`	`"max_pixels": 451584, # 672*672`
`45`	`45`	`},`
`46`	`46`	`system_prompt=system_prompt,`
	`47`	`+ image_key="images",`
`47`	`48`	`)`