
Commit 2a467bc

Merge branch 'develop'
2 parents 7f00189 + 19b5525 commit 2a467bc

File tree

6 files changed: +32 -17 lines changed

README.md

Lines changed: 7 additions & 5 deletions
@@ -30,7 +30,7 @@ In this tool, a **describer** is a backend for a family of vision language models
 
 ![Workflow](doc/bvqa-workflow.jpg)
 
-[Example of a report on test data with various vision language models](https://github.com/kingsdigitallab/kdl-vqa/blob/main/doc/bvqa-tests-2025-03-07.pdf)
+[Example of a report on test data with various vision language models](https://github.com/kingsdigitallab/kdl-vqa/blob/main/doc/bvqa-tests-2025-03-11.pdf)
 
 ## Requirements

@@ -107,7 +107,8 @@ A describer is a backend for bvqa that provides support for a family of vision language models
 | qwen-vl | [Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4) | 2b:int4 | 7 | 4:53 | unlimited |
 | qwen-vl | [Qwen/Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) -o | 3b:BF16 | 21 | 2:49 | |
 | qwen-vl | [allenai/olmOCR-7B-0225-preview](https://huggingface.co/allenai/olmOCR-7B-0225-preview) -o | 7b:BF16 | 24 | 3:21 | |
-| ovis | [AIDC-AI/Ovis2-1B](https://huggingface.co/AIDC-AI/Ovis2-1B) | 1b:BF16 | 3 | 0:42 | |
+| ovis | [AIDC-AI/Ovis2-1B](https://huggingface.co/AIDC-AI/Ovis2-1B) -o | 1b:BF16 | 3 | 0:42 | |
+| ovis | [AIDC-AI/Ovis2-4B](https://huggingface.co/AIDC-AI/Ovis2-4B) -o | 4b:BF16 | 10 | 1:01 | |
 | ollama | [llama3.2-vision](https://ollama.com/library/llama3.2-vision) | 11b:Q4_K_M | 12 | 0:59 | |
 | ollama | [minicpm-v](https://ollama.com/library/minicpm-v) | 8b:Q4_0 | 7 | 1:28 | |
 | ollama | [granite3.2-vision](https://ollama.com/library/granite3.2-vision) | 2b:Q4_K_M | 13 | UNRESPONSIVE | |
@@ -140,7 +141,7 @@ For those describers, the models refer to model names on the Hugging Face hub. I
 
 **Qwen** models can crash as they eat up an extraordinary amount of VRAM. To keep this under control, use the `-o` flag with your `describe` action. It will use flash_attention to drastically reduce memory use. However, the flash attention libraries require more recent generations of GPUs. The use of the `-o` flag is documented in the model column of the table above.
 
-**ovis** despite being small, fast and using very little VRAM, this model requires more recent GPUs due to the reliance on flash_attn package which we found often difficult to install or run on various machines.
+**ovis** also greatly benefits from `-o` (flash attention), reducing VRAM use by 3x.
 
 ## Reviewing (`report`)
 
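Presumably, `-o` toggles the attention implementation passed to `from_pretrained`. A minimal sketch of that mechanism for a generic transformers model (the `load_model` helper below is illustrative, not bvqa's actual describer code):

    import torch
    from transformers import AutoModelForCausalLM

    def load_model(model_id, use_flash_attention=False):
        options = dict(torch_dtype=torch.bfloat16)
        if use_flash_attention:
            # Requires the flash-attn package and a recent GPU generation.
            options['attn_implementation'] = 'flash_attention_2'
        return AutoModelForCausalLM.from_pretrained(model_id, **options)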
@@ -205,7 +206,8 @@ You can combine this with the -f option to test on a few images only.
 
 The -r option tells the tool to ignore the cache.
 When supplied, it will always ask the questions again.
-This is useful in the case where you want to compare the performance between different computing devices (e.g. Nvidia A100 vs L40s GPUs) to estimate the total duration on your entire collection.
+This is useful in the case where you want to compare the performance between different computing devices
+(e.g. Nvidia A100 vs L40s GPUs) to estimate the total duration on your entire collection.
 
 ## Parallelism
 
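For instance, a quick timing run over a small sample might look like this (assuming `-f` takes an image count, which this diff does not show):

    python3 bvqa.py describe -r -f 10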
@@ -262,7 +264,7 @@ After running your questions on a larger proportion of your collection, you might
 As prompt engineering is usually very model-specific, moving to another model can be very disruptive.
 It always means reassessing the answers and often means reformulating many questions from scratch.
 
-## Design principles
+## Guiding principles
 
 * Reproducibility
 * Ease of use

bvqa.py

Lines changed: 2 additions & 0 deletions
@@ -141,6 +141,8 @@ def action_describe(self):
         '''Submit questions about multiple images to a visual model & save answers.'''
         self.new_describer()
 
+        print(f'Describe with describer = {self.describer_name} ; model = {self.describer.get_name()}.')
+
         self.timer.step(f'model: {self.describer.get_name()}')
         import socket
         self.timer.step(f'host : {socket.gethostname()}')

describer/ovis.py

Lines changed: 16 additions & 8 deletions
@@ -10,10 +10,14 @@
 
 
 class Ovis(ImageDescriber):
-    """Image description using SmolVLM model.
+    """Image description using Ovis model.
 
-    https://huggingface.co/AIDC-AI/Ovis2-1B
     1.27B params, BF16
+
+    https://huggingface.co/AIDC-AI/Ovis2-1B
+    https://github.com/AIDC-AI/Ovis
+
+    Works on CPU but extremely slow, even 1B model.
     """
 
     def __init__(self, model_id='', model_version=''):
@@ -88,21 +92,25 @@ def _init_model(self):
             print('WARNING: running model on CPU')
         self._new_model()
 
-        from transformers import AutoProcessor
-        self.processor = AutoProcessor.from_pretrained(self.model_id)
-
         return self.model
 
     def _new_model(self, use_cuda=False, use_attention=False):
         from transformers import AutoModelForCausalLM
         import torch
 
-        self.model = AutoModelForCausalLM.from_pretrained(
-            self.model_id,
+        options = dict(
             torch_dtype=torch.bfloat16,
             multimodal_max_length=32768,
-            trust_remote_code=True
+            trust_remote_code=True,
         )
+
+        # https://github.com/AIDC-AI/Ovis/issues/64#issuecomment-2686944605
+        if not use_cuda:
+            options['device_map'] = 'cpu'
+        if not use_attention:
+            options['llm_attn_implementation'] = 'eager'
+
+        self.model = AutoModelForCausalLM.from_pretrained(self.model_id, **options)
         if use_cuda:
             self.model = self.model.cuda()
 
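Read together, the new `options` dict makes the loader degrade gracefully when CUDA or flash_attn is missing. A self-contained sketch of the same logic, runnable outside the Ovis class (the function name and defaults are illustrative):

    import torch
    from transformers import AutoModelForCausalLM

    def load_ovis(model_id='AIDC-AI/Ovis2-1B', use_cuda=False, use_attention=False):
        options = dict(
            torch_dtype=torch.bfloat16,
            multimodal_max_length=32768,  # Ovis-specific kwarg, honoured via trust_remote_code
            trust_remote_code=True,
        )
        # Workaround from the linked Ovis issue: pin to CPU when CUDA is absent,
        # and fall back to eager attention when flash_attn is unavailable (no -o).
        if not use_cuda:
            options['device_map'] = 'cpu'
        if not use_attention:
            options['llm_attn_implementation'] = 'eager'
        model = AutoModelForCausalLM.from_pretrained(model_id, **options)
        return model.cuda() if use_cuda else model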
test/data/test_cases.json

Lines changed: 2 additions & 2 deletions
@@ -13,8 +13,8 @@
         "location": ["indoor"]
     },
     "form": {
-        "long_description": ["Streetcar Magazine"],
+        "long_description": ["streetcar museum"],
         "location": ["N/A"],
-        "text": ["Tour", "Recital", "50% off with SBO", "renew old memories"]
+        "text": ["tour", "recital", "50% off with sbo", "renew old memories"]
     }
 }

test/describe/all.bash

Lines changed: 5 additions & 1 deletion
@@ -2,5 +2,9 @@ cd "$(dirname "$0")"
 bash describer.bash moondream vikhyatk/moondream2
 CUDA_VISIBLE_DEVICES=0 bash describer.bash qwen-vl Qwen/Qwen2.5-VL-3B-Instruct
 bash describer.bash smol HuggingFaceTB/SmolVLM-Instruct
-bash describer.bash ollama llama3.2-vision
+bash describer.bash ovis AIDC-AI/Ovis2-1B
+# TODO: understand why it hangs on 'object' qst for susan-q image
+# bash describer.bash ollama llama3.2-vision
+bash describer.bash ollama minicpm-v
+cd ../..
 python3 bvqa.py report -R test/data -t

utils/helpers.py

Lines changed: 0 additions & 1 deletion
@@ -103,7 +103,6 @@ def get_repeat_ratio(answer):
     while True:
         if len(words) < 2*l: break
         if ' '.join(words[-l:]) == ' '.join(words[-2*l:-l]):
-            print(words[-l:])
             ret = 1.0
             break
         l += 1
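The surviving loop flags answers whose tail repeats verbatim, a common failure mode of a model stuck in a loop. A self-contained reading of the function (the initialisation of `words`, `l` and `ret` is inferred from context, not shown in this hunk):

    def get_repeat_ratio(answer):
        words = answer.split()  # inferred: compare word sequences, not characters
        ret = 0.0
        l = 1
        while True:
            if len(words) < 2*l: break
            # If the last l words exactly repeat the l words before them,
            # treat the answer as degenerate (the model got stuck in a loop).
            if ' '.join(words[-l:]) == ' '.join(words[-2*l:-l]):
                ret = 1.0
                break
            l += 1
        return ret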
