Commit af94d0a

Initial commit

177 files changed: +30684 -0 lines

.DS_Store

6 KB
Binary file not shown.

LICENSE

+674
Large diffs are not rendered by default.

README.md

+25
# Where do Large Vision-Language Models Look at when Answering Questions?

The official repo for "[Where do Large Vision-Language Models Look at when Answering Questions?](https://github.com/bytedance/LVLM_Interpretation)", a PyTorch implementation of a saliency heatmap visualization method that interprets the open-ended responses of LVLMs conditioned on an image.

### Installation

First clone this repository and navigate to the folder.

The environment setup mainly follows [LLaVA](https://github.com/haotian-liu/LLaVA). You can upgrade pip and install the dependencies using:

```
$ pip install --upgrade pip
$ bash install.sh
```

### Model Preparation

For Mini-Gemini models, please follow the instructions in [MGM](https://github.com/dvlab-research/MGM) to download the models and put them in the folders following [Structure](https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure).

### Quick Start

To generate the saliency heatmap of an LVLM as it generates free-form responses, run a command such as the following, with the hyperparameters passed as arguments:

```
$ python3 main.py --method iGOS+ --model llava --dataset <dataset name> --data_path <path/to/questions> --image_folder <path/to/images> --output_dir <path/to/output> --size 32 --L1 1.0 --L2 0.1 --L3 10.0 --ig_iter 10 --gamma 1.0 --iterations 5 --momentum 5
```

An explanation of each argument can be found in [args.py](args.py).

### Acknowledgement

Some parts of the code are built upon [IGOS_pp](https://github.com/khorrams/IGOS_pp). We also use the open-source LVLMs [LLaVA-1.5](https://github.com/haotian-liu/LLaVA), [LLaVA-OneVision](https://github.com/LLaVA-VL/LLaVA-NeXT), [Cambrian](https://github.com/cambrian-mllm/cambrian) and [Mini-Gemini](https://github.com/dvlab-research/MGM) in this project. We thank the authors for their excellent work.

args.py

+175
```
import argparse


def init_args():

    parser = argparse.ArgumentParser(
        description='Generate explanations for open-ended responses of LVLMs.'
    )

    parser.add_argument(
        '--model',
        metavar='M',
        type=str,
        choices=['llava', 'cambrian', 'llava_next', 'mgm'],
        default='llava',
        help='The model to use for making predictions.')

    parser.add_argument(
        '--dataset',
        type=str,
        default='cvbench',
        help='The dataset to use for making predictions.')

    parser.add_argument(
        '--output_dir',
        type=str,
        help='The path to the output directory for saving explanation results.',
        required=True)

    parser.add_argument(
        '--size',
        type=int,
        default=32,
        help='The resolution of the mask to be generated.')

    parser.add_argument(
        '--input_size',
        type=int,
        default=336,
        help='The input size to the network.')

    parser.add_argument(
        '--manual_seed',
        type=int,
        default=0,
        help='The manual seed for experiments.')

    parser.add_argument(
        '--method',
        type=str,
        choices=['iGOS+', 'iGOS++'],
        default='iGOS+'
    )

    parser.add_argument(
        '--opt',
        type=str,
        choices=['LS', 'NAG'],
        default='NAG',
        help='The optimization algorithm.'
    )
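    # Assumption (a gloss not present in the original help text): following
    # the IGOS_pp codebase this builds on, 'LS' likely denotes line search
    # and 'NAG' Nesterov's accelerated gradient.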

    parser.add_argument(
        '--diverse_k',
        type=int,
        default=1)

    parser.add_argument(
        '--init_posi',
        type=int,
        default=0,
        help='The initialization position: which cell of the K x K grid will be used to initialize the mask with nonzero values (use init_val to control the value).')
"""
74+
If K = 2: If K = 3:
75+
------- ----------
76+
|0 |1 | |0 |1 |2 |
77+
------- ----------
78+
|2 |3 | |3 |4 |5 |
79+
------- ----------
80+
|6 |7 |8 |
81+
----------
82+
"""

    parser.add_argument(
        '--init_val',
        type=float,
        default=0.,
        help='The initialization value used to initialize the mask in only one cell of the K x K grid.')

    parser.add_argument(
        '--L1',
        type=float,
        default=0.1,
        help='The weight of the L1 norm'
    )

    parser.add_argument(
        '--L2',
        type=float,
        default=1.0,
        help='The weight of the L2 norm'
    )

    parser.add_argument(
        '--gamma',
        type=float,
        default=0.2,
        help='The exponential decay rate of the graduated non-convexity'
    )

    parser.add_argument(
        '--L3',
        type=float,
        default=20.0,
        help='The weight of the BTV norm'
    )

    parser.add_argument(
        '--momentum',
        type=int,
        default=3
    )

    parser.add_argument(
        '--ig_iter',
        type=int,
        help='The step size of the integrated gradient accumulation.',
        default=5)

    parser.add_argument(
        '--iterations',
        type=int,
        default=5
    )

    parser.add_argument(
        '--lr',
        type=float,
        default=10,
        help='The step size for updating the mask'
    )

    parser.add_argument(
        '--model_base',
        default=None,
        type=str,
        help='The path to the model base file to be used.'
    )

    parser.add_argument(
        "--data_path",
        type=str,
        required=True,
        help="The path to the input question file."
    )

    parser.add_argument(
        "--image_folder",
        type=str,
        required=True,
        help="The path to the image folder."
    )

    # LVLM generation settings
    parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--max_new_tokens", type=int, default=1024)
    parser.add_argument("--num_beams", type=int, default=1)

    # Note: argparse's type=bool treats any non-empty string (including
    # "False") as True, so these flags are defined as store_true switches.
    parser.add_argument("--use_yake", action="store_true", help="Whether to use YAKE to detect keywords")
    parser.add_argument("--choices", action="store_true", help="Whether to ask the LVLMs to generate a single choice instead of open-ended responses")
    parser.add_argument("--ablation_zero", action="store_true", help="Ablation of the baseline image using all-zero images")
    parser.add_argument("--ablation_noise", action="store_true", help="Ablation of the baseline image using random noise")

    return parser.parse_args()
```
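A minimal sketch of how an entry point such as main.py might consume these parsed arguments (main.py itself is not shown in this excerpt, so the usage below is illustrative):

```
# Illustrative only: consuming the arguments parsed by init_args().
from args import init_args

if __name__ == '__main__':
    args = init_args()
    # The Quick Start command in the README populates these fields, e.g.
    # args.method == 'iGOS+', args.model == 'llava', args.size == 32.
    print(args.method, args.model, args.size, args.output_dir)
```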

cambrian/__init__.py

+2
```
from .model.language_model.cambrian_llama import CambrianLlamaForCausalLM, CambrianConfig
from .model.language_model.cambrian_mistral import CambrianMistralForCausalLM, CambrianMistralConfig
```
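These re-exports make the Cambrian model classes importable from the package root, for example:

```
# Illustrative usage enabled by the package-level re-exports above.
from cambrian import CambrianLlamaForCausalLM, CambrianConfig
```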

cambrian/constants.py

+13
```
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
```
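The image-token constants follow the LLaVA-style convention: each `<image>` placeholder in a prompt is replaced by the sentinel id `IMAGE_TOKEN_INDEX`, which the model later resolves to visual features. A minimal sketch of that convention; the helper name and the `encode` callable are assumptions for illustration, not part of this commit:

```
# Illustrative sketch, not from this commit: splice the image-token sentinel
# into the tokenized prompt wherever DEFAULT_IMAGE_TOKEN appears.
from cambrian.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX

def splice_image_tokens(prompt, encode):
    """Tokenize prompt with encode(), inserting IMAGE_TOKEN_INDEX per <image>."""
    input_ids = []
    for i, chunk in enumerate(prompt.split(DEFAULT_IMAGE_TOKEN)):
        if i > 0:
            input_ids.append(IMAGE_TOKEN_INDEX)  # sentinel id for image features
        input_ids.extend(encode(chunk))
    return input_ids
```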
