Commit af94d0a

Initial commit

177 files changed: +30684 -0 lines

.DS_Store

6 KB
Binary file not shown.

LICENSE

+674
Large diffs are not rendered by default.

README.md

+25
# Where do Large Vision-Language Models Look at when Answering Questions?

The official repo for "[Where do Large Vision-Language Models Look at when Answering Questions?](https://github.com/bytedance/LVLM_Interpretation)", a PyTorch implementation of a saliency heatmap visualization method that interprets the open-ended responses of LVLMs conditioned on an image.

### Installation

First clone this repository and navigate to the folder.

The environment setup mainly follows [LLaVA](https://github.com/haotian-liu/LLaVA). You can upgrade pip and install the dependencies using:

```
$ pip install --upgrade pip
$ bash install.sh
```

### Model Preparation

For Mini-Gemini models, please follow the instructions in [MGM](https://github.com/dvlab-research/MGM) to download the models and put them in the folders following [Structure](https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure).

### Quick Start

To generate the saliency heatmap of an LVLM as it generates free-form responses, run a command such as the following, with the hyperparameters passed as arguments:

```
$ python3 main.py --method iGOS+ --model llava --dataset <dataset name> --data_path <path/to/questions> --image_folder <path/to/images> --output_dir <path/to/output> --size 32 --L1 1.0 --L2 0.1 --L3 10.0 --ig_iter 10 --gamma 1.0 --iterations 5 --momentum 5
```

An explanation of each argument can be found in [args.py](args.py).

### Acknowledgement

Some parts of the code are built upon [IGOS_pp](https://github.com/khorrams/IGOS_pp). We also use the open-source LVLMs [LLaVA-1.5](https://github.com/haotian-liu/LLaVA), [LLaVA-OneVision](https://github.com/LLaVA-VL/LLaVA-NeXT), [Cambrian](https://github.com/cambrian-mllm/cambrian) and [Mini-Gemini](https://github.com/dvlab-research/MGM) in this project. We thank the authors for their excellent work.

args.py

+175
```
import argparse


def init_args():

    parser = argparse.ArgumentParser(
        description='Generate explanations for open-ended responses of LVLMs.'
    )

    parser.add_argument(
        '--model',
        metavar='M',
        type=str,
        choices=['llava', 'cambrian', 'llava_next', 'mgm'],
        default='llava',
        help='The model to use for making predictions.')

    parser.add_argument(
        '--dataset',
        type=str,
        default='cvbench',
        help='The dataset to use for making predictions.')

    parser.add_argument(
        '--output_dir',
        type=str,
        help='The path to the output directory for saving explanation results.',
        required=True)

    parser.add_argument(
        '--size',
        type=int,
        default=32,
        help='The resolution of the mask to be generated.')

    parser.add_argument(
        '--input_size',
        type=int,
        default=336,
        help='The input size to the network.')

    parser.add_argument(
        '--manual_seed',
        type=int,
        default=0,
        help='The manual seed for experiments.')

    parser.add_argument(
        '--method',
        type=str,
        choices=['iGOS+', 'iGOS++'],
        default='iGOS+'
    )

    parser.add_argument(
        '--opt',
        type=str,
        choices=['LS', 'NAG'],
        default='NAG',
        help='The optimization algorithm.'
    )
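    # Assumption (a gloss not present in the original help text): following
    # the IGOS_pp codebase this builds on, 'LS' likely denotes line search
    # and 'NAG' Nesterov's accelerated gradient.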

    parser.add_argument(
        '--diverse_k',
        type=int,
        default=1)

    parser.add_argument(
        '--init_posi',
        type=int,
        default=0,
        help='The initialization position: which cell of the K x K grid will be used to initialize the mask with nonzero values (use init_val to control the value).')
"""
74+
If K = 2: If K = 3:
75+
------- ----------
76+
|0 |1 | |0 |1 |2 |
77+
------- ----------
78+
|2 |3 | |3 |4 |5 |
79+
------- ----------
80+
|6 |7 |8 |
81+
----------
82+
"""

    parser.add_argument(
        '--init_val',
        type=float,
        default=0.,
        help='The initialization value used to initialize the mask in only one cell of the K x K grid.')

    parser.add_argument(
        '--L1',
        type=float,
        default=0.1,
        help='The weight of the L1 norm'
    )

    parser.add_argument(
        '--L2',
        type=float,
        default=1.0,
        help='The weight of the L2 norm'
    )

    parser.add_argument(
        '--gamma',
        type=float,
        default=0.2,
        help='The exponential decay rate of the graduated non-convexity'
    )

    parser.add_argument(
        '--L3',
        type=float,
        default=20.0,
        help='The weight of the BTV norm'
    )

    parser.add_argument(
        '--momentum',
        type=int,
        default=3
    )

    parser.add_argument(
        '--ig_iter',
        type=int,
        help='The step size of the integrated gradient accumulation.',
        default=5)

    parser.add_argument(
        '--iterations',
        type=int,
        default=5
    )

    parser.add_argument(
        '--lr',
        type=float,
        default=10,
        help='The step size for updating the mask'
    )

    parser.add_argument(
        '--model_base',
        default=None,
        type=str,
        help='The path to the model base file to be used.'
    )

    parser.add_argument(
        "--data_path",
        type=str,
        required=True,
        help="The path to the input question file."
    )

    parser.add_argument(
        "--image_folder",
        type=str,
        required=True,
        help="The path to the image folder."
    )

    # LVLM generation settings
    parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--max_new_tokens", type=int, default=1024)
    parser.add_argument("--num_beams", type=int, default=1)

    # Note: argparse's type=bool treats any non-empty string (including
    # "False") as True, so these flags are defined as store_true switches.
    parser.add_argument("--use_yake", action="store_true", help="Whether to use YAKE to detect keywords")
    parser.add_argument("--choices", action="store_true", help="Whether to ask the LVLMs to generate a single choice instead of open-ended responses")
    parser.add_argument("--ablation_zero", action="store_true", help="Ablation of the baseline image using all-zero images")
    parser.add_argument("--ablation_noise", action="store_true", help="Ablation of the baseline image using random noise")

    return parser.parse_args()
```
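A minimal sketch of how an entry point such as main.py might consume these parsed arguments (main.py itself is not shown in this excerpt, so the usage below is illustrative):

```
# Illustrative only: consuming the arguments parsed by init_args().
from args import init_args

if __name__ == '__main__':
    args = init_args()
    # The Quick Start command in the README populates these fields, e.g.
    # args.method == 'iGOS+', args.model == 'llava', args.size == 32.
    print(args.method, args.model, args.size, args.output_dir)
```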

cambrian/__init__.py

+2
```
from .model.language_model.cambrian_llama import CambrianLlamaForCausalLM, CambrianConfig
from .model.language_model.cambrian_mistral import CambrianMistralForCausalLM, CambrianMistralConfig
```
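These re-exports make the Cambrian model classes importable from the package root, for example:

```
# Illustrative usage enabled by the package-level re-exports above.
from cambrian import CambrianLlamaForCausalLM, CambrianConfig
```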

cambrian/constants.py

+13
```
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
```
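The image-token constants follow the LLaVA-style convention: each `<image>` placeholder in a prompt is replaced by the sentinel id `IMAGE_TOKEN_INDEX`, which the model later resolves to visual features. A minimal sketch of that convention; the helper name and the `encode` callable are assumptions for illustration, not part of this commit:

```
# Illustrative sketch, not from this commit: splice the image-token sentinel
# into the tokenized prompt wherever DEFAULT_IMAGE_TOKEN appears.
from cambrian.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX

def splice_image_tokens(prompt, encode):
    """Tokenize prompt with encode(), inserting IMAGE_TOKEN_INDEX per <image>."""
    input_ids = []
    for i, chunk in enumerate(prompt.split(DEFAULT_IMAGE_TOKEN)):
        if i > 0:
            input_ids.append(IMAGE_TOKEN_INDEX)  # sentinel id for image features
        input_ids.extend(encode(chunk))
    return input_ids
```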
