From 15157451872678373ce96833a9d16923cb0d4d82 Mon Sep 17 00:00:00 2001 From: Yao Lu Date: Fri, 3 May 2024 00:56:14 -0700 Subject: [PATCH] vila1.5 release --- README.md | 152 +- data_prepare/README.md | 121 +- demo_images/av.png | Bin demo_trt_llm/README.md | 121 ++ demo_trt_llm/apply_patch.sh | 14 + demo_trt_llm/av.png | Bin 0 -> 383825 bytes demo_trt_llm/build_visual_engine.py | 300 +++ demo_trt_llm/convert_checkpoint.py | 468 +++++ demo_trt_llm/quantization/quantize.py | 64 + demo_trt_llm/quantization/quantize_by_ammo.py | 433 +++++ demo_trt_llm/run.py | 513 ++++++ environment_setup.sh | 16 +- inference_test/inference_test.py | 67 +- inference_test/test_data/car_blocker.png | Bin 0 -> 383825 bytes inference_test/test_data/car_safety.jpg | Bin 0 -> 164214 bytes inference_test/test_data/count_dog_2.png | Bin 0 -> 279426 bytes inference_test/test_data/count_giraff_4.png | Bin 0 -> 992519 bytes inference_test/test_data/count_panda_3.png | Bin 0 -> 406857 bytes inference_test/test_data/factory.jpg | Bin 0 -> 108623 bytes inference_test/test_data/factory_count_1.jpg | Bin 0 -> 102684 bytes inference_test/test_data/factory_count_2.jpg | Bin 0 -> 122020 bytes inference_test/test_data/factory_count_3.jpg | Bin 0 -> 138574 bytes inference_test/test_data/factory_count_4.jpg | Bin 0 -> 79769 bytes inference_test/test_data/factory_count_5.jpg | Bin 0 -> 73313 bytes inference_test/test_data/factory_count_6.jpg | Bin 0 -> 63018 bytes inference_test/test_data/factory_count_7.jpg | Bin 0 -> 58049 bytes inference_test/test_data/factory_count_8.jpg | Bin 0 -> 53240 bytes inference_test/test_data/flamingo_1.png | Bin 542011 -> 699830 bytes inference_test/test_data/fuyu-capm.png | Bin 0 -> 639903 bytes inference_test/test_data/fuyu-heavy-table.png | Bin 0 -> 205200 bytes inference_test/test_data/menu_1.png | Bin 0 -> 1973710 bytes inference_test/test_data/menu_2.png | Bin 0 -> 1731194 bytes inference_test/test_data/orange_price.png | Bin 0 -> 219437 bytes inference_test/test_data/parking_sign.png | Bin 0 -> 1106945 bytes inference_test/test_data/princess_1.png | Bin 216390 -> 281183 bytes inference_test/test_data/princess_2.png | Bin 227817 -> 290297 bytes inference_test/test_data/tow_car.png | Bin 0 -> 271157 bytes inference_test/test_data/visual_inference.png | Bin 0 -> 2250344 bytes llava/__init__.py | 2 +- llava/conversation.py | 39 + llava/data/__init__.py | 3 +- llava/data/dataset.py | 1115 +++++++---- llava/data/dataset_impl/coyo_recap.py | 76 + llava/data/dataset_impl/hiertext.py | 191 ++ llava/data/dataset_impl/sam.py | 185 ++ llava/data/dataset_impl/textocr.py | 280 +++ llava/data/dataset_tar.py | 246 +++ llava/data/datasets_mixture.py | 235 ++- llava/data/simple_video_dataset.py | 269 +++ llava/data/simple_vila_webdataset.py | 322 ++++ llava/data/utils.py | 22 + llava/eval/eval_gpt_review.py | 0 llava/eval/eval_gpt_review_bench.py | 0 llava/eval/eval_gpt_review_visual.py | 0 llava/eval/eval_mmmu.py | 0 llava/eval/eval_mmvet.py | 0 llava/eval/eval_pope.py | 0 llava/eval/eval_science_qa.py | 3 + llava/eval/eval_science_qa_gpt4.py | 0 llava/eval/eval_science_qa_gpt4_requery.py | 0 llava/eval/eval_textvqa.py | 0 llava/eval/evaluate_vqa.py | 147 ++ llava/eval/evaluate_vqa_score.py | 143 ++ .../eval/generate_webpage_data_from_table.py | 0 llava/eval/m4c_evaluator.py | 0 llava/eval/mathvista_utils/calculate_score.py | 204 +++ llava/eval/mathvista_utils/extract_answer.py | 81 + llava/eval/mathvista_utils/prompts/ext_ans.py | 42 + llava/eval/mathvista_utils/utilities.py | 213 +++ 
llava/eval/mmmu_utils/configs/llava1.5.yaml | 0 llava/eval/mmmu_utils/data_utils.py | 199 +- llava/eval/mmmu_utils/eval_utils.py | 0 llava/eval/mmmu_utils/merge_jsons.py | 22 + llava/eval/mmmu_utils/model_utils.py | 65 +- llava/eval/model_qa.py | 1 + llava/eval/model_vqa.py | 8 +- llava/eval/model_vqa_loader.py | 8 +- llava/eval/model_vqa_mmbench.py | 8 +- llava/eval/model_vqa_mmmu.py | 58 +- llava/eval/model_vqa_mmmu_sharded.py | 162 ++ llava/eval/model_vqa_nextqa.py | 199 ++ llava/eval/model_vqa_qbench.py | 8 +- llava/eval/model_vqa_science.py | 14 +- llava/eval/model_vqa_video.py | 83 +- llava/eval/model_vqa_videodemo.py | 187 ++ llava/eval/model_vqa_videoperception.py | 174 ++ llava/eval/qa_baseline_gpt35.py | 0 llava/eval/{run_llava.py => run_vila.py} | 102 +- llava/eval/summarize_gpt_review.py | 0 .../eval/table/answer/answer_alpaca-13b.jsonl | 80 - llava/eval/table/answer/answer_bard.jsonl | 80 - llava/eval/table/answer/answer_gpt35.jsonl | 80 - .../eval/table/answer/answer_llama-13b.jsonl | 80 - .../eval/table/answer/answer_vicuna-13b.jsonl | 80 - .../table/caps_boxes_coco2014_val_80.jsonl | 80 - llava/eval/table/model.jsonl | 5 - llava/eval/table/prompt.jsonl | 4 - llava/eval/table/question.jsonl | 80 - .../review/review_alpaca-13b_vicuna-13b.jsonl | 80 - .../table/review/review_bard_vicuna-13b.jsonl | 80 - .../review/review_gpt35_vicuna-13b.jsonl | 80 - .../review/review_llama-13b_vicuna-13b.jsonl | 80 - llava/eval/table/reviewer.jsonl | 4 - .../video/eval_benchmark_1_correctness.py | 0 .../eval_benchmark_2_detailed_orientation.py | 0 llava/eval/video/eval_benchmark_3_context.py | 0 llava/eval/video/eval_benchmark_4_temporal.py | 0 .../video/eval_benchmark_5_consistency.py | 0 llava/eval/video/eval_video_nextqa.py | 163 ++ llava/eval/video/eval_video_perception.py | 37 + llava/eval/video/eval_video_qa.py | 11 +- llava/mm_utils.py | 228 ++- llava/model/__init__.py | 4 +- llava/model/apply_delta.py | 20 +- llava/model/builder.py | 241 ++- llava/model/configuration_llava.py | 51 + llava/model/language_model/builder.py | 111 ++ .../{llava_mixtral.py => llava_gemma.py} | 81 +- llava/model/language_model/llava_llama.py | 148 +- llava/model/language_model/llava_mistral.py | 7 +- llava/model/language_model/llava_mpt.py | 2 +- llava/model/llava_arch.py | 542 ++++-- llava/model/make_delta.py | 23 +- llava/model/multimodal_encoder/builder.py | 54 +- .../model/multimodal_encoder/clip_encoder.py | 39 +- .../multimodal_encoder/image_processor.py | 546 ++++++ .../intern/configuration_intern_vit.py | 119 ++ .../intern/flash_attention.py | 76 + .../intern/modeling_intern_vit.py | 547 ++++++ .../multimodal_encoder/intern_encoder.py | 53 + .../model/multimodal_encoder/radio_encoder.py | 317 ++++ .../multimodal_encoder/siglip/__init__.py | 94 + .../siglip/configuration_siglip.py | 302 +++ .../siglip/convert_siglip_to_hf.py | 413 +++++ .../siglip/image_processing_siglip.py | 302 +++ .../siglip/modeling_siglip.py | 1186 ++++++++++++ .../siglip/processing_siglip.py | 143 ++ .../siglip/tokenization_siglip.py | 389 ++++ .../multimodal_encoder/siglip_encoder.py | 39 + .../multimodal_encoder/vision_encoder.py | 146 +- .../multimodal_encoder/visualize_features.py | 351 ++++ .../multimodal_projector/base_projector.py | 100 + llava/model/multimodal_projector/builder.py | 67 +- llava/model/utils.py | 60 +- llava/train/args.py | 62 +- llava/train/callbacks/autoresume_callback.py | 61 + llava/train/llava_trainer.py | 184 +- llava/train/short_video_filter.py | 25 + llava/train/slurm_utils.py | 78 + 
llava/train/train.py | 437 +++-- .../models/gemma/__init__.py | 121 ++ .../models/gemma/configuration_gemma.py | 147 ++ .../models/gemma/modeling_gemma.py | 1322 ++++++++++++++ .../{ => models}/llama/configuring_llama.py | 0 .../{ => models}/llama/modeling_llama.py | 1 - .../models/llama/tokenization_llama.py | 482 +++++ .../models/mistral/__init__.py | 62 + .../models/mistral/configuration_mistral.py | 152 ++ .../models/mistral/modeling_mistral.py | 1364 ++++++++++++++ .../models/mixtral/__init__.py | 62 + .../models/mixtral/configuration_mixtral.py | 169 ++ .../models/mixtral/modeling_mixtral.py | 1627 +++++++++++++++++ llava/train/utils.py | 115 ++ llava/unit_test_utils.py | 102 ++ llava/wids/wids.py | 18 +- pyproject.toml | 11 +- scripts/convert_gqa_for_eval.py | 16 + scripts/convert_mmbench_for_submission.py | 18 +- scripts/convert_mmvet_for_eval.py | 16 +- scripts/convert_seed_for_submission.py | 16 +- scripts/convert_sqa_to_llava.py | 16 +- scripts/convert_sqa_to_llava_base_prompt.py | 16 +- scripts/convert_vizwiz_for_submission.py | 16 +- scripts/convert_vqav2_for_submission.py | 21 +- scripts/extract_mm_projector.py | 49 +- scripts/merge_lora_weights.py | 24 - scripts/v1_5/eval/ai2d.sh | 16 + scripts/v1_5/eval/ai2d_sharded.sh | 41 + scripts/v1_5/eval/chartqa.sh | 25 + scripts/v1_5/eval/chartqa_sharded.sh | 73 + scripts/v1_5/eval/copy_predictions.py | 51 + scripts/v1_5/eval/docvqa.sh | 26 + scripts/v1_5/eval/eval_all.sh | 26 +- scripts/v1_5/eval/gqa.sh | 20 +- scripts/v1_5/eval/gqa_sharded.sh | 49 + scripts/v1_5/eval/llavabench.sh | 18 +- scripts/v1_5/eval/llavabench_sharded.sh | 55 + scripts/v1_5/eval/mmbench.sh | 14 +- scripts/v1_5/eval/mmbench_cn.sh | 16 +- scripts/v1_5/eval/mmbench_cn_sharded.sh | 50 + scripts/v1_5/eval/mmbench_sharded.sh | 53 + scripts/v1_5/eval/mme.sh | 19 +- scripts/v1_5/eval/mme_sharded.sh | 47 + scripts/v1_5/eval/mmmu.sh | 13 +- scripts/v1_5/eval/mmmu_sharded.sh | 45 + scripts/v1_5/eval/mmvet.sh | 18 +- scripts/v1_5/eval/mmvet_sharded.sh | 49 + scripts/v1_5/eval/pope.sh | 14 +- scripts/v1_5/eval/pope_sharded.sh | 45 + scripts/v1_5/eval/qbench.sh | 2 +- scripts/v1_5/eval/qbench_zh.sh | 0 scripts/v1_5/eval/seed.sh | 14 +- scripts/v1_5/eval/seed_sharded.sh | 48 + scripts/v1_5/eval/sqa.sh | 19 +- scripts/v1_5/eval/sqa_sharded.sh | 88 + scripts/v1_5/eval/textvqa.sh | 14 +- scripts/v1_5/eval/textvqa_sharded.sh | 46 + scripts/v1_5/eval/video_chatgpt/eval_all.sh | 22 + .../eval_benchmark_1_correctness.sh | 17 + .../video_chatgpt/eval_benchmark_2_detail.sh | 18 + .../eval_benchmark_3_contextual.sh | 18 + .../eval_benchmark_4_temporal.sh | 18 + .../eval_benchmark_5_consistency.sh | 18 + .../eval/video_chatgpt/eval_qa_activitynet.sh | 13 + .../v1_5/eval/video_chatgpt/eval_qa_msrvtt.sh | 14 + .../v1_5/eval/video_chatgpt/eval_qa_msvd.sh | 13 + .../v1_5/eval/video_chatgpt/eval_qa_nextqa.sh | 15 + .../eval/video_chatgpt/eval_qa_perception.sh | 11 + .../v1_5/eval/video_chatgpt/eval_qa_tgif.sh | 14 + scripts/v1_5/eval/video_chatgpt/run_all.sh | 18 + .../run_benchmark_1_correctness.sh | 18 + .../video_chatgpt/run_benchmark_2_detail.sh | 18 + .../run_benchmark_3_contextual.sh | 18 + .../video_chatgpt/run_benchmark_4_temporal.sh | 17 + .../run_benchmark_5_consistency.sh | 18 + .../eval/video_chatgpt/run_qa_activitynet.sh | 49 + .../run_qa_activitynet_sharded.sh | 50 + .../v1_5/eval/video_chatgpt/run_qa_msrvtt.sh | 48 + .../video_chatgpt/run_qa_msrvtt_sharded.sh | 51 + .../v1_5/eval/video_chatgpt/run_qa_msvd.sh | 49 + .../eval/video_chatgpt/run_qa_msvd_sharded.sh | 52 + 
.../v1_5/eval/video_chatgpt/run_qa_nextqa.sh | 48 + .../video_chatgpt/run_qa_nextqa_sharded.sh | 51 + .../eval/video_chatgpt/run_qa_perception.sh | 48 + .../run_qa_perception_sharded.sh | 51 + .../v1_5/eval/video_chatgpt/run_qa_tgif.sh | 47 + .../eval/video_chatgpt/run_qa_tgif_sharded.sh | 50 + scripts/v1_5/eval/vizwiz.sh | 14 +- scripts/v1_5/eval/vizwiz_multigpu.sh | 42 + scripts/v1_5/eval/vizwiz_sharded.sh | 44 + scripts/v1_5/eval/vqav2.sh | 16 +- scripts/v1_5/eval/vqav2_sharded.sh | 48 + .../v1_5/{paper => release/13b}/1_mm_align.sh | 18 +- .../13b/2_pretrain.sh} | 25 +- scripts/v1_5/{paper => release/13b}/3_sft.sh | 25 +- scripts/v1_5/release/3b-s2/1_mm_align.sh | 62 + scripts/v1_5/release/3b-s2/2_pretrain.sh | 71 + scripts/v1_5/release/3b-s2/3_sft.sh | 61 + scripts/v1_5/release/3b/1_mm_align.sh | 59 + scripts/v1_5/release/3b/2_pretrain.sh | 68 + scripts/v1_5/release/3b/3_sft.sh | 58 + scripts/v1_5/release/40b/1_mm_align.sh | 58 + scripts/v1_5/release/40b/2_pretrain.sh | 67 + scripts/v1_5/release/40b/3_sft.sh | 57 + scripts/v1_5/release/8b/1_mm_align.sh | 59 + scripts/v1_5/release/8b/2_pretrain.sh | 68 + scripts/v1_5/release/8b/3_sft.sh | 58 + 257 files changed, 22378 insertions(+), 2606 deletions(-) mode change 100755 => 100644 demo_images/av.png create mode 100644 demo_trt_llm/README.md create mode 100644 demo_trt_llm/apply_patch.sh create mode 100755 demo_trt_llm/av.png create mode 100644 demo_trt_llm/build_visual_engine.py create mode 100644 demo_trt_llm/convert_checkpoint.py create mode 100644 demo_trt_llm/quantization/quantize.py create mode 100644 demo_trt_llm/quantization/quantize_by_ammo.py create mode 100644 demo_trt_llm/run.py create mode 100755 inference_test/test_data/car_blocker.png create mode 100644 inference_test/test_data/car_safety.jpg create mode 100644 inference_test/test_data/count_dog_2.png create mode 100644 inference_test/test_data/count_giraff_4.png create mode 100644 inference_test/test_data/count_panda_3.png create mode 100644 inference_test/test_data/factory.jpg create mode 100644 inference_test/test_data/factory_count_1.jpg create mode 100644 inference_test/test_data/factory_count_2.jpg create mode 100644 inference_test/test_data/factory_count_3.jpg create mode 100644 inference_test/test_data/factory_count_4.jpg create mode 100644 inference_test/test_data/factory_count_5.jpg create mode 100644 inference_test/test_data/factory_count_6.jpg create mode 100644 inference_test/test_data/factory_count_7.jpg create mode 100644 inference_test/test_data/factory_count_8.jpg create mode 100644 inference_test/test_data/fuyu-capm.png create mode 100644 inference_test/test_data/fuyu-heavy-table.png create mode 100644 inference_test/test_data/menu_1.png create mode 100644 inference_test/test_data/menu_2.png create mode 100644 inference_test/test_data/orange_price.png create mode 100644 inference_test/test_data/parking_sign.png create mode 100644 inference_test/test_data/tow_car.png create mode 100644 inference_test/test_data/visual_inference.png create mode 100644 llava/data/dataset_impl/coyo_recap.py create mode 100644 llava/data/dataset_impl/hiertext.py create mode 100644 llava/data/dataset_impl/sam.py create mode 100644 llava/data/dataset_impl/textocr.py create mode 100644 llava/data/dataset_tar.py create mode 100644 llava/data/simple_video_dataset.py create mode 100644 llava/data/simple_vila_webdataset.py create mode 100644 llava/data/utils.py mode change 100644 => 100755 llava/eval/eval_gpt_review.py mode change 100644 => 100755 llava/eval/eval_gpt_review_bench.py 
mode change 100644 => 100755 llava/eval/eval_gpt_review_visual.py mode change 100644 => 100755 llava/eval/eval_mmmu.py mode change 100644 => 100755 llava/eval/eval_mmvet.py mode change 100644 => 100755 llava/eval/eval_pope.py mode change 100644 => 100755 llava/eval/eval_science_qa.py mode change 100644 => 100755 llava/eval/eval_science_qa_gpt4.py mode change 100644 => 100755 llava/eval/eval_science_qa_gpt4_requery.py mode change 100644 => 100755 llava/eval/eval_textvqa.py create mode 100755 llava/eval/evaluate_vqa.py create mode 100755 llava/eval/evaluate_vqa_score.py mode change 100644 => 100755 llava/eval/generate_webpage_data_from_table.py mode change 100644 => 100755 llava/eval/m4c_evaluator.py create mode 100755 llava/eval/mathvista_utils/calculate_score.py create mode 100755 llava/eval/mathvista_utils/extract_answer.py create mode 100755 llava/eval/mathvista_utils/prompts/ext_ans.py create mode 100755 llava/eval/mathvista_utils/utilities.py mode change 100644 => 100755 llava/eval/mmmu_utils/configs/llava1.5.yaml mode change 100644 => 100755 llava/eval/mmmu_utils/data_utils.py mode change 100644 => 100755 llava/eval/mmmu_utils/eval_utils.py create mode 100644 llava/eval/mmmu_utils/merge_jsons.py mode change 100644 => 100755 llava/eval/mmmu_utils/model_utils.py mode change 100644 => 100755 llava/eval/model_qa.py mode change 100644 => 100755 llava/eval/model_vqa.py mode change 100644 => 100755 llava/eval/model_vqa_loader.py mode change 100644 => 100755 llava/eval/model_vqa_mmbench.py mode change 100644 => 100755 llava/eval/model_vqa_mmmu.py create mode 100644 llava/eval/model_vqa_mmmu_sharded.py create mode 100755 llava/eval/model_vqa_nextqa.py mode change 100644 => 100755 llava/eval/model_vqa_qbench.py mode change 100644 => 100755 llava/eval/model_vqa_science.py mode change 100644 => 100755 llava/eval/model_vqa_video.py create mode 100755 llava/eval/model_vqa_videodemo.py create mode 100755 llava/eval/model_vqa_videoperception.py mode change 100644 => 100755 llava/eval/qa_baseline_gpt35.py rename llava/eval/{run_llava.py => run_vila.py} (61%) mode change 100644 => 100755 mode change 100644 => 100755 llava/eval/summarize_gpt_review.py delete mode 100644 llava/eval/table/answer/answer_alpaca-13b.jsonl delete mode 100644 llava/eval/table/answer/answer_bard.jsonl delete mode 100644 llava/eval/table/answer/answer_gpt35.jsonl delete mode 100644 llava/eval/table/answer/answer_llama-13b.jsonl delete mode 100644 llava/eval/table/answer/answer_vicuna-13b.jsonl delete mode 100644 llava/eval/table/caps_boxes_coco2014_val_80.jsonl delete mode 100644 llava/eval/table/model.jsonl delete mode 100644 llava/eval/table/prompt.jsonl delete mode 100644 llava/eval/table/question.jsonl delete mode 100644 llava/eval/table/review/review_alpaca-13b_vicuna-13b.jsonl delete mode 100644 llava/eval/table/review/review_bard_vicuna-13b.jsonl delete mode 100644 llava/eval/table/review/review_gpt35_vicuna-13b.jsonl delete mode 100644 llava/eval/table/review/review_llama-13b_vicuna-13b.jsonl delete mode 100644 llava/eval/table/reviewer.jsonl mode change 100644 => 100755 llava/eval/video/eval_benchmark_1_correctness.py mode change 100644 => 100755 llava/eval/video/eval_benchmark_2_detailed_orientation.py mode change 100644 => 100755 llava/eval/video/eval_benchmark_3_context.py mode change 100644 => 100755 llava/eval/video/eval_benchmark_4_temporal.py mode change 100644 => 100755 llava/eval/video/eval_benchmark_5_consistency.py create mode 100644 llava/eval/video/eval_video_nextqa.py create mode 100644 
llava/eval/video/eval_video_perception.py mode change 100644 => 100755 llava/eval/video/eval_video_qa.py create mode 100644 llava/model/configuration_llava.py create mode 100644 llava/model/language_model/builder.py rename llava/model/language_model/{llava_mixtral.py => llava_gemma.py} (56%) create mode 100644 llava/model/multimodal_encoder/image_processor.py create mode 100644 llava/model/multimodal_encoder/intern/configuration_intern_vit.py create mode 100644 llava/model/multimodal_encoder/intern/flash_attention.py create mode 100644 llava/model/multimodal_encoder/intern/modeling_intern_vit.py create mode 100644 llava/model/multimodal_encoder/intern_encoder.py create mode 100644 llava/model/multimodal_encoder/radio_encoder.py create mode 100644 llava/model/multimodal_encoder/siglip/__init__.py create mode 100644 llava/model/multimodal_encoder/siglip/configuration_siglip.py create mode 100644 llava/model/multimodal_encoder/siglip/convert_siglip_to_hf.py create mode 100644 llava/model/multimodal_encoder/siglip/image_processing_siglip.py create mode 100644 llava/model/multimodal_encoder/siglip/modeling_siglip.py create mode 100644 llava/model/multimodal_encoder/siglip/processing_siglip.py create mode 100644 llava/model/multimodal_encoder/siglip/tokenization_siglip.py create mode 100644 llava/model/multimodal_encoder/siglip_encoder.py create mode 100644 llava/model/multimodal_encoder/visualize_features.py create mode 100644 llava/model/multimodal_projector/base_projector.py create mode 100644 llava/train/callbacks/autoresume_callback.py create mode 100644 llava/train/short_video_filter.py create mode 100644 llava/train/slurm_utils.py create mode 100644 llava/train/transformers_replace/models/gemma/__init__.py create mode 100644 llava/train/transformers_replace/models/gemma/configuration_gemma.py create mode 100644 llava/train/transformers_replace/models/gemma/modeling_gemma.py rename llava/train/transformers_replace/{ => models}/llama/configuring_llama.py (100%) rename llava/train/transformers_replace/{ => models}/llama/modeling_llama.py (99%) create mode 100644 llava/train/transformers_replace/models/llama/tokenization_llama.py create mode 100644 llava/train/transformers_replace/models/mistral/__init__.py create mode 100644 llava/train/transformers_replace/models/mistral/configuration_mistral.py create mode 100644 llava/train/transformers_replace/models/mistral/modeling_mistral.py create mode 100644 llava/train/transformers_replace/models/mixtral/__init__.py create mode 100644 llava/train/transformers_replace/models/mixtral/configuration_mixtral.py create mode 100644 llava/train/transformers_replace/models/mixtral/modeling_mixtral.py create mode 100644 llava/train/utils.py create mode 100644 llava/unit_test_utils.py mode change 100644 => 100755 pyproject.toml mode change 100644 => 100755 scripts/convert_gqa_for_eval.py mode change 100644 => 100755 scripts/convert_mmbench_for_submission.py mode change 100644 => 100755 scripts/convert_mmvet_for_eval.py mode change 100644 => 100755 scripts/convert_seed_for_submission.py mode change 100644 => 100755 scripts/convert_sqa_to_llava.py mode change 100644 => 100755 scripts/convert_sqa_to_llava_base_prompt.py mode change 100644 => 100755 scripts/convert_vizwiz_for_submission.py mode change 100644 => 100755 scripts/convert_vqav2_for_submission.py mode change 100644 => 100755 scripts/extract_mm_projector.py delete mode 100644 scripts/merge_lora_weights.py create mode 100755 scripts/v1_5/eval/ai2d.sh create mode 100755 scripts/v1_5/eval/ai2d_sharded.sh 
create mode 100755 scripts/v1_5/eval/chartqa.sh create mode 100755 scripts/v1_5/eval/chartqa_sharded.sh create mode 100755 scripts/v1_5/eval/copy_predictions.py create mode 100755 scripts/v1_5/eval/docvqa.sh create mode 100755 scripts/v1_5/eval/gqa_sharded.sh create mode 100755 scripts/v1_5/eval/llavabench_sharded.sh create mode 100755 scripts/v1_5/eval/mmbench_cn_sharded.sh create mode 100755 scripts/v1_5/eval/mmbench_sharded.sh create mode 100755 scripts/v1_5/eval/mme_sharded.sh create mode 100755 scripts/v1_5/eval/mmmu_sharded.sh create mode 100755 scripts/v1_5/eval/mmvet_sharded.sh create mode 100755 scripts/v1_5/eval/pope_sharded.sh mode change 100644 => 100755 scripts/v1_5/eval/qbench.sh mode change 100644 => 100755 scripts/v1_5/eval/qbench_zh.sh create mode 100755 scripts/v1_5/eval/seed_sharded.sh create mode 100755 scripts/v1_5/eval/sqa_sharded.sh create mode 100755 scripts/v1_5/eval/textvqa_sharded.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_all.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_benchmark_1_correctness.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_benchmark_2_detail.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_benchmark_3_contextual.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_benchmark_4_temporal.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_benchmark_5_consistency.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_qa_activitynet.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_qa_msrvtt.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_qa_msvd.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_qa_nextqa.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_qa_perception.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/eval_qa_tgif.sh create mode 100644 scripts/v1_5/eval/video_chatgpt/run_all.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_benchmark_1_correctness.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_benchmark_2_detail.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_benchmark_3_contextual.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_benchmark_4_temporal.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_benchmark_5_consistency.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_activitynet.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_activitynet_sharded.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_msrvtt.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_msrvtt_sharded.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_msvd.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_msvd_sharded.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_nextqa.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_nextqa_sharded.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_perception.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_perception_sharded.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_tgif.sh create mode 100755 scripts/v1_5/eval/video_chatgpt/run_qa_tgif_sharded.sh create mode 100755 scripts/v1_5/eval/vizwiz_multigpu.sh create mode 100755 scripts/v1_5/eval/vizwiz_sharded.sh create mode 100755 scripts/v1_5/eval/vqav2_sharded.sh rename scripts/v1_5/{paper => release/13b}/1_mm_align.sh (78%) mode change 100755 => 100644 rename scripts/v1_5/{paper/2_pretrain_mmc4_coyo.sh => release/13b/2_pretrain.sh} (79%) mode change 100755 => 100644 rename scripts/v1_5/{paper => 
release/13b}/3_sft.sh (65%) mode change 100755 => 100644 create mode 100644 scripts/v1_5/release/3b-s2/1_mm_align.sh create mode 100644 scripts/v1_5/release/3b-s2/2_pretrain.sh create mode 100644 scripts/v1_5/release/3b-s2/3_sft.sh create mode 100644 scripts/v1_5/release/3b/1_mm_align.sh create mode 100644 scripts/v1_5/release/3b/2_pretrain.sh create mode 100644 scripts/v1_5/release/3b/3_sft.sh create mode 100644 scripts/v1_5/release/40b/1_mm_align.sh create mode 100644 scripts/v1_5/release/40b/2_pretrain.sh create mode 100644 scripts/v1_5/release/40b/3_sft.sh create mode 100644 scripts/v1_5/release/8b/1_mm_align.sh create mode 100644 scripts/v1_5/release/8b/2_pretrain.sh create mode 100644 scripts/v1_5/release/8b/3_sft.sh
diff --git a/README.md b/README.md
index 077b9852..a55efa8b 100644
--- a/README.md
+++ b/README.md
@@ -12,39 +12,76 @@ [VILA arxiv](https://arxiv.org/abs/2312.07533) / [VILA Demo](https://vila-demo.hanlab.ai/) / [VILA Huggingface](https://huggingface.co/collections/Efficient-Large-Model/vila-on-pre-training-for-visual-language-models-65d8022a3a52cd9bcd62698e)
## 💡 Introduction
-VILA is a visual language model (VLM) pretrained with interleaved image-text data at scale, enabling multi-image VLM. VILA is deployable on the edge, including Jetson Orin and laptop by [AWQ](https://arxiv.org/pdf/2306.00978.pdf) 4bit quantization through [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) framework. We find: (1) image-text pairs are not enough, interleaved image-text is essential; (2) unfreezing LLM during interleaved image-text pre-training enables in-context learning; (3)re-blending text-only instruction data is crucial to boost both VLM and text-only performance. VILA unveils appealing capabilities, including: multi-image reasoning, in-context learning, visual chain-of-thought, and better world knowledge.
+VILA is a visual language model (VLM) pretrained with interleaved image-text data at scale, enabling **multi-image** VLM and **video understanding** capabilities. VILA is deployable on the edge, including on Jetson Orin and laptops, via [AWQ](https://arxiv.org/pdf/2306.00978.pdf) 4bit quantization with the [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) framework. We find that: (1) image-text pairs are not enough; interleaved image-text data is essential; (2) unfreezing the LLM during interleaved image-text pre-training enables in-context learning; (3) re-blending text-only instruction data is crucial to boost both VLM and text-only performance. VILA unveils appealing capabilities, including multi-image reasoning, in-context learning, visual chain-of-thought, and better world knowledge.
## 💡 News
-- [2024/03] [VILA-2.7B](https://huggingface.co/Efficient-Large-Model/VILA-2.7b) is released! It can run on NVIDIA Jetson Orin Nano ([Tutorial](https://www.jetson-ai-lab.com/tutorial_nano-vlm.html)) and appeared at GTC 2024!
-- [2024/03] VILA is accepted by CVPR 2024!
+- [2024/05] We release [AWQ](https://arxiv.org/pdf/2306.00978.pdf)-quantized 4bit VILA-1.5 models supported by the [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) and [TensorRT-LLM](demo_trt_llm) backends.
+- [2024/05] We release VILA-1.5, which comes with four model sizes (3B/8B/13B/40B) and offers native support for multi-image and video understanding.
- [2024/02] We release [AWQ](https://arxiv.org/pdf/2306.00978.pdf)-quantized 4bit VILA models, deployable on Jetson Orin and laptops through [TinyChat](https://github.com/mit-han-lab/llm-awq/tree/main/tinychat) and [TinyChatEngine](https://github.com/mit-han-lab/TinyChatEngine).
- [2024/02] VILA is released. We propose interleaved image-text pretraining that enables multi-image VLM. VILA comes with impressive in-context learning capabilities. We open source everything: including training code, evaluation code, datasets, model ckpts.
- [2023/12] [Paper](https://arxiv.org/abs/2312.07533) is on Arxiv!
## Performance
-| $~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~$ | Prec. | VQAv2 | GQA | VizWiz | SQA-I | VQA-T | POPE | MME | MMB | MMB-CN | SEED | llava-bench | MM-Vet | Average (w/o MME) |
-| ----------------- | ---------------- | ---------------- | ---------- | ----------- | ----------- | ----- | ----- | ------- | ---- | ------ | ---- | ----------- | ------ | ----------------- |
-| VILA-7B | fp16 | 80.3 | 63.1 | 59.6 | 68.0 | 62.6 | 86.3 | 1489.4 | 69.8 | 61.0 | 61.7 | 75.2 | 35.1 | 65.7 |
-| VILA-7B-AWQ | int4 | 80.1 | 63.0 | 57.8 | 68.0 | 61.9 | 85.3 | 1486.3 | 68.8 | 59.0 | 61.3 | 75.8 | 35.9 | 65.2 |
-| VILA-13B | fp16| 80.5 | 63.6 | 63.1 | 70.5 | 64.0 | 86.3 | 1553.6 | 73.8 | 66.7 | 62.8 | 78.3 | 42.6 | 68.4 |
-| VILA-13B-AWQ | int4 | 80.4 | 63.6 | 63.0 | 71.2 | 63.5 | 87.0 | 1552.9 | 73.6 | 66.3 | 62.2 | 77.6 | 42.0 | 68.2 |
+### Image QA Benchmarks
+
+| $~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~$ | Prec. | VQAv2 | GQA | VizWiz | SQA-I | VQA-T | POPE | MME | MMB | MMB-CN | SEED | SEED-I | MMMU | llava-bench | MM-Vet | Average (w/o MME) |
+| -------------------------------- | ----- | ----- | ---- | ------ | ----- | ----- | ---- | ------- | ---- | ------ | ---- | ------ | ---- | ----------- | ------ | ----------------- |
+| VILA1.5-3B | fp16 | 80.4 | 61.5 | 53.5 | 69.0 | 60.4 | 85.9 | 1442.44 | 63.4 | 52.7 | 60.9 | 67.9 | 33.3 | 75.9 | 35.4 | 61.6 |
+| VILA1.5-3B-AWQ | int4 | 80.0 | 61.1 | 53.8 | 67.8 | 60.4 | 85.9 | 1437.34 | 63.3 | 51.4 | 59.8 | 66.6 | 32.7 | 75.0 | 37.3 | 61.2 |
+| VILA1.5-3B-S2 | fp16 | 79.8 | 61.4 | 61.3 | 69.6 | 63.4 | 85.3 | 1431.65 | 62.8 | 52.2 | 60.0 | 66.4 | 32.8 | 76.7 | 38.6 | 62.3 |
+| VILA1.5-3B-S2-AWQ | int4 | 79.4 | 61.3 | 62.3 | 69.2 | 63.0 | 85.8 | 1417.06 | 61.6 | 51.5 | 59.1 | 65.7 | 33.4 | 77.1 | 36.7 | 62.0 |
+| Llama-3-VILA1.5-8B | fp16 | 80.9 | 61.9 | 58.7 | 79.9 | 66.3 | 84.4 | 1577.01 | 72.3 | 66.2 | 64.2 | 71.4 | 36.9 | 80.0 | 38.3 | 66.3 |
+| Llama-3-VILA1.5-8B-AWQ | int4 | 80.3 | 61.7 | 59.3 | 79.0 | 65.4 | 82.9 | 1593.65 | 71.0 | 64.9 | 64.0 | 71.1 | 36.0 | 79.0 | 37.2 | 65.5 |
+| VILA1.5-13B | fp16 | 82.8 | 64.3 | 62.6 | 80.1 | 65.0 | 86.3 | 1569.55 | 74.9 | 66.3 | 65.1 | 72.6 | 37.9 | 80.8 | 44.3 | 67.9 |
+| VILA1.5-13B-AWQ | int4 | 82.7 | 64.5 | 63.3 | 79.7 | 64.7 | 86.7 | 1531.35 | 74.7 | 66.7 | 65.1 | 72.6 | 37.8 | 81.9 | 46.4 | 68.2 |
+| VILA1.5-40B | fp16 | 84.3 | 64.6 | 62.2 | 87.2 | 73.6 | 87.3 | 1726.82 | 82.4 | 80.2 | 69.1 | 75.8 | 51.9 | 81.3 | 53.0 | 73.3 |
+| VILA1.5-40B-AWQ | int4 | 84.1 | 64.4 | 61.3 | 86.7 | 73.2 | 88.2 | 1714.79 | 83.2 | 79.6 | 68.9 | 75.6 | 49.3 | 83.0 | 51.4 | 73.0 |
+
+
+NOTE: VQAv2 and VizWiz results are on test-dev; for MMMU we report the validation set accuracy.
+
+### Video QA Benchmarks
+
+| $~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~$ | Prec. | Perception Test | ActivityNet | MSVD | MSRVTT | TGIF |
+| -------------------------------- | ----- | --------------- | ----------- | ---- | ------ | ---- |
+| VILA1.5-3B | fp16 | 39.3 | 50.2 | 76.6 | 57.5 | 51.7 |
+| VILA1.5-3B-S2 | fp16 | 39 | 50.7 | 76.9 | 57.6 | 51.7 |
+| Llama-3-VILA1.5-8B | fp16 | 41.8 | 54.3 | 78.3 | 60.1 | 54.1 |
+| VILA1.5-13B | fp16 | 39.3 | 54.7 | 77.9 | 60.2 | 56 |
+| VILA1.5-40B | fp16 | 41.7 | 58 | 80.1 | 63 | 58.2 |
+
-NOTE: The benchmark results are slightly different from what we report in the paper due to refactoring of the codebase based on LLava-1.5 and re-train the model. VQAV2 and VizWiz are test-dev.
### Inference speed ( Token/sec )
-| $~~~~~~$ | Precision | A100 | 4090 | Orin |
-| --- | --- |--- | --- | --- |
-| VILA-7B | fp16 | 81.6 | 58.5 | 11.5 |
-| VILA-7B-AWQ| int4 |155.3| 168.1| 35.6 |
-| VILA-13B | fp16 | 48.5 | OOM | 6.1 |
-| VILA-13B-AWQ | int4 | 102.1| 99.0| 17.5 |
+| $~~~~~~$ | Precision | A100 | 4090 | Orin |
+| ---------------------- | --------- | ----- | ----- | ---- |
+| VILA1.5-3B | fp16 | 104.6 | 137.6 | 25.4 |
+| VILA1.5-3B-AWQ | int4 | 182.8 | 215.5 | 42.5 |
+| VILA1.5-3B-S2 | fp16 | 104.3 | 137.2 | 24.6 |
+| VILA1.5-3B-S2-AWQ | int4 | 180.2 | 219.3 | 40.1 |
+| Llama-3-VILA1.5-8B | fp16 | 74.9 | 57.4 | 10.2 |
+| Llama-3-VILA1.5-8B-AWQ | int4 | 168.9 | 150.2 | 28.7 |
+| VILA1.5-13B | fp16 | 50.9 | OOM | 6.1 |
+| VILA1.5-13B-AWQ | int4 | 115.9 | 105.7 | 20.6 |
+| VILA1.5-40B | fp16 | OOM | OOM | -- |
+| VILA1.5-40B-AWQ | int4 | 57.0 | OOM | -- |
+
+NOTE: Measured using the [TinyChat](https://github.com/mit-han-lab/llm-awq/tinychat) backend at batch size = 1.
## VILA Examples
+### Video captioning
+
+https://github.com/Efficient-Large-Model/VILA/assets/156256291/c9520943-2478-4f97-bc95-121d625018a6
+
+Prompt: Elaborate on the visual and narrative elements of the video in detail.
+
+Caption: The video shows a person's hands working on a white surface. They are folding a piece of fabric with a checkered pattern in shades of blue and white. The fabric is being folded into a smaller, more compact shape. The person's fingernails are painted red, and they are wearing a black and red garment. There are also a ruler and a pencil on the surface, suggesting that measurements and precision are involved in the process.
+
### In context learning
@@ -76,18 +113,21 @@ conda create -n vila python=3.10 -y
conda activate vila
pip install --upgrade pip # enable PEP 660 support
+# this is optional if you prefer to use the system built-in nvcc.
+conda install -c nvidia cuda-toolkit -y
wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.4.2/flash_attn-2.4.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install flash_attn-2.4.2+cu118torch2.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
pip install -e .
pip install -e ".[train]"
pip install git+https://github.com/huggingface/transformers@v4.36.2
-cp -rv ./llava/train/transformers_replace/* ~/anaconda3/envs/vila/lib/python3.10/site-packages/transformers/models/
+site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
+cp -rv ./llava/train/transformers_replace/* $site_pkg_path/transformers/
```
## Training
-VILA training contains three steps
+VILA training contains three steps. For specific hyperparameters, please check out the [scripts/v1_5](scripts/v1_5) folder:
### Step-1: Alignment
We utilize LLaVA-CC3M-Pretrain-595K dataset to align the textual and visual modalities.
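Before launching stage 1, it can be worth confirming that the patched `transformers` files from the environment setup above actually landed in the installed package. Below is a minimal sanity-check sketch, not part of the release scripts; the expected version and paths are assumptions taken from the setup commands, so adjust them to your environment.

```bash
# Minimal sanity check (assumptions: transformers v4.36.2 was installed and
# transformers_replace was copied into site-packages as in the setup section above).
site_pkg_path=$(python -c 'import site; print(site.getsitepackages()[0])')
python -c "import transformers; print(transformers.__version__)"  # expect 4.36.2
ls "$site_pkg_path/transformers/models/llama"  # the patched modeling_llama.py should be listed here
```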
@@ -98,12 +138,6 @@ The stage 1 script takes in two parameters and it can run on a single 8xA100 nod
bash scripts/v1_5/paper/1_mm_align.sh [BASE_MODEL_PATH] [OUTPUT_NAME]
```
-| Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
-| --- | ---: | ---: | ---: | ---: | ---: |
-| VILA-7B | 256 | 2e-5 | 1 | 4096 | 0 |
-| VILA-13B | 256 | 2e-5 | 1 | 4096 | 0 |
-
-
### Step-2: Pretraining
We use MMC4 and Coyo dataset to train VLM with interleaved image-text pairs.
@@ -113,10 +147,6 @@ bash scripts/v1_5/paper/2_pretrain_mmc4_coyo.sh [CODE_PATH] [BASE_MODEL_PATH] [S
The stage 2 script takes in four arguments. `CODE_PATH` is the absolute path to our VILA codebase, `BASE_MODEL_PATH` has similar meaning to what is presented in the stage 1 script. `STAGE1_PATH` points to the `OUTPUT_NAME` of stage 1 (i.e. where the stage 1 checkpoint is stored). `OUTPUT_NAME` is the desired folder name under `checkpoints` that saves the pretraining checkpoint. The script we provided for this stage is executed on slurm, and we expect it to execute on 16 nodes (128 GPUs).
-| Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
-| --- | ---: | ---: | ---: | ---: | ---: |
-| VILA-7B | 1024 | 5e-5 | 1 | 4096 | 0 |
-| VILA-13B | 1024 | 5e-5 | 1 | 4096 | 0 |
### Step-3: Supervised fine-tuning
This is the last stage of VILA training, in which we tune the model to follow multimodal instructions on a subset of M3IT, FLAN and ShareGPT4V. This stage runs on a 8xA100 node.
@@ -126,26 +156,27 @@ bash scripts/v1_5/paper/3_sft.sh [STAGE2_PATH] [OUTPUT_NAME]
```
The stage 3 script takes in two arguments. `STAGE2_PATH` points to the `OUTPUT_NAME` of the stage 2 script (i.e. where the stage 2 checkpoint is stored). `OUTPUT_NAME` is the desired folder name under `checkpoints` that stores the final checkpoint.
-| Hyperparameter | Global Batch Size | Learning rate | Epochs | Max length | Weight decay |
-| --- | ---: | ---: | ---: | ---: | ---: |
-| VILA-7B | 128 | 2e-5 | 1 | 4096 | 0 |
-| VILA-13B | 128 | 2e-5 | 1 | 4096 | 0 |
+## Evaluations
-### Training with fewer GPUs
-To train with fewer GPUs/nodes, you can reduce the `per_device_train_batch_size` and increase the `gradient_accumulation_steps` accordingly. As long as the global batch size same (`per_device_train_batch_size` x `gradient_accumulation_steps` x `num_gpus`) are kept the same, the training precision will not be affected.
+### Image Benchmarks
+You can follow [Llava1.5 eval](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md) to download all datasets. After downloading them, please put them under `playground/data/eval`.
-Stage 1 completes within 3.5 (7B) - 5.5 (13B) hours on 8xA100, Stage 2 completes within 30 hours on 128xA100 for VILA-7B, and stage 3 completes in 25 (7B) - 40 (13B) hours on 8xA100.
+Please make the following change to the MME evaluation script. Search for:
-See [data_prepare/README.md](data_prepare/README.md) for more information about how to prepare datasets.
+```python
+data_path='MME_Benchmark_release_version'
+```
-## Evaluations
and replace it with:
-You can follow [Llava1.5 eval](https://github.com/haotian-liu/LLaVA/blob/main/docs/Evaluation.md) to download all datasets. After downloading all datasets, please put them under `playground/data/eval`.
+```python
+data_path=os.path.join(script_dir, 'MME_Benchmark_release_version')
+```
We provide a push-the-button script to perform evaluation on all 10 datasets that do not require GPT-assisted evaluation:
```bash
-./scripts/v1_5/eval/eval_all.sh [CHECKPOINT_PATH] [MODEL_NAME]
+./scripts/v1_5/eval/eval_all.sh [CHECKPOINT_PATH] [MODEL_NAME] [CONV_MODE]
```
This script takes in two parameters, `CHECKPOINT_PATH` points to the stage 3 model checkpoint, and `MODEL_NAME` will be the name of evaluation results.
@@ -163,26 +194,37 @@ python scripts/v1_5/eval/copy_predictions.py [MODEL_NAME]
You will be able to find the predictions under `playground/data/predictions_upload/[MODEL_NAME]` after executing this script.
+
+### Video Benchmarks
+
+Please follow the evaluation steps in [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA/blob/main/TRAIN_AND_VALIDATE.md#data-for-validating) for dataset preparation.
+
+```bash
+./scripts/v1_5/eval/video_chatgpt/run_all.sh [CHECKPOINT_PATH] [MODEL_NAME] [CONV_MODE]
+./scripts/v1_5/eval/video_chatgpt/eval_all.sh [MODEL_NAME]
+```
+
+
## Inference
We provide snippets for quick inference with user prompts and images.
-VILA-7B inference:
+Llama-3-VILA1.5-8B inference:
```bash
-python -W ignore llava/eval/run_llava.py \
-    --model-path Efficient-Large-Model/VILA-7B \
+python -W ignore llava/eval/run_vila.py \
+    --model-path Efficient-Large-Model/Llama-3-VILA1.5-8b \
    --conv-mode vicuna_v1 \
    --query "\n Please describe the traffic condition." \
-    --image-file "demo_images/av.png"
+    --image-file "av.png"
```
-VILA-13B inference:
+VILA1.5-3B video inference:
```bash
-python -W ignore llava/eval/run_llava.py \
-    --model-path Efficient-Large-Model/VILA-13B \
+python -W ignore llava/eval/run_vila.py \
-    --conv-mode vicuna_v1 \
+    --model-path Efficient-Large-Model/VILA1.5-3b \
    --conv-mode vicuna_v1 \
-    --query "\n Please describe the traffic condition." \
-    --image-file "demo_images/av.png"
+    --query "