add support for llama vision model conversion #708

Open · wants to merge 2 commits into main
8 changes: 8 additions & 0 deletions recipes/quickstart/inference/local_inference/README.md
@@ -96,6 +96,14 @@ python inference.py --model_name <training_config.output_dir> --prompt_file <tes

```

Note: to convert fine-tuned Llama 3.2 vision models, set the `multimodal` argument to `True`, then run inference as described in [multimodal inference](../../../../recipes/quickstart/inference/local_inference/README.md#multimodal-inference), as shown below.
```bash
# convert finetuned model
python -m llama_recipes.inference.checkpoint_converter_fsdp_hf --fsdp_checkpoint_path PATH/to/FSDP/Checkpoints --consolidated_model_path PATH/to/save/checkpoints --HF_model_path_or_name PATH/or/HF/model_name --multimodal True
# multimodal inference
python multi_modal_infer.py --image_path "./resources/image.jpg" --prompt_text "Describe this image" --temperature 0.5 --top_p 0.8 --model_name PATH/to/save/checkpoints
```
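
For context, once the converter has written the consolidated checkpoint it can be loaded like any other Hugging Face Mllama model. A rough sketch only (paths are placeholders, `multi_modal_infer.py` above remains the supported entry point, and this assumes the processor/tokenizer files are available in the output directory or can be loaded from the original model id):

```python
import torch
from PIL import Image
from transformers import AutoProcessor, MllamaForConditionalGeneration

ckpt = "PATH/to/save/checkpoints"  # placeholder: output of the converter above
model = MllamaForConditionalGeneration.from_pretrained(
    ckpt, torch_dtype=torch.bfloat16, device_map="auto"
)
processor = AutoProcessor.from_pretrained(ckpt)  # or the original HF model name if processor files were not copied

image = Image.open("./resources/image.jpg")
messages = [
    {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image"}]}
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, prompt, return_tensors="pt").to(model.device)
print(processor.decode(model.generate(**inputs, max_new_tokens=128)[0]))
```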

## Inference on large models like Meta Llama 405B
The FP8 quantized variants of Meta Llama (i.e. meta-llama/Meta-Llama-3.1-405B-FP8 and meta-llama/Meta-Llama-3.1-405B-Instruct-FP8) can be executed on a single node with 8x80GB H100 using the scripts located in this folder.
To run the unquantized Meta Llama 405B variants (i.e. meta-llama/Meta-Llama-3.1-405B and meta-llama/Meta-Llama-3.1-405B-Instruct), a multi-node setup is required for inference. The llama-recipes inference script currently does not support multi-node inference. To run these models you can use vLLM with pipeline and tensor parallelism, as shown in [this example](../../../3p_integrations/vllm/README.md).
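
For orientation, a minimal sketch of what such a vLLM setup can look like (the parallelism sizes are illustrative and assume two 8-GPU nodes joined in a Ray cluster; the linked vLLM README is authoritative):

```python
# Hedged sketch: spread Llama 3.1 405B across two 8-GPU nodes with vLLM.
# Requires a recent vLLM with pipeline parallelism and a Ray cluster spanning both nodes.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Meta-Llama-3.1-405B-Instruct",
    tensor_parallel_size=8,    # shard each layer across the 8 GPUs of a node
    pipeline_parallel_size=2,  # split the layer stack across the 2 nodes
)
out = llm.generate(["Explain pipeline parallelism in one sentence."], SamplingParams(max_tokens=64))
print(out[0].outputs[0].text)
```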
5 changes: 3 additions & 2 deletions src/llama_recipes/inference/checkpoint_converter_fsdp_hf.py
@@ -25,7 +25,8 @@
def main(
    fsdp_checkpoint_path="", # Path to FSDP Sharded model checkpoints
    consolidated_model_path="", # Path to save the HF converted model checkpoints
-    HF_model_path_or_name="" # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf)
+    HF_model_path_or_name="", # Path/ name of the HF model that include config.json and tokenizer_config.json (e.g. meta-llama/Llama-2-7b-chat-hf)
+    multimodal=False # Use MllamaConfig for llama 3.2 vision models
):

    try:
@@ -50,7 +51,7 @@ def main(


    #load the HF model definition from config
-    model_def = load_llama_from_config(HF_model_path_or_name)
+    model_def = load_llama_from_config(HF_model_path_or_name, multimodal)
    print("model is loaded from config")
    #load the FSDP sharded checkpoints into the model
    model = load_sharded_model_single_gpu(model_def, fsdp_checkpoint_path)
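
Put together, the converter can also be driven programmatically; a hedged sketch with placeholder paths (the CLI invocation shown in the README change above is the documented route, and the vision model name here is only an example):

```python
from llama_recipes.inference.checkpoint_converter_fsdp_hf import main

# Placeholder paths; multimodal=True routes load_llama_from_config to
# MllamaConfig / MllamaForConditionalGeneration instead of LlamaConfig / LlamaForCausalLM.
main(
    fsdp_checkpoint_path="PATH/to/FSDP/Checkpoints",
    consolidated_model_path="PATH/to/save/checkpoints",
    HF_model_path_or_name="meta-llama/Llama-3.2-11B-Vision-Instruct",
    multimodal=True,
)
```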
14 changes: 8 additions & 6 deletions src/llama_recipes/inference/model_utils.py
@@ -4,7 +4,7 @@
from llama_recipes.utils.config_utils import update_config
from llama_recipes.configs import quantization_config as QUANT_CONFIG
from peft import PeftModel
-from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig
+from transformers import AutoModelForCausalLM, LlamaForCausalLM, LlamaConfig, MllamaForConditionalGeneration, MllamaConfig
from warnings import warn

# Function to load the main model for text generation
@@ -41,9 +41,11 @@ def load_peft_model(model, peft_model):
    return peft_model

# Loading the model from config to load FSDP checkpoints into that
-def load_llama_from_config(config_path):
-    model_config = LlamaConfig.from_pretrained(config_path)
-    model = LlamaForCausalLM(config=model_config)
+def load_llama_from_config(config_path, multimodal):
+    if multimodal:
+        model_config = MllamaConfig.from_pretrained(config_path)
+        model = MllamaForConditionalGeneration(config=model_config)
+    else:
+        model_config = LlamaConfig.from_pretrained(config_path)
+        model = LlamaForCausalLM(config=model_config)
    return model
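
To make the new branch concrete, a small usage sketch (model names are examples only; instantiating from config builds an empty, randomly initialized model whose weights are then filled in from the FSDP checkpoint by the converter):

```python
from llama_recipes.inference.model_utils import load_llama_from_config

# Text-only path: unchanged behaviour (LlamaConfig -> LlamaForCausalLM).
text_model = load_llama_from_config("meta-llama/Llama-2-7b-chat-hf", multimodal=False)

# Vision path: Llama 3.2 vision checkpoints use MllamaConfig -> MllamaForConditionalGeneration.
vision_model = load_llama_from_config("meta-llama/Llama-3.2-11B-Vision-Instruct", multimodal=True)
```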