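"""Evaluate a LLaVA-style video model on ground-truth video QA and save predictions.

For each ground-truth entry the script loads frames from the matching video,
builds a conversation prompt for the question ("Q") or question pair ("Q1"/"Q2"),
generates an answer with the model, and appends the predictions ("pred" or
"pred1"/"pred2") to a JSONL answers file. Evaluation can be split across chunks
(--num-chunks/--chunk-idx) and skips samples that already have answers.
"""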
import argparse
import json
import math
import os

import shortuuid
import torch
from decord import VideoReader
from PIL import Image
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

from llava import conversation as conversation_lib
from llava.constants import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
from llava.conversation import SeparatorStyle, conv_templates
from llava.data.dataset import LazySupervisedDataset
from llava.mm_utils import (
    KeywordsStoppingCriteria,
    get_model_name_from_path,
    is_gemma_tokenizer,
    process_images,
    tokenizer_image_token,
)
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init


def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks."""
    chunk_size = math.ceil(len(lst) / n)  # ceiling division so no items are dropped
    return [lst[i : i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the k-th of n chunks of lst."""
    chunks = split_list(lst, n)
    return chunks[k]


# Dataset that loads video frames and builds one tokenized prompt per ground-truth question
class CustomDataset(Dataset):
    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config):
        self.questions = questions
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config
        # Fall back to 8 frames and fps 0.0 when the model config does not specify them.
        if hasattr(model_config, "num_video_frames") and model_config.num_video_frames is not None:
            self.num_video_frames = model_config.num_video_frames
        else:
            self.num_video_frames = 8

        if hasattr(model_config, "fps") and model_config.fps is not None:
            self.fps = model_config.fps
        else:
            self.fps = 0.0

    def __getitem__(self, index):
        line = self.questions[index]

        # Locate the video file: try each container format, with and without a "v_" prefix.
        video_name = line["video_name"]
        video_formats = [".mp4", ".avi", ".mov", ".mkv", ".webm"]
        prepend = ["", "v_"]
        video_path = None
        for fmt in video_formats:
            for pre in prepend:
                temp_path = os.path.join(self.image_folder, f"{pre}{video_name}{fmt}")
                if os.path.exists(temp_path):
                    video_path = temp_path
                    break
            if video_path is not None:
                break
        if video_path is None:
            raise FileNotFoundError(f"No video found for {video_name} in {self.image_folder}")

        # Note: _load_video also reads options from the module-level `args` set in __main__.
        images, frames_loaded = LazySupervisedDataset._load_video(video_path, self.num_video_frames, self.fps, args)
        image_tensor = process_images(images, self.image_processor, self.model_config)
        num_frames_loaded_successfully = len(images)

        # Entries carry either a single question "Q" or a pair "Q1"/"Q2".
        if "Q" in line:
            questions = [line["Q"]]
        elif "Q1" in line:
            questions = [line["Q1"], line["Q2"]]
        else:
            raise KeyError(f"Sample for {video_name} has neither a 'Q' nor a 'Q1' field")

        input_ids_list = []
        for qs in questions:
            # Strip any existing image/video placeholders, then prepend one <image> token per loaded frame.
            qs = qs.replace("<image>\n", "").replace("\n<image>", "").replace("<image>", "")
            qs = qs.replace("<video>\n", "").replace("\n<video>", "").replace("<video>", "")
            qs = "<image>\n" * num_frames_loaded_successfully + qs

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()
            input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
            input_ids_list.append(input_ids)

        return input_ids_list, image_tensor

    def __len__(self):
        return len(self.questions)


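# Keep each sample's list of tokenized prompts as a Python list and stack the frame tensors into a batch dimension.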
def collate_fn(batch):
    input_ids, image_tensors = zip(*batch)
    input_ids = list(input_ids)
    image_tensors = torch.stack(image_tensors, dim=0)
    return input_ids, image_tensors


# DataLoader
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    assert batch_size == 1, "batch_size must be 1"
    dataset = CustomDataset(questions, image_folder, tokenizer, image_processor, model_config)
    data_loader = DataLoader(
        dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False, collate_fn=collate_fn
    )
    return data_loader


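# Build a resume key from the question text, the answer, and the video name so finished samples can be skipped on restart.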
def get_key(sample_set):
    question = sample_set["Q"] if "Q" in sample_set else (sample_set["Q1"] + sample_set["Q2"])
    k = question + sample_set["A"] + sample_set["video_name"]
    return k


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, model_name, args.model_base)
    args.image_processor = image_processor

    conversation_lib.default_conversation = conversation_lib.conv_templates[args.conv_mode]

    with open(args.gt_file) as f:
        gt_questions = json.load(f)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)

    answers_file = os.path.join(args.output_dir, f"{args.output_name}.jsonl")
    os.makedirs(args.output_dir, exist_ok=True)
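    # Resume support: collect keys of samples that already have answers from a previous run.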
    if os.path.exists(answers_file):
        with open(answers_file) as f:
            cache_set = {get_key(json.loads(line)) for line in f}
    else:
        cache_set = set()

    ans_file = open(answers_file, "a")
    data_loader = create_data_loader(gt_questions, args.image_folder, tokenizer, image_processor, model.config)

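    # batch_size is fixed to 1, so each dataloader item pairs with the corresponding ground-truth entry.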
    for (input_ids_list, image_tensor), sample_q in tqdm(zip(data_loader, gt_questions), total=len(gt_questions)):
        input_ids_list = input_ids_list[0]
        sample_set = sample_q
        if get_key(sample_set) in cache_set:
            print("Skipping sample that already has an answer")
            continue
        outputs_list = []
        for input_ids in input_ids_list:
            input_ids = input_ids.to(device="cuda", non_blocking=True).unsqueeze(0)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.to(dtype=torch.float16, device="cuda", non_blocking=True),
                    do_sample=True if args.temperature > 0 else False,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=args.max_new_tokens,
                    use_cache=True,
                )

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
            outputs_list.append(outputs)

        # Single-question entries get "pred"; question pairs get "pred1"/"pred2".
        if len(outputs_list) == 1:
            sample_set["pred"] = outputs_list[0]
        elif len(outputs_list) == 2:
            sample_set["pred1"] = outputs_list[0]
            sample_set["pred2"] = outputs_list[1]

        ans_file.write(json.dumps(sample_set) + "\n")
        ans_file.flush()
    ans_file.close()


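# Example invocation (paths are illustrative, not from the repo):
#   python <this_script>.py --model-path /path/to/checkpoint --gt_file qa.json \
#       --image-folder /path/to/videos --output_dir results --output_name preds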
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument(
        "--gt_file", help="Path to the ground-truth file containing the questions and answers.", required=True
    )
    parser.add_argument("--output_dir", help="Directory to save the model results JSON.", required=True)
    parser.add_argument("--output_name", help="Name of the file for storing results JSON.", required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    parser.add_argument("--top_p", type=float, default=None)
    parser.add_argument("--num_beams", type=int, default=1)
    parser.add_argument("--max_new_tokens", type=int, default=1024)
    args = parser.parse_args()

    eval_model(args)