Add GPT4V VQA package #2812

Open
wants to merge 15 commits into master
25 changes: 25 additions & 0 deletions gpt4v_vqa/CMakeLists.txt
@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.0.2)
project(gpt4v_vqa)

find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)

catkin_python_setup()

catkin_generate_virtualenv(
  PYTHON_INTERPRETER python3.8
  USE_SYSTEM_PACKAGES FALSE
  ISOLATE_REQUIREMENTS TRUE
  CHECK_VENV FALSE
)

generate_dynamic_reconfigure_options(
  cfg/GPT4V.cfg
)

catkin_package()

catkin_install_python(
  PROGRAMS
    node_scripts/gpt4v_vqa_node.py
  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)
39 changes: 39 additions & 0 deletions gpt4v_vqa/README.md
@@ -0,0 +1,39 @@
# GPT4V VQA

This package provides a ROS node for visual question answering (VQA) with the GPT-4V model.

## Installation

```bash
catkin build gpt4v_vqa
```

## Usage

```bash
roslaunch gpt4v_vqa vqa.launch openai_api_key:=<YOUR_API_KEY> VQA_INPUT_IMAGE:=<IMAGE TOPIC>
```

Then, from another terminal:

```bash
rosrun gpt4v_vqa vqa_interpreter.py
```

## Nodes

### gpt4v_vqa

This node is a ROS wrapper for the GPT-4V model. Its behavior is similar to the [VQA node](../jsk_perception/node_scripts/vqa_node.py), but unlike that node it does not support continuous inference: the OpenAI API is called only when the action server receives a goal.

#### Subscribed Topics

* **`~image`** ([sensor_msgs/Image])

The default image used for VQA when the action goal does not contain an image.

#### Action Servers

* **`~inference_server`** ([jsk_recognition_msgs/VQATaskAction])

The action server for VQA.
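
For reference, here is a minimal sketch of calling the action server directly with `actionlib` (a hypothetical example, not shipped with this package). It assumes the node is running under the `/vqa` name as started by `vqa.launch`:

```python
#!/usr/bin/env python
# Hypothetical client sketch for ~inference_server (not part of this package).
# Assumes the node is running as /vqa, as in vqa.launch.
import actionlib
import rospy
from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal

rospy.init_node("vqa_example_client")
client = actionlib.SimpleActionClient("/vqa/inference_server", VQATaskAction)
client.wait_for_server()

goal = VQATaskGoal()
goal.questions = ["Please describe the image briefly."]
# With no image in the goal, the node falls back to the latest image received on ~image.
client.send_goal(goal)
if client.wait_for_result(rospy.Duration(30.0)):
    result = client.get_result()
    for qa in result.result.result:
        print("%s -> %s" % (qa.question, qa.answer))
```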
12 changes: 12 additions & 0 deletions gpt4v_vqa/cfg/GPT4V.cfg
@@ -0,0 +1,12 @@
#!/usr/bin/env python
PACKAGE = "gpt4v_vqa"

from dynamic_reconfigure.parameter_generator_catkin import *

gen = ParameterGenerator()

gen.add("max_height", int_t, 0, "Maximum image height", 480, 0, 1080)
gen.add("max_width", int_t, 0, "Maximum image width", 640, 0, 1920)
gen.add("detail_level", str_t, 0, "Detail level of GPT4V API", "low")

exit(gen.generate(PACKAGE, "gpt4v_vqa", "GPT4V"))
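
These parameters can be changed at runtime through `dynamic_reconfigure`. A minimal sketch, assuming the node is running as `/vqa` (as in `vqa.launch`):

```python
#!/usr/bin/env python
# Hypothetical sketch: update the GPT4V parameters at runtime via dynamic_reconfigure.
import rospy
from dynamic_reconfigure.client import Client

rospy.init_node("gpt4v_reconfigure_example")
client = Client("/vqa", timeout=5)
# Shrink the image sent to the API and keep the cheaper "low" detail level.
client.update_configuration({"max_height": 360, "max_width": 480, "detail_level": "low"})
```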
9 changes: 9 additions & 0 deletions gpt4v_vqa/euslisp/run_simple_vqa.l
@@ -0,0 +1,9 @@
#!/usr/bin/env roseus

(load "package://gpt4v_vqa/euslisp/utils.l")

(ros::roseus "run_simple_vqa")

(print "Question is \"Please describe the image briefly.\"")
(print (format nil "Answer is ~a" (run-vqa "Please describe the image briefly.")))
(exit)
33 changes: 33 additions & 0 deletions gpt4v_vqa/euslisp/utils.l
@@ -0,0 +1,33 @@
(ros::load-ros-manifest "jsk_recognition_msgs")

(defparameter *vqa-action* nil)

(defun init-vqa-action (&key (timeout 10))
  (unless *vqa-action*
    (setq *vqa-action*
          (instance ros::simple-action-client :init
                    "/vqa/inference_server" jsk_recognition_msgs::VQATaskAction)))
  (send *vqa-action* :wait-for-server timeout)
  )

(defun run-vqa (question &optional msg_image &key (timeout 30))
  "run vqa action client. return answer string or nil if failed."
  (if (not (init-vqa-action))
      (return-from run-vqa nil))
  (let* (result answer
         (action-goal (instance jsk_recognition_msgs::VQATaskGoal :init)))
    (send action-goal :questions (list question))
    (if msg_image
        (send action-goal :image msg_image)
      )
    (send *vqa-action* :send-goal action-goal)
    (unless (send *vqa-action* :wait-for-result :timeout timeout)
      (send *vqa-action* :cancel-all-goals)
      (ros::ros-error "No result returned from /vqa action server")
      (return-from run-vqa nil))
    (setq result (send *vqa-action* :get-result))
    (if (and result (> (length (send result :result :result)) 0))
        (send (elt (send result :result :result) 0) :answer)
      nil)
    )
  )
12 changes: 12 additions & 0 deletions gpt4v_vqa/launch/vqa.launch
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<launch>
<arg name="openai_api_key" default="$(optenv OPENAI_API_KEY)" />
<arg name="VQA_INPUT_IMAGE" default="vqa_image" />

<node name="vqa" pkg="gpt4v_vqa" type="gpt4v_vqa_node.py" output="screen">
<remap from="~image" to="$(arg VQA_INPUT_IMAGE)" />
<rosparam subst_value="true">
api_key: $(arg openai_api_key)
</rosparam>
</node>
</launch>
184 changes: 184 additions & 0 deletions gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -0,0 +1,184 @@
#!/usr/bin/env python

import base64
from typing import Dict, Optional

import actionlib
import cv2
import numpy as np
import requests
import ros_numpy
import rospy
from dynamic_reconfigure.server import Server
from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
                                      VQATaskGoal, VQATaskResult)
from sensor_msgs.msg import Image

from gpt4v_vqa.cfg import GPT4VConfig

COST_PER_TOKEN_FOR_INPUT = 0.01 / 1000
COST_PER_TOKEN_FOR_OUTPUT = 0.03 / 1000


class GPT4VClientNode(object):
    def __init__(self, api_key: str):
        self.api_key = api_key
        # Configuration
        self.max_height: int = 480
        self.max_width: int = 640
        self.detail_level: str = "low"
        # Node variables
        self.default_img: Optional[np.ndarray] = None
        self.sub = rospy.Subscriber("~image", Image, self._image_cb)
        self.param_srv = Server(GPT4VConfig, self.config_cb)
        self.ac = actionlib.SimpleActionServer(
            "~inference_server", VQATaskAction, self._ac_cb, auto_start=False
        )
        self.ac.start()

    def config_cb(self, config, level):
        """Dynamic reconfigure callback"""
        self.set_max_size(config["max_height"], config["max_width"])
        self.detail_level = config["detail_level"]
        return config

    def set_max_size(self, max_height: int, max_width: int):
        """Set max size of image to send to API

        Args:
            max_height (int): max height
            max_width (int): max width
        """
        self.max_height = max_height
        self.max_width = max_width

    def resize_image(self, image: np.ndarray) -> np.ndarray:
        """Resize image to maximum size configuration

        Args:
            image (np.ndarray): image to resize

        Returns:
            np.ndarray: resized image
        """
        height, width, num_channel = image.shape
        if height > self.max_height or width > self.max_width:
            scale = min(self.max_height / height, self.max_width / width)
            image = cv2.resize(
                image,
                (int(width * scale), int(height * scale)),
                interpolation=cv2.INTER_AREA,
            )
        return image

    def _image_cb(self, msg: Image):
        image = ros_numpy.numpify(msg)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        self.default_img = image

    def _ac_cb(self, goal: VQATaskGoal):
        """Action callback

        Args:
            goal (VQATaskGoal): action goal
        """
        rospy.loginfo("Received goal")
        result = VQATaskResult()

        if len(goal.image.data) > 0:
            image = ros_numpy.numpify(goal.image)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        elif len(goal.compressed_image.data) > 0:
            rospy.logerr("Compressed image is not supported.")
            self.ac.set_aborted(result)
            return
        else:
            if self.default_img is not None:
                image = self.default_img
            else:
                rospy.logerr("Image is empty")
                self.ac.set_aborted(result)
                return
        image = self.resize_image(image)
        for question in goal.questions:
            # Pass the detail level from dynamic reconfigure to the API call
            response = self._get_multimodal_response(
                question, image, detail=self.detail_level
            )
            if response is None:
                rospy.logerr(f"Failed to get response for question {question}")
                continue
            if "choices" not in response or len(response["choices"]) == 0:
                rospy.logerr(f"No choices in response: {response}")
                continue
            answer = response["choices"][0]["message"]["content"]
            result.result.result.append(
                QuestionAndAnswerText(question=question, answer=answer)
            )
            input_tokens = response["usage"]["prompt_tokens"]
            output_tokens = response["usage"]["completion_tokens"]
            rospy.loginfo(
                f"This completion used {input_tokens} input tokens and {output_tokens} output tokens, "
                f"which costs ${input_tokens * COST_PER_TOKEN_FOR_INPUT} for input "
                f"and ${output_tokens * COST_PER_TOKEN_FOR_OUTPUT} for output."
            )
        if len(result.result.result) == 0:
            rospy.logerr("No answers found")
            self.ac.set_aborted(result)
            return
        else:
            self.ac.set_succeeded(result)

    def _get_multimodal_response(
        self,
        question: str,
        image: np.ndarray,
        max_tokens: int = 300,
        detail: str = "low",
    ) -> Optional[Dict]:
        """Get response from the GPT-4 Vision API

        Args:
            question (str): question to ask
            image (np.ndarray): image to ask the question about
            max_tokens (int, optional): max tokens to use for output. Defaults to 300,
                which is about $0.01 as of 2024-01-09 (see https://openai.com/pricing).
            detail (str, optional): detail level. Defaults to "low". See
                https://platform.openai.com/docs/guides/vision/managing-images for details.

        Returns:
            Dict: response from the API
        """
        base64_image = base64.b64encode(cv2.imencode(".jpg", image)[1]).decode("utf-8")
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}",
        }

        payload = {
            "model": "gpt-4-vision-preview",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": question},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}",
                                "detail": detail,
                            },
                        },
                    ],
                }
            ],
            "max_tokens": max_tokens,
        }
        try:
            response = requests.post(
                "https://api.openai.com/v1/chat/completions",
                headers=headers,
                json=payload,
            )
            return response.json()
        except requests.exceptions.RequestException as e:
            rospy.logerr(e)
            return None


if __name__ == "__main__":
    rospy.init_node("vqa")
    api_key = rospy.get_param("~api_key")
    GPT4VClientNode(api_key)
    rospy.spin()
18 changes: 18 additions & 0 deletions gpt4v_vqa/node_scripts/vqa_interpreter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python

import rospy

from gpt4v_vqa import VQAClient

if __name__ == "__main__":
    rospy.init_node("vqa_interpreter")

    client = VQAClient()
    client.wait_for_server()

    while not rospy.is_shutdown():
        question = input("Enter a question: ")
        if question == "exit":
            break
        result = client.vqa(question)
        print(result)
27 changes: 27 additions & 0 deletions gpt4v_vqa/package.xml
@@ -0,0 +1,27 @@
<?xml version="1.0"?>
<package format="2">
  <name>gpt4v_vqa</name>
  <version>1.2.17</version>
  <description>The gpt4v_vqa package</description>

  <author email="[email protected]">Koki Shinjo</author>
  <maintainer email="[email protected]">Kei Okada</maintainer>

  <license>BSD</license>

  <buildtool_depend>catkin</buildtool_depend>

  <build_depend>catkin_virtualenv</build_depend>
  <build_depend>actionlib</build_depend>
  <build_depend>dynamic_reconfigure</build_depend>
  <build_depend>jsk_recognition_msgs</build_depend>

  <exec_depend>catkin_virtualenv</exec_depend>
  <exec_depend>actionlib</exec_depend>
  <exec_depend>dynamic_reconfigure</exec_depend>
  <exec_depend>jsk_recognition_msgs</exec_depend>

  <export>
    <pip_requirements>requirements.txt</pip_requirements>
  </export>
</package>
24 changes: 24 additions & 0 deletions gpt4v_vqa/python/gpt4v_vqa/__init__.py
@@ -0,0 +1,24 @@
import actionlib
import rospy
from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal


class VQAClient:
    def __init__(self, action_name="/vqa/inference_server"):
        self.client = actionlib.SimpleActionClient(action_name, VQATaskAction)

    def wait_for_server(self, timeout=10.0):
        self.client.wait_for_server(timeout=rospy.Duration(timeout))

    def vqa(self, question, image=None, timeout=30.0):
        goal = VQATaskGoal()
        goal.questions = [question]
        if image is not None:
            goal.image = image
        self.client.send_goal(goal)
        if self.client.wait_for_result(timeout=rospy.Duration(timeout)):
            result = self.client.get_result()
            return result
        else:
            rospy.logwarn("Timeout")
            return None