From 3fa5f408610427abb872257bc31d5d50a257739c Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:18:49 +0900
Subject: [PATCH 01/15] [gpt4v_vqa] add gpt4v_vqa node

---
 gpt4v_vqa/CMakeLists.txt                 |  21 +++
 gpt4v_vqa/README.md                      |  31 ++++
 gpt4v_vqa/cfg/GPT4V.cfg                  |  12 ++
 gpt4v_vqa/launch/vqa.launch              |  12 ++
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 177 +++++++++++++++++++++++
 gpt4v_vqa/package.xml                    |  27 ++++
 gpt4v_vqa/requirements.txt               |   4 +
 7 files changed, 284 insertions(+)
 create mode 100644 gpt4v_vqa/CMakeLists.txt
 create mode 100644 gpt4v_vqa/README.md
 create mode 100644 gpt4v_vqa/cfg/GPT4V.cfg
 create mode 100644 gpt4v_vqa/launch/vqa.launch
 create mode 100644 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
 create mode 100644 gpt4v_vqa/package.xml
 create mode 100644 gpt4v_vqa/requirements.txt

diff --git a/gpt4v_vqa/CMakeLists.txt b/gpt4v_vqa/CMakeLists.txt
new file mode 100644
index 0000000000..8336d59532
--- /dev/null
+++ b/gpt4v_vqa/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.0.2)
+project(gpt4v_vqa)
+
+find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)
+
+catkin_generate_virtualenv(
+  PYTHON_INTERPRETER python3.8
+  USE_SYSTEM_PACKAGES FALSE
+  CHECK_VENV FALSE
+)
+
+generate_dynamic_reconfigure_options(
+  cfg/GPT4V.cfg
+)
+
+catkin_package()
+
+catkin_install_python(
+  PROGRAMS
+  node_scripts/gpt4v_vqa_node.py
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
\ No newline at end of file
diff --git a/gpt4v_vqa/README.md b/gpt4v_vqa/README.md
new file mode 100644
index 0000000000..4e6586cda1
--- /dev/null
+++ b/gpt4v_vqa/README.md
@@ -0,0 +1,31 @@
+# GPT4V VQA
+
+This package provides a ROS node for visual question answering (VQA) with the GPT-4V model.
+
+## Installation
+
+```bash
+catkin build gpt4v_vqa
+```
+
+## Usage
+
+```bash
+roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY>
+```
+
+## Nodes
+
+### gpt4v_vqa
+
+#### Subscribed Topics
+
+* **`~image`** ([sensor_msgs/Image])
+
+  The default image used for VQA when the action goal does not contain one.
+
+#### Action Servers
+
+* **`~inference_server`** ([jsk_recognition_msgs/VQATaskAction])
+
+  The action server for VQA.
\ No newline at end of file
diff --git a/gpt4v_vqa/cfg/GPT4V.cfg b/gpt4v_vqa/cfg/GPT4V.cfg
new file mode 100644
index 0000000000..21c4bf3431
--- /dev/null
+++ b/gpt4v_vqa/cfg/GPT4V.cfg
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+PACKAGE = "gpt4v_vqa"
+
+from dynamic_reconfigure.parameter_generator_catkin import *
+
+gen = ParameterGenerator()
+
+gen.add("max_height", int_t, 0, "Maximum image height", 480, 0, 1080)
+gen.add("max_width", int_t, 0, "Maximum image width", 640, 0, 1920)
+gen.add("detail_level", str_t, 0, "Detail level of GPT4V API", "low")
+
+exit(gen.generate(PACKAGE, "gpt4v_vqa", "GPT4V"))
\ No newline at end of file
diff --git a/gpt4v_vqa/launch/vqa.launch b/gpt4v_vqa/launch/vqa.launch
new file mode 100644
index 0000000000..24ea32641f
--- /dev/null
+++ b/gpt4v_vqa/launch/vqa.launch
@@ -0,0 +1,12 @@
+<launch>
+  <arg name="openai_api_key" />
+  <arg name="VQA_INPUT_IMAGE" default="image" />
+
+  <node name="vqa" pkg="gpt4v_vqa" type="gpt4v_vqa_node.py"
+        output="screen">
+    <remap from="~image" to="$(arg VQA_INPUT_IMAGE)" />
+    <rosparam subst_value="true">
+      api_key: $(arg openai_api_key)
+    </rosparam>
+  </node>
+</launch>
\ No newline at end of file
diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
new file mode 100644
index 0000000000..4c66f2612f
--- /dev/null
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+
+import base64
+from typing import Dict, Optional
+
+import actionlib
+import cv2
+import numpy as np
+import requests
+import rospy
+from cv_bridge import CvBridge
+from dynamic_reconfigure.server import Server
+from sensor_msgs.msg import Image
+
+from gpt4v_vqa.cfg import GPT4VConfig
+from jsk_recognition_msgs.msg import (
+    QuestionAndAnswerText,
+    VQATaskAction,
+    VQATaskGoal,
+    VQATaskResult,
+)
+
+
+class GPT4VClientNode(object):
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        # Configuration
+        self.max_height: int = 480
+        self.max_width: int = 640
+        self.detail_level: str = "low"
+        # Node variables
+        self.default_img: Optional[np.ndarray] = None
+        self.sub = rospy.Subscriber("~image", Image, self._image_cb)
+        self.param_srv = Server(GPT4VConfig, self.config_cb)
+        self.ac = actionlib.SimpleActionServer(
+            "~inference_server", VQATaskAction, self._ac_cb, auto_start=False
+        )
+        self.ac.start()
+
+    def config_cb(self, config, level):
+        """Dynamic reconfigure callback"""
+        self.set_max_size(config["max_height"], config["max_width"])
+        self.detail_level = config["detail_level"]
+        return config
+
+    def set_max_size(self, max_height: int, max_width: int):
+        """Set max size of image to send to API
+
+        Args:
+            max_height (int): max height
+            max_width (int): max width
+        """
+        self.max_height = max_height
+        self.max_width = max_width
+
+    def resize_image(self, image: np.ndarray) -> np.ndarray:
+        """Resize image to maximum size configuration
+
+        Args:
+            image (np.ndarray): image to resize
+
+        Returns:
+            np.ndarray: resized image
+        """
+        height, width, num_channel = image.shape
+        if height > self.max_height or width > self.max_width:
+            scale = min(self.max_height / height, self.max_width / width)
+            image = cv2.resize(
+                image,
+                (int(width * scale), int(height * scale), num_channel),
+                interpolation=cv2.INTER_AREA,
+            )
+        return image
+
+    def _image_cb(self, msg: Image):
+        image = CvBridge().imgmsg_to_cv2(msg)
+        self.default_img = image
+
+    def _ac_cb(self, goal: VQATaskGoal):
+        """Action callback
+
+        Args:
+            goal (VQATaskGoal): action goal
+        """
+        rospy.loginfo("Received goal")
+        result = VQATaskResult()
+
+        if goal.image is not None:
+            image = CvBridge().imgmsg_to_cv2(goal.image)
+        elif goal.compressed_image is not None:
+            image = CvBridge().compressed_imgmsg_to_cv2(goal.compressed_image)
+        else:
+            if self.default_img is not None:
+                image = self.default_img
+            else:
+                rospy.logerr("Image is empty")
+                self.ac.set_aborted(result)
+                return
+        image = self.resize_image(image)
+        for question in goal.questions:
+            response = self._get_multimodal_response(question, image)
+            if response is None:
+                rospy.logerr(f"Failed to get response from question {question}")
+                continue
+            if "choices" not in response or len(response["choices"]) == 0:
+                rospy.logerr(f"No choices in response: {response}")
+                continue
+            answer = response["choices"][0]["message"]["content"]
+            result.result.results.append(
+                QuestionAndAnswerText(question=question, answer=answer)
+            )
+        if len(result.result.results) == 0:
+            rospy.logerr("No answers found")
+            self.ac.set_aborted(result)
+            return
+        else:
+            self.ac.set_succeeded(result)
+
+    def _get_multimodal_response(
+        self,
+        question: str,
+        image: np.ndarray,
+        max_tokens: int = 300,
+        detail: str = "low",
+    ) -> Optional[Dict]:
+        """Get response from GPT-4-Vision API
+
+        Args:
+            question (str): question to ask
+            image (np.ndarray): image to ask question about
+            max_tokens (int, optional): max tokens to use for output. Defaults to 300, which is about $0.01 as of 2024-01-09. (See https://openai.com/pricing)
+            detail (str, optional): detail level. Defaults to "low". See https://platform.openai.com/docs/guides/vision/managing-images for details.
+
+        Returns:
+            Dict: response from API"""
+        base64_image = base64.b64encode(cv2.imencode(".jpg", image)[1]).decode("utf-8")
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+        }
+
+        payload = {
+            "model": "gpt-4-vision-preview",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": question},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": detail,
+                            },
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": max_tokens,
+        }
+        try:
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            rospy.logerr(e)
+            return None
+
+
+if __name__ == "__main__":
+    rospy.init_node("vqa")
+    api_key = rospy.get_param("~api_key")
+    GPT4VClientNode(api_key)
+    rospy.spin()
diff --git a/gpt4v_vqa/package.xml b/gpt4v_vqa/package.xml
new file mode 100644
index 0000000000..3d53e11c7e
--- /dev/null
+++ b/gpt4v_vqa/package.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>gpt4v_vqa</name>
+  <version>1.2.17</version>
+  <description>The gpt4v_vqa package</description>
+
+  <maintainer>Koki Shinjo</maintainer>
+  <maintainer>Kei Okada</maintainer>
+
+  <license>BSD</license>
+
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <build_depend>catkin_virtualenv</build_depend>
+  <build_depend>actionlib</build_depend>
+  <build_depend>dynamic_reconfigure</build_depend>
+  <build_depend>jsk_recognition_msgs</build_depend>
+
+  <exec_depend>catkin_virtualenv</exec_depend>
+  <exec_depend>actionlib</exec_depend>
+  <exec_depend>dynamic_reconfigure</exec_depend>
+  <exec_depend>jsk_recognition_msgs</exec_depend>
+
+  <export>
+    <pip_requirements>requirements.txt</pip_requirements>
+  </export>
+</package>
\ No newline at end of file
diff --git a/gpt4v_vqa/requirements.txt b/gpt4v_vqa/requirements.txt
new file mode 100644
index 0000000000..8b33391093
--- /dev/null
+++ b/gpt4v_vqa/requirements.txt
@@ -0,0 +1,4 @@
+requests
+openai
+opencv-python
+cvbridge3
\ No newline at end of file

From 786d963a0aa0278741041903a63eb492a3d4f6df Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:25:58 +0900
Subject: [PATCH 02/15] [gpt4v_vqa] fix bug

---
 gpt4v_vqa/launch/vqa.launch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpt4v_vqa/launch/vqa.launch b/gpt4v_vqa/launch/vqa.launch
index 24ea32641f..9e508b308a 100644
--- a/gpt4v_vqa/launch/vqa.launch
+++ b/gpt4v_vqa/launch/vqa.launch
@@ -1,6 +1,6 @@
-
+

From 7d18b1988c846bc3ddebf5919516f65094c7be70 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:33:25 +0900
Subject: [PATCH 03/15] [gpt4v_vqa] bugfix for catkin_virtualenv
---
 gpt4v_vqa/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gpt4v_vqa/CMakeLists.txt b/gpt4v_vqa/CMakeLists.txt
index 8336d59532..4d75fe8ea2 100644
--- a/gpt4v_vqa/CMakeLists.txt
+++ b/gpt4v_vqa/CMakeLists.txt
@@ -6,6 +6,7 @@ find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)
 catkin_generate_virtualenv(
   PYTHON_INTERPRETER python3.8
   USE_SYSTEM_PACKAGES FALSE
+  ISOLATE_REQUIREMENTS TRUE
   CHECK_VENV FALSE
 )
 
@@ -18,4 +19,5 @@ catkin_package()
 catkin_install_python(
   PROGRAMS
   node_scripts/gpt4v_vqa_node.py
-  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
\ No newline at end of file
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+)
\ No newline at end of file

From 063d9043d735e220c457761930fe02fd59b34a7a Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:40:28 +0900
Subject: [PATCH 04/15] [gpt4v_vqa] use ros_numpy and add node#

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py  | 18 ++++++++----------
 gpt4v_vqa/node_scripts/vqa_interpreter.py | 23 +++++++++++++++++++++++
 2 files changed, 31 insertions(+), 10 deletions(-)
 create mode 100755 gpt4v_vqa/node_scripts/vqa_interpreter.py

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 4c66f2612f..3776498571 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -7,18 +7,14 @@
 import cv2
 import numpy as np
 import requests
+import ros_numpy
 import rospy
-from cv_bridge import CvBridge
 from dynamic_reconfigure.server import Server
 from sensor_msgs.msg import Image
 
 from gpt4v_vqa.cfg import GPT4VConfig
-from jsk_recognition_msgs.msg import (
-    QuestionAndAnswerText,
-    VQATaskAction,
-    VQATaskGoal,
-    VQATaskResult,
-)
+from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
+                                      VQATaskGoal, VQATaskResult)
 
 
 class GPT4VClientNode(object):
@@ -73,7 +69,7 @@ def resize_image(self, image: np.ndarray) -> np.ndarray:
         return image
 
     def _image_cb(self, msg: Image):
-        image = CvBridge().imgmsg_to_cv2(msg)
+        image = ros_numpy.numpify(msg)
         self.default_img = image
 
     def _ac_cb(self, goal: VQATaskGoal):
@@ -86,9 +82,11 @@ def _ac_cb(self, goal: VQATaskGoal):
         result = VQATaskResult()
 
         if goal.image is not None:
-            image = CvBridge().imgmsg_to_cv2(goal.image)
+            image = ros_numpy.numpify(goal.image)
         elif goal.compressed_image is not None:
-            image = CvBridge().compressed_imgmsg_to_cv2(goal.compressed_image)
+            rospy.logerr(f"Compressed image is not supported.")
+            self.ac.set_aborted(result)
+            return
         else:
             if self.default_img is not None:
                 image = self.default_img
diff --git a/gpt4v_vqa/node_scripts/vqa_interpreter.py b/gpt4v_vqa/node_scripts/vqa_interpreter.py
new file mode 100755
index 0000000000..6ac3473975
--- /dev/null
+++ b/gpt4v_vqa/node_scripts/vqa_interpreter.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import actionlib
+import rospy
+from jsk_recognition.msg import VQATaskAction, VQATaskGoal
+
+if __name__ == "__main__":
+    rospy.init_node("vqa_interpreter")
+
+    client = actionlib.SimpleActionClient("/vqa/inference_server", VQATaskAction)
+
+    while not rospy.is_shutdown():
+        question = input("Enter a question: ")
+        if question == "exit":
+            break
+        goal = VQATaskGoal()
+        goal.questions = [question]
+        client.send_goal(goal)
+        if client.wait_for_result(timeout=rospy.Duration(30.0)):
+            result = client.get_result()
+            print(result)
+        else:
+            print("Timeout")

From 36348008191fd10f7ba929129b932e3d89cf615c Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:42:23 +0900
Subject: [PATCH 05/15] [gpt4v_vqa] update requirements

---
 gpt4v_vqa/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpt4v_vqa/requirements.txt b/gpt4v_vqa/requirements.txt
index 8b33391093..0df5dd2bf4 100644
--- a/gpt4v_vqa/requirements.txt
+++ b/gpt4v_vqa/requirements.txt
@@ -1,4 +1,4 @@
 requests
 openai
 opencv-python
-cvbridge3
\ No newline at end of file
+rosnumpy
\ No newline at end of file

From c03dcdf6f5a42b9af03e2f7113dffbbe8e4702f6 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:49:19 +0900
Subject: [PATCH 06/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/vqa_interpreter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpt4v_vqa/node_scripts/vqa_interpreter.py b/gpt4v_vqa/node_scripts/vqa_interpreter.py
index 6ac3473975..3cc7fcd28d 100755
--- a/gpt4v_vqa/node_scripts/vqa_interpreter.py
+++ b/gpt4v_vqa/node_scripts/vqa_interpreter.py
@@ -2,7 +2,7 @@
 
 import actionlib
 import rospy
-from jsk_recognition.msg import VQATaskAction, VQATaskGoal
+from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal
 
 if __name__ == "__main__":
     rospy.init_node("vqa_interpreter")

From 454f4f6843d0c7896be23a4e85fea88a8765ac11 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:52:57 +0900
Subject: [PATCH 07/15] [gpt4v_vqa] add euslisp and python utils

---
 gpt4v_vqa/CMakeLists.txt               | 2 ++
 gpt4v_vqa/euslisp/utils.l              | 0
 gpt4v_vqa/python/gpt4v_vqa/__init__.py | 0
 gpt4v_vqa/setup.py                     | 7 +++++++
 4 files changed, 9 insertions(+)
 create mode 100644 gpt4v_vqa/euslisp/utils.l
 create mode 100644 gpt4v_vqa/python/gpt4v_vqa/__init__.py
 create mode 100644 gpt4v_vqa/setup.py

diff --git a/gpt4v_vqa/CMakeLists.txt b/gpt4v_vqa/CMakeLists.txt
index 4d75fe8ea2..aab6dc26c1 100644
--- a/gpt4v_vqa/CMakeLists.txt
+++ b/gpt4v_vqa/CMakeLists.txt
@@ -3,6 +3,8 @@ project(gpt4v_vqa)
 
 find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)
 
+catkin_python_setup()
+
 catkin_generate_virtualenv(
   PYTHON_INTERPRETER python3.8
   USE_SYSTEM_PACKAGES FALSE
diff --git a/gpt4v_vqa/euslisp/utils.l b/gpt4v_vqa/euslisp/utils.l
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpt4v_vqa/python/gpt4v_vqa/__init__.py b/gpt4v_vqa/python/gpt4v_vqa/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpt4v_vqa/setup.py b/gpt4v_vqa/setup.py
new file mode 100644
index 0000000000..12dc413107
--- /dev/null
+++ b/gpt4v_vqa/setup.py
@@ -0,0 +1,7 @@
+from distutils.core import setup
+
+from catkin_pkg.python_setup import generate_distutils_setup
+
+d = generate_distutils_setup(packages=["gpt4v_vqa"], package_dir={"": "python"})
+
+setup(**d)

From b9486cb6b8011710bbfab25ed247c8dca67384a7 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:53:15 +0900
Subject: [PATCH 08/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 3776498571..4c3eb21667 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -10,11 +10,15 @@
 import ros_numpy
 import rospy
 from dynamic_reconfigure.server import Server
+from jsk_recognition_msgs.msg import (
+    QuestionAndAnswerText,
+    VQATaskAction,
+    VQATaskGoal,
+    VQATaskResult,
+)
 from sensor_msgs.msg import Image
 
 from gpt4v_vqa.cfg import GPT4VConfig
-from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
-                                      VQATaskGoal, VQATaskResult)
 
 
 class GPT4VClientNode(object):
     def __init__(self, api_key: str):
@@ -81,9 +85,9 @@ def _ac_cb(self, goal: VQATaskGoal):
         rospy.loginfo("Received goal")
         result = VQATaskResult()
 
-        if goal.image is not None:
+        if len(goal.image.data) > 0:
             image = ros_numpy.numpify(goal.image)
-        elif goal.compressed_image is not None:
+        elif len(goal.compressed_image.data) > 0:
             rospy.logerr(f"Compressed image is not supported.")
             self.ac.set_aborted(result)
             return

From 2dbb8def2f9dd020e0ce4a2bc672abcdcc9a6f98 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:56:13 +0900
Subject: [PATCH 09/15] [gpt4v_vqa] update requirements

---
 gpt4v_vqa/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gpt4v_vqa/requirements.txt b/gpt4v_vqa/requirements.txt
index 0df5dd2bf4..6e98da90e8 100644
--- a/gpt4v_vqa/requirements.txt
+++ b/gpt4v_vqa/requirements.txt
@@ -1,4 +1,5 @@
 requests
 openai
+numpy==1.19.5
 opencv-python
 rosnumpy
\ No newline at end of file

From bb4b28e02ae9dba562e8553a83e5e8214a72d69a Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:56:18 +0900
Subject: [PATCH 10/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 4c3eb21667..1c20d794fa 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -67,7 +67,7 @@ def resize_image(self, image: np.ndarray) -> np.ndarray:
             scale = min(self.max_height / height, self.max_width / width)
             image = cv2.resize(
                 image,
-                (int(width * scale), int(height * scale), num_channel),
+                (int(width * scale), int(height * scale)),
                 interpolation=cv2.INTER_AREA,
             )
         return image
@@ -108,10 +108,10 @@ def _ac_cb(self, goal: VQATaskGoal):
                 rospy.logerr(f"No choices in response: {response}")
                 continue
             answer = response["choices"][0]["message"]["content"]
-            result.result.results.append(
+            result.result.result.append(
                 QuestionAndAnswerText(question=question, answer=answer)
             )
-        if len(result.result.results) == 0:
+        if len(result.result.result) == 0:
             rospy.logerr("No answers found")
             self.ac.set_aborted(result)
             return

From 1383e369ee9c1100d113e65ff7f3f0ecf57ac99b Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:58:52 +0900
Subject: [PATCH 11/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 1c20d794fa..aafcdeef95 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -74,6 +74,7 @@ def resize_image(self, image: np.ndarray) -> np.ndarray:
 
     def _image_cb(self, msg: Image):
         image = ros_numpy.numpify(msg)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         self.default_img = image
 
     def _ac_cb(self, goal: VQATaskGoal):
@@ -87,6 +88,7 @@ def _ac_cb(self, goal: VQATaskGoal):
 
         if len(goal.image.data) > 0:
             image = ros_numpy.numpify(goal.image)
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         elif len(goal.compressed_image.data) > 0:
             rospy.logerr(f"Compressed image is not supported.")
             self.ac.set_aborted(result)

From 8c3ecde104a9dc53674bf56c0792fa3c06f7345d Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:10:40 +0900
Subject: [PATCH 12/15] [gpt4v_vqa] add VQAClient lib

---
 gpt4v_vqa/node_scripts/vqa_interpreter.py | 17 ++++++-----------
 gpt4v_vqa/python/gpt4v_vqa/__init__.py    | 24 ++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/vqa_interpreter.py b/gpt4v_vqa/node_scripts/vqa_interpreter.py
index 3cc7fcd28d..3c2ff48696 100755
--- a/gpt4v_vqa/node_scripts/vqa_interpreter.py
+++ b/gpt4v_vqa/node_scripts/vqa_interpreter.py
@@ -1,23 +1,18 @@
 #!/usr/bin/env python
 
-import actionlib
 import rospy
-from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal
+
+from gpt4v_vqa import VQAClient
 
 if __name__ == "__main__":
     rospy.init_node("vqa_interpreter")
 
-    client = actionlib.SimpleActionClient("/vqa/inference_server", VQATaskAction)
+    client = VQAClient()
+    client.wait_for_server()
 
     while not rospy.is_shutdown():
         question = input("Enter a question: ")
         if question == "exit":
             break
-        goal = VQATaskGoal()
-        goal.questions = [question]
-        client.send_goal(goal)
-        if client.wait_for_result(timeout=rospy.Duration(30.0)):
-            result = client.get_result()
-            print(result)
-        else:
-            print("Timeout")
+        result = client.vqa(question)
+        print(result)
diff --git a/gpt4v_vqa/python/gpt4v_vqa/__init__.py b/gpt4v_vqa/python/gpt4v_vqa/__init__.py
index e69de29bb2..a69cee0e27 100644
--- a/gpt4v_vqa/python/gpt4v_vqa/__init__.py
+++ b/gpt4v_vqa/python/gpt4v_vqa/__init__.py
@@ -0,0 +1,24 @@
+import actionlib
+import rospy
+from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal
+
+
+class VQAClient:
+    def __init__(self, action_name="/vqa/inference_server"):
+        self.client = actionlib.SimpleActionClient(action_name, VQATaskAction)
+
+    def wait_for_server(self, timeout=10.0):
+        self.client.wait_for_server(timeout=rospy.Duration(timeout))
+
+    def vqa(self, question, image=None, timeout=30.0):
+        goal = VQATaskGoal()
+        goal.questions = [question]
+        if image is not None:
+            goal.image = image
+        self.client.send_goal(goal)
+        if self.client.wait_for_result(timeout=rospy.Duration(timeout)):
+            result = self.client.get_result()
+            return result
+        else:
+            rospy.logwarn("Timeout")
+            return None

From ff9d5c085156e3ae9eb026fe322fbb74e348b8a9 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:16:09 +0900
Subject: [PATCH 13/15] [gpt4v_vqa] add token and quota calculation

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index aafcdeef95..8e21dc658c 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -10,16 +10,15 @@
 import ros_numpy
 import rospy
 from dynamic_reconfigure.server import Server
-from jsk_recognition_msgs.msg import (
-    QuestionAndAnswerText,
-    VQATaskAction,
-    VQATaskGoal,
-    VQATaskResult,
-)
+from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
+                                      VQATaskGoal, VQATaskResult)
 from sensor_msgs.msg import Image
 
 from gpt4v_vqa.cfg import GPT4VConfig
 
+COST_PER_TOKEN_FOR_INPUT = 0.01 / 1000
+COST_PER_TOKEN_FOR_OUTPUT = 0.03 / 1000
+
 
 class GPT4VClientNode(object):
     def __init__(self, api_key: str):
@@ -113,6 +112,10 @@ def _ac_cb(self, goal: VQATaskGoal):
             result.result.result.append(
                 QuestionAndAnswerText(question=question, answer=answer)
             )
+            input_tokens = response["usage"]["prompt_tokens"]
+            output_tokens = response["usage"]["completion_tokens"]
+            rospy.loginfo(f"Tokens used for this completion: {input_tokens} for input and {output_tokens} for output.")
+            rospy.loginfo(f"This costs ${input_tokens * COST_PER_TOKEN_FOR_INPUT} for input and ${output_tokens * COST_PER_TOKEN_FOR_OUTPUT} for output.")
         if len(result.result.result) == 0:
             rospy.logerr("No answers found")
             self.ac.set_aborted(result)
             return

From a963fb97aa88b50c474f0d3765f2a86d2a581ea9 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:33:05 +0900
Subject: [PATCH 14/15] [gpt4v_vqa] update euslisp lib and add simple script

---
 gpt4v_vqa/euslisp/run_simple_vqa.l |  9 ++++++++
 gpt4v_vqa/euslisp/utils.l          | 33 ++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100755 gpt4v_vqa/euslisp/run_simple_vqa.l

diff --git a/gpt4v_vqa/euslisp/run_simple_vqa.l b/gpt4v_vqa/euslisp/run_simple_vqa.l
new file mode 100755
index 0000000000..bb724d4dbb
--- /dev/null
+++ b/gpt4v_vqa/euslisp/run_simple_vqa.l
@@ -0,0 +1,9 @@
+#!/usr/bin/env roseus
+
+(load "package://gpt4v_vqa/euslisp/utils.l")
+
+(ros::roseus "run_simple_vqa")
+
+(print "Question is \"Please describe the image briefly.\"")
+(print (format nil "Answer is ~a" (run-vqa "Please describe the image briefly.")))
+(exit)
\ No newline at end of file
diff --git a/gpt4v_vqa/euslisp/utils.l b/gpt4v_vqa/euslisp/utils.l
index e69de29bb2..3c7acd4998 100644
--- a/gpt4v_vqa/euslisp/utils.l
+++ b/gpt4v_vqa/euslisp/utils.l
@@ -0,0 +1,33 @@
+(ros::load-ros-manifest "jsk_recognition_msgs")
+
+(defparameter *vqa-action* nil)
+
+(defun init-vqa-action (&key (timeout 10))
+  (unless *vqa-action*
+    (setq *vqa-action*
+          (instance ros::simple-action-client :init
+                    "/vqa/inference_server" jsk_recognition_msgs::VQATaskAction)))
+  (send *vqa-action* :wait-for-server timeout)
+  )
+
+(defun run-vqa (question &optional msg_image &key (timeout 30))
+  "Run the VQA action client. Return the answer string, or nil on failure."
+  (if (not (init-vqa-action))
+      (return-from run-vqa nil))
+  (let* (result answer
+         (action-goal (instance jsk_recognition_msgs::VQATaskGoal :init)))
+    (send action-goal :questions (list question))
+    (if msg_image
+        (send action-goal :image msg_image)
+      )
+    (send *vqa-action* :send-goal action-goal)
+    (unless (send *vqa-action* :wait-for-result :timeout timeout)
+      (send *vqa-action* :cancel-all-goals)
+      (ros::ros-error "No result returned from /vqa action server")
+      (return-from run-vqa nil))
+    (setq result (send *vqa-action* :get-result))
+    (if (and result (> (length (send result :result :result)) 0))
+        (send (elt (send result :result :result) 0) :answer)
+      nil)
+    )
+  )
\ No newline at end of file

From 156c4eb4f2f6d5ac917f53b7091841dfff66a077 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:47:39 +0900
Subject: [PATCH 15/15] [gpt4v_vqa] update README.md

---
 gpt4v_vqa/README.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/gpt4v_vqa/README.md b/gpt4v_vqa/README.md
index 4e6586cda1..a5573726da 100644
--- a/gpt4v_vqa/README.md
+++ b/gpt4v_vqa/README.md
@@ -11,13 +11,21 @@ catkin build gpt4v_vqa
 ```
 
 ## Usage
 
 ```bash
-roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY>
+roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY> VQA_INPUT_IMAGE:=<IMAGE_TOPIC>
+```
+
+Then, from another terminal:
+
+```bash
+$ rosrun gpt4v_vqa vqa_interpreter.py
 ```
 
 ## Nodes
 
 ### gpt4v_vqa
 
+This node is a ROS wrapper for the GPT-4V model. Its behavior is similar to the [VQA node](../jsk_perception/node_scripts/vqa_node.py), with one difference: it does not support continuous inference and uses the API only when the action server receives a goal.
+
 #### Subscribed Topics
 
 * **`~image`** ([sensor_msgs/Image])
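
The patches above leave the client-side workflow implicit, so here is a minimal usage sketch for the `VQAClient` helper introduced in PATCH 12. It assumes `vqa.launch` is already running with a valid API key; the question strings and the `/camera/rgb/image_rect_color` topic name are illustrative assumptions, not part of the patch series.

```python
#!/usr/bin/env python
# Usage sketch for the VQAClient helper from PATCH 12 (not part of the
# patch series). Assumes the gpt4v_vqa node is running and its action
# server is reachable under the default "/vqa/inference_server" name.
import rospy
from sensor_msgs.msg import Image

from gpt4v_vqa import VQAClient

if __name__ == "__main__":
    rospy.init_node("vqa_client_example")
    client = VQAClient()
    client.wait_for_server()

    # Ask about the node's default image (the latest message it
    # received on its ~image topic).
    result = client.vqa("What objects are on the table?")
    if result is not None:
        for qa in result.result.result:
            print("Q: {}\nA: {}".format(qa.question, qa.answer))

    # Alternatively, attach a specific raw image to the goal (compressed
    # images are rejected by the node). The topic name below is a
    # hypothetical example.
    msg = rospy.wait_for_message("/camera/rgb/image_rect_color", Image)
    result = client.vqa("Please describe the image briefly.", image=msg)
    print(result)
```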
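The cost constants introduced in PATCH 13 can also be sanity-checked against the pricing note in the PATCH 01 docstring (300 output tokens being roughly $0.01). A small sketch with made-up token counts:

```python
# Cost arithmetic mirroring PATCH 13; the token counts below are
# hypothetical example values, not measured API usage.
COST_PER_TOKEN_FOR_INPUT = 0.01 / 1000   # USD per prompt token
COST_PER_TOKEN_FOR_OUTPUT = 0.03 / 1000  # USD per completion token

input_tokens, output_tokens = 1000, 300
cost = (input_tokens * COST_PER_TOKEN_FOR_INPUT
        + output_tokens * COST_PER_TOKEN_FOR_OUTPUT)
print(f"${cost:.3f}")  # 1000 * $0.00001 + 300 * $0.00003 -> $0.019
```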