From 3fa5f408610427abb872257bc31d5d50a257739c Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:18:49 +0900
Subject: [PATCH 01/15] [gpt4v_vqa] add gpt4v_vqa node

---
 gpt4v_vqa/CMakeLists.txt                 |  21 +++
 gpt4v_vqa/README.md                      |  31 ++++
 gpt4v_vqa/cfg/GPT4V.cfg                  |  12 ++
 gpt4v_vqa/launch/vqa.launch              |  12 ++
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 177 +++++++++++++++++++++++
 gpt4v_vqa/package.xml                    |  27 ++++
 gpt4v_vqa/requirements.txt               |   4 +
 7 files changed, 284 insertions(+)
 create mode 100644 gpt4v_vqa/CMakeLists.txt
 create mode 100644 gpt4v_vqa/README.md
 create mode 100644 gpt4v_vqa/cfg/GPT4V.cfg
 create mode 100644 gpt4v_vqa/launch/vqa.launch
 create mode 100644 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
 create mode 100644 gpt4v_vqa/package.xml
 create mode 100644 gpt4v_vqa/requirements.txt

diff --git a/gpt4v_vqa/CMakeLists.txt b/gpt4v_vqa/CMakeLists.txt
new file mode 100644
index 0000000000..8336d59532
--- /dev/null
+++ b/gpt4v_vqa/CMakeLists.txt
@@ -0,0 +1,21 @@
+cmake_minimum_required(VERSION 3.0.2)
+project(gpt4v_vqa)
+
+find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)
+
+catkin_generate_virtualenv(
+  PYTHON_INTERPRETER python3.8
+  USE_SYSTEM_PACKAGES FALSE
+  CHECK_VENV FALSE
+)
+
+generate_dynamic_reconfigure_options(
+  cfg/GPT4V.cfg
+)
+
+catkin_package()
+
+catkin_install_python(
+  PROGRAMS
+  node_scripts/gpt4v_vqa_node.py
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
\ No newline at end of file
diff --git a/gpt4v_vqa/README.md b/gpt4v_vqa/README.md
new file mode 100644
index 0000000000..4e6586cda1
--- /dev/null
+++ b/gpt4v_vqa/README.md
@@ -0,0 +1,31 @@
+# GPT4V VQA
+
+This package provides a ROS node for visual question answering (VQA) with the GPT-4V model.
+
+## Installation
+
+```bash
+catkin build gpt4v_vqa
+```
+
+## Usage
+
+```bash
+roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY>
+```
+
+## Nodes
+
+### gpt4v_vqa
+
+#### Subscribed Topics
+
+* **`~image`** ([sensor_msgs/Image])
+
+  The default image used for VQA when the action goal does not contain one.
+
+#### Action Servers
+
+* **`~inference_server`** ([jsk_recognition_msgs/VQATaskAction])
+
+  The action server for VQA.
\ No newline at end of file
diff --git a/gpt4v_vqa/cfg/GPT4V.cfg b/gpt4v_vqa/cfg/GPT4V.cfg
new file mode 100644
index 0000000000..21c4bf3431
--- /dev/null
+++ b/gpt4v_vqa/cfg/GPT4V.cfg
@@ -0,0 +1,12 @@
+#!/usr/bin/env python
+PACKAGE = "gpt4v_vqa"
+
+from dynamic_reconfigure.parameter_generator_catkin import *
+
+gen = ParameterGenerator()
+
+gen.add("max_height", int_t, 0, "Maximum image height", 480, 0, 1080)
+gen.add("max_width", int_t, 0, "Maximum image width", 640, 0, 1920)
+gen.add("detail_level", str_t, 0, "Detail level of GPT4V API", "low")
+
+exit(gen.generate(PACKAGE, "gpt4v_vqa", "GPT4V"))
\ No newline at end of file
diff --git a/gpt4v_vqa/launch/vqa.launch b/gpt4v_vqa/launch/vqa.launch
new file mode 100644
index 0000000000..24ea32641f
--- /dev/null
+++ b/gpt4v_vqa/launch/vqa.launch
@@ -0,0 +1,12 @@
+<launch>
+  <arg name="openai_api_key" />
+  <arg name="VQA_INPUT_IMAGE" default="image" />
+
+  <node name="vqa" pkg="gpt4v_vqa" type="gpt4v_vqa_node.py"
+        output="screen">
+    <remap from="~image" to="$(arg VQA_INPUT_IMAGE)" />
+    <rosparam subst_value="true">
+      api_key: $(arg openai_api_key)
+    </rosparam>
+  </node>
+</launch>
\ No newline at end of file
diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
new file mode 100644
index 0000000000..4c66f2612f
--- /dev/null
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+
+import base64
+from typing import Dict, Optional
+
+import actionlib
+import cv2
+import numpy as np
+import requests
+import rospy
+from cv_bridge import CvBridge
+from dynamic_reconfigure.server import Server
+from sensor_msgs.msg import Image
+
+from gpt4v_vqa.cfg import GPT4VConfig
+from jsk_recognition_msgs.msg import (
+    QuestionAndAnswerText,
+    VQATaskAction,
+    VQATaskGoal,
+    VQATaskResult,
+)
+
+
+class GPT4VClientNode(object):
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        # Configuration
+        self.max_height: int = 480
+        self.max_width: int = 640
+        self.detail_level: str = "low"
+        # Node variables
+        self.default_img: Optional[np.ndarray] = None
+        self.sub = rospy.Subscriber("~image", Image, self._image_cb)
+        self.param_srv = Server(GPT4VConfig, self.config_cb)
+        self.ac = actionlib.SimpleActionServer(
+            "~inference_server", VQATaskAction, self._ac_cb, auto_start=False
+        )
+        self.ac.start()
+
+    def config_cb(self, config, level):
+        """Dynamic reconfigure callback"""
+        self.set_max_size(config["max_height"], config["max_width"])
+        self.detail_level = config["detail_level"]
+        return config
+
+    def set_max_size(self, max_height: int, max_width: int):
+        """Set max size of image to send to API
+
+        Args:
+            max_height (int): max height
+            max_width (int): max width
+        """
+        self.max_height = max_height
+        self.max_width = max_width
+
+    def resize_image(self, image: np.ndarray) -> np.ndarray:
+        """Resize image to maximum size configuration
+
+        Args:
+            image (np.ndarray): image to resize
+
+        Returns:
+            np.ndarray: resized image
+        """
+        height, width, num_channel = image.shape
+        if height > self.max_height or width > self.max_width:
+            scale = min(self.max_height / height, self.max_width / width)
+            image = cv2.resize(
+                image,
+                (int(width * scale), int(height * scale), num_channel),
+                interpolation=cv2.INTER_AREA,
+            )
+        return image
+
+    def _image_cb(self, msg: Image):
+        image = CvBridge().imgmsg_to_cv2(msg)
+        self.default_img = image
+
+    def _ac_cb(self, goal: VQATaskGoal):
+        """Action callback
+
+        Args:
+            goal (VQATaskGoal): action goal
+        """
+        rospy.loginfo("Received goal")
+        result = VQATaskResult()
+
+        if goal.image is not None:
+            image = CvBridge().imgmsg_to_cv2(goal.image)
+        elif goal.compressed_image is not None:
+            image = CvBridge().compressed_imgmsg_to_cv2(goal.compressed_image)
+        else:
+            if self.default_img is not None:
+                image = self.default_img
+            else:
+                rospy.logerr("Image is empty")
+                self.ac.set_aborted(result)
+                return
+        image = self.resize_image(image)
+        for question in goal.questions:
+            response = self._get_multimodal_response(question, image)
+            if response is None:
+                rospy.logerr(f"Failed to get response from question {question}")
+                continue
+            if "choices" not in response or len(response["choices"]) == 0:
+                rospy.logerr(f"No choices in response: {response}")
+                continue
+            answer = response["choices"][0]["message"]["content"]
+            result.result.results.append(
+                QuestionAndAnswerText(question=question, answer=answer)
+            )
+        if len(result.result.results) == 0:
+            rospy.logerr("No answers found")
+            self.ac.set_aborted(result)
+            return
+        else:
+            self.ac.set_succeeded(result)
+
+    def _get_multimodal_response(
+        self,
+        question: str,
+        image: np.ndarray,
+        max_tokens: int = 300,
+        detail: str = "low",
+    ) -> Optional[Dict]:
+        """Get response from GPT-4-Vision API
+
+        Args:
+            question (str): question to ask
+            image (np.ndarray): image to ask question about
+            max_tokens (int, optional): max tokens to use for output. Defaults to 300, which is about $0.01 as of 2024-01-09. (See https://openai.com/pricing)
+            detail (str, optional): detail level. Defaults to "low". See https://platform.openai.com/docs/guides/vision/managing-images for details.
+
+        Returns:
+            Dict: response from API"""
+        base64_image = base64.b64encode(cv2.imencode(".jpg", image)[1]).decode("utf-8")
+        headers = {
+            "Content-Type": "application/json",
+            "Authorization": f"Bearer {self.api_key}",
+        }
+
+        payload = {
+            "model": "gpt-4-vision-preview",
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": question},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/jpeg;base64,{base64_image}",
+                                "detail": detail,
+                            },
+                        },
+                    ],
+                }
+            ],
+            "max_tokens": max_tokens,
+        }
+        try:
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=headers,
+                json=payload,
+            )
+            return response.json()
+        except requests.exceptions.RequestException as e:
+            rospy.logerr(e)
+            return None
+
+
+if __name__ == "__main__":
+    rospy.init_node("vqa")
+    api_key = rospy.get_param("~api_key")
+    GPT4VClientNode(api_key)
+    rospy.spin()
diff --git a/gpt4v_vqa/package.xml b/gpt4v_vqa/package.xml
new file mode 100644
index 0000000000..3d53e11c7e
--- /dev/null
+++ b/gpt4v_vqa/package.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+<package format="2">
+  <name>gpt4v_vqa</name>
+  <version>1.2.17</version>
+  <description>The gpt4v_vqa package</description>
+
+  <maintainer>Koki Shinjo</maintainer>
+  <maintainer>Kei Okada</maintainer>
+
+  <license>BSD</license>
+
+  <buildtool_depend>catkin</buildtool_depend>
+
+  <build_depend>catkin_virtualenv</build_depend>
+  <build_depend>actionlib</build_depend>
+  <build_depend>dynamic_reconfigure</build_depend>
+  <build_depend>jsk_recognition_msgs</build_depend>
+
+  <exec_depend>catkin_virtualenv</exec_depend>
+  <exec_depend>actionlib</exec_depend>
+  <exec_depend>dynamic_reconfigure</exec_depend>
+  <exec_depend>jsk_recognition_msgs</exec_depend>
+
+  <export>
+    <pip_requirements>requirements.txt</pip_requirements>
+  </export>
+</package>
\ No newline at end of file
diff --git a/gpt4v_vqa/requirements.txt b/gpt4v_vqa/requirements.txt
new file mode 100644
index 0000000000..8b33391093
--- /dev/null
+++ b/gpt4v_vqa/requirements.txt
@@ -0,0 +1,4 @@
+requests
+openai
+opencv-python
+cvbridge3
\ No newline at end of file

From 786d963a0aa0278741041903a63eb492a3d4f6df Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:25:58 +0900
Subject: [PATCH 02/15] [gpt4v_vqa] fix bug

---
 gpt4v_vqa/launch/vqa.launch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpt4v_vqa/launch/vqa.launch b/gpt4v_vqa/launch/vqa.launch
index 24ea32641f..9e508b308a 100644
--- a/gpt4v_vqa/launch/vqa.launch
+++ b/gpt4v_vqa/launch/vqa.launch
@@ -1,6 +1,6 @@
-
+

From 7d18b1988c846bc3ddebf5919516f65094c7be70 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:33:25 +0900
Subject: [PATCH 03/15] [gpt4v_vqa] bugfix for catkin_virtualenv
---
 gpt4v_vqa/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/gpt4v_vqa/CMakeLists.txt b/gpt4v_vqa/CMakeLists.txt
index 8336d59532..4d75fe8ea2 100644
--- a/gpt4v_vqa/CMakeLists.txt
+++ b/gpt4v_vqa/CMakeLists.txt
@@ -6,6 +6,7 @@ find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)
 catkin_generate_virtualenv(
   PYTHON_INTERPRETER python3.8
   USE_SYSTEM_PACKAGES FALSE
+  ISOLATE_REQUIREMENTS TRUE
   CHECK_VENV FALSE
 )
 
@@ -18,4 +19,5 @@ catkin_package()
 catkin_install_python(
   PROGRAMS
   node_scripts/gpt4v_vqa_node.py
-  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION})
\ No newline at end of file
+  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
+)
\ No newline at end of file

From 063d9043d735e220c457761930fe02fd59b34a7a Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:40:28 +0900
Subject: [PATCH 04/15] [gpt4v_vqa] use ros_numpy and add node#

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py  | 18 ++++++++----------
 gpt4v_vqa/node_scripts/vqa_interpreter.py | 23 +++++++++++++++++++++++
 2 files changed, 31 insertions(+), 10 deletions(-)
 create mode 100755 gpt4v_vqa/node_scripts/vqa_interpreter.py

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 4c66f2612f..3776498571 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -7,18 +7,14 @@
 import cv2
 import numpy as np
 import requests
+import ros_numpy
 import rospy
-from cv_bridge import CvBridge
 from dynamic_reconfigure.server import Server
 from sensor_msgs.msg import Image
 
 from gpt4v_vqa.cfg import GPT4VConfig
-from jsk_recognition_msgs.msg import (
-    QuestionAndAnswerText,
-    VQATaskAction,
-    VQATaskGoal,
-    VQATaskResult,
-)
+from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
+                                      VQATaskGoal, VQATaskResult)
 
 
 class GPT4VClientNode(object):
@@ -73,7 +69,7 @@ def resize_image(self, image: np.ndarray) -> np.ndarray:
         return image
 
     def _image_cb(self, msg: Image):
-        image = CvBridge().imgmsg_to_cv2(msg)
+        image = ros_numpy.numpify(msg)
         self.default_img = image
 
     def _ac_cb(self, goal: VQATaskGoal):
@@ -86,9 +82,11 @@ def _ac_cb(self, goal: VQATaskGoal):
         result = VQATaskResult()
 
         if goal.image is not None:
-            image = CvBridge().imgmsg_to_cv2(goal.image)
+            image = ros_numpy.numpify(goal.image)
         elif goal.compressed_image is not None:
-            image = CvBridge().compressed_imgmsg_to_cv2(goal.compressed_image)
+            rospy.logerr(f"Compressed image is not supported.")
+            self.ac.set_aborted(result)
+            return
         else:
             if self.default_img is not None:
                 image = self.default_img
diff --git a/gpt4v_vqa/node_scripts/vqa_interpreter.py b/gpt4v_vqa/node_scripts/vqa_interpreter.py
new file mode 100755
index 0000000000..6ac3473975
--- /dev/null
+++ b/gpt4v_vqa/node_scripts/vqa_interpreter.py
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import actionlib
+import rospy
+from jsk_recognition.msg import VQATaskAction, VQATaskGoal
+
+if __name__ == "__main__":
+    rospy.init_node("vqa_interpreter")
+
+    client = actionlib.SimpleActionClient("/vqa/inference_server", VQATaskAction)
+
+    while not rospy.is_shutdown():
+        question = input("Enter a question: ")
+        if question == "exit":
+            break
+        goal = VQATaskGoal()
+        goal.questions = [question]
+        client.send_goal(goal)
+        if client.wait_for_result(timeout=rospy.Duration(30.0)):
+            result = client.get_result()
+            print(result)
+        else:
+            print("Timeout")

From 36348008191fd10f7ba929129b932e3d89cf615c Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:42:23 +0900
Subject: [PATCH 05/15] [gpt4v_vqa] update requirements

---
 gpt4v_vqa/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpt4v_vqa/requirements.txt b/gpt4v_vqa/requirements.txt
index 8b33391093..0df5dd2bf4 100644
--- a/gpt4v_vqa/requirements.txt
+++ b/gpt4v_vqa/requirements.txt
@@ -1,4 +1,4 @@
 requests
 openai
 opencv-python
-cvbridge3
\ No newline at end of file
+rosnumpy
\ No newline at end of file

From c03dcdf6f5a42b9af03e2f7113dffbbe8e4702f6 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:49:19 +0900
Subject: [PATCH 06/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/vqa_interpreter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gpt4v_vqa/node_scripts/vqa_interpreter.py b/gpt4v_vqa/node_scripts/vqa_interpreter.py
index 6ac3473975..3cc7fcd28d 100755
--- a/gpt4v_vqa/node_scripts/vqa_interpreter.py
+++ b/gpt4v_vqa/node_scripts/vqa_interpreter.py
@@ -2,7 +2,7 @@
 
 import actionlib
 import rospy
-from jsk_recognition.msg import VQATaskAction, VQATaskGoal
+from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal
 
 if __name__ == "__main__":
     rospy.init_node("vqa_interpreter")

From 454f4f6843d0c7896be23a4e85fea88a8765ac11 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:52:57 +0900
Subject: [PATCH 07/15] [gpt4v_vqa] add euslisp and python utils

---
 gpt4v_vqa/CMakeLists.txt               | 2 ++
 gpt4v_vqa/euslisp/utils.l              | 0
 gpt4v_vqa/python/gpt4v_vqa/__init__.py | 0
 gpt4v_vqa/setup.py                     | 7 +++++++
 4 files changed, 9 insertions(+)
 create mode 100644 gpt4v_vqa/euslisp/utils.l
 create mode 100644 gpt4v_vqa/python/gpt4v_vqa/__init__.py
 create mode 100644 gpt4v_vqa/setup.py

diff --git a/gpt4v_vqa/CMakeLists.txt b/gpt4v_vqa/CMakeLists.txt
index 4d75fe8ea2..aab6dc26c1 100644
--- a/gpt4v_vqa/CMakeLists.txt
+++ b/gpt4v_vqa/CMakeLists.txt
@@ -3,6 +3,8 @@ project(gpt4v_vqa)
 
 find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)
 
+catkin_python_setup()
+
 catkin_generate_virtualenv(
   PYTHON_INTERPRETER python3.8
   USE_SYSTEM_PACKAGES FALSE
diff --git a/gpt4v_vqa/euslisp/utils.l b/gpt4v_vqa/euslisp/utils.l
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpt4v_vqa/python/gpt4v_vqa/__init__.py b/gpt4v_vqa/python/gpt4v_vqa/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/gpt4v_vqa/setup.py b/gpt4v_vqa/setup.py
new file mode 100644
index 0000000000..12dc413107
--- /dev/null
+++ b/gpt4v_vqa/setup.py
@@ -0,0 +1,7 @@
+from distutils.core import setup
+
+from catkin_pkg.python_setup import generate_distutils_setup
+
+d = generate_distutils_setup(packages=["gpt4v_vqa"], package_dir={"": "python"})
+
+setup(**d)

From b9486cb6b8011710bbfab25ed247c8dca67384a7 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:53:15 +0900
Subject: [PATCH 08/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 3776498571..4c3eb21667 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -10,11 +10,15 @@
 import ros_numpy
 import rospy
 from dynamic_reconfigure.server import Server
+from jsk_recognition_msgs.msg import (
+    QuestionAndAnswerText,
+    VQATaskAction,
+    VQATaskGoal,
+    VQATaskResult,
+)
 from sensor_msgs.msg import Image
 
 from gpt4v_vqa.cfg import GPT4VConfig
-from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
-                                      VQATaskGoal, VQATaskResult)
 
 
 class GPT4VClientNode(object):
     def __init__(self, api_key: str):
@@ -81,9 +85,9 @@ def _ac_cb(self, goal: VQATaskGoal):
         rospy.loginfo("Received goal")
         result = VQATaskResult()
 
-        if goal.image is not None:
+        if len(goal.image.data) > 0:
             image = ros_numpy.numpify(goal.image)
-        elif goal.compressed_image is not None:
+        elif len(goal.compressed_image.data) > 0:
             rospy.logerr(f"Compressed image is not supported.")
             self.ac.set_aborted(result)
             return

From 2dbb8def2f9dd020e0ce4a2bc672abcdcc9a6f98 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:56:13 +0900
Subject: [PATCH 09/15] [gpt4v_vqa] update requirements

---
 gpt4v_vqa/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/gpt4v_vqa/requirements.txt b/gpt4v_vqa/requirements.txt
index 0df5dd2bf4..6e98da90e8 100644
--- a/gpt4v_vqa/requirements.txt
+++ b/gpt4v_vqa/requirements.txt
@@ -1,4 +1,5 @@
 requests
 openai
+numpy==1.19.5
 opencv-python
 rosnumpy
\ No newline at end of file

From bb4b28e02ae9dba562e8553a83e5e8214a72d69a Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:56:18 +0900
Subject: [PATCH 10/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 4c3eb21667..1c20d794fa 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -67,7 +67,7 @@ def resize_image(self, image: np.ndarray) -> np.ndarray:
             scale = min(self.max_height / height, self.max_width / width)
             image = cv2.resize(
                 image,
-                (int(width * scale), int(height * scale), num_channel),
+                (int(width * scale), int(height * scale)),
                 interpolation=cv2.INTER_AREA,
             )
         return image
@@ -108,10 +108,10 @@ def _ac_cb(self, goal: VQATaskGoal):
                 rospy.logerr(f"No choices in response: {response}")
                 continue
             answer = response["choices"][0]["message"]["content"]
-            result.result.results.append(
+            result.result.result.append(
                 QuestionAndAnswerText(question=question, answer=answer)
             )
-        if len(result.result.results) == 0:
+        if len(result.result.result) == 0:
             rospy.logerr("No answers found")
             self.ac.set_aborted(result)
             return

From 1383e369ee9c1100d113e65ff7f3f0ecf57ac99b Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 22:58:52 +0900
Subject: [PATCH 11/15] [gpt4v_vqa] bugfix

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index 1c20d794fa..aafcdeef95 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -74,6 +74,7 @@ def resize_image(self, image: np.ndarray) -> np.ndarray:
 
     def _image_cb(self, msg: Image):
         image = ros_numpy.numpify(msg)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         self.default_img = image
 
     def _ac_cb(self, goal: VQATaskGoal):
@@ -87,6 +88,7 @@ def _ac_cb(self, goal: VQATaskGoal):
 
         if len(goal.image.data) > 0:
             image = ros_numpy.numpify(goal.image)
+            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
         elif len(goal.compressed_image.data) > 0:
             rospy.logerr(f"Compressed image is not supported.")
             self.ac.set_aborted(result)

From 8c3ecde104a9dc53674bf56c0792fa3c06f7345d Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:10:40 +0900
Subject: [PATCH 12/15] [gpt4v_vqa] add VQAClient lib

---
 gpt4v_vqa/node_scripts/vqa_interpreter.py | 17 ++++++-----------
 gpt4v_vqa/python/gpt4v_vqa/__init__.py    | 24 ++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/vqa_interpreter.py b/gpt4v_vqa/node_scripts/vqa_interpreter.py
index 3cc7fcd28d..3c2ff48696 100755
--- a/gpt4v_vqa/node_scripts/vqa_interpreter.py
+++ b/gpt4v_vqa/node_scripts/vqa_interpreter.py
@@ -1,23 +1,18 @@
 #!/usr/bin/env python
 
-import actionlib
 import rospy
-from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal
+
+from gpt4v_vqa import VQAClient
 
 if __name__ == "__main__":
     rospy.init_node("vqa_interpreter")
 
-    client = actionlib.SimpleActionClient("/vqa/inference_server", VQATaskAction)
+    client = VQAClient()
+    client.wait_for_server()
 
     while not rospy.is_shutdown():
         question = input("Enter a question: ")
         if question == "exit":
             break
-        goal = VQATaskGoal()
-        goal.questions = [question]
-        client.send_goal(goal)
-        if client.wait_for_result(timeout=rospy.Duration(30.0)):
-            result = client.get_result()
-            print(result)
-        else:
-            print("Timeout")
+        result = client.vqa(question)
+        print(result)
diff --git a/gpt4v_vqa/python/gpt4v_vqa/__init__.py b/gpt4v_vqa/python/gpt4v_vqa/__init__.py
index e69de29bb2..a69cee0e27 100644
--- a/gpt4v_vqa/python/gpt4v_vqa/__init__.py
+++ b/gpt4v_vqa/python/gpt4v_vqa/__init__.py
@@ -0,0 +1,24 @@
+import actionlib
+import rospy
+from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal
+
+
+class VQAClient:
+    def __init__(self, action_name="/vqa/inference_server"):
+        self.client = actionlib.SimpleActionClient(action_name, VQATaskAction)
+
+    def wait_for_server(self, timeout=10.0):
+        self.client.wait_for_server(timeout=rospy.Duration(timeout))
+
+    def vqa(self, question, image=None, timeout=30.0):
+        goal = VQATaskGoal()
+        goal.questions = [question]
+        if image is not None:
+            goal.image = image
+        self.client.send_goal(goal)
+        if self.client.wait_for_result(timeout=rospy.Duration(timeout)):
+            result = self.client.get_result()
+            return result
+        else:
+            rospy.logwarn("Timeout")
+            return None

From ff9d5c085156e3ae9eb026fe322fbb74e348b8a9 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:16:09 +0900
Subject: [PATCH 13/15] [gpt4v_vqa] add token and quota calculation

---
 gpt4v_vqa/node_scripts/gpt4v_vqa_node.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
index aafcdeef95..8e21dc658c 100644
--- a/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
+++ b/gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
@@ -10,16 +10,15 @@
 import ros_numpy
 import rospy
 from dynamic_reconfigure.server import Server
-from jsk_recognition_msgs.msg import (
-    QuestionAndAnswerText,
-    VQATaskAction,
-    VQATaskGoal,
-    VQATaskResult,
-)
+from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
+                                      VQATaskGoal, VQATaskResult)
 from sensor_msgs.msg import Image
 
 from gpt4v_vqa.cfg import GPT4VConfig
 
+COST_PER_TOKEN_FOR_INPUT = 0.01 / 1000
+COST_PER_TOKEN_FOR_OUTPUT = 0.03 / 1000
+
 
 class GPT4VClientNode(object):
     def __init__(self, api_key: str):
@@ -113,6 +112,10 @@ def _ac_cb(self, goal: VQATaskGoal):
             result.result.result.append(
                 QuestionAndAnswerText(question=question, answer=answer)
             )
+            input_tokens = response["usage"]["prompt_tokens"]
+            output_tokens = response["usage"]["completion_tokens"]
+            rospy.loginfo(f"Tokens used for this completion: {input_tokens} for input and {output_tokens} for output.")
+            rospy.loginfo(f"This costs ${input_tokens * COST_PER_TOKEN_FOR_INPUT} for input and ${output_tokens * COST_PER_TOKEN_FOR_OUTPUT} for output.")
         if len(result.result.result) == 0:
             rospy.logerr("No answers found")
             self.ac.set_aborted(result)
             return

From a963fb97aa88b50c474f0d3765f2a86d2a581ea9 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:33:05 +0900
Subject: [PATCH 14/15] [gpt4v_vqa] update euslisp lib and add simple script

---
 gpt4v_vqa/euslisp/run_simple_vqa.l |  9 ++++++++
 gpt4v_vqa/euslisp/utils.l          | 33 ++++++++++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100755 gpt4v_vqa/euslisp/run_simple_vqa.l

diff --git a/gpt4v_vqa/euslisp/run_simple_vqa.l b/gpt4v_vqa/euslisp/run_simple_vqa.l
new file mode 100755
index 0000000000..bb724d4dbb
--- /dev/null
+++ b/gpt4v_vqa/euslisp/run_simple_vqa.l
@@ -0,0 +1,9 @@
+#!/usr/bin/env roseus
+
+(load "package://gpt4v_vqa/euslisp/utils.l")
+
+(ros::roseus "run_simple_vqa")
+
+(print "Question is \"Please describe the image briefly.\"")
+(print (format nil "Answer is ~a" (run-vqa "Please describe the image briefly.")))
+(exit)
\ No newline at end of file
diff --git a/gpt4v_vqa/euslisp/utils.l b/gpt4v_vqa/euslisp/utils.l
index e69de29bb2..3c7acd4998 100644
--- a/gpt4v_vqa/euslisp/utils.l
+++ b/gpt4v_vqa/euslisp/utils.l
@@ -0,0 +1,33 @@
+(ros::load-ros-manifest "jsk_recognition_msgs")
+
+(defparameter *vqa-action* nil)
+
+(defun init-vqa-action (&key (timeout 10))
+  (unless *vqa-action*
+    (setq *vqa-action*
+          (instance ros::simple-action-client :init
+                    "/vqa/inference_server" jsk_recognition_msgs::VQATaskAction)))
+  (send *vqa-action* :wait-for-server timeout)
+  )
+
+(defun run-vqa (question &optional msg_image &key (timeout 30))
+  "Run the VQA action client. Return the answer string, or nil on failure."
+  (if (not (init-vqa-action))
+      (return-from run-vqa nil))
+  (let* (result answer
+         (action-goal (instance jsk_recognition_msgs::VQATaskGoal :init)))
+    (send action-goal :questions (list question))
+    (if msg_image
+        (send action-goal :image msg_image)
+      )
+    (send *vqa-action* :send-goal action-goal)
+    (unless (send *vqa-action* :wait-for-result :timeout timeout)
+      (send *vqa-action* :cancel-all-goals)
+      (ros::ros-error "No result returned from /vqa action server")
+      (return-from run-vqa nil))
+    (setq result (send *vqa-action* :get-result))
+    (if (and result (> (length (send result :result :result)) 0))
+        (send (elt (send result :result :result) 0) :answer)
+      nil)
+    )
+  )
\ No newline at end of file

From 156c4eb4f2f6d5ac917f53b7091841dfff66a077 Mon Sep 17 00:00:00 2001
From: Koki Shinjo
Date: Tue, 9 Jan 2024 23:47:39 +0900
Subject: [PATCH 15/15] [gpt4v_vqa] update README.md

---
 gpt4v_vqa/README.md | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/gpt4v_vqa/README.md b/gpt4v_vqa/README.md
index 4e6586cda1..a5573726da 100644
--- a/gpt4v_vqa/README.md
+++ b/gpt4v_vqa/README.md
@@ -11,13 +11,21 @@ catkin build gpt4v_vqa
 ```
 
 ## Usage
 
 ```bash
-roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY>
+roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY> VQA_INPUT_IMAGE:=<IMAGE_TOPIC>
+```
+
+Then, from another terminal:
+
+```bash
+$ rosrun gpt4v_vqa vqa_interpreter.py
 ```
 
 ## Nodes
 
 ### gpt4v_vqa
 
+This node is a ROS wrapper for the GPT-4V model. Its behavior is similar to the [VQA node](../jsk_perception/node_scripts/vqa_node.py), with one difference: it does not support continuous inference and uses the API only when the action server receives a goal.
+
 #### Subscribed Topics
 
 * **`~image`** ([sensor_msgs/Image])
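
The patches above leave the client-side workflow implicit, so here is a minimal usage sketch for the `VQAClient` helper introduced in PATCH 12. It assumes `vqa.launch` is already running with a valid API key; the question strings and the `/camera/rgb/image_rect_color` topic name are illustrative assumptions, not part of the patch series.

```python
#!/usr/bin/env python
# Usage sketch for the VQAClient helper from PATCH 12 (not part of the
# patch series). Assumes the gpt4v_vqa node is running and its action
# server is reachable under the default "/vqa/inference_server" name.
import rospy
from sensor_msgs.msg import Image

from gpt4v_vqa import VQAClient

if __name__ == "__main__":
    rospy.init_node("vqa_client_example")
    client = VQAClient()
    client.wait_for_server()

    # Ask about the node's default image (the latest message it
    # received on its ~image topic).
    result = client.vqa("What objects are on the table?")
    if result is not None:
        for qa in result.result.result:
            print("Q: {}\nA: {}".format(qa.question, qa.answer))

    # Alternatively, attach a specific raw image to the goal (compressed
    # images are rejected by the node). The topic name below is a
    # hypothetical example.
    msg = rospy.wait_for_message("/camera/rgb/image_rect_color", Image)
    result = client.vqa("Please describe the image briefly.", image=msg)
    print(result)
```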
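The cost constants introduced in PATCH 13 can also be sanity-checked against the pricing note in the PATCH 01 docstring (300 output tokens being roughly $0.01). A small sketch with made-up token counts:

```python
# Cost arithmetic mirroring PATCH 13; the token counts below are
# hypothetical example values, not measured API usage.
COST_PER_TOKEN_FOR_INPUT = 0.01 / 1000   # USD per prompt token
COST_PER_TOKEN_FOR_OUTPUT = 0.03 / 1000  # USD per completion token

input_tokens, output_tokens = 1000, 300
cost = (input_tokens * COST_PER_TOKEN_FOR_INPUT
        + output_tokens * COST_PER_TOKEN_FOR_OUTPUT)
print(f"${cost:.3f}")  # 1000 * $0.00001 + 300 * $0.00003 -> $0.019
```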