Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GPT4V VQA package #2812

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions gpt4v_vqa/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.0.2)
project(gpt4v_vqa)

find_package(catkin REQUIRED COMPONENTS catkin_virtualenv dynamic_reconfigure)

catkin_python_setup()

catkin_generate_virtualenv(
PYTHON_INTERPRETER python3.8
USE_SYSTEM_PACKAGES FALSE
ISOLATE_REQUIREMENTS TRUE
CHECK_VENV FALSE
)

generate_dynamic_reconfigure_options(
cfg/GPT4V.cfg
)

catkin_package()

catkin_install_python(
PROGRAMS
node_scripts/gpt4v_vqa_node.py
DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)
39 changes: 39 additions & 0 deletions gpt4v_vqa/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# GPT4V VQA

This repository offers a ROS Node with GPT4V model.

## Installation

```bash
catkin build gpt4v_vqa
```

## Usage

```bash
roslaunch gpt4v_vqa vqa.launch api_key:=<YOUR_API_KEY> VQA_INPUT_IMAGE:=<IMAGE TOPIC>
```

And from other terminal

```bash
$ rosrun gpt4v_vqa vqa_interpreter.py
```

## Nodes

### gpt4v_vqa

This node is a ROS wrapper for GPT4V model. Its behavior is similar to [VQA node](../jsk_perception/node_scripts/vqa_node.py). But there is a difference that this node does not support continuous inference. This node use API only when action server is called.

#### Subscribed Topics

* **`~image`** ([sensor_msgs/Image])

The image used for VQA as default image.

#### Action Servers

* **`~inference_server`** ([jsk_recognition_msgs/VQATaskAction])

The action server for VQA.
12 changes: 12 additions & 0 deletions gpt4v_vqa/cfg/GPT4V.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env python
PACKAGE = "gpt4v_vqa"

from dynamic_reconfigure.parameter_generator_catkin import *

gen = ParameterGenerator()

gen.add("max_height", int_t, 0, "Maximum image height", 480, 0, 1080)
gen.add("max_width", int_t, 0, "Maximum image width", 640, 0, 1920)
gen.add("detail_level", str_t, 0, "Detail level of GPT4V API", "low")

exit(gen.generate(PACKAGE, "gpt4v_vqa", "GPT4V"))
9 changes: 9 additions & 0 deletions gpt4v_vqa/euslisp/run_simple_vqa.l
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/usr/bin/env roseus

(load "package://gpt4v_vqa/euslisp/utils.l")

(ros::roseus "run_simple_vqa")

(print "Question is \"Please describe the image briefly.\"")
(print (format nil "Answer is ~a" (run-vqa "Please describe the image briefly.")))
(exit)
33 changes: 33 additions & 0 deletions gpt4v_vqa/euslisp/utils.l
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
(ros::load-ros-manifest "jsk_recognition_msgs")

(defparameter *vqa-action* nil)

(defun init-vqa-action (&key (timeout 10))
(unless *vqa-action*
(setq *vqa-action*
(instance ros::simple-action-client :init
"/vqa/inference_server" jsk_recognition_msgs::VQATaskAction)))
(send *vqa-action* :wait-for-server timeout)
)

(defun run-vqa (question &optional msg_image &key (timeout 30))
"run vqa action client. return answer string or nil if failed."
(if (not (init-vqa-action))
(return-from run-vqa nil))
(let* (result answer
(action-goal (instance jsk_recognition_msgs::VQATaskGoal :init)))
(send action-goal :questions (list question))
(if msg_image
(send action-goal :image msg_image)
)
(send *vqa-action* :send-goal action-goal)
(unless (send *vqa-action* :wait-for-result :timeout timeout)
(send *vqa-action* :cancel-all-goals)
(ros::ros-error "No result returned from /vqa action server")
(return-from run-vqa nil))
(setq result (send *vqa-action* :get-result))
(if (and result (> (length (send result :result :result)) 0))
(send (elt (send result :result :result) 0) :answer)
nil)
)
)
12 changes: 12 additions & 0 deletions gpt4v_vqa/launch/vqa.launch
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<launch>
<arg name="openai_api_key" default="$(optenv OPENAI_API_KEY)" />
<arg name="VQA_INPUT_IMAGE" default="vqa_image" />

<node name="vqa" pkg="gpt4v_vqa" type="gpt4v_vqa_node.py" output="screen">
<remap from="~image" to="$(arg VQA_INPUT_IMAGE)" />
<rosparam subst_value="true">
api_key: $(arg openai_api_key)
</rosparam>
</node>
</launch>
184 changes: 184 additions & 0 deletions gpt4v_vqa/node_scripts/gpt4v_vqa_node.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
#!/usr/bin/env python

import base64
from typing import Dict, Optional

import actionlib
import cv2
import numpy as np
import requests
import ros_numpy
import rospy
from dynamic_reconfigure.server import Server
from jsk_recognition_msgs.msg import (QuestionAndAnswerText, VQATaskAction,
VQATaskGoal, VQATaskResult)
from sensor_msgs.msg import Image

from gpt4v_vqa.cfg import GPT4VConfig

COST_PER_TOKEN_FOR_INPUT = 0.01 / 1000
COST_PER_TOKEN_FOR_OUTPUT = 0.03 / 1000


class GPT4VClientNode(object):
def __init__(self, api_key: str):
self.api_key = api_key
# Configuration
self.max_height: int = 480
self.max_width: int = 640
self.detail_level: str = "low"
# Node variables
self.default_img: Optional[np.ndarray] = None
self.sub = rospy.Subscriber("~image", Image, self._image_cb)
self.param_srv = Server(GPT4VConfig, self.config_cb)
self.ac = actionlib.SimpleActionServer(
"~inference_server", VQATaskAction, self._ac_cb, auto_start=False
)
self.ac.start()

def config_cb(self, config, level):
"""Dynamic reconfigure callback"""
self.set_max_size(config["max_height"], config["max_width"])
self.detail_level = config["detail_level"]
return config

def set_max_size(self, max_height: int, max_width: int):
"""Set max size of image to send to API

Args:
max_height (int): max height
max_width (int): max width
"""
self.max_height = max_height
self.max_width = max_width

def resize_image(self, image: np.ndarray) -> np.ndarray:
"""Resize image to maximum size configuration

Args:
image (np.ndarray): image to resize

Returns:
np.ndarray: resized image
"""
height, width, num_channel = image.shape
if height > self.max_height or width > self.max_width:
scale = min(self.max_height / height, self.max_width / width)
image = cv2.resize(
image,
(int(width * scale), int(height * scale)),
interpolation=cv2.INTER_AREA,
)
return image

def _image_cb(self, msg: Image):
image = ros_numpy.numpify(msg)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
self.default_img = image

def _ac_cb(self, goal: VQATaskGoal):
"""Action callback

Args:
goal (VQATaskAction): action goal
"""
rospy.loginfo("Received goal")
result = VQATaskResult()

if len(goal.image.data) > 0:
image = ros_numpy.numpify(goal.image)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
elif len(goal.compressed_image.data) > 0:
rospy.logerr(f"Compressed image is not supported.")
self.ac.set_aborted(result)
return
else:
if self.default_img is not None:
image = self.default_img
else:
rospy.logerr("Image is empty")
self.ac.set_aborted(result)
return
image = self.resize_image(image)
for question in goal.questions:
response = self._get_multimodal_response(question, image)
if response is None:
rospy.logerr(f"Failed to get response from question {question}")
continue
if "choices" not in response or len(response["choices"]) == 0:
rospy.logerr(f"No choices in response: {response}")
continue
answer = response["choices"][0]["message"]["content"]
result.result.result.append(
QuestionAndAnswerText(question=question, answer=answer)
)
input_tokens = response["usage"]["prompt_tokens"]
output_tokens = response["usage"]["completion_tokens"]
rospy.loginfo(f"Used tokens for this completion is {input_tokens} for input and {output_tokens} for output.")
rospy.loginfo(f"Which costs ${input_tokens * COST_PER_TOKEN_FOR_INPUT} USD for input and ${output_tokens * COST_PER_TOKEN_FOR_OUTPUT} USD for output.")
if len(result.result.result) == 0:
rospy.logerr("No answers found")
self.ac.set_aborted(result)
return
else:
self.ac.set_succeeded(result)

def _get_multimodal_response(
self,
question: str,
image: np.ndarray,
max_tokens: int = 300,
detail: str = "low",
) -> Optional[Dict]:
"""Get response from GPT-4-Vision API

Args:
question (str): question to ask
image (np.ndarray): image to ask question about
max_tokens (int, optional): max tokens to use for output. Defaults to 300. Which is about $0.01 at 2024-01-09. (See https://openai.com/pricing)
detail (str, optional): detail level. Defaults to "low". See https://platform.openai.com/docs/guides/vision/managing-images for details.

Returns:
Dict: response from API"""
base64_image = base64.b64encode(cv2.imencode(".jpg", image)[1]).decode("utf-8")
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {self.api_key}",
}

payload = {
"model": "gpt-4-vision-preview",
"messages": [
{
"role": "user",
"content": [
{"type": "text", "text": question},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"defail": detail,
},
},
],
}
],
"max_tokens": max_tokens,
}
try:
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=headers,
json=payload,
)
return response.json()
except requests.exceptions.RequestException as e:
rospy.logerr(e)
return None


if __name__ == "__main__":
rospy.init_node("vqa")
api_key = rospy.get_param("~api_key")
GPT4VClientNode(api_key)
rospy.spin()
18 changes: 18 additions & 0 deletions gpt4v_vqa/node_scripts/vqa_interpreter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/usr/bin/env python

import rospy

from gpt4v_vqa import VQAClient

if __name__ == "__main__":
rospy.init_node("vqa_interpreter")

client = VQAClient()
client.wait_for_server()

while not rospy.is_shutdown():
question = input("Enter a question: ")
if question == "exit":
break
result = client.vqa(question)
print(result)
27 changes: 27 additions & 0 deletions gpt4v_vqa/package.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
<?xml version="1.0"?>
<package format="2">
<name>gpt4v_vqa</name>
<version>1.2.17</version>
<description>The gpt4v_vqa package</description>

<author email="shinjo@jsk.imi.i.u-tokyo.ac.jp">Koki Shinjo</author>
<maintainer email="k-okada@jsk.t.u-tokyo.ac.jp">Kei Okada</maintainer>

<license>BSD</license>

<buildtool_depend>catkin</buildtool_depend>

<build_depend>catkin_virtualenv</build_depend>
<build_depend>actionlib</build_depend>
<build_depend>dynamic_reconfigure</build_depend>
<build_depend>jsk_recognition_msgs</build_depend>

<exec_depend>catkin_virtualenv</exec_depend>
<exec_depend>actionlib</exec_depend>
<exec_depend>dynamic_reconfigure</exec_depend>
<exec_depend>jsk_recognition_msgs</exec_depend>

<export>
<pip_requirements>requirements.txt</pip_requirements>
</export>
</package>
24 changes: 24 additions & 0 deletions gpt4v_vqa/python/gpt4v_vqa/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import actionlib
import rospy
from jsk_recognition_msgs.msg import VQATaskAction, VQATaskGoal


class VQAClient:
def __init__(self, action_name="/vqa/inference_server"):
self.client = actionlib.SimpleActionClient(action_name, VQATaskAction)

def wait_for_server(self, timeout=10.0):
self.client.wait_for_server(timeout=rospy.Duration(timeout))

def vqa(self, question, image=None, timeout=30.0):
goal = VQATaskGoal()
goal.questions = [question]
if image is not None:
goal.image = image
self.client.send_goal(goal)
if self.client.wait_for_result(timeout=rospy.Duration(timeout)):
result = self.client.get_result()
return result
else:
rospy.logwarn("Timeout")
return None
Loading
Loading