From 1deab1b857d97a6eaa68c88634693d8b2dba93c0 Mon Sep 17 00:00:00 2001
From: Yann Dubois <yanndubois96@gmail.com>
Date: Fri, 16 Aug 2024 16:46:56 -0700
Subject: [PATCH] [ENH] OpenAI use tools instead of functions (#391)

* [ENH] more general `_requires_chatml`

* [ENH] update OpenAI to use tools instead of functions
---
 src/alpaca_eval/decoders/openai.py            |  8 ++-
 .../configs.yaml                              | 55 +++++++++++--------
 .../alpaca_eval_gpt4_fn/configs.yaml          | 49 ++++++++++-------
 .../alpaca_eval_gpt4_turbo_fn/configs.yaml    | 49 ++++++++++-------
 .../chatgpt_fn/configs.yaml                   | 29 ++++++----
 5 files changed, 111 insertions(+), 79 deletions(-)

diff --git a/src/alpaca_eval/decoders/openai.py b/src/alpaca_eval/decoders/openai.py
index fb1cb46e..a61900b4 100644
--- a/src/alpaca_eval/decoders/openai.py
+++ b/src/alpaca_eval/decoders/openai.py
@@ -237,10 +237,15 @@ def _openai_completion_helper(
                     else:
                         choices[i]["text"] = choice.message.content
 
+                    # backward compatibility for function calls # TODO: remove once function calls are removed
                     if choice.message.function_call:
                         # currently we only use function calls to get a JSON object => return raw text of json
                         choices[i]["text"] = choice.message.function_call.arguments
 
+                    if choice.message.tool_calls is not None:
+                        # currently we only use tool calls to get a JSON object => return raw text of the first call's json
+                        choices[i]["text"] = choice.message.tool_calls[0].function.arguments
+
             else:
                 completion_batch = client.completions.create(prompt=prompt_batch, **curr_kwargs)
                 choices = completion_batch.choices
@@ -290,7 +295,8 @@ def _openai_completion_helper(
 def _requires_chatml(model: str) -> bool:
     """Whether a model requires the ChatML format."""
     # TODO: this should ideally be an OpenAI function... Maybe it already exists?
-    return ("turbo" in model or "gpt-4" in model) and "instruct" not in model
+    not_chatml = ("instruct" in model) or ("gpt-3" in model and "turbo" not in model) or (model.startswith("text-"))
+    return not not_chatml
 
 
 def _get_price_per_token(model, price_per_token=None):
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
index f6979d14..20615b68 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_cot_gpt4_turbo_fn/configs.yaml
@@ -5,30 +5,37 @@ alpaca_eval_cot_gpt4_turbo_fn:
     model_name: "gpt-4-1106-preview"
     max_tokens: 300
     temperature: 0
-    function_call:
-      name: "make_partial_leaderboard"
-    functions:
-      - name: "make_partial_leaderboard"
-        description: "Make a leaderboard of models given a list of the models ordered by the preference of their outputs."
-        parameters:
-          type: "object"
-          properties:
-            concise_explanation:
-              type: "string"
-              description: "A concise explanation for the ranking of the current models."
-            ordered_models:
-              type: "array"
-              description: "A list of models ordered by the preference of their outputs. The first model in the list has the best output."
-              items:
-                type: "object"
-                properties:
-                  model:
-                    type: "string"
-                    description: "The name of the model"
-                  rank:
-                    type: "number"
-                    description: "Order of preference of the model, 1 has the best output"
-        "required": [ "ordered_models" ]
+    tool_choice:
+      type: function
+      function:
+        name: "make_partial_leaderboard"
+    tools:
+      - type: function
+        function:
+          name: "make_partial_leaderboard"
+          description: "Make a leaderboard of models given a list of the models ordered by the preference of their outputs."
+          parameters:
+            type: "object"
+            properties:
+              concise_explanation:
+                type: "string"
+                description: "A concise explanation for the ranking of the current models."
+              ordered_models:
+                type: "array"
+                description: "A list of models ordered by the preference of their outputs. The first model in the list has the best output."
+                items:
+                  type: "object"
+                  properties:
+                    model:
+                      type: "string"
+                      description: "The name of the model"
+                    rank:
+                      type: "number"
+                      description: "Order of preference of the model, 1 has the best output"
+                  additionalProperties: false
+                  required: [ "model", "rank" ]
+            additionalProperties: false
+            required: [ "ordered_models" ]
   fn_completion_parser: "pipeline_meta_parser"
   completion_parser_kwargs:
       parsers_to_kwargs:
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
index 1aa2b91d..f16b20e5 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_fn/configs.yaml
@@ -5,27 +5,34 @@ alpaca_eval_gpt4_fn:
     model_name: "gpt-4"
     max_tokens: 100
     temperature: 0
-    function_call:
-      name: "make_leaderboard"
-    functions:
-      - name: "make_leaderboard"
-        description: "Make a leaderboard of models given a list of the models ordered by the preference of their outputs."
-        parameters:
-          type: "object"
-          properties:
-            ordered_models:
-              type: "array"
-              description: "A list of models ordered by the preference of their outputs"
-              items:
-                type: "object"
-                properties:
-                  model:
-                    type: "string"
-                    description: "The name of the model"
-                  rank:
-                    type: "number"
-                    description: "Order of preference of the model, 1 has the best output"
-        "required": [ "ordered_models" ]
+    tool_choice:
+      type: function
+      function:
+        name: "make_leaderboard"
+    tools:
+      - type: function
+        function:
+          name: "make_leaderboard"
+          description: "Make a leaderboard of models given a list of the models ordered by the preference of their outputs."
+          parameters:
+            type: "object"
+            properties:
+              ordered_models:
+                type: "array"
+                description: "A list of models ordered by the preference of their outputs"
+                items:
+                  type: "object"
+                  properties:
+                    model:
+                      type: "string"
+                      description: "The name of the model"
+                    rank:
+                      type: "number"
+                      description: "Order of preference of the model, 1 has the best output"
+                  additionalProperties: false
+                  required: [ "model", "rank" ]
+            additionalProperties: false
+            required: [ "ordered_models" ]
   fn_completion_parser: "pipeline_meta_parser"
   completion_parser_kwargs:
       parsers_to_kwargs:
diff --git a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
index b7f6be3a..9896a327 100644
--- a/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/alpaca_eval_gpt4_turbo_fn/configs.yaml
@@ -5,27 +5,34 @@ alpaca_eval_gpt4_turbo_fn:
     model_name: "gpt-4-1106-preview"
     max_tokens: 100
     temperature: 0
-    function_call:
-      name: "make_partial_leaderboard"
-    functions:
-      - name: "make_partial_leaderboard"
-        description: "Make a leaderboard of models given a list of the models ordered by the preference of their outputs."
-        parameters:
-          type: "object"
-          properties:
-            ordered_models:
-              type: "array"
-              description: "A list of models ordered by the preference of their outputs. The first model in the list has the best output."
-              items:
-                type: "object"
-                properties:
-                  model:
-                    type: "string"
-                    description: "The name of the model"
-                  rank:
-                    type: "number"
-                    description: "Order of preference of the model, 1 has the best output"
-        "required": [ "ordered_models" ]
+    tool_choice:
+      type: function
+      function:
+        name: "make_partial_leaderboard"
+    tools:
+      - type: function
+        function:
+          name: "make_partial_leaderboard"
+          description: "Make a leaderboard of models given a list of the models ordered by the preference of their outputs."
+          parameters:
+            type: "object"
+            properties:
+              ordered_models:
+                type: "array"
+                description: "A list of models ordered by the preference of their outputs. The first model in the list has the best output."
+                items:
+                  type: "object"
+                  properties:
+                    model:
+                      type: "string"
+                      description: "The name of the model"
+                    rank:
+                      type: "number"
+                      description: "Order of preference of the model, 1 has the best output"
+                  additionalProperties: false
+                  required: [ "model", "rank" ]
+            additionalProperties: false
+            required: [ "ordered_models" ]
   fn_completion_parser: "pipeline_meta_parser"
   completion_parser_kwargs:
       parsers_to_kwargs:
diff --git a/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml b/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
index b857a327..d200aa7b 100644
--- a/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
+++ b/src/alpaca_eval/evaluators_configs/chatgpt_fn/configs.yaml
@@ -5,18 +5,23 @@ chatgpt_fn:
     model_name: "gpt-3.5-turbo-16k-0613"
     max_tokens: 50
     temperature: 0
-    function_call:
-      name: "print_best_model"
-    functions:
-      - name: "print_best_model"
-        description: "Print the best model given the preferred output."
-        parameters:
-          type: "object"
-          properties:
-            best_output:
-              type: "string"
-              description: "Name of the best output, should be 'Output (a)' or 'Output (b)'"
-        "required": [ "best_output" ]
+    tool_choice:
+      type: function
+      function:
+        name: "print_best_model"
+    tools:
+      - type: function
+        function:
+          name: "print_best_model"
+          description: "Print the best model given the preferred output."
+          parameters:
+            type: "object"
+            properties:
+              best_output:
+                type: "string"
+                description: "Name of the best output, should be 'Output (a)' or 'Output (b)'"
+            additionalProperties: false
+            required: [ "best_output" ]
   completion_parser_kwargs:
     outputs_to_match:
       1: '(?i)output \(a\)'