Commit

[ENH] adding dbrx and gpt4 turbo (#275)

* [ENH] adding dbrx and gpt4 turbo

YannDubs authored Apr 12, 2024
1 parent 764fbda commit 13587ff
Showing 20 changed files with 196,232 additions and 21 deletions.
2 changes: 1 addition & 1 deletion docs/index.html
@@ -205,7 +205,7 @@ <h2>An Automatic Evaluator for Instruction-following Language Models</h2>

     <div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;">
       <small id="alpaca_eval_info" style="color: #777;">
-        Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview
+        Baseline: GPT-4 1106 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo
       </small>
     </div>

4,832 changes: 4,832 additions & 0 deletions results/dbrx-instruct/model_outputs.json

Large diffs are not rendered by default.

60,814 changes: 60,814 additions & 0 deletions results/dbrx-instruct/weighted_alpaca_eval_gpt4_turbo/annotations.json

Large diffs are not rendered by default.

4,832 changes: 4,832 additions & 0 deletions results/gpt-4-0125-preview/model_outputs.json

Large diffs are not rendered by default.

60,395 changes: 60,395 additions & 0 deletions results/gpt-4-0125-preview/weighted_alpaca_eval_gpt4_turbo/annotations.json

Large diffs are not rendered by default.

4,832 changes: 4,832 additions & 0 deletions results/gpt-4-turbo-2024-04-09/model_outputs.json

Large diffs are not rendered by default.

60,486 changes: 60,486 additions & 0 deletions results/gpt-4-turbo-2024-04-09/weighted_alpaca_eval_gpt4_turbo/annotations.json

Large diffs are not rendered by default.
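
None of the new results files above are rendered inline. A quick way to inspect one locally is simply to load it; the field names mentioned in the comment are the usual AlpacaEval output schema and are an assumption here, not something shown in this diff.

import json

with open("results/dbrx-instruct/model_outputs.json") as f:
    outputs = json.load(f)

# Expected to be a list with one record per instruction, typically carrying keys
# such as "instruction", "output", and "generator" (assumption, not verified here).
print(len(outputs), "records")
print(sorted(outputs[0].keys()))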

2 changes: 2 additions & 0 deletions src/alpaca_eval/decoders/openai.py
@@ -297,6 +297,8 @@ def _get_price_per_token(model, price_per_token=None):
     """Returns the price per token for a given model"""
     if price_per_token is not None:
         return float(price_per_token)
+    if "gpt-4-turbo" in model:
+        return 0.001 / 1000
     elif "gpt-4-1106" in model:
         return (
             0.01 / 1000

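For a sense of scale, here is a rough back-of-the-envelope cost sketch using the newly added price; the token count per annotation is a made-up assumption for illustration, not a measurement.

# Price added above for any model name containing "gpt-4-turbo".
price_per_token = 0.001 / 1000

# Assumed workload: 805 instructions, ~1,500 prompt+completion tokens each.
n_examples = 805
tokens_per_example = 1_500

estimated_cost = price_per_token * tokens_per_example * n_examples
print(f"~${estimated_cost:.2f} per annotation run under these assumptions")  # ~$1.21
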
@@ -2,7 +2,7 @@ alpaca_eval_clf_cot_gpt4_turbo:
   prompt_template: "alpaca_eval_clf_cot_gpt4_turbo/alpaca_eval_clf_cot.txt"
   fn_completions: "openai_completions"
   completions_kwargs:
-    model_name: "gpt-4-1106-preview"
+    model_name: "gpt-4-turbo"
     max_tokens: 300
     temperature: 1 # temperature should be applied for sampling, so that should make no effect.
     logprobs: true

@@ -2,7 +2,7 @@ alpaca_eval_clf_gpt4_turbo:
   prompt_template: "alpaca_eval_clf_gpt4_turbo/alpaca_eval_clf.txt"
   fn_completions: "openai_completions"
   completions_kwargs:
-    model_name: "gpt-4-1106-preview"
+    model_name: "gpt-4-turbo"
     max_tokens: 1
     temperature: 1 # temperature should be applied for sampling, so that should make no effect.
     logprobs: true

@@ -2,7 +2,7 @@ alpaca_eval_gpt4_turbo_fn:
   prompt_template: "alpaca_eval_gpt4_turbo_fn/alpaca_eval_fn.txt"
   fn_completions: "openai_completions"
   completions_kwargs:
-    model_name: "gpt-4-1106-preview"
+    model_name: "gpt-4-turbo"
     max_tokens: 100
     temperature: 0
     function_call:

@@ -2,7 +2,7 @@ weighted_alpaca_eval_cot_gpt4_turbo:
   prompt_template: "alpaca_eval_clf_cot_gpt4_turbo/alpaca_eval_clf_cot.txt"
   fn_completions: "openai_completions"
   completions_kwargs:
-    model_name: "gpt-4-1106-preview"
+    model_name: "gpt-4-turbo"
     max_tokens: 300
     temperature: 1 # temperature should be applied for sampling, so that should make no effect.
     logprobs: true

@@ -2,7 +2,7 @@ weighted_alpaca_eval_gpt4_turbo:
   prompt_template: "alpaca_eval_clf_gpt4_turbo/alpaca_eval_clf.txt"
   fn_completions: "openai_completions"
   completions_kwargs:
-    model_name: "gpt-4-1106-preview"
+    model_name: "gpt-4-turbo"
     max_tokens: 1
     temperature: 1 # temperature should be applied for sampling, so that should make no effect.
     logprobs: true

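All of the evaluator configs above now point the annotator at the "gpt-4-turbo" model alias instead of "gpt-4-1106-preview". A minimal sketch of running one of them, assuming alpaca_eval is installed, OPENAI_API_KEY is set, and the evaluate entry point is importable as in the project README (the outputs path is a placeholder):

from alpaca_eval import evaluate  # assumed import path, per the project README

# Annotate precomputed model outputs with the weighted GPT-4 Turbo annotator
# updated in this commit, and print the resulting leaderboard row.
evaluate(
    model_outputs="results/dbrx-instruct/model_outputs.json",
    annotators_config="weighted_alpaca_eval_gpt4_turbo",
)
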
@@ -1,4 +1,6 @@
 ,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,discrete_win_rate,mode,avg_length,length_controlled_winrate
+gpt-4-0125-preview,51.22356403608144,1.5925964020631154,408,388,9,805,51.24223602484472,verified,2018,52.71305126323133
+gpt-4-turbo-2024-04-09,42.6755010801893,1.601975204819872,334,463,8,805,41.98757763975156,minimal,1802,51.9269770895354
 gpt4_1106_preview_verbose,64.30360147101865,1.3348590089025316,525,268,12,805,65.96273291925466,dev,2402,51.57500797967598
 gpt4_1106_preview,50.0,0.0,0,0,805,805,50.0,minimal,2049,50.0
 gpt4_1106_preview_concise,22.92019444047205,1.232517714329424,172,622,11,805,22.049689440993788,dev,1136,41.896601591245386
@@ -52,7 +54,7 @@ gpt35_turbo_instruct,8.462446504415423,0.8724086933609648,66,735,3,804,8.3955223
 wizardlm-70b,14.383896086782608,1.0395048912985754,106,697,2,805,13.291925465838508,community,1545,17.575060737493747
 vicuna-33b-v1.3,12.705947921540371,0.999255784310268,90,711,4,805,11.428571428571429,verified,1479,17.574575310874923
 pairrm-tulu-2-13b,13.831901016757762,1.0835284665170843,110,694,1,805,13.72670807453416,community,1454,17.40520369795085
-Conifer-7B-DPO,11.313585649162219,0.9870897936343657,87,717,1,805,10.869565217391305,community,1253,17.11249588276248
+Conifer-7B-DPO,11.31358564916222,0.9870897936343656,87,717,1,805,10.869565217391305,community,1253,17.11249588276248
 Mistral-7B-Instruct-v0.2,14.722772657714286,1.0785266446729775,113,691,1,805,14.09937888198758,minimal,1676,17.111251846021165
 evo-7b,15.577437399527952,1.0835570388658722,112,689,4,805,14.161490683229813,community,1774,16.489386004239325
 humpback-llama2-70b,10.121771502645965,0.9401806122130112,77,727,1,805,9.627329192546584,community,1107,16.249164231428974
@@ -141,4 +143,4 @@ falcon-7b-instruct,2.146617553167702,0.454225792894195,16,787,2,805,2.1118012422
 oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304347826086,verified,726,3.270102114456748
 guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614
 guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697
-baichuan-13b-chat,1.9921455615279502,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568
+baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568

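The discrete_win_rate column in these rows can be reproduced from the per-row counts, with draws counted as half a win; a quick sanity check against the two entries added above:

def discrete_win_rate(n_wins: int, n_draws: int, n_total: int) -> float:
    """Win percentage with draws counted as half a win."""
    return 100 * (n_wins + 0.5 * n_draws) / n_total

print(discrete_win_rate(408, 9, 805))  # gpt-4-0125-preview -> 51.2422..., matches the row
print(discrete_win_rate(334, 8, 805))  # gpt-4-turbo-2024-04-09 -> 41.9875..., matches the row
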
@@ -134,3 +134,6 @@ oasst-sft-pythia-12b,-1.8114830706934373,0.8352440378605592,-4.1447992306227759
 guanaco-13b,-1.3596147092204240,0.8881208851338480,-4.3434287450074027
 guanaco-7b,-1.5767517948647565,0.8962503484379708,-4.4121467026415724
 baichuan-13b-chat,-1.5434450958960080,0.8816095189753549,-4.7614489966908522
+dbrx-instruct,-1.4122945710935642,0.6308023589854422,-1.2161908189092332
+gpt-4-0125-preview,-0.9909246238247712,0.2194074997122520,0.1258121207377884
+gpt-4-turbo-2024-04-09,-1.2778589108647278,0.2362259903279992,0.0952165677410898

12 changes: 12 additions & 0 deletions src/alpaca_eval/models_configs/dbrx-instruct/configs.yaml
@@ -0,0 +1,12 @@
+dbrx-instruct:
+  prompt_template: "Mixtral-8x7B-Instruct-v0.1/togetherai_prompt.txt"
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "databricks/dbrx-instruct"
+    max_tokens: 4096
+    requires_chatml: True
+    price_per_token: 1.2e-06
+    client_kwargs:
+      base_url: 'https://api.together.xyz'
+  pretty_name: "DBRX Instruct"
+  link: "https://huggingface.co/databricks/dbrx-instruct"

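The dbrx-instruct config routes the OpenAI-style decoder to Together's OpenAI-compatible endpoint via client_kwargs. A minimal sketch of the equivalent raw call, assuming the openai v1 Python client and a TOGETHER_API_KEY environment variable (depending on the client version the base URL may need a trailing /v1, and this bypasses the togetherai_prompt.txt template):

import os

from openai import OpenAI

# OpenAI-compatible client pointed at Together, mirroring client_kwargs above.
client = OpenAI(
    base_url="https://api.together.xyz",
    api_key=os.environ["TOGETHER_API_KEY"],
)

response = client.chat.completions.create(
    model="databricks/dbrx-instruct",
    max_tokens=4096,
    messages=[{"role": "user", "content": "Give three tips for staying healthy."}],
)
print(response.choices[0].message.content)
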
@@ -0,0 +1,7 @@
+gpt-4-0125-preview:
+  prompt_template: "gpt4_1106_preview/chatml_prompt.txt"
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "gpt-4-0125-preview"
+    max_tokens: 4096
+  pretty_name: "GPT-4 Preview"

@@ -0,0 +1,7 @@
+gpt-4-turbo-2024-04-09:
+  prompt_template: "gpt4_1106_preview/chatml_prompt.txt"
+  fn_completions: "openai_completions"
+  completions_kwargs:
+    model_name: "gpt-4-turbo-2024-04-09"
+    max_tokens: 4096
+  pretty_name: "GPT-4 Turbo 04-09"

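With model configs like the two above in place, decoding and annotation can be chained in one call; a sketch assuming the evaluate_from_model entry point accepts config names as in the project docs and that OPENAI_API_KEY is set:

from alpaca_eval import evaluate_from_model  # assumed import path

# Decode with the newly added GPT-4 Turbo model config, then annotate the
# outputs with the weighted GPT-4 Turbo annotator updated in this commit.
evaluate_from_model(
    model_configs="gpt-4-turbo-2024-04-09",
    annotators_config="weighted_alpaca_eval_gpt4_turbo",
)
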
6 changes: 0 additions & 6 deletions src/alpaca_eval/models_configs/gpt4_turbo/chatml_prompt.txt

This file was deleted.

7 changes: 0 additions & 7 deletions src/alpaca_eval/models_configs/gpt4_turbo/configs.yaml

This file was deleted.
