Commit: change turbo to preview

YannDubs committed Mar 20, 2024
1 parent 6b58ab1 commit bcb3097
Showing 12 changed files with 38 additions and 108 deletions.
9 changes: 8 additions & 1 deletion README.md
@@ -17,7 +17,7 @@ Updates:

:tada: **Length-controlled Win Rates** are out and used by default! This increases the correlation with ChatBot Arena from 0.93 to 0.98, while significantly decreasing length gameability. The raw win rates are still shown on the website and the CLI. More details [here](#length-controlled-win-rates).

:tada: **AlpacaEval 2.0** is out and used by default! We improved the auto-annotator (better and cheaper) and use GPT-4 turbo as baseline. More details [here](#alpacaeval-20). For the old version, set your environment variable `IS_ALPACA_EVAL_2=False`.
:tada: **AlpacaEval 2.0** is out and used by default! We improved the auto-annotator (better and cheaper) and use GPT-4 preview as baseline. More details [here](#alpacaeval-20). For the old version, set your environment variable `IS_ALPACA_EVAL_2=False`.
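For example, a minimal sketch of reverting to the 1.0 behavior before running an evaluation (the output path below is just a placeholder):

```bash
# Placeholder path; point --model_outputs at your own generations.
export IS_ALPACA_EVAL_2=False   # use the AlpacaEval 1.0 baseline and annotator
alpaca_eval --model_outputs example/model_outputs.json
```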

---

@@ -1253,6 +1253,13 @@ For more information and results about length controlled win-rates see [this not

This idea of estimating the controlled direct effect, by predicting the outcome while conditioning on the mediator (the length difference), is common in statistical inference.
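As a rough sketch of that idea (a simplified illustration, not the exact GLM used by AlpacaEval, which e.g. also conditions on instruction difficulty): fit a logistic model of the annotator's preference that includes a length-difference term, then report the preference predicted with that term set to zero.

$$
p\big(m \succ b \mid x\big) = \sigma\big(\theta_m - \theta_b + \phi\,\Delta\ell(x)\big),
\qquad
\hat{p}_{\mathrm{LC}}\big(m \succ b\big) = \sigma\big(\theta_m - \theta_b\big),
$$

where $\sigma$ is the logistic function, $\theta_m$ and $\theta_b$ capture the quality of the model and the baseline, and $\Delta\ell(x)$ is the length difference between their outputs on instruction $x$. Zeroing out the mediator $\Delta\ell$ is the "controlled direct effect" mentioned above.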

To get LC win rates on previously annotated models, you can use the following command:

```bash
pip install -U alpaca_eval
alpaca_eval --model_outputs … --is_recompute_metrics_only True
```

</details>


2 changes: 1 addition & 1 deletion docs/data_AlpacaEval/alpaca_eval_gpt4_leaderboard.csv
@@ -1,5 +1,5 @@
name,length_controlled_winrate,win_rate,avg_length,link,samples,filter
GPT-4 Turbo,89.85849210429464,97.69900497512438,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_1106_preview/model_outputs.json,minimal
GPT-4 Preview,89.85849210429464,97.69900497512438,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_1106_preview/model_outputs.json,minimal
XwinLM 70b V0.3,94.01522563893708,97.636815920398,2113,https://github.com/Xwin-LM/Xwin-LM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/xwinlm-70b-v0.3/model_outputs.json,community
Mistral Medium,91.54314285144824,96.83229813664596,1500,https://mistral.ai/news/la-plateforme/,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/mistral-medium/model_outputs.json,minimal
XwinLM 70b V0.1,,95.56803995,1775,https://github.com/Xwin-LM/Xwin-LM,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/xwinlm-70b-v0.1/model_outputs.json,community
@@ -1,5 +1,5 @@
name,win_rate,avg_length,link,samples,filter
GPT-4 Turbo,50.0,2049.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
GPT-4 Preview,50.0,2049.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
Yi 34B Chat,35.3416149068323,2123.0,https://huggingface.co/01-ai/Yi-34B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Yi-34B-Chat/model_outputs.json,minimal
GPT-4,20.0,1365.0,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,minimal
Mixtral 8x7B v0.1,19.937888198757765,1465.0,https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mixtral-8x7B-Instruct-v0.1/model_outputs.json,minimal
@@ -1,5 +1,5 @@
name,win_rate,avg_length,link,samples,filter
GPT-4 Turbo,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
GPT-4 Preview,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_turbo/model_outputs.json,minimal
Yi 34B Chat,34.84472049689441,2123,https://huggingface.co/01-ai/Yi-34B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Yi-34B-Chat/model_outputs.json,minimal
GPT-4,25.093167701863354,1365,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,minimal
Mixtral 8x7B v0.1,22.795031055900623,1465,https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Mixtral-8x7B-Instruct-v0.1/model_outputs.json,minimal
@@ -1,5 +1,5 @@
name,length_controlled_winrate,win_rate,avg_length,link,samples,filter
GPT-4 Turbo,50.0,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_1106_preview/model_outputs.json,minimal
GPT-4 Preview,50.0,50.0,2049,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_1106_preview/model_outputs.json,minimal
Claude 3 Opus (02/29),40.39177606350116,29.04176413403727,1388,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude-3-opus-20240229/model_outputs.json,minimal
GPT-4,38.12808974440021,23.576789314782605,1365,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,minimal
Qwen1.5 72B Chat,36.571754111987296,26.49828339562733,1549,https://huggingface.co/Qwen/Qwen1.5-72B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Qwen1.5-72B-Chat/model_outputs.json,community
8 changes: 4 additions & 4 deletions docs/index.html
@@ -169,7 +169,7 @@ <h1>AlpacaEval
<br>
<h2>An Automatic Evaluator for Instruction-following Language Models</h2>
<!-- <small id="alpaca_eval_info" style="color: #777;">-->
<!-- Baseline: GPT-4 Turbo &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo-->
<!-- Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview-->
<!-- </small>-->
<!-- <br>-->
<small id="caution" style="color: #8C1515;">
@@ -205,7 +205,7 @@ <h2>An Automatic Evaluator for Instruction-following Language Models</h2>

<div class="container" style="text-align: center; margin-bottom: 10px; margin-top: -10px;">
<small id="alpaca_eval_info" style="color: #777;">
Baseline: GPT-4 Turbo &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo
Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview
</small>
</div>

@@ -233,7 +233,7 @@ <h2>About AlpacaEval</h2>
<a href="https://crfm.stanford.edu/2023/05/22/alpaca-farm.html">AlpacaFarm</a>
evaluation set,
which tests the ability of models to follow general user instructions.
These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Turbo for AlpacaEval 2.0) by
These responses are then compared to reference responses (Davinci003 for AlpacaEval, GPT-4 Preview for AlpacaEval 2.0) by
the provided GPT-4 based auto-annotators,
which results in the win rates presented above.
AlpacaEval displays a high agreement rate with ground truth human annotations,
@@ -373,7 +373,7 @@ <h2>AlpacaEval limitations</h2>
function updateInfoMessage(version) {
let infoText;
if (version === 'alpaca_eval_2') {
infoText = 'Baseline: GPT-4 Turbo &nbsp; | &nbsp; Auto-annotator: GPT-4 Turbo';
infoText = 'Baseline: GPT-4 Preview &nbsp; | &nbsp; Auto-annotator: GPT-4 Preview';
} else if (version === 'alpaca_eval') {
infoText = 'Baseline: Davinci003 &nbsp; | &nbsp; Auto-annotator: GPT-4';
}
Binary file modified figures/causal_graph.png
113 changes: 18 additions & 95 deletions notebooks/length_controlled.ipynb

Large diffs are not rendered by default.

@@ -4,4 +4,4 @@ gpt4_1106_preview:
completions_kwargs:
model_name: "gpt-4-1106-preview"
max_tokens: 4096
pretty_name: "GPT-4 Turbo"
pretty_name: "GPT-4 Perview"
@@ -4,4 +4,4 @@ gpt4_1106_preview_concise:
completions_kwargs:
model_name: "gpt-4-1106-preview"
max_tokens: 4096
pretty_name: "GPT-4 Turbo (concise)"
pretty_name: "GPT-4 Preview (concise)"
@@ -4,4 +4,4 @@ gpt4_1106_preview_verbose:
completions_kwargs:
model_name: "gpt-4-1106-preview"
max_tokens: 4096
pretty_name: "GPT-4 Turbo (verbose)"
pretty_name: "GPT-4 Preview (verbose)"
2 changes: 1 addition & 1 deletion src/alpaca_eval/models_configs/gpt4_turbo/configs.yaml
@@ -4,4 +4,4 @@ gpt4_turbo:
completions_kwargs:
model_name: "gpt-4-1106-preview"
max_tokens: 4096
pretty_name: "GPT-4 Turbo"
pretty_name: "GPT-4 Preview"
