From f91c8faed1216d49a62b121089a33fd2a8950d8a Mon Sep 17 00:00:00 2001 From: Yann Dubois Date: Thu, 7 Mar 2024 13:20:44 -0800 Subject: [PATCH] update added models --- notebooks/length_correction.ipynb | 8534 +++++++++-------- src/alpaca_eval/constants.py | 37 +- ...ted_alpaca_eval_gpt4_turbo_leaderboard.csv | 4 +- 3 files changed, 4328 insertions(+), 4247 deletions(-) diff --git a/notebooks/length_correction.ipynb b/notebooks/length_correction.ipynb index cb0edac6..800bba1b 100644 --- a/notebooks/length_correction.ipynb +++ b/notebooks/length_correction.ipynb @@ -148,7 +148,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "We are comparing to 33 Arena models\n" + "We are comparing to 36 Arena models\n" ] } ], @@ -444,8 +444,8 @@ "output_type": "stream", "text": [ "Arena vs Length\n", - "Spearman Corr: 0.281\n", - "Kendall Corr: 0.194\n" + "Spearman Corr: 0.284\n", + "Kendall Corr: 0.197\n" ] } ], @@ -492,12 +492,12 @@ "Adversarial rank gain: 0.0\n", "\n", "## Correlation with Arena (higher is better)\n", - "Spearman Corr: 0.930\n", - "Kendall Corr: 0.811\n", + "Spearman Corr: 0.939\n", + "Kendall Corr: 0.826\n", "\n", - "## Correlation with length (closer to spearman=0.28, kendall=0.19 is better)\n", - "Spearman Corr: 0.467\n", - "Kendall Corr: 0.324\n", + "## Correlation with length (closer to spearman=0.28, kendall=0.20 is better)\n", + "Spearman Corr: 0.465\n", + "Kendall Corr: 0.322\n", "\n", "## Top 10 models\n" ] @@ -508,13 +508,13 @@ "gpt4_1106_preview_verbose 64.303601\n", "gpt4_1106_preview 50.000000\n", "Snorkel-Mistral-PairRM-DPO-best-of-16 34.860133\n", + "Contextual-KTO-Mistral-PairRM 33.227355\n", "pairrm-Yi-34B-Chat 31.241283\n", "Snorkel-Mistral-PairRM-DPO 30.220053\n", "Yi-34B-Chat 29.659947\n", "claude-3-opus-20240229 29.041764\n", "Qwen1.5-72B-Chat 26.498283\n", "claude-3-sonnet-20240229 25.556325\n", - "Mixtral-8x7B-Instruct-v0.1_verbose 24.614063\n", "Name: win_rate, dtype: float64" ] }, @@ -700,18 +700,18 @@ "# Report for **balanced_win_rate**\n", "\n", "## Gameability (lower is better)\n", - "Verbosity gameability (relative std metric): 12.0%\n", - "Conciseness gameability (relative std metric): 18.2%\n", + "Verbosity gameability (relative std metric): 12.1%\n", + "Conciseness gameability (relative std metric): 18.4%\n", "Adversarial winrate gain: 40.8\n", - "Adversarial rank gain: 108.0\n", + "Adversarial rank gain: 110.0\n", "\n", "## Correlation with Arena (higher is better)\n", - "Spearman Corr: 0.933\n", - "Kendall Corr: 0.807\n", + "Spearman Corr: 0.947\n", + "Kendall Corr: 0.829\n", "\n", - "## Correlation with length (closer to spearman=0.28, kendall=0.19 is better)\n", - "Spearman Corr: 0.125\n", - "Kendall Corr: 0.085\n", + "## Correlation with length (closer to spearman=0.28, kendall=0.20 is better)\n", + "Spearman Corr: 0.148\n", + "Kendall Corr: 0.103\n", "\n", "## Top 10 models\n" ] @@ -719,16 +719,16 @@ { "data": { "text/plain": [ - "gpt4_1106_preview_verbose 55.464179\n", - "gpt4_1106_preview 50.0\n", - "gpt4_gamed 44.532377\n", - "gpt4_1106_preview_concise 42.476316\n", - "claude-3-opus-20240229 39.494379\n", - "Qwen1.5-72B-Chat 37.072419\n", - "gpt4 36.503486\n", - "claude-3-sonnet-20240229 35.516836\n", - "gpt4_0613_verbose 32.752569\n", - "Snorkel-Mistral-PairRM-DPO-best-of-16 31.50451\n", + "gpt4_1106_preview_verbose 55.464179\n", + "gpt4_1106_preview 50.0\n", + "gpt4_gamed 44.532377\n", + "gpt4_1106_preview_concise 42.476316\n", + "claude-3-opus-20240229 39.494379\n", + "mistral-large-2402 37.141506\n", + "Qwen1.5-72B-Chat 37.072419\n", + "gpt4 36.503486\n", + "claude-3-sonnet-20240229 35.516836\n", + "gpt4_0613_verbose 32.752569\n", "Name: balanced_win_rate, dtype: object" ] }, @@ -792,1504 +792,1532 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 avg_lengthwin_ratenew_win_ratedelta_win_raterank_win_raterank_new_win_ratedelta_rankavg_lengthwin_ratenew_win_ratedelta_win_raterank_win_raterank_new_win_ratedelta_rank
gpt4_1106_preview_verbose240264.30360155.464179-8.839422000
gpt4_1106_preview204950.00000050.0000000.000000110
gpt4_1106_preview_concise113622.92019442.47631619.55612113310
claude-3-opus-20240229138829.04176439.49437910.452615642
gpt4136523.57678936.50348612.9266971165
Qwen1.5-72B-Chat154926.49828337.07241910.574135752
gpt4_0314137122.07325930.9727728.89951314113
claude-3-sonnet-20240229142025.55632535.5168369.960510871
gpt4_0613_verbose147323.23736032.7525699.5152091284
claude-2.1_verbose141424.35407130.6148626.2607911013-3
gpt4_0613114015.75503829.52866213.773624281711
Snorkel-Mistral-PairRM-DPO-best-of-16261634.86013331.504510-3.35562329-7
pairrm-Yi-34B-Chat219531.24128329.895164-1.346119314-11
mistral-medium150021.85577330.8454778.98970415123
claude-2106917.18824029.81395812.62571823158
claude108216.98534431.09017514.104832241014
Yi-34B-Chat212329.65994728.477128-1.182818521-16
Snorkel-Mistral-PairRM-DPO273630.22005327.673988-2.546065424-20
claude-instant-1.2111216.12740028.73048412.60308525196
claude-2.1109615.73350729.56282813.829321291613
xwinlm-70b-v0.1177521.81295725.8250814.0121241626-10
gemini-pro145618.17764524.4491346.2714902129-8
Mixtral-8x7B-Instruct-v0.1146518.25531826.0834147.8280962025-5
evo-v2-7b175420.83411322.7788491.9447361832-14
Mixtral-8x7B-Instruct-v0.1_verbose208324.61406324.503900-0.110163928-19
Mixtral-8x7B-Instruct-v0.1_concise91013.74404028.43668714.692646392217
gpt-3.5-turbo-16k-0613132814.13239122.6952928.56290234340
gpt-3.5-turbo-0613133114.09579920.4216576.3258583540-5
gpt-3.5-turbo-1106_verbose105812.76317028.25111315.487943432320
gpt4_0613_concise6279.40032128.60299219.202671612041
pairrm-tulu-2-70b160718.63896323.9434145.3044511930-11
tulu-2-dpo-70b141815.98285424.9247998.94194527270
Mistral-7B-ReMax-v0.1147815.99933123.7792787.7799462631-5
gpt-3.5-turbo-11067969.17796521.19791012.019945643727
LMCocktail-10.7B-v1120313.15343122.7167739.56334240337
internlm2-chat-20b-ppo237321.74915520.927878-0.8212771739-22
claude-2.1_concise5739.22712519.0306589.803532634518
gpt-3.5-turbo-03018279.62245321.63443912.011986583622
xwinlm-13b-v0.1189417.42793518.8452621.4173272246-24
deepseek-llm-67b-chat115112.09342221.8888229.795400463511
gpt35_turbo_instruct10188.46244714.9336066.47116068671
wizardlm-70b154514.38389618.4417794.0578833247-15
vicuna-33b-v1.3147912.70594818.2693495.5634014448-4
pairrm-tulu-2-13b145413.83190120.1873666.3554653841-3
Mistral-7B-Instruct-v0.2167614.72277317.5036102.7808373151-20
evo-7b177415.57743717.7027452.1253073050-20
humpback-llama2-70b110710.12177219.4164939.294722564412
OpenHermes-2.5-Mistral-7B110710.34041616.3528796.0124645557-2
deita-7b-v1.0141712.64663919.8553397.20869945423
jina-chat6767.78613021.06302013.276889743836
gpt-3.5-turbo-1106_concise4317.41586529.49073622.074871821864
causallm-14b139111.14616115.2259114.0797505166-15
pairrm-zephyr-7b-beta148712.84127818.0461055.2048274249-7
Starling-LM-7B-alpha189514.24592415.8387491.5928263359-26
llama-2-70b-chat-hf179013.88825816.3172842.4290263658-22
openchat-v3.1-13b148411.08223016.8851995.80296952520
wizardlm-13b-v1.2163512.02748015.5787153.5512344762-15
ultralm-13b-v2.0-best-of-16172013.85337316.7490302.8956573753-16
wizardlm-13b-v1.1152511.23391015.7774304.5435205060-10
zephyr-7b-beta144410.99288614.8246913.8318055368-15
dolphin-2.2.1-mistral-7b11309.03980013.9744274.9346286673-7
humpback-llama-65b12329.42513915.2771495.8520106065-5
openbuddy-llama2-70b-v10.110778.09642213.8790505.7826287274-2
openbuddy-llama-65b-v811628.77065016.4873477.716697675512
Qwen-14B-Chat10137.50233319.46420311.961870794336
gpt4_gamed683.73833744.53237740.7940401102108
cut-13b163710.77908914.0744313.2953425472-18
openchat-v2-w-13b15669.61534414.1064264.4910825971-12
tulu-2-dpo-13b161410.11978815.5217735.4019855763-6
claude2-alpaca-13b11277.43735116.7331079.295755815427
minotaur-13b8815.73896416.47170810.732744985642
airoboros-65b15129.38895013.0358463.6468966281-19
cohere198312.90145513.1474590.2460044179-38
vicuna-13b-v1.311327.13724014.4678477.330607857015
xwinlm-7b-v0.1189411.24565212.6443081.3986564984-35
airoboros-33b15149.05316012.8401993.7870396582-17
platolm-7b13446.32082811.4432695.1224419193-2
vicuna-13b-v1.510616.7221229.5592312.83710988104-16
gemma-7b-it11156.93729413.7800136.842719867610
openchat-v2-13b15648.43507610.9214022.4863266994-25
zephyr-7b-alpha13028.35266412.3156273.9629637086-16
openbuddy-llama-30b-v7.19686.13001513.5063677.376352947717
ultralm-13b-best-of-16198011.30731512.0371390.7298244888-40
oasst-sft-llama-33b7484.77039112.7422077.9718161048321
wizardlm-13b9855.87815313.8455117.967358967521
nous-hermes-13b8445.41187914.5538509.141971996930
vicuna-13b10375.83110312.1213406.290237978710
tulu-2-dpo-7b16638.19751513.4755695.2780537178-7
openbuddy-llama2-13b-v11.110576.17471615.6650469.490330936132
ultralm-13b-v2.013997.50462312.3536134.8489907885-7
text_davinci_0012962.7640058.5657685.80176212010911
openbuddy-falcon-40b-v910895.95574313.1408877.185144958015
openchat-13b16328.02238610.8994032.8770177395-22
llama-2-13b-chat-hf15137.70231011.8447174.1424077689-13
guanaco-65b12496.85849511.6532364.7947428790-3
opencoderplus-15b16287.4062229.9277002.52147783101-18
oasst-rlhf-llama-33b10796.29643510.1995043.9030699298-6
openchat8192-13b16647.4727679.6742932.20152680103-23
phi-2-dpo16877.75709610.1029812.3458857599-24
minichat-1.5-3b15456.5534438.4939731.94053090110-20
vicuna-7b-v1.510834.7974949.8315135.0340191031021
llama-2-chat-7b-evol70k-neft16127.60238410.5308302.9284477796-19
recycled-wizardlm-7b-v2.015837.33712910.4381123.1009838497-13
vicuna-7b-v1.311104.64251211.5998646.9573521059114
alpaca-farm-ppo-sim-gpt4-20k5113.4503427.8091354.3587931131130
ultralm-13b10875.07459011.5392856.464695100928
baize-v2-13b9304.5905459.0547814.464236106107-1
recycled-wizardlm-7b-v1.014946.6327509.2003542.56760489106-17
alpaca-7b_verbose5372.9331028.1557025.2226011181126
alpaca-farm-ppo-human8034.1004279.9540765.8536491081008
vicuna-7b10444.1626118.2633114.100700107111-4
alpaca-7b3962.59145115.39340612.8019551226458
phi-2-sft10683.9775687.6087993.631231109114-5
minichat-3b8683.0071519.3947406.38758911710512
guanaco-33b13115.0024947.5689602.566467101115-14
falcon-40b-instruct6623.3429198.7658655.4229461161088
gemma-2b-it10413.4019716.0022782.600306115120-5
llama-2-7b-chat-hf14794.9613406.7118811.750541102118-16
openbuddy-falcon-7b-v611523.5211746.9788473.457673111116-5
alpaca-7b_concise3511.9911766.6734384.6822611271198
phi-26262.3502103.1053250.755115124126-2
baize-v2-7b11273.4048155.3222341.917420114121-7
chatglm2-6b10272.7621854.5709161.808732121123-2
pythia-12b-mix-sft9132.5780903.3446370.766547123125-2
falcon-7b-instruct4782.1466186.7879614.6413441251178
oasst-sft-pythia-12b7261.7901142.2741660.4840521281280
guanaco-13b17743.4695974.1034340.633838112124-12
guanaco-7b13642.8800024.6449851.764983119122-3
baichuan-13b-chat17271.9921462.4647490.472603126127-1gpt4_1106_preview_verbose240264.30360155.464179-8.839422000
gpt4_1106_preview204950.00000050.0000000.000000110
gpt4_1106_preview_concise113622.92019442.47631619.55612114311
claude-3-opus-20240229138829.04176439.49437910.452615743
gpt4136523.57678936.50348612.9266971275
Qwen1.5-72B-Chat154926.49828337.07241910.574135862
gpt4_0314137122.07325930.9727728.89951315132
claude-3-sonnet-20240229142025.55632535.5168369.960510981
gpt4_0613_verbose147323.23736032.7525699.5152091394
mistral-large-2402136221.43877637.14150615.70273019514
claude-2.1_verbose141424.35407130.6148626.2607911115-4
gpt4_0613114015.75503829.52866213.773624301911
Snorkel-Mistral-PairRM-DPO-best-of-16261634.86013331.504510-3.355623210-8
Contextual-KTO-Mistral-PairRM252133.22735531.198200-2.029155311-8
pairrm-Yi-34B-Chat219531.24128329.895164-1.346119416-12
mistral-medium150021.85577330.8454778.98970416142
claude-2106917.18824029.81395812.62571825178
claude108216.98534431.09017514.104832261214
Yi-34B-Chat212329.65994728.477128-1.182818623-17
Snorkel-Mistral-PairRM-DPO273630.22005327.673988-2.546065526-21
claude-instant-1.2111216.12740028.73048412.60308527216
claude-2.1109615.73350729.56282813.829321311813
xwinlm-70b-v0.1177521.81295725.8250814.0121241728-11
gemini-pro145618.17764524.4491346.2714902331-8
Mixtral-8x7B-Instruct-v0.1146518.25531826.0834147.8280962227-5
evo-v2-7b175420.83411322.7788491.9447362034-14
Mixtral-8x7B-Instruct-v0.1_verbose208324.61406324.503900-0.1101631030-20
Mixtral-8x7B-Instruct-v0.1_concise91013.74404028.43668714.692646412417
gpt-3.5-turbo-16k-0613132814.13239122.6952928.56290236360
gpt-3.5-turbo-0613133114.09579920.4216576.3258583742-5
gpt-3.5-turbo-1106_verbose105812.76317028.25111315.487943452520
gpt4_0613_concise6279.40032128.60299219.202671632241
pairrm-tulu-2-70b160718.63896323.9434145.3044512132-11
tulu-2-dpo-70b141815.98285424.9247998.94194529290
Mistral-7B-ReMax-v0.1147815.99933123.7792787.7799462833-5
gpt-3.5-turbo-11067969.17796521.19791012.019945663927
LMCocktail-10.7B-v1120313.15343122.7167739.56334242357
internlm2-chat-20b-ppo237321.74915520.927878-0.8212771841-23
claude-2.1_concise5739.22712519.0306589.803532654718
gpt-3.5-turbo-03018279.62245321.63443912.011986603822
xwinlm-13b-v0.1189417.42793518.8452621.4173272448-24
deepseek-llm-67b-chat115112.09342221.8888229.795400483711
gpt35_turbo_instruct10188.46244714.9336066.47116070691
wizardlm-70b154514.38389618.4417794.0578833449-15
vicuna-33b-v1.3147912.70594818.2693495.5634014650-4
pairrm-tulu-2-13b145413.83190120.1873666.3554654043-3
Mistral-7B-Instruct-v0.2167614.72277317.5036102.7808373353-20
evo-7b177415.57743717.7027452.1253073252-20
humpback-llama2-70b110710.12177219.4164939.294722584612
OpenHermes-2.5-Mistral-7B110710.34041616.3528796.0124645759-2
deita-7b-v1.0141712.64663919.8553397.20869947443
jina-chat6767.78613021.06302013.276889764036
gpt-3.5-turbo-1106_concise4317.41586529.49073622.074871842064
causallm-14b139111.14616115.2259114.0797505368-15
pairrm-zephyr-7b-beta148712.84127818.0461055.2048274451-7
Starling-LM-7B-alpha189514.24592415.8387491.5928263561-26
llama-2-70b-chat-hf179013.88825816.3172842.4290263860-22
openchat-v3.1-13b148411.08223016.8851995.80296954540
wizardlm-13b-v1.2163512.02748015.5787153.5512344964-15
ultralm-13b-v2.0-best-of-16172013.85337316.7490302.8956573955-16
wizardlm-13b-v1.1152511.23391015.7774304.5435205262-10
zephyr-7b-beta144410.99288614.8246913.8318055570-15
dolphin-2.2.1-mistral-7b11309.03980013.9744274.9346286875-7
humpback-llama-65b12329.42513915.2771495.8520106267-5
openbuddy-llama2-70b-v10.110778.09642213.8790505.7826287476-2
openbuddy-llama-65b-v811628.77065016.4873477.716697695712
Qwen-14B-Chat10137.50233319.46420311.961870814536
gpt4_gamed683.73833744.53237740.7940401122110
cut-13b163710.77908914.0744313.2953425674-18
openchat-v2-w-13b15669.61534414.1064264.4910826173-12
tulu-2-dpo-13b161410.11978815.5217735.4019855965-6
claude2-alpaca-13b11277.43735116.7331079.295755835627
minotaur-13b8815.73896416.47170810.7327441005842
airoboros-65b15129.38895013.0358463.6468966483-19
cohere198312.90145513.1474590.2460044381-38
vicuna-13b-v1.311327.13724014.4678477.330607877215
xwinlm-7b-v0.1189411.24565212.6443081.3986565186-35
airoboros-33b15149.05316012.8401993.7870396784-17
platolm-7b13446.32082811.4432695.1224419395-2
vicuna-13b-v1.510616.7221229.5592312.83710990106-16
gemma-7b-it11156.93729413.7800136.842719887810
openchat-v2-13b15648.43507610.9214022.4863267196-25
zephyr-7b-alpha13028.35266412.3156273.9629637288-16
openbuddy-llama-30b-v7.19686.13001513.5063677.376352967917
ultralm-13b-best-of-16198011.30731512.0371390.7298245090-40
oasst-sft-llama-33b7484.77039112.7422077.9718161068521
wizardlm-13b9855.87815313.8455117.967358987721
nous-hermes-13b8445.41187914.5538509.1419711017130
vicuna-13b10375.83110312.1213406.290237998910
tulu-2-dpo-7b16638.19751513.4755695.2780537380-7
openbuddy-llama2-13b-v11.110576.17471615.6650469.490330956332
ultralm-13b-v2.013997.50462312.3536134.8489908087-7
text_davinci_0012962.7640058.5657685.80176212211111
openbuddy-falcon-40b-v910895.95574313.1408877.185144978215
openchat-13b16328.02238610.8994032.8770177597-22
llama-2-13b-chat-hf15137.70231011.8447174.1424077891-13
guanaco-65b12496.85849511.6532364.7947428992-3
opencoderplus-15b16287.4062229.9277002.52147785103-18
oasst-rlhf-llama-33b10796.29643510.1995043.90306994100-6
openchat8192-13b16647.4727679.6742932.20152682105-23
phi-2-dpo16877.75709610.1029812.34588577101-24
minichat-1.5-3b15456.5534438.4939731.94053092112-20
vicuna-7b-v1.510834.7974949.8315135.0340191051041
llama-2-chat-7b-evol70k-neft16127.60238410.5308302.9284477998-19
recycled-wizardlm-7b-v2.015837.33712910.4381123.1009838699-13
vicuna-7b-v1.311104.64251211.5998646.9573521079314
alpaca-farm-ppo-sim-gpt4-20k5113.4503427.8091354.3587931151150
ultralm-13b10875.07459011.5392856.464695102948
baize-v2-13b9304.5905459.0547814.464236108109-1
recycled-wizardlm-7b-v1.014946.6327509.2003542.56760491108-17
alpaca-7b_verbose5372.9331028.1557025.2226011201146
alpaca-farm-ppo-human8034.1004279.9540765.8536491101028
vicuna-7b10444.1626118.2633114.100700109113-4
alpaca-7b3962.59145115.39340612.8019551246658
phi-2-sft10683.9775687.6087993.631231111116-5
minichat-3b8683.0071519.3947406.38758911910712
guanaco-33b13115.0024947.5689602.566467103117-14
falcon-40b-instruct6623.3429198.7658655.4229461181108
gemma-2b-it10413.4019716.0022782.600306117122-5
llama-2-7b-chat-hf14794.9613406.7118811.750541104120-16
openbuddy-falcon-7b-v611523.5211746.9788473.457673113118-5
alpaca-7b_concise3511.9911766.6734384.6822611291218
phi-26262.3502103.1053250.755115126128-2
baize-v2-7b11273.4048155.3222341.917420116123-7
chatglm2-6b10272.7621854.5709161.808732123125-2
pythia-12b-mix-sft9132.5780903.3446370.766547125127-2
falcon-7b-instruct4782.1466186.7879614.6413441271198
oasst-sft-pythia-12b7261.7901142.2741660.4840521301300
guanaco-13b17743.4695974.1034340.633838114126-12
guanaco-7b13642.8800024.6449851.764983121124-3
baichuan-13b-chat17271.9921462.4647490.472603128129-1
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 17, @@ -2352,18 +2380,18 @@ "# Report for **avg_sigmoid_length_corrected_win_rate**\n", "\n", "## Gameability (lower is better)\n", - "Verbosity gameability (relative std metric): 12.9%\n", - "Conciseness gameability (relative std metric): 16.2%\n", + "Verbosity gameability (relative std metric): 13.2%\n", + "Conciseness gameability (relative std metric): 16.6%\n", "Adversarial winrate gain: 3.6\n", "Adversarial rank gain: 1.0\n", "\n", "## Correlation with Arena (higher is better)\n", - "Spearman Corr: 0.954\n", - "Kendall Corr: 0.830\n", + "Spearman Corr: 0.964\n", + "Kendall Corr: 0.854\n", "\n", - "## Correlation with length (closer to spearman=0.28, kendall=0.19 is better)\n", - "Spearman Corr: 0.260\n", - "Kendall Corr: 0.165\n", + "## Correlation with length (closer to spearman=0.28, kendall=0.20 is better)\n", + "Spearman Corr: 0.268\n", + "Kendall Corr: 0.176\n", "\n", "## Top 10 models\n" ] @@ -2442,1560 +2470,1580 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 avg_lengthwin_ratenew_win_ratedelta_win_raterank_win_raterank_new_win_ratedelta_rankavg_lengthwin_ratenew_win_ratedelta_win_raterank_win_raterank_new_win_ratedelta_rank
gpt4_1106_preview_verbose240264.30360142.502532-21.80106902-2
gpt4_1106_preview204950.00000050.0000000.000000101
gpt4_1106_preview_concise113622.92019439.48161716.5614231349
claude-3-opus-20240229138829.04176445.85777416.816010615
gpt4136523.57678937.58408214.0072931174
Qwen1.5-72B-Chat154926.49828338.74359512.245311752
gpt4_0314137122.07325935.10128613.0280271495
claude-3-sonnet-20240229142025.55632539.80048214.244157835
gpt4_0613_verbose147323.23736035.31502612.0776661284
claude-2.1_verbose141424.35407138.02852913.6744581064
gpt4_0613114015.75503827.10893711.35389928199
Snorkel-Mistral-PairRM-DPO-best-of-16261634.86013316.971563-17.888570248-46
pairrm-Yi-34B-Chat219531.24128326.712191-4.529092321-18
mistral-medium150021.85577332.77864010.92286715105
claude-2106917.18824030.13211812.943878231112
claude108216.98534429.67990112.694557241212
Yi-34B-Chat212329.65994727.469108-2.190839517-12
Snorkel-Mistral-PairRM-DPO273630.22005312.207351-18.012701475-71
claude-instant-1.2111216.12740027.96234511.834945251312
claude-2.1109615.73350727.39421011.660703291811
xwinlm-70b-v0.1177521.81295727.6444965.83153916160
gemini-pro145618.17764527.8490579.67141221156
Mixtral-8x7B-Instruct-v0.1146518.25531827.8497039.59438620146
evo-v2-7b175420.83411326.8078845.9737711820-2
Mixtral-8x7B-Instruct-v0.1_verbose208324.61406323.777507-0.836556926-17
Mixtral-8x7B-Instruct-v0.1_concise91013.74404024.93274811.188708392316
gpt-3.5-turbo-16k-0613132814.13239122.8595438.72715234277
gpt-3.5-turbo-0613133114.09579922.7741448.67834535287
gpt-3.5-turbo-1106_verbose105812.76317022.4349529.671782432914
gpt4_0613_concise6279.40032117.7667518.366430614219
pairrm-tulu-2-70b160718.63896326.3797407.7407771922-3
tulu-2-dpo-70b141815.98285424.9131218.93026727243
Mistral-7B-ReMax-v0.1147815.99933124.2564798.25714826251
gpt-3.5-turbo-11067969.17796516.9711837.793218644915
LMCocktail-10.7B-v1120313.15343122.2158039.062372403010
internlm2-chat-20b-ppo237321.74915514.939076-6.8100791760-43
claude-2.1_concise5739.22712517.5381438.311018634617
gpt-3.5-turbo-03018279.62245317.7076548.085200584315
xwinlm-13b-v0.1189417.42793520.1078372.6799032234-12
deepseek-llm-67b-chat115112.09342220.7441218.650699463313
gpt35_turbo_instruct10188.46244715.0149966.552549685711
wizardlm-70b154514.38389621.0761066.69221032320
vicuna-33b-v1.3147912.70594819.2540766.54812844395
pairrm-tulu-2-13b145413.83190121.2109737.37907238317
Mistral-7B-Instruct-v0.2167614.72277319.9731055.2503323135-4
evo-7b177415.57743719.7564154.1789783036-6
humpback-llama2-70b110710.12177217.5728107.451039564511
OpenHermes-2.5-Mistral-7B110710.34041617.9524077.611992554114
deita-7b-v1.0141712.64663919.7215227.07488245378
jina-chat6767.78613014.6330526.846921746113
gpt-3.5-turbo-1106_concise4317.41586514.2705986.854733826220
causallm-14b139111.14616117.5778406.43167951447
pairrm-zephyr-7b-beta148712.84127819.3833946.54211642384
Starling-LM-7B-alpha189514.24592416.4226152.1766923354-21
llama-2-70b-chat-hf179013.88825817.4069873.5187293647-11
openchat-v3.1-13b148411.08223016.7527625.67053152511
wizardlm-13b-v1.2163512.02748016.7406144.7131344752-5
ultralm-13b-v2.0-best-of-16172013.85337318.2535094.4001353740-3
wizardlm-13b-v1.1152511.23391016.6349635.4010545053-3
zephyr-7b-beta144410.99288616.9356175.94273153503
dolphin-2.2.1-mistral-7b11309.03980015.5974876.557688665610
humpback-llama-65b12329.42513915.7723466.34720660555
openbuddy-llama2-70b-v10.110778.09642214.1653536.06893172648
openbuddy-llama-65b-v811628.77065014.9970126.22636267589
Qwen-14B-Chat10137.50233313.3264225.82408879709
gpt4_gamed683.7383377.3370863.5987491101091
cut-13b163710.77908914.9847614.2056725459-5
openchat-v2-w-13b15669.61534413.9292014.3138575966-7
tulu-2-dpo-13b161410.11978814.2637554.1439665763-6
claude2-alpaca-13b11277.43735112.8431325.40578181729
minotaur-13b8815.73896410.4657464.72678298908
airoboros-65b15129.38895013.9962374.6072876265-3
cohere198312.90145513.7517170.8502624167-26
vicuna-13b-v1.311327.13724012.3079935.170753857312
xwinlm-7b-v0.1189411.24565212.9749011.7292504971-22
airoboros-33b15149.05316013.4819114.4287506569-4
platolm-7b13446.32082810.1609333.8401059196-5
vicuna-13b-v1.510616.72212211.8074635.085341887810
gemma-7b-it11156.93729412.0185395.08124586779
openchat-v2-13b15648.43507612.2328763.7978006974-5
zephyr-7b-alpha13028.35266413.6428725.29020870682
openbuddy-llama-30b-v7.19686.13001510.9946084.86459394859
ultralm-13b-best-of-16198011.30731512.0862840.7789694876-28
oasst-sft-llama-33b7484.7703918.8823744.1119831041004
wizardlm-13b9855.87815310.5053754.62722396897
nous-hermes-13b8445.4118799.9317364.51985799981
vicuna-13b10375.83110310.3011574.47005497943
tulu-2-dpo-7b16638.19751511.2134373.0159217182-11
openbuddy-llama2-13b-v11.110576.17471610.8564724.68175593867
ultralm-13b-v2.013997.50462311.7947914.2901687879-1
text_davinci_0012962.7640055.3669132.6029081201182
openbuddy-falcon-40b-v910895.95574310.3884644.43272295923
openchat-13b16328.02238611.1864163.1640307383-10
llama-2-13b-chat-hf15137.70231011.4760893.7737797680-4
guanaco-65b12496.85849511.4127874.55429387816
opencoderplus-15b16287.40622210.3522142.9459928393-10
oasst-rlhf-llama-33b10796.29643511.0106034.71416892848
openchat8192-13b16647.47276710.2155842.7428188095-15
phi-2-dpo16877.75709610.4486102.6915147591-16
minichat-1.5-3b15456.5534439.6024793.0490369099-9
vicuna-7b-v1.510834.7974948.3809403.5834461031021
llama-2-chat-7b-evol70k-neft16127.60238410.7281393.1257557787-10
recycled-wizardlm-7b-v2.015837.33712910.5285003.1913708488-4
vicuna-7b-v1.311104.6425128.0536553.4111431051050
alpaca-farm-ppo-sim-gpt4-20k5113.4503426.5963083.1459661131112
ultralm-13b10875.0745908.8560113.781421100101-1
baize-v2-13b9304.5905458.2961293.7055841061033
recycled-wizardlm-7b-v1.014946.6327509.9773693.3446198997-8
alpaca-7b_verbose5372.9331025.5942852.6611841181162
alpaca-farm-ppo-human8034.1004277.5741373.4737101081062
vicuna-7b10444.1626117.3415393.178928107108-1
alpaca-7b3962.5914514.9996032.4081521221193
phi-2-sft10683.9775686.9746592.997091109110-1
minichat-3b8683.0071515.4963702.4892191171170
guanaco-33b13115.0024948.1437363.141242101104-3
falcon-40b-instruct6623.3429196.2930752.9501561161124
gemma-2b-it10413.4019716.0042532.6022821151141
llama-2-7b-chat-hf14794.9613407.5182122.556872102107-5
openbuddy-falcon-7b-v611523.5211746.0382292.517055111113-2
alpaca-7b_concise3511.9911763.8532421.8620661271261
phi-26262.3502104.4424202.0922111241231
baize-v2-7b11273.4048155.8795782.474763114115-1
chatglm2-6b10272.7621854.8909452.1287601211201
pythia-12b-mix-sft9132.5780904.6742402.0961491231212
falcon-7b-instruct4782.1466184.1154621.9688451251250
oasst-sft-pythia-12b7261.7901143.3430881.5529741281271
guanaco-13b17743.4695974.4003900.930793112124-12
guanaco-7b13642.8800024.5929131.712911119122-3
baichuan-13b-chat17271.9921462.6123290.620183126128-2gpt4_1106_preview_verbose240264.30360142.502532-21.80106902-2
gpt4_1106_preview204950.00000050.0000000.000000101
gpt4_1106_preview_concise113622.92019439.48161716.56142314410
claude-3-opus-20240229138829.04176445.85777416.816010716
gpt4136523.57678937.58408214.0072931275
Qwen1.5-72B-Chat154926.49828338.74359512.245311853
gpt4_0314137122.07325935.10128613.0280271596
claude-3-sonnet-20240229142025.55632539.80048214.244157936
gpt4_0613_verbose147323.23736035.31502612.0776661385
mistral-large-2402136221.43877634.21738612.77861019109
claude-2.1_verbose141424.35407138.02852913.6744581165
gpt4_0613114015.75503827.10893711.353899302010
Snorkel-Mistral-PairRM-DPO-best-of-16261634.86013316.971563-17.888570250-48
Contextual-KTO-Mistral-PairRM252133.22735518.613504-14.613851341-38
pairrm-Yi-34B-Chat219531.24128326.712191-4.529092422-18
mistral-medium150021.85577332.77864010.92286716115
claude-2106917.18824030.13211812.943878251213
claude108216.98534429.67990112.694557261313
Yi-34B-Chat212329.65994727.469108-2.190839618-12
Snorkel-Mistral-PairRM-DPO273630.22005312.207351-18.012701577-72
claude-instant-1.2111216.12740027.96234511.834945271413
claude-2.1109615.73350727.39421011.660703311912
xwinlm-70b-v0.1177521.81295727.6444965.83153917170
gemini-pro145618.17764527.8490579.67141223167
Mixtral-8x7B-Instruct-v0.1146518.25531827.8497039.59438622157
evo-v2-7b175420.83411326.8078845.9737712021-1
Mixtral-8x7B-Instruct-v0.1_verbose208324.61406323.777507-0.8365561027-17
Mixtral-8x7B-Instruct-v0.1_concise91013.74404024.93274811.188708412417
gpt-3.5-turbo-16k-0613132814.13239122.8595438.72715236288
gpt-3.5-turbo-0613133114.09579922.7741448.67834537298
gpt-3.5-turbo-1106_verbose105812.76317022.4349529.671782453015
gpt4_0613_concise6279.40032117.7667518.366430634419
pairrm-tulu-2-70b160718.63896326.3797407.7407772123-2
tulu-2-dpo-70b141815.98285424.9131218.93026729254
Mistral-7B-ReMax-v0.1147815.99933124.2564798.25714828262
gpt-3.5-turbo-11067969.17796516.9711837.793218665115
LMCocktail-10.7B-v1120313.15343122.2158039.062372423111
internlm2-chat-20b-ppo237321.74915514.939076-6.8100791862-44
claude-2.1_concise5739.22712517.5381438.311018654817
gpt-3.5-turbo-03018279.62245317.7076548.085200604515
xwinlm-13b-v0.1189417.42793520.1078372.6799032435-11
deepseek-llm-67b-chat115112.09342220.7441218.650699483414
gpt35_turbo_instruct10188.46244715.0149966.552549705911
wizardlm-70b154514.38389621.0761066.69221034331
vicuna-33b-v1.3147912.70594819.2540766.54812846406
pairrm-tulu-2-13b145413.83190121.2109737.37907240328
Mistral-7B-Instruct-v0.2167614.72277319.9731055.2503323336-3
evo-7b177415.57743719.7564154.1789783237-5
humpback-llama2-70b110710.12177217.5728107.451039584711
OpenHermes-2.5-Mistral-7B110710.34041617.9524077.611992574314
deita-7b-v1.0141712.64663919.7215227.07488247389
jina-chat6767.78613014.6330526.846921766313
gpt-3.5-turbo-1106_concise4317.41586514.2705986.854733846420
causallm-14b139111.14616117.5778406.43167953467
pairrm-zephyr-7b-beta148712.84127819.3833946.54211644395
Starling-LM-7B-alpha189514.24592416.4226152.1766923556-21
llama-2-70b-chat-hf179013.88825817.4069873.5187293849-11
openchat-v3.1-13b148411.08223016.7527625.67053154531
wizardlm-13b-v1.2163512.02748016.7406144.7131344954-5
ultralm-13b-v2.0-best-of-16172013.85337318.2535094.4001353942-3
wizardlm-13b-v1.1152511.23391016.6349635.4010545255-3
zephyr-7b-beta144410.99288616.9356175.94273155523
dolphin-2.2.1-mistral-7b11309.03980015.5974876.557688685810
humpback-llama-65b12329.42513915.7723466.34720662575
openbuddy-llama2-70b-v10.110778.09642214.1653536.06893174668
openbuddy-llama-65b-v811628.77065014.9970126.22636269609
Qwen-14B-Chat10137.50233313.3264225.82408881729
gpt4_gamed683.7383377.3370863.5987491121111
cut-13b163710.77908914.9847614.2056725661-5
openchat-v2-w-13b15669.61534413.9292014.3138576168-7
tulu-2-dpo-13b161410.11978814.2637554.1439665965-6
claude2-alpaca-13b11277.43735112.8431325.40578183749
minotaur-13b8815.73896410.4657464.726782100928
airoboros-65b15129.38895013.9962374.6072876467-3
cohere198312.90145513.7517170.8502624369-26
vicuna-13b-v1.311327.13724012.3079935.170753877512
xwinlm-7b-v0.1189411.24565212.9749011.7292505173-22
airoboros-33b15149.05316013.4819114.4287506771-4
platolm-7b13446.32082810.1609333.8401059398-5
vicuna-13b-v1.510616.72212211.8074635.085341908010
gemma-7b-it11156.93729412.0185395.08124588799
openchat-v2-13b15648.43507612.2328763.7978007176-5
zephyr-7b-alpha13028.35266413.6428725.29020872702
openbuddy-llama-30b-v7.19686.13001510.9946084.86459396879
ultralm-13b-best-of-16198011.30731512.0862840.7789695078-28
oasst-sft-llama-33b7484.7703918.8823744.1119831061024
wizardlm-13b9855.87815310.5053754.62722398917
nous-hermes-13b8445.4118799.9317364.5198571011001
vicuna-13b10375.83110310.3011574.47005499963
tulu-2-dpo-7b16638.19751511.2134373.0159217384-11
openbuddy-llama2-13b-v11.110576.17471610.8564724.68175595887
ultralm-13b-v2.013997.50462311.7947914.2901688081-1
text_davinci_0012962.7640055.3669132.6029081221202
openbuddy-falcon-40b-v910895.95574310.3884644.43272297943
openchat-13b16328.02238611.1864163.1640307585-10
llama-2-13b-chat-hf15137.70231011.4760893.7737797882-4
guanaco-65b12496.85849511.4127874.55429389836
opencoderplus-15b16287.40622210.3522142.9459928595-10
oasst-rlhf-llama-33b10796.29643511.0106034.71416894868
openchat8192-13b16647.47276710.2155842.7428188297-15
phi-2-dpo16877.75709610.4486102.6915147793-16
minichat-1.5-3b15456.5534439.6024793.04903692101-9
vicuna-7b-v1.510834.7974948.3809403.5834461051041
llama-2-chat-7b-evol70k-neft16127.60238410.7281393.1257557989-10
recycled-wizardlm-7b-v2.015837.33712910.5285003.1913708690-4
vicuna-7b-v1.311104.6425128.0536553.4111431071070
alpaca-farm-ppo-sim-gpt4-20k5113.4503426.5963083.1459661151132
ultralm-13b10875.0745908.8560113.781421102103-1
baize-v2-13b9304.5905458.2961293.7055841081053
recycled-wizardlm-7b-v1.014946.6327509.9773693.3446199199-8
alpaca-7b_verbose5372.9331025.5942852.6611841201182
alpaca-farm-ppo-human8034.1004277.5741373.4737101101082
vicuna-7b10444.1626117.3415393.178928109110-1
alpaca-7b3962.5914514.9996032.4081521241213
phi-2-sft10683.9775686.9746592.997091111112-1
minichat-3b8683.0071515.4963702.4892191191190
guanaco-33b13115.0024948.1437363.141242103106-3
falcon-40b-instruct6623.3429196.2930752.9501561181144
gemma-2b-it10413.4019716.0042532.6022821171161
llama-2-7b-chat-hf14794.9613407.5182122.556872104109-5
openbuddy-falcon-7b-v611523.5211746.0382292.517055113115-2
alpaca-7b_concise3511.9911763.8532421.8620661291281
phi-26262.3502104.4424202.0922111261251
baize-v2-7b11273.4048155.8795782.474763116117-1
chatglm2-6b10272.7621854.8909452.1287601231221
pythia-12b-mix-sft9132.5780904.6742402.0961491251232
falcon-7b-instruct4782.1466184.1154621.9688451271270
oasst-sft-pythia-12b7261.7901143.3430881.5529741301291
guanaco-13b17743.4695974.4003900.930793114126-12
guanaco-7b13642.8800024.5929131.712911121124-3
baichuan-13b-chat17271.9921462.6123290.620183128130-2
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 20, @@ -4110,7 +4158,7 @@ "\n" ], "text/plain": [ - "" + "" ] }, "metadata": {}, @@ -4299,8 +4347,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 16.3 s, sys: 183 ms, total: 16.4 s\n", - "Wall time: 16.4 s\n" + "CPU times: user 16.3 s, sys: 199 ms, total: 16.5 s\n", + "Wall time: 16.5 s\n" ] } ], @@ -4330,17 +4378,17 @@ "\n", "## Gameability (lower is better)\n", "Verbosity gameability (relative std metric): 6.3%\n", - "Conciseness gameability (relative std metric): 13.4%\n", + "Conciseness gameability (relative std metric): 13.6%\n", "Adversarial winrate gain: 8.5\n", "Adversarial rank gain: 45.0\n", "\n", "## Correlation with Arena (higher is better)\n", - "Spearman Corr: 0.971\n", - "Kendall Corr: 0.872\n", + "Spearman Corr: 0.978\n", + "Kendall Corr: 0.893\n", "\n", - "## Correlation with length (closer to spearman=0.28, kendall=0.19 is better)\n", - "Spearman Corr: 0.195\n", - "Kendall Corr: 0.127\n", + "## Correlation with length (closer to spearman=0.28, kendall=0.20 is better)\n", + "Spearman Corr: 0.208\n", + "Kendall Corr: 0.141\n", "\n", "## Top 10 models\n" ] @@ -4350,14 +4398,14 @@ "text/plain": [ "gpt4_1106_preview_verbose 51.575008\n", "gpt4_1106_preview 50.000000\n", - "gpt4_1106_preview_concise 41.896601\n", + "gpt4_1106_preview_concise 41.896602\n", "claude-3-opus-20240229 40.391776\n", "gpt4 38.128090\n", "Qwen1.5-72B-Chat 36.571754\n", "gpt4_0314 35.307061\n", "claude-3-sonnet-20240229 34.872474\n", "gpt4_0613_verbose 33.821267\n", - "claude-2.1_verbose 30.291179\n", + "mistral-large-2402 32.652080\n", "Name: np.tanh(rand_delta_len_std_only) + instruction_difficulty + not_gamed_baseline.astype(float) - 1, dtype: float64" ] }, @@ -4430,1608 +4478,1628 @@ "data": { "text/html": [ "\n", - "\n", + "
\n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
 avg_lengthwin_ratenew_win_ratedelta_win_raterank_win_raterank_new_win_ratedelta_rankavg_lengthwin_ratenew_win_ratedelta_win_raterank_win_raterank_new_win_ratedelta_rank
gpt4_1106_preview_verbose240264.30360151.575008-12.728593000
gpt4_1106_preview204950.00000050.0000000.000000110
gpt4_1106_preview_concise113622.92019441.89660118.97640713211
claude-3-opus-20240229138829.04176440.39177611.350012633
gpt4136523.57678938.12809014.5513001147
Qwen1.5-72B-Chat154926.49828336.57175410.073471752
gpt4_0314137122.07325935.30706113.2338021468
claude-3-sonnet-20240229142025.55632534.8724749.316149871
gpt4_0613_verbose147323.23736033.82126710.5839071284
claude-2.1_verbose141424.35407130.2911795.9371081091
gpt4_0613114015.75503830.18332214.428284281018
Snorkel-Mistral-PairRM-DPO-best-of-16261634.86013329.974322-4.885811211-9
pairrm-Yi-34B-Chat219531.24128328.814841-2.426442312-9
mistral-medium150021.85577328.6143376.75856515132
claude-2106917.18824028.15519610.96695623149
claude108216.98534427.28950410.30416124159
Yi-34B-Chat212329.65994727.190548-2.469399516-11
Snorkel-Mistral-PairRM-DPO273630.22005326.386425-3.833628417-13
claude-instant-1.2111216.12740025.6122599.48485925187
claude-2.1109615.73350725.2519449.518437291910
xwinlm-70b-v0.1177521.81295724.6496862.8367291620-4
gemini-pro145618.17764524.4271056.24946021210
Mixtral-8x7B-Instruct-v0.1146518.25531823.6884835.4331652022-2
evo-v2-7b175420.83411323.3577062.5235931823-5
Mixtral-8x7B-Instruct-v0.1_verbose208324.61406323.223121-1.390942924-15
Mixtral-8x7B-Instruct-v0.1_concise91013.74404022.9626099.218569392514
gpt-3.5-turbo-16k-0613132814.13239122.7201898.58779834268
gpt-3.5-turbo-0613133114.09579922.3525138.25671435278
gpt-3.5-turbo-1106_verbose105812.76317022.0009379.237767432815
gpt4_0613_concise6279.40032121.57799112.177670612932
pairrm-tulu-2-70b160718.63896321.4284042.7894411930-11
tulu-2-dpo-70b141815.98285421.2386105.2557562731-4
Mistral-7B-ReMax-v0.1147815.99933120.5513684.5520362632-6
gpt-3.5-turbo-11067969.17796519.30058910.122624643331
LMCocktail-10.7B-v1120313.15343118.9507105.79727940346
internlm2-chat-20b-ppo237321.74915518.748739-3.0004151735-18
claude-2.1_concise5739.22712518.2084588.981333633627
gpt-3.5-turbo-03018279.62245318.0932428.470788583721
xwinlm-13b-v0.1189417.42793517.9189380.4910032238-16
deepseek-llm-67b-chat115112.09342217.8433845.74996246397
gpt35_turbo_instruct10188.46244717.7213039.258856684028
wizardlm-70b154514.38389617.5750613.1911653241-9
vicuna-33b-v1.3147912.70594817.5745754.86862744422
pairrm-tulu-2-13b145413.83190117.4052043.5733033843-5
Mistral-7B-Instruct-v0.2167614.72277317.1112522.3884793144-13
evo-7b177415.57743716.4893860.9119493045-15
humpback-llama2-70b110710.12177216.2491646.127393564610
OpenHermes-2.5-Mistral-7B110710.34041616.2485785.90816255478
deita-7b-v1.0141712.64663916.0590143.4123744548-3
jina-chat6767.78613015.8660048.079874744925
gpt-3.5-turbo-1106_concise4317.41586515.7695218.353656825032
causallm-14b139111.14616115.7203254.57416451510
pairrm-zephyr-7b-beta148712.84127815.5298672.6885894252-10
Starling-LM-7B-alpha189514.24592414.6904710.4445483353-20
llama-2-70b-chat-hf179013.88825814.6773190.7890613654-18
openchat-v3.1-13b148411.08223014.5033883.4211575255-3
wizardlm-13b-v1.2163512.02748014.4625912.4351104756-9
ultralm-13b-v2.0-best-of-16172013.85337314.1989880.3456143757-20
wizardlm-13b-v1.1152511.23391013.9157212.6818115058-8
zephyr-7b-beta144410.99288613.2031982.2103135359-6
dolphin-2.2.1-mistral-7b11309.03980013.1214784.08167866606
humpback-llama-65b12329.42513912.7998603.3747216061-1
openbuddy-llama2-70b-v10.110778.09642212.5721734.475751726210
openbuddy-llama-65b-v811628.77065012.4693563.69870667634
Qwen-14B-Chat10137.50233312.3787424.876408796415
gpt4_gamed683.73833712.1887648.4504271106545
cut-13b163710.77908912.1547821.3756935466-12
openchat-v2-w-13b15669.61534412.0304282.4150845967-8
tulu-2-dpo-13b161410.11978811.5544791.4346915768-11
claude2-alpaca-13b11277.43735111.4988984.061547816912
minotaur-13b8815.73896411.4588195.719855987028
airoboros-65b15129.38895011.0076421.6186926271-9
cohere198312.90145510.893021-2.0084344172-31
vicuna-13b-v1.311327.13724010.8431653.705925857312
xwinlm-7b-v0.1189411.24565210.812206-0.4334464974-25
airoboros-33b15149.05316010.7190031.6658426575-10
platolm-7b13446.32082810.5328854.212057917615
vicuna-13b-v1.510616.72212210.4844383.762316887711
gemma-7b-it11156.93729410.4257603.48846686788
openchat-v2-13b15648.43507610.3996071.9645326979-10
zephyr-7b-alpha13028.35266410.2897611.9370977080-10
openbuddy-llama-30b-v7.19686.13001510.2144954.084480948113
ultralm-13b-best-of-16198011.3073159.876089-1.4312264882-34
oasst-sft-llama-33b7484.7703919.8664125.0960211048321
wizardlm-13b9855.8781539.8281513.949998968412
nous-hermes-13b8445.4118799.7178634.305984998514
vicuna-13b10375.8311039.2220603.390957978611
tulu-2-dpo-7b16638.1975159.2002661.0027507187-16
openbuddy-llama2-13b-v11.110576.1747169.1590902.98437393885
ultralm-13b-v2.013997.5046239.1290181.6243967889-11
text_davinci_0012962.7640059.0206016.2565961209030
openbuddy-falcon-40b-v910895.9557438.9889363.03319495914
openchat-13b16328.0223868.8060530.7836677392-19
llama-2-13b-chat-hf15137.7023108.4360150.7337057693-17
guanaco-65b12496.8584958.2529171.3944228794-7
opencoderplus-15b16287.4062228.1524100.7461888395-12
oasst-rlhf-llama-33b10796.2964357.9709221.6744879296-4
openchat8192-13b16647.4727677.8970620.4242958097-17
phi-2-dpo16877.7570967.7708950.0137997598-23
minichat-1.5-3b15456.5534437.7016331.1481909099-9
vicuna-7b-v1.510834.7974947.6168932.8193991031003
llama-2-chat-7b-evol70k-neft16127.6023847.533053-0.06933177101-24
recycled-wizardlm-7b-v2.015837.3371297.5216100.18448184102-18
vicuna-7b-v1.311104.6425127.1564612.5139491051032
alpaca-farm-ppo-sim-gpt4-20k5113.4503427.1218083.6714661131049
ultralm-13b10875.0745907.1081912.033601100105-5
baize-v2-13b9304.5905457.0122472.4217021061060
recycled-wizardlm-7b-v1.014946.6327506.9014770.26872789107-18
alpaca-7b_verbose5372.9331026.8184643.88536311810810
alpaca-farm-ppo-human8034.1004276.4186032.318176108109-1
vicuna-7b10444.1626116.2772182.114607107110-3
alpaca-7b3962.5914515.8754873.28403712211111
phi-2-sft10683.9775685.8537881.876220109112-3
minichat-3b8683.0071515.7293332.7221821171134
guanaco-33b13115.0024945.6900190.687525101114-13
falcon-40b-instruct6623.3429195.6075332.2646141161151
gemma-2b-it10413.4019715.5964822.194511115116-1
llama-2-7b-chat-hf14794.9613405.3548210.393482102117-15
openbuddy-falcon-7b-v611523.5211744.8261241.304950111118-7
alpaca-7b_concise3511.9911764.4631082.4719311271198
phi-26262.3502104.3955482.0453381241204
baize-v2-7b11273.4048154.3825650.977750114121-7
chatglm2-6b10272.7621854.3592831.597098121122-1
pythia-12b-mix-sft9132.5780904.2213621.6432721231230
falcon-7b-instruct4782.1466184.0369381.8903201251241
oasst-sft-pythia-12b7261.7901143.2701021.4799881281253
guanaco-13b17743.4695973.003787-0.465810112126-14
guanaco-7b13642.8800022.871117-0.008885119127-8
baichuan-13b-chat17271.9921462.0621700.070025126128-2gpt4_1106_preview_verbose240264.30360151.575008-12.728593000
gpt4_1106_preview204950.00000050.0000000.000000110
gpt4_1106_preview_concise113622.92019441.89660218.97640714212
claude-3-opus-20240229138829.04176440.39177611.350012734
gpt4136523.57678938.12809014.5513001248
Qwen1.5-72B-Chat154926.49828336.57175410.073471853
gpt4_0314137122.07325935.30706113.2338021569
claude-3-sonnet-20240229142025.55632534.8724749.316149972
gpt4_0613_verbose147323.23736033.82126710.5839071385
mistral-large-2402136221.43877632.65208011.21330419910
claude-2.1_verbose141424.35407130.2911795.93710811101
gpt4_0613114015.75503830.18332214.428284301119
Snorkel-Mistral-PairRM-DPO-best-of-16261634.86013329.974322-4.885811212-10
Contextual-KTO-Mistral-PairRM252133.22735529.705809-3.521546313-10
pairrm-Yi-34B-Chat219531.24128328.814841-2.426442414-10
mistral-medium150021.85577328.6143376.75856516151
claude-2106917.18824028.15519610.96695625169
claude108216.98534427.28950410.30416126179
Yi-34B-Chat212329.65994727.190548-2.469399618-12
Snorkel-Mistral-PairRM-DPO273630.22005326.386425-3.833628519-14
claude-instant-1.2111216.12740025.6122599.48485927207
claude-2.1109615.73350725.2519449.518437312110
xwinlm-70b-v0.1177521.81295724.6496862.8367291722-5
gemini-pro145618.17764524.4271056.24946023230
Mixtral-8x7B-Instruct-v0.1146518.25531823.6884835.4331652224-2
evo-v2-7b175420.83411323.3577062.5235932025-5
Mixtral-8x7B-Instruct-v0.1_verbose208324.61406323.223121-1.3909421026-16
Mixtral-8x7B-Instruct-v0.1_concise91013.74404022.9626099.218569412714
gpt-3.5-turbo-16k-0613132814.13239122.7201898.58779936288
gpt-3.5-turbo-0613133114.09579922.3525138.25671437298
gpt-3.5-turbo-1106_verbose105812.76317022.0009379.237767453015
gpt4_0613_concise6279.40032121.57799112.177670633132
pairrm-tulu-2-70b160718.63896321.4284042.7894412132-11
tulu-2-dpo-70b141815.98285421.2386105.2557562933-4
Mistral-7B-ReMax-v0.1147815.99933120.5513684.5520362834-6
gpt-3.5-turbo-11067969.17796519.30058910.122625663531
LMCocktail-10.7B-v1120313.15343118.9507105.79727942366
internlm2-chat-20b-ppo237321.74915518.748740-3.0004151837-19
claude-2.1_concise5739.22712518.2084588.981333653827
gpt-3.5-turbo-03018279.62245318.0932428.470788603921
xwinlm-13b-v0.1189417.42793517.9189380.4910032440-16
deepseek-llm-67b-chat115112.09342217.8433845.74996248417
gpt35_turbo_instruct10188.46244717.7213039.258856704228
wizardlm-70b154514.38389617.5750613.1911653443-9
vicuna-33b-v1.3147912.70594817.5745754.86862746442
pairrm-tulu-2-13b145413.83190117.4052043.5733034045-5
Mistral-7B-Instruct-v0.2167614.72277317.1112522.3884793346-13
evo-7b177415.57743716.4893860.9119493247-15
humpback-llama2-70b110710.12177216.2491646.127393584810
OpenHermes-2.5-Mistral-7B110710.34041616.2485785.90816257498
deita-7b-v1.0141712.64663916.0590143.4123744750-3
jina-chat6767.78613015.8660048.079874765125
gpt-3.5-turbo-1106_concise4317.41586515.7695218.353656845232
causallm-14b139111.14616115.7203254.57416453530
pairrm-zephyr-7b-beta148712.84127815.5298672.6885894454-10
Starling-LM-7B-alpha189514.24592414.6904710.4445483555-20
llama-2-70b-chat-hf179013.88825814.6773190.7890613856-18
openchat-v3.1-13b148411.08223014.5033883.4211575457-3
wizardlm-13b-v1.2163512.02748014.4625912.4351104958-9
ultralm-13b-v2.0-best-of-16172013.85337314.1989880.3456143959-20
wizardlm-13b-v1.1152511.23391013.9157212.6818115260-8
zephyr-7b-beta144410.99288613.2031982.2103135561-6
dolphin-2.2.1-mistral-7b11309.03980013.1214784.08167868626
humpback-llama-65b12329.42513912.7998603.3747216263-1
openbuddy-llama2-70b-v10.110778.09642212.5721734.475751746410
openbuddy-llama-65b-v811628.77065012.4693563.69870669654
Qwen-14B-Chat10137.50233312.3787424.876408816615
gpt4_gamed683.73833712.1887648.4504271126745
cut-13b163710.77908912.1547821.3756935668-12
openchat-v2-w-13b15669.61534412.0304282.4150846169-8
tulu-2-dpo-13b161410.11978811.5544791.4346915970-11
claude2-alpaca-13b11277.43735111.4988984.061547837112
minotaur-13b8815.73896411.4588195.7198551007228
airoboros-65b15129.38895011.0076421.6186926473-9
cohere198312.90145510.893021-2.0084344374-31
vicuna-13b-v1.311327.13724010.8431653.705925877512
xwinlm-7b-v0.1189411.24565210.812206-0.4334465176-25
airoboros-33b15149.05316010.7190031.6658426777-10
platolm-7b13446.32082810.5328854.212057937815
vicuna-13b-v1.510616.72212210.4844383.762316907911
gemma-7b-it11156.93729410.4257603.48846688808
openchat-v2-13b15648.43507610.3996071.9645327181-10
zephyr-7b-alpha13028.35266410.2897611.9370977282-10
openbuddy-llama-30b-v7.19686.13001510.2144954.084480968313
ultralm-13b-best-of-16198011.3073159.876089-1.4312265084-34
oasst-sft-llama-33b7484.7703919.8664125.0960211068521
wizardlm-13b9855.8781539.8281513.949998988612
nous-hermes-13b8445.4118799.7178634.3059841018714
vicuna-13b10375.8311039.2220603.390957998811
tulu-2-dpo-7b16638.1975159.2002661.0027507389-16
openbuddy-llama2-13b-v11.110576.1747169.1590902.98437395905
ultralm-13b-v2.013997.5046239.1290181.6243968091-11
text_davinci_0012962.7640059.0206016.2565961229230
openbuddy-falcon-40b-v910895.9557438.9889373.03319497934
openchat-13b16328.0223868.8060530.7836677594-19
llama-2-13b-chat-hf15137.7023108.4360150.7337057895-17
guanaco-65b12496.8584958.2529171.3944228996-7
opencoderplus-15b16287.4062228.1524100.7461888597-12
oasst-rlhf-llama-33b10796.2964357.9709221.6744879498-4
openchat8192-13b16647.4727677.8970620.4242958299-17
phi-2-dpo16877.7570967.7708950.01379977100-23
minichat-1.5-3b15456.5534437.7016331.14819092101-9
vicuna-7b-v1.510834.7974947.6168932.8193991051023
llama-2-chat-7b-evol70k-neft16127.6023847.533053-0.06933179103-24
recycled-wizardlm-7b-v2.015837.3371297.5216100.18448186104-18
vicuna-7b-v1.311104.6425127.1564612.5139491071052
alpaca-farm-ppo-sim-gpt4-20k5113.4503427.1218083.6714661151069
ultralm-13b10875.0745907.1081912.033601102107-5
baize-v2-13b9304.5905457.0122472.4217021081080
recycled-wizardlm-7b-v1.014946.6327506.9014770.26872791109-18
alpaca-7b_verbose5372.9331026.8184643.88536312011010
alpaca-farm-ppo-human8034.1004276.4186032.318176110111-1
vicuna-7b10444.1626116.2772182.114607109112-3
alpaca-7b3962.5914515.8754873.28403712411311
phi-2-sft10683.9775685.8537881.876220111114-3
minichat-3b8683.0071515.7293332.7221821191154
guanaco-33b13115.0024945.6900190.687525103116-13
falcon-40b-instruct6623.3429195.6075332.2646141181171
gemma-2b-it10413.4019715.5964822.194511117118-1
llama-2-7b-chat-hf14794.9613405.3548210.393482104119-15
openbuddy-falcon-7b-v611523.5211744.8261241.304950113120-7
alpaca-7b_concise3511.9911764.4631082.4719311291218
phi-26262.3502104.3955482.0453381261224
baize-v2-7b11273.4048154.3825650.977750116123-7
chatglm2-6b10272.7621854.3592831.597098123124-1
pythia-12b-mix-sft9132.5780904.2213621.6432721251250
falcon-7b-instruct4782.1466184.0369381.8903201271261
oasst-sft-pythia-12b7261.7901143.2701021.4799881301273
guanaco-13b17743.4695973.003787-0.465810114128-14
guanaco-7b13642.8800022.871117-0.008885121129-8
baichuan-13b-chat17271.9921462.0621700.070025128130-2
\n" ], "text/plain": [ - "" + "" ] }, "execution_count": 27, @@ -6053,7 +6121,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 28, "id": "dc3b8c58-a686-47b2-a4dd-d9e6cc8089f2", "metadata": {}, "outputs": [], @@ -6094,7 +6162,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 30, "id": "4ba44847-8e03-486c-bb39-2148a7f2cbf0", "metadata": {}, "outputs": [ @@ -6104,7 +6172,7 @@ "0.4972910406584219" ] }, - "execution_count": 34, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -6125,7 +6193,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 31, "id": "e5482374-61a4-409d-9993-bac7c79a3c9d", "metadata": {}, "outputs": [], @@ -6135,7 +6203,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 32, "id": "dd2e8f6a-f702-46ad-b24f-e705b8a686ec", "metadata": {}, "outputs": [ @@ -6144,19 +6212,19 @@ "text/plain": [ "gpt4_1106_preview_verbose 1264.947899\n", "gpt4_1106_preview 1254.000000\n", - "gpt4_1106_preview_concise 1197.190903\n", + "gpt4_1106_preview_concise 1197.190905\n", "claude-3-opus-20240229 1186.394707\n", - "gpt4 1169.900608\n", + "gpt4 1169.900609\n", " ... \n", - "falcon-7b-instruct 703.579173\n", + "falcon-7b-instruct 703.579174\n", "oasst-sft-pythia-12b 665.600234\n", "guanaco-13b 650.365762\n", "guanaco-7b 642.280987\n", - "baichuan-13b-chat 583.349617\n", - "Name: np.tanh(rand_delta_len_std_only) + instruction_difficulty + not_gamed_baseline.astype(float) - 1, Length: 129, dtype: float64" + "baichuan-13b-chat 583.349616\n", + "Name: np.tanh(rand_delta_len_std_only) + instruction_difficulty + not_gamed_baseline.astype(float) - 1, Length: 131, dtype: float64" ] }, - "execution_count": 44, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -6167,17 +6235,17 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 33, "id": "c3cc7204-d14c-4d40-828f-2c0ebc9c90a8", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "PearsonRResult(statistic=0.9551930453734696, pvalue=6.035381315868855e-18)" + "PearsonRResult(statistic=0.956960109042506, pvalue=7.650929551873686e-20)" ] }, - "execution_count": 45, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -6229,6 +6297,14 @@ "\n", "$$min_{\\mathbf{w'}_l[m], \\mathbf{w'}_{x}[m], \\mathbf{w'}_{m}[m]} \\sum_i \\mathcal{L}_i(logistic( \\mathbf{w'}_l[m] * tanh(standardized(length(m(x_i)) - length(b(x_i)))) + \\mathbf{w'}_x[m] * embedding(x_i) + \\mathbf{w}'_m[m] ) + \\lambda' \\mathcal{L}_i(logistic( \\mathbf{w'}_l[m] * tanh(standardized(length(b'(x_i)) - length(b(x_i))))$$\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ebb64fb-4bbd-4abb-ad89-99812f21d60a", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/alpaca_eval/constants.py b/src/alpaca_eval/constants.py index 7cbacf2e..cf617bf3 100644 --- a/src/alpaca_eval/constants.py +++ b/src/alpaca_eval/constants.py @@ -206,26 +206,29 @@ def ALPACAFARM_GOLD_ANNOTATIONS(): # maps models to Arena Elo rating CHATBOT_ARENA_LEADERBOARD = { "gpt4_1106_preview": 1252, - "gpt4_0314": 1190, - "gpt4_0613": 1162, - "mistral-medium": 1150, - "claude": 1149, - "claude-2": 1132, - "claude-2.1": 1119, - "Mixtral-8x7B-Instruct-v0.1": 1118, - "gpt-3.5-turbo-0613": 1118, + "claude-3-opus-20240229": 1233, + "gpt4_0314": 1185, + "claude-3-sonnet-20240229": 1180, + "gpt4_0613": 1161, + "mistral-large-2402": 1155, + "mistral-medium": 1147, + "claude": 1146, + "claude-2": 1127, + "claude-2.1": 1117, + "Mixtral-8x7B-Instruct-v0.1": 1116, + "gpt-3.5-turbo-0613": 1115, "Yi-34B-Chat": 1115, - "gemini-pro": 1114, - "claude-instant-1.2": 1109, - "gpt-3.5-turbo-0301": 1105, - "wizardlm-70b": 1105, - "tulu-2-dpo-70b": 1104, - "vicuna-33b-v1.3": 1093, - "Starling-LM-7B-alpha": 1090, + "gemini-pro": 1112, + "claude-instant-1.2": 1105, + "gpt-3.5-turbo-0301": 1103, + "wizardlm-70b": 1103, + "tulu-2-dpo-70b": 1099, + "vicuna-33b-v1.3": 1090, + "Starling-LM-7B-alpha": 1085, "deepseek-llm-67b-chat": 1082, "llama-2-70b-chat-hf": 1082, - "OpenHermes-2.5-Mistral-7B": 1078, - "gpt-3.5-turbo-1106": 1071, + "OpenHermes-2.5-Mistral-7B": 1073, + "gpt-3.5-turbo-1106": 1069, "dolphin-2.2.1-mistral-7b": 1065, "wizardlm-13b-v1.2": 1058, "zephyr-7b-beta": 1051, diff --git a/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 86b639a0..413db3e8 100644 --- a/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/src/alpaca_eval/leaderboards/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -8,9 +8,11 @@ Qwen1.5-72B-Chat,26.49828339562733,1.304236164893057,201,600,4,805,25.2173913043 gpt4_0314,22.073258928708075,1.2466725494608204,172,627,6,805,21.73913043478261,verified,1371,35.30706121640206 claude-3-sonnet-20240229,25.556325292273296,1.3419811051815638,193,608,4,805,24.22360248447205,verified,1420,34.87247436243302 gpt4_0613_verbose,23.237360043453418,1.283539505582624,171,630,4,805,21.490683229813666,dev,1473,33.82126688658535 +mistral-large-2402,21.43877598137888,1.2485232545097724,166,638,1,805,20.6832298136646,verified,1362,32.65207998531868 claude-2.1_verbose,24.35407109006212,1.293586209982439,191,613,1,805,23.7888198757764,dev,1414,30.29117916664986 gpt4_0613,15.75503808763975,1.0754642482396215,117,684,4,805,14.782608695652174,verified,1140,30.18332231673423 Snorkel-Mistral-PairRM-DPO-best-of-16,34.8601328912795,1.3599450436840308,270,533,2,805,33.66459627329193,community,2616,29.974321613074405 +Contextual-KTO-Mistral-PairRM,33.227355200024846,1.3779687477923963,260,544,1,805,32.36024844720497,verified,2521,29.705808939683976 pairrm-Yi-34B-Chat,31.24128294680746,1.34824373994879,239,563,3,805,29.87577639751553,community,2195,28.81484086684313 mistral-medium,21.855772543652176,1.2682402187223842,164,639,2,805,20.496894409937887,minimal,1500,28.614337401726104 claude-2,17.188240356708075,1.17482825615589,131,673,1,805,16.335403726708076,minimal,1069,28.155196141629148 @@ -124,7 +126,7 @@ baize-v2-7b,3.404814977515528,0.5826293992489878,26,779,0,805,3.229813664596273, chatglm2-6b,2.7621847964596284,0.5020758950625489,19,781,5,805,2.670807453416149,community,1027,4.35928292679035 pythia-12b-mix-sft,2.5780902809689445,0.5127326717340586,19,786,0,805,2.360248447204969,verified,913,4.221361861408184 falcon-7b-instruct,2.146617553167702,0.454225792894195,16,787,2,805,2.111801242236025,verified,478,4.036937566812824 -oasst-sft-pythia-12b,1.790114083180124,0.39855808830493417,13,790,2,805,1.7391304347826086,verified,726,3.2701021144567473 +oasst-sft-pythia-12b,1.790114083180124,0.3985580883049341,13,790,2,805,1.7391304347826086,verified,726,3.270102114456748 guanaco-13b,3.469596859739131,0.5518606725700214,22,780,3,805,2.919254658385093,verified,1774,3.003787329611614 guanaco-7b,2.880002266173913,0.5202924149314048,21,783,1,805,2.670807453416149,verified,1364,2.871116813131697 baichuan-13b-chat,1.9921455615279504,0.4176985079331233,14,790,1,805,1.8012422360248446,community,1727,2.062170253598568