diff --git a/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv b/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv index 18abe6d3..32065a1d 100644 --- a/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv +++ b/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv @@ -36,6 +36,7 @@ Claude 3 Opus (02/29),40.5095080124761,29.10526953334248,1388,,https://github.co Infinity-Instruct-7M-Gen-mistral-7B,39.66949964831439,34.347412485016434,1742,https://huggingface.co/BAAI/Infinity-Instruct-7M-Gen-mistral-7B,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Infinity-Instruct-7M-Gen-mistral-7B/model_outputs.json,community Llama 3.1 405B Instruct,39.25732749961743,39.10666895419877,1988,https://huggingface.co/meta-llama/Meta-Llama-3.1-405B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Meta-Llama-3.1-405B-Instruct-Turbo/model_outputs.json,minimal SPPO-Llama-3-Instruct-8B-PairRM,38.56280663670214,39.67286090605648,2066,https://huggingface.co/UCLA-AGI/Llama-3-Instruct-8B-SPPO-Iter3,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/SPPO-Llama-3-Instruct-8B-PairRM/model_outputs.json,community +GPO-Llama-3-8B-Instruct-GPM-2B,38.4334071653788,48.87200127423316,2613,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/GPO-Llama-3-8B-Instruct-GPM-2B/model_outputs.json,community GPT-4,38.12808974440021,23.576789314782605,1365,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4/model_outputs.json,verified Qwen2 72B Instruct,38.07461345451606,29.8527557752399,1626,https://huggingface.co/Qwen/Qwen2-72B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Qwen2-72B-Instruct/model_outputs.json,verified Llama 3.1 70B Instruct,38.05512453607286,39.12691443804968,2044,https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Meta-Llama-3.1-70B-Instruct-Turbo/model_outputs.json,minimal @@ -43,6 +44,7 @@ Infinity-Instruct-3M-0625-Llama3-70B,37.97881098506053,24.277231851026183,1294,h Aligner 2B+Qwen1.5 72B Chat,36.725868878524274,31.773037737123104,1812,https://github.com/AlignInc/aligner-replication,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/aligner-2b_qwen1.5-72b-chat/model_outputs.json,community Qwen1.5 72B Chat,36.571754111987296,26.49828339562733,1549,https://huggingface.co/Qwen/Qwen1.5-72B-Chat,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Qwen1.5-72B-Chat/model_outputs.json,verified GPT-4 (03/14),35.30706121640206,22.073258928708075,1371,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/gpt4_0314/model_outputs.json,verified +SPPO-Llama-3-8B-Instruct-GPM-2B,35.30471134991328,45.44098127183851,2490,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/SPPO-Llama-3-8B-Instruct-GPM-2B/model_outputs.json,community Ein 70B v0.1,35.029054008520646,24.84472049689441,1467,https://huggingface.co/SF-Foundation/EinBase-70B-v0.1-full,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/Ein-70B-v0.1/model_outputs.json,community Claude 3 Sonnet (02/29),34.87247436243302,25.556325292273296,1420,,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/claude-3-sonnet-20240229/model_outputs.json,minimal FsfairX-Zephyr-Chat-v0.1,34.78744762311656,35.94648644102434,2275,https://huggingface.co/sfairXC/FsfairX-Zephyr-Chat-v0.1,https://github.com/tatsu-lab/alpaca_eval/blob/main/results/FsfairX-Zephyr-Chat-v0.1/model_outputs.json,community