Skip to content

Commit

Permalink
Merge branch 'main' into main
Browse files Browse the repository at this point in the history
  • Loading branch information
YannDubs authored Oct 12, 2023
2 parents 956ae74 + b91c179 commit 862d075
Show file tree
Hide file tree
Showing 6 changed files with 14,635 additions and 125 deletions.
125 changes: 63 additions & 62 deletions docs/alpaca_eval_gpt4_leaderboard.csv

Large diffs are not rendered by default.

9,662 changes: 9,662 additions & 0 deletions results/platolm-7b/annotations.json

Large diffs are not rendered by default.

4,832 changes: 4,832 additions & 0 deletions results/platolm-7b/model_outputs.json

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,64 +1,65 @@
,win_rate,standard_error,n_wins,n_wins_base,n_draws,n_total,mode,avg_length
xwinlm-70b-v0.1,95.56803995006244,0.7249419256764628,765,35,1,801,community,1775.0
gpt4,95.27950310559004,0.716281440286153,761,32,12,805,minimal,1365.0
llama-2-70b-chat-hf,92.66169154228857,0.911762258320568,743,57,4,804,minimal,1790.0
ultralm-13b-v2.0-best-of-16,92.29813664596274,0.9402998068253294,743,62,0,805,community,1720.0
xwinlm-13b-v0.1,91.76029962546816,0.9681394385222166,734,65,2,801,community,1894.0
ultralm-13b-best-of-16,91.54228855721394,0.981927769109018,736,68,0,804,community,1980.0
claude-2,91.35572139303484,0.9897323784630048,734,69,1,804,minimal,1069.0
openchat-v3.1-13b,89.49004975124379,1.076875474505156,718,83,3,804,community,1484.0
chatgpt,89.36567164179104,1.0789487022114888,716,83,5,804,minimal,827.0
wizardlm-13b-v1.2,89.1656288916563,1.0904254662753898,714,85,4,803,community,1635.0
vicuna-33b-v1.3,88.99253731343283,1.095692216068168,713,86,5,804,verified,1479.0
claude,88.38509316770187,1.1144875403283188,707,89,9,805,minimal,1082.0
humpback-llama2-70b,87.93532338308458,1.1545476754393662,706,96,2,804,community,1822.0
xwinlm-7b-v0.1,87.82771535580525,1.154308695894137,703,97,1,801,community,1894.0
openbuddy-llama2-70b-v10.1,87.67123287671232,1.1508417516577765,701,96,6,803,community,1077.0
openchat-v2-w-13b,87.1268656716418,1.1769197439396015,699,102,3,804,community,1566.0
openbuddy-llama-65b-v8,86.53366583541147,1.2029182403474274,693,107,2,802,community,1162.0
wizardlm-13b-v1.1,86.31840796019901,1.2063217831272972,692,108,4,804,community,1525.0
cohere,85.0560398505604,1.2558329840021718,682,119,2,803,community,1715.0
openchat-v2-13b,84.96894409937889,1.2572979835605944,683,120,2,805,community,1564.0
humpback-llama-65b,83.70646766169155,1.3071034735987248,672,130,2,804,community,1269.0
ultralm-13b-v2.0,83.60248447204968,1.30578174546824,673,132,0,805,community,1399.0
vicuna-13b-v1.3,82.11180124223603,1.348769957803504,660,143,2,805,verified,1132.0
gpt35_turbo_instruct,81.7103620474407,1.3306133328057392,642,134,25,801,community,1018.0
openbuddy-llama-30b-v7.1,81.54613466334165,1.370658000946423,654,148,0,802,community,968.0
llama-2-13b-chat-hf,81.09452736318407,1.3817573087734825,652,152,0,804,minimal,1513.0
openchat-13b,80.8695652173913,1.3843738653129234,650,153,2,805,community,1632.0
openbuddy-falcon-40b-v9,80.69738480697384,1.3908517976873225,647,154,2,803,community,1089.0
ultralm-13b,80.63511830635119,1.3939556917204066,647,155,1,803,community,1087.0
openchat8192-13b,79.53980099502488,1.4222439886269744,639,164,1,804,community,1664.0
evo-7b,79.20298879202988,1.4222487749194896,632,163,8,803,community,1774.0
opencoderplus-15b,78.69565217391305,1.440029529188432,632,170,3,805,community,1628.0
openbuddy-llama2-13b-v11.1,77.48756218905473,1.4712754099058205,622,180,2,804,community,1057.0
vicuna-7b-v1.3,76.8414481897628,1.487520320531845,614,184,3,801,verified,1110.0
wizardlm-13b,75.31094527363184,1.5101858292160824,601,194,9,804,minimal,985.0
jina-chat,74.12718204488779,1.541070307435577,592,205,5,802,community,676.0
airoboros-65b,73.91304347826086,1.5285333061227804,587,202,16,805,community,1512.0
airoboros-33b,73.29192546583852,1.55290318216736,587,212,6,805,community,1514.0
guanaco-65b,71.80124223602485,1.586912361158523,578,227,0,805,minimal,1249.0
llama-2-7b-chat-hf,71.36645962732919,1.593038654706019,574,230,1,805,minimal,1479.0
vicuna-13b,70.43478260869566,1.6069688407799696,566,237,2,805,minimal,1037.0
openbuddy-falcon-7b-v6,70.36114570361146,1.612538056786233,565,238,0,803,community,1152.0
baize-v2-13b,66.95652173913044,1.6565358231309506,538,265,2,805,community,930.0
oasst-rlhf-llama-33b,66.52173913043478,1.6608288428292477,534,268,3,805,minimal,1079.0
minotaur-13b,66.02484472049689,1.6645545328264226,529,271,5,805,community,881.0
guanaco-33b,65.96273291925466,1.67108537053247,531,274,0,805,verified,1311.0
nous-hermes-13b,65.46583850931677,1.669962276077284,524,275,6,805,verified,844.0
vicuna-7b,64.40993788819875,1.6851107260487883,517,285,3,805,verified,1044.0
baize-v2-7b,63.85093167701863,1.6945981855442178,514,291,0,805,community,1127.0
oasst-sft-llama-33b,54.96894409937888,1.7402667933686875,436,356,13,805,verified,748.0
guanaco-13b,52.60869565217391,1.7576690299699242,422,380,3,805,verified,1774.0
text_davinci_003,50.0,0.0,0,0,805,805,minimal,307.0
chatglm2-6b,47.12858926342072,1.7593143221324448,375,421,5,801,community,1027.0
guanaco-7b,46.58385093167702,1.7570464905413992,374,429,2,805,verified,1364.0
falcon-40b-instruct,45.71428571428572,1.7524717060805597,366,435,4,805,minimal,662.0
alpaca-farm-ppo-sim-gpt4-20k,44.099378881987576,1.7399772578861137,350,445,10,805,verified,511.0
pythia-12b-mix-sft,41.86335403726708,1.737637146007538,336,467,2,805,verified,913.0
alpaca-farm-ppo-human,41.24223602484472,1.7271813123250834,328,469,8,805,minimal,803.0
alpaca-7b,26.459627329192543,1.535711469748,205,584,16,805,minimal,396.0
oasst-sft-pythia-12b,25.962732919254663,1.5261079289535309,201,588,16,805,verified,726.0
falcon-7b-instruct,23.60248447204969,1.4898235369056625,187,612,6,805,verified,478.0
baichuan-13b-chat,21.801242236024844,1.4495247592518703,173,627,5,805,community,1727.0
text_davinci_001,15.17412935323383,1.235107892276849,112,672,20,804,minimal,296.0
xwinlm-70b-v0.1,95.56803995,0.724941926,765,35,1,801,community,1775
gpt4,95.27950311,0.71628144,761,32,12,805,minimal,1365
llama-2-70b-chat-hf,92.66169154,0.911762258,743,57,4,804,minimal,1790
ultralm-13b-v2.0-best-of-16,92.29813665,0.940299807,743,62,0,805,community,1720
xwinlm-13b-v0.1,91.76029963,0.968139439,734,65,2,801,community,1894
ultralm-13b-best-of-16,91.54228856,0.981927769,736,68,0,804,community,1980
claude-2,91.35572139,0.989732378,734,69,1,804,minimal,1069
openchat-v3.1-13b,89.49004975,1.076875475,718,83,3,804,community,1484
chatgpt,89.36567164,1.078948702,716,83,5,804,minimal,827
wizardlm-13b-v1.2,89.16562889,1.090425466,714,85,4,803,community,1635
vicuna-33b-v1.3,88.99253731,1.095692216,713,86,5,804,verified,1479
claude,88.38509317,1.11448754,707,89,9,805,minimal,1082
humpback-llama2-70b,87.93532338,1.154547675,706,96,2,804,community,1822
xwinlm-7b-v0.1,87.82771536,1.154308696,703,97,1,801,community,1894
openbuddy-llama2-70b-v10.1,87.67123288,1.150841752,701,96,6,803,community,1077
openchat-v2-w-13b,87.12686567,1.176919744,699,102,3,804,community,1566
openbuddy-llama-65b-v8,86.53366584,1.20291824,693,107,2,802,community,1162
wizardlm-13b-v1.1,86.31840796,1.206321783,692,108,4,804,community,1525
cohere,85.05603985,1.255832984,682,119,2,803,community,1715
openchat-v2-13b,84.9689441,1.257297984,683,120,2,805,community,1564
humpback-llama-65b,83.70646766,1.307103474,672,130,2,804,community,1269
ultralm-13b-v2.0,83.60248447,1.305781745,673,132,0,805,community,1399
vicuna-13b-v1.3,82.11180124,1.348769958,660,143,2,805,verified,1132
platolm-7b,81.94271482,1.35256737,656,143,4,803,community,1344
gpt35_turbo_instruct,81.71036205,1.330613333,642,134,25,801,community,1018
openbuddy-llama-30b-v7.1,81.54613466,1.370658001,654,148,0,802,community,968
llama-2-13b-chat-hf,81.09452736,1.381757309,652,152,0,804,minimal,1513
openchat-13b,80.86956522,1.384373865,650,153,2,805,community,1632
openbuddy-falcon-40b-v9,80.69738481,1.390851798,647,154,2,803,community,1089
ultralm-13b,80.63511831,1.393955692,647,155,1,803,community,1087
openchat8192-13b,79.539801,1.422243989,639,164,1,804,community,1664
evo-7b,79.20298879202988,1.4222487749194896,632,163,8,803,community,1774
opencoderplus-15b,78.69565217,1.440029529,632,170,3,805,community,1628
openbuddy-llama2-13b-v11.1,77.48756219,1.47127541,622,180,2,804,community,1057
vicuna-7b-v1.3,76.84144819,1.487520321,614,184,3,801,verified,1110
wizardlm-13b,75.31094527,1.510185829,601,194,9,804,minimal,985
jina-chat,74.12718204,1.541070307,592,205,5,802,community,676
airoboros-65b,73.91304348,1.528533306,587,202,16,805,community,1512
airoboros-33b,73.29192547,1.552903182,587,212,6,805,community,1514
guanaco-65b,71.80124224,1.586912361,578,227,0,805,minimal,1249
llama-2-7b-chat-hf,71.36645963,1.593038655,574,230,1,805,minimal,1479
vicuna-13b,70.43478261,1.606968841,566,237,2,805,minimal,1037
openbuddy-falcon-7b-v6,70.3611457,1.612538057,565,238,0,803,community,1152
baize-v2-13b,66.95652174,1.656535823,538,265,2,805,community,930
oasst-rlhf-llama-33b,66.52173913,1.660828843,534,268,3,805,minimal,1079
minotaur-13b,66.02484472,1.664554533,529,271,5,805,community,881
guanaco-33b,65.96273292,1.671085371,531,274,0,805,verified,1311
nous-hermes-13b,65.46583851,1.669962276,524,275,6,805,verified,844
vicuna-7b,64.40993789,1.685110726,517,285,3,805,verified,1044
baize-v2-7b,63.85093168,1.694598186,514,291,0,805,community,1127
oasst-sft-llama-33b,54.9689441,1.740266793,436,356,13,805,verified,748
guanaco-13b,52.60869565,1.75766903,422,380,3,805,verified,1774
text_davinci_003,50,0,0,0,805,805,minimal,307
chatglm2-6b,47.12858926,1.759314322,375,421,5,801,community,1027
guanaco-7b,46.58385093,1.757046491,374,429,2,805,verified,1364
falcon-40b-instruct,45.71428571,1.752471706,366,435,4,805,minimal,662
alpaca-farm-ppo-sim-gpt4-20k,44.09937888,1.739977258,350,445,10,805,verified,511
pythia-12b-mix-sft,41.86335404,1.737637146,336,467,2,805,verified,913
alpaca-farm-ppo-human,41.24223602,1.727181312,328,469,8,805,minimal,803
alpaca-7b,26.45962733,1.53571147,205,584,16,805,minimal,396
oasst-sft-pythia-12b,25.96273292,1.526107929,201,588,16,805,verified,726
falcon-7b-instruct,23.60248447,1.489823537,187,612,6,805,verified,478
baichuan-13b-chat,21.80124224,1.449524759,173,627,5,805,community,1727
text_davinci_001,15.17412935,1.235107892,112,672,20,804,minimal,296
13 changes: 13 additions & 0 deletions src/alpaca_eval/models_configs/platolm-7b/configs.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# AlpacaEval model configuration for PlatoLM 7B.
# NOTE(review): the nesting below was reconstructed from a flattened copy and
# follows the layout used by other alpaca_eval model configs — confirm that the
# generation parameters belong under completions_kwargs, not model_kwargs.
platolm-7b:
  # Prompt template file used to wrap each instruction.
  prompt_template: "platolm-7b/prompt.txt"
  # Name of the completion function that runs the model.
  fn_completions: "huggingface_local_completions"
  # Keyword arguments for the completion function.
  completions_kwargs:
    model_name: "FreedomIntelligence/PlatoLM-7b"
    # Keyword arguments for model loading.
    model_kwargs:
      torch_dtype: 'bfloat16'
    # Generation / sampling settings.
    max_new_tokens: 2048
    temperature: 0.7
    top_p: 1.0
    do_sample: true
  # Display name and reference link for the leaderboard entry.
  pretty_name: "PlatoLM 7B"
  link: "https://huggingface.co/FreedomIntelligence/PlatoLM-7B"
1 change: 1 addition & 0 deletions src/alpaca_eval/models_configs/platolm-7b/prompt.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n Human: {instruction} <\s> Assistant:

0 comments on commit 862d075

Please sign in to comment.