Muennighoff commited on
Commit
5443e66
·
1 Parent(s): b851397

Merge eval

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. 4b284b12bc4/evaluation/generation/merged.csv +53 -0
  2. 4b284b12bc4/evaluation/generation/merged.json +1 -0
  3. 4b284b12bc4/evaluation/rankeval/4b284b12bc4_0.csv +21 -0
  4. 4b284b12bc4/evaluation/rankeval/4b284b12bc4_1.csv +21 -0
  5. 4b284b12bc4/evaluation/rankeval/4b284b12bc4_2.csv +21 -0
  6. 4b284b12bc4/evaluation/rankeval/4b284b12bc4_3.csv +21 -0
  7. 4b284b12bc4/evaluation/rankeval/4b284b12bc4_4.csv +21 -0
  8. 4b284b12bc4/evaluation/rankeval/4b284b12bc4_5.csv +21 -0
  9. 4b284b17bc4/evaluation/generation/merged.csv +53 -0
  10. 4b284b17bc4/evaluation/generation/merged.json +1 -0
  11. 4b284b17bc4/evaluation/rankeval/4b284b17bc4_0.csv +21 -0
  12. 4b284b17bc4/evaluation/rankeval/4b284b17bc4_1.csv +21 -0
  13. 4b284b17bc4/evaluation/rankeval/4b284b17bc4_2.csv +21 -0
  14. 4b284b17bc4/evaluation/rankeval/4b284b17bc4_3.csv +21 -0
  15. 4b284b17bc4/evaluation/rankeval/4b284b17bc4_4.csv +21 -0
  16. 4b284b17bc4/evaluation/rankeval/4b284b17bc4_5.csv +21 -0
  17. 4b284b21bc4/evaluation/generation/merged.csv +53 -0
  18. 4b284b21bc4/evaluation/generation/merged.json +1 -0
  19. 4b284b21bc4/evaluation/rankeval/4b284b21bc4_0.csv +21 -0
  20. 4b284b21bc4/evaluation/rankeval/4b284b21bc4_1.csv +21 -0
  21. 4b284b21bc4/evaluation/rankeval/4b284b21bc4_2.csv +21 -0
  22. 4b284b21bc4/evaluation/rankeval/4b284b21bc4_3.csv +21 -0
  23. 4b284b21bc4/evaluation/rankeval/4b284b21bc4_4.csv +21 -0
  24. 4b284b21bc4/evaluation/rankeval/4b284b21bc4_5.csv +21 -0
  25. 4b284b28bc4/evaluation/generation/merged.csv +53 -0
  26. 4b284b28bc4/evaluation/generation/merged.json +1 -0
  27. 4b284b28bc4/evaluation/rankeval/4b284b28bc4_0.csv +21 -0
  28. 4b284b28bc4/evaluation/rankeval/4b284b28bc4_1.csv +21 -0
  29. 4b284b28bc4/evaluation/rankeval/4b284b28bc4_2.csv +21 -0
  30. 4b284b28bc4/evaluation/rankeval/4b284b28bc4_3.csv +21 -0
  31. 4b284b28bc4/evaluation/rankeval/4b284b28bc4_4.csv +21 -0
  32. 4b284b28bc4/evaluation/rankeval/4b284b28bc4_5.csv +21 -0
  33. 4b284b42bc4/evaluation/generation/merged.csv +53 -0
  34. 4b284b42bc4/evaluation/generation/merged.json +1 -0
  35. 4b284b42bc4/evaluation/rankeval/4b284b42bc4_0.csv +21 -0
  36. 4b284b42bc4/evaluation/rankeval/4b284b42bc4_1.csv +21 -0
  37. 4b284b42bc4/evaluation/rankeval/4b284b42bc4_2.csv +21 -0
  38. 4b284b42bc4/evaluation/rankeval/4b284b42bc4_3.csv +21 -0
  39. 4b284b42bc4/evaluation/rankeval/4b284b42bc4_4.csv +21 -0
  40. 4b284b42bc4/evaluation/rankeval/4b284b42bc4_5.csv +21 -0
  41. 4b284b84bc4/evaluation/4b284b84bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json +0 -87
  42. 4b284b84bc4/evaluation/4b284b84bc4_1_lm-eval_global_step80108_2023-01-30-11-26-40_1shots_backup.json +0 -87
  43. 4b284b84bc4/evaluation/4b284b84bc4_2_lm-eval_global_step80108_2023-01-30-11-26-40_2shots_backup.json +0 -87
  44. 4b284b84bc4/evaluation/4b284b84bc4_3_lm-eval_global_step80108_2023-01-30-11-26-40_3shots_backup.json +0 -87
  45. 4b284b84bc4/evaluation/4b284b84bc4_4_lm-eval_global_step80108_2023-01-30-11-26-40_4shots_backup.json +0 -87
  46. 4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json +0 -87
  47. 4b284b84bc4/evaluation/generation/merged.csv +53 -0
  48. 4b284b84bc4/evaluation/generation/merged.json +1 -0
  49. 4b284b84bc4/evaluation/rankeval/4b284b84bc4_0.csv +21 -0
  50. 4b284b84bc4/evaluation/{4b284b84bc4_0.json → rankeval/4b284b84bc4_0.json} +0 -0
4b284b12bc4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.00024104025657346095
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.00024104025657346095
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1714205638298909
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1714205638298909
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.19259169221915515
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.19259169221915515
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.19636018570824587
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.19636018570824587
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.19590832872090894
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.19590832872090894
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19402158147865167
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19402158147865167
14
+ e2e_nlg_cleaned,5,average,multiple,0.15842389870223766
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.01730052045113504
16
+ gem_xsum,0,median,rouge2_fmeasure,0.01730052045113504
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.019132118327200527
18
+ gem_xsum,1,median,rouge2_fmeasure,0.019132118327200527
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.026545543337132424
20
+ gem_xsum,2,median,rouge2_fmeasure,0.026545543337132424
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03291830334125208
22
+ gem_xsum,3,median,rouge2_fmeasure,0.03291830334125208
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010104068388385765
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010104068388385765
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.00041371259854665804
26
+ gem_xsum,5,median,rouge2_fmeasure,0.00041371259854665804
27
+ gem_xsum,5,average,multiple,0.017735711073942083
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.0532813862747049
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.0532813862747049
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05368591058094131
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.05368591058094131
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.05344291957030947
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.05344291957030947
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05368996382308088
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05368996382308088
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0515680827205002
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.0515680827205002
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05107734688924233
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05107734688924233
40
+ web_nlg_en,5,average,multiple,0.052790934976463186
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.002874313185982406
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.002874313185982406
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.028190707681194575
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.028190707681194575
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.04456119604899187
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.04456119604899187
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.03887583188926559
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.03887583188926559
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013407675922368708
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013407675922368708
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0020845828252393957
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0020845828252393957
53
+ wiki_lingua_en,5,average,multiple,0.021665717925507092
4b284b12bc4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4070835356827751, "bleu_stderr": 0.03514958095848397, "rouge1_fmeasure": 0.11509298027342854, "rouge1_fmeasure_stderr": 0.002040147114373331, "rouge1_precision": 0.0758536616906455, "rouge1_precision_stderr": 0.0015747064380670645, "rouge1_recall": 0.3264375465319237, "rouge1_recall_stderr": 0.004888854445231445, "rouge2_fmeasure": 0.0532813862747049, "rouge2_fmeasure_stderr": 0.0012579627803205211, "rouge2_precision": 0.03493638633069714, "rouge2_precision_stderr": 0.0009342574915112234, "rouge2_recall": 0.15766160622381195, "rouge2_recall_stderr": 0.0033114573324024405, "rougeL_fmeasure": 0.1105412242108245, "rougeL_fmeasure_stderr": 0.0019072286738988954, "rougeL_precision": 0.07257604824526195, "rougeL_precision_stderr": 0.0014483785678009685, "rougeL_recall": 0.31637706878833355, "rougeL_recall_stderr": 0.004769735504597033, "rougeLsum_fmeasure": 0.10843545057843905, "rougeLsum_fmeasure_stderr": 0.0019083088150967664, "rougeLsum_precision": 0.0714774644843108, "rougeLsum_precision_stderr": 0.0014699104009543759, "rougeLsum_recall": 0.307939556685913, "rougeLsum_recall_stderr": 0.004520814685280998}}, "1": {"PALM_prompt": {"bleu": 0.41914858834195134, "bleu_stderr": 0.030279335876129, "rouge1_fmeasure": 0.11424698089656772, "rouge1_fmeasure_stderr": 0.001973221738343803, "rouge1_precision": 0.07536633674836868, "rouge1_precision_stderr": 0.001620641410096321, "rouge1_recall": 0.3290768382699901, "rouge1_recall_stderr": 0.00481767508183653, "rouge2_fmeasure": 0.05368591058094131, "rouge2_fmeasure_stderr": 0.0012551880063213156, "rouge2_precision": 0.03540467062379218, "rouge2_precision_stderr": 0.001074817084017668, "rouge2_recall": 0.16089821041540717, "rouge2_recall_stderr": 0.0033011630774406127, "rougeL_fmeasure": 0.10991123942051419, "rougeL_fmeasure_stderr": 0.0018557651460448018, "rougeL_precision": 0.07231503158237214, "rougeL_precision_stderr": 0.0015163361416883465, "rougeL_recall": 0.3189205930522712, "rougeL_recall_stderr": 0.004694857387684187, "rougeLsum_fmeasure": 0.1082043807305256, "rougeLsum_fmeasure_stderr": 0.0018480349337665876, "rougeLsum_precision": 0.07148579673935408, "rougeLsum_precision_stderr": 0.0015357817111525064, "rougeLsum_recall": 0.3110112645350247, "rougeLsum_recall_stderr": 0.00441643475943137}}, "2": {"PALM_prompt": {"bleu": 0.4241874936612034, "bleu_stderr": 0.03699728854949305, "rouge1_fmeasure": 0.11375522621692136, "rouge1_fmeasure_stderr": 0.0019642936162507533, "rouge1_precision": 0.07469786641233617, "rouge1_precision_stderr": 0.0015771153206732972, "rouge1_recall": 0.32891693541469197, "rouge1_recall_stderr": 0.004751520151482175, "rouge2_fmeasure": 0.05344291957030947, "rouge2_fmeasure_stderr": 0.001233885317072834, "rouge2_precision": 0.03462695918297652, "rouge2_precision_stderr": 0.0009079391487918842, "rouge2_recall": 0.16210166248343671, "rouge2_recall_stderr": 0.003411098262587952, "rougeL_fmeasure": 0.10934081987838024, "rougeL_fmeasure_stderr": 0.0018415550723111455, "rougeL_precision": 0.07161852924412807, "rougeL_precision_stderr": 0.0014761860684115284, "rougeL_recall": 0.31797917629392425, "rougeL_recall_stderr": 0.004598849198704314, "rougeLsum_fmeasure": 0.10823991385374933, "rougeLsum_fmeasure_stderr": 0.0018380668821100924, "rougeLsum_precision": 0.0711214967812572, "rougeLsum_precision_stderr": 0.00149377360051449, "rougeLsum_recall": 0.3130870814045286, "rougeLsum_recall_stderr": 0.004421211065212564}}, "3": {"PALM_prompt": {"bleu": 0.3916994292697065, "bleu_stderr": 0.02655023153261868, "rouge1_fmeasure": 0.11443103117633296, "rouge1_fmeasure_stderr": 0.0019845366723218495, "rouge1_precision": 0.07713695315738618, "rouge1_precision_stderr": 0.0018521617901133295, "rouge1_recall": 0.32641437991318506, "rouge1_recall_stderr": 0.004583689746653368, "rouge2_fmeasure": 0.05368996382308088, "rouge2_fmeasure_stderr": 0.0012403567348119643, "rouge2_precision": 0.036319480745632425, "rouge2_precision_stderr": 0.0012079439649413412, "rouge2_recall": 0.15985213856119682, "rouge2_recall_stderr": 0.003223582265695079, "rougeL_fmeasure": 0.10920281437234497, "rougeL_fmeasure_stderr": 0.0018472429946517301, "rougeL_precision": 0.07333561421491072, "rougeL_precision_stderr": 0.0017170202297610163, "rougeL_recall": 0.3129899723170166, "rougeL_recall_stderr": 0.004403504671395443, "rougeLsum_fmeasure": 0.10882089385505, "rougeLsum_fmeasure_stderr": 0.0018674885337082484, "rougeLsum_precision": 0.07332584684616669, "rougeLsum_precision_stderr": 0.0017553031622823821, "rougeLsum_recall": 0.31070372160179327, "rougeLsum_recall_stderr": 0.004314602758278112}}, "4": {"PALM_prompt": {"bleu": 0.37875018794247045, "bleu_stderr": 0.024296780304434905, "rouge1_fmeasure": 0.1102139471634063, "rouge1_fmeasure_stderr": 0.0019620155943445507, "rouge1_precision": 0.07231813177556075, "rouge1_precision_stderr": 0.0015118246585830762, "rouge1_recall": 0.31870699434574523, "rouge1_recall_stderr": 0.0046463072458484975, "rouge2_fmeasure": 0.0515680827205002, "rouge2_fmeasure_stderr": 0.001213141047008391, "rouge2_precision": 0.033695317164630666, "rouge2_precision_stderr": 0.0009163247572691914, "rouge2_recall": 0.15554105747469235, "rouge2_recall_stderr": 0.003236397171744527, "rougeL_fmeasure": 0.1054595308766645, "rougeL_fmeasure_stderr": 0.0018290764447497754, "rougeL_precision": 0.06901574750871842, "rougeL_precision_stderr": 0.0013947803448898716, "rougeL_recall": 0.30638147232578594, "rougeL_recall_stderr": 0.004472401624140904, "rougeLsum_fmeasure": 0.10492767242713451, "rougeLsum_fmeasure_stderr": 0.001836832016012516, "rougeLsum_precision": 0.06885871876103705, "rougeLsum_precision_stderr": 0.001420315845279487, "rougeLsum_recall": 0.30338100654345734, "rougeLsum_recall_stderr": 0.0043561963135752306}}, "5": {"PALM_prompt": {"bleu": 0.3689406693649318, "bleu_stderr": 0.01833284872989782, "rouge1_fmeasure": 0.10942321553706275, "rouge1_fmeasure_stderr": 0.001960578009336271, "rouge1_precision": 0.0725733890131515, "rouge1_precision_stderr": 0.0016541722828599028, "rouge1_recall": 0.31647681542290346, "rouge1_recall_stderr": 0.004649887369574888, "rouge2_fmeasure": 0.05107734688924233, "rouge2_fmeasure_stderr": 0.00122669666906548, "rouge2_precision": 0.0340511038621137, "rouge2_precision_stderr": 0.001109810266658632, "rouge2_recall": 0.1539259113242296, "rouge2_recall_stderr": 0.003295678214536681, "rougeL_fmeasure": 0.10453686352175766, "rougeL_fmeasure_stderr": 0.0018279045748061345, "rougeL_precision": 0.06920155517233274, "rougeL_precision_stderr": 0.0015510532870385023, "rougeL_recall": 0.30372126675172995, "rougeL_recall_stderr": 0.004472388579309833, "rougeLsum_fmeasure": 0.1036182486745027, "rougeLsum_fmeasure_stderr": 0.0018321897232056268, "rougeLsum_precision": 0.0688242602910231, "rougeLsum_precision_stderr": 0.0015744817424633028, "rougeLsum_recall": 0.3000075619025587, "rougeLsum_recall_stderr": 0.004328528641012373}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 0.14859459498800928, "bleu_stderr": 0.019538924284114197, "rouge1_fmeasure": 0.05739499438745971, "rouge1_fmeasure_stderr": 0.000959404112224303, "rouge1_precision": 0.0505257980847339, "rouge1_precision_stderr": 0.0009316535384306269, "rouge1_recall": 0.07944574030730557, "rouge1_recall_stderr": 0.0013436416711421135, "rouge2_fmeasure": 0.002874313185982406, "rouge2_fmeasure_stderr": 0.00022791640812011725, "rouge2_precision": 0.0025522430280765624, "rouge2_precision_stderr": 0.00019188137819214202, "rouge2_recall": 0.003889530091650386, "rouge2_recall_stderr": 0.0003502492712853139, "rougeL_fmeasure": 0.053008425140295905, "rougeL_fmeasure_stderr": 0.0008201296289562219, "rougeL_precision": 0.04641940622808299, "rougeL_precision_stderr": 0.0007892141876172595, "rougeL_recall": 0.07403750267734954, "rougeL_recall_stderr": 0.0011941764542527037, "rougeLsum_fmeasure": 0.05476968826480647, "rougeLsum_fmeasure_stderr": 0.0008953065390907387, "rougeLsum_precision": 0.04816018092226108, "rougeLsum_precision_stderr": 0.0008696938078640268, "rougeLsum_recall": 0.07599978162074517, "rougeLsum_recall_stderr": 0.0012632037879506343}}, "1": {"tldr_en": {"bleu": 1.346869321685321, "bleu_stderr": 0.06607443264134674, "rouge1_fmeasure": 0.15836293018887368, "rouge1_fmeasure_stderr": 0.001891640433790272, "rouge1_precision": 0.13584035826185337, "rouge1_precision_stderr": 0.001883003830967405, "rouge1_recall": 0.2300655738827389, "rouge1_recall_stderr": 0.002771815067524841, "rouge2_fmeasure": 0.028190707681194575, "rouge2_fmeasure_stderr": 0.0008118715649523407, "rouge2_precision": 0.02400583408187123, "rouge2_precision_stderr": 0.0007285343955308211, "rouge2_recall": 0.04289680599794199, "rouge2_recall_stderr": 0.001353605133009988, "rougeL_fmeasure": 0.12130392597137107, "rougeL_fmeasure_stderr": 0.0012906736654645788, "rougeL_precision": 0.10298984208878467, "rougeL_precision_stderr": 0.0012746961770117027, "rougeL_recall": 0.18009351587124983, "rougeL_recall_stderr": 0.002104050870387258, "rougeLsum_fmeasure": 0.14788907195330203, "rougeLsum_fmeasure_stderr": 0.0017520956258023405, "rougeLsum_precision": 0.12668865175777289, "rougeLsum_precision_stderr": 0.0017429813042225584, "rougeLsum_recall": 0.21558088869876474, "rougeLsum_recall_stderr": 0.0026000978058887433}}, "2": {"tldr_en": {"bleu": 2.247794388992107, "bleu_stderr": 0.09928029909737168, "rouge1_fmeasure": 0.1979943409707515, "rouge1_fmeasure_stderr": 0.0019701979235594003, "rouge1_precision": 0.1708173947979992, "rouge1_precision_stderr": 0.002081500885534108, "rouge1_recall": 0.2857546859749413, "rouge1_recall_stderr": 0.002766073063501205, "rouge2_fmeasure": 0.04456119604899187, "rouge2_fmeasure_stderr": 0.0009779372383836055, "rouge2_precision": 0.03867526801746755, "rouge2_precision_stderr": 0.0009249837176721441, "rouge2_recall": 0.06488659439579164, "rouge2_recall_stderr": 0.0015151297174771778, "rougeL_fmeasure": 0.14561448339286645, "rougeL_fmeasure_stderr": 0.0013308339769298708, "rougeL_precision": 0.12436000788073821, "rougeL_precision_stderr": 0.0013936495773448447, "rougeL_recall": 0.21512580022359384, "rougeL_recall_stderr": 0.0021392819398317084, "rougeLsum_fmeasure": 0.18443888049277404, "rougeLsum_fmeasure_stderr": 0.0018323545176278458, "rougeLsum_precision": 0.15896932268065497, "rougeLsum_precision_stderr": 0.0019322450604166179, "rougeLsum_recall": 0.2668532631896298, "rougeLsum_recall_stderr": 0.00261075911429361}}, "3": {"tldr_en": {"bleu": 2.212268753332442, "bleu_stderr": 0.09749124513916169, "rouge1_fmeasure": 0.17165885454358776, "rouge1_fmeasure_stderr": 0.002266523062316873, "rouge1_precision": 0.15395480397824707, "rouge1_precision_stderr": 0.002402542953049042, "rouge1_recall": 0.24557521515724243, "rouge1_recall_stderr": 0.0032199609208037362, "rouge2_fmeasure": 0.03887583188926559, "rouge2_fmeasure_stderr": 0.0009318869486006191, "rouge2_precision": 0.0345849314932535, "rouge2_precision_stderr": 0.000925086814362582, "rouge2_recall": 0.05685215254790676, "rouge2_recall_stderr": 0.0014741611375388177, "rougeL_fmeasure": 0.12611107179368627, "rougeL_fmeasure_stderr": 0.0015753873989903184, "rougeL_precision": 0.11262382269704937, "rougeL_precision_stderr": 0.001735467609168603, "rougeL_recall": 0.1853949845751863, "rougeL_recall_stderr": 0.002495030067359919, "rougeLsum_fmeasure": 0.15920866146470747, "rougeLsum_fmeasure_stderr": 0.0021025779963518188, "rougeLsum_precision": 0.1426484686620457, "rougeLsum_precision_stderr": 0.0022377962576445335, "rougeLsum_recall": 0.22876101619014125, "rougeLsum_recall_stderr": 0.0030299996388814796}}, "4": {"tldr_en": {"bleu": 0.5159665881377578, "bleu_stderr": 0.0355848354666848, "rouge1_fmeasure": 0.057194516095400834, "rouge1_fmeasure_stderr": 0.0019532780600384314, "rouge1_precision": 0.05323367871163294, "rouge1_precision_stderr": 0.002019764759552398, "rouge1_recall": 0.08477832633294664, "rouge1_recall_stderr": 0.0029256767843087337, "rouge2_fmeasure": 0.013407675922368708, "rouge2_fmeasure_stderr": 0.0006687476239191229, "rouge2_precision": 0.011892613439611567, "rouge2_precision_stderr": 0.0006525291351364055, "rouge2_recall": 0.021014952711012305, "rouge2_recall_stderr": 0.0011420293722952714, "rougeL_fmeasure": 0.0429641799328658, "rougeL_fmeasure_stderr": 0.0014362645716637073, "rougeL_precision": 0.03994947969161068, "rougeL_precision_stderr": 0.0015323136750776513, "rougeL_recall": 0.0654114763710059, "rougeL_recall_stderr": 0.0022890377474177525, "rougeLsum_fmeasure": 0.052823275351703086, "rougeLsum_fmeasure_stderr": 0.0018085153746161442, "rougeLsum_precision": 0.049203521589357486, "rougeLsum_precision_stderr": 0.0018745212624263795, "rougeLsum_recall": 0.07828616324493042, "rougeLsum_recall_stderr": 0.00270998239625019}}, "5": {"tldr_en": {"bleu": 5.133528491740168e-07, "bleu_stderr": 9.288876136024227e-07, "rouge1_fmeasure": 0.00885537771521418, "rouge1_fmeasure_stderr": 0.0008474145806800235, "rouge1_precision": 0.008397509125114434, "rouge1_precision_stderr": 0.0008676037544483993, "rouge1_recall": 0.013182384245950918, "rouge1_recall_stderr": 0.0012488579545023588, "rouge2_fmeasure": 0.0020845828252393957, "rouge2_fmeasure_stderr": 0.0002776449859686965, "rouge2_precision": 0.0018519121687661717, "rouge2_precision_stderr": 0.0002652232008037664, "rouge2_recall": 0.003201546871291623, "rouge2_recall_stderr": 0.00042951400653203656, "rougeL_fmeasure": 0.006795329450745382, "rougeL_fmeasure_stderr": 0.0006430172034977336, "rougeL_precision": 0.006312806712827651, "rougeL_precision_stderr": 0.0006339300731280119, "rougeL_recall": 0.010466170637582555, "rougeL_recall_stderr": 0.0010119772330203227, "rougeLsum_fmeasure": 0.008112137370163754, "rougeLsum_fmeasure_stderr": 0.0007719985459243486, "rougeLsum_precision": 0.0076894646083944945, "rougeLsum_precision_stderr": 0.000786099271409283, "rougeLsum_recall": 0.012227793486074013, "rougeLsum_recall_stderr": 0.0011695312324965957}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.02601889547824242, "bleu_stderr": 0.008845990174481217, "rouge1_fmeasure": 0.016342850382717815, "rouge1_fmeasure_stderr": 0.00034912236129064063, "rouge1_precision": 0.013188289488289925, "rouge1_precision_stderr": 0.0003020110162685144, "rouge1_recall": 0.02331912229418711, "rouge1_recall_stderr": 0.0005487407492538043, "rouge2_fmeasure": 0.00024104025657346095, "rouge2_fmeasure_stderr": 5.3010616942847675e-05, "rouge2_precision": 0.00018882696164487857, "rouge2_precision_stderr": 4.0143613745290394e-05, "rouge2_recall": 0.0003925072247489244, "rouge2_recall_stderr": 9.925690772584578e-05, "rougeL_fmeasure": 0.016342850382717815, "rougeL_fmeasure_stderr": 0.00034912236129064063, "rougeL_precision": 0.013188289488289925, "rougeL_precision_stderr": 0.0003020110162685144, "rougeL_recall": 0.02331912229418711, "rougeL_recall_stderr": 0.0005487407492538043, "rougeLsum_fmeasure": 0.015820908294670494, "rougeLsum_fmeasure_stderr": 0.00033227991604236137, "rougeLsum_precision": 0.012772945572946016, "rougeLsum_precision_stderr": 0.000289959796992291, "rougeLsum_recall": 0.022577540598345845, "rougeLsum_recall_stderr": 0.0005237847970514886}}, "1": {"generate_text_restaurant": {"bleu": 10.11194167971178, "bleu_stderr": 0.11325409958385195, "rouge1_fmeasure": 0.396708212066539, "rouge1_fmeasure_stderr": 0.0023271634805327288, "rouge1_precision": 0.44662580344767955, "rouge1_precision_stderr": 0.0029145470980427935, "rouge1_recall": 0.403325138761631, "rouge1_recall_stderr": 0.0029812540605776657, "rouge2_fmeasure": 0.1714205638298909, "rouge2_fmeasure_stderr": 0.0017391694364972787, "rouge2_precision": 0.19849995481108837, "rouge2_precision_stderr": 0.0023301034372897886, "rouge2_recall": 0.17476709268624893, "rouge2_recall_stderr": 0.0019834376005625296, "rougeL_fmeasure": 0.28469304359904984, "rougeL_fmeasure_stderr": 0.0018043411811584805, "rougeL_precision": 0.3259862976774872, "rougeL_precision_stderr": 0.0025491424613272398, "rougeL_recall": 0.28928330619087117, "rougeL_recall_stderr": 0.002329063084463768, "rougeLsum_fmeasure": 0.3323030796627126, "rougeLsum_fmeasure_stderr": 0.002111384569072981, "rougeLsum_precision": 0.37806486217321694, "rougeLsum_precision_stderr": 0.002787004218176944, "rougeLsum_recall": 0.3367501905387613, "rougeLsum_recall_stderr": 0.002641205675752919}}, "2": {"generate_text_restaurant": {"bleu": 10.765851233592166, "bleu_stderr": 0.1147637687545087, "rouge1_fmeasure": 0.4194853660757534, "rouge1_fmeasure_stderr": 0.002222220598248002, "rouge1_precision": 0.5029109698404333, "rouge1_precision_stderr": 0.0032326251122724503, "rouge1_recall": 0.399072749631299, "rouge1_recall_stderr": 0.002770754484292409, "rouge2_fmeasure": 0.19259169221915515, "rouge2_fmeasure_stderr": 0.0018423181170468268, "rouge2_precision": 0.23608052397188775, "rouge2_precision_stderr": 0.002567411684370418, "rouge2_recall": 0.18324767332759007, "rouge2_recall_stderr": 0.001989894281113984, "rougeL_fmeasure": 0.3027130154416743, "rougeL_fmeasure_stderr": 0.001900798692043736, "rougeL_precision": 0.3660346353864222, "rougeL_precision_stderr": 0.0028533265797324394, "rougeL_recall": 0.28763635547225785, "rougeL_recall_stderr": 0.002249079446710024, "rougeLsum_fmeasure": 0.34512069878778673, "rougeLsum_fmeasure_stderr": 0.00213278119755698, "rougeLsum_precision": 0.4152150772992765, "rougeLsum_precision_stderr": 0.0030631524445173153, "rougeLsum_recall": 0.32804484742896467, "rougeLsum_recall_stderr": 0.002520109021821381}}, "3": {"generate_text_restaurant": {"bleu": 10.504414399066166, "bleu_stderr": 0.14144404460789148, "rouge1_fmeasure": 0.42179412730593185, "rouge1_fmeasure_stderr": 0.0021527795098671134, "rouge1_precision": 0.509958754752616, "rouge1_precision_stderr": 0.003154730928607417, "rouge1_recall": 0.3954899202585752, "rouge1_recall_stderr": 0.002608601302312822, "rouge2_fmeasure": 0.19636018570824587, "rouge2_fmeasure_stderr": 0.0018014064871279597, "rouge2_precision": 0.24242098353513022, "rouge2_precision_stderr": 0.002484157206893866, "rouge2_recall": 0.18360253637850166, "rouge2_recall_stderr": 0.001904297402323581, "rougeL_fmeasure": 0.30275824983777616, "rougeL_fmeasure_stderr": 0.0018031245552577217, "rougeL_precision": 0.36777401786978037, "rougeL_precision_stderr": 0.002648283289980144, "rougeL_recall": 0.28409076327843025, "rougeL_recall_stderr": 0.002115488452238855, "rougeLsum_fmeasure": 0.3452111361624014, "rougeLsum_fmeasure_stderr": 0.0020448977561221436, "rougeLsum_precision": 0.4185549498251625, "rougeLsum_precision_stderr": 0.0029283489596877298, "rougeLsum_recall": 0.32342169868683124, "rougeLsum_recall_stderr": 0.0023588979812351725}}, "4": {"generate_text_restaurant": {"bleu": 10.453461006006084, "bleu_stderr": 0.20323399299325623, "rouge1_fmeasure": 0.4184745538314975, "rouge1_fmeasure_stderr": 0.002152424221911221, "rouge1_precision": 0.5152975398825912, "rouge1_precision_stderr": 0.0032876121566522126, "rouge1_recall": 0.3875757012647283, "rouge1_recall_stderr": 0.002563136912882847, "rouge2_fmeasure": 0.19590832872090894, "rouge2_fmeasure_stderr": 0.0017695553874619732, "rouge2_precision": 0.24767323400172267, "rouge2_precision_stderr": 0.002578983235298861, "rouge2_recall": 0.1802765312255684, "rouge2_recall_stderr": 0.0018268424519338505, "rougeL_fmeasure": 0.30013711995116676, "rougeL_fmeasure_stderr": 0.001799774395616833, "rougeL_precision": 0.37129359217610464, "rougeL_precision_stderr": 0.0027272181980460375, "rougeL_recall": 0.2780682094337028, "rougeL_recall_stderr": 0.0020775510109816452, "rougeLsum_fmeasure": 0.34206938848652774, "rougeLsum_fmeasure_stderr": 0.0020120604230570572, "rougeLsum_precision": 0.4223690533102043, "rougeLsum_precision_stderr": 0.003005111083307187, "rougeLsum_recall": 0.3169024351381635, "rougeLsum_recall_stderr": 0.0023241001320442878}}, "5": {"generate_text_restaurant": {"bleu": 10.336987597938899, "bleu_stderr": 0.20513507856533955, "rouge1_fmeasure": 0.4170315564856164, "rouge1_fmeasure_stderr": 0.002124259039598598, "rouge1_precision": 0.5143545157562858, "rouge1_precision_stderr": 0.003337934175013788, "rouge1_recall": 0.3857868235592206, "rouge1_recall_stderr": 0.0024923995035469678, "rouge2_fmeasure": 0.19402158147865167, "rouge2_fmeasure_stderr": 0.0017876994973534497, "rouge2_precision": 0.24631251817723201, "rouge2_precision_stderr": 0.002657555991369566, "rouge2_recall": 0.17787346956272435, "rouge2_recall_stderr": 0.001792217462473695, "rougeL_fmeasure": 0.3009842944910936, "rougeL_fmeasure_stderr": 0.0018055725861154817, "rougeL_precision": 0.37299125515052484, "rougeL_precision_stderr": 0.002815154394732632, "rougeL_recall": 0.27850203471081203, "rougeL_recall_stderr": 0.002038628599689064, "rougeLsum_fmeasure": 0.3425854571657328, "rougeLsum_fmeasure_stderr": 0.0020124639191903327, "rougeLsum_precision": 0.42379837225389067, "rougeLsum_precision_stderr": 0.0031034115053880863, "rougeLsum_recall": 0.31679500554979756, "rougeLsum_recall_stderr": 0.002259000761426462}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 0.711911214189282, "bleu_stderr": 0.062271095680873176, "rouge1_fmeasure": 0.11144536072130062, "rouge1_fmeasure_stderr": 0.002449613618907836, "rouge1_precision": 0.08057623396604993, "rouge1_precision_stderr": 0.0018427190672612415, "rouge1_recall": 0.1903611222785915, "rouge1_recall_stderr": 0.004119319495363413, "rouge2_fmeasure": 0.01730052045113504, "rouge2_fmeasure_stderr": 0.0010550283614850532, "rouge2_precision": 0.012439638488265747, "rouge2_precision_stderr": 0.0007674450472725088, "rouge2_recall": 0.029923546174765742, "rouge2_recall_stderr": 0.0018325311907880977, "rougeL_fmeasure": 0.09482469984719238, "rougeL_fmeasure_stderr": 0.001891115330305303, "rougeL_precision": 0.06836163772266587, "rougeL_precision_stderr": 0.001418824283099982, "rougeL_recall": 0.16290273051352785, "rougeL_recall_stderr": 0.0032456160924942976, "rougeLsum_fmeasure": 0.09329416183941738, "rougeLsum_fmeasure_stderr": 0.0019565438808381327, "rougeLsum_precision": 0.06720724706425169, "rougeLsum_precision_stderr": 0.001458820899362241, "rougeLsum_recall": 0.16056752379802697, "rougeLsum_recall_stderr": 0.003380030301216589}}, "1": {"article_DOC_summary": {"bleu": 0.7485653629026496, "bleu_stderr": 0.10161315249240252, "rouge1_fmeasure": 0.12151658666281231, "rouge1_fmeasure_stderr": 0.0025542911124592704, "rouge1_precision": 0.08658830544513134, "rouge1_precision_stderr": 0.0018612611645494558, "rouge1_recall": 0.21204874097915127, "rouge1_recall_stderr": 0.004437751832600217, "rouge2_fmeasure": 0.019132118327200527, "rouge2_fmeasure_stderr": 0.0010395559026960023, "rouge2_precision": 0.013510304634606875, "rouge2_precision_stderr": 0.0007312649740318255, "rouge2_recall": 0.03415295300257043, "rouge2_recall_stderr": 0.0019425669982816587, "rougeL_fmeasure": 0.09875044840716671, "rougeL_fmeasure_stderr": 0.0018572603815356456, "rougeL_precision": 0.0702987343925042, "rougeL_precision_stderr": 0.0013468612933160927, "rougeL_recall": 0.17295883718094748, "rougeL_recall_stderr": 0.0033149082764143117, "rougeLsum_fmeasure": 0.10073542075586238, "rougeLsum_fmeasure_stderr": 0.002008424522536892, "rougeLsum_precision": 0.07169769773353042, "rougeLsum_precision_stderr": 0.001454220219767976, "rougeLsum_recall": 0.17642520944692217, "rougeLsum_recall_stderr": 0.0035711244979865823}}, "2": {"article_DOC_summary": {"bleu": 1.0047358326681721, "bleu_stderr": 0.07696741647689843, "rouge1_fmeasure": 0.15372555310846955, "rouge1_fmeasure_stderr": 0.0024917036123975646, "rouge1_precision": 0.10919972863971208, "rouge1_precision_stderr": 0.0018326070695569962, "rouge1_recall": 0.27085077984418787, "rouge1_recall_stderr": 0.004340987074007443, "rouge2_fmeasure": 0.026545543337132424, "rouge2_fmeasure_stderr": 0.001202505861016018, "rouge2_precision": 0.018660104842681158, "rouge2_precision_stderr": 0.0008521301906071727, "rouge2_recall": 0.048121178106264206, "rouge2_recall_stderr": 0.002236763054096929, "rougeL_fmeasure": 0.12124448179130719, "rougeL_fmeasure_stderr": 0.0018406701231741705, "rougeL_precision": 0.08598679476507894, "rougeL_precision_stderr": 0.001347479595633704, "rougeL_recall": 0.21491972476150967, "rougeL_recall_stderr": 0.0033319697703370205, "rougeLsum_fmeasure": 0.12600811759241443, "rougeLsum_fmeasure_stderr": 0.002023272430841333, "rougeLsum_precision": 0.08936454412712036, "rougeLsum_precision_stderr": 0.00147707143314301, "rougeLsum_recall": 0.22313932236999878, "rougeLsum_recall_stderr": 0.003637762226705496}}, "3": {"article_DOC_summary": {"bleu": 1.3876528749760366, "bleu_stderr": 0.09352517366139018, "rouge1_fmeasure": 0.16788870363340208, "rouge1_fmeasure_stderr": 0.0026015869149755492, "rouge1_precision": 0.12156176985300436, "rouge1_precision_stderr": 0.0019916743006548566, "rouge1_recall": 0.2919486745230574, "rouge1_recall_stderr": 0.004594202930152972, "rouge2_fmeasure": 0.03291830334125208, "rouge2_fmeasure_stderr": 0.0013348595001679636, "rouge2_precision": 0.023390561110934703, "rouge2_precision_stderr": 0.0009482124699139898, "rouge2_recall": 0.05931283277905219, "rouge2_recall_stderr": 0.00253961685376344, "rougeL_fmeasure": 0.12998891345659275, "rougeL_fmeasure_stderr": 0.00194502365003499, "rougeL_precision": 0.09410425031338661, "rougeL_precision_stderr": 0.001505615984367887, "rougeL_recall": 0.22729829891507616, "rougeL_recall_stderr": 0.0035793671647219765, "rougeLsum_fmeasure": 0.13536430596241408, "rougeLsum_fmeasure_stderr": 0.0020963457328233175, "rougeLsum_precision": 0.09794367331761664, "rougeLsum_precision_stderr": 0.0016077634502078913, "rougeLsum_recall": 0.23679450093245694, "rougeLsum_recall_stderr": 0.003856459984346274}}, "4": {"article_DOC_summary": {"bleu": 0.8091606018729823, "bleu_stderr": 0.13365493953263705, "rouge1_fmeasure": 0.04817132624367883, "rouge1_fmeasure_stderr": 0.002710318861619294, "rouge1_precision": 0.04061391536633297, "rouge1_precision_stderr": 0.0025885117246031656, "rouge1_recall": 0.0774019664424454, "rouge1_recall_stderr": 0.004501264779451914, "rouge2_fmeasure": 0.010104068388385765, "rouge2_fmeasure_stderr": 0.0009810253225945517, "rouge2_precision": 0.007571424737600509, "rouge2_precision_stderr": 0.000774569805719568, "rouge2_recall": 0.017306355810365114, "rouge2_recall_stderr": 0.0017078169253319931, "rougeL_fmeasure": 0.03758941214391744, "rougeL_fmeasure_stderr": 0.002107269506833207, "rougeL_precision": 0.03226926320041072, "rougeL_precision_stderr": 0.0021853447839680425, "rougeL_recall": 0.0606544963878778, "rougeL_recall_stderr": 0.0035695265285015203, "rougeLsum_fmeasure": 0.03925168824333697, "rougeLsum_fmeasure_stderr": 0.0022220799806552142, "rougeLsum_precision": 0.03361123521194327, "rougeLsum_precision_stderr": 0.002255713167385482, "rougeLsum_recall": 0.063038545340123, "rougeLsum_recall_stderr": 0.003730201727070476}}, "5": {"article_DOC_summary": {"bleu": 2.9417748605436574e-39, "bleu_stderr": 1.644365126953672e-33, "rouge1_fmeasure": 0.002218415772902317, "rouge1_fmeasure_stderr": 0.0005977307451849004, "rouge1_precision": 0.0025292500918997793, "rouge1_precision_stderr": 0.0006843948420078455, "rouge1_recall": 0.0020532297175197265, "rouge1_recall_stderr": 0.0005556296741534733, "rouge2_fmeasure": 0.00041371259854665804, "rouge2_fmeasure_stderr": 0.0002415623180229552, "rouge2_precision": 0.0004376650603065697, "rouge2_precision_stderr": 0.00024169751059179606, "rouge2_recall": 0.00040004436924525715, "rouge2_recall_stderr": 0.0002447488200268485, "rougeL_fmeasure": 0.0018828340816919485, "rougeL_fmeasure_stderr": 0.0005121954193145257, "rougeL_precision": 0.0021231909904583543, "rougeL_precision_stderr": 0.0005742362353931501, "rougeL_recall": 0.0017609197535564964, "rougeL_recall_stderr": 0.0004851127145778472, "rougeLsum_fmeasure": 0.0019227239855572795, "rougeLsum_fmeasure_stderr": 0.0005197852399034371, "rougeLsum_precision": 0.002170837264519722, "rougeLsum_precision_stderr": 0.0005838940116621144, "rougeLsum_recall": 0.0017952250708806817, "rougeLsum_recall_stderr": 0.0004910386484249093}}}}
4b284b12bc4/evaluation/rankeval/4b284b12bc4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.335,0.014933117490932575,0
3
+ anli_r2,acc,0.334,0.014922019523732961,0
4
+ anli_r3,acc,0.3491666666666667,0.013767075395077249,0
5
+ arc_challenge,acc,0.2636518771331058,0.012875929151297049,0
6
+ arc_challenge,acc_norm,0.2883959044368601,0.013238394422428175,0
7
+ arc_easy,acc,0.5538720538720538,0.01020005782876501,0
8
+ arc_easy,acc_norm,0.4936868686868687,0.01025896566804443,0
9
+ boolq,acc,0.5464831804281346,0.008707182331111644,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.23306878306878312,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.4695279824736108,0.0049805063294075845,0
14
+ hellaswag,acc_norm,0.6132244572794264,0.004860162076330956,0
15
+ piqa,acc,0.73449401523395,0.010303308653024429,0
16
+ piqa,acc_norm,0.7475516866158868,0.010135665547362354,0
17
+ rte,acc,0.5812274368231047,0.02969666108123484,0
18
+ sciq,acc,0.82,0.012155153135511965,0
19
+ sciq,acc_norm,0.749,0.013718133516888921,0
20
+ storycloze_2016,acc,0.711918760021379,0.010472537019822578,0
21
+ winogrande,acc,0.5753749013417522,0.013891893150264218,0
4b284b12bc4/evaluation/rankeval/4b284b12bc4_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.333,0.014910846164229868,0
3
+ anli_r2,acc,0.326,0.01483050720454104,0
4
+ anli_r3,acc,0.3475,0.013751753243291852,0
5
+ arc_challenge,acc,0.2627986348122867,0.012862523175351333,0
6
+ arc_challenge,acc_norm,0.30716723549488056,0.013481034054980943,0
7
+ arc_easy,acc,0.5913299663299664,0.010087174498762883,0
8
+ arc_easy,acc_norm,0.5496632996632996,0.010209047724374145,0
9
+ boolq,acc,0.5669724770642202,0.00866625130551806,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.37227304714989445,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.47191794463254333,0.004981905293878145,0
14
+ hellaswag,acc_norm,0.6139215295757817,0.004858539527872466,0
15
+ piqa,acc,0.7448313384113167,0.010171571592521822,0
16
+ piqa,acc_norm,0.7535364526659413,0.01005481078967181,0
17
+ rte,acc,0.5703971119133574,0.029796668829124674,0
18
+ sciq,acc,0.836,0.011715000693181331,0
19
+ sciq,acc_norm,0.781,0.013084731950262012,0
20
+ storycloze_2016,acc,0.7151256012827365,0.01043751398661172,0
21
+ winogrande,acc,0.5706393054459353,0.013911537499969163,0
4b284b12bc4/evaluation/rankeval/4b284b12bc4_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.329,0.014865395385928354,0
3
+ anli_r2,acc,0.336,0.014944140233795027,0
4
+ anli_r3,acc,0.3383333333333333,0.013664144006618266,0
5
+ arc_challenge,acc,0.2781569965870307,0.013094469919538805,0
6
+ arc_challenge,acc_norm,0.30887372013651876,0.013501770929344003,0
7
+ arc_easy,acc,0.5993265993265994,0.01005530447425557,0
8
+ arc_easy,acc_norm,0.5576599326599326,0.01019133444422085,0
9
+ boolq,acc,0.5660550458715596,0.008668405003744129,1
10
+ cb,acc,0.48214285714285715,0.06737697508644648,1
11
+ cb,f1,0.3338011695906433,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4697271459868552,0.004980627287147585,0
14
+ hellaswag,acc_norm,0.6141206930890261,0.004858074013443988,0
15
+ piqa,acc,0.7470076169749728,0.01014288869886246,0
16
+ piqa,acc_norm,0.7519042437431991,0.010077118315574706,0
17
+ rte,acc,0.5523465703971119,0.02993107036293953,0
18
+ sciq,acc,0.835,0.011743632866916145,0
19
+ sciq,acc_norm,0.79,0.01288666233227453,0
20
+ storycloze_2016,acc,0.7156600748262961,0.010431614128665253,0
21
+ winogrande,acc,0.574585635359116,0.013895257666646378,0
4b284b12bc4/evaluation/rankeval/4b284b12bc4_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811485,0
3
+ anli_r2,acc,0.334,0.014922019523732963,0
4
+ anli_r3,acc,0.35,0.013774667009018554,0
5
+ arc_challenge,acc,0.2815699658703072,0.013143376735009022,0
6
+ arc_challenge,acc_norm,0.3122866894197952,0.013542598541688067,0
7
+ arc_easy,acc,0.5955387205387206,0.010070746648278783,0
8
+ arc_easy,acc_norm,0.5740740740740741,0.010146568651002255,0
9
+ boolq,acc,0.5587155963302752,0.008684548127832637,1
10
+ cb,acc,0.6071428571428571,0.0658538889806635,1
11
+ cb,f1,0.42400932400932395,,1
12
+ copa,acc,0.81,0.03942772444036622,0
13
+ hellaswag,acc,0.47241585341565423,0.004982182323923561,0
14
+ hellaswag,acc_norm,0.6199960167297351,0.004843954338451449,0
15
+ piqa,acc,0.7513601741022851,0.01008451123429685,0
16
+ piqa,acc_norm,0.7578890097932536,0.009994371269104397,0
17
+ rte,acc,0.5379061371841155,0.030009848912529113,0
18
+ sciq,acc,0.841,0.01156947936827129,0
19
+ sciq,acc_norm,0.796,0.012749374359024384,0
20
+ storycloze_2016,acc,0.7124532335649385,0.010466744473098363,0
21
+ winogrande,acc,0.5737963693764798,0.013898585965412338,0
4b284b12bc4/evaluation/rankeval/4b284b12bc4_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.337,0.014955087918653603,0
3
+ anli_r2,acc,0.349,0.015080663991563102,0
4
+ anli_r3,acc,0.36666666666666664,0.013916893275819938,0
5
+ arc_challenge,acc,0.2790102389078498,0.013106784883601346,0
6
+ arc_challenge,acc_norm,0.3165529010238908,0.013592431519068077,0
7
+ arc_easy,acc,0.6039562289562289,0.010035580962097942,0
8
+ arc_easy,acc_norm,0.5702861952861953,0.010157908005763674,0
9
+ boolq,acc,0.5636085626911315,0.008674000467432068,1
10
+ cb,acc,0.44642857142857145,0.067031892279424,1
11
+ cb,f1,0.3176100628930817,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4722166899024099,0.004982072108448081,0
14
+ hellaswag,acc_norm,0.6184027086237801,0.004847857546957481,0
15
+ piqa,acc,0.7431991294885746,0.010192864802278045,0
16
+ piqa,acc_norm,0.7568008705114254,0.010009611953858915,0
17
+ rte,acc,0.5379061371841155,0.03000984891252911,0
18
+ sciq,acc,0.842,0.011539894677559568,0
19
+ sciq,acc_norm,0.789,0.012909130321042092,0
20
+ storycloze_2016,acc,0.7194013896312133,0.010389809647288821,0
21
+ winogrande,acc,0.56353591160221,0.013938569465677023,0
4b284b12bc4/evaluation/rankeval/4b284b12bc4_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811487,0
3
+ anli_r2,acc,0.329,0.014865395385928357,0
4
+ anli_r3,acc,0.3541666666666667,0.013811933499570954,0
5
+ arc_challenge,acc,0.27559726962457337,0.01305716965576184,0
6
+ arc_challenge,acc_norm,0.31569965870307165,0.013582571095815291,0
7
+ arc_easy,acc,0.5997474747474747,0.010053550119896127,0
8
+ arc_easy,acc_norm,0.569023569023569,0.010161552863493746,0
9
+ boolq,acc,0.5648318042813456,0.008671229580582118,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.38376730002345766,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.47400916152160927,0.004983035420235716,0
14
+ hellaswag,acc_norm,0.619896434973113,0.004844199910173026,0
15
+ piqa,acc,0.7399347116430903,0.0102348932490613,0
16
+ piqa,acc_norm,0.7595212187159956,0.009971345364651064,0
17
+ rte,acc,0.516245487364621,0.030080573208738064,0
18
+ sciq,acc,0.844,0.01148023500612236,0
19
+ sciq,acc_norm,0.794,0.012795613612786551,0
20
+ storycloze_2016,acc,0.7177979690005345,0.010407834479647675,0
21
+ winogrande,acc,0.5722178374112076,0.013905134013839944,0
4b284b17bc4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.004707141554710639
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.004707141554710639
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.1486663277769484
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.1486663277769484
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.1507673483604289
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.1507673483604289
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.158839720125521
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.158839720125521
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.16013598883167798
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.16013598883167798
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.1648812739511937
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.1648812739511937
14
+ e2e_nlg_cleaned,5,average,multiple,0.13133296676674677
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.050742967235947956
16
+ gem_xsum,0,median,rouge2_fmeasure,0.050742967235947956
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.03956915695649403
18
+ gem_xsum,1,median,rouge2_fmeasure,0.03956915695649403
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.040730679478674064
20
+ gem_xsum,2,median,rouge2_fmeasure,0.040730679478674064
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.03767895922224648
22
+ gem_xsum,3,median,rouge2_fmeasure,0.03767895922224648
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01035393012550112
24
+ gem_xsum,4,median,rouge2_fmeasure,0.01035393012550112
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003555930988203656
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0003555930988203656
27
+ gem_xsum,5,average,multiple,0.029905214352947337
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05308201459552208
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05308201459552208
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.054620931903283015
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.054620931903283015
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.04972618028817665
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.04972618028817665
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.038282775688304856
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.038282775688304856
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.035274756528572794
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.035274756528572794
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.04337652772461485
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.04337652772461485
40
+ web_nlg_en,5,average,multiple,0.045727197788079044
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.033925070200158246
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.033925070200158246
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.053937307211284244
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.053937307211284244
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05625290668830642
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05625290668830642
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04688317854067561
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04688317854067561
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013810868807903593
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013810868807903593
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024939038248004536
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0024939038248004536
53
+ wiki_lingua_en,5,average,multiple,0.03455053921218809
4b284b17bc4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4040857346605273, "bleu_stderr": 0.04358756352339084, "rouge1_fmeasure": 0.11153940555452811, "rouge1_fmeasure_stderr": 0.002178375350508395, "rouge1_precision": 0.0759904796250538, "rouge1_precision_stderr": 0.0019478615830651011, "rouge1_recall": 0.3009878218567671, "rouge1_recall_stderr": 0.0046586299284223885, "rouge2_fmeasure": 0.05308201459552208, "rouge2_fmeasure_stderr": 0.0013729761880117914, "rouge2_precision": 0.03649587266689502, "rouge2_precision_stderr": 0.001284173883599284, "rouge2_recall": 0.14683508450255534, "rouge2_recall_stderr": 0.0032437032345681857, "rougeL_fmeasure": 0.10594741371659425, "rougeL_fmeasure_stderr": 0.0019835178597520093, "rougeL_precision": 0.07180766426825755, "rougeL_precision_stderr": 0.001761388989113738, "rougeL_recall": 0.28987291523705844, "rougeL_recall_stderr": 0.0045326872175802165, "rougeLsum_fmeasure": 0.10637513260264994, "rougeLsum_fmeasure_stderr": 0.00204505600505615, "rougeLsum_precision": 0.07245769602542276, "rougeLsum_precision_stderr": 0.00184585307065297, "rougeLsum_recall": 0.2877191217238231, "rougeLsum_recall_stderr": 0.004439207690226351}}, "1": {"PALM_prompt": {"bleu": 0.5179012826475189, "bleu_stderr": 0.03546328546887922, "rouge1_fmeasure": 0.11715894355967386, "rouge1_fmeasure_stderr": 0.0019757967913343107, "rouge1_precision": 0.07590692259956473, "rouge1_precision_stderr": 0.0015260502670476222, "rouge1_recall": 0.3587176031003754, "rouge1_recall_stderr": 0.005304902318303979, "rouge2_fmeasure": 0.054620931903283015, "rouge2_fmeasure_stderr": 0.001228796960295478, "rouge2_precision": 0.03541306461486918, "rouge2_precision_stderr": 0.0009438407351314926, "rouge2_recall": 0.17490815925289047, "rouge2_recall_stderr": 0.003575953294927914, "rougeL_fmeasure": 0.10948455596662156, "rougeL_fmeasure_stderr": 0.0017556314356608658, "rougeL_precision": 0.07063353254188104, "rougeL_precision_stderr": 0.0013315866040551792, "rougeL_recall": 0.33926359580036936, "rougeL_recall_stderr": 0.004980489876121937, "rougeLsum_fmeasure": 0.11027827926137282, "rougeLsum_fmeasure_stderr": 0.0018206662711825689, "rougeLsum_precision": 0.07145722539365447, "rougeLsum_precision_stderr": 0.0014201666549177136, "rougeLsum_recall": 0.338101035904291, "rougeLsum_recall_stderr": 0.004861723234525834}}, "2": {"PALM_prompt": {"bleu": 0.5353533242406296, "bleu_stderr": 0.03431413900192352, "rouge1_fmeasure": 0.11225032910377221, "rouge1_fmeasure_stderr": 0.001729127211467464, "rouge1_precision": 0.07083033588676813, "rouge1_precision_stderr": 0.0012393618247087826, "rouge1_recall": 0.3713979630102006, "rouge1_recall_stderr": 0.0052635629154557445, "rouge2_fmeasure": 0.04972618028817665, "rouge2_fmeasure_stderr": 0.00102253346010308, "rouge2_precision": 0.031078186918670876, "rouge2_precision_stderr": 0.0007042214847925669, "rouge2_recall": 0.17869319720275487, "rouge2_recall_stderr": 0.0037152494180892654, "rougeL_fmeasure": 0.10286097604129013, "rougeL_fmeasure_stderr": 0.001521136220014524, "rougeL_precision": 0.06478794864331923, "rougeL_precision_stderr": 0.0010868099908157105, "rougeL_recall": 0.342386434900962, "rougeL_recall_stderr": 0.004791381155468985, "rougeLsum_fmeasure": 0.10565101438548488, "rougeLsum_fmeasure_stderr": 0.0016072411783332626, "rougeLsum_precision": 0.0666711337015857, "rougeLsum_precision_stderr": 0.0011558366976764654, "rougeLsum_recall": 0.3495506191555596, "rougeLsum_recall_stderr": 0.004867024005105769}}, "3": {"PALM_prompt": {"bleu": 0.4473435878442557, "bleu_stderr": 0.021920475877328035, "rouge1_fmeasure": 0.09638137782837077, "rouge1_fmeasure_stderr": 0.0014829064557002423, "rouge1_precision": 0.060184346315459554, "rouge1_precision_stderr": 0.001087884777571127, "rouge1_recall": 0.3414484174381052, "rouge1_recall_stderr": 0.005172668914728109, "rouge2_fmeasure": 0.038282775688304856, "rouge2_fmeasure_stderr": 0.0008484797038289617, "rouge2_precision": 0.02365049876833611, "rouge2_precision_stderr": 0.0005803565276567791, "rouge2_recall": 0.14942490983015533, "rouge2_recall_stderr": 0.003513250894148731, "rougeL_fmeasure": 0.08646072645548598, "rougeL_fmeasure_stderr": 0.0013059843771721818, "rougeL_precision": 0.05398887767175152, "rougeL_precision_stderr": 0.0009593908114878796, "rougeL_recall": 0.306677112968288, "rougeL_recall_stderr": 0.004574894720404393, "rougeLsum_fmeasure": 0.09025064736984334, "rougeLsum_fmeasure_stderr": 0.0013804266464441872, "rougeLsum_precision": 0.05643326067698096, "rougeLsum_precision_stderr": 0.0010263660613837862, "rougeLsum_recall": 0.31950364177604385, "rougeLsum_recall_stderr": 0.0047753784913865055}}, "4": {"PALM_prompt": {"bleu": 0.42391682641977435, "bleu_stderr": 0.025185202302157747, "rouge1_fmeasure": 0.09021138584698546, "rouge1_fmeasure_stderr": 0.0014520692052325743, "rouge1_precision": 0.056211864304467056, "rouge1_precision_stderr": 0.0010231526331645241, "rouge1_recall": 0.3243551913604374, "rouge1_recall_stderr": 0.005025965693402745, "rouge2_fmeasure": 0.035274756528572794, "rouge2_fmeasure_stderr": 0.0008164665662699456, "rouge2_precision": 0.021734837271928865, "rouge2_precision_stderr": 0.000537789346158442, "rouge2_recall": 0.1381516300501002, "rouge2_recall_stderr": 0.003431494708848335, "rougeL_fmeasure": 0.08124379060939273, "rougeL_fmeasure_stderr": 0.00126253468227921, "rougeL_precision": 0.050592287065444816, "rougeL_precision_stderr": 0.0008807998394512943, "rougeL_recall": 0.29101119493777294, "rougeL_recall_stderr": 0.004368680842712586, "rougeLsum_fmeasure": 0.08470874404935472, "rougeLsum_fmeasure_stderr": 0.001361092950551602, "rougeLsum_precision": 0.052801000471684074, "rougeLsum_precision_stderr": 0.0009509504143137485, "rougeLsum_recall": 0.30284581980574177, "rougeLsum_recall_stderr": 0.004598745476101752}}, "5": {"PALM_prompt": {"bleu": 0.4676760272424504, "bleu_stderr": 0.02303026154350977, "rouge1_fmeasure": 0.10005721103740961, "rouge1_fmeasure_stderr": 0.0016082944292364137, "rouge1_precision": 0.06321141539943463, "rouge1_precision_stderr": 0.001223200335826357, "rouge1_recall": 0.34518893665726086, "rouge1_recall_stderr": 0.005046837181813745, "rouge2_fmeasure": 0.04337652772461485, "rouge2_fmeasure_stderr": 0.0009940986270453964, "rouge2_precision": 0.027408388290100973, "rouge2_precision_stderr": 0.0007495147001664794, "rouge2_recall": 0.15813941094305575, "rouge2_recall_stderr": 0.0034339025470937284, "rougeL_fmeasure": 0.09146925689558233, "rougeL_fmeasure_stderr": 0.0014605690131820356, "rougeL_precision": 0.057943890660243344, "rougeL_precision_stderr": 0.0011331942696761865, "rougeL_recall": 0.31252933987652376, "rougeL_recall_stderr": 0.004418336624345426, "rougeLsum_fmeasure": 0.09443075155430865, "rougeLsum_fmeasure_stderr": 0.001535767562485951, "rougeLsum_precision": 0.059802685455499036, "rougeLsum_precision_stderr": 0.0011813042691715043, "rougeLsum_recall": 0.3231337567814217, "rougeLsum_recall_stderr": 0.004620411779037274}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.484894171357125, "bleu_stderr": 0.04279862989387049, "rouge1_fmeasure": 0.1751592790344999, "rouge1_fmeasure_stderr": 0.0017874794849447823, "rouge1_precision": 0.1498248405216555, "rouge1_precision_stderr": 0.0018423908624139891, "rouge1_recall": 0.25526647462429297, "rouge1_recall_stderr": 0.002588155452951415, "rouge2_fmeasure": 0.033925070200158246, "rouge2_fmeasure_stderr": 0.0008240877531204439, "rouge2_precision": 0.02866254588676867, "rouge2_precision_stderr": 0.0007271444111163778, "rouge2_recall": 0.05162759513352221, "rouge2_recall_stderr": 0.0014037908842865225, "rougeL_fmeasure": 0.13711284339871613, "rougeL_fmeasure_stderr": 0.001264815697203119, "rougeL_precision": 0.11555678082540916, "rougeL_precision_stderr": 0.0012586001477032337, "rougeL_recall": 0.20559491764149926, "rougeL_recall_stderr": 0.002138172468666332, "rougeLsum_fmeasure": 0.16124043802649227, "rougeLsum_fmeasure_stderr": 0.0016384576957041044, "rougeLsum_precision": 0.13771072102711007, "rougeLsum_precision_stderr": 0.0016874302363856556, "rougeLsum_recall": 0.23587046561796698, "rougeLsum_recall_stderr": 0.0024016834853047104}}, "1": {"tldr_en": {"bleu": 2.6577106650236018, "bleu_stderr": 0.07159113573701131, "rouge1_fmeasure": 0.2199694812178742, "rouge1_fmeasure_stderr": 0.002010819735875878, "rouge1_precision": 0.19820460200268925, "rouge1_precision_stderr": 0.002420333361877177, "rouge1_recall": 0.3126809375051542, "rouge1_recall_stderr": 0.002841672003398339, "rouge2_fmeasure": 0.053937307211284244, "rouge2_fmeasure_stderr": 0.001075193073237261, "rouge2_precision": 0.05016728772414271, "rouge2_precision_stderr": 0.001264223371544086, "rouge2_recall": 0.07815311881303431, "rouge2_recall_stderr": 0.0017034183791022172, "rougeL_fmeasure": 0.15551270440786846, "rougeL_fmeasure_stderr": 0.0013566776773220371, "rougeL_precision": 0.13997435500321276, "rougeL_precision_stderr": 0.0017712606376717578, "rougeL_recall": 0.22624387436643148, "rougeL_recall_stderr": 0.0022018166295221108, "rougeLsum_fmeasure": 0.20618836045677164, "rougeLsum_fmeasure_stderr": 0.0018846829640657573, "rougeLsum_precision": 0.18565508806059028, "rougeLsum_precision_stderr": 0.002281622879542436, "rougeLsum_recall": 0.29360401456830726, "rougeLsum_recall_stderr": 0.0026791632190064567}}, "2": {"tldr_en": {"bleu": 2.8678009037418817, "bleu_stderr": 0.037508767497823454, "rouge1_fmeasure": 0.22435918885551207, "rouge1_fmeasure_stderr": 0.0018967794998520616, "rouge1_precision": 0.2156784752655553, "rouge1_precision_stderr": 0.002732677109431224, "rouge1_recall": 0.31237685950514665, "rouge1_recall_stderr": 0.0027963244796416183, "rouge2_fmeasure": 0.05625290668830642, "rouge2_fmeasure_stderr": 0.0010633219356063186, "rouge2_precision": 0.05738474692529928, "rouge2_precision_stderr": 0.0015267876059967937, "rouge2_recall": 0.07970531997370003, "rouge2_recall_stderr": 0.0016700905255101184, "rougeL_fmeasure": 0.15917844337997755, "rougeL_fmeasure_stderr": 0.0013165721234956872, "rougeL_precision": 0.15451467159603913, "rougeL_precision_stderr": 0.002139183021303883, "rougeL_recall": 0.225376744790094, "rougeL_recall_stderr": 0.0021771714204925977, "rougeLsum_fmeasure": 0.2111183297537402, "rougeLsum_fmeasure_stderr": 0.0017924385498513864, "rougeLsum_precision": 0.20314308377906012, "rougeLsum_precision_stderr": 0.002612094861988029, "rougeLsum_recall": 0.29423268640214134, "rougeLsum_recall_stderr": 0.002642162937712342}}, "3": {"tldr_en": {"bleu": 2.9935371246792863, "bleu_stderr": 0.08389052572374653, "rouge1_fmeasure": 0.18429548803283216, "rouge1_fmeasure_stderr": 0.002297153333126352, "rouge1_precision": 0.18405392984121038, "rouge1_precision_stderr": 0.0029940021228191355, "rouge1_recall": 0.25514944671406053, "rouge1_recall_stderr": 0.0033459375347498267, "rouge2_fmeasure": 0.04688317854067561, "rouge2_fmeasure_stderr": 0.001094346737838606, "rouge2_precision": 0.04819840246533481, "rouge2_precision_stderr": 0.0015092856540081493, "rouge2_recall": 0.06631560044043883, "rouge2_recall_stderr": 0.0016628335172097389, "rougeL_fmeasure": 0.13173075629033756, "rougeL_fmeasure_stderr": 0.0016475788902520988, "rougeL_precision": 0.13375051095662321, "rougeL_precision_stderr": 0.002380892659084607, "rougeL_recall": 0.1853146721328658, "rougeL_recall_stderr": 0.0025854840001956502, "rougeLsum_fmeasure": 0.1733778342320142, "rougeLsum_fmeasure_stderr": 0.0021681294491408274, "rougeLsum_precision": 0.1733338698061136, "rougeLsum_precision_stderr": 0.0028479993519003807, "rougeLsum_recall": 0.2401891556702282, "rougeLsum_recall_stderr": 0.003169734302090452}}, "4": {"tldr_en": {"bleu": 0.5461443377554994, "bleu_stderr": 0.044799916123802616, "rouge1_fmeasure": 0.05659885761358278, "rouge1_fmeasure_stderr": 0.001930223829324733, "rouge1_precision": 0.056788240211910736, "rouge1_precision_stderr": 0.0022506997974891542, "rouge1_recall": 0.08339625174295712, "rouge1_recall_stderr": 0.0029290040087097207, "rouge2_fmeasure": 0.013810868807903593, "rouge2_fmeasure_stderr": 0.0006930003786904605, "rouge2_precision": 0.014380667558554284, "rouge2_precision_stderr": 0.000977357379524512, "rouge2_recall": 0.02101316087357165, "rouge2_recall_stderr": 0.0011619283931715888, "rougeL_fmeasure": 0.042007409754473535, "rougeL_fmeasure_stderr": 0.0014258890197756744, "rougeL_precision": 0.04257610439946902, "rougeL_precision_stderr": 0.0017464834049321358, "rougeL_recall": 0.06302882516092573, "rougeL_recall_stderr": 0.0022681193452949904, "rougeLsum_fmeasure": 0.05299986098950862, "rougeLsum_fmeasure_stderr": 0.0018068900698462816, "rougeLsum_precision": 0.05318358596123625, "rougeLsum_precision_stderr": 0.0021166691321526723, "rougeLsum_recall": 0.07838773551352705, "rougeLsum_recall_stderr": 0.002768443855799422}}, "5": {"tldr_en": {"bleu": 1.0112557087204399e-06, "bleu_stderr": 2.4150938941127893e-06, "rouge1_fmeasure": 0.009231042835921754, "rouge1_fmeasure_stderr": 0.0008787834704319035, "rouge1_precision": 0.008911405879685974, "rouge1_precision_stderr": 0.0009422475593785767, "rouge1_recall": 0.014063917251247313, "rouge1_recall_stderr": 0.001396048114694196, "rouge2_fmeasure": 0.0024939038248004536, "rouge2_fmeasure_stderr": 0.00031409966961336354, "rouge2_precision": 0.0023383395560896076, "rouge2_precision_stderr": 0.0003211681514690954, "rouge2_recall": 0.004206929070053043, "rouge2_recall_stderr": 0.0006053899081506379, "rougeL_fmeasure": 0.007056657402327181, "rougeL_fmeasure_stderr": 0.0006761968053255695, "rougeL_precision": 0.006838517996278174, "rougeL_precision_stderr": 0.0007309899843193323, "rougeL_recall": 0.010882093390231358, "rougeL_recall_stderr": 0.0011052580830821516, "rougeLsum_fmeasure": 0.008623638497104687, "rougeLsum_fmeasure_stderr": 0.0008181729546128778, "rougeLsum_precision": 0.008357520892672053, "rougeLsum_precision_stderr": 0.0008835066902880334, "rougeLsum_recall": 0.013172143042073951, "rougeLsum_recall_stderr": 0.0013147147386398047}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.3138256134465956, "bleu_stderr": 0.042478650177872466, "rouge1_fmeasure": 0.08626819478230045, "rouge1_fmeasure_stderr": 0.0014217358109946006, "rouge1_precision": 0.09054875368611116, "rouge1_precision_stderr": 0.001766981005220654, "rouge1_recall": 0.09899686345340121, "rouge1_recall_stderr": 0.0016339321555193618, "rouge2_fmeasure": 0.004707141554710639, "rouge2_fmeasure_stderr": 0.00037478881160494376, "rouge2_precision": 0.004635097118571576, "rouge2_precision_stderr": 0.0004584060579508488, "rouge2_recall": 0.006688039413188141, "rouge2_recall_stderr": 0.0004649040639428354, "rougeL_fmeasure": 0.08482739955206746, "rougeL_fmeasure_stderr": 0.001374786595816378, "rougeL_precision": 0.08868499326830791, "rougeL_precision_stderr": 0.0016895151305050993, "rougeL_recall": 0.09761537714635501, "rougeL_recall_stderr": 0.0015968584648120006, "rougeLsum_fmeasure": 0.06674662648443055, "rougeLsum_fmeasure_stderr": 0.0011005680176848465, "rougeLsum_precision": 0.0697631127415905, "rougeLsum_precision_stderr": 0.0014027633376290944, "rougeLsum_recall": 0.07822809105458635, "rougeLsum_recall_stderr": 0.001337355132253774}}, "1": {"generate_text_restaurant": {"bleu": 6.380638446426456, "bleu_stderr": 0.09557609009378211, "rouge1_fmeasure": 0.3400267632274026, "rouge1_fmeasure_stderr": 0.0020403696620665606, "rouge1_precision": 0.3328975043204133, "rouge1_precision_stderr": 0.003509121285453912, "rouge1_recall": 0.45270988550034613, "rouge1_recall_stderr": 0.003062231284181297, "rouge2_fmeasure": 0.1486663277769484, "rouge2_fmeasure_stderr": 0.0015136508138340347, "rouge2_precision": 0.1499329665878703, "rouge2_precision_stderr": 0.002263691832530101, "rouge2_recall": 0.1983614972096706, "rouge2_recall_stderr": 0.0020825028564397568, "rougeL_fmeasure": 0.2676344730573564, "rougeL_fmeasure_stderr": 0.001562435233832024, "rougeL_precision": 0.2592088269282877, "rougeL_precision_stderr": 0.002698465205645672, "rougeL_recall": 0.36228982914822633, "rougeL_recall_stderr": 0.0026355915533943, "rougeLsum_fmeasure": 0.2753891028069307, "rougeLsum_fmeasure_stderr": 0.0019025285983907253, "rougeLsum_precision": 0.27037933017473875, "rougeLsum_precision_stderr": 0.003062674460499446, "rougeLsum_recall": 0.3663561677071641, "rougeLsum_recall_stderr": 0.0027976045204321166}}, "2": {"generate_text_restaurant": {"bleu": 6.3634920937473884, "bleu_stderr": 0.08616925114576995, "rouge1_fmeasure": 0.33740317942041553, "rouge1_fmeasure_stderr": 0.0019381935089587043, "rouge1_precision": 0.2954297857860177, "rouge1_precision_stderr": 0.0029411840632251563, "rouge1_recall": 0.48120558575759376, "rouge1_recall_stderr": 0.002844478586577598, "rouge2_fmeasure": 0.1507673483604289, "rouge2_fmeasure_stderr": 0.0014399337503822303, "rouge2_precision": 0.13374093712808155, "rouge2_precision_stderr": 0.0018836984171556573, "rouge2_recall": 0.2183048706264126, "rouge2_recall_stderr": 0.002121311703914965, "rougeL_fmeasure": 0.26968155815162015, "rougeL_fmeasure_stderr": 0.0014592699636009493, "rougeL_precision": 0.23342217539614568, "rougeL_precision_stderr": 0.002214559299952568, "rougeL_recall": 0.3907575640409568, "rougeL_recall_stderr": 0.002505134070265779, "rougeLsum_fmeasure": 0.272599814561145, "rougeLsum_fmeasure_stderr": 0.00182733683050086, "rougeLsum_precision": 0.23886641857979662, "rougeLsum_precision_stderr": 0.0025627121020893886, "rougeLsum_recall": 0.3892155963722792, "rougeLsum_recall_stderr": 0.002698506480779706}}, "3": {"generate_text_restaurant": {"bleu": 6.870066226903575, "bleu_stderr": 0.09429297050700895, "rouge1_fmeasure": 0.3444162792721549, "rouge1_fmeasure_stderr": 0.0019646659016549933, "rouge1_precision": 0.2995136417218204, "rouge1_precision_stderr": 0.002877943933866991, "rouge1_recall": 0.4870699752407154, "rouge1_recall_stderr": 0.0028159974008536274, "rouge2_fmeasure": 0.158839720125521, "rouge2_fmeasure_stderr": 0.0014985646156960436, "rouge2_precision": 0.13989458331193982, "rouge2_precision_stderr": 0.0019259857294848428, "rouge2_recall": 0.2283186786335389, "rouge2_recall_stderr": 0.002178927464111703, "rougeL_fmeasure": 0.2794048970004934, "rougeL_fmeasure_stderr": 0.0015380955393133962, "rougeL_precision": 0.24052403808712888, "rougeL_precision_stderr": 0.0022515901969111986, "rougeL_recall": 0.4008413679522952, "rougeL_recall_stderr": 0.0025096595704963047, "rougeLsum_fmeasure": 0.2812645652371936, "rougeLsum_fmeasure_stderr": 0.00189815704910076, "rougeLsum_precision": 0.24501488471812358, "rougeLsum_precision_stderr": 0.002592614810126833, "rougeLsum_recall": 0.39799000178268285, "rougeLsum_recall_stderr": 0.002715969584528833}}, "4": {"generate_text_restaurant": {"bleu": 7.017381678234455, "bleu_stderr": 0.09166907850331675, "rouge1_fmeasure": 0.3465967840398284, "rouge1_fmeasure_stderr": 0.0019574393578079014, "rouge1_precision": 0.30143650107181685, "rouge1_precision_stderr": 0.0029314795240309452, "rouge1_recall": 0.4898031820273571, "rouge1_recall_stderr": 0.0027687333603014837, "rouge2_fmeasure": 0.16013598883167798, "rouge2_fmeasure_stderr": 0.0015096765844560063, "rouge2_precision": 0.14062404051839908, "rouge2_precision_stderr": 0.0019112056232477572, "rouge2_recall": 0.2300710505484132, "rouge2_recall_stderr": 0.0021943432548559823, "rougeL_fmeasure": 0.2820802687702902, "rougeL_fmeasure_stderr": 0.0015095048991938657, "rougeL_precision": 0.2422982418880056, "rougeL_precision_stderr": 0.0022053611865584053, "rougeL_recall": 0.4046980533581673, "rougeL_recall_stderr": 0.0024846126430562907, "rougeLsum_fmeasure": 0.28522597669249417, "rougeLsum_fmeasure_stderr": 0.0018960405914839836, "rougeLsum_precision": 0.2479193537232055, "rougeLsum_precision_stderr": 0.0025849971540480303, "rougeLsum_recall": 0.40376209522673123, "rougeLsum_recall_stderr": 0.002721581361466135}}, "5": {"generate_text_restaurant": {"bleu": 7.260211722864896, "bleu_stderr": 0.12365349150703955, "rouge1_fmeasure": 0.3539789382700311, "rouge1_fmeasure_stderr": 0.002014807799247425, "rouge1_precision": 0.3111425574072635, "rouge1_precision_stderr": 0.002912308644972262, "rouge1_recall": 0.486978690036314, "rouge1_recall_stderr": 0.002680144522191287, "rouge2_fmeasure": 0.1648812739511937, "rouge2_fmeasure_stderr": 0.0015410229627803459, "rouge2_precision": 0.14647797107879432, "rouge2_precision_stderr": 0.001938088224804308, "rouge2_recall": 0.2296020100709576, "rouge2_recall_stderr": 0.0021177366913279795, "rougeL_fmeasure": 0.2855363745230388, "rougeL_fmeasure_stderr": 0.0015231421878303527, "rougeL_precision": 0.24849239666244133, "rougeL_precision_stderr": 0.002231342628042193, "rougeL_recall": 0.39930639747976754, "rougeL_recall_stderr": 0.002405155865675547, "rougeLsum_fmeasure": 0.29494158509832297, "rougeLsum_fmeasure_stderr": 0.0019724923487253014, "rougeLsum_precision": 0.25960917213172013, "rougeLsum_precision_stderr": 0.0026365144213955465, "rougeLsum_recall": 0.4057201628355578, "rougeLsum_recall_stderr": 0.002662489705069069}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.0719375915266327, "bleu_stderr": 0.09381739892136316, "rouge1_fmeasure": 0.21991779087563262, "rouge1_fmeasure_stderr": 0.0025921031610263, "rouge1_precision": 0.17021358510341336, "rouge1_precision_stderr": 0.0024373039382226077, "rouge1_recall": 0.3520563857681371, "rouge1_recall_stderr": 0.0043918973093069365, "rouge2_fmeasure": 0.050742967235947956, "rouge2_fmeasure_stderr": 0.0016968813344534614, "rouge2_precision": 0.03871189783801819, "rouge2_precision_stderr": 0.0013940343459980615, "rouge2_recall": 0.0841631095984103, "rouge2_recall_stderr": 0.002861696063072104, "rougeL_fmeasure": 0.1634042310463717, "rougeL_fmeasure_stderr": 0.002018854408540066, "rougeL_precision": 0.12632426569550884, "rougeL_precision_stderr": 0.001895537050144316, "rougeL_recall": 0.262981285619352, "rougeL_recall_stderr": 0.0034911247032713426, "rougeLsum_fmeasure": 0.17070284614092165, "rougeLsum_fmeasure_stderr": 0.0021883919787213176, "rougeLsum_precision": 0.13144702322249674, "rougeLsum_precision_stderr": 0.001966787320258499, "rougeLsum_recall": 0.2761271441560725, "rougeLsum_recall_stderr": 0.0039020054143034}}, "1": {"article_DOC_summary": {"bleu": 1.5179351179741758, "bleu_stderr": 0.05443901746520919, "rouge1_fmeasure": 0.18467573869385082, "rouge1_fmeasure_stderr": 0.0024560883653668973, "rouge1_precision": 0.13144344568204896, "rouge1_precision_stderr": 0.0018269600623641938, "rouge1_recall": 0.32351107047900046, "rouge1_recall_stderr": 0.0042227591238073415, "rouge2_fmeasure": 0.03956915695649403, "rouge2_fmeasure_stderr": 0.0013969113381600835, "rouge2_precision": 0.027812878457341133, "rouge2_precision_stderr": 0.0009822890456735115, "rouge2_recall": 0.07174244190002714, "rouge2_recall_stderr": 0.002623646025136939, "rougeL_fmeasure": 0.1433445973017209, "rougeL_fmeasure_stderr": 0.0018589035220238875, "rougeL_precision": 0.10177712280555466, "rougeL_precision_stderr": 0.0013642070339291124, "rougeL_recall": 0.2530115289147634, "rougeL_recall_stderr": 0.003361296390437375, "rougeLsum_fmeasure": 0.1463591812261117, "rougeLsum_fmeasure_stderr": 0.0020474760089749054, "rougeLsum_precision": 0.10392255150260828, "rougeLsum_precision_stderr": 0.0014996096643151138, "rougeLsum_recall": 0.25806957261490177, "rougeLsum_recall_stderr": 0.003642123743587609}}, "2": {"article_DOC_summary": {"bleu": 1.5610996318449655, "bleu_stderr": 0.061361742735683046, "rouge1_fmeasure": 0.18395936513372504, "rouge1_fmeasure_stderr": 0.0023841684164883858, "rouge1_precision": 0.1310122719121047, "rouge1_precision_stderr": 0.001774463078853173, "rouge1_recall": 0.32197642898756434, "rouge1_recall_stderr": 0.004128514222989525, "rouge2_fmeasure": 0.040730679478674064, "rouge2_fmeasure_stderr": 0.0014445249170945054, "rouge2_precision": 0.028592074721851848, "rouge2_precision_stderr": 0.0010146723977319822, "rouge2_recall": 0.07420088979991933, "rouge2_recall_stderr": 0.0027242327105131063, "rougeL_fmeasure": 0.14791071749656537, "rougeL_fmeasure_stderr": 0.0018605423267814124, "rougeL_precision": 0.10506688444375611, "rougeL_precision_stderr": 0.0013615120559406814, "rougeL_recall": 0.2609222630071341, "rougeL_recall_stderr": 0.0034251450207401918, "rougeLsum_fmeasure": 0.1463682832676705, "rougeLsum_fmeasure_stderr": 0.0020126152822738364, "rougeLsum_precision": 0.1039952495955331, "rougeLsum_precision_stderr": 0.00147131239785375, "rougeLsum_recall": 0.2579710996210495, "rougeLsum_recall_stderr": 0.0036391736637925577}}, "3": {"article_DOC_summary": {"bleu": 1.5762937299614814, "bleu_stderr": 0.07362827411239845, "rouge1_fmeasure": 0.17637720631581139, "rouge1_fmeasure_stderr": 0.00264797834039018, "rouge1_precision": 0.1284212760059797, "rouge1_precision_stderr": 0.002041598803417733, "rouge1_recall": 0.3028097339054088, "rouge1_recall_stderr": 0.004615535937102858, "rouge2_fmeasure": 0.03767895922224648, "rouge2_fmeasure_stderr": 0.001448043764579111, "rouge2_precision": 0.026902495307978534, "rouge2_precision_stderr": 0.0010371639486970465, "rouge2_recall": 0.06717388422150615, "rouge2_recall_stderr": 0.0027080652362817086, "rougeL_fmeasure": 0.14193633867939442, "rougeL_fmeasure_stderr": 0.002090431283179229, "rougeL_precision": 0.1032652428870357, "rougeL_precision_stderr": 0.001609764469770287, "rougeL_recall": 0.24490235352225267, "rougeL_recall_stderr": 0.003766948264946798, "rougeLsum_fmeasure": 0.13894288832668733, "rougeLsum_fmeasure_stderr": 0.002188039489816412, "rougeLsum_precision": 0.10103180270559059, "rougeLsum_precision_stderr": 0.0016751299955245328, "rougeLsum_recall": 0.2403006525007405, "rougeLsum_recall_stderr": 0.003943762632099877}}, "4": {"article_DOC_summary": {"bleu": 0.7513821615574038, "bleu_stderr": 0.07905589981346285, "rouge1_fmeasure": 0.048911634240246346, "rouge1_fmeasure_stderr": 0.002765697903406571, "rouge1_precision": 0.041468733586705914, "rouge1_precision_stderr": 0.0025971833623889785, "rouge1_recall": 0.07627594081632273, "rouge1_recall_stderr": 0.004409224115860876, "rouge2_fmeasure": 0.01035393012550112, "rouge2_fmeasure_stderr": 0.0009785561170831303, "rouge2_precision": 0.009048514108124975, "rouge2_precision_stderr": 0.0012181947613622947, "rouge2_recall": 0.01699435587690579, "rouge2_recall_stderr": 0.001625353914816647, "rougeL_fmeasure": 0.03907075670579812, "rougeL_fmeasure_stderr": 0.002182547417010533, "rougeL_precision": 0.03361424158025485, "rougeL_precision_stderr": 0.002192257614857122, "rougeL_recall": 0.06101985571016838, "rougeL_recall_stderr": 0.0035062484157730514, "rougeLsum_fmeasure": 0.039302040446406394, "rougeLsum_fmeasure_stderr": 0.0022230245548452298, "rougeLsum_precision": 0.03401363654888806, "rougeLsum_precision_stderr": 0.0022524026754152373, "rougeLsum_recall": 0.0612418372651088, "rougeLsum_recall_stderr": 0.0035630181436990096}}, "5": {"article_DOC_summary": {"bleu": 5.9664964945316196e-36, "bleu_stderr": 6.37663460549604e-30, "rouge1_fmeasure": 0.002904097352119159, "rouge1_fmeasure_stderr": 0.0008025657506213006, "rouge1_precision": 0.003214034101498322, "rouge1_precision_stderr": 0.0009221332972255172, "rouge1_recall": 0.002755372289086832, "rouge1_recall_stderr": 0.0007486991780011711, "rouge2_fmeasure": 0.0003555930988203656, "rouge2_fmeasure_stderr": 0.00020825306662602857, "rouge2_precision": 0.0004553870087049595, "rouge2_precision_stderr": 0.0002660277152615564, "rouge2_recall": 0.0002936371804296332, "rouge2_recall_stderr": 0.00017278177414040624, "rougeL_fmeasure": 0.0021617867610418057, "rougeL_fmeasure_stderr": 0.000600765218788033, "rougeL_precision": 0.0023980769070289483, "rougeL_precision_stderr": 0.0006896763285113808, "rougeL_recall": 0.0020343184807463843, "rougeL_recall_stderr": 0.0005525851742486296, "rougeLsum_fmeasure": 0.0024822527344372726, "rougeLsum_fmeasure_stderr": 0.0006926475370230472, "rougeLsum_precision": 0.0027490625210994384, "rougeLsum_precision_stderr": 0.0008051771911473355, "rougeLsum_recall": 0.0023634542664537216, "rougeLsum_recall_stderr": 0.0006464968490936459}}}}
4b284b17bc4/evaluation/rankeval/4b284b17bc4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811478,0
3
+ anli_r2,acc,0.329,0.014865395385928362,0
4
+ anli_r3,acc,0.34833333333333333,0.013759437498874075,0
5
+ arc_challenge,acc,0.26791808873720135,0.012942030195136437,0
6
+ arc_challenge,acc_norm,0.2883959044368601,0.013238394422428171,0
7
+ arc_easy,acc,0.6052188552188552,0.010030038935883584,0
8
+ arc_easy,acc_norm,0.5429292929292929,0.01022189756425604,0
9
+ boolq,acc,0.5623853211009174,0.008676717715731632,1
10
+ cb,acc,0.5714285714285714,0.06672848092813058,1
11
+ cb,f1,0.3888888888888889,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.469627564230233,0.004980566907790459,0
14
+ hellaswag,acc_norm,0.6134236207926708,0.004859699562451462,0
15
+ piqa,acc,0.7578890097932536,0.00999437126910438,0
16
+ piqa,acc_norm,0.7622415669205659,0.009932525779525492,0
17
+ rte,acc,0.5415162454873647,0.029992535385373314,0
18
+ sciq,acc,0.852,0.011234866364235235,0
19
+ sciq,acc_norm,0.764,0.013434451402438678,0
20
+ storycloze_2016,acc,0.7108498129342598,0.010484068799942072,0
21
+ winogrande,acc,0.5737963693764798,0.013898585965412338,0
4b284b17bc4/evaluation/rankeval/4b284b17bc4_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.32,0.014758652303574886,0
3
+ anli_r2,acc,0.324,0.014806864733738854,0
4
+ anli_r3,acc,0.3491666666666667,0.01376707539507725,0
5
+ arc_challenge,acc,0.2901023890784983,0.013261573677520764,0
6
+ arc_challenge,acc_norm,0.30119453924914674,0.013406741767847638,0
7
+ arc_easy,acc,0.6342592592592593,0.009882988069418829,0
8
+ arc_easy,acc_norm,0.5837542087542088,0.01011481940450087,0
9
+ boolq,acc,0.5409785932721712,0.008715635308774412,1
10
+ cb,acc,0.5535714285714286,0.06703189227942397,1
11
+ cb,f1,0.3890671420083185,,1
12
+ copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.4640509858593906,0.0049768677965835555,0
14
+ hellaswag,acc_norm,0.6082453694483171,0.004871447106554927,0
15
+ piqa,acc,0.7551686615886833,0.010032309105568793,0
16
+ piqa,acc_norm,0.766050054406964,0.009877236895137436,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.896,0.009658016218524301,0
19
+ sciq,acc_norm,0.88,0.010281328012747386,0
20
+ storycloze_2016,acc,0.711918760021379,0.010472537019822582,0
21
+ winogrande,acc,0.574585635359116,0.013895257666646378,0
4b284b17bc4/evaluation/rankeval/4b284b17bc4_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.343,0.015019206922356953,0
3
+ anli_r2,acc,0.318,0.014734079309311901,0
4
+ anli_r3,acc,0.325,0.013526454480351028,0
5
+ arc_challenge,acc,0.2901023890784983,0.013261573677520759,0
6
+ arc_challenge,acc_norm,0.31313993174061433,0.013552671543623496,0
7
+ arc_easy,acc,0.6325757575757576,0.009892552616211558,0
8
+ arc_easy,acc_norm,0.617003367003367,0.009974920384536479,0
9
+ boolq,acc,0.5489296636085627,0.008703080962379622,1
10
+ cb,acc,0.42857142857142855,0.06672848092813058,1
11
+ cb,f1,0.3058470764617691,,1
12
+ copa,acc,0.78,0.04163331998932263,0
13
+ hellaswag,acc,0.45727942640908187,0.004971534874389935,0
14
+ hellaswag,acc_norm,0.602867954590719,0.004883037758919964,0
15
+ piqa,acc,0.7540805223068553,0.010047331865625194,0
16
+ piqa,acc_norm,0.7698585418933623,0.009820832826839796,0
17
+ rte,acc,0.48736462093862815,0.030086851767188564,0
18
+ sciq,acc,0.906,0.009233052000787738,0
19
+ sciq,acc_norm,0.891,0.009859828407037186,0
20
+ storycloze_2016,acc,0.7215392838054516,0.010365521460604415,0
21
+ winogrande,acc,0.5808997632202052,0.013867325192210116,0
4b284b17bc4/evaluation/rankeval/4b284b17bc4_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.348,0.015070604603768408,0
3
+ anli_r2,acc,0.36,0.01518652793204012,0
4
+ anli_r3,acc,0.35083333333333333,0.013782212417178195,0
5
+ arc_challenge,acc,0.2901023890784983,0.013261573677520769,0
6
+ arc_challenge,acc_norm,0.31143344709897613,0.013532472099850949,0
7
+ arc_easy,acc,0.6266835016835017,0.009925009142802903,0
8
+ arc_easy,acc_norm,0.6203703703703703,0.009958037725468558,0
9
+ boolq,acc,0.5498470948012233,0.008701488203356937,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.40387403446226977,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4567815176259709,0.004971106265046551,0
14
+ hellaswag,acc_norm,0.5992830113523202,0.004890422457747258,0
15
+ piqa,acc,0.7578890097932536,0.009994371269104387,0
16
+ piqa,acc_norm,0.7682263329706203,0.00984514377279405,0
17
+ rte,acc,0.48375451263537905,0.030080573208738064,0
18
+ sciq,acc,0.923,0.008434580140240632,0
19
+ sciq,acc_norm,0.903,0.00936368937324812,0
20
+ storycloze_2016,acc,0.7247461250668092,0.010328538400500567,0
21
+ winogrande,acc,0.569060773480663,0.013917796623335966,0
4b284b17bc4/evaluation/rankeval/4b284b17bc4_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.36,0.015186527932040117,0
3
+ anli_r2,acc,0.347,0.015060472031706625,0
4
+ anli_r3,acc,0.3625,0.01388303787422552,0
5
+ arc_challenge,acc,0.302901023890785,0.013428241573185349,0
6
+ arc_challenge,acc_norm,0.32337883959044367,0.013669421630012129,0
7
+ arc_easy,acc,0.640993265993266,0.009843424713072174,0
8
+ arc_easy,acc_norm,0.6186868686868687,0.009966542497171025,0
9
+ boolq,acc,0.545565749235474,0.008708665643758015,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.4538378958668814,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.45180242979486157,0.004966544724452227,0
14
+ hellaswag,acc_norm,0.5955984863572994,0.004897728370737246,0
15
+ piqa,acc,0.7578890097932536,0.009994371269104385,0
16
+ piqa,acc_norm,0.7752992383025027,0.009738282586548389,0
17
+ rte,acc,0.48375451263537905,0.030080573208738064,0
18
+ sciq,acc,0.915,0.008823426366942331,0
19
+ sciq,acc_norm,0.912,0.008963053962592085,0
20
+ storycloze_2016,acc,0.7177979690005345,0.010407834479647672,0
21
+ winogrande,acc,0.5706393054459353,0.013911537499969163,0
4b284b17bc4/evaluation/rankeval/4b284b17bc4_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.363,0.015213890444671281,0
3
+ anli_r2,acc,0.347,0.015060472031706624,0
4
+ anli_r3,acc,0.34,0.013680495725767794,0
5
+ arc_challenge,acc,0.2986348122866894,0.013374078615068756,0
6
+ arc_challenge,acc_norm,0.310580204778157,0.013522292098053052,0
7
+ arc_easy,acc,0.6447811447811448,0.009820245899287117,0
8
+ arc_easy,acc_norm,0.625,0.009933992677987828,0
9
+ boolq,acc,0.5376146788990825,0.008720273736433679,1
10
+ cb,acc,0.5535714285714286,0.06703189227942397,1
11
+ cb,f1,0.3974410235905637,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.44981079466241786,0.004964579685712439,0
14
+ hellaswag,acc_norm,0.6002788289185421,0.004888398535520516,0
15
+ piqa,acc,0.7584330794341676,0.00998671800180446,0
16
+ piqa,acc_norm,0.7671381936887922,0.009861236071080757,0
17
+ rte,acc,0.49097472924187724,0.030091559826331334,0
18
+ sciq,acc,0.918,0.00868051561552374,0
19
+ sciq,acc_norm,0.908,0.009144376393151117,0
20
+ storycloze_2016,acc,0.7113842864778194,0.01047831178564294,0
21
+ winogrande,acc,0.5785319652722968,0.013878072377497603,0
4b284b21bc4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.06289750165250287
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.06289750165250287
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.12449829406834531
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.12449829406834531
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.14591205568832014
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.14591205568832014
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.14908085018598377
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.14908085018598377
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.15250785714191883
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.15250785714191883
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.15567663749325128
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.15567663749325128
14
+ e2e_nlg_cleaned,5,average,multiple,0.13176219937172037
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04639360161894793
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04639360161894793
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.041352212313184845
18
+ gem_xsum,1,median,rouge2_fmeasure,0.041352212313184845
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04614704751240165
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04614704751240165
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04492583126972709
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04492583126972709
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.01203821663060757
24
+ gem_xsum,4,median,rouge2_fmeasure,0.01203821663060757
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0003256051958251534
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0003256051958251534
27
+ gem_xsum,5,average,multiple,0.03186375242344904
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.04998894903569846
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.04998894903569846
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.051345397484036256
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.051345397484036256
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.053828506115298144
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.053828506115298144
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.051724489676439236
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.051724489676439236
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.052942763106877684
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.052942763106877684
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.054089458597439195
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.054089458597439195
40
+ web_nlg_en,5,average,multiple,0.05231992733596483
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03393757157227001
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03393757157227001
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05423182118294372
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05423182118294372
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05619324678157442
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05619324678157442
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04775424867054453
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04775424867054453
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.014607128479951145
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.014607128479951145
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0027143726441978717
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0027143726441978717
53
+ wiki_lingua_en,5,average,multiple,0.03490639822191362
4b284b21bc4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3423886062648571, "bleu_stderr": 0.03277534172219839, "rouge1_fmeasure": 0.1078155721409226, "rouge1_fmeasure_stderr": 0.0020097112571708245, "rouge1_precision": 0.07116912691303, "rouge1_precision_stderr": 0.0015925050224480028, "rouge1_recall": 0.2998523601292701, "rouge1_recall_stderr": 0.004615423133559915, "rouge2_fmeasure": 0.04998894903569846, "rouge2_fmeasure_stderr": 0.001254978454654862, "rouge2_precision": 0.03290155614985229, "rouge2_precision_stderr": 0.0009635800950074162, "rouge2_recall": 0.1430151870039024, "rouge2_recall_stderr": 0.003133828558045969, "rougeL_fmeasure": 0.10384529013890176, "rougeL_fmeasure_stderr": 0.0018689379304153127, "rougeL_precision": 0.06817638471397719, "rougeL_precision_stderr": 0.001440134234671522, "rougeL_recall": 0.2915042735714293, "rougeL_recall_stderr": 0.004516752469805975, "rougeLsum_fmeasure": 0.10254842199703165, "rougeLsum_fmeasure_stderr": 0.0018748895711891628, "rougeLsum_precision": 0.06758374630355668, "rougeLsum_precision_stderr": 0.0014755310584898722, "rougeLsum_recall": 0.2855613282697172, "rougeLsum_recall_stderr": 0.004322222242728911}}, "1": {"PALM_prompt": {"bleu": 0.43549317288896894, "bleu_stderr": 0.029438186163177924, "rouge1_fmeasure": 0.11204264924342298, "rouge1_fmeasure_stderr": 0.0018816879797699437, "rouge1_precision": 0.07179524471867899, "rouge1_precision_stderr": 0.0013752072779383184, "rouge1_recall": 0.35855038856048876, "rouge1_recall_stderr": 0.005049635846993475, "rouge2_fmeasure": 0.051345397484036256, "rouge2_fmeasure_stderr": 0.0011584347174914676, "rouge2_precision": 0.032801419147362856, "rouge2_precision_stderr": 0.0008197872630256377, "rouge2_recall": 0.1723852958712864, "rouge2_recall_stderr": 0.0035877516735487143, "rougeL_fmeasure": 0.10481665765175784, "rougeL_fmeasure_stderr": 0.0016825296642133658, "rougeL_precision": 0.06711558592890904, "rougeL_precision_stderr": 0.00122163800184802, "rougeL_recall": 0.33387823961526397, "rougeL_recall_stderr": 0.004537882130758048, "rougeLsum_fmeasure": 0.10653343348961458, "rougeLsum_fmeasure_stderr": 0.0017665097203702323, "rougeLsum_precision": 0.06830552553162243, "rougeLsum_precision_stderr": 0.001292240550330291, "rougeLsum_recall": 0.33957294457499176, "rougeLsum_recall_stderr": 0.004661961230072316}}, "2": {"PALM_prompt": {"bleu": 0.4731158648079456, "bleu_stderr": 0.019398006063203924, "rouge1_fmeasure": 0.11565039220118124, "rouge1_fmeasure_stderr": 0.0017909510986326618, "rouge1_precision": 0.07373712610103833, "rouge1_precision_stderr": 0.0013232763617533117, "rouge1_recall": 0.3838400605745808, "rouge1_recall_stderr": 0.004971353366559517, "rouge2_fmeasure": 0.053828506115298144, "rouge2_fmeasure_stderr": 0.0011299593484305154, "rouge2_precision": 0.03418002818557043, "rouge2_precision_stderr": 0.0007999127016124942, "rouge2_recall": 0.18966804432678097, "rouge2_recall_stderr": 0.0037070132910163134, "rougeL_fmeasure": 0.10799852330681028, "rougeL_fmeasure_stderr": 0.0016342464145243179, "rougeL_precision": 0.0688866789263329, "rougeL_precision_stderr": 0.001197716696827759, "rougeL_recall": 0.35416301817745277, "rougeL_recall_stderr": 0.004408869173242153, "rougeLsum_fmeasure": 0.11029915615802689, "rougeLsum_fmeasure_stderr": 0.0017015090669507955, "rougeLsum_precision": 0.07036220911566307, "rougeLsum_precision_stderr": 0.0012542262936579098, "rougeLsum_recall": 0.3642675981143951, "rougeLsum_recall_stderr": 0.004634298128673839}}, "3": {"PALM_prompt": {"bleu": 0.511033492138013, "bleu_stderr": 0.016836817368392938, "rouge1_fmeasure": 0.11234455309812195, "rouge1_fmeasure_stderr": 0.001658068318944836, "rouge1_precision": 0.0711394556954671, "rouge1_precision_stderr": 0.0012159991412829965, "rouge1_recall": 0.3852018338546001, "rouge1_recall_stderr": 0.005067790324136812, "rouge2_fmeasure": 0.051724489676439236, "rouge2_fmeasure_stderr": 0.0010463045011873814, "rouge2_precision": 0.03260081169248259, "rouge2_precision_stderr": 0.0007319564987104789, "rouge2_recall": 0.18947442868920766, "rouge2_recall_stderr": 0.0037625536404537635, "rougeL_fmeasure": 0.10462160106353212, "rougeL_fmeasure_stderr": 0.0015374663592284314, "rougeL_precision": 0.06636573472304302, "rougeL_precision_stderr": 0.0011254814997386617, "rougeL_recall": 0.3527863813991901, "rougeL_recall_stderr": 0.004403124502538447, "rougeLsum_fmeasure": 0.10692341362108594, "rougeLsum_fmeasure_stderr": 0.001592257988435529, "rougeLsum_precision": 0.0678291021786637, "rougeLsum_precision_stderr": 0.0011705423632374366, "rougeLsum_recall": 0.36350690371493805, "rougeLsum_recall_stderr": 0.004671323997080324}}, "4": {"PALM_prompt": {"bleu": 0.5255384435057461, "bleu_stderr": 0.03418580070894041, "rouge1_fmeasure": 0.11520145885524227, "rouge1_fmeasure_stderr": 0.0016544612830049232, "rouge1_precision": 0.07289393265058683, "rouge1_precision_stderr": 0.001221203882596679, "rouge1_recall": 0.39210555607501246, "rouge1_recall_stderr": 0.004907654599821545, "rouge2_fmeasure": 0.052942763106877684, "rouge2_fmeasure_stderr": 0.0010297898091120672, "rouge2_precision": 0.03333523314795858, "rouge2_precision_stderr": 0.0007290358506575526, "rouge2_recall": 0.19457648128168623, "rouge2_recall_stderr": 0.003616303249775819, "rougeL_fmeasure": 0.10655328646588215, "rougeL_fmeasure_stderr": 0.0014999277772535491, "rougeL_precision": 0.06752250125401707, "rougeL_precision_stderr": 0.0011086146136256522, "rougeL_recall": 0.35896707400416245, "rougeL_recall_stderr": 0.004279012704408649, "rougeLsum_fmeasure": 0.10971668045319283, "rougeLsum_fmeasure_stderr": 0.001577124609883823, "rougeLsum_precision": 0.06949836967046885, "rougeLsum_precision_stderr": 0.0011669918564828742, "rougeLsum_recall": 0.3713609435685992, "rougeLsum_recall_stderr": 0.004540169352829801}}, "5": {"PALM_prompt": {"bleu": 0.5823615010118224, "bleu_stderr": 0.037398869921054644, "rouge1_fmeasure": 0.11639260450950206, "rouge1_fmeasure_stderr": 0.0016495247557604747, "rouge1_precision": 0.07346269739111895, "rouge1_precision_stderr": 0.0012187681049390564, "rouge1_recall": 0.4042470714837263, "rouge1_recall_stderr": 0.005065518163952181, "rouge2_fmeasure": 0.054089458597439195, "rouge2_fmeasure_stderr": 0.0010444166935940533, "rouge2_precision": 0.033940599163428696, "rouge2_precision_stderr": 0.0007391477550345831, "rouge2_recall": 0.20304234761076909, "rouge2_recall_stderr": 0.003764961452207978, "rougeL_fmeasure": 0.10741878654019481, "rougeL_fmeasure_stderr": 0.0015017472778460627, "rougeL_precision": 0.06792933252591714, "rougeL_precision_stderr": 0.0011100797550413972, "rougeL_recall": 0.3687266029366595, "rougeL_recall_stderr": 0.004388331338140494, "rougeLsum_fmeasure": 0.11057542829413115, "rougeLsum_fmeasure_stderr": 0.0015808583588389114, "rougeLsum_precision": 0.06991029736225668, "rougeLsum_precision_stderr": 0.001173008019162227, "rougeLsum_recall": 0.38168151833425595, "rougeLsum_recall_stderr": 0.00467124723267976}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.4552750330972206, "bleu_stderr": 0.05646997136025405, "rouge1_fmeasure": 0.17104966216653122, "rouge1_fmeasure_stderr": 0.0018388410541572498, "rouge1_precision": 0.1463073350919793, "rouge1_precision_stderr": 0.001849976731473478, "rouge1_recall": 0.24687692205659445, "rouge1_recall_stderr": 0.0026245416604448834, "rouge2_fmeasure": 0.03393757157227001, "rouge2_fmeasure_stderr": 0.0008395819605445009, "rouge2_precision": 0.02868070817665277, "rouge2_precision_stderr": 0.0007455100631059865, "rouge2_recall": 0.050970339370236116, "rouge2_recall_stderr": 0.0013773850589296726, "rougeL_fmeasure": 0.13431002294362815, "rougeL_fmeasure_stderr": 0.001329252784566218, "rougeL_precision": 0.1134975332744349, "rougeL_precision_stderr": 0.0013046086784248739, "rougeL_recall": 0.19882847469371265, "rougeL_recall_stderr": 0.002160779528818444, "rougeLsum_fmeasure": 0.1570085078363825, "rougeLsum_fmeasure_stderr": 0.0016723834304551517, "rougeLsum_precision": 0.1341572569615249, "rougeLsum_precision_stderr": 0.001680913399139742, "rougeLsum_recall": 0.22735010800139077, "rougeLsum_recall_stderr": 0.002429214690741554}}, "1": {"tldr_en": {"bleu": 2.807870663958701, "bleu_stderr": 0.08388649431195123, "rouge1_fmeasure": 0.21883314563862602, "rouge1_fmeasure_stderr": 0.001968035833203396, "rouge1_precision": 0.19104818364291273, "rouge1_precision_stderr": 0.0022212767005973058, "rouge1_recall": 0.3165545579856358, "rouge1_recall_stderr": 0.002818835296854103, "rouge2_fmeasure": 0.05423182118294372, "rouge2_fmeasure_stderr": 0.0010465141353409224, "rouge2_precision": 0.04737338234449901, "rouge2_precision_stderr": 0.0010449765003741809, "rouge2_recall": 0.081573363995006, "rouge2_recall_stderr": 0.0017297723073854117, "rougeL_fmeasure": 0.15600647601501733, "rougeL_fmeasure_stderr": 0.0013358035921943341, "rougeL_precision": 0.1352656471757331, "rougeL_precision_stderr": 0.0015360042939327185, "rougeL_recall": 0.23168675674466935, "rougeL_recall_stderr": 0.0022572191515858007, "rougeLsum_fmeasure": 0.20602296990458846, "rougeLsum_fmeasure_stderr": 0.0018407304080033926, "rougeLsum_precision": 0.17973117727215357, "rougeLsum_precision_stderr": 0.0020830282164021902, "rougeLsum_recall": 0.29889898054403574, "rougeLsum_recall_stderr": 0.0026791790500151685}}, "2": {"tldr_en": {"bleu": 3.0758182963274967, "bleu_stderr": 0.07269865886048295, "rouge1_fmeasure": 0.213319730404769, "rouge1_fmeasure_stderr": 0.001947942927707738, "rouge1_precision": 0.22091940935835946, "rouge1_precision_stderr": 0.003036544780507811, "rouge1_recall": 0.28592540020450335, "rouge1_recall_stderr": 0.0028268458929152375, "rouge2_fmeasure": 0.05619324678157442, "rouge2_fmeasure_stderr": 0.0011378174233996204, "rouge2_precision": 0.0625668775391695, "rouge2_precision_stderr": 0.001863481800778566, "rouge2_recall": 0.07636442527094099, "rouge2_recall_stderr": 0.0016705370533154184, "rougeL_fmeasure": 0.15707830961714367, "rougeL_fmeasure_stderr": 0.001424704542577375, "rougeL_precision": 0.16524622277086715, "rougeL_precision_stderr": 0.002504977446892061, "rougeL_recall": 0.21312172953880804, "rougeL_recall_stderr": 0.002230572391304009, "rougeLsum_fmeasure": 0.20133540483239756, "rougeLsum_fmeasure_stderr": 0.0018432855451969799, "rougeLsum_precision": 0.2088533869488438, "rougeLsum_precision_stderr": 0.0029209986594073956, "rougeLsum_recall": 0.27031759901723496, "rougeLsum_recall_stderr": 0.00269091513844755}}, "3": {"tldr_en": {"bleu": 3.2676357062273516, "bleu_stderr": 0.12290136548273946, "rouge1_fmeasure": 0.17690311487463797, "rouge1_fmeasure_stderr": 0.0023173541475324253, "rouge1_precision": 0.21388175415191102, "rouge1_precision_stderr": 0.0037807201763198858, "rouge1_recall": 0.22109179833036308, "rouge1_recall_stderr": 0.00317048229219639, "rouge2_fmeasure": 0.04775424867054453, "rouge2_fmeasure_stderr": 0.0011657814934947282, "rouge2_precision": 0.06310561248802647, "rouge2_precision_stderr": 0.0021350241076741575, "rouge2_recall": 0.05983363177094545, "rouge2_recall_stderr": 0.0015915281660158853, "rougeL_fmeasure": 0.13377606619329538, "rougeL_fmeasure_stderr": 0.0017521905460361976, "rougeL_precision": 0.16656647324261953, "rougeL_precision_stderr": 0.0032154715784240336, "rougeL_recall": 0.16788106886771248, "rougeL_recall_stderr": 0.0024768275501905303, "rougeLsum_fmeasure": 0.16765421945999662, "rougeLsum_fmeasure_stderr": 0.002202375233222233, "rougeLsum_precision": 0.2032660883469589, "rougeLsum_precision_stderr": 0.003645991164048245, "rougeLsum_recall": 0.20988404520976878, "rougeLsum_recall_stderr": 0.003033569663633178}}, "4": {"tldr_en": {"bleu": 0.3207302415446247, "bleu_stderr": 0.03951812995159553, "rouge1_fmeasure": 0.05381265634572307, "rouge1_fmeasure_stderr": 0.0019615926895963105, "rouge1_precision": 0.07217489085431607, "rouge1_precision_stderr": 0.003030440715398035, "rouge1_recall": 0.06719640775054124, "rouge1_recall_stderr": 0.0025953918642397556, "rouge2_fmeasure": 0.014607128479951145, "rouge2_fmeasure_stderr": 0.0007893617726734669, "rouge2_precision": 0.021665967083708774, "rouge2_precision_stderr": 0.0015098116987345535, "rouge2_recall": 0.01860172806438717, "rouge2_recall_stderr": 0.0011134284906677621, "rougeL_fmeasure": 0.041897511040161206, "rougeL_fmeasure_stderr": 0.0015198623386063986, "rougeL_precision": 0.057955761496930415, "rougeL_precision_stderr": 0.002549676328226009, "rougeL_recall": 0.05288705994275681, "rougeL_recall_stderr": 0.0020802147270799964, "rougeLsum_fmeasure": 0.05091825334557307, "rougeLsum_fmeasure_stderr": 0.001857333758746474, "rougeLsum_precision": 0.06845672227370601, "rougeLsum_precision_stderr": 0.0028945029688266562, "rougeLsum_recall": 0.06371644081837961, "rougeLsum_recall_stderr": 0.0024692144864177654}}, "5": {"tldr_en": {"bleu": 2.2490869778774144e-09, "bleu_stderr": 4.579875964652809e-08, "rouge1_fmeasure": 0.008600982975105755, "rouge1_fmeasure_stderr": 0.000871586481245996, "rouge1_precision": 0.012746310259293975, "rouge1_precision_stderr": 0.001498370110228753, "rouge1_recall": 0.01036719903184163, "rouge1_recall_stderr": 0.0011041663951146658, "rouge2_fmeasure": 0.0027143726441978717, "rouge2_fmeasure_stderr": 0.0004021894165602589, "rouge2_precision": 0.0046837043167768724, "rouge2_precision_stderr": 0.000851048694914921, "rouge2_recall": 0.0032041325852383415, "rouge2_recall_stderr": 0.00048168285623642387, "rougeL_fmeasure": 0.006953217638177468, "rougeL_fmeasure_stderr": 0.0007188116843350712, "rougeL_precision": 0.010658331542732118, "rougeL_precision_stderr": 0.0013171585867806077, "rougeL_recall": 0.008285275201527513, "rougeL_recall_stderr": 0.0008892336234217646, "rougeLsum_fmeasure": 0.008236010332581721, "rougeLsum_fmeasure_stderr": 0.0008379564793264335, "rougeLsum_precision": 0.01225616498642844, "rougeLsum_precision_stderr": 0.0014511703007000516, "rougeLsum_recall": 0.009942438567258063, "rougeLsum_recall_stderr": 0.0010638347073576458}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 3.3868925634521596, "bleu_stderr": 0.07415023515490869, "rouge1_fmeasure": 0.16968795558812888, "rouge1_fmeasure_stderr": 0.0020520636735140885, "rouge1_precision": 0.12781588961197554, "rouge1_precision_stderr": 0.0016392738386146906, "rouge1_recall": 0.26686211195039367, "rouge1_recall_stderr": 0.0030010672113457175, "rouge2_fmeasure": 0.06289750165250287, "rouge2_fmeasure_stderr": 0.0013193385142581298, "rouge2_precision": 0.04754240795799523, "rouge2_precision_stderr": 0.001014281868832704, "rouge2_recall": 0.09732440743600192, "rouge2_recall_stderr": 0.002035877095135508, "rougeL_fmeasure": 0.15745602394794062, "rougeL_fmeasure_stderr": 0.0017868944105710683, "rougeL_precision": 0.11843660338175163, "rougeL_precision_stderr": 0.0014270824435811313, "rougeL_recall": 0.24837983245494402, "rougeL_recall_stderr": 0.0026307193382184542, "rougeLsum_fmeasure": 0.14802696104042146, "rougeLsum_fmeasure_stderr": 0.0018361037263776232, "rougeLsum_precision": 0.11138415788379845, "rougeLsum_precision_stderr": 0.0014602010171592388, "rougeLsum_recall": 0.23362654109870873, "rougeLsum_recall_stderr": 0.0027185132812130603}}, "1": {"generate_text_restaurant": {"bleu": 5.656637998489558, "bleu_stderr": 0.06138640019754124, "rouge1_fmeasure": 0.29857293628895076, "rouge1_fmeasure_stderr": 0.0017883517178428313, "rouge1_precision": 0.22448431958439818, "rouge1_precision_stderr": 0.0015774519438982097, "rouge1_recall": 0.475937695786901, "rouge1_recall_stderr": 0.002823854240730373, "rouge2_fmeasure": 0.12449829406834531, "rouge2_fmeasure_stderr": 0.0012625501065363337, "rouge2_precision": 0.09278798103068853, "rouge2_precision_stderr": 0.0010182785337770847, "rouge2_recall": 0.2039364568886483, "rouge2_recall_stderr": 0.0021426616098330804, "rougeL_fmeasure": 0.2478615356783164, "rougeL_fmeasure_stderr": 0.0013407338809586194, "rougeL_precision": 0.18571244490415678, "rougeL_precision_stderr": 0.0011675141052374309, "rougeL_recall": 0.3980265502028155, "rougeL_recall_stderr": 0.0023515010622215645, "rougeLsum_fmeasure": 0.24278851384210426, "rougeLsum_fmeasure_stderr": 0.0016843478489220692, "rougeLsum_precision": 0.18244634491896064, "rougeLsum_precision_stderr": 0.001437571751990845, "rougeLsum_recall": 0.3876575760439037, "rougeLsum_recall_stderr": 0.002699802672015727}}, "2": {"generate_text_restaurant": {"bleu": 6.856901680068561, "bleu_stderr": 0.08124041054587171, "rouge1_fmeasure": 0.32161895837988735, "rouge1_fmeasure_stderr": 0.00168146427395452, "rouge1_precision": 0.2401800477102809, "rouge1_precision_stderr": 0.0014473576335462571, "rouge1_recall": 0.5163816757468545, "rouge1_recall_stderr": 0.0027490552702758896, "rouge2_fmeasure": 0.14591205568832014, "rouge2_fmeasure_stderr": 0.001294685527449689, "rouge2_precision": 0.10780300493629737, "rouge2_precision_stderr": 0.0010069472486850257, "rouge2_recall": 0.2413921319510364, "rouge2_recall_stderr": 0.002257493837016372, "rougeL_fmeasure": 0.2704562548681343, "rougeL_fmeasure_stderr": 0.0013038206676934103, "rougeL_precision": 0.20140298531801745, "rougeL_precision_stderr": 0.0011006514010170898, "rougeL_recall": 0.43694912141494435, "rougeL_recall_stderr": 0.002368493512084033, "rougeLsum_fmeasure": 0.2651360409168281, "rougeLsum_fmeasure_stderr": 0.00165326573625161, "rougeLsum_precision": 0.19780238487026972, "rougeLsum_precision_stderr": 0.0013643352859564666, "rougeLsum_recall": 0.42666941865224894, "rougeLsum_recall_stderr": 0.0027363306527661384}}, "3": {"generate_text_restaurant": {"bleu": 7.067980420028392, "bleu_stderr": 0.07674666618798165, "rouge1_fmeasure": 0.3234918087831591, "rouge1_fmeasure_stderr": 0.0016866852215600267, "rouge1_precision": 0.24119330473410452, "rouge1_precision_stderr": 0.001436265091489619, "rouge1_recall": 0.520447176584629, "rouge1_recall_stderr": 0.0027841731171524635, "rouge2_fmeasure": 0.14908085018598377, "rouge2_fmeasure_stderr": 0.001325863965517803, "rouge2_precision": 0.1098856604345649, "rouge2_precision_stderr": 0.0010230282133462645, "rouge2_recall": 0.24765707981063853, "rouge2_recall_stderr": 0.002344687079258296, "rougeL_fmeasure": 0.2728226991547497, "rougeL_fmeasure_stderr": 0.001319038570084664, "rougeL_precision": 0.2028877822846739, "rougeL_precision_stderr": 0.00110597560698619, "rougeL_recall": 0.44168375010036015, "rougeL_recall_stderr": 0.0024210245845319865, "rougeLsum_fmeasure": 0.2684669151387224, "rougeLsum_fmeasure_stderr": 0.0016553181450139245, "rougeLsum_precision": 0.19995751457276323, "rougeLsum_precision_stderr": 0.0013597786817193271, "rougeLsum_recall": 0.4331317668864426, "rougeLsum_recall_stderr": 0.002771288232621466}}, "4": {"generate_text_restaurant": {"bleu": 7.301331414189049, "bleu_stderr": 0.07282835065460666, "rouge1_fmeasure": 0.3281997540286874, "rouge1_fmeasure_stderr": 0.0016842914784987548, "rouge1_precision": 0.24454894011699824, "rouge1_precision_stderr": 0.0014458985189821688, "rouge1_recall": 0.5282893998840518, "rouge1_recall_stderr": 0.0027239476703194553, "rouge2_fmeasure": 0.15250785714191883, "rouge2_fmeasure_stderr": 0.0013289295083122636, "rouge2_precision": 0.11241096101927037, "rouge2_precision_stderr": 0.0010296837842975505, "rouge2_recall": 0.25292679532334217, "rouge2_recall_stderr": 0.0023135968641909677, "rougeL_fmeasure": 0.27576809325399554, "rougeL_fmeasure_stderr": 0.001332292317195587, "rougeL_precision": 0.20493346554385022, "rougeL_precision_stderr": 0.0011217276272122712, "rougeL_recall": 0.44669665680269593, "rougeL_recall_stderr": 0.0023872195322647088, "rougeLsum_fmeasure": 0.2720277049182356, "rougeLsum_fmeasure_stderr": 0.0016797663432208906, "rougeLsum_precision": 0.20253099841039804, "rougeLsum_precision_stderr": 0.0013836573195200987, "rougeLsum_recall": 0.4389119231936933, "rougeLsum_recall_stderr": 0.0027715255759945875}}, "5": {"generate_text_restaurant": {"bleu": 7.391640493190723, "bleu_stderr": 0.07696132388022625, "rouge1_fmeasure": 0.33316361794611193, "rouge1_fmeasure_stderr": 0.0016721318654132375, "rouge1_precision": 0.248561550528162, "rouge1_precision_stderr": 0.0014842009296995168, "rouge1_recall": 0.5361907676530253, "rouge1_recall_stderr": 0.0026923692664488426, "rouge2_fmeasure": 0.15567663749325128, "rouge2_fmeasure_stderr": 0.001316099001704392, "rouge2_precision": 0.11498051543992278, "rouge2_precision_stderr": 0.0010486619107346665, "rouge2_recall": 0.2578870983510302, "rouge2_recall_stderr": 0.0022815014259324113, "rougeL_fmeasure": 0.27881193769275936, "rougeL_fmeasure_stderr": 0.0013297414200118412, "rougeL_precision": 0.2074081838133947, "rougeL_precision_stderr": 0.0011575590389617165, "rougeL_recall": 0.45169489347455444, "rougeL_recall_stderr": 0.0023905395426844995, "rougeLsum_fmeasure": 0.27704641367351746, "rougeLsum_fmeasure_stderr": 0.001654188879883823, "rougeLsum_precision": 0.20657408552804282, "rougeLsum_precision_stderr": 0.0014132706167399262, "rougeLsum_recall": 0.4467577031168348, "rougeLsum_recall_stderr": 0.0027053770792765386}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 1.8973746751821576, "bleu_stderr": 0.07374608873039461, "rouge1_fmeasure": 0.20448305051370513, "rouge1_fmeasure_stderr": 0.0024499517599214865, "rouge1_precision": 0.14714404834539455, "rouge1_precision_stderr": 0.0018923895784899547, "rouge1_recall": 0.35332808198249066, "rouge1_recall_stderr": 0.004277854613117222, "rouge2_fmeasure": 0.04639360161894793, "rouge2_fmeasure_stderr": 0.0015361212037212923, "rouge2_precision": 0.03290270141635618, "rouge2_precision_stderr": 0.0010989269503369368, "rouge2_recall": 0.08287973270989689, "rouge2_recall_stderr": 0.002833032880998302, "rougeL_fmeasure": 0.1522143356903949, "rougeL_fmeasure_stderr": 0.0018364801701113477, "rougeL_precision": 0.10929554255441323, "rougeL_precision_stderr": 0.001387362394565063, "rougeL_recall": 0.26469649902339626, "rougeL_recall_stderr": 0.0033616815736178403, "rougeLsum_fmeasure": 0.1627172387554665, "rougeLsum_fmeasure_stderr": 0.0020740152357133783, "rougeLsum_precision": 0.11672705515751206, "rougeLsum_precision_stderr": 0.001546854617689576, "rougeLsum_recall": 0.28315180080173324, "rougeLsum_recall_stderr": 0.0037966298013545237}}, "1": {"article_DOC_summary": {"bleu": 1.6712614198546765, "bleu_stderr": 0.08029090892350033, "rouge1_fmeasure": 0.18885391905418497, "rouge1_fmeasure_stderr": 0.0025584792794476653, "rouge1_precision": 0.1344416388362992, "rouge1_precision_stderr": 0.0019127327308320389, "rouge1_recall": 0.33053933219336484, "rouge1_recall_stderr": 0.00435936977132835, "rouge2_fmeasure": 0.041352212313184845, "rouge2_fmeasure_stderr": 0.0015256985849036882, "rouge2_precision": 0.02919021495874484, "rouge2_precision_stderr": 0.001083579043412117, "rouge2_recall": 0.07395642387380598, "rouge2_recall_stderr": 0.0027833547930418757, "rougeL_fmeasure": 0.1447818747001178, "rougeL_fmeasure_stderr": 0.001931305342910804, "rougeL_precision": 0.10283551822207533, "rougeL_precision_stderr": 0.0014281321188522025, "rougeL_recall": 0.25511273674645885, "rougeL_recall_stderr": 0.0034307162021354593, "rougeLsum_fmeasure": 0.14973596762001254, "rougeLsum_fmeasure_stderr": 0.002121716066142779, "rougeLsum_precision": 0.10637101855903888, "rougeLsum_precision_stderr": 0.0015675521812090556, "rougeLsum_recall": 0.26365396988763345, "rougeLsum_recall_stderr": 0.003725007857599503}}, "2": {"article_DOC_summary": {"bleu": 1.7895958187086474, "bleu_stderr": 0.04787428909067152, "rouge1_fmeasure": 0.19729124660372196, "rouge1_fmeasure_stderr": 0.0024865996635105967, "rouge1_precision": 0.14045160525988198, "rouge1_precision_stderr": 0.0018675726895835078, "rouge1_recall": 0.3448073202698317, "rouge1_recall_stderr": 0.004206053313996407, "rouge2_fmeasure": 0.04614704751240165, "rouge2_fmeasure_stderr": 0.001558612721206191, "rouge2_precision": 0.03250319865239163, "rouge2_precision_stderr": 0.001104582027493848, "rouge2_recall": 0.08285013633430294, "rouge2_recall_stderr": 0.0028679116547561338, "rougeL_fmeasure": 0.15159285154298055, "rougeL_fmeasure_stderr": 0.0018751369925771642, "rougeL_precision": 0.1076421486644076, "rougeL_precision_stderr": 0.00138815532226034, "rougeL_recall": 0.2669756747819945, "rougeL_recall_stderr": 0.0033509483772327935, "rougeLsum_fmeasure": 0.1571758204423751, "rougeLsum_fmeasure_stderr": 0.002082270276853359, "rougeLsum_precision": 0.11165266260752228, "rougeLsum_precision_stderr": 0.0015420215686911798, "rougeLsum_recall": 0.2762348157117063, "rougeLsum_recall_stderr": 0.0036302813408504828}}, "3": {"article_DOC_summary": {"bleu": 1.799886255448494, "bleu_stderr": 0.06241605343512123, "rouge1_fmeasure": 0.19247676178691825, "rouge1_fmeasure_stderr": 0.0027783344646224338, "rouge1_precision": 0.1397482550494329, "rouge1_precision_stderr": 0.0021614980521941947, "rouge1_recall": 0.3314206756572503, "rouge1_recall_stderr": 0.0048258842534713575, "rouge2_fmeasure": 0.04492583126972709, "rouge2_fmeasure_stderr": 0.0015630404482977962, "rouge2_precision": 0.032034815897723355, "rouge2_precision_stderr": 0.0011234176352706835, "rouge2_recall": 0.08021224571691461, "rouge2_recall_stderr": 0.00290937059770342, "rougeL_fmeasure": 0.14710040508558128, "rougeL_fmeasure_stderr": 0.0020844906061187624, "rougeL_precision": 0.10664386827077653, "rougeL_precision_stderr": 0.0016219261205998713, "rougeL_recall": 0.25469242016766824, "rougeL_recall_stderr": 0.003747468811937111, "rougeLsum_fmeasure": 0.1523907734490472, "rougeLsum_fmeasure_stderr": 0.0023066662456799336, "rougeLsum_precision": 0.11038994072863482, "rougeLsum_precision_stderr": 0.001775109847716677, "rougeLsum_recall": 0.2637144833686383, "rougeLsum_recall_stderr": 0.004087246305150146}}, "4": {"article_DOC_summary": {"bleu": 0.912241430159217, "bleu_stderr": 0.1508779538575514, "rouge1_fmeasure": 0.052109895215743926, "rouge1_fmeasure_stderr": 0.0029325961974541494, "rouge1_precision": 0.04293551268263482, "rouge1_precision_stderr": 0.0025300915700631186, "rouge1_recall": 0.08251909123638941, "rouge1_recall_stderr": 0.0047552295627311435, "rouge2_fmeasure": 0.01203821663060757, "rouge2_fmeasure_stderr": 0.001058756941479336, "rouge2_precision": 0.008902116582691397, "rouge2_precision_stderr": 0.0007882814434287245, "rouge2_recall": 0.020355209333257882, "rouge2_recall_stderr": 0.0018399757489791825, "rougeL_fmeasure": 0.039428572953528446, "rougeL_fmeasure_stderr": 0.002210624192813922, "rougeL_precision": 0.03291018535700006, "rougeL_precision_stderr": 0.001988611171780747, "rougeL_recall": 0.06245660895956642, "rougeL_recall_stderr": 0.0036021201516783473, "rougeLsum_fmeasure": 0.0423468971865807, "rougeLsum_fmeasure_stderr": 0.002398760878941016, "rougeLsum_precision": 0.03518063964656488, "rougeLsum_precision_stderr": 0.0021155739741956746, "rougeLsum_recall": 0.0671088243853192, "rougeLsum_recall_stderr": 0.003920638757312349}}, "5": {"article_DOC_summary": {"bleu": 3.097815638153428e-39, "bleu_stderr": 5.018895149426352e-34, "rouge1_fmeasure": 0.0023684978392920267, "rouge1_fmeasure_stderr": 0.000684399718764436, "rouge1_precision": 0.002739968547331711, "rouge1_precision_stderr": 0.0008278012212952025, "rouge1_recall": 0.0022134599272398445, "rouge1_recall_stderr": 0.0006273422111807437, "rouge2_fmeasure": 0.0003256051958251534, "rouge2_fmeasure_stderr": 0.00013752393773970521, "rouge2_precision": 0.000376696230153845, "rouge2_precision_stderr": 0.00016337393027254123, "rouge2_recall": 0.00030314025597044463, "rouge2_recall_stderr": 0.00012776302473939982, "rougeL_fmeasure": 0.0019272015023736237, "rougeL_fmeasure_stderr": 0.0005653230599258551, "rougeL_precision": 0.0021741866307845633, "rougeL_precision_stderr": 0.0006503820290684123, "rougeL_recall": 0.0018275205470825698, "rougeL_recall_stderr": 0.0005304151694392347, "rougeLsum_fmeasure": 0.0020216316744952494, "rougeLsum_fmeasure_stderr": 0.0005921597049542807, "rougeLsum_precision": 0.002299258100195655, "rougeLsum_precision_stderr": 0.0006916877058747434, "rougeLsum_recall": 0.0019069310038515171, "rougeLsum_recall_stderr": 0.0005514157386454834}}}}
4b284b21bc4/evaluation/rankeval/4b284b21bc4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811485,0
3
+ anli_r2,acc,0.337,0.0149550879186536,0
4
+ anli_r3,acc,0.355,0.013819249004047296,0
5
+ arc_challenge,acc,0.28668941979522183,0.013214986329274757,0
6
+ arc_challenge,acc_norm,0.30631399317406144,0.013470584417276513,0
7
+ arc_easy,acc,0.617003367003367,0.009974920384536469,0
8
+ arc_easy,acc_norm,0.5462962962962963,0.010215708295494117,0
9
+ boolq,acc,0.5669724770642202,0.008666251305518059,1
10
+ cb,acc,0.48214285714285715,0.0673769750864465,1
11
+ cb,f1,0.4347442680776014,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.4841665006970723,0.004987278910505115,0
14
+ hellaswag,acc_norm,0.6352320254929297,0.004803812631994966,0
15
+ piqa,acc,0.7578890097932536,0.00999437126910438,0
16
+ piqa,acc_norm,0.7676822633297062,0.009853201384168243,0
17
+ rte,acc,0.5306859205776173,0.03003973059219781,0
18
+ sciq,acc,0.845,0.011450157470799475,0
19
+ sciq,acc_norm,0.757,0.013569640199177458,0
20
+ storycloze_2016,acc,0.7151256012827365,0.010437513986611718,0
21
+ winogrande,acc,0.5990528808208366,0.013773974554948033,0
4b284b21bc4/evaluation/rankeval/4b284b21bc4_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.336,0.01494414023379502,0
3
+ anli_r2,acc,0.315,0.014696631960792506,0
4
+ anli_r3,acc,0.34,0.0136804957257678,0
5
+ arc_challenge,acc,0.29266211604095566,0.01329591610361942,0
6
+ arc_challenge,acc_norm,0.32849829351535836,0.013724978465537357,0
7
+ arc_easy,acc,0.6220538720538721,0.009949405744045452,0
8
+ arc_easy,acc_norm,0.5787037037037037,0.010131882498193127,0
9
+ boolq,acc,0.5669724770642202,0.008666251305518059,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.38181818181818183,,1
12
+ copa,acc,0.74,0.04408440022768077,0
13
+ hellaswag,acc,0.48137821151165106,0.004986319587524962,0
14
+ hellaswag,acc_norm,0.6344353714399522,0.004806039039008954,0
15
+ piqa,acc,0.7551686615886833,0.010032309105568788,0
16
+ piqa,acc_norm,0.764961915125136,0.009893146688805308,0
17
+ rte,acc,0.5451263537906137,0.029973636495415252,0
18
+ sciq,acc,0.891,0.00985982840703719,0
19
+ sciq,acc_norm,0.871,0.010605256784796579,0
20
+ storycloze_2016,acc,0.7044361304115446,0.01055177883937378,0
21
+ winogrande,acc,0.5974743488555643,0.013782866831703048,0
4b284b21bc4/evaluation/rankeval/4b284b21bc4_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.327,0.014842213153411247,0
3
+ anli_r2,acc,0.333,0.01491084616422986,0
4
+ anli_r3,acc,0.3408333333333333,0.01368860079329693,0
5
+ arc_challenge,acc,0.295221843003413,0.013329750293382318,0
6
+ arc_challenge,acc_norm,0.32337883959044367,0.013669421630012129,0
7
+ arc_easy,acc,0.622895622895623,0.009945041946366499,0
8
+ arc_easy,acc_norm,0.6018518518518519,0.010044662374653398,0
9
+ boolq,acc,0.5920489296636086,0.008595583792654892,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.3829365079365079,,1
12
+ copa,acc,0.78,0.04163331998932262,0
13
+ hellaswag,acc,0.48048197570205137,0.00498597821493792,0
14
+ hellaswag,acc_norm,0.6397132045409281,0.004791024004587989,0
15
+ piqa,acc,0.7578890097932536,0.009994371269104376,0
16
+ piqa,acc_norm,0.7682263329706203,0.009845143772794043,0
17
+ rte,acc,0.5090252707581228,0.030091559826331334,0
18
+ sciq,acc,0.903,0.009363689373248092,0
19
+ sciq,acc_norm,0.882,0.010206869264381791,0
20
+ storycloze_2016,acc,0.7161945483698557,0.01042569627973092,0
21
+ winogrande,acc,0.6053670086819258,0.013736915172371883,0
4b284b21bc4/evaluation/rankeval/4b284b21bc4_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.338,0.014965960710224496,0
3
+ anli_r2,acc,0.345,0.015039986742055238,0
4
+ anli_r3,acc,0.3566666666666667,0.013833742805050717,0
5
+ arc_challenge,acc,0.29436860068259385,0.013318528460539429,0
6
+ arc_challenge,acc_norm,0.3319112627986348,0.01376098820088054,0
7
+ arc_easy,acc,0.627104377104377,0.009922743197129257,0
8
+ arc_easy,acc_norm,0.609006734006734,0.010012992232540631,0
9
+ boolq,acc,0.5923547400611621,0.008594580270731619,1
10
+ cb,acc,0.6071428571428571,0.0658538889806635,1
11
+ cb,f1,0.5367003367003368,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4826727743477395,0.004986784319771787,0
14
+ hellaswag,acc_norm,0.6368253335988847,0.004799317209902001,0
15
+ piqa,acc,0.7589771490750816,0.009979042717267314,0
16
+ piqa,acc_norm,0.7742110990206746,0.009754980670917311,0
17
+ rte,acc,0.5631768953068592,0.029855247390314945,0
18
+ sciq,acc,0.913,0.0089168666307459,0
19
+ sciq,acc_norm,0.897,0.009616833339695798,0
20
+ storycloze_2016,acc,0.7204703367183325,0.01037770209970486,0
21
+ winogrande,acc,0.6037884767166535,0.013746404157154949,0
4b284b21bc4/evaluation/rankeval/4b284b21bc4_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.343,0.015019206922356951,0
3
+ anli_r2,acc,0.346,0.01505026612756445,0
4
+ anli_r3,acc,0.36083333333333334,0.01386918025244486,0
5
+ arc_challenge,acc,0.2960750853242321,0.013340916085246271,0
6
+ arc_challenge,acc_norm,0.3242320819112628,0.013678810399518819,0
7
+ arc_easy,acc,0.6283670033670034,0.00991589712365879,0
8
+ arc_easy,acc_norm,0.6153198653198653,0.009983171707008997,0
9
+ boolq,acc,0.6119266055045871,0.00852313058476084,1
10
+ cb,acc,0.5535714285714286,0.06703189227942395,1
11
+ cb,f1,0.4583333333333333,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.48157737502489545,0.0049863932662691625,0
14
+ hellaswag,acc_norm,0.6417048396733719,0.00478519504988916,0
15
+ piqa,acc,0.7595212187159956,0.009971345364651078,0
16
+ piqa,acc_norm,0.7676822633297062,0.009853201384168243,0
17
+ rte,acc,0.5379061371841155,0.030009848912529113,0
18
+ sciq,acc,0.923,0.008434580140240648,0
19
+ sciq,acc_norm,0.912,0.008963053962592074,0
20
+ storycloze_2016,acc,0.7338321753073223,0.010220104800551206,0
21
+ winogrande,acc,0.6085240726124704,0.01371748707129085,0
4b284b21bc4/evaluation/rankeval/4b284b21bc4_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.351,0.015100563798316405,0
3
+ anli_r2,acc,0.345,0.015039986742055237,0
4
+ anli_r3,acc,0.345,0.013728421539454878,0
5
+ arc_challenge,acc,0.30716723549488056,0.013481034054980945,0
6
+ arc_challenge,acc_norm,0.32337883959044367,0.013669421630012122,0
7
+ arc_easy,acc,0.6300505050505051,0.009906656266021155,0
8
+ arc_easy,acc_norm,0.6111111111111112,0.01000324833531377,0
9
+ boolq,acc,0.6146788990825688,0.008511930879680652,1
10
+ cb,acc,0.5714285714285714,0.06672848092813058,1
11
+ cb,f1,0.37671957671957673,,1
12
+ copa,acc,0.78,0.04163331998932261,0
13
+ hellaswag,acc,0.4827723561043617,0.004986818680313444,0
14
+ hellaswag,acc_norm,0.6446922923720374,0.004776283203468094,0
15
+ piqa,acc,0.7529923830250272,0.010062268140772625,0
16
+ piqa,acc_norm,0.7671381936887922,0.009861236071080753,0
17
+ rte,acc,0.5776173285198556,0.02973162264649588,0
18
+ sciq,acc,0.919,0.008632121032139978,0
19
+ sciq,acc_norm,0.907,0.009188875634996669,0
20
+ storycloze_2016,acc,0.7252805986103688,0.010322309878339507,0
21
+ winogrande,acc,0.595895816890292,0.013791610664670845,0
4b284b28bc4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.010022915068112901
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.010022915068112901
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.20489136085595536
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.20489136085595536
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.2325284196471626
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.2325284196471626
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.24388713793667496
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.24388713793667496
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.24852828649672148
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.24852828649672148
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.24634621400768708
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.24634621400768708
14
+ e2e_nlg_cleaned,5,average,multiple,0.19770072233538574
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.0511787638415587
16
+ gem_xsum,0,median,rouge2_fmeasure,0.0511787638415587
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04515071736102295
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04515071736102295
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.047730927310845786
20
+ gem_xsum,2,median,rouge2_fmeasure,0.047730927310845786
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04656621187743751
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04656621187743751
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.010817994039374855
24
+ gem_xsum,4,median,rouge2_fmeasure,0.010817994039374855
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0001299594149643802
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0001299594149643802
27
+ gem_xsum,5,average,multiple,0.03359576230753403
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.049917192299013896
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.049917192299013896
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.05553061893758205
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.05553061893758205
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.057331612844470456
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.057331612844470456
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05836966723015618
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05836966723015618
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.0577700863367864
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.0577700863367864
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05951196634046783
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05951196634046783
40
+ web_nlg_en,5,average,multiple,0.056405190664746134
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03601951697280678
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03601951697280678
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.04757609861819433
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.04757609861819433
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05650249608530642
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05650249608530642
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.05005886014366939
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.05005886014366939
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015594437236270214
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.015594437236270214
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0024833328621297794
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0024833328621297794
53
+ wiki_lingua_en,5,average,multiple,0.034705790319729486
4b284b28bc4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.2871925225988394, "bleu_stderr": 0.02880794237734816, "rouge1_fmeasure": 0.10707093959955763, "rouge1_fmeasure_stderr": 0.0019099253416430774, "rouge1_precision": 0.07031750338322859, "rouge1_precision_stderr": 0.0015362201736309874, "rouge1_recall": 0.3012155634284117, "rouge1_recall_stderr": 0.004546846231718025, "rouge2_fmeasure": 0.049917192299013896, "rouge2_fmeasure_stderr": 0.0012014538250113653, "rouge2_precision": 0.03252619427180376, "rouge2_precision_stderr": 0.0009041180535715348, "rouge2_recall": 0.14439430798437106, "rouge2_recall_stderr": 0.0030595449553106713, "rougeL_fmeasure": 0.10327080072990603, "rougeL_fmeasure_stderr": 0.0017814345648663893, "rougeL_precision": 0.06757287768017098, "rougeL_precision_stderr": 0.001414062975537199, "rougeL_recall": 0.2931917692240097, "rougeL_recall_stderr": 0.0044525581749204875, "rougeLsum_fmeasure": 0.10238598559666155, "rougeLsum_fmeasure_stderr": 0.001793711398293261, "rougeLsum_precision": 0.06720725044881726, "rougeLsum_precision_stderr": 0.0014407633141713585, "rougeLsum_recall": 0.2882066628071488, "rougeLsum_recall_stderr": 0.0042625595600572705}}, "1": {"PALM_prompt": {"bleu": 0.4674006237665374, "bleu_stderr": 0.03675015156688127, "rouge1_fmeasure": 0.11843411039548267, "rouge1_fmeasure_stderr": 0.001895876971489225, "rouge1_precision": 0.07708396436923028, "rouge1_precision_stderr": 0.001529013971135644, "rouge1_recall": 0.36235242098066, "rouge1_recall_stderr": 0.00504305962540817, "rouge2_fmeasure": 0.05553061893758205, "rouge2_fmeasure_stderr": 0.0012200850248834274, "rouge2_precision": 0.035725346847754684, "rouge2_precision_stderr": 0.0008827277116185362, "rouge2_recall": 0.17866943863024684, "rouge2_recall_stderr": 0.003599106083841207, "rougeL_fmeasure": 0.11228525846628372, "rougeL_fmeasure_stderr": 0.0017420331023114827, "rougeL_precision": 0.07294152863590639, "rougeL_precision_stderr": 0.0014064426998360536, "rougeL_recall": 0.3437168036630356, "rougeL_recall_stderr": 0.004711682138202734, "rougeLsum_fmeasure": 0.11267135213673385, "rougeLsum_fmeasure_stderr": 0.0017806522868172628, "rougeLsum_precision": 0.07337404630465882, "rougeLsum_precision_stderr": 0.0014496612765965426, "rougeLsum_recall": 0.3436003452114997, "rougeLsum_recall_stderr": 0.004616609203187745}}, "2": {"PALM_prompt": {"bleu": 0.5250778439407279, "bleu_stderr": 0.025625496064299234, "rouge1_fmeasure": 0.1222441134481138, "rouge1_fmeasure_stderr": 0.0019020155784340502, "rouge1_precision": 0.0786831611862841, "rouge1_precision_stderr": 0.0014969718758498687, "rouge1_recall": 0.3956998500897082, "rouge1_recall_stderr": 0.005131459404981971, "rouge2_fmeasure": 0.057331612844470456, "rouge2_fmeasure_stderr": 0.001226447389142751, "rouge2_precision": 0.03682232020364324, "rouge2_precision_stderr": 0.0009265440871287532, "rouge2_recall": 0.19779728011829195, "rouge2_recall_stderr": 0.0038163400420330356, "rougeL_fmeasure": 0.11353522248821303, "rougeL_fmeasure_stderr": 0.001703018837429438, "rougeL_precision": 0.07302274174131225, "rougeL_precision_stderr": 0.001338647584797816, "rougeL_recall": 0.36618568072469376, "rougeL_recall_stderr": 0.004622159858156629, "rougeLsum_fmeasure": 0.11628410210009521, "rougeLsum_fmeasure_stderr": 0.0017922853862293046, "rougeLsum_precision": 0.07488527729009754, "rougeLsum_precision_stderr": 0.001411146414679001, "rougeLsum_recall": 0.3754608054763451, "rougeLsum_recall_stderr": 0.004798151749531035}}, "3": {"PALM_prompt": {"bleu": 0.6240971401779115, "bleu_stderr": 0.03840020245332954, "rouge1_fmeasure": 0.12413425365513392, "rouge1_fmeasure_stderr": 0.0018801730074252724, "rouge1_precision": 0.07946921950052364, "rouge1_precision_stderr": 0.0015734983149428565, "rouge1_recall": 0.41443709705557313, "rouge1_recall_stderr": 0.0052063482293464285, "rouge2_fmeasure": 0.05836966723015618, "rouge2_fmeasure_stderr": 0.0012404189838209753, "rouge2_precision": 0.03727374629297188, "rouge2_precision_stderr": 0.0010068745541143666, "rouge2_recall": 0.20933728572588564, "rouge2_recall_stderr": 0.0039729036741121115, "rougeL_fmeasure": 0.11463438747216832, "rougeL_fmeasure_stderr": 0.0016925693943895671, "rougeL_precision": 0.07340984582992345, "rougeL_precision_stderr": 0.0014110667213403662, "rougeL_recall": 0.3803567082991153, "rougeL_recall_stderr": 0.004624202286888448, "rougeLsum_fmeasure": 0.11802649327609507, "rougeLsum_fmeasure_stderr": 0.0017907209912884955, "rougeLsum_precision": 0.07570628299965847, "rougeLsum_precision_stderr": 0.0015124490142500056, "rougeLsum_recall": 0.39213432148715394, "rougeLsum_recall_stderr": 0.004828378800565317}}, "4": {"PALM_prompt": {"bleu": 0.6460958847523566, "bleu_stderr": 0.03922360003785139, "rouge1_fmeasure": 0.12269659017119199, "rouge1_fmeasure_stderr": 0.0017597424238451932, "rouge1_precision": 0.07760008602513783, "rouge1_precision_stderr": 0.001320690166501572, "rouge1_recall": 0.4207626130570328, "rouge1_recall_stderr": 0.0051343858175420766, "rouge2_fmeasure": 0.0577700863367864, "rouge2_fmeasure_stderr": 0.001158809400479912, "rouge2_precision": 0.03635274690781486, "rouge2_precision_stderr": 0.0008330185169416092, "rouge2_recall": 0.2129557453468375, "rouge2_recall_stderr": 0.003851289229933982, "rougeL_fmeasure": 0.11231459598044416, "rougeL_fmeasure_stderr": 0.0015694568258752962, "rougeL_precision": 0.07107029230772488, "rougeL_precision_stderr": 0.001171387599016451, "rougeL_recall": 0.38200909854824977, "rougeL_recall_stderr": 0.004464667845112728, "rougeLsum_fmeasure": 0.11625392090888889, "rougeLsum_fmeasure_stderr": 0.0016649015391818642, "rougeLsum_precision": 0.07356474812940257, "rougeLsum_precision_stderr": 0.0012438027129333135, "rougeLsum_recall": 0.39729543398826606, "rougeLsum_recall_stderr": 0.004755911479219925}}, "5": {"PALM_prompt": {"bleu": 0.7283147266727299, "bleu_stderr": 0.03506630990313516, "rouge1_fmeasure": 0.12670702220068714, "rouge1_fmeasure_stderr": 0.0018353031821599214, "rouge1_precision": 0.0800529618846568, "rouge1_precision_stderr": 0.0014030415087213262, "rouge1_recall": 0.4360056147654019, "rouge1_recall_stderr": 0.005237161872126708, "rouge2_fmeasure": 0.05951196634046783, "rouge2_fmeasure_stderr": 0.0011799130523628795, "rouge2_precision": 0.037349148159645094, "rouge2_precision_stderr": 0.000853308549188945, "rouge2_recall": 0.2217598521275178, "rouge2_recall_stderr": 0.003994885901439153, "rougeL_fmeasure": 0.11485528722019342, "rougeL_fmeasure_stderr": 0.001589365738462336, "rougeL_precision": 0.07255538123365188, "rougeL_precision_stderr": 0.0012004887370386942, "rougeL_recall": 0.39357574348180674, "rougeL_recall_stderr": 0.004547413362244599, "rougeLsum_fmeasure": 0.11989099231027077, "rougeLsum_fmeasure_stderr": 0.0017190774588667075, "rougeLsum_precision": 0.07578798351325475, "rougeLsum_precision_stderr": 0.0012955589973183616, "rougeLsum_recall": 0.4104972172441367, "rougeLsum_recall_stderr": 0.0048123434799543785}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.6192080325529026, "bleu_stderr": 0.04050520870343064, "rouge1_fmeasure": 0.17809655126532312, "rouge1_fmeasure_stderr": 0.0018236388559653763, "rouge1_precision": 0.15222794945796614, "rouge1_precision_stderr": 0.001854391491208858, "rouge1_recall": 0.259279440153063, "rouge1_recall_stderr": 0.0027052349535794085, "rouge2_fmeasure": 0.03601951697280678, "rouge2_fmeasure_stderr": 0.0008300101222283796, "rouge2_precision": 0.030497998250497895, "rouge2_precision_stderr": 0.0007413435249853986, "rouge2_recall": 0.05425261740808977, "rouge2_recall_stderr": 0.001399269212332445, "rougeL_fmeasure": 0.1384741786574652, "rougeL_fmeasure_stderr": 0.0012894950527783321, "rougeL_precision": 0.11697977840134756, "rougeL_precision_stderr": 0.0012794261124303204, "rougeL_recall": 0.2062333696623697, "rougeL_recall_stderr": 0.0021597198817625285, "rougeLsum_fmeasure": 0.1628530750095549, "rougeLsum_fmeasure_stderr": 0.0016530415994811475, "rougeLsum_precision": 0.1389877568936686, "rougeLsum_precision_stderr": 0.0016763555392295747, "rougeLsum_recall": 0.23808538118300834, "rougeLsum_recall_stderr": 0.0025032807703488534}}, "1": {"tldr_en": {"bleu": 2.545643626821724, "bleu_stderr": 0.048869510836124584, "rouge1_fmeasure": 0.20484942755932814, "rouge1_fmeasure_stderr": 0.001960405666708866, "rouge1_precision": 0.17909391212556103, "rouge1_precision_stderr": 0.0021684904700656395, "rouge1_recall": 0.2986428547454232, "rouge1_recall_stderr": 0.002901154619840759, "rouge2_fmeasure": 0.04757609861819433, "rouge2_fmeasure_stderr": 0.0009961877982725383, "rouge2_precision": 0.04098672752953724, "rouge2_precision_stderr": 0.0009402949935154375, "rouge2_recall": 0.07250823843480829, "rouge2_recall_stderr": 0.00171407343966866, "rougeL_fmeasure": 0.14525233898326748, "rougeL_fmeasure_stderr": 0.001303267897245817, "rougeL_precision": 0.12586154047693615, "rougeL_precision_stderr": 0.001447455908294373, "rougeL_recall": 0.21783174447787615, "rougeL_recall_stderr": 0.002284390945146576, "rougeLsum_fmeasure": 0.19265194446101522, "rougeLsum_fmeasure_stderr": 0.0018360173432108782, "rougeLsum_precision": 0.16826083836269604, "rougeLsum_precision_stderr": 0.0020324268022024173, "rougeLsum_recall": 0.28157870760475356, "rougeLsum_recall_stderr": 0.002752878166728396}}, "2": {"tldr_en": {"bleu": 3.123719853905042, "bleu_stderr": 0.09435758546693512, "rouge1_fmeasure": 0.21836409386191818, "rouge1_fmeasure_stderr": 0.001989518194370473, "rouge1_precision": 0.2332474242904849, "rouge1_precision_stderr": 0.0031768172038711467, "rouge1_recall": 0.29410268890498986, "rouge1_recall_stderr": 0.002958828414438297, "rouge2_fmeasure": 0.05650249608530642, "rouge2_fmeasure_stderr": 0.0011487340298214372, "rouge2_precision": 0.06566711930000235, "rouge2_precision_stderr": 0.0019413943466877388, "rouge2_recall": 0.07670871046547496, "rouge2_recall_stderr": 0.0016787739182338268, "rougeL_fmeasure": 0.1603606067900483, "rougeL_fmeasure_stderr": 0.0014157142739287894, "rougeL_precision": 0.1752085713341631, "rougeL_precision_stderr": 0.0026584779502129303, "rougeL_recall": 0.21819240455646285, "rougeL_recall_stderr": 0.002276168106614223, "rougeLsum_fmeasure": 0.20617794726883687, "rougeLsum_fmeasure_stderr": 0.0018867289872942854, "rougeLsum_precision": 0.22112816157170134, "rougeLsum_precision_stderr": 0.0030769042746772593, "rougeLsum_recall": 0.2776979400072693, "rougeLsum_recall_stderr": 0.002813335835445409}}, "3": {"tldr_en": {"bleu": 3.5006949715383118, "bleu_stderr": 0.08071997944617754, "rouge1_fmeasure": 0.1819863863181494, "rouge1_fmeasure_stderr": 0.0024282963824572346, "rouge1_precision": 0.22344987630747953, "rouge1_precision_stderr": 0.0038946320752308397, "rouge1_recall": 0.228687908011972, "rouge1_recall_stderr": 0.0033634865916312945, "rouge2_fmeasure": 0.05005886014366939, "rouge2_fmeasure_stderr": 0.0012507484027525577, "rouge2_precision": 0.06821362116644611, "rouge2_precision_stderr": 0.0023091245896002433, "rouge2_recall": 0.06243742554002245, "rouge2_recall_stderr": 0.001665626169617889, "rougeL_fmeasure": 0.13680266609806765, "rougeL_fmeasure_stderr": 0.001821525871477116, "rougeL_precision": 0.17380086539073145, "rougeL_precision_stderr": 0.003294779021368952, "rougeL_recall": 0.1728818390227638, "rougeL_recall_stderr": 0.0026287282834976822, "rougeLsum_fmeasure": 0.17195845477450572, "rougeLsum_fmeasure_stderr": 0.0023005495305984053, "rougeLsum_precision": 0.21200019749157653, "rougeLsum_precision_stderr": 0.003735810714751958, "rougeLsum_recall": 0.21587781354640423, "rougeLsum_recall_stderr": 0.003186682013692883}}, "4": {"tldr_en": {"bleu": 0.4203638205606742, "bleu_stderr": 0.04062156232814086, "rouge1_fmeasure": 0.05800966504497334, "rouge1_fmeasure_stderr": 0.002055695120200674, "rouge1_precision": 0.06907034391802865, "rouge1_precision_stderr": 0.002842925157819051, "rouge1_recall": 0.07641790711530233, "rouge1_recall_stderr": 0.0028071608967584296, "rouge2_fmeasure": 0.015594437236270214, "rouge2_fmeasure_stderr": 0.0008033551932234715, "rouge2_precision": 0.020323118606363738, "rouge2_precision_stderr": 0.0013765538659170954, "rouge2_recall": 0.020689214217856734, "rouge2_recall_stderr": 0.0011098261782535187, "rougeL_fmeasure": 0.04363217232305618, "rougeL_fmeasure_stderr": 0.0015475509311173715, "rougeL_precision": 0.05326819078634553, "rougeL_precision_stderr": 0.002300594360643888, "rougeL_recall": 0.05823666899257084, "rougeL_recall_stderr": 0.0021806250615139192, "rougeLsum_fmeasure": 0.05441077080277394, "rougeLsum_fmeasure_stderr": 0.0019346325978603714, "rougeLsum_precision": 0.06512227133445396, "rougeLsum_precision_stderr": 0.002711627122094951, "rougeLsum_recall": 0.07184593801381658, "rougeLsum_recall_stderr": 0.0026558284841563095}}, "5": {"tldr_en": {"bleu": 1.610620675193976e-08, "bleu_stderr": 6.336079313722214e-08, "rouge1_fmeasure": 0.008814606254748622, "rouge1_fmeasure_stderr": 0.0008820141996097188, "rouge1_precision": 0.011141473368752484, "rouge1_precision_stderr": 0.0012984382630292146, "rouge1_recall": 0.011356368661606248, "rouge1_recall_stderr": 0.001166446625113568, "rouge2_fmeasure": 0.0024833328621297794, "rouge2_fmeasure_stderr": 0.0003381158590391299, "rouge2_precision": 0.0036979579695363266, "rouge2_precision_stderr": 0.00069734856115032, "rouge2_recall": 0.0030116660387216835, "rouge2_recall_stderr": 0.00039145176963269976, "rougeL_fmeasure": 0.0067361341505456805, "rougeL_fmeasure_stderr": 0.0006778741138953477, "rougeL_precision": 0.008749076310088717, "rougeL_precision_stderr": 0.0010822814361618575, "rougeL_recall": 0.008702418570223684, "rougeL_recall_stderr": 0.0008937526565969341, "rougeLsum_fmeasure": 0.008282398875850052, "rougeLsum_fmeasure_stderr": 0.0008304343325046589, "rougeLsum_precision": 0.010561687151292019, "rougeLsum_precision_stderr": 0.0012526496630397305, "rougeLsum_recall": 0.010693942253000091, "rougeLsum_recall_stderr": 0.001098581608402249}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.8175837539659996, "bleu_stderr": 0.06903661804515053, "rouge1_fmeasure": 0.07913875454413057, "rouge1_fmeasure_stderr": 0.0015242869094641899, "rouge1_precision": 0.067410137720777, "rouge1_precision_stderr": 0.0015267778764287776, "rouge1_recall": 0.11905569111444682, "rouge1_recall_stderr": 0.002272837057404917, "rouge2_fmeasure": 0.010022915068112901, "rouge2_fmeasure_stderr": 0.0005999619167956302, "rouge2_precision": 0.00855571479752271, "rouge2_precision_stderr": 0.000610070681102783, "rouge2_recall": 0.01452701665726237, "rouge2_recall_stderr": 0.0008619364610158222, "rougeL_fmeasure": 0.07774011765647057, "rougeL_fmeasure_stderr": 0.0014739968390031794, "rougeL_precision": 0.06588530481318243, "rougeL_precision_stderr": 0.001456682483397516, "rougeL_recall": 0.11730596366118605, "rougeL_recall_stderr": 0.0022143035280371703, "rougeLsum_fmeasure": 0.06060524038626222, "rougeLsum_fmeasure_stderr": 0.0012344605903209444, "rougeLsum_precision": 0.053281568752607505, "rougeLsum_precision_stderr": 0.0013584259494467741, "rougeLsum_recall": 0.0896663423829268, "rougeLsum_recall_stderr": 0.0017758029746831937}}, "1": {"generate_text_restaurant": {"bleu": 11.360023014480657, "bleu_stderr": 0.14523312599938573, "rouge1_fmeasure": 0.4457059970163705, "rouge1_fmeasure_stderr": 0.0023475038584115206, "rouge1_precision": 0.5368002507342822, "rouge1_precision_stderr": 0.003197714150333534, "rouge1_recall": 0.4204347436029918, "rouge1_recall_stderr": 0.0029932330301144583, "rouge2_fmeasure": 0.20489136085595536, "rouge2_fmeasure_stderr": 0.0019458972766888125, "rouge2_precision": 0.25010730577128015, "rouge2_precision_stderr": 0.0025416736419787912, "rouge2_recall": 0.19320140085672308, "rouge2_recall_stderr": 0.0021059841419488645, "rougeL_fmeasure": 0.321161092281206, "rougeL_fmeasure_stderr": 0.001995161634007105, "rougeL_precision": 0.390181985500488, "rougeL_precision_stderr": 0.002842772643373759, "rougeL_recall": 0.30202346410754144, "rougeL_recall_stderr": 0.002382966736707436, "rougeLsum_fmeasure": 0.3629740875330838, "rougeLsum_fmeasure_stderr": 0.00225666081685001, "rougeLsum_precision": 0.4390382418075218, "rougeLsum_precision_stderr": 0.003082038398447118, "rougeLsum_recall": 0.34172069570320107, "rougeLsum_recall_stderr": 0.0026874775375555484}}, "2": {"generate_text_restaurant": {"bleu": 13.284050272128148, "bleu_stderr": 0.1770460818661209, "rouge1_fmeasure": 0.47571760194079554, "rouge1_fmeasure_stderr": 0.002265550724177173, "rouge1_precision": 0.5714641277328295, "rouge1_precision_stderr": 0.0032799480051059426, "rouge1_recall": 0.4468451656698964, "rouge1_recall_stderr": 0.0029010255062063995, "rouge2_fmeasure": 0.2325284196471626, "rouge2_fmeasure_stderr": 0.0020182406639818696, "rouge2_precision": 0.28407643182119585, "rouge2_precision_stderr": 0.002743346128512875, "rouge2_recall": 0.21834852344926514, "rouge2_recall_stderr": 0.0021773726271045162, "rougeL_fmeasure": 0.3475057918065784, "rougeL_fmeasure_stderr": 0.0020325138450125527, "rougeL_precision": 0.4201198181536731, "rougeL_precision_stderr": 0.002974299270315645, "rougeL_recall": 0.32583054256145155, "rougeL_recall_stderr": 0.002409815129643757, "rougeLsum_fmeasure": 0.391612619922272, "rougeLsum_fmeasure_stderr": 0.002268675240750965, "rougeLsum_precision": 0.4715445246123803, "rougeLsum_precision_stderr": 0.0031935178217488633, "rougeLsum_recall": 0.36737757013379746, "rougeLsum_recall_stderr": 0.002675859339354407}}, "3": {"generate_text_restaurant": {"bleu": 14.26351682307982, "bleu_stderr": 0.16518466746497154, "rouge1_fmeasure": 0.4869136869814224, "rouge1_fmeasure_stderr": 0.002252377471622215, "rouge1_precision": 0.5781766953393467, "rouge1_precision_stderr": 0.003232493897043414, "rouge1_recall": 0.45846733751843627, "rouge1_recall_stderr": 0.002862839752183954, "rouge2_fmeasure": 0.24388713793667496, "rouge2_fmeasure_stderr": 0.002081168246828599, "rouge2_precision": 0.2936437583754312, "rouge2_precision_stderr": 0.002766357156065721, "rouge2_recall": 0.2296605325605603, "rouge2_recall_stderr": 0.002239048819483412, "rougeL_fmeasure": 0.35476450327695513, "rougeL_fmeasure_stderr": 0.002131136688683226, "rougeL_precision": 0.42323041568862063, "rougeL_precision_stderr": 0.00299893377825116, "rougeL_recall": 0.3336509571396467, "rougeL_recall_stderr": 0.002463831973937884, "rougeLsum_fmeasure": 0.40259898572566205, "rougeLsum_fmeasure_stderr": 0.0023428037926685654, "rougeLsum_precision": 0.47879240286661645, "rougeLsum_precision_stderr": 0.0032211942725843233, "rougeLsum_recall": 0.3788603945042402, "rougeLsum_recall_stderr": 0.00272506620717064}}, "4": {"generate_text_restaurant": {"bleu": 14.568456206535862, "bleu_stderr": 0.22714934644025087, "rouge1_fmeasure": 0.49248119559722, "rouge1_fmeasure_stderr": 0.002261462995557444, "rouge1_precision": 0.5824109780846385, "rouge1_precision_stderr": 0.0032128656177573896, "rouge1_recall": 0.46303711017335764, "rouge1_recall_stderr": 0.0028550395782162385, "rouge2_fmeasure": 0.24852828649672148, "rouge2_fmeasure_stderr": 0.0020894492308823178, "rouge2_precision": 0.2975497604665679, "rouge2_precision_stderr": 0.002721358209608908, "rouge2_recall": 0.2337393664335435, "rouge2_recall_stderr": 0.0022535476965624083, "rougeL_fmeasure": 0.35908240651080353, "rougeL_fmeasure_stderr": 0.0021535460141111546, "rougeL_precision": 0.4258973373500384, "rougeL_precision_stderr": 0.0029570811260566745, "rougeL_recall": 0.33738654578439453, "rougeL_recall_stderr": 0.002480938975354226, "rougeLsum_fmeasure": 0.4073362598730636, "rougeLsum_fmeasure_stderr": 0.002350552518906019, "rougeLsum_precision": 0.48186572666282207, "rougeLsum_precision_stderr": 0.0031731113183941597, "rougeLsum_recall": 0.3829876385027774, "rougeLsum_recall_stderr": 0.0027332507970680904}}, "5": {"generate_text_restaurant": {"bleu": 14.41705650902542, "bleu_stderr": 0.25051935957566557, "rouge1_fmeasure": 0.4912358527300658, "rouge1_fmeasure_stderr": 0.002175083911938509, "rouge1_precision": 0.5768002584328835, "rouge1_precision_stderr": 0.003210936196026848, "rouge1_recall": 0.4663927464698937, "rouge1_recall_stderr": 0.002827189059265761, "rouge2_fmeasure": 0.24634621400768708, "rouge2_fmeasure_stderr": 0.002025833005255322, "rouge2_precision": 0.2929623409468226, "rouge2_precision_stderr": 0.0026747147184981385, "rouge2_recall": 0.2340255710785021, "rouge2_recall_stderr": 0.002212699322566578, "rougeL_fmeasure": 0.35597711356903294, "rougeL_fmeasure_stderr": 0.002065825901549598, "rougeL_precision": 0.41839619423807317, "rougeL_precision_stderr": 0.0028660522019280242, "rougeL_recall": 0.3383776885015685, "rougeL_recall_stderr": 0.0024683861897523925, "rougeLsum_fmeasure": 0.4066143622258549, "rougeLsum_fmeasure_stderr": 0.002273545252564866, "rougeLsum_precision": 0.47696170422006223, "rougeLsum_precision_stderr": 0.003098280726576402, "rougeLsum_recall": 0.3864090320075692, "rougeLsum_recall_stderr": 0.0027224907209281047}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.242352554457821, "bleu_stderr": 0.06303290180001715, "rouge1_fmeasure": 0.22142206258867178, "rouge1_fmeasure_stderr": 0.002657386115409225, "rouge1_precision": 0.17908873736938333, "rouge1_precision_stderr": 0.0025998582918028913, "rouge1_recall": 0.3401581574130041, "rouge1_recall_stderr": 0.00466124187806079, "rouge2_fmeasure": 0.0511787638415587, "rouge2_fmeasure_stderr": 0.0017012685833384458, "rouge2_precision": 0.04001978076164408, "rouge2_precision_stderr": 0.00143853172997629, "rouge2_recall": 0.08351989855551167, "rouge2_recall_stderr": 0.002908378017045409, "rougeL_fmeasure": 0.16457376114227693, "rougeL_fmeasure_stderr": 0.002030624196291875, "rougeL_precision": 0.13280826032727655, "rougeL_precision_stderr": 0.0019720134198053826, "rougeL_recall": 0.2547013277658435, "rougeL_recall_stderr": 0.0036980006484927316, "rougeLsum_fmeasure": 0.1721074292499855, "rougeLsum_fmeasure_stderr": 0.002183282479578269, "rougeLsum_precision": 0.13820058290248866, "rougeLsum_precision_stderr": 0.0020236107693674434, "rougeLsum_recall": 0.267820674383363, "rougeLsum_recall_stderr": 0.00406330233863675}}, "1": {"article_DOC_summary": {"bleu": 1.8494036251126653, "bleu_stderr": 0.06618274112679547, "rouge1_fmeasure": 0.19996151191164502, "rouge1_fmeasure_stderr": 0.0024642136191649943, "rouge1_precision": 0.14242240386934715, "rouge1_precision_stderr": 0.0018474131654211898, "rouge1_recall": 0.34972417416826374, "rouge1_recall_stderr": 0.00425304183688617, "rouge2_fmeasure": 0.04515071736102295, "rouge2_fmeasure_stderr": 0.0014958437813087835, "rouge2_precision": 0.03180784324852304, "rouge2_precision_stderr": 0.0010605401760687887, "rouge2_recall": 0.08122973807701163, "rouge2_recall_stderr": 0.0027530425274732025, "rougeL_fmeasure": 0.15185456372717573, "rougeL_fmeasure_stderr": 0.0018177177985044526, "rougeL_precision": 0.10785508103994479, "rougeL_precision_stderr": 0.001342675196432051, "rougeL_recall": 0.26778299938242467, "rougeL_recall_stderr": 0.0033228224655886393, "rougeLsum_fmeasure": 0.16002746501050483, "rougeLsum_fmeasure_stderr": 0.002040753526176795, "rougeLsum_precision": 0.11366701902108305, "rougeLsum_precision_stderr": 0.0014987342312601044, "rougeLsum_recall": 0.28182954712893066, "rougeLsum_recall_stderr": 0.0036825745017303583}}, "2": {"article_DOC_summary": {"bleu": 1.9553979785993556, "bleu_stderr": 0.08114119801334493, "rouge1_fmeasure": 0.20066139374359412, "rouge1_fmeasure_stderr": 0.002583892537384957, "rouge1_precision": 0.14299555153351842, "rouge1_precision_stderr": 0.0019414837646538163, "rouge1_recall": 0.35054680023331225, "rouge1_recall_stderr": 0.004405959066068025, "rouge2_fmeasure": 0.047730927310845786, "rouge2_fmeasure_stderr": 0.0016012182287873963, "rouge2_precision": 0.03372540037348309, "rouge2_precision_stderr": 0.0011433968826317544, "rouge2_recall": 0.08536452458478411, "rouge2_recall_stderr": 0.002896468085770253, "rougeL_fmeasure": 0.15423466735433622, "rougeL_fmeasure_stderr": 0.0019141237080501433, "rougeL_precision": 0.1096711314096099, "rougeL_precision_stderr": 0.0014240979490601532, "rougeL_recall": 0.2713048964277041, "rougeL_recall_stderr": 0.003435168695225616, "rougeLsum_fmeasure": 0.16073844532804232, "rougeLsum_fmeasure_stderr": 0.0021770573542344426, "rougeLsum_precision": 0.11432354351222646, "rougeLsum_precision_stderr": 0.0016135722781519138, "rougeLsum_recall": 0.282399261929163, "rougeLsum_recall_stderr": 0.0038458320187164767}}, "3": {"article_DOC_summary": {"bleu": 1.9899134724216982, "bleu_stderr": 0.08189694158167818, "rouge1_fmeasure": 0.19424816162568462, "rouge1_fmeasure_stderr": 0.002758326610864678, "rouge1_precision": 0.14095312565957024, "rouge1_precision_stderr": 0.0021252189798371952, "rouge1_recall": 0.3355112558235932, "rouge1_recall_stderr": 0.00482038827448099, "rouge2_fmeasure": 0.04656621187743751, "rouge2_fmeasure_stderr": 0.001620760377451161, "rouge2_precision": 0.03309503575773881, "rouge2_precision_stderr": 0.0011583105646001772, "rouge2_recall": 0.08361606756802129, "rouge2_recall_stderr": 0.0030274265177690917, "rougeL_fmeasure": 0.14844786782549954, "rougeL_fmeasure_stderr": 0.002106590246901777, "rougeL_precision": 0.10750740692234483, "rougeL_precision_stderr": 0.001617470504570499, "rougeL_recall": 0.25822344513710505, "rougeL_recall_stderr": 0.0038342113677037708, "rougeLsum_fmeasure": 0.15564298400864224, "rougeLsum_fmeasure_stderr": 0.002325750760252049, "rougeLsum_precision": 0.11272654966890937, "rougeLsum_precision_stderr": 0.0017739663038892394, "rougeLsum_recall": 0.2704314687293562, "rougeLsum_recall_stderr": 0.0041899251242752955}}, "4": {"article_DOC_summary": {"bleu": 0.851414956048288, "bleu_stderr": 0.10142038717157466, "rouge1_fmeasure": 0.05061277872693584, "rouge1_fmeasure_stderr": 0.0028337392524855837, "rouge1_precision": 0.04182867272383347, "rouge1_precision_stderr": 0.002516912133041032, "rouge1_recall": 0.08068317129523372, "rouge1_recall_stderr": 0.004630110172525636, "rouge2_fmeasure": 0.010817994039374855, "rouge2_fmeasure_stderr": 0.0009898135477985153, "rouge2_precision": 0.00821242826897234, "rouge2_precision_stderr": 0.000780085704496626, "rouge2_recall": 0.018229654639556718, "rouge2_recall_stderr": 0.0016649615737807282, "rougeL_fmeasure": 0.038452182498697945, "rougeL_fmeasure_stderr": 0.002124389719631779, "rougeL_precision": 0.03237406484505732, "rougeL_precision_stderr": 0.0020316037823294927, "rougeL_recall": 0.06169464366711305, "rougeL_recall_stderr": 0.0035477374875163793, "rougeLsum_fmeasure": 0.040956310132902136, "rougeLsum_fmeasure_stderr": 0.0023102690323821627, "rougeLsum_precision": 0.03434532775935572, "rougeLsum_precision_stderr": 0.0021605467450954603, "rougeLsum_recall": 0.06559190633881043, "rougeLsum_recall_stderr": 0.0038270052788935026}}, "5": {"article_DOC_summary": {"bleu": 2.3013943780107486e-40, "bleu_stderr": 6.513754776072693e-35, "rouge1_fmeasure": 0.0019051009413407058, "rouge1_fmeasure_stderr": 0.0005593343846105789, "rouge1_precision": 0.0021529907822185413, "rouge1_precision_stderr": 0.0006324831746531657, "rouge1_recall": 0.001764574432238271, "rouge1_recall_stderr": 0.0005172389485732619, "rouge2_fmeasure": 0.0001299594149643802, "rouge2_fmeasure_stderr": 7.503370260000825e-05, "rouge2_precision": 0.0001457415441877151, "rouge2_precision_stderr": 8.410282821934284e-05, "rouge2_recall": 0.00011802662746058974, "rouge2_recall_stderr": 6.837858900511585e-05, "rougeL_fmeasure": 0.0015504765957310557, "rougeL_fmeasure_stderr": 0.0004402385576724586, "rougeL_precision": 0.0017522974182971845, "rougeL_precision_stderr": 0.000503946684193751, "rougeL_recall": 0.0014399120468157657, "rougeL_recall_stderr": 0.0004056116387189239, "rougeLsum_fmeasure": 0.0015923123485654278, "rougeLsum_fmeasure_stderr": 0.00044995087651658854, "rougeLsum_precision": 0.0018058994766162238, "rougeLsum_precision_stderr": 0.0005178578773672206, "rougeLsum_recall": 0.001474217364139951, "rougeLsum_recall_stderr": 0.0004127036649374616}}}}
4b284b28bc4/evaluation/rankeval/4b284b28bc4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.344,0.015029633724408947,0
3
+ anli_r2,acc,0.321,0.01477082181793464,0
4
+ anli_r3,acc,0.34833333333333333,0.01375943749887408,0
5
+ arc_challenge,acc,0.27986348122866894,0.013119040897725922,0
6
+ arc_challenge,acc_norm,0.31143344709897613,0.013532472099850942,0
7
+ arc_easy,acc,0.5984848484848485,0.010058790020755567,0
8
+ arc_easy,acc_norm,0.5395622895622896,0.01022761638628902,0
9
+ boolq,acc,0.5700305810397553,0.008658853690729254,1
10
+ cb,acc,0.35714285714285715,0.06460957383809221,1
11
+ cb,f1,0.1754385964912281,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4792869946225851,0.004985498055190357,0
14
+ hellaswag,acc_norm,0.6265684126667994,0.004827266662144035,0
15
+ piqa,acc,0.7584330794341676,0.009986718001804467,0
16
+ piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
17
+ rte,acc,0.5342960288808665,0.030025579819366422,0
18
+ sciq,acc,0.848,0.011358918303475274,0
19
+ sciq,acc_norm,0.769,0.013334797216936438,0
20
+ storycloze_2016,acc,0.7231427044361304,0.01034711289027692,0
21
+ winogrande,acc,0.5753749013417522,0.013891893150264213,0
4b284b28bc4/evaluation/rankeval/4b284b28bc4_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.34,0.014987482264363937,0
3
+ anli_r2,acc,0.321,0.014770821817934644,0
4
+ anli_r3,acc,0.34,0.013680495725767803,0
5
+ arc_challenge,acc,0.29266211604095566,0.013295916103619417,0
6
+ arc_challenge,acc_norm,0.32337883959044367,0.013669421630012132,0
7
+ arc_easy,acc,0.6262626262626263,0.009927267058259628,0
8
+ arc_easy,acc_norm,0.5917508417508418,0.01008556619579125,0
9
+ boolq,acc,0.5948012232415902,0.008586427929715515,1
10
+ cb,acc,0.375,0.06527912098338669,1
11
+ cb,f1,0.32099491681373216,,1
12
+ copa,acc,0.77,0.04229525846816506,0
13
+ hellaswag,acc,0.48078072097191793,0.004986093791041653,0
14
+ hellaswag,acc_norm,0.6337382991435969,0.004807975515446487,0
15
+ piqa,acc,0.7622415669205659,0.009932525779525489,0
16
+ piqa,acc_norm,0.763873775843308,0.009908965890558218,0
17
+ rte,acc,0.5740072202166066,0.029764956741777645,0
18
+ sciq,acc,0.904,0.009320454434783227,0
19
+ sciq,acc_norm,0.885,0.01009340759490462,0
20
+ storycloze_2016,acc,0.7204703367183325,0.01037770209970486,0
21
+ winogrande,acc,0.590370955011839,0.013821049109655453,0
4b284b28bc4/evaluation/rankeval/4b284b28bc4_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095526,0
3
+ anli_r2,acc,0.325,0.014818724459095526,0
4
+ anli_r3,acc,0.3233333333333333,0.013508372867300217,0
5
+ arc_challenge,acc,0.30204778156996587,0.01341751914471642,0
6
+ arc_challenge,acc_norm,0.3216723549488055,0.013650488084494162,0
7
+ arc_easy,acc,0.6308922558922558,0.009901987410242742,0
8
+ arc_easy,acc_norm,0.6123737373737373,0.009997307914447612,0
9
+ boolq,acc,0.627217125382263,0.008457255867914694,1
10
+ cb,acc,0.25,0.058387420812114225,1
11
+ cb,f1,0.22987012987012986,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.47988448516231824,0.004985741706385727,0
14
+ hellaswag,acc_norm,0.6363274248157738,0.004800728138792371,0
15
+ piqa,acc,0.7606093579978237,0.009955884250291681,0
16
+ piqa,acc_norm,0.76550598476605,0.009885203143240543,0
17
+ rte,acc,0.5631768953068592,0.02985524739031495,0
18
+ sciq,acc,0.914,0.008870325962594766,0
19
+ sciq,acc_norm,0.883,0.010169287802713329,0
20
+ storycloze_2016,acc,0.7177979690005345,0.010407834479647673,0
21
+ winogrande,acc,0.5824782951854776,0.013859978264440248,0
4b284b28bc4/evaluation/rankeval/4b284b28bc4_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.325,0.014818724459095524,0
3
+ anli_r2,acc,0.336,0.014944140233795021,0
4
+ anli_r3,acc,0.3233333333333333,0.013508372867300212,0
5
+ arc_challenge,acc,0.30119453924914674,0.013406741767847624,0
6
+ arc_challenge,acc_norm,0.32337883959044367,0.01366942163001213,0
7
+ arc_easy,acc,0.6372053872053872,0.009865936757013942,0
8
+ arc_easy,acc_norm,0.6186868686868687,0.009966542497171021,0
9
+ boolq,acc,0.6241590214067279,0.008471147248160107,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.3565868967138097,,1
12
+ copa,acc,0.81,0.03942772444036623,0
13
+ hellaswag,acc,0.4790878311093408,0.004985415250690914,0
14
+ hellaswag,acc_norm,0.634833698466441,0.004804927608773137,0
15
+ piqa,acc,0.7540805223068553,0.01004733186562519,0
16
+ piqa,acc_norm,0.7687704026115343,0.009837063180625334,0
17
+ rte,acc,0.6064981949458483,0.029405839314203194,0
18
+ sciq,acc,0.91,0.00905439020486644,0
19
+ sciq,acc_norm,0.897,0.009616833339695796,0
20
+ storycloze_2016,acc,0.7295563869588455,0.010271810373331027,0
21
+ winogrande,acc,0.585635359116022,0.013844846232268563,0
4b284b28bc4/evaluation/rankeval/4b284b28bc4_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.345,0.015039986742055235,0
3
+ anli_r2,acc,0.325,0.014818724459095526,0
4
+ anli_r3,acc,0.31416666666666665,0.013405399314984096,0
5
+ arc_challenge,acc,0.30204778156996587,0.01341751914471642,0
6
+ arc_challenge,acc_norm,0.32764505119453924,0.013715847940719344,0
7
+ arc_easy,acc,0.6405723905723906,0.009845958893373766,0
8
+ arc_easy,acc_norm,0.6212121212121212,0.00995373765654204,0
9
+ boolq,acc,0.6275229357798165,0.008455846866956085,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.3647495361781076,,1
12
+ copa,acc,0.82,0.038612291966536955,0
13
+ hellaswag,acc,0.4819757020513842,0.004986538243846636,0
14
+ hellaswag,acc_norm,0.6387173869747063,0.004793904922401888,0
15
+ piqa,acc,0.7551686615886833,0.01003230910556879,0
16
+ piqa,acc_norm,0.76550598476605,0.00988520314324054,0
17
+ rte,acc,0.48736462093862815,0.030086851767188564,0
18
+ sciq,acc,0.92,0.008583336977753653,0
19
+ sciq,acc_norm,0.907,0.009188875634996702,0
20
+ storycloze_2016,acc,0.7386424371993586,0.010160471460690485,0
21
+ winogrande,acc,0.5832675611681136,0.013856250072796322,0
4b284b28bc4/evaluation/rankeval/4b284b28bc4_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.332,0.014899597242811475,0
3
+ anli_r2,acc,0.316,0.014709193056057106,0
4
+ anli_r3,acc,0.31666666666666665,0.013434078660827384,0
5
+ arc_challenge,acc,0.30887372013651876,0.013501770929344003,0
6
+ arc_challenge,acc_norm,0.32849829351535836,0.013724978465537377,0
7
+ arc_easy,acc,0.6401515151515151,0.009848484848484846,0
8
+ arc_easy,acc_norm,0.6296296296296297,0.009908978578665755,0
9
+ boolq,acc,0.6275229357798165,0.008455846866956086,1
10
+ cb,acc,0.30357142857142855,0.06199938655510754,1
11
+ cb,f1,0.2503507986266607,,1
12
+ copa,acc,0.8,0.040201512610368445,0
13
+ hellaswag,acc,0.4788886675960964,0.004985331652408345,0
14
+ hellaswag,acc_norm,0.6412069308902609,0.004786660691181937,0
15
+ piqa,acc,0.750272034820457,0.010099232969867486,0
16
+ piqa,acc_norm,0.764961915125136,0.009893146688805312,0
17
+ rte,acc,0.5740072202166066,0.02976495674177765,0
18
+ sciq,acc,0.921,0.008534156773333445,0
19
+ sciq,acc_norm,0.908,0.00914437639315112,0
20
+ storycloze_2016,acc,0.7279529663281668,0.010290888060871242,0
21
+ winogrande,acc,0.5911602209944752,0.013816954295135684,0
4b284b42bc4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.0063826724183375155
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.0063826724183375155
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.2056915755809246
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.2056915755809246
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.22591032128288588
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.22591032128288588
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.23547797340215765
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.23547797340215765
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.23765394178309218
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.23765394178309218
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.2366049201616526
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.2366049201616526
14
+ e2e_nlg_cleaned,5,average,multiple,0.1912869007715084
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.04790575968435739
16
+ gem_xsum,0,median,rouge2_fmeasure,0.04790575968435739
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04413377405232099
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04413377405232099
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.046170973346933354
20
+ gem_xsum,2,median,rouge2_fmeasure,0.046170973346933354
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04715930396420784
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04715930396420784
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.012335864733508397
24
+ gem_xsum,4,median,rouge2_fmeasure,0.012335864733508397
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0004337191943913522
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0004337191943913522
27
+ gem_xsum,5,average,multiple,0.03302323249595322
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05344453588119793
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05344453588119793
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.0604895960614538
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.0604895960614538
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.06086364336249341
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.06086364336249341
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.06172653863702163
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.06172653863702163
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.061883789388597316
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.061883789388597316
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.06197974009288303
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.06197974009288303
40
+ web_nlg_en,5,average,multiple,0.060064640570607855
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03327297097578151
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03327297097578151
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.056766090400891124
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.056766090400891124
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.057748452491246806
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.057748452491246806
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04689779702656875
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04689779702656875
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.013741746630537094
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.013741746630537094
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.0023512305693387013
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.0023512305693387013
53
+ wiki_lingua_en,5,average,multiple,0.035129714682393995
4b284b42bc4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.4273206525263921, "bleu_stderr": 0.05364575256139351, "rouge1_fmeasure": 0.11230037179769856, "rouge1_fmeasure_stderr": 0.0021352190368007454, "rouge1_precision": 0.07516332695488044, "rouge1_precision_stderr": 0.0017494053619516534, "rouge1_recall": 0.3000002487080154, "rouge1_recall_stderr": 0.004693559983294075, "rouge2_fmeasure": 0.05344453588119793, "rouge2_fmeasure_stderr": 0.0013515518560690634, "rouge2_precision": 0.03590952594711019, "rouge2_precision_stderr": 0.00111479086151588, "rouge2_recall": 0.14620648751654972, "rouge2_recall_stderr": 0.003234756290211622, "rougeL_fmeasure": 0.10771019147913463, "rougeL_fmeasure_stderr": 0.0019671591552995436, "rougeL_precision": 0.07168459834878929, "rougeL_precision_stderr": 0.0015668817269601622, "rougeL_recall": 0.29090649446634975, "rougeL_recall_stderr": 0.004555120095447256, "rougeLsum_fmeasure": 0.10717155022915739, "rougeLsum_fmeasure_stderr": 0.0019927955387855968, "rougeLsum_precision": 0.07152558346509422, "rougeLsum_precision_stderr": 0.001609325107400611, "rougeLsum_recall": 0.28799432197952096, "rougeLsum_recall_stderr": 0.004451937577964067}}, "1": {"PALM_prompt": {"bleu": 0.5663724921835591, "bleu_stderr": 0.03793270967595185, "rouge1_fmeasure": 0.12756709737220784, "rouge1_fmeasure_stderr": 0.002030468381881967, "rouge1_precision": 0.08217512985928499, "rouge1_precision_stderr": 0.0015069260165856669, "rouge1_recall": 0.3920117360421087, "rouge1_recall_stderr": 0.005374085547490371, "rouge2_fmeasure": 0.0604895960614538, "rouge2_fmeasure_stderr": 0.0013006193696080512, "rouge2_precision": 0.03886435424085205, "rouge2_precision_stderr": 0.0009357648884166175, "rouge2_recall": 0.1952944224324245, "rouge2_recall_stderr": 0.003818475633981706, "rougeL_fmeasure": 0.12035361435811785, "rougeL_fmeasure_stderr": 0.00181560137606144, "rougeL_precision": 0.07729829270693146, "rougeL_precision_stderr": 0.0013303444161253287, "rougeL_recall": 0.3723170212800364, "rougeL_recall_stderr": 0.005024550440352146, "rougeLsum_fmeasure": 0.1206781467869011, "rougeLsum_fmeasure_stderr": 0.0018904175388126346, "rougeLsum_precision": 0.07779518642109406, "rougeLsum_precision_stderr": 0.0014090714637609509, "rougeLsum_recall": 0.3702218835789373, "rougeLsum_recall_stderr": 0.004905029541255394}}, "2": {"PALM_prompt": {"bleu": 0.6313130510239234, "bleu_stderr": 0.02727704631142144, "rouge1_fmeasure": 0.12940251519432414, "rouge1_fmeasure_stderr": 0.0018563230360622849, "rouge1_precision": 0.08219107059651697, "rouge1_precision_stderr": 0.0013422258523153536, "rouge1_recall": 0.4151352004015792, "rouge1_recall_stderr": 0.005332508949374494, "rouge2_fmeasure": 0.06086364336249341, "rouge2_fmeasure_stderr": 0.0011655118907789416, "rouge2_precision": 0.03845495568474405, "rouge2_precision_stderr": 0.0008134611533833349, "rouge2_recall": 0.20877882384960775, "rouge2_recall_stderr": 0.003907086651198888, "rougeL_fmeasure": 0.12014889628673367, "rougeL_fmeasure_stderr": 0.0016294871735385334, "rougeL_precision": 0.07626492600480415, "rougeL_precision_stderr": 0.0011790764283155961, "rougeL_recall": 0.38715675983971815, "rougeL_recall_stderr": 0.004836346504741043, "rougeLsum_fmeasure": 0.12261763705509457, "rougeLsum_fmeasure_stderr": 0.0017345722248776798, "rougeLsum_precision": 0.07790995067901116, "rougeLsum_precision_stderr": 0.0012592690527861467, "rougeLsum_recall": 0.3934974316637387, "rougeLsum_recall_stderr": 0.004930425413445172}}, "3": {"PALM_prompt": {"bleu": 0.6636681020720647, "bleu_stderr": 0.03135011211987113, "rouge1_fmeasure": 0.13179598729950448, "rouge1_fmeasure_stderr": 0.0018608653117238654, "rouge1_precision": 0.08335165133186997, "rouge1_precision_stderr": 0.0013566594758271408, "rouge1_recall": 0.4363611958728497, "rouge1_recall_stderr": 0.005466144892220358, "rouge2_fmeasure": 0.06172653863702163, "rouge2_fmeasure_stderr": 0.0011849569250187196, "rouge2_precision": 0.038821951607757095, "rouge2_precision_stderr": 0.0008309945118711307, "rouge2_recall": 0.21925719264824975, "rouge2_recall_stderr": 0.003975211126463013, "rougeL_fmeasure": 0.1203852422828686, "rougeL_fmeasure_stderr": 0.0015949578691764172, "rougeL_precision": 0.07607458853736868, "rougeL_precision_stderr": 0.0011617831771301823, "rougeL_recall": 0.3989579329521947, "rougeL_recall_stderr": 0.004797365715247612, "rougeLsum_fmeasure": 0.12430003630857168, "rougeLsum_fmeasure_stderr": 0.0017268968711559307, "rougeLsum_precision": 0.07866244698502353, "rougeLsum_precision_stderr": 0.0012631336576740743, "rougeLsum_recall": 0.41007264952780736, "rougeLsum_recall_stderr": 0.004984839846868383}}, "4": {"PALM_prompt": {"bleu": 0.7445914925255956, "bleu_stderr": 0.04471373508927592, "rouge1_fmeasure": 0.13191715691415712, "rouge1_fmeasure_stderr": 0.001848451971487058, "rouge1_precision": 0.08335134599012016, "rouge1_precision_stderr": 0.0013601845542222193, "rouge1_recall": 0.4369343435538318, "rouge1_recall_stderr": 0.00545111390697167, "rouge2_fmeasure": 0.061883789388597316, "rouge2_fmeasure_stderr": 0.0011527436830992247, "rouge2_precision": 0.03875848226958462, "rouge2_precision_stderr": 0.0008040225568103941, "rouge2_recall": 0.22226421540491542, "rouge2_recall_stderr": 0.004032909658463521, "rougeL_fmeasure": 0.11940663728897792, "rougeL_fmeasure_stderr": 0.0015860574558582763, "rougeL_precision": 0.0754476375575414, "rougeL_precision_stderr": 0.001174153361661276, "rougeL_recall": 0.3964743307100715, "rougeL_recall_stderr": 0.004828941243739242, "rougeLsum_fmeasure": 0.12423777278329896, "rougeLsum_fmeasure_stderr": 0.0017238674224559544, "rougeLsum_precision": 0.07858636401791905, "rougeLsum_precision_stderr": 0.0012777097204336968, "rougeLsum_recall": 0.41038517941804836, "rougeLsum_recall_stderr": 0.005000143163857465}}, "5": {"PALM_prompt": {"bleu": 0.8005310739494581, "bleu_stderr": 0.033724293174082695, "rouge1_fmeasure": 0.13259315662099147, "rouge1_fmeasure_stderr": 0.0017356987872365276, "rouge1_precision": 0.08338013300594908, "rouge1_precision_stderr": 0.001260130864731573, "rouge1_recall": 0.44613389749037924, "rouge1_recall_stderr": 0.0053462691621041685, "rouge2_fmeasure": 0.06197974009288303, "rouge2_fmeasure_stderr": 0.001091686612001906, "rouge2_precision": 0.03871058510533021, "rouge2_precision_stderr": 0.0007593773025926396, "rouge2_recall": 0.22607554732619956, "rouge2_recall_stderr": 0.004027973657557073, "rougeL_fmeasure": 0.11956303513601427, "rougeL_fmeasure_stderr": 0.001512266881380029, "rougeL_precision": 0.07523239671078998, "rougeL_precision_stderr": 0.0011063220739786807, "rougeL_recall": 0.40282994585844645, "rougeL_recall_stderr": 0.0046937441735825924, "rougeLsum_fmeasure": 0.12469867862475097, "rougeLsum_fmeasure_stderr": 0.0016220089487202947, "rougeLsum_precision": 0.07851137617969797, "rougeLsum_precision_stderr": 0.001188072848467214, "rougeLsum_recall": 0.41817890606333163, "rougeLsum_recall_stderr": 0.004881701441444398}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.414984862410896, "bleu_stderr": 0.05940234677879655, "rouge1_fmeasure": 0.172029086258773, "rouge1_fmeasure_stderr": 0.0018269882294786313, "rouge1_precision": 0.14692825480864516, "rouge1_precision_stderr": 0.0018425931958625118, "rouge1_recall": 0.2512970070836738, "rouge1_recall_stderr": 0.002648136861575683, "rouge2_fmeasure": 0.03327297097578151, "rouge2_fmeasure_stderr": 0.0008291280040639999, "rouge2_precision": 0.027966624066931015, "rouge2_precision_stderr": 0.0007198127679157396, "rouge2_recall": 0.050572163826476904, "rouge2_recall_stderr": 0.00137135380680017, "rougeL_fmeasure": 0.13597766469494962, "rougeL_fmeasure_stderr": 0.0013133399966305693, "rougeL_precision": 0.11468189898145331, "rougeL_precision_stderr": 0.0012927912945671872, "rougeL_recall": 0.20361426191617446, "rougeL_recall_stderr": 0.002161893564032071, "rougeLsum_fmeasure": 0.15820053127675784, "rougeLsum_fmeasure_stderr": 0.0016635461401018714, "rougeLsum_precision": 0.1348221046642956, "rougeLsum_precision_stderr": 0.0016706708007563362, "rougeLsum_recall": 0.23199038101436165, "rougeLsum_recall_stderr": 0.0024449378448124903}}, "1": {"tldr_en": {"bleu": 2.836407401710481, "bleu_stderr": 0.04043755617370114, "rouge1_fmeasure": 0.2221915237870407, "rouge1_fmeasure_stderr": 0.0019942889407287214, "rouge1_precision": 0.20096070137159358, "rouge1_precision_stderr": 0.0024183404012715405, "rouge1_recall": 0.31553844384377067, "rouge1_recall_stderr": 0.002899492197080727, "rouge2_fmeasure": 0.056766090400891124, "rouge2_fmeasure_stderr": 0.0010805710237233974, "rouge2_precision": 0.05245104495325915, "rouge2_precision_stderr": 0.0012835508781176563, "rouge2_recall": 0.0826631709797398, "rouge2_recall_stderr": 0.0017188300283128646, "rougeL_fmeasure": 0.15912396957362257, "rougeL_fmeasure_stderr": 0.0013601645244304275, "rougeL_precision": 0.14393356016704015, "rougeL_precision_stderr": 0.001799791687854001, "rougeL_recall": 0.23074840768589486, "rougeL_recall_stderr": 0.0022312022411283892, "rougeLsum_fmeasure": 0.20821150035815492, "rougeLsum_fmeasure_stderr": 0.0018771242477053024, "rougeLsum_precision": 0.18819591770649413, "rougeLsum_precision_stderr": 0.002281534419296507, "rougeLsum_recall": 0.2963346702924994, "rougeLsum_recall_stderr": 0.0027562668344971878}}, "2": {"tldr_en": {"bleu": 3.0672692499706633, "bleu_stderr": 0.049106354646547744, "rouge1_fmeasure": 0.22254451379263313, "rouge1_fmeasure_stderr": 0.001915395486050023, "rouge1_precision": 0.20680907589782577, "rouge1_precision_stderr": 0.002488502986523018, "rouge1_recall": 0.30989444650796616, "rouge1_recall_stderr": 0.002753786143548591, "rouge2_fmeasure": 0.057748452491246806, "rouge2_fmeasure_stderr": 0.001071605656207478, "rouge2_precision": 0.05499744970071221, "rouge2_precision_stderr": 0.001313828428933835, "rouge2_recall": 0.08212190745009494, "rouge2_recall_stderr": 0.0016893036964833342, "rougeL_fmeasure": 0.15964814622161871, "rougeL_fmeasure_stderr": 0.0013347717397906025, "rougeL_precision": 0.14843146397136125, "rougeL_precision_stderr": 0.0018718282405222096, "rougeL_recall": 0.22714968032490238, "rougeL_recall_stderr": 0.0022095657686829933, "rougeLsum_fmeasure": 0.20960062484582434, "rougeLsum_fmeasure_stderr": 0.0018046035863626521, "rougeLsum_precision": 0.19471899528471098, "rougeLsum_precision_stderr": 0.002355009451192118, "rougeLsum_recall": 0.292336561157753, "rougeLsum_recall_stderr": 0.0026262120649372398}}, "3": {"tldr_en": {"bleu": 3.0771001660724235, "bleu_stderr": 0.10166627860233955, "rouge1_fmeasure": 0.1820115847047415, "rouge1_fmeasure_stderr": 0.002231556986150065, "rouge1_precision": 0.17474019948410954, "rouge1_precision_stderr": 0.0027474429834987804, "rouge1_recall": 0.25521371457329683, "rouge1_recall_stderr": 0.0033152648455079072, "rouge2_fmeasure": 0.04689779702656875, "rouge2_fmeasure_stderr": 0.001034952296288115, "rouge2_precision": 0.045362687902130126, "rouge2_precision_stderr": 0.0012564421439368422, "rouge2_recall": 0.06804888392817302, "rouge2_recall_stderr": 0.0016992089106080323, "rougeL_fmeasure": 0.13086000378258256, "rougeL_fmeasure_stderr": 0.0015951624748139174, "rougeL_precision": 0.1265226291981258, "rougeL_precision_stderr": 0.002108936479136444, "rougeL_recall": 0.18709591094229805, "rougeL_recall_stderr": 0.002591170589040495, "rougeLsum_fmeasure": 0.17171382413552935, "rougeLsum_fmeasure_stderr": 0.0021029154857551075, "rougeLsum_precision": 0.1650854923795236, "rougeLsum_precision_stderr": 0.0026195222440337307, "rougeLsum_recall": 0.24111486429370707, "rougeLsum_recall_stderr": 0.003150424029915772}}, "4": {"tldr_en": {"bleu": 0.6074700254086013, "bleu_stderr": 0.05423743996990111, "rouge1_fmeasure": 0.05660115935642753, "rouge1_fmeasure_stderr": 0.0019128121580337133, "rouge1_precision": 0.05686654081896349, "rouge1_precision_stderr": 0.002233440497229162, "rouge1_recall": 0.08353923265042008, "rouge1_recall_stderr": 0.0029077074042461065, "rouge2_fmeasure": 0.013741746630537094, "rouge2_fmeasure_stderr": 0.0006742800088300104, "rouge2_precision": 0.012648534117560613, "rouge2_precision_stderr": 0.000714480304383431, "rouge2_recall": 0.02199374005627044, "rouge2_recall_stderr": 0.0012054249033507543, "rougeL_fmeasure": 0.04194122086488744, "rougeL_fmeasure_stderr": 0.0014016615434499398, "rougeL_precision": 0.04243723633575883, "rougeL_precision_stderr": 0.0017020140807939923, "rougeL_recall": 0.0631413785577323, "rougeL_recall_stderr": 0.002247650978929538, "rougeLsum_fmeasure": 0.05334942672891278, "rougeLsum_fmeasure_stderr": 0.0018012620317959791, "rougeLsum_precision": 0.05323619772247561, "rougeLsum_precision_stderr": 0.0020619540523883185, "rougeLsum_recall": 0.07914684184702428, "rougeLsum_recall_stderr": 0.0027725917227984414}}, "5": {"tldr_en": {"bleu": 1.0033020706801269e-06, "bleu_stderr": 1.9077024795694913e-06, "rouge1_fmeasure": 0.009252067005496314, "rouge1_fmeasure_stderr": 0.0008857404763954463, "rouge1_precision": 0.008781294870051274, "rouge1_precision_stderr": 0.0009050499890841822, "rouge1_recall": 0.013677582922976033, "rouge1_recall_stderr": 0.0013193791852402054, "rouge2_fmeasure": 0.0023512305693387013, "rouge2_fmeasure_stderr": 0.0002931024418143357, "rouge2_precision": 0.002216585419720102, "rouge2_precision_stderr": 0.0003047584752236144, "rouge2_recall": 0.0035060227812251585, "rouge2_recall_stderr": 0.0004475043348074056, "rougeL_fmeasure": 0.007007677157867705, "rougeL_fmeasure_stderr": 0.0006708462523675694, "rougeL_precision": 0.00679769925058833, "rougeL_precision_stderr": 0.000718459101307871, "rougeL_recall": 0.010367182111106583, "rougeL_recall_stderr": 0.0010118399277303067, "rougeLsum_fmeasure": 0.008671543071872297, "rougeLsum_fmeasure_stderr": 0.0008282868408056053, "rougeLsum_precision": 0.0082009891177148, "rougeLsum_precision_stderr": 0.0008441371445186326, "rougeLsum_recall": 0.012916195361676434, "rougeLsum_recall_stderr": 0.0012503160451136067}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.5451634050753023, "bleu_stderr": 0.03476122561593657, "rouge1_fmeasure": 0.04310290552413315, "rouge1_fmeasure_stderr": 0.0013354170612674893, "rouge1_precision": 0.046590041682019065, "rouge1_precision_stderr": 0.0018113877733116784, "rouge1_recall": 0.05509845941495712, "rouge1_recall_stderr": 0.0017154015403554534, "rouge2_fmeasure": 0.0063826724183375155, "rouge2_fmeasure_stderr": 0.0004428905323251431, "rouge2_precision": 0.0061050451406738674, "rouge2_precision_stderr": 0.0004913565515769247, "rouge2_recall": 0.009015504658999519, "rouge2_recall_stderr": 0.0006494884515366231, "rougeL_fmeasure": 0.041469788976882346, "rougeL_fmeasure_stderr": 0.0012468639228984321, "rougeL_precision": 0.04442283664669869, "rougeL_precision_stderr": 0.0016857127584432392, "rougeL_recall": 0.053364978373077035, "rougeL_recall_stderr": 0.0016220526842467798, "rougeLsum_fmeasure": 0.03867299387707548, "rougeLsum_fmeasure_stderr": 0.0011657405780310776, "rougeLsum_precision": 0.04212798093112617, "rougeLsum_precision_stderr": 0.00163957458202675, "rougeLsum_recall": 0.04928251726552432, "rougeLsum_recall_stderr": 0.0014753429112820023}}, "1": {"generate_text_restaurant": {"bleu": 11.396112958955289, "bleu_stderr": 0.12774001717020914, "rouge1_fmeasure": 0.4479642771715969, "rouge1_fmeasure_stderr": 0.0023079960050490524, "rouge1_precision": 0.5339579313432855, "rouge1_precision_stderr": 0.0031227264254700595, "rouge1_recall": 0.42523020862127436, "rouge1_recall_stderr": 0.002981115684302531, "rouge2_fmeasure": 0.2056915755809246, "rouge2_fmeasure_stderr": 0.0019163170514045066, "rouge2_precision": 0.248732624279182, "rouge2_precision_stderr": 0.002509203740513705, "rouge2_recall": 0.1952491238459977, "rouge2_recall_stderr": 0.0020864674211335237, "rougeL_fmeasure": 0.3204968358680575, "rougeL_fmeasure_stderr": 0.001985520070286022, "rougeL_precision": 0.3850344789575973, "rougeL_precision_stderr": 0.00278033592883375, "rougeL_recall": 0.3033612056952134, "rougeL_recall_stderr": 0.002388739694886294, "rougeLsum_fmeasure": 0.36313254630951347, "rougeLsum_fmeasure_stderr": 0.0022360796971689707, "rougeLsum_precision": 0.4342921156865786, "rougeLsum_precision_stderr": 0.003003575710929277, "rougeLsum_recall": 0.34421205733760346, "rougeLsum_recall_stderr": 0.0026942002219059206}}, "2": {"generate_text_restaurant": {"bleu": 12.976405649449786, "bleu_stderr": 0.20113725542092992, "rouge1_fmeasure": 0.47007019927530463, "rouge1_fmeasure_stderr": 0.0022920087841303556, "rouge1_precision": 0.558994227405181, "rouge1_precision_stderr": 0.0032295866736836163, "rouge1_recall": 0.44495262240037453, "rouge1_recall_stderr": 0.0029490508361979437, "rouge2_fmeasure": 0.22591032128288588, "rouge2_fmeasure_stderr": 0.0020560357969047136, "rouge2_precision": 0.27217728286113113, "rouge2_precision_stderr": 0.0026898706377774815, "rouge2_recall": 0.21411207509510502, "rouge2_recall_stderr": 0.002230097528851014, "rougeL_fmeasure": 0.3425232001087342, "rougeL_fmeasure_stderr": 0.0020806737697165846, "rougeL_precision": 0.40975809640656674, "rougeL_precision_stderr": 0.00294221380121017, "rougeL_recall": 0.3236212819558878, "rougeL_recall_stderr": 0.0024673142471829217, "rougeLsum_fmeasure": 0.3859835303051329, "rougeLsum_fmeasure_stderr": 0.0023029475516390383, "rougeLsum_precision": 0.46033224287141866, "rougeLsum_precision_stderr": 0.0031762512099836643, "rougeLsum_recall": 0.3647714098177991, "rougeLsum_recall_stderr": 0.0027210820647532467}}, "3": {"generate_text_restaurant": {"bleu": 13.841780368990428, "bleu_stderr": 0.16004525418920557, "rouge1_fmeasure": 0.4775062117887383, "rouge1_fmeasure_stderr": 0.0022520658159749783, "rouge1_precision": 0.5672114023663154, "rouge1_precision_stderr": 0.0031836037058964135, "rouge1_recall": 0.4506315681499688, "rouge1_recall_stderr": 0.0028976257859202057, "rouge2_fmeasure": 0.23547797340215765, "rouge2_fmeasure_stderr": 0.002080191282609909, "rouge2_precision": 0.2831489435622744, "rouge2_precision_stderr": 0.0026849363746315716, "rouge2_recall": 0.2224359793863677, "rouge2_recall_stderr": 0.0022585552678834257, "rougeL_fmeasure": 0.3522099816274127, "rougeL_fmeasure_stderr": 0.0021360587201332483, "rougeL_precision": 0.42052513955243626, "rougeL_precision_stderr": 0.0029794181672521524, "rougeL_recall": 0.3316609702212396, "rougeL_recall_stderr": 0.002482260811969057, "rougeLsum_fmeasure": 0.39783266320253785, "rougeLsum_fmeasure_stderr": 0.0023487903214587246, "rougeLsum_precision": 0.4731322251491655, "rougeLsum_precision_stderr": 0.0031830112351273762, "rougeLsum_recall": 0.3751305124701987, "rougeLsum_recall_stderr": 0.0027537781991579034}}, "4": {"generate_text_restaurant": {"bleu": 14.347043577937871, "bleu_stderr": 0.13183200117809515, "rouge1_fmeasure": 0.4791803139648653, "rouge1_fmeasure_stderr": 0.0022359999842246512, "rouge1_precision": 0.5657141271378652, "rouge1_precision_stderr": 0.0032494806378829547, "rouge1_recall": 0.4543139798474618, "rouge1_recall_stderr": 0.002865992018425755, "rouge2_fmeasure": 0.23765394178309218, "rouge2_fmeasure_stderr": 0.0021422406093072697, "rouge2_precision": 0.2840374640470508, "rouge2_precision_stderr": 0.0028091760925762137, "rouge2_recall": 0.22548495306647312, "rouge2_recall_stderr": 0.002307333846171272, "rougeL_fmeasure": 0.3551586826634256, "rougeL_fmeasure_stderr": 0.0021591742079548407, "rougeL_precision": 0.4208287732411973, "rougeL_precision_stderr": 0.003031588432467615, "rougeL_recall": 0.33645734099493246, "rougeL_recall_stderr": 0.0025176571940632108, "rougeLsum_fmeasure": 0.40243539061119715, "rougeLsum_fmeasure_stderr": 0.002367520270615011, "rougeLsum_precision": 0.47480321305856177, "rougeLsum_precision_stderr": 0.003220379872286072, "rougeLsum_recall": 0.38184109614706496, "rougeLsum_recall_stderr": 0.002799589796872638}}, "5": {"generate_text_restaurant": {"bleu": 14.305213942543, "bleu_stderr": 0.1483607819215848, "rouge1_fmeasure": 0.4804680058354203, "rouge1_fmeasure_stderr": 0.0021741397703150594, "rouge1_precision": 0.5597772799865962, "rouge1_precision_stderr": 0.0031779357829373007, "rouge1_recall": 0.45931387100423793, "rouge1_recall_stderr": 0.002804556836326018, "rouge2_fmeasure": 0.2366049201616526, "rouge2_fmeasure_stderr": 0.0020482405764989274, "rouge2_precision": 0.27941286210601324, "rouge2_precision_stderr": 0.002685923767684149, "rouge2_recall": 0.2261024234770814, "rouge2_recall_stderr": 0.002220825905817483, "rougeL_fmeasure": 0.3555529772285316, "rougeL_fmeasure_stderr": 0.0020868456031009723, "rougeL_precision": 0.4153388685887487, "rougeL_precision_stderr": 0.002922026669760074, "rougeL_recall": 0.33991925444128, "rougeL_recall_stderr": 0.0024671212347911007, "rougeLsum_fmeasure": 0.403868227449958, "rougeLsum_fmeasure_stderr": 0.0023025847428200684, "rougeLsum_precision": 0.470187398074112, "rougeLsum_precision_stderr": 0.0031325933820836164, "rougeLsum_recall": 0.3864435015330604, "rougeLsum_recall_stderr": 0.0027449279832689228}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.103288067343989, "bleu_stderr": 0.12415880355418442, "rouge1_fmeasure": 0.2135984788737725, "rouge1_fmeasure_stderr": 0.0026714992621488937, "rouge1_precision": 0.1691487054500622, "rouge1_precision_stderr": 0.0024250397934530475, "rouge1_recall": 0.3329424577074328, "rouge1_recall_stderr": 0.004657379753879945, "rouge2_fmeasure": 0.04790575968435739, "rouge2_fmeasure_stderr": 0.0017255007150536288, "rouge2_precision": 0.03660197341061794, "rouge2_precision_stderr": 0.0013876615412254558, "rouge2_recall": 0.07882441796533844, "rouge2_recall_stderr": 0.0029121244310315966, "rougeL_fmeasure": 0.15810158263668517, "rougeL_fmeasure_stderr": 0.0020195259778767218, "rougeL_precision": 0.12480506419752084, "rougeL_precision_stderr": 0.0018165017981600132, "rougeL_recall": 0.24849031836840368, "rougeL_recall_stderr": 0.0036491823702079194, "rougeLsum_fmeasure": 0.16520124922802173, "rougeLsum_fmeasure_stderr": 0.0022185800515451995, "rougeLsum_precision": 0.12993544368406393, "rougeLsum_precision_stderr": 0.0019145615600976785, "rougeLsum_recall": 0.26077607018509813, "rougeLsum_recall_stderr": 0.004054503415661984}}, "1": {"article_DOC_summary": {"bleu": 1.7714825425476433, "bleu_stderr": 0.10905675609895228, "rouge1_fmeasure": 0.19244278875333556, "rouge1_fmeasure_stderr": 0.002639391462026543, "rouge1_precision": 0.1369839651831878, "rouge1_precision_stderr": 0.0019614322408207006, "rouge1_recall": 0.3369949215642884, "rouge1_recall_stderr": 0.004522579510481593, "rouge2_fmeasure": 0.04413377405232099, "rouge2_fmeasure_stderr": 0.0015985481401232032, "rouge2_precision": 0.031094803643512716, "rouge2_precision_stderr": 0.0011365997631264868, "rouge2_recall": 0.07935713496492708, "rouge2_recall_stderr": 0.002884276945599807, "rougeL_fmeasure": 0.1500620628581487, "rougeL_fmeasure_stderr": 0.0020207447957827454, "rougeL_precision": 0.10657563716596334, "rougeL_precision_stderr": 0.0014875062360428326, "rougeL_recall": 0.26467978376956175, "rougeL_recall_stderr": 0.0036163241442920165, "rougeLsum_fmeasure": 0.15187454343375792, "rougeLsum_fmeasure_stderr": 0.0021906040942089064, "rougeLsum_precision": 0.1078060619083588, "rougeLsum_precision_stderr": 0.0016045831813046104, "rougeLsum_recall": 0.2680535986457037, "rougeLsum_recall_stderr": 0.0038953175735036904}}, "2": {"article_DOC_summary": {"bleu": 1.8719539285791582, "bleu_stderr": 0.12363679141275902, "rouge1_fmeasure": 0.19861514735566918, "rouge1_fmeasure_stderr": 0.002536869900552261, "rouge1_precision": 0.1416725332039807, "rouge1_precision_stderr": 0.0019003755973271465, "rouge1_recall": 0.3459660596871756, "rouge1_recall_stderr": 0.004306340084752496, "rouge2_fmeasure": 0.046170973346933354, "rouge2_fmeasure_stderr": 0.0016039666517335285, "rouge2_precision": 0.032589396300244954, "rouge2_precision_stderr": 0.001135411316234186, "rouge2_recall": 0.08273970858446308, "rouge2_recall_stderr": 0.0029435229941216076, "rougeL_fmeasure": 0.1563808799657367, "rougeL_fmeasure_stderr": 0.0019620345863827896, "rougeL_precision": 0.11131065526290164, "rougeL_precision_stderr": 0.0014510877883445626, "rougeL_recall": 0.2741645486545069, "rougeL_recall_stderr": 0.003497544935564585, "rougeLsum_fmeasure": 0.15581654576519338, "rougeLsum_fmeasure_stderr": 0.0021586761800846295, "rougeLsum_precision": 0.11082674371408649, "rougeLsum_precision_stderr": 0.0015845483453646742, "rougeLsum_recall": 0.2737340240682109, "rougeLsum_recall_stderr": 0.003854876130312793}}, "3": {"article_DOC_summary": {"bleu": 2.0369871247788307, "bleu_stderr": 0.11889285675918113, "rouge1_fmeasure": 0.19191359742761793, "rouge1_fmeasure_stderr": 0.00277051626651925, "rouge1_precision": 0.13956778448576387, "rouge1_precision_stderr": 0.0021737270808076243, "rouge1_recall": 0.329102550985314, "rouge1_recall_stderr": 0.004775853265191935, "rouge2_fmeasure": 0.04715930396420784, "rouge2_fmeasure_stderr": 0.001624147693037141, "rouge2_precision": 0.034143138087541, "rouge2_precision_stderr": 0.0012197975952686348, "rouge2_recall": 0.08270285997486158, "rouge2_recall_stderr": 0.0029002084266991856, "rougeL_fmeasure": 0.1528761624914166, "rougeL_fmeasure_stderr": 0.002169630539318251, "rougeL_precision": 0.11089339313344634, "rougeL_precision_stderr": 0.0016751091885532099, "rougeL_recall": 0.26375167528125915, "rougeL_recall_stderr": 0.0038713870137101424, "rougeLsum_fmeasure": 0.15380687866772882, "rougeLsum_fmeasure_stderr": 0.002388364659924707, "rougeLsum_precision": 0.11161318605392968, "rougeLsum_precision_stderr": 0.0018444411300991288, "rougeLsum_recall": 0.26547328847718554, "rougeLsum_recall_stderr": 0.004239107141632789}}, "4": {"article_DOC_summary": {"bleu": 0.9532704572175751, "bleu_stderr": 0.1334974714932564, "rouge1_fmeasure": 0.053658040802239085, "rouge1_fmeasure_stderr": 0.0029774150017601803, "rouge1_precision": 0.045437643598581885, "rouge1_precision_stderr": 0.002731741164998445, "rouge1_recall": 0.0834633850546414, "rouge1_recall_stderr": 0.004709515538170699, "rouge2_fmeasure": 0.012335864733508397, "rouge2_fmeasure_stderr": 0.0010450675751901856, "rouge2_precision": 0.009714659363066126, "rouge2_precision_stderr": 0.0009061043015811388, "rouge2_recall": 0.020204993259341004, "rouge2_recall_stderr": 0.0017173474943290197, "rougeL_fmeasure": 0.041486821630655535, "rougeL_fmeasure_stderr": 0.0022764116642440576, "rougeL_precision": 0.03525079234225019, "rougeL_precision_stderr": 0.0021438053902596605, "rougeL_recall": 0.06521180915166398, "rougeL_recall_stderr": 0.003703751644624955, "rougeLsum_fmeasure": 0.04288192765446807, "rougeLsum_fmeasure_stderr": 0.0023977202282267356, "rougeLsum_precision": 0.03647650806323336, "rougeLsum_precision_stderr": 0.002248078408967042, "rougeLsum_recall": 0.06711329846733026, "rougeLsum_recall_stderr": 0.003863144394106958}}, "5": {"article_DOC_summary": {"bleu": 3.120169018915429e-39, "bleu_stderr": 2.777103521919484e-32, "rouge1_fmeasure": 0.002997767517780879, "rouge1_fmeasure_stderr": 0.0008334506629000642, "rouge1_precision": 0.003263719197769084, "rouge1_precision_stderr": 0.0008887617448902996, "rouge1_recall": 0.0028345456514645845, "rouge1_recall_stderr": 0.0008116977743173961, "rouge2_fmeasure": 0.0004337191943913522, "rouge2_fmeasure_stderr": 0.00021214389566854186, "rouge2_precision": 0.0004590858641913026, "rouge2_precision_stderr": 0.00021377485443811557, "rouge2_recall": 0.00041953329689178745, "rouge2_recall_stderr": 0.00021411305439143847, "rougeL_fmeasure": 0.002106529594577242, "rougeL_fmeasure_stderr": 0.0005845294157065285, "rougeL_precision": 0.0023145025817495005, "rougeL_precision_stderr": 0.0006312757980824667, "rougeL_recall": 0.0019717134578344517, "rougeL_recall_stderr": 0.0005600175515270516, "rougeLsum_fmeasure": 0.0024033045589119204, "rougeLsum_fmeasure_stderr": 0.0006766571604709825, "rougeLsum_precision": 0.0026347083023874887, "rougeLsum_precision_stderr": 0.0007273791404483311, "rougeLsum_recall": 0.00225801889401816, "rougeLsum_recall_stderr": 0.000653191190142516}}}}
4b284b42bc4/evaluation/rankeval/4b284b42bc4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.331,0.014888272588203931,0
3
+ anli_r2,acc,0.342,0.01500870618212173,0
4
+ anli_r3,acc,0.34,0.013680495725767784,0
5
+ arc_challenge,acc,0.27986348122866894,0.013119040897725922,0
6
+ arc_challenge,acc_norm,0.29266211604095566,0.01329591610361942,0
7
+ arc_easy,acc,0.6224747474747475,0.00994722783346943,0
8
+ arc_easy,acc_norm,0.5462962962962963,0.010215708295494117,0
9
+ boolq,acc,0.5253822629969419,0.0087337795418535,1
10
+ cb,acc,0.5357142857142857,0.06724777654937658,1
11
+ cb,f1,0.45393112410656267,,1
12
+ copa,acc,0.75,0.04351941398892446,0
13
+ hellaswag,acc,0.4833698466440948,0.004987020679861267,0
14
+ hellaswag,acc_norm,0.63433578968333,0.004806316342709393,0
15
+ piqa,acc,0.7448313384113167,0.010171571592521822,0
16
+ piqa,acc_norm,0.76550598476605,0.00988520314324054,0
17
+ rte,acc,0.5776173285198556,0.029731622646495887,0
18
+ sciq,acc,0.837,0.011686212712746849,0
19
+ sciq,acc_norm,0.757,0.013569640199177458,0
20
+ storycloze_2016,acc,0.7204703367183325,0.01037770209970486,0
21
+ winogrande,acc,0.5864246250986582,0.013840971763195303,0
4b284b42bc4/evaluation/rankeval/4b284b42bc4_1.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.31,0.014632638658632902,0
3
+ anli_r2,acc,0.31,0.014632638658632905,0
4
+ anli_r3,acc,0.3283333333333333,0.013562032919529017,0
5
+ arc_challenge,acc,0.2883959044368601,0.013238394422428173,0
6
+ arc_challenge,acc_norm,0.3148464163822526,0.01357265770308495,0
7
+ arc_easy,acc,0.6262626262626263,0.009927267058259621,0
8
+ arc_easy,acc_norm,0.5934343434343434,0.010079056419223527,0
9
+ boolq,acc,0.5522935779816514,0.008697094687974059,1
10
+ cb,acc,0.3392857142857143,0.06384226561930825,1
11
+ cb,f1,0.29749748849204566,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4803823939454292,0.004985939292819582,0
14
+ hellaswag,acc_norm,0.6294562836088429,0.004819633668832538,0
15
+ piqa,acc,0.7486398258977149,0.010121156016819259,0
16
+ piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
17
+ rte,acc,0.44765342960288806,0.02993107036293953,0
18
+ sciq,acc,0.892,0.0098200016513457,0
19
+ sciq,acc_norm,0.869,0.010674874844837954,0
20
+ storycloze_2016,acc,0.7049706039551042,0.010546232606962289,0
21
+ winogrande,acc,0.5887924230465666,0.013829128358676874,0
4b284b42bc4/evaluation/rankeval/4b284b42bc4_2.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.306,0.014580006055436969,0
3
+ anli_r2,acc,0.33,0.014876872027456734,0
4
+ anli_r3,acc,0.3308333333333333,0.013588208070709007,0
5
+ arc_challenge,acc,0.29948805460750855,0.013385021637313565,0
6
+ arc_challenge,acc_norm,0.3148464163822526,0.01357265770308495,0
7
+ arc_easy,acc,0.6388888888888888,0.00985601342581124,0
8
+ arc_easy,acc_norm,0.6182659932659933,0.009968648851839672,0
9
+ boolq,acc,0.5889908256880734,0.008605429733982185,1
10
+ cb,acc,0.25,0.058387420812114225,1
11
+ cb,f1,0.2376010151606224,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.47679745070703045,0.004984405935541087,0
14
+ hellaswag,acc_norm,0.6308504282015535,0.004815882719278393,0
15
+ piqa,acc,0.750272034820457,0.010099232969867488,0
16
+ piqa,acc_norm,0.763873775843308,0.009908965890558218,0
17
+ rte,acc,0.48736462093862815,0.030086851767188564,0
18
+ sciq,acc,0.902,0.009406619184621238,0
19
+ sciq,acc_norm,0.89,0.009899393819724444,0
20
+ storycloze_2016,acc,0.7199358631747729,0.01038376499392048,0
21
+ winogrande,acc,0.6101026045777427,0.013707547317008462,0
4b284b42bc4/evaluation/rankeval/4b284b42bc4_3.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.314,0.014683991951087962,0
3
+ anli_r2,acc,0.342,0.015008706182121734,0
4
+ anli_r3,acc,0.32416666666666666,0.013517438120881636,0
5
+ arc_challenge,acc,0.29180887372013653,0.013284525292403503,0
6
+ arc_challenge,acc_norm,0.3046075085324232,0.01344952210993249,0
7
+ arc_easy,acc,0.6342592592592593,0.00988298806941883,0
8
+ arc_easy,acc_norm,0.6212121212121212,0.00995373765654204,0
9
+ boolq,acc,0.599388379204893,0.008570545612096372,1
10
+ cb,acc,0.4107142857142857,0.0663363415035954,1
11
+ cb,f1,0.37437732746529967,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4823740290778729,0.004986680048438317,0
14
+ hellaswag,acc_norm,0.6320454092810197,0.004812633280078256,0
15
+ piqa,acc,0.7600652883569097,0.009963625892809544,0
16
+ piqa,acc_norm,0.7633297062023939,0.009916841655042809,0
17
+ rte,acc,0.5306859205776173,0.030039730592197812,0
18
+ sciq,acc,0.917,0.00872852720607479,0
19
+ sciq,acc_norm,0.902,0.009406619184621236,0
20
+ storycloze_2016,acc,0.7215392838054516,0.010365521460604417,0
21
+ winogrande,acc,0.5887924230465666,0.013829128358676878,0
4b284b42bc4/evaluation/rankeval/4b284b42bc4_4.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.335,0.014933117490932572,0
3
+ anli_r2,acc,0.352,0.015110404505648663,0
4
+ anli_r3,acc,0.3233333333333333,0.013508372867300215,0
5
+ arc_challenge,acc,0.295221843003413,0.013329750293382316,0
6
+ arc_challenge,acc_norm,0.3046075085324232,0.013449522109932487,0
7
+ arc_easy,acc,0.6447811447811448,0.009820245899287119,0
8
+ arc_easy,acc_norm,0.6195286195286195,0.009962305992058567,0
9
+ boolq,acc,0.6143730886850153,0.008513189460768057,1
10
+ cb,acc,0.4107142857142857,0.06633634150359541,1
11
+ cb,f1,0.3098047785547785,,1
12
+ copa,acc,0.78,0.04163331998932262,0
13
+ hellaswag,acc,0.47849034056960765,0.00498516207433611,0
14
+ hellaswag,acc_norm,0.6403106950806612,0.00478928472395585,0
15
+ piqa,acc,0.7562568008705114,0.010017199471500619,0
16
+ piqa,acc_norm,0.7622415669205659,0.009932525779525492,0
17
+ rte,acc,0.4729241877256318,0.030052303463143706,0
18
+ sciq,acc,0.918,0.008680515615523705,0
19
+ sciq,acc_norm,0.902,0.009406619184621224,0
20
+ storycloze_2016,acc,0.7279529663281668,0.01029088806087124,0
21
+ winogrande,acc,0.595895816890292,0.01379161066467086,0
4b284b42bc4/evaluation/rankeval/4b284b42bc4_5.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.323,0.014794927843348639,0
3
+ anli_r2,acc,0.332,0.014899597242811475,0
4
+ anli_r3,acc,0.3275,0.013553211167251961,0
5
+ arc_challenge,acc,0.29948805460750855,0.013385021637313565,0
6
+ arc_challenge,acc_norm,0.31313993174061433,0.013552671543623504,0
7
+ arc_easy,acc,0.6426767676767676,0.00983320561246312,0
8
+ arc_easy,acc_norm,0.625,0.009933992677987828,0
9
+ boolq,acc,0.618960244648318,0.008493937524439337,1
10
+ cb,acc,0.39285714285714285,0.0658538889806635,1
11
+ cb,f1,0.32470238095238096,,1
12
+ copa,acc,0.79,0.040936018074033256,0
13
+ hellaswag,acc,0.4792869946225851,0.004985498055190358,0
14
+ hellaswag,acc_norm,0.6384186417048396,0.004794764843685288,0
15
+ piqa,acc,0.7557127312295974,0.010024765172284247,0
16
+ piqa,acc_norm,0.7616974972796517,0.009940334245876222,0
17
+ rte,acc,0.5054151624548736,0.030094698123239966,0
18
+ sciq,acc,0.919,0.008632121032139964,0
19
+ sciq,acc_norm,0.911,0.009008893392651526,0
20
+ storycloze_2016,acc,0.7306253340459647,0.010258997754057014,0
21
+ winogrande,acc,0.5880031570639306,0.013833112857645937,0
4b284b84bc4/evaluation/4b284b84bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.328,
5
- "acc_stderr": 0.014853842487270334
6
- },
7
- "anli_r2": {
8
- "acc": 0.337,
9
- "acc_stderr": 0.014955087918653605
10
- },
11
- "anli_r3": {
12
- "acc": 0.32416666666666666,
13
- "acc_stderr": 0.013517438120881624
14
- },
15
- "cb": {
16
- "acc": 0.26785714285714285,
17
- "acc_stderr": 0.05971290310957636,
18
- "f1": 0.18656056587091072
19
- },
20
- "copa": {
21
- "acc": 0.76,
22
- "acc_stderr": 0.04292346959909283
23
- },
24
- "hellaswag": {
25
- "acc": 0.4547898824935272,
26
- "acc_stderr": 0.004969341773423513,
27
- "acc_norm": 0.5937064329814777,
28
- "acc_norm_stderr": 0.004901368629533419
29
- },
30
- "rte": {
31
- "acc": 0.5595667870036101,
32
- "acc_stderr": 0.029882123363118726
33
- },
34
- "winogrande": {
35
- "acc": 0.5769534333070244,
36
- "acc_stderr": 0.01388505535905647
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.694815606627472,
40
- "acc_stderr": 0.010648664383985661
41
- },
42
- "boolq": {
43
- "acc": 0.6256880733944954,
44
- "acc_stderr": 0.00846424665644323
45
- },
46
- "arc_easy": {
47
- "acc": 0.40614478114478114,
48
- "acc_stderr": 0.010077409815364048,
49
- "acc_norm": 0.3766835016835017,
50
- "acc_norm_stderr": 0.009942848077476172
51
- },
52
- "arc_challenge": {
53
- "acc": 0.20648464163822525,
54
- "acc_stderr": 0.011828865619002316,
55
- "acc_norm": 0.2551194539249147,
56
- "acc_norm_stderr": 0.012739038695202109
57
- },
58
- "sciq": {
59
- "acc": 0.775,
60
- "acc_stderr": 0.013211720158614756,
61
- "acc_norm": 0.709,
62
- "acc_norm_stderr": 0.014370995982377933
63
- },
64
- "piqa": {
65
- "acc": 0.6561479869423286,
66
- "acc_stderr": 0.011082356277961393,
67
- "acc_norm": 0.6528835690968444,
68
- "acc_norm_stderr": 0.011107104993128086
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84bc4/evaluation/4b284b84bc4_1_lm-eval_global_step80108_2023-01-30-11-26-40_1shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.303,
5
- "acc_stderr": 0.014539683710535264
6
- },
7
- "anli_r2": {
8
- "acc": 0.312,
9
- "acc_stderr": 0.01465847437050901
10
- },
11
- "anli_r3": {
12
- "acc": 0.3491666666666667,
13
- "acc_stderr": 0.013767075395077247
14
- },
15
- "cb": {
16
- "acc": 0.30357142857142855,
17
- "acc_stderr": 0.06199938655510753,
18
- "f1": 0.2927120669056153
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4506074487153953,
26
- "acc_stderr": 0.0049653753416431376,
27
- "acc_norm": 0.5834495120493925,
28
- "acc_norm_stderr": 0.004919794704673269
29
- },
30
- "rte": {
31
- "acc": 0.5342960288808665,
32
- "acc_stderr": 0.030025579819366422
33
- },
34
- "winogrande": {
35
- "acc": 0.6077348066298343,
36
- "acc_stderr": 0.013722400462000883
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.6996258685195083,
40
- "acc_stderr": 0.010600915927985033
41
- },
42
- "boolq": {
43
- "acc": 0.6134556574923548,
44
- "acc_stderr": 0.008516943934341973
45
- },
46
- "arc_easy": {
47
- "acc": 0.5231481481481481,
48
- "acc_stderr": 0.010248782484554473,
49
- "acc_norm": 0.4819023569023569,
50
- "acc_norm_stderr": 0.010253060653479177
51
- },
52
- "arc_challenge": {
53
- "acc": 0.23122866894197952,
54
- "acc_stderr": 0.012320858834772273,
55
- "acc_norm": 0.2619453924914676,
56
- "acc_norm_stderr": 0.012849054826858115
57
- },
58
- "sciq": {
59
- "acc": 0.88,
60
- "acc_stderr": 0.010281328012747391,
61
- "acc_norm": 0.863,
62
- "acc_norm_stderr": 0.010878848714333327
63
- },
64
- "piqa": {
65
- "acc": 0.6887921653971708,
66
- "acc_stderr": 0.010802263878045844,
67
- "acc_norm": 0.6866158868335147,
68
- "acc_norm_stderr": 0.010822829929195489
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84bc4/evaluation/4b284b84bc4_2_lm-eval_global_step80108_2023-01-30-11-26-40_2shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.315,
5
- "acc_stderr": 0.014696631960792498
6
- },
7
- "anli_r2": {
8
- "acc": 0.341,
9
- "acc_stderr": 0.014998131348402702
10
- },
11
- "anli_r3": {
12
- "acc": 0.34,
13
- "acc_stderr": 0.0136804957257678
14
- },
15
- "cb": {
16
- "acc": 0.14285714285714285,
17
- "acc_stderr": 0.04718416136255829,
18
- "f1": 0.143010752688172
19
- },
20
- "copa": {
21
- "acc": 0.79,
22
- "acc_stderr": 0.040936018074033256
23
- },
24
- "hellaswag": {
25
- "acc": 0.4473212507468632,
26
- "acc_stderr": 0.004962010338226347,
27
- "acc_norm": 0.5848436566421031,
28
- "acc_norm_stderr": 0.0049174193677660296
29
- },
30
- "rte": {
31
- "acc": 0.4729241877256318,
32
- "acc_stderr": 0.0300523034631437
33
- },
34
- "winogrande": {
35
- "acc": 0.601420678768745,
36
- "acc_stderr": 0.013760357176873838
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7156600748262961,
40
- "acc_stderr": 0.01043161412866526
41
- },
42
- "boolq": {
43
- "acc": 0.6119266055045871,
44
- "acc_stderr": 0.008523130584760851
45
- },
46
- "arc_easy": {
47
- "acc": 0.5593434343434344,
48
- "acc_stderr": 0.010187264635711983,
49
- "acc_norm": 0.5298821548821548,
50
- "acc_norm_stderr": 0.010241444322886432
51
- },
52
- "arc_challenge": {
53
- "acc": 0.2636518771331058,
54
- "acc_stderr": 0.01287592915129705,
55
- "acc_norm": 0.2858361774744027,
56
- "acc_norm_stderr": 0.013203196088537369
57
- },
58
- "sciq": {
59
- "acc": 0.906,
60
- "acc_stderr": 0.009233052000787738,
61
- "acc_norm": 0.902,
62
- "acc_norm_stderr": 0.009406619184621226
63
- },
64
- "piqa": {
65
- "acc": 0.7089227421109902,
66
- "acc_stderr": 0.010598612490942586,
67
- "acc_norm": 0.7143634385201306,
68
- "acc_norm_stderr": 0.010539303948661916
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84bc4/evaluation/4b284b84bc4_3_lm-eval_global_step80108_2023-01-30-11-26-40_3shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.338,
5
- "acc_stderr": 0.014965960710224489
6
- },
7
- "anli_r2": {
8
- "acc": 0.332,
9
- "acc_stderr": 0.014899597242811476
10
- },
11
- "anli_r3": {
12
- "acc": 0.3325,
13
- "acc_stderr": 0.013605417345710526
14
- },
15
- "cb": {
16
- "acc": 0.08928571428571429,
17
- "acc_stderr": 0.038450387280282494,
18
- "f1": 0.0871517027863777
19
- },
20
- "copa": {
21
- "acc": 0.83,
22
- "acc_stderr": 0.03775251680686371
23
- },
24
- "hellaswag": {
25
- "acc": 0.4525990838478391,
26
- "acc_stderr": 0.0049673082544257514,
27
- "acc_norm": 0.5948018323043218,
28
- "acc_norm_stderr": 0.004899270310557971
29
- },
30
- "rte": {
31
- "acc": 0.49458483754512633,
32
- "acc_stderr": 0.03009469812323996
33
- },
34
- "winogrande": {
35
- "acc": 0.585635359116022,
36
- "acc_stderr": 0.01384484623226856
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7135221806520577,
40
- "acc_stderr": 0.01045510591863303
41
- },
42
- "boolq": {
43
- "acc": 0.6079510703363914,
44
- "acc_stderr": 0.008538802914911992
45
- },
46
- "arc_easy": {
47
- "acc": 0.5765993265993266,
48
- "acc_stderr": 0.010138671005289047,
49
- "acc_norm": 0.5585016835016835,
50
- "acc_norm_stderr": 0.010189314382749929
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27986348122866894,
54
- "acc_stderr": 0.01311904089772592,
55
- "acc_norm": 0.29180887372013653,
56
- "acc_norm_stderr": 0.013284525292403506
57
- },
58
- "sciq": {
59
- "acc": 0.908,
60
- "acc_stderr": 0.009144376393151086,
61
- "acc_norm": 0.906,
62
- "acc_norm_stderr": 0.009233052000787738
63
- },
64
- "piqa": {
65
- "acc": 0.7257889009793254,
66
- "acc_stderr": 0.010408618664933382,
67
- "acc_norm": 0.7334058759521219,
68
- "acc_norm_stderr": 0.010316749863541365
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84bc4/evaluation/4b284b84bc4_4_lm-eval_global_step80108_2023-01-30-11-26-40_4shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.334,
5
- "acc_stderr": 0.014922019523732961
6
- },
7
- "anli_r2": {
8
- "acc": 0.34,
9
- "acc_stderr": 0.014987482264363935
10
- },
11
- "anli_r3": {
12
- "acc": 0.35083333333333333,
13
- "acc_stderr": 0.013782212417178197
14
- },
15
- "cb": {
16
- "acc": 0.17857142857142858,
17
- "acc_stderr": 0.051642771820087224,
18
- "f1": 0.18279613107199313
19
- },
20
- "copa": {
21
- "acc": 0.82,
22
- "acc_stderr": 0.03861229196653697
23
- },
24
- "hellaswag": {
25
- "acc": 0.454690300736905,
26
- "acc_stderr": 0.004969251445596333,
27
- "acc_norm": 0.5943039235212109,
28
- "acc_norm_stderr": 0.004900227226433378
29
- },
30
- "rte": {
31
- "acc": 0.4548736462093863,
32
- "acc_stderr": 0.029973636495415255
33
- },
34
- "winogrande": {
35
- "acc": 0.606156274664562,
36
- "acc_stderr": 0.013732114472668741
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7247461250668092,
40
- "acc_stderr": 0.010328538400500567
41
- },
42
- "boolq": {
43
- "acc": 0.6116207951070336,
44
- "acc_stderr": 0.008524357307908792
45
- },
46
- "arc_easy": {
47
- "acc": 0.5808080808080808,
48
- "acc_stderr": 0.010124905282491183,
49
- "acc_norm": 0.5711279461279462,
50
- "acc_norm_stderr": 0.010155440652900152
51
- },
52
- "arc_challenge": {
53
- "acc": 0.27559726962457337,
54
- "acc_stderr": 0.01305716965576184,
55
- "acc_norm": 0.30802047781569963,
56
- "acc_norm_stderr": 0.01349142951729204
57
- },
58
- "sciq": {
59
- "acc": 0.915,
60
- "acc_stderr": 0.008823426366942317,
61
- "acc_norm": 0.919,
62
- "acc_norm_stderr": 0.008632121032139993
63
- },
64
- "piqa": {
65
- "acc": 0.7285092491838956,
66
- "acc_stderr": 0.010376251176596137,
67
- "acc_norm": 0.7388465723612623,
68
- "acc_norm_stderr": 0.010248738649935587
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json DELETED
@@ -1,87 +0,0 @@
1
- {
2
- "results": {
3
- "anli_r1": {
4
- "acc": 0.348,
5
- "acc_stderr": 0.01507060460376841
6
- },
7
- "anli_r2": {
8
- "acc": 0.342,
9
- "acc_stderr": 0.015008706182121738
10
- },
11
- "anli_r3": {
12
- "acc": 0.33,
13
- "acc_stderr": 0.013579531277800918
14
- },
15
- "cb": {
16
- "acc": 0.26785714285714285,
17
- "acc_stderr": 0.05971290310957636,
18
- "f1": 0.2511904761904762
19
- },
20
- "copa": {
21
- "acc": 0.81,
22
- "acc_stderr": 0.03942772444036623
23
- },
24
- "hellaswag": {
25
- "acc": 0.4565823541127266,
26
- "acc_stderr": 0.0049709334202319285,
27
- "acc_norm": 0.6061541525592511,
28
- "acc_norm_stderr": 0.0048760280379419405
29
- },
30
- "rte": {
31
- "acc": 0.5379061371841155,
32
- "acc_stderr": 0.030009848912529117
33
- },
34
- "winogrande": {
35
- "acc": 0.6037884767166535,
36
- "acc_stderr": 0.013746404157154946
37
- },
38
- "storycloze_2016": {
39
- "acc": 0.7354355959380011,
40
- "acc_stderr": 0.01020040054171416
41
- },
42
- "boolq": {
43
- "acc": 0.6201834862385321,
44
- "acc_stderr": 0.008488668235778613
45
- },
46
- "arc_easy": {
47
- "acc": 0.5900673400673401,
48
- "acc_stderr": 0.010091953527506246,
49
- "acc_norm": 0.5791245791245792,
50
- "acc_norm_stderr": 0.01013050216406634
51
- },
52
- "arc_challenge": {
53
- "acc": 0.28754266211604096,
54
- "acc_stderr": 0.01322671905626613,
55
- "acc_norm": 0.31313993174061433,
56
- "acc_norm_stderr": 0.013552671543623504
57
- },
58
- "sciq": {
59
- "acc": 0.918,
60
- "acc_stderr": 0.008680515615523746,
61
- "acc_norm": 0.917,
62
- "acc_norm_stderr": 0.00872852720607479
63
- },
64
- "piqa": {
65
- "acc": 0.7317736670293797,
66
- "acc_stderr": 0.010336761992404485,
67
- "acc_norm": 0.7448313384113167,
68
- "acc_norm_stderr": 0.010171571592521828
69
- }
70
- },
71
- "versions": {
72
- "anli_r1": 0,
73
- "anli_r2": 0,
74
- "anli_r3": 0,
75
- "cb": 1,
76
- "copa": 0,
77
- "hellaswag": 0,
78
- "rte": 0,
79
- "winogrande": 0,
80
- "storycloze_2016": 0,
81
- "boolq": 1,
82
- "arc_easy": 0,
83
- "arc_challenge": 0,
84
- "sciq": 0,
85
- "piqa": 0
86
- }
87
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b284b84bc4/evaluation/generation/merged.csv ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ dataset,fewshots,prompt,metric,value
2
+ e2e_nlg_cleaned,0,generate_text_restaurant,rouge2_fmeasure,0.026122584651124606
3
+ e2e_nlg_cleaned,0,median,rouge2_fmeasure,0.026122584651124606
4
+ e2e_nlg_cleaned,1,generate_text_restaurant,rouge2_fmeasure,0.17514483800816646
5
+ e2e_nlg_cleaned,1,median,rouge2_fmeasure,0.17514483800816646
6
+ e2e_nlg_cleaned,2,generate_text_restaurant,rouge2_fmeasure,0.18933491861931817
7
+ e2e_nlg_cleaned,2,median,rouge2_fmeasure,0.18933491861931817
8
+ e2e_nlg_cleaned,3,generate_text_restaurant,rouge2_fmeasure,0.20260082190070094
9
+ e2e_nlg_cleaned,3,median,rouge2_fmeasure,0.20260082190070094
10
+ e2e_nlg_cleaned,4,generate_text_restaurant,rouge2_fmeasure,0.20525503989164437
11
+ e2e_nlg_cleaned,4,median,rouge2_fmeasure,0.20525503989164437
12
+ e2e_nlg_cleaned,5,generate_text_restaurant,rouge2_fmeasure,0.19984409683367668
13
+ e2e_nlg_cleaned,5,median,rouge2_fmeasure,0.19984409683367668
14
+ e2e_nlg_cleaned,5,average,multiple,0.16638371665077187
15
+ gem_xsum,0,article_DOC_summary,rouge2_fmeasure,0.059415288606804666
16
+ gem_xsum,0,median,rouge2_fmeasure,0.059415288606804666
17
+ gem_xsum,1,article_DOC_summary,rouge2_fmeasure,0.04506092989376365
18
+ gem_xsum,1,median,rouge2_fmeasure,0.04506092989376365
19
+ gem_xsum,2,article_DOC_summary,rouge2_fmeasure,0.04784619134007039
20
+ gem_xsum,2,median,rouge2_fmeasure,0.04784619134007039
21
+ gem_xsum,3,article_DOC_summary,rouge2_fmeasure,0.04680703390738515
22
+ gem_xsum,3,median,rouge2_fmeasure,0.04680703390738515
23
+ gem_xsum,4,article_DOC_summary,rouge2_fmeasure,0.013388421641777913
24
+ gem_xsum,4,median,rouge2_fmeasure,0.013388421641777913
25
+ gem_xsum,5,article_DOC_summary,rouge2_fmeasure,0.0007583670569643345
26
+ gem_xsum,5,median,rouge2_fmeasure,0.0007583670569643345
27
+ gem_xsum,5,average,multiple,0.03554603874112768
28
+ web_nlg_en,0,PALM_prompt,rouge2_fmeasure,0.05397463032306079
29
+ web_nlg_en,0,median,rouge2_fmeasure,0.05397463032306079
30
+ web_nlg_en,1,PALM_prompt,rouge2_fmeasure,0.053994419166879803
31
+ web_nlg_en,1,median,rouge2_fmeasure,0.053994419166879803
32
+ web_nlg_en,2,PALM_prompt,rouge2_fmeasure,0.0530914234584236
33
+ web_nlg_en,2,median,rouge2_fmeasure,0.0530914234584236
34
+ web_nlg_en,3,PALM_prompt,rouge2_fmeasure,0.05387918484618448
35
+ web_nlg_en,3,median,rouge2_fmeasure,0.05387918484618448
36
+ web_nlg_en,4,PALM_prompt,rouge2_fmeasure,0.05451282884342794
37
+ web_nlg_en,4,median,rouge2_fmeasure,0.05451282884342794
38
+ web_nlg_en,5,PALM_prompt,rouge2_fmeasure,0.05471954734528363
39
+ web_nlg_en,5,median,rouge2_fmeasure,0.05471954734528363
40
+ web_nlg_en,5,average,multiple,0.05402867233054337
41
+ wiki_lingua_en,0,tldr_en,rouge2_fmeasure,0.03487366784140555
42
+ wiki_lingua_en,0,median,rouge2_fmeasure,0.03487366784140555
43
+ wiki_lingua_en,1,tldr_en,rouge2_fmeasure,0.05802448056432808
44
+ wiki_lingua_en,1,median,rouge2_fmeasure,0.05802448056432808
45
+ wiki_lingua_en,2,tldr_en,rouge2_fmeasure,0.05947240390341843
46
+ wiki_lingua_en,2,median,rouge2_fmeasure,0.05947240390341843
47
+ wiki_lingua_en,3,tldr_en,rouge2_fmeasure,0.04827221140963156
48
+ wiki_lingua_en,3,median,rouge2_fmeasure,0.04827221140963156
49
+ wiki_lingua_en,4,tldr_en,rouge2_fmeasure,0.015382205296321172
50
+ wiki_lingua_en,4,median,rouge2_fmeasure,0.015382205296321172
51
+ wiki_lingua_en,5,tldr_en,rouge2_fmeasure,0.002248573836077081
52
+ wiki_lingua_en,5,median,rouge2_fmeasure,0.002248573836077081
53
+ wiki_lingua_en,5,average,multiple,0.03637892380853031
4b284b84bc4/evaluation/generation/merged.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.37466495114095594, "bleu_stderr": 0.03116802815957843, "rouge1_fmeasure": 0.1135227535532633, "rouge1_fmeasure_stderr": 0.0020226161176734882, "rouge1_precision": 0.07404480158534202, "rouge1_precision_stderr": 0.0014902788141002475, "rouge1_recall": 0.31212436257013715, "rouge1_recall_stderr": 0.004568108823452381, "rouge2_fmeasure": 0.05397463032306079, "rouge2_fmeasure_stderr": 0.0012920845251919216, "rouge2_precision": 0.035086188446589825, "rouge2_precision_stderr": 0.0009237445527173, "rouge2_recall": 0.1533304291888016, "rouge2_recall_stderr": 0.0032417614710948096, "rougeL_fmeasure": 0.10988632514530636, "rougeL_fmeasure_stderr": 0.0018918967086146744, "rougeL_precision": 0.07143892188082063, "rougeL_precision_stderr": 0.0013733993783630446, "rougeL_recall": 0.3047679674724977, "rougeL_recall_stderr": 0.0044697317892439005, "rougeLsum_fmeasure": 0.10869588574047251, "rougeLsum_fmeasure_stderr": 0.0019005624996464461, "rougeLsum_precision": 0.07084668944691519, "rougeLsum_precision_stderr": 0.001397360756298291, "rougeLsum_recall": 0.2997116780220477, "rougeLsum_recall_stderr": 0.004343713168500645}}, "1": {"PALM_prompt": {"bleu": 0.45724995573151395, "bleu_stderr": 0.03077826387670849, "rouge1_fmeasure": 0.11499758191359032, "rouge1_fmeasure_stderr": 0.0018677715006027427, "rouge1_precision": 0.07380410032029427, "rouge1_precision_stderr": 0.0013950698189371862, "rouge1_recall": 0.36979685211837077, "rouge1_recall_stderr": 0.005157808096954796, "rouge2_fmeasure": 0.053994419166879803, "rouge2_fmeasure_stderr": 0.001195483166076756, "rouge2_precision": 0.03456277372759004, "rouge2_precision_stderr": 0.0008655744665680597, "rouge2_recall": 0.1825581650807761, "rouge2_recall_stderr": 0.0037692839645507878, "rougeL_fmeasure": 0.10864238895715833, "rougeL_fmeasure_stderr": 0.0017176631324308034, "rougeL_precision": 0.06966555184597864, "rougeL_precision_stderr": 0.0012665956123840754, "rougeL_recall": 0.34681007385001145, "rougeL_recall_stderr": 0.004679678346399935, "rougeLsum_fmeasure": 0.10948950405713026, "rougeLsum_fmeasure_stderr": 0.0017632128912827953, "rougeLsum_precision": 0.07031937468305927, "rougeLsum_precision_stderr": 0.0013198255014853241, "rougeLsum_recall": 0.35067051099532537, "rougeLsum_recall_stderr": 0.004753075684894469}}, "2": {"PALM_prompt": {"bleu": 0.5219512537266193, "bleu_stderr": 0.0212930030956169, "rouge1_fmeasure": 0.11405231602258174, "rouge1_fmeasure_stderr": 0.001654644088764185, "rouge1_precision": 0.07218926971887347, "rouge1_precision_stderr": 0.0012167661516642302, "rouge1_recall": 0.38930719643519357, "rouge1_recall_stderr": 0.004954916384960566, "rouge2_fmeasure": 0.0530914234584236, "rouge2_fmeasure_stderr": 0.0010851048561266318, "rouge2_precision": 0.03343887788145288, "rouge2_precision_stderr": 0.0007607255949351679, "rouge2_recall": 0.1941492016104801, "rouge2_recall_stderr": 0.003831607721687585, "rougeL_fmeasure": 0.10723485397749022, "rougeL_fmeasure_stderr": 0.0015594878594255188, "rougeL_precision": 0.06799635945918656, "rougeL_precision_stderr": 0.0011456716958398708, "rougeL_recall": 0.36060457316843075, "rougeL_recall_stderr": 0.004393491915214021, "rougeLsum_fmeasure": 0.10913009935368997, "rougeLsum_fmeasure_stderr": 0.0015983050972786382, "rougeLsum_precision": 0.06917630307565754, "rougeLsum_precision_stderr": 0.0011778495782492331, "rougeLsum_recall": 0.3693625063067186, "rougeLsum_recall_stderr": 0.004599410126151537}}, "3": {"PALM_prompt": {"bleu": 0.6352070705519831, "bleu_stderr": 0.032408525336239634, "rouge1_fmeasure": 0.11545860904836557, "rouge1_fmeasure_stderr": 0.0016293321313512947, "rouge1_precision": 0.07232274024412362, "rouge1_precision_stderr": 0.0011648296294780197, "rouge1_recall": 0.40875337262130357, "rouge1_recall_stderr": 0.005170431645641866, "rouge2_fmeasure": 0.05387918484618448, "rouge2_fmeasure_stderr": 0.001059642873696044, "rouge2_precision": 0.03357621675403705, "rouge2_precision_stderr": 0.0007258241400547502, "rouge2_recall": 0.20421979666303372, "rouge2_recall_stderr": 0.0038705321992327218, "rougeL_fmeasure": 0.10673572587703606, "rougeL_fmeasure_stderr": 0.001490340938407792, "rougeL_precision": 0.0670219793734058, "rougeL_precision_stderr": 0.0010711080482606746, "rougeL_recall": 0.3719335535759742, "rougeL_recall_stderr": 0.004473658738704575, "rougeLsum_fmeasure": 0.10969517770602641, "rougeLsum_fmeasure_stderr": 0.0015436923533265093, "rougeLsum_precision": 0.06880374952093221, "rougeLsum_precision_stderr": 0.0011085999525674158, "rougeLsum_recall": 0.38592664938862503, "rougeLsum_recall_stderr": 0.004766870365189962}}, "4": {"PALM_prompt": {"bleu": 0.6721480622931579, "bleu_stderr": 0.04364421620033658, "rouge1_fmeasure": 0.11682333840421252, "rouge1_fmeasure_stderr": 0.0016784638187503072, "rouge1_precision": 0.07312830538850522, "rouge1_precision_stderr": 0.0012172573610058964, "rouge1_recall": 0.41786672242074735, "rouge1_recall_stderr": 0.00514089320113892, "rouge2_fmeasure": 0.05451282884342794, "rouge2_fmeasure_stderr": 0.001075049065599178, "rouge2_precision": 0.03391327016863078, "rouge2_precision_stderr": 0.0007434663573137818, "rouge2_recall": 0.21097279270777233, "rouge2_recall_stderr": 0.003962368460085698, "rougeL_fmeasure": 0.10645004232970325, "rougeL_fmeasure_stderr": 0.0014835449120631779, "rougeL_precision": 0.06674973450321794, "rougeL_precision_stderr": 0.001076625707211798, "rougeL_recall": 0.3763932336189791, "rougeL_recall_stderr": 0.004418572944831253, "rougeLsum_fmeasure": 0.1108401266606722, "rougeLsum_fmeasure_stderr": 0.0015949626703580934, "rougeLsum_precision": 0.06947768209278911, "rougeLsum_precision_stderr": 0.0011601752390396401, "rougeLsum_recall": 0.39433935812714066, "rougeLsum_recall_stderr": 0.00474918433328278}}, "5": {"PALM_prompt": {"bleu": 0.7296458163665333, "bleu_stderr": 0.03550662426728089, "rouge1_fmeasure": 0.11795106654953416, "rouge1_fmeasure_stderr": 0.0015900535376520654, "rouge1_precision": 0.073601220000616, "rouge1_precision_stderr": 0.001145132160087058, "rouge1_recall": 0.4252463565222065, "rouge1_recall_stderr": 0.005191805176994548, "rouge2_fmeasure": 0.05471954734528363, "rouge2_fmeasure_stderr": 0.00103145574907618, "rouge2_precision": 0.03391186812656995, "rouge2_precision_stderr": 0.0007058000824068418, "rouge2_recall": 0.2132946176974872, "rouge2_recall_stderr": 0.003991916242021659, "rougeL_fmeasure": 0.10768674313552495, "rougeL_fmeasure_stderr": 0.0014533174914988178, "rougeL_precision": 0.06738065913816185, "rougeL_precision_stderr": 0.001053247887367618, "rougeL_recall": 0.3825929171137152, "rougeL_recall_stderr": 0.004448563155136455, "rougeLsum_fmeasure": 0.11184463338518444, "rougeLsum_fmeasure_stderr": 0.0015162341039388436, "rougeLsum_precision": 0.06988985169584011, "rougeLsum_precision_stderr": 0.001094990559251859, "rougeLsum_recall": 0.40083936944093385, "rougeLsum_recall_stderr": 0.0047880465656518125}}}, "GEM/wiki_lingua_en": {"0": {"tldr_en": {"bleu": 1.5192817590705794, "bleu_stderr": 0.06494006665273461, "rouge1_fmeasure": 0.17464450784994165, "rouge1_fmeasure_stderr": 0.0018866549663747307, "rouge1_precision": 0.14903430496900869, "rouge1_precision_stderr": 0.0019044306456448882, "rouge1_recall": 0.25460834562037304, "rouge1_recall_stderr": 0.0027808818609942765, "rouge2_fmeasure": 0.03487366784140555, "rouge2_fmeasure_stderr": 0.0008538791169940822, "rouge2_precision": 0.02932852184049837, "rouge2_precision_stderr": 0.0007444651216824196, "rouge2_recall": 0.05378048278152194, "rouge2_recall_stderr": 0.0015115343557708684, "rougeL_fmeasure": 0.13717038261116743, "rougeL_fmeasure_stderr": 0.001369756093492625, "rougeL_precision": 0.11560989242228453, "rougeL_precision_stderr": 0.0013437666268655546, "rougeL_recall": 0.20506193764704436, "rougeL_recall_stderr": 0.002299748596638709, "rougeLsum_fmeasure": 0.1593997663047623, "rougeLsum_fmeasure_stderr": 0.0017134556998076298, "rougeLsum_precision": 0.13582138493978418, "rougeLsum_precision_stderr": 0.0017239221521953576, "rougeLsum_recall": 0.23341669122784228, "rougeLsum_recall_stderr": 0.0025774317116309797}}, "1": {"tldr_en": {"bleu": 3.059381627710027, "bleu_stderr": 0.06573083880760217, "rouge1_fmeasure": 0.22509002102944192, "rouge1_fmeasure_stderr": 0.0019838100403287333, "rouge1_precision": 0.19528801849094218, "rouge1_precision_stderr": 0.002201427724682199, "rouge1_recall": 0.3257340657226586, "rouge1_recall_stderr": 0.002895580827853013, "rouge2_fmeasure": 0.05802448056432808, "rouge2_fmeasure_stderr": 0.001078013458633595, "rouge2_precision": 0.04999822434377242, "rouge2_precision_stderr": 0.0010227131559085638, "rouge2_recall": 0.08773922376052366, "rouge2_recall_stderr": 0.0018594738182377334, "rougeL_fmeasure": 0.16185295875984437, "rougeL_fmeasure_stderr": 0.0013532637497389792, "rougeL_precision": 0.13922763000575006, "rougeL_precision_stderr": 0.0014964090541992776, "rougeL_recall": 0.24015503219395376, "rougeL_recall_stderr": 0.0023374560954638485, "rougeLsum_fmeasure": 0.21073250157763135, "rougeLsum_fmeasure_stderr": 0.0018643526489330914, "rougeLsum_precision": 0.18280815280692314, "rougeLsum_precision_stderr": 0.0020740368495163132, "rougeLsum_recall": 0.3054903541601766, "rougeLsum_recall_stderr": 0.0027607171534094984}}, "2": {"tldr_en": {"bleu": 3.26946391692664, "bleu_stderr": 0.06690865495365421, "rouge1_fmeasure": 0.22687895073683426, "rouge1_fmeasure_stderr": 0.0019722087479110005, "rouge1_precision": 0.2022231395663083, "rouge1_precision_stderr": 0.002327440444373304, "rouge1_recall": 0.3212542850069266, "rouge1_recall_stderr": 0.0028325100234559927, "rouge2_fmeasure": 0.05947240390341843, "rouge2_fmeasure_stderr": 0.0011148780082595852, "rouge2_precision": 0.05298207477606098, "rouge2_precision_stderr": 0.0011059450545409227, "rouge2_recall": 0.0867754880829245, "rouge2_recall_stderr": 0.0018124553730314602, "rougeL_fmeasure": 0.1667829458147027, "rougeL_fmeasure_stderr": 0.0013918547568942457, "rougeL_precision": 0.14766643493041728, "rougeL_precision_stderr": 0.0016788338597698632, "rougeL_recall": 0.24166027084574745, "rougeL_recall_stderr": 0.0023102550180774937, "rougeLsum_fmeasure": 0.21293287161862917, "rougeLsum_fmeasure_stderr": 0.0018545411601376014, "rougeLsum_precision": 0.18955567145755783, "rougeLsum_precision_stderr": 0.0021818874452280475, "rougeLsum_recall": 0.30206706797305843, "rougeLsum_recall_stderr": 0.0026937255424983246}}, "3": {"tldr_en": {"bleu": 3.0982581904465443, "bleu_stderr": 0.09308367989527225, "rouge1_fmeasure": 0.18692561650624942, "rouge1_fmeasure_stderr": 0.0023335888686542303, "rouge1_precision": 0.17603411917572567, "rouge1_precision_stderr": 0.0027088089976067017, "rouge1_recall": 0.2612353818883475, "rouge1_recall_stderr": 0.0033992744560135033, "rouge2_fmeasure": 0.04827221140963156, "rouge2_fmeasure_stderr": 0.001080517273254867, "rouge2_precision": 0.04559336904702604, "rouge2_precision_stderr": 0.0012112817237957673, "rouge2_recall": 0.06990530615420809, "rouge2_recall_stderr": 0.0017473693679177455, "rougeL_fmeasure": 0.13971894217937625, "rougeL_fmeasure_stderr": 0.0017066104221784629, "rougeL_precision": 0.13188323013007386, "rougeL_precision_stderr": 0.0020767846091407608, "rougeL_recall": 0.19847294800292903, "rougeL_recall_stderr": 0.002681619286646281, "rougeLsum_fmeasure": 0.17522886133321208, "rougeLsum_fmeasure_stderr": 0.0021928506565670742, "rougeLsum_precision": 0.1648347813541367, "rougeLsum_precision_stderr": 0.0025407224451518312, "rougeLsum_recall": 0.24568842200265278, "rougeLsum_recall_stderr": 0.0032363366658664243}}, "4": {"tldr_en": {"bleu": 0.6565411757725536, "bleu_stderr": 0.052324362211905354, "rouge1_fmeasure": 0.058933509019544375, "rouge1_fmeasure_stderr": 0.002002584327257243, "rouge1_precision": 0.05811148139738531, "rouge1_precision_stderr": 0.0022224537018629854, "rouge1_recall": 0.08558209366311444, "rouge1_recall_stderr": 0.003003751048628007, "rouge2_fmeasure": 0.015382205296321172, "rouge2_fmeasure_stderr": 0.000747616978372682, "rouge2_precision": 0.014672079613835081, "rouge2_precision_stderr": 0.0008619146516289935, "rouge2_recall": 0.02381134201045262, "rouge2_recall_stderr": 0.0012808609018768735, "rougeL_fmeasure": 0.04464141924307046, "rougeL_fmeasure_stderr": 0.0015078323046653491, "rougeL_precision": 0.04421230588130963, "rougeL_precision_stderr": 0.0017253556363268328, "rougeL_recall": 0.06610553705292643, "rougeL_recall_stderr": 0.002375418784846169, "rougeLsum_fmeasure": 0.05503716623523508, "rougeLsum_fmeasure_stderr": 0.0018776883247322259, "rougeLsum_precision": 0.05425853906264316, "rougeLsum_precision_stderr": 0.002080511020141031, "rougeLsum_recall": 0.07994658087674156, "rougeLsum_recall_stderr": 0.0028206819757526707}}, "5": {"tldr_en": {"bleu": 1.0169232766561087e-06, "bleu_stderr": 2.0148976704692234e-06, "rouge1_fmeasure": 0.009010391314895649, "rouge1_fmeasure_stderr": 0.0008608910415638986, "rouge1_precision": 0.009018320417866373, "rouge1_precision_stderr": 0.0009515455079029407, "rouge1_recall": 0.013477068819464475, "rouge1_recall_stderr": 0.0013177953296362858, "rouge2_fmeasure": 0.002248573836077081, "rouge2_fmeasure_stderr": 0.00028843241279034187, "rouge2_precision": 0.002361799926713296, "rouge2_precision_stderr": 0.0003370630501668582, "rouge2_recall": 0.0034894806830281814, "rouge2_recall_stderr": 0.0004812458467814835, "rougeL_fmeasure": 0.006956210385184894, "rougeL_fmeasure_stderr": 0.0006626528079735124, "rougeL_precision": 0.007020816303734581, "rougeL_precision_stderr": 0.0007477069794008652, "rougeL_recall": 0.010607117596650792, "rougeL_recall_stderr": 0.0010619737593296397, "rougeLsum_fmeasure": 0.008558581769365812, "rougeLsum_fmeasure_stderr": 0.0008212318653585135, "rougeLsum_precision": 0.008615795202441968, "rougeLsum_precision_stderr": 0.0009202005223985237, "rougeLsum_recall": 0.012858462170759524, "rougeLsum_recall_stderr": 0.0012682727385585405}}}, "e2e_nlg_cleaned": {"0": {"generate_text_restaurant": {"bleu": 0.14846670480161736, "bleu_stderr": 0.013127750204704683, "rouge1_fmeasure": 0.07826511485489751, "rouge1_fmeasure_stderr": 0.0019164435017392188, "rouge1_precision": 0.26299103271373675, "rouge1_precision_stderr": 0.006382723076774683, "rouge1_recall": 0.05264625320052111, "rouge1_recall_stderr": 0.0014565379578075452, "rouge2_fmeasure": 0.026122584651124606, "rouge2_fmeasure_stderr": 0.00101915256853543, "rouge2_precision": 0.1576462193644736, "rouge2_precision_stderr": 0.005855563265723578, "rouge2_recall": 0.01645135749156905, "rouge2_recall_stderr": 0.0007166454954847944, "rougeL_fmeasure": 0.0738166454356822, "rougeL_fmeasure_stderr": 0.0017248337500189589, "rougeL_precision": 0.2524687494414541, "rougeL_precision_stderr": 0.006192701638587216, "rougeL_recall": 0.04953446876968493, "rougeL_recall_stderr": 0.0012966364411702697, "rougeLsum_fmeasure": 0.07392255592729936, "rougeLsum_fmeasure_stderr": 0.00174327269105063, "rougeLsum_precision": 0.25330917167188566, "rougeLsum_precision_stderr": 0.006207826569406349, "rougeLsum_recall": 0.04925315584474901, "rougeLsum_recall_stderr": 0.0012892531673512504}}, "1": {"generate_text_restaurant": {"bleu": 9.167405083071978, "bleu_stderr": 0.09131785485896106, "rouge1_fmeasure": 0.4160674099572731, "rouge1_fmeasure_stderr": 0.0020114833441626043, "rouge1_precision": 0.4027750555180633, "rouge1_precision_stderr": 0.0022654704081963494, "rouge1_recall": 0.46782170590516187, "rouge1_recall_stderr": 0.0029091530201079247, "rouge2_fmeasure": 0.17514483800816646, "rouge2_fmeasure_stderr": 0.0016790153644324725, "rouge2_precision": 0.16815397557349618, "rouge2_precision_stderr": 0.0016540332686745738, "rouge2_recall": 0.19988678530914838, "rouge2_recall_stderr": 0.0021644647347172754, "rougeL_fmeasure": 0.28884448468062934, "rougeL_fmeasure_stderr": 0.0016305979819716226, "rougeL_precision": 0.27883038558148265, "rougeL_precision_stderr": 0.001734453659792866, "rougeL_recall": 0.3267350269933985, "rougeL_recall_stderr": 0.0023849450074570734, "rougeLsum_fmeasure": 0.3452818942622058, "rougeLsum_fmeasure_stderr": 0.0019526544280227683, "rougeLsum_precision": 0.33427994949554696, "rougeLsum_precision_stderr": 0.00212000114598495, "rougeLsum_recall": 0.3882701548838279, "rougeLsum_recall_stderr": 0.0026955155305927177}}, "2": {"generate_text_restaurant": {"bleu": 10.796301153911008, "bleu_stderr": 0.146708341661776, "rouge1_fmeasure": 0.4253927318807347, "rouge1_fmeasure_stderr": 0.0020174640710145207, "rouge1_precision": 0.41448086950190893, "rouge1_precision_stderr": 0.002308739240960833, "rouge1_recall": 0.47403110687392724, "rouge1_recall_stderr": 0.002885238583477618, "rouge2_fmeasure": 0.18933491861931817, "rouge2_fmeasure_stderr": 0.001739001650440241, "rouge2_precision": 0.18273899527170673, "rouge2_precision_stderr": 0.001736753975334171, "rouge2_recall": 0.21447823413610112, "rouge2_recall_stderr": 0.0022428818903169902, "rougeL_fmeasure": 0.3029238715575528, "rougeL_fmeasure_stderr": 0.001712217734742568, "rougeL_precision": 0.2947058978443582, "rougeL_precision_stderr": 0.0018691331443875447, "rougeL_recall": 0.3390095486229893, "rougeL_recall_stderr": 0.0024297003361930447, "rougeLsum_fmeasure": 0.35424136247162186, "rougeLsum_fmeasure_stderr": 0.0019838710065518987, "rougeLsum_precision": 0.34519699878769394, "rougeLsum_precision_stderr": 0.0021888771482807224, "rougeLsum_recall": 0.3948779703096386, "rougeLsum_recall_stderr": 0.002705400716089457}}, "3": {"generate_text_restaurant": {"bleu": 11.701646150139581, "bleu_stderr": 0.14121347748513918, "rouge1_fmeasure": 0.4379907319797712, "rouge1_fmeasure_stderr": 0.002033620820037117, "rouge1_precision": 0.4240814949754891, "rouge1_precision_stderr": 0.002335304307800915, "rouge1_recall": 0.49091681381630753, "rouge1_recall_stderr": 0.00293584370599599, "rouge2_fmeasure": 0.20260082190070094, "rouge2_fmeasure_stderr": 0.001797391946128592, "rouge2_precision": 0.19473673244504042, "rouge2_precision_stderr": 0.0018208182531307374, "rouge2_recall": 0.2308195484401856, "rouge2_recall_stderr": 0.002332056213772715, "rougeL_fmeasure": 0.3154563500828602, "rougeL_fmeasure_stderr": 0.0017391940408011135, "rougeL_precision": 0.3052254250489387, "rougeL_precision_stderr": 0.0019294151867184208, "rougeL_recall": 0.35489792991067104, "rougeL_recall_stderr": 0.0024799761788283346, "rougeLsum_fmeasure": 0.36657732824376943, "rougeLsum_fmeasure_stderr": 0.0020166468270683123, "rougeLsum_precision": 0.3549420293695019, "rougeLsum_precision_stderr": 0.002227993320953577, "rougeLsum_recall": 0.4111023178790994, "rougeLsum_recall_stderr": 0.0027708358218005813}}, "4": {"generate_text_restaurant": {"bleu": 11.678623201115563, "bleu_stderr": 0.1438023448589331, "rouge1_fmeasure": 0.44123731345241246, "rouge1_fmeasure_stderr": 0.0020394573256137002, "rouge1_precision": 0.42802077277555306, "rouge1_precision_stderr": 0.002424017667020404, "rouge1_recall": 0.4941727757999444, "rouge1_recall_stderr": 0.002864707318340049, "rouge2_fmeasure": 0.20525503989164437, "rouge2_fmeasure_stderr": 0.0018231190752503231, "rouge2_precision": 0.19838194504781104, "rouge2_precision_stderr": 0.0019042215339010943, "rouge2_recall": 0.23317874468894614, "rouge2_recall_stderr": 0.0023244742635299934, "rougeL_fmeasure": 0.3168165933964199, "rougeL_fmeasure_stderr": 0.0017773944657815321, "rougeL_precision": 0.3069996273775671, "rougeL_precision_stderr": 0.002010456416143571, "rougeL_recall": 0.3562550112067379, "rougeL_recall_stderr": 0.0024675677606706826, "rougeLsum_fmeasure": 0.3684518522765776, "rougeLsum_fmeasure_stderr": 0.0020683920310729333, "rougeLsum_precision": 0.3572288837187459, "rougeLsum_precision_stderr": 0.0023220174574098856, "rougeLsum_recall": 0.4131891637059535, "rougeLsum_recall_stderr": 0.002781997432767609}}, "5": {"generate_text_restaurant": {"bleu": 11.256627548432805, "bleu_stderr": 0.14543377736086152, "rouge1_fmeasure": 0.43467772454619635, "rouge1_fmeasure_stderr": 0.0020177075108414894, "rouge1_precision": 0.4215376032960452, "rouge1_precision_stderr": 0.0024025181710894523, "rouge1_recall": 0.48727427299265463, "rouge1_recall_stderr": 0.002867231297396204, "rouge2_fmeasure": 0.19984409683367668, "rouge2_fmeasure_stderr": 0.0017840474360257142, "rouge2_precision": 0.19324475626688103, "rouge2_precision_stderr": 0.0018664258727883866, "rouge2_recall": 0.22689708066119227, "rouge2_recall_stderr": 0.0022677822733911217, "rougeL_fmeasure": 0.31426345805670414, "rougeL_fmeasure_stderr": 0.0017762603380325856, "rougeL_precision": 0.3043439759839776, "rougeL_precision_stderr": 0.002001573450675394, "rougeL_recall": 0.3536697494352659, "rougeL_recall_stderr": 0.0024814074442155673, "rougeLsum_fmeasure": 0.36514922201465566, "rougeLsum_fmeasure_stderr": 0.0020264847795391197, "rougeLsum_precision": 0.35399958676397025, "rougeLsum_precision_stderr": 0.0022946023988637647, "rougeLsum_recall": 0.40981680597146897, "rougeLsum_recall_stderr": 0.0027603392115545873}}}, "gem_xsum": {"0": {"article_DOC_summary": {"bleu": 2.4667492251754903, "bleu_stderr": 0.10381088859974377, "rouge1_fmeasure": 0.2162684407733824, "rouge1_fmeasure_stderr": 0.0028487305378065864, "rouge1_precision": 0.15936520803829934, "rouge1_precision_stderr": 0.002502914065388867, "rouge1_recall": 0.36698072640704477, "rouge1_recall_stderr": 0.004683622882613499, "rouge2_fmeasure": 0.059415288606804666, "rouge2_fmeasure_stderr": 0.0019237063507349787, "rouge2_precision": 0.044026641927011036, "rouge2_precision_stderr": 0.0016092489576812487, "rouge2_recall": 0.10240085620355213, "rouge2_recall_stderr": 0.0032524404949168598, "rougeL_fmeasure": 0.16944484885441244, "rougeL_fmeasure_stderr": 0.0022672242410205668, "rougeL_precision": 0.1250497631451513, "rougeL_precision_stderr": 0.002063727102434744, "rougeL_recall": 0.2883013244617606, "rougeL_recall_stderr": 0.003758676199438092, "rougeLsum_fmeasure": 0.17181025266219813, "rougeLsum_fmeasure_stderr": 0.0024925143856936887, "rougeLsum_precision": 0.12672592447230924, "rougeLsum_precision_stderr": 0.0021949367251175547, "rougeLsum_recall": 0.2924636147885988, "rougeLsum_recall_stderr": 0.004172229113513385}}, "1": {"article_DOC_summary": {"bleu": 1.758172461446717, "bleu_stderr": 0.07910003790624576, "rouge1_fmeasure": 0.19511361094013366, "rouge1_fmeasure_stderr": 0.0025883144750141985, "rouge1_precision": 0.13872518272148576, "rouge1_precision_stderr": 0.0019196010106841707, "rouge1_recall": 0.34230252867072486, "rouge1_recall_stderr": 0.004465857111154106, "rouge2_fmeasure": 0.04506092989376365, "rouge2_fmeasure_stderr": 0.0015273324009655352, "rouge2_precision": 0.03163153944854582, "rouge2_precision_stderr": 0.0010782973297168107, "rouge2_recall": 0.08173836317054556, "rouge2_recall_stderr": 0.0028348136277117873, "rougeL_fmeasure": 0.15075736335997478, "rougeL_fmeasure_stderr": 0.0019327841457787809, "rougeL_precision": 0.10692232891845452, "rougeL_precision_stderr": 0.001417359741210213, "rougeL_recall": 0.26668023859416595, "rougeL_recall_stderr": 0.0035247154238071715, "rougeLsum_fmeasure": 0.15379184945424418, "rougeLsum_fmeasure_stderr": 0.0021360349555465917, "rougeLsum_precision": 0.1091361614782161, "rougeLsum_precision_stderr": 0.0015647258593366256, "rougeLsum_recall": 0.27137465028647, "rougeLsum_recall_stderr": 0.003816794352052926}}, "2": {"article_DOC_summary": {"bleu": 1.929152474522292, "bleu_stderr": 0.06152527877922885, "rouge1_fmeasure": 0.19587821805854708, "rouge1_fmeasure_stderr": 0.0025521664288773132, "rouge1_precision": 0.13980890149209954, "rouge1_precision_stderr": 0.0018977428372584247, "rouge1_recall": 0.3402939621206596, "rouge1_recall_stderr": 0.00439029308436291, "rouge2_fmeasure": 0.04784619134007039, "rouge2_fmeasure_stderr": 0.0015903598069214155, "rouge2_precision": 0.03376773335297692, "rouge2_precision_stderr": 0.0011277676246399322, "rouge2_recall": 0.0855882084980882, "rouge2_recall_stderr": 0.002933007569998225, "rougeL_fmeasure": 0.15806290558333977, "rougeL_fmeasure_stderr": 0.001998316404733819, "rougeL_precision": 0.1126666102037684, "rougeL_precision_stderr": 0.0014774766678267424, "rougeL_recall": 0.2759357644277514, "rougeL_recall_stderr": 0.0035546429323666893, "rougeLsum_fmeasure": 0.15327775753143216, "rougeLsum_fmeasure_stderr": 0.002141837328174842, "rougeLsum_precision": 0.10915831064990977, "rougeLsum_precision_stderr": 0.0015695216746497264, "rougeLsum_recall": 0.2680641977382496, "rougeLsum_recall_stderr": 0.003807444122985714}}, "3": {"article_DOC_summary": {"bleu": 1.9627474167404957, "bleu_stderr": 0.05305571895946147, "rouge1_fmeasure": 0.18802043752912012, "rouge1_fmeasure_stderr": 0.002755991807062541, "rouge1_precision": 0.13763916790173225, "rouge1_precision_stderr": 0.0021817751235199668, "rouge1_recall": 0.3193392811416741, "rouge1_recall_stderr": 0.004739221494631209, "rouge2_fmeasure": 0.04680703390738515, "rouge2_fmeasure_stderr": 0.0016176840589681419, "rouge2_precision": 0.03388199846714144, "rouge2_precision_stderr": 0.0012286320326522608, "rouge2_recall": 0.08180524247777735, "rouge2_recall_stderr": 0.0028960546548511567, "rougeL_fmeasure": 0.1549296797013188, "rougeL_fmeasure_stderr": 0.002213023071302344, "rougeL_precision": 0.11334029248894266, "rougeL_precision_stderr": 0.0017604231336699231, "rougeL_recall": 0.26418540865577744, "rougeL_recall_stderr": 0.0039191139616055375, "rougeLsum_fmeasure": 0.14663858566665416, "rougeLsum_fmeasure_stderr": 0.002296870650164813, "rougeLsum_precision": 0.10737702854211702, "rougeLsum_precision_stderr": 0.0018328777923363876, "rougeLsum_recall": 0.25043089015992415, "rougeLsum_recall_stderr": 0.004037910961250234}}, "4": {"article_DOC_summary": {"bleu": 1.0275910587689852, "bleu_stderr": 0.12038281046093742, "rouge1_fmeasure": 0.05322501092072186, "rouge1_fmeasure_stderr": 0.0029794854167583293, "rouge1_precision": 0.04603105454485719, "rouge1_precision_stderr": 0.00291203860838327, "rouge1_recall": 0.08129731716534012, "rouge1_recall_stderr": 0.0045890356974722666, "rouge2_fmeasure": 0.013388421641777913, "rouge2_fmeasure_stderr": 0.0011079313127855075, "rouge2_precision": 0.011578470150574793, "rouge2_precision_stderr": 0.0013065637311316816, "rouge2_recall": 0.020777972525703254, "rouge2_recall_stderr": 0.0017120733784709268, "rougeL_fmeasure": 0.04330307819667933, "rougeL_fmeasure_stderr": 0.002401031515475891, "rougeL_precision": 0.03796068206126734, "rougeL_precision_stderr": 0.0024963984987536425, "rougeL_recall": 0.06631805498752238, "rougeL_recall_stderr": 0.0037355969047499307, "rougeLsum_fmeasure": 0.04176500700971651, "rougeLsum_fmeasure_stderr": 0.0023719884034967345, "rougeLsum_precision": 0.03683944866828292, "rougeLsum_precision_stderr": 0.00248168608412277, "rougeLsum_recall": 0.06385955481258902, "rougeLsum_recall_stderr": 0.003696411627899061}}, "5": {"article_DOC_summary": {"bleu": 1.531796041787971e-36, "bleu_stderr": 1.3347176862997463e-30, "rouge1_fmeasure": 0.002850511566655562, "rouge1_fmeasure_stderr": 0.0008725813491317338, "rouge1_precision": 0.00321897943071785, "rouge1_precision_stderr": 0.0009979849053940774, "rouge1_recall": 0.002638556229987107, "rouge1_recall_stderr": 0.0008069095304213252, "rouge2_fmeasure": 0.0007583670569643345, "rouge2_fmeasure_stderr": 0.0003812254884717486, "rouge2_precision": 0.0008682722133420189, "rouge2_precision_stderr": 0.00042068273727211144, "rouge2_recall": 0.0006912233160845813, "rouge2_recall_stderr": 0.00036262248705394215, "rougeL_fmeasure": 0.002188808353154973, "rougeL_fmeasure_stderr": 0.000656190328926511, "rougeL_precision": 0.0024287372106315966, "rougeL_precision_stderr": 0.0007289443610313556, "rougeL_recall": 0.002057235735729571, "rougeL_recall_stderr": 0.000619733025851515, "rougeLsum_fmeasure": 0.0024206446753818178, "rougeLsum_fmeasure_stderr": 0.0007355689947184377, "rougeLsum_precision": 0.002734584249275528, "rougeLsum_precision_stderr": 0.0008545960784962966, "rougeLsum_recall": 0.002247308506447503, "rougeLsum_recall_stderr": 0.0006766768468342992}}}}
4b284b84bc4/evaluation/rankeval/4b284b84bc4_0.csv ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task,metric,value,err,version
2
+ anli_r1,acc,0.328,0.014853842487270334,0
3
+ anli_r2,acc,0.337,0.014955087918653605,0
4
+ anli_r3,acc,0.32416666666666666,0.013517438120881624,0
5
+ arc_challenge,acc,0.20648464163822525,0.011828865619002316,0
6
+ arc_challenge,acc_norm,0.2551194539249147,0.012739038695202109,0
7
+ arc_easy,acc,0.40614478114478114,0.010077409815364048,0
8
+ arc_easy,acc_norm,0.3766835016835017,0.009942848077476172,0
9
+ boolq,acc,0.6256880733944954,0.00846424665644323,1
10
+ cb,acc,0.26785714285714285,0.05971290310957636,1
11
+ cb,f1,0.18656056587091072,,1
12
+ copa,acc,0.76,0.04292346959909283,0
13
+ hellaswag,acc,0.4547898824935272,0.004969341773423513,0
14
+ hellaswag,acc_norm,0.5937064329814777,0.004901368629533419,0
15
+ piqa,acc,0.6561479869423286,0.011082356277961393,0
16
+ piqa,acc_norm,0.6528835690968444,0.011107104993128086,0
17
+ rte,acc,0.5595667870036101,0.029882123363118726,0
18
+ sciq,acc,0.775,0.013211720158614756,0
19
+ sciq,acc_norm,0.709,0.014370995982377933,0
20
+ storycloze_2016,acc,0.694815606627472,0.010648664383985661,0
21
+ winogrande,acc,0.5769534333070244,0.01388505535905647,0
4b284b84bc4/evaluation/{4b284b84bc4_0.json → rankeval/4b284b84bc4_0.json} RENAMED
File without changes