diff --git "a/4b284b21bc4/eval/merged.json" "b/4b284b21bc4/eval/merged.json" new file mode 100644--- /dev/null +++ "b/4b284b21bc4/eval/merged.json" @@ -0,0 +1 @@ +{"GEM/web_nlg_en": {"0": {"PALM_prompt": {"bleu": 0.3423886062648571, "bleu_stderr": 0.03277534172219839, "rouge1_fmeasure": 0.1078155721409226, "rouge1_fmeasure_stderr": 0.0020097112571708245, "rouge1_precision": 0.07116912691303, "rouge1_precision_stderr": 0.0015925050224480028, "rouge1_recall": 0.2998523601292701, "rouge1_recall_stderr": 0.004615423133559915, "rouge2_fmeasure": 0.04998894903569846, "rouge2_fmeasure_stderr": 0.001254978454654862, "rouge2_precision": 0.03290155614985229, "rouge2_precision_stderr": 0.0009635800950074162, "rouge2_recall": 0.1430151870039024, "rouge2_recall_stderr": 0.003133828558045969, "rougeL_fmeasure": 0.10384529013890176, "rougeL_fmeasure_stderr": 0.0018689379304153127, "rougeL_precision": 0.06817638471397719, "rougeL_precision_stderr": 0.001440134234671522, "rougeL_recall": 0.2915042735714293, "rougeL_recall_stderr": 0.004516752469805975, "rougeLsum_fmeasure": 0.10254842199703165, "rougeLsum_fmeasure_stderr": 0.0018748895711891628, "rougeLsum_precision": 0.06758374630355668, "rougeLsum_precision_stderr": 0.0014755310584898722, "rougeLsum_recall": 0.2855613282697172, "rougeLsum_recall_stderr": 0.004322222242728911}, "explicit-graph-description2": {"bleu": 0.012484298141528558, "bleu_stderr": 0.0007013360733034469, "rouge1_fmeasure": 0.1293668266283262, "rouge1_fmeasure_stderr": 0.0020451981284945325, "rouge1_precision": 0.1593816005881288, "rouge1_precision_stderr": 0.0028463012802740157, "rouge1_recall": 0.15106039306191837, "rouge1_recall_stderr": 0.0018614327334657758, "rouge2_fmeasure": 0.008447632785522565, "rouge2_fmeasure_stderr": 0.0005005390644606267, "rouge2_precision": 0.010415064972944536, "rouge2_precision_stderr": 0.0005832498460949334, "rouge2_recall": 0.008381836594921248, "rouge2_recall_stderr": 0.0005769442404081656, "rougeL_fmeasure": 0.1059430520968093, "rougeL_fmeasure_stderr": 0.0015859113615098958, "rougeL_precision": 0.12882295031952942, "rougeL_precision_stderr": 0.0022244289545785443, "rougeL_recall": 0.12997760982624038, "rougeL_recall_stderr": 0.0015539465574021857, "rougeLsum_fmeasure": 0.10719224487960032, "rougeLsum_fmeasure_stderr": 0.0017665884689823603, "rougeLsum_precision": 0.1354477122353216, "rougeLsum_precision_stderr": 0.002537160964118554, "rougeLsum_recall": 0.11805775221027685, "rougeLsum_recall_stderr": 0.0015629478566191073}, "implicit-graph-description": {"bleu": 0.08806482552055464, "bleu_stderr": 0.013753939759917743, "rouge1_fmeasure": 0.04217421071874153, "rouge1_fmeasure_stderr": 0.0008234598917093276, "rouge1_precision": 0.02425622875877401, "rouge1_precision_stderr": 0.0005394400535560749, "rouge1_recall": 0.2187977886143418, "rouge1_recall_stderr": 0.002563987829673595, "rouge2_fmeasure": 0.004552592244101363, "rouge2_fmeasure_stderr": 0.0004257311158814129, "rouge2_precision": 0.0027418010825137774, "rouge2_precision_stderr": 0.00027769817357513153, "rouge2_recall": 0.020309096397945828, "rouge2_recall_stderr": 0.0012796421259529498, "rougeL_fmeasure": 0.041384238656935184, "rougeL_fmeasure_stderr": 0.0007632069889173777, "rougeL_precision": 0.02376285292107096, "rougeL_precision_stderr": 0.0004993494816843593, "rougeL_recall": 0.21647307133945715, "rougeL_recall_stderr": 0.002496433700435587, "rougeLsum_fmeasure": 0.029066972734033053, "rougeLsum_fmeasure_stderr": 0.0007033196865904391, "rougeLsum_precision": 0.016764423629076094, "rougeLsum_precision_stderr": 0.00046755444259852875, "rougeLsum_recall": 0.15365295702315013, "rougeLsum_recall_stderr": 0.0020731095703446366}, "non-explicit-description": {"bleu": 0.3002948882840799, "bleu_stderr": 0.014715720186430872, "rouge1_fmeasure": 0.04522889442637627, "rouge1_fmeasure_stderr": 0.0015737069173370269, "rouge1_precision": 0.027292817639688032, "rouge1_precision_stderr": 0.0011146302412515304, "rouge1_recall": 0.18687932979464777, "rouge1_recall_stderr": 0.004479317229289314, "rouge2_fmeasure": 0.012155444595988949, "rouge2_fmeasure_stderr": 0.0008835894771083748, "rouge2_precision": 0.007585884325607279, "rouge2_precision_stderr": 0.0006241186679155009, "rouge2_recall": 0.04765534435088357, "rouge2_recall_stderr": 0.002842447724510974, "rougeL_fmeasure": 0.04203637506264007, "rougeL_fmeasure_stderr": 0.0012876809656108756, "rougeL_precision": 0.025041692580921926, "rougeL_precision_stderr": 0.0008766166267586572, "rougeL_recall": 0.17988054876134196, "rougeL_recall_stderr": 0.004148939076846254, "rougeLsum_fmeasure": 0.03939048904516523, "rougeLsum_fmeasure_stderr": 0.0013252431846109201, "rougeLsum_precision": 0.02362548960154832, "rougeLsum_precision_stderr": 0.00092997017312708, "rougeLsum_recall": 0.16710354998948618, "rougeLsum_recall_stderr": 0.0038167321468611055}, "very-explicit-description": {"bleu": 0.0019246988010293315, "bleu_stderr": 0.0004673515089684523, "rouge1_fmeasure": 0.0314250857721497, "rouge1_fmeasure_stderr": 0.00044204337073046926, "rouge1_precision": 0.018405096713391255, "rouge1_precision_stderr": 0.00028575863010679416, "rouge1_recall": 0.13437768144660597, "rouge1_recall_stderr": 0.0012216701727659973, "rouge2_fmeasure": 1.477135016534563e-05, "rouge2_fmeasure_stderr": 7.496818160211679e-06, "rouge2_precision": 8.223475482087916e-06, "rouge2_precision_stderr": 4.15512150984225e-06, "rouge2_recall": 8.637745180723215e-05, "rouge2_recall_stderr": 5.0776197761980826e-05, "rougeL_fmeasure": 0.031419100488651396, "rougeL_fmeasure_stderr": 0.0004419332549890634, "rougeL_precision": 0.018401510662055136, "rougeL_precision_stderr": 0.00028569714825004946, "rougeL_recall": 0.13436090726457042, "rougeL_recall_stderr": 0.0012213468546436975, "rougeLsum_fmeasure": 0.021716982601686145, "rougeLsum_fmeasure_stderr": 0.00027816849062495334, "rougeLsum_precision": 0.01264230643724104, "rougeLsum_precision_stderr": 0.00017903922143252607, "rougeLsum_recall": 0.09751365313301781, "rougeLsum_recall_stderr": 0.0008310455100700755}}, "1": {"PALM_prompt": {"bleu": 0.43549317288896894, "bleu_stderr": 0.029438186163177924, "rouge1_fmeasure": 0.11204264924342298, "rouge1_fmeasure_stderr": 0.0018816879797699437, "rouge1_precision": 0.07179524471867899, "rouge1_precision_stderr": 0.0013752072779383184, "rouge1_recall": 0.35855038856048876, "rouge1_recall_stderr": 0.005049635846993475, "rouge2_fmeasure": 0.051345397484036256, "rouge2_fmeasure_stderr": 0.0011584347174914676, "rouge2_precision": 0.032801419147362856, "rouge2_precision_stderr": 0.0008197872630256377, "rouge2_recall": 0.1723852958712864, "rouge2_recall_stderr": 0.0035877516735487143, "rougeL_fmeasure": 0.10481665765175784, "rougeL_fmeasure_stderr": 0.0016825296642133658, "rougeL_precision": 0.06711558592890904, "rougeL_precision_stderr": 0.00122163800184802, "rougeL_recall": 0.33387823961526397, "rougeL_recall_stderr": 0.004537882130758048, "rougeLsum_fmeasure": 0.10653343348961458, "rougeLsum_fmeasure_stderr": 0.0017665097203702323, "rougeLsum_precision": 0.06830552553162243, "rougeLsum_precision_stderr": 0.001292240550330291, "rougeLsum_recall": 0.33957294457499176, "rougeLsum_recall_stderr": 0.004661961230072316}, "explicit-graph-description2": {"bleu": 2.2982278621261973, "bleu_stderr": 0.1086928153245287, "rouge1_fmeasure": 0.2328558522551123, "rouge1_fmeasure_stderr": 0.003900111659742201, "rouge1_precision": 0.20808691686715283, "rouge1_precision_stderr": 0.004313398059878784, "rouge1_recall": 0.4306521086221088, "rouge1_recall_stderr": 0.005516793126900347, "rouge2_fmeasure": 0.09334851918279623, "rouge2_fmeasure_stderr": 0.0025591605276398923, "rouge2_precision": 0.08181585393792677, "rouge2_precision_stderr": 0.0027980013134964224, "rouge2_recall": 0.1900831484868744, "rouge2_recall_stderr": 0.004381497415255648, "rougeL_fmeasure": 0.18783561618919367, "rougeL_fmeasure_stderr": 0.0029446786453837417, "rougeL_precision": 0.1667225810957368, "rougeL_precision_stderr": 0.003461815683539065, "rougeL_recall": 0.3693635934874014, "rougeL_recall_stderr": 0.004943896249868083, "rougeLsum_fmeasure": 0.20343772508503727, "rougeLsum_fmeasure_stderr": 0.0035163687455997497, "rougeLsum_precision": 0.1831100821976502, "rougeLsum_precision_stderr": 0.003960631509702234, "rougeLsum_recall": 0.3774482577828289, "rougeLsum_recall_stderr": 0.0050625094090084085}, "implicit-graph-description": {"bleu": 1.3977041826217982, "bleu_stderr": 0.06476957892774847, "rouge1_fmeasure": 0.13297509465908738, "rouge1_fmeasure_stderr": 0.0021274594259975206, "rouge1_precision": 0.08085175511474767, "rouge1_precision_stderr": 0.0015420226394619978, "rouge1_recall": 0.5398549553723303, "rouge1_recall_stderr": 0.004633252724794029, "rouge2_fmeasure": 0.057734516186082864, "rouge2_fmeasure_stderr": 0.0012735245538216635, "rouge2_precision": 0.03501440648141779, "rouge2_precision_stderr": 0.0008855447546426636, "rouge2_recall": 0.25426029625002416, "rouge2_recall_stderr": 0.004268932324515644, "rougeL_fmeasure": 0.11558954246536195, "rougeL_fmeasure_stderr": 0.001566596618343675, "rougeL_precision": 0.06952000055759049, "rougeL_precision_stderr": 0.0011181437346579404, "rougeL_recall": 0.49228292845501337, "rougeL_recall_stderr": 0.004312668887865199, "rougeLsum_fmeasure": 0.11581462372515029, "rougeLsum_fmeasure_stderr": 0.0019451873798571877, "rougeLsum_precision": 0.07046525801920696, "rougeLsum_precision_stderr": 0.001409866048243462, "rougeLsum_recall": 0.476402513292327, "rougeLsum_recall_stderr": 0.004517149863331296}, "non-explicit-description": {"bleu": 2.3174067250834143, "bleu_stderr": 0.1095420616112965, "rouge1_fmeasure": 0.26592346891155666, "rouge1_fmeasure_stderr": 0.0029378841494143377, "rouge1_precision": 0.18576332777291357, "rouge1_precision_stderr": 0.0029496821484972765, "rouge1_recall": 0.6724955092023494, "rouge1_recall_stderr": 0.004391893376238201, "rouge2_fmeasure": 0.12050338190039989, "rouge2_fmeasure_stderr": 0.0019700080370958755, "rouge2_precision": 0.08325319598690542, "rouge2_precision_stderr": 0.0017911862931407451, "rouge2_recall": 0.3291202433540325, "rouge2_recall_stderr": 0.004356879936719757, "rougeL_fmeasure": 0.2022817877685023, "rougeL_fmeasure_stderr": 0.002109071664874411, "rougeL_precision": 0.1397921610751568, "rougeL_precision_stderr": 0.0021793915810409932, "rougeL_recall": 0.5409023321464792, "rougeL_recall_stderr": 0.00450531183061488, "rougeLsum_fmeasure": 0.22586865290383526, "rougeLsum_fmeasure_stderr": 0.0025693268375682216, "rougeLsum_precision": 0.1579221423717992, "rougeLsum_precision_stderr": 0.0026163266317572958, "rougeLsum_recall": 0.5798479579428988, "rougeLsum_recall_stderr": 0.004227813926608466}, "very-explicit-description": {"bleu": 1.6637023692480732, "bleu_stderr": 0.07113198418739454, "rouge1_fmeasure": 0.18551745196543523, "rouge1_fmeasure_stderr": 0.002702377330670695, "rouge1_precision": 0.12318551179378552, "rouge1_precision_stderr": 0.00265250718788714, "rouge1_recall": 0.6558902866499821, "rouge1_recall_stderr": 0.004947345321902676, "rouge2_fmeasure": 0.08099215111185744, "rouge2_fmeasure_stderr": 0.0016729988751392074, "rouge2_precision": 0.05344001076519088, "rouge2_precision_stderr": 0.001521538259378062, "rouge2_recall": 0.31589673451602196, "rouge2_recall_stderr": 0.004512575556417795, "rougeL_fmeasure": 0.14267940599206763, "rougeL_fmeasure_stderr": 0.0019480493372883946, "rougeL_precision": 0.09390021055693351, "rougeL_precision_stderr": 0.0019816232535704525, "rougeL_recall": 0.5356505485725407, "rougeL_recall_stderr": 0.0047206486739923195, "rougeLsum_fmeasure": 0.16432822281253456, "rougeLsum_fmeasure_stderr": 0.002361497951165923, "rougeLsum_precision": 0.10862326603227693, "rougeLsum_precision_stderr": 0.0022972669057671545, "rougeLsum_recall": 0.5918746015837786, "rougeLsum_recall_stderr": 0.004841757394783012}}, "2": {"PALM_prompt": {"bleu": 0.4731158648079456, "bleu_stderr": 0.019398006063203924, "rouge1_fmeasure": 0.11565039220118124, "rouge1_fmeasure_stderr": 0.0017909510986326618, "rouge1_precision": 0.07373712610103833, "rouge1_precision_stderr": 0.0013232763617533117, "rouge1_recall": 0.3838400605745808, "rouge1_recall_stderr": 0.004971353366559517, "rouge2_fmeasure": 0.053828506115298144, "rouge2_fmeasure_stderr": 0.0011299593484305154, "rouge2_precision": 0.03418002818557043, "rouge2_precision_stderr": 0.0007999127016124942, "rouge2_recall": 0.18966804432678097, "rouge2_recall_stderr": 0.0037070132910163134, "rougeL_fmeasure": 0.10799852330681028, "rougeL_fmeasure_stderr": 0.0016342464145243179, "rougeL_precision": 0.0688866789263329, "rougeL_precision_stderr": 0.001197716696827759, "rougeL_recall": 0.35416301817745277, "rougeL_recall_stderr": 0.004408869173242153, "rougeLsum_fmeasure": 0.11029915615802689, "rougeLsum_fmeasure_stderr": 0.0017015090669507955, "rougeLsum_precision": 0.07036220911566307, "rougeLsum_precision_stderr": 0.0012542262936579098, "rougeLsum_recall": 0.3642675981143951, "rougeLsum_recall_stderr": 0.004634298128673839}, "explicit-graph-description2": {"bleu": 6.486000451731127, "bleu_stderr": 0.1760272135645943, "rouge1_fmeasure": 0.42431308600342277, "rouge1_fmeasure_stderr": 0.0041620247128580454, "rouge1_precision": 0.44907951348269975, "rouge1_precision_stderr": 0.005884356170552851, "rouge1_recall": 0.5373597445020493, "rouge1_recall_stderr": 0.004966707028771244, "rouge2_fmeasure": 0.23334399501205214, "rouge2_fmeasure_stderr": 0.003511605630791461, "rouge2_precision": 0.25059145216834844, "rouge2_precision_stderr": 0.004580817767902642, "rouge2_recall": 0.30420539737108304, "rouge2_recall_stderr": 0.004424545920380444, "rougeL_fmeasure": 0.3447121578849147, "rougeL_fmeasure_stderr": 0.0036720178726937637, "rougeL_precision": 0.3650320462167926, "rougeL_precision_stderr": 0.0051681070707463765, "rougeL_recall": 0.4461965468151835, "rougeL_recall_stderr": 0.004628706403782243, "rougeLsum_fmeasure": 0.37564451182458136, "rougeLsum_fmeasure_stderr": 0.0038397764850582553, "rougeLsum_precision": 0.39789891202282635, "rougeLsum_precision_stderr": 0.0053996273811364355, "rougeLsum_recall": 0.48008802419414803, "rougeLsum_recall_stderr": 0.00474893114047731}, "implicit-graph-description": {"bleu": 1.8053876932339799, "bleu_stderr": 0.0692633939605246, "rouge1_fmeasure": 0.15238609977285375, "rouge1_fmeasure_stderr": 0.0021050373299750613, "rouge1_precision": 0.0953355786305315, "rouge1_precision_stderr": 0.0018482318125930552, "rouge1_recall": 0.587435498341808, "rouge1_recall_stderr": 0.0044160353671122265, "rouge2_fmeasure": 0.07451843059126674, "rouge2_fmeasure_stderr": 0.0013538507376262556, "rouge2_precision": 0.04635353043989582, "rouge2_precision_stderr": 0.0011008091437590098, "rouge2_recall": 0.31351855748984986, "rouge2_recall_stderr": 0.0042413072919440875, "rougeL_fmeasure": 0.12964573051803532, "rougeL_fmeasure_stderr": 0.0015563770094126131, "rougeL_precision": 0.08027003022288653, "rougeL_precision_stderr": 0.0014089308706206984, "rougeL_recall": 0.5250978321957749, "rougeL_recall_stderr": 0.0044081811988050555, "rougeLsum_fmeasure": 0.13493557268343806, "rougeLsum_fmeasure_stderr": 0.0019537513800864143, "rougeLsum_precision": 0.08462110334448762, "rougeLsum_precision_stderr": 0.0017266673212118049, "rougeLsum_recall": 0.5256991157959195, "rougeLsum_recall_stderr": 0.004362166200252256}, "non-explicit-description": {"bleu": 2.8194042853676433, "bleu_stderr": 0.08001070612632513, "rouge1_fmeasure": 0.27845169302997186, "rouge1_fmeasure_stderr": 0.0031924386042951483, "rouge1_precision": 0.2102247428366051, "rouge1_precision_stderr": 0.003936648833938165, "rouge1_recall": 0.6673737234050499, "rouge1_recall_stderr": 0.004469275663366487, "rouge2_fmeasure": 0.13465046885050352, "rouge2_fmeasure_stderr": 0.0022013696056236006, "rouge2_precision": 0.10221556389262312, "rouge2_precision_stderr": 0.002526456468616791, "rouge2_recall": 0.3481713656770689, "rouge2_recall_stderr": 0.004502228925703636, "rougeL_fmeasure": 0.21334136742049684, "rougeL_fmeasure_stderr": 0.0024025933828259196, "rougeL_precision": 0.16004846098807926, "rougeL_precision_stderr": 0.003086567397610655, "rougeL_recall": 0.5368811685423543, "rougeL_recall_stderr": 0.00452435180901271, "rougeLsum_fmeasure": 0.24099516029436527, "rougeLsum_fmeasure_stderr": 0.0028365419353728666, "rougeLsum_precision": 0.18189899956371253, "rougeLsum_precision_stderr": 0.0034717018098753866, "rougeLsum_recall": 0.5853292242838097, "rougeLsum_recall_stderr": 0.004328558737604698}, "very-explicit-description": {"bleu": 2.647805827243435, "bleu_stderr": 0.07629701303117106, "rouge1_fmeasure": 0.2377267859934131, "rouge1_fmeasure_stderr": 0.0028511858947845655, "rouge1_precision": 0.1666446893829218, "rouge1_precision_stderr": 0.0032707753020478673, "rouge1_recall": 0.6729567321222975, "rouge1_recall_stderr": 0.004305012822265882, "rouge2_fmeasure": 0.11449533899405696, "rouge2_fmeasure_stderr": 0.0019357009062000538, "rouge2_precision": 0.08075845494399067, "rouge2_precision_stderr": 0.0020915699463510836, "rouge2_recall": 0.35411093297637475, "rouge2_recall_stderr": 0.004506419345004118, "rougeL_fmeasure": 0.1809798890103334, "rougeL_fmeasure_stderr": 0.002186958670887679, "rougeL_precision": 0.12643098669316738, "rougeL_precision_stderr": 0.0025705410191854284, "rougeL_recall": 0.5356644614517825, "rougeL_recall_stderr": 0.0043565730389747955, "rougeLsum_fmeasure": 0.21219482060830694, "rougeLsum_fmeasure_stderr": 0.002515047834126115, "rougeLsum_precision": 0.1479261397803003, "rougeLsum_precision_stderr": 0.0028341939795893155, "rougeLsum_recall": 0.6104555998853953, "rougeLsum_recall_stderr": 0.004304349996355649}}, "3": {"PALM_prompt": {"bleu": 0.511033492138013, "bleu_stderr": 0.016836817368392938, "rouge1_fmeasure": 0.11234455309812195, "rouge1_fmeasure_stderr": 0.001658068318944836, "rouge1_precision": 0.0711394556954671, "rouge1_precision_stderr": 0.0012159991412829965, "rouge1_recall": 0.3852018338546001, "rouge1_recall_stderr": 0.005067790324136812, "rouge2_fmeasure": 0.051724489676439236, "rouge2_fmeasure_stderr": 0.0010463045011873814, "rouge2_precision": 0.03260081169248259, "rouge2_precision_stderr": 0.0007319564987104789, "rouge2_recall": 0.18947442868920766, "rouge2_recall_stderr": 0.0037625536404537635, "rougeL_fmeasure": 0.10462160106353212, "rougeL_fmeasure_stderr": 0.0015374663592284314, "rougeL_precision": 0.06636573472304302, "rougeL_precision_stderr": 0.0011254814997386617, "rougeL_recall": 0.3527863813991901, "rougeL_recall_stderr": 0.004403124502538447, "rougeLsum_fmeasure": 0.10692341362108594, "rougeLsum_fmeasure_stderr": 0.001592257988435529, "rougeLsum_precision": 0.0678291021786637, "rougeLsum_precision_stderr": 0.0011705423632374366, "rougeLsum_recall": 0.36350690371493805, "rougeLsum_recall_stderr": 0.004671323997080324}, "explicit-graph-description2": {"bleu": 8.592270404743596, "bleu_stderr": 0.3478687773942717, "rouge1_fmeasure": 0.46569868214556953, "rouge1_fmeasure_stderr": 0.0039167763038128744, "rouge1_precision": 0.5092720807541621, "rouge1_precision_stderr": 0.005761992782925212, "rouge1_recall": 0.5373968350772912, "rouge1_recall_stderr": 0.004897334207979844, "rouge2_fmeasure": 0.2613220581162864, "rouge2_fmeasure_stderr": 0.00360096423581038, "rouge2_precision": 0.2902018344321373, "rouge2_precision_stderr": 0.004714799020672444, "rouge2_recall": 0.3073000070344161, "rouge2_recall_stderr": 0.004371879453082279, "rougeL_fmeasure": 0.3742438056269717, "rougeL_fmeasure_stderr": 0.0035350046513184163, "rougeL_precision": 0.4098376883485247, "rougeL_precision_stderr": 0.005103095930707823, "rougeL_recall": 0.43837950032145484, "rougeL_recall_stderr": 0.004496609058976664, "rougeLsum_fmeasure": 0.4099104459367334, "rougeLsum_fmeasure_stderr": 0.0036009459176647974, "rougeLsum_precision": 0.4492052702368639, "rougeLsum_precision_stderr": 0.005309452409480573, "rougeLsum_recall": 0.47692163068169213, "rougeLsum_recall_stderr": 0.004649900767270472}, "implicit-graph-description": {"bleu": 1.9133849288269293, "bleu_stderr": 0.04864213509199311, "rouge1_fmeasure": 0.16402787501046193, "rouge1_fmeasure_stderr": 0.0023163623544869127, "rouge1_precision": 0.10564975683529444, "rouge1_precision_stderr": 0.002213084642274899, "rouge1_recall": 0.590100549099999, "rouge1_recall_stderr": 0.00427885948704963, "rouge2_fmeasure": 0.08217622838318879, "rouge2_fmeasure_stderr": 0.0014861030323633745, "rouge2_precision": 0.052782129610134225, "rouge2_precision_stderr": 0.0013492091587997011, "rouge2_recall": 0.3231207925990761, "rouge2_recall_stderr": 0.004256840681994388, "rougeL_fmeasure": 0.13744565617599114, "rougeL_fmeasure_stderr": 0.0017570257873037582, "rougeL_precision": 0.08753763146321639, "rougeL_precision_stderr": 0.001717519449731679, "rougeL_recall": 0.520111059713261, "rougeL_recall_stderr": 0.004351158002319349, "rougeLsum_fmeasure": 0.1459665149392452, "rougeLsum_fmeasure_stderr": 0.0021389387676408454, "rougeLsum_precision": 0.09389990009122845, "rougeLsum_precision_stderr": 0.0019967608416885307, "rougeLsum_recall": 0.5308296127486624, "rougeLsum_recall_stderr": 0.00420828920591214}, "non-explicit-description": {"bleu": 3.0104483725602478, "bleu_stderr": 0.08167667230226254, "rouge1_fmeasure": 0.2895578361728388, "rouge1_fmeasure_stderr": 0.003271301388482767, "rouge1_precision": 0.2214105510553713, "rouge1_precision_stderr": 0.00397852038777946, "rouge1_recall": 0.6470607779127537, "rouge1_recall_stderr": 0.004406369519857878, "rouge2_fmeasure": 0.14344396360996736, "rouge2_fmeasure_stderr": 0.0023093129067051307, "rouge2_precision": 0.10995468701919953, "rouge2_precision_stderr": 0.0025990883422240055, "rouge2_recall": 0.34262834959953414, "rouge2_recall_stderr": 0.004384132009223851, "rougeL_fmeasure": 0.22148965895132458, "rougeL_fmeasure_stderr": 0.0025138893644262995, "rougeL_precision": 0.16816306750707308, "rougeL_precision_stderr": 0.0031335861563326884, "rougeL_recall": 0.5184873981262235, "rougeL_recall_stderr": 0.004441211075979107, "rougeLsum_fmeasure": 0.2517396049055751, "rougeLsum_fmeasure_stderr": 0.002894872783585922, "rougeLsum_precision": 0.19252227804988106, "rougeLsum_precision_stderr": 0.003532058484150838, "rougeLsum_recall": 0.5689593244437532, "rougeLsum_recall_stderr": 0.004202078292368127}, "very-explicit-description": {"bleu": 2.7650889809369867, "bleu_stderr": 0.08861961620174373, "rouge1_fmeasure": 0.23731957047929436, "rouge1_fmeasure_stderr": 0.002799505256926228, "rouge1_precision": 0.16348222985039676, "rouge1_precision_stderr": 0.003146872550209196, "rouge1_recall": 0.6750999942571687, "rouge1_recall_stderr": 0.004231543295192062, "rouge2_fmeasure": 0.11630716134029079, "rouge2_fmeasure_stderr": 0.0019838199857175193, "rouge2_precision": 0.08053104845339831, "rouge2_precision_stderr": 0.002115424621313934, "rouge2_recall": 0.361316044646828, "rouge2_recall_stderr": 0.004498247573306673, "rougeL_fmeasure": 0.1784341269505221, "rougeL_fmeasure_stderr": 0.0021166909618823932, "rougeL_precision": 0.12237541399150458, "rougeL_precision_stderr": 0.0024629041187135487, "rougeL_recall": 0.533219718242574, "rougeL_recall_stderr": 0.004361069891506727, "rougeLsum_fmeasure": 0.2132548329175755, "rougeLsum_fmeasure_stderr": 0.0024974068122956754, "rougeLsum_precision": 0.14659271750171685, "rougeLsum_precision_stderr": 0.002803961686626933, "rougeLsum_recall": 0.6147977963499615, "rougeLsum_recall_stderr": 0.004227824501478167}}, "4": {"PALM_prompt": {"bleu": 0.5255384435057461, "bleu_stderr": 0.03418580070894041, "rouge1_fmeasure": 0.11520145885524227, "rouge1_fmeasure_stderr": 0.0016544612830049232, "rouge1_precision": 0.07289393265058683, "rouge1_precision_stderr": 0.001221203882596679, "rouge1_recall": 0.39210555607501246, "rouge1_recall_stderr": 0.004907654599821545, "rouge2_fmeasure": 0.052942763106877684, "rouge2_fmeasure_stderr": 0.0010297898091120672, "rouge2_precision": 0.03333523314795858, "rouge2_precision_stderr": 0.0007290358506575526, "rouge2_recall": 0.19457648128168623, "rouge2_recall_stderr": 0.003616303249775819, "rougeL_fmeasure": 0.10655328646588215, "rougeL_fmeasure_stderr": 0.0014999277772535491, "rougeL_precision": 0.06752250125401707, "rougeL_precision_stderr": 0.0011086146136256522, "rougeL_recall": 0.35896707400416245, "rougeL_recall_stderr": 0.004279012704408649, "rougeLsum_fmeasure": 0.10971668045319283, "rougeLsum_fmeasure_stderr": 0.001577124609883823, "rougeLsum_precision": 0.06949836967046885, "rougeLsum_precision_stderr": 0.0011669918564828742, "rougeLsum_recall": 0.3713609435685992, "rougeLsum_recall_stderr": 0.004540169352829801}, "explicit-graph-description2": {"bleu": 9.748227001648862, "bleu_stderr": 0.30465771546881343, "rouge1_fmeasure": 0.4679567762670693, "rouge1_fmeasure_stderr": 0.003781568636086802, "rouge1_precision": 0.5265934478163272, "rouge1_precision_stderr": 0.005660483553931701, "rouge1_recall": 0.5179234781275224, "rouge1_recall_stderr": 0.004868884431211911, "rouge2_fmeasure": 0.2637473868679139, "rouge2_fmeasure_stderr": 0.0035396559441471915, "rouge2_precision": 0.3011205615910547, "rouge2_precision_stderr": 0.004723582948062983, "rouge2_recall": 0.29876898581754224, "rouge2_recall_stderr": 0.004360553531900465, "rougeL_fmeasure": 0.3813739343787495, "rougeL_fmeasure_stderr": 0.003540381010340878, "rougeL_precision": 0.42896446744335276, "rougeL_precision_stderr": 0.005075014781599364, "rougeL_recall": 0.42738457538098473, "rougeL_recall_stderr": 0.0045810446815337195, "rougeLsum_fmeasure": 0.4136944072158051, "rougeLsum_fmeasure_stderr": 0.0035754128454684865, "rougeLsum_precision": 0.4663362213990527, "rougeLsum_precision_stderr": 0.005289062452425001, "rougeLsum_recall": 0.4618244358692011, "rougeLsum_recall_stderr": 0.004696080984340952}, "implicit-graph-description": {"bleu": 1.869050618620975, "bleu_stderr": 0.05010088619712935, "rouge1_fmeasure": 0.17069557641057814, "rouge1_fmeasure_stderr": 0.0023989321755758107, "rouge1_precision": 0.11579207158486948, "rouge1_precision_stderr": 0.0027602287304337434, "rouge1_recall": 0.5769641854763163, "rouge1_recall_stderr": 0.004366449526449767, "rouge2_fmeasure": 0.08649984259029919, "rouge2_fmeasure_stderr": 0.0015781356762232315, "rouge2_precision": 0.058968068407052966, "rouge2_precision_stderr": 0.0017140614292096468, "rouge2_recall": 0.316819162619288, "rouge2_recall_stderr": 0.004169288630207608, "rougeL_fmeasure": 0.14186920549480303, "rougeL_fmeasure_stderr": 0.0018314233516765332, "rougeL_precision": 0.09510940810876867, "rougeL_precision_stderr": 0.002143422239902668, "rougeL_recall": 0.5023698294671003, "rougeL_recall_stderr": 0.004376544901142601, "rougeLsum_fmeasure": 0.15217682563526053, "rougeLsum_fmeasure_stderr": 0.0021887647918235626, "rougeLsum_precision": 0.103240891734108, "rougeLsum_precision_stderr": 0.0024728914583356716, "rougeLsum_recall": 0.5194520315283727, "rougeLsum_recall_stderr": 0.004312781932079048}, "non-explicit-description": {"bleu": 3.015666054890952, "bleu_stderr": 0.06037288426011814, "rouge1_fmeasure": 0.2938502007499035, "rouge1_fmeasure_stderr": 0.003471270572664401, "rouge1_precision": 0.23364828958458597, "rouge1_precision_stderr": 0.004386673691400771, "rouge1_recall": 0.6182500596257372, "rouge1_recall_stderr": 0.0044847981647693925, "rouge2_fmeasure": 0.14528102357273256, "rouge2_fmeasure_stderr": 0.002520152138311905, "rouge2_precision": 0.11633321819129998, "rouge2_precision_stderr": 0.0029245832979215612, "rouge2_recall": 0.32548790345098827, "rouge2_recall_stderr": 0.004370629409800285, "rougeL_fmeasure": 0.22758047906242576, "rougeL_fmeasure_stderr": 0.0028386957066170496, "rougeL_precision": 0.18086003912409268, "rougeL_precision_stderr": 0.003662637084531143, "rougeL_recall": 0.49642367395924064, "rougeL_recall_stderr": 0.004453695802686763, "rougeLsum_fmeasure": 0.25592564085521324, "rougeLsum_fmeasure_stderr": 0.003081730914660469, "rougeLsum_precision": 0.20359868681678098, "rougeLsum_precision_stderr": 0.0039290266599936495, "rougeLsum_recall": 0.5457174954752654, "rougeLsum_recall_stderr": 0.004285590260813723}, "very-explicit-description": {"bleu": 2.618154777450835, "bleu_stderr": 0.07956567939253502, "rouge1_fmeasure": 0.23104727016762644, "rouge1_fmeasure_stderr": 0.0028026542146094534, "rouge1_precision": 0.1597629414302839, "rouge1_precision_stderr": 0.0033033901216669354, "rouge1_recall": 0.6823730805286056, "rouge1_recall_stderr": 0.004113105685624019, "rouge2_fmeasure": 0.11317180592447942, "rouge2_fmeasure_stderr": 0.0019371364795214888, "rouge2_precision": 0.07889136187736123, "rouge2_precision_stderr": 0.0021839099115212703, "rouge2_recall": 0.36533837647073053, "rouge2_recall_stderr": 0.004382971120376491, "rougeL_fmeasure": 0.17336408160930467, "rougeL_fmeasure_stderr": 0.0020829759705918555, "rougeL_precision": 0.11952172602503129, "rougeL_precision_stderr": 0.0025993702279697504, "rougeL_recall": 0.5378383490680047, "rougeL_recall_stderr": 0.0042111474429222555, "rougeLsum_fmeasure": 0.2079495191882183, "rougeLsum_fmeasure_stderr": 0.0024838961950283595, "rougeLsum_precision": 0.14312638913068235, "rougeLsum_precision_stderr": 0.0029110532892333936, "rougeLsum_recall": 0.6230515627846621, "rougeLsum_recall_stderr": 0.004122693884939311}}, "5": {"PALM_prompt": {"bleu": 0.5823615010118224, "bleu_stderr": 0.037398869921054644, "rouge1_fmeasure": 0.11639260450950206, "rouge1_fmeasure_stderr": 0.0016495247557604747, "rouge1_precision": 0.07346269739111895, "rouge1_precision_stderr": 0.0012187681049390564, "rouge1_recall": 0.4042470714837263, "rouge1_recall_stderr": 0.005065518163952181, "rouge2_fmeasure": 0.054089458597439195, "rouge2_fmeasure_stderr": 0.0010444166935940533, "rouge2_precision": 0.033940599163428696, "rouge2_precision_stderr": 0.0007391477550345831, "rouge2_recall": 0.20304234761076909, "rouge2_recall_stderr": 0.003764961452207978, "rougeL_fmeasure": 0.10741878654019481, "rougeL_fmeasure_stderr": 0.0015017472778460627, "rougeL_precision": 0.06792933252591714, "rougeL_precision_stderr": 0.0011100797550413972, "rougeL_recall": 0.3687266029366595, "rougeL_recall_stderr": 0.004388331338140494, "rougeLsum_fmeasure": 0.11057542829413115, "rougeLsum_fmeasure_stderr": 0.0015808583588389114, "rougeLsum_precision": 0.06991029736225668, "rougeLsum_precision_stderr": 0.001173008019162227, "rougeLsum_recall": 0.38168151833425595, "rougeLsum_recall_stderr": 0.00467124723267976}, "explicit-graph-description2": {"bleu": 11.081728167045133, "bleu_stderr": 0.32013487215713093, "rouge1_fmeasure": 0.48448665725831064, "rouge1_fmeasure_stderr": 0.003823220765053316, "rouge1_precision": 0.5462360493800595, "rouge1_precision_stderr": 0.005571448210438329, "rouge1_recall": 0.5208556355840253, "rouge1_recall_stderr": 0.0048319172639873075, "rouge2_fmeasure": 0.2771416755952798, "rouge2_fmeasure_stderr": 0.0037021003433152576, "rouge2_precision": 0.3165259707878258, "rouge2_precision_stderr": 0.004816977889561416, "rouge2_recall": 0.3048098435714976, "rouge2_recall_stderr": 0.004476193519328458, "rougeL_fmeasure": 0.3962223744993141, "rougeL_fmeasure_stderr": 0.0035914467869508364, "rougeL_precision": 0.44640610204674275, "rougeL_precision_stderr": 0.005009545091352138, "rougeL_recall": 0.4305349369311683, "rougeL_recall_stderr": 0.004579985246877059, "rougeLsum_fmeasure": 0.4284939067785979, "rougeLsum_fmeasure_stderr": 0.0035897376609210704, "rougeLsum_precision": 0.4831117938397976, "rougeLsum_precision_stderr": 0.005150478425110863, "rougeLsum_recall": 0.46459910587278797, "rougeLsum_recall_stderr": 0.004669405569917108}, "implicit-graph-description": {"bleu": 1.8835135529548273, "bleu_stderr": 0.06110654983373539, "rouge1_fmeasure": 0.17411810979908632, "rouge1_fmeasure_stderr": 0.0025066282022309615, "rouge1_precision": 0.11973455465296313, "rouge1_precision_stderr": 0.002935618771876205, "rouge1_recall": 0.5670989994254594, "rouge1_recall_stderr": 0.0044381978706761, "rouge2_fmeasure": 0.08905956074973984, "rouge2_fmeasure_stderr": 0.0016585801215981953, "rouge2_precision": 0.06166786996995915, "rouge2_precision_stderr": 0.0018761458565112188, "rouge2_recall": 0.31541818233678315, "rouge2_recall_stderr": 0.0042975296956600266, "rougeL_fmeasure": 0.14380871166547562, "rougeL_fmeasure_stderr": 0.0019567050216977417, "rougeL_precision": 0.09825615481667112, "rougeL_precision_stderr": 0.0023871232471909232, "rougeL_recall": 0.48839020581913895, "rougeL_recall_stderr": 0.004392795682637931, "rougeLsum_fmeasure": 0.15569229150500444, "rougeLsum_fmeasure_stderr": 0.00226672543918796, "rougeLsum_precision": 0.10703554238169732, "rougeLsum_precision_stderr": 0.002636419782384992, "rougeLsum_recall": 0.5126878599494855, "rougeLsum_recall_stderr": 0.004346737958136646}, "non-explicit-description": {"bleu": 3.0967052765116168, "bleu_stderr": 0.11202147855167385, "rouge1_fmeasure": 0.2956946057982067, "rouge1_fmeasure_stderr": 0.0036965817626566813, "rouge1_precision": 0.24397794795287123, "rouge1_precision_stderr": 0.004784117876438954, "rouge1_recall": 0.6048647467371622, "rouge1_recall_stderr": 0.004654471513157018, "rouge2_fmeasure": 0.14969245864050304, "rouge2_fmeasure_stderr": 0.0027172096623066288, "rouge2_precision": 0.12634347401889862, "rouge2_precision_stderr": 0.00332524580607351, "rouge2_recall": 0.32128383108894604, "rouge2_recall_stderr": 0.004439902579230902, "rougeL_fmeasure": 0.22972535325752128, "rougeL_fmeasure_stderr": 0.003016005286453144, "rougeL_precision": 0.19006924520843738, "rougeL_precision_stderr": 0.0040021127107316675, "rougeL_recall": 0.48595046650876467, "rougeL_recall_stderr": 0.0044648101430210824, "rougeLsum_fmeasure": 0.2582096378416781, "rougeLsum_fmeasure_stderr": 0.0033136409122061722, "rougeLsum_precision": 0.21334505184004257, "rougeLsum_precision_stderr": 0.0043020289632197005, "rougeLsum_recall": 0.5333276897372095, "rougeLsum_recall_stderr": 0.004389519168542818}, "very-explicit-description": {"bleu": 2.615370951594709, "bleu_stderr": 0.07817523000072259, "rouge1_fmeasure": 0.2261504763429586, "rouge1_fmeasure_stderr": 0.0025955875570617386, "rouge1_precision": 0.14955217982650695, "rouge1_precision_stderr": 0.0027686157067858337, "rouge1_recall": 0.6935554112517629, "rouge1_recall_stderr": 0.003988293582524454, "rouge2_fmeasure": 0.11088772044085522, "rouge2_fmeasure_stderr": 0.001779337380104819, "rouge2_precision": 0.07346044534208657, "rouge2_precision_stderr": 0.0018168270456380426, "rouge2_recall": 0.3742165028406123, "rouge2_recall_stderr": 0.004409012138004924, "rougeL_fmeasure": 0.16925176967777375, "rougeL_fmeasure_stderr": 0.001942005609299536, "rougeL_precision": 0.11122194025511048, "rougeL_precision_stderr": 0.0021369513975720436, "rougeL_recall": 0.5461500634585164, "rougeL_recall_stderr": 0.004179927239919983, "rougeLsum_fmeasure": 0.2039003781849349, "rougeLsum_fmeasure_stderr": 0.002364317599492143, "rougeLsum_precision": 0.134551132386729, "rougeLsum_precision_stderr": 0.002499557879854736, "rougeLsum_recall": 0.6321720185352039, "rougeLsum_recall_stderr": 0.003993926482242108}}}, "GEM/wiki_lingua_en": {"0": {"article_summary_en": {"bleu": 1.7901688600266696, "bleu_stderr": 0.09473494560163237, "rouge1_fmeasure": 0.19551467871244005, "rouge1_fmeasure_stderr": 0.0019193758332128445, "rouge1_precision": 0.1646736204497174, "rouge1_precision_stderr": 0.001944375965208406, "rouge1_recall": 0.28908804331097093, "rouge1_recall_stderr": 0.0027751587359995016, "rouge2_fmeasure": 0.0401546744710191, "rouge2_fmeasure_stderr": 0.0008933850730994426, "rouge2_precision": 0.033596976210297794, "rouge2_precision_stderr": 0.0007783779535836992, "rouge2_recall": 0.06189607439452786, "rouge2_recall_stderr": 0.0015755697883312382, "rougeL_fmeasure": 0.13948475018165474, "rougeL_fmeasure_stderr": 0.001256562390447179, "rougeL_precision": 0.1159536094809458, "rougeL_precision_stderr": 0.0012430606328199091, "rougeL_recall": 0.21237606965032682, "rougeL_recall_stderr": 0.0021178449235415907, "rougeLsum_fmeasure": 0.18192377197057139, "rougeLsum_fmeasure_stderr": 0.001775025387207881, "rougeLsum_precision": 0.15307248616009633, "rougeLsum_precision_stderr": 0.001797771989893273, "rougeLsum_recall": 0.2697257608869243, "rougeLsum_recall_stderr": 0.002597333354461373}, "rephrase_en": {"bleu": 0.5845357736513314, "bleu_stderr": 0.03072266241415454, "rouge1_fmeasure": 0.0881869594314063, "rouge1_fmeasure_stderr": 0.0014841987809875977, "rouge1_precision": 0.07643821536141272, "rouge1_precision_stderr": 0.00140218679541492, "rouge1_recall": 0.12653305762501957, "rouge1_recall_stderr": 0.0021916544609098315, "rouge2_fmeasure": 0.011775575519769068, "rouge2_fmeasure_stderr": 0.0005303096103283162, "rouge2_precision": 0.00991125872816128, "rouge2_precision_stderr": 0.0004535614337776502, "rouge2_recall": 0.018159423646898012, "rouge2_recall_stderr": 0.0009381758127412718, "rougeL_fmeasure": 0.07869794174000905, "rougeL_fmeasure_stderr": 0.0012488046435799164, "rougeL_precision": 0.06754639878793735, "rougeL_precision_stderr": 0.0011484776442586243, "rougeL_recall": 0.11490796429365001, "rougeL_recall_stderr": 0.00197654999272724, "rougeLsum_fmeasure": 0.08220909707902971, "rougeLsum_fmeasure_stderr": 0.00136877406788079, "rougeLsum_precision": 0.0710963976259558, "rougeLsum_precision_stderr": 0.0012842232965217812, "rougeLsum_recall": 0.11843767033976603, "rougeLsum_recall_stderr": 0.002056362191261535}, "summarize_above_en": {"bleu": 0.6707040144799241, "bleu_stderr": 0.03623222494925353, "rouge1_fmeasure": 0.12655297110315525, "rouge1_fmeasure_stderr": 0.0017223356296797345, "rouge1_precision": 0.11663618257355224, "rouge1_precision_stderr": 0.0020010876313614093, "rouge1_recall": 0.17750836507313886, "rouge1_recall_stderr": 0.002420755888491993, "rouge2_fmeasure": 0.017208857113077958, "rouge2_fmeasure_stderr": 0.0006040791246522063, "rouge2_precision": 0.01645523569334117, "rouge2_precision_stderr": 0.0007281743616814746, "rouge2_recall": 0.025103051974347854, "rouge2_recall_stderr": 0.0010138593670258253, "rougeL_fmeasure": 0.1047005055608947, "rougeL_fmeasure_stderr": 0.0013001810997754814, "rougeL_precision": 0.09528496641012617, "rougeL_precision_stderr": 0.0015116946024390716, "rougeL_recall": 0.14985724685972096, "rougeL_recall_stderr": 0.0019870170906742244, "rougeLsum_fmeasure": 0.11665980577339671, "rougeLsum_fmeasure_stderr": 0.001570131237321446, "rougeLsum_precision": 0.1070586643291757, "rougeLsum_precision_stderr": 0.0018157291974967367, "rougeLsum_recall": 0.164451689142304, "rougeLsum_recall_stderr": 0.002238195137110574}, "tldr_en": {"bleu": 1.4552750330972206, "bleu_stderr": 0.05646997136025405, "rouge1_fmeasure": 0.17104966216653122, "rouge1_fmeasure_stderr": 0.0018388410541572498, "rouge1_precision": 0.1463073350919793, "rouge1_precision_stderr": 0.001849976731473478, "rouge1_recall": 0.24687692205659445, "rouge1_recall_stderr": 0.0026245416604448834, "rouge2_fmeasure": 0.03393757157227001, "rouge2_fmeasure_stderr": 0.0008395819605445009, "rouge2_precision": 0.02868070817665277, "rouge2_precision_stderr": 0.0007455100631059865, "rouge2_recall": 0.050970339370236116, "rouge2_recall_stderr": 0.0013773850589296726, "rougeL_fmeasure": 0.13431002294362815, "rougeL_fmeasure_stderr": 0.001329252784566218, "rougeL_precision": 0.1134975332744349, "rougeL_precision_stderr": 0.0013046086784248739, "rougeL_recall": 0.19882847469371265, "rougeL_recall_stderr": 0.002160779528818444, "rougeLsum_fmeasure": 0.1570085078363825, "rougeLsum_fmeasure_stderr": 0.0016723834304551517, "rougeLsum_precision": 0.1341572569615249, "rougeLsum_precision_stderr": 0.001680913399139742, "rougeLsum_recall": 0.22735010800139077, "rougeLsum_recall_stderr": 0.002429214690741554}, "write_abstract_en": {"bleu": 0.9470864610845786, "bleu_stderr": 0.03249798513588096, "rouge1_fmeasure": 0.1242136099389791, "rouge1_fmeasure_stderr": 0.0016788768510989007, "rouge1_precision": 0.10754772441287547, "rouge1_precision_stderr": 0.0016958455385689898, "rouge1_recall": 0.17816562206177886, "rouge1_recall_stderr": 0.0023545814766439887, "rouge2_fmeasure": 0.018163595558952392, "rouge2_fmeasure_stderr": 0.0006156401137280107, "rouge2_precision": 0.015527269147006644, "rouge2_precision_stderr": 0.0005591631732568687, "rouge2_recall": 0.027484473420738165, "rouge2_recall_stderr": 0.0010336047843406612, "rougeL_fmeasure": 0.10783351661135016, "rougeL_fmeasure_stderr": 0.0013120711524955912, "rougeL_precision": 0.09256520381779812, "rougeL_precision_stderr": 0.001314822114051182, "rougeL_recall": 0.15709670414112983, "rougeL_recall_stderr": 0.0019776606072259206, "rougeLsum_fmeasure": 0.11523321457897195, "rougeLsum_fmeasure_stderr": 0.0015382866944637125, "rougeLsum_precision": 0.09949748623454367, "rougeLsum_precision_stderr": 0.0015502911503945975, "rougeLsum_recall": 0.16623114509571557, "rougeLsum_recall_stderr": 0.002196915069674266}}, "1": {"article_summary_en": {"bleu": 2.1642687288297267, "bleu_stderr": 0.03609118184667803, "rouge1_fmeasure": 0.19965649147362877, "rouge1_fmeasure_stderr": 0.001954107478734798, "rouge1_precision": 0.18024946393550403, "rouge1_precision_stderr": 0.0023365148116592273, "rouge1_recall": 0.2858663200405377, "rouge1_recall_stderr": 0.0028577355675838095, "rouge2_fmeasure": 0.04582510801169016, "rouge2_fmeasure_stderr": 0.0009888177785209124, "rouge2_precision": 0.04273282245742925, "rouge2_precision_stderr": 0.0012020669128774719, "rouge2_recall": 0.06727475322952328, "rouge2_recall_stderr": 0.0015736861230501358, "rougeL_fmeasure": 0.14528886846186012, "rougeL_fmeasure_stderr": 0.0013520944350136044, "rougeL_precision": 0.1309377911100276, "rougeL_precision_stderr": 0.0017619039502037427, "rougeL_recall": 0.2128779538670664, "rougeL_recall_stderr": 0.0022235556075845303, "rougeLsum_fmeasure": 0.18583509127014775, "rougeLsum_fmeasure_stderr": 0.0018160956867022732, "rougeLsum_precision": 0.16786109069022864, "rougeLsum_precision_stderr": 0.002201832250644331, "rougeLsum_recall": 0.2665923708356373, "rougeLsum_recall_stderr": 0.0026751422180246536}, "rephrase_en": {"bleu": 1.6714920428051223, "bleu_stderr": 0.07493178085444238, "rouge1_fmeasure": 0.15296390666938464, "rouge1_fmeasure_stderr": 0.0019322995775618457, "rouge1_precision": 0.13347948856316436, "rouge1_precision_stderr": 0.00190203694307572, "rouge1_recall": 0.21739906685237975, "rouge1_recall_stderr": 0.002808744726060509, "rouge2_fmeasure": 0.028480082558715494, "rouge2_fmeasure_stderr": 0.000844119944335723, "rouge2_precision": 0.02419402358312838, "rouge2_precision_stderr": 0.0007370518433472602, "rouge2_recall": 0.042662063621771204, "rouge2_recall_stderr": 0.001398060091307213, "rougeL_fmeasure": 0.11635297169383609, "rougeL_fmeasure_stderr": 0.0013711527236179993, "rougeL_precision": 0.10050917437006718, "rougeL_precision_stderr": 0.0013279033129049013, "rougeL_recall": 0.16941427074493215, "rougeL_recall_stderr": 0.002206464398514822, "rougeLsum_fmeasure": 0.14255020342356567, "rougeLsum_fmeasure_stderr": 0.0017856582246053023, "rougeLsum_precision": 0.12419910398685438, "rougeLsum_precision_stderr": 0.001754709084087944, "rougeLsum_recall": 0.20345233530607656, "rougeLsum_recall_stderr": 0.0026368733023479575}, "summarize_above_en": {"bleu": 1.7421869697677301, "bleu_stderr": 0.09072404254874503, "rouge1_fmeasure": 0.175746351793284, "rouge1_fmeasure_stderr": 0.0019277453775472107, "rouge1_precision": 0.1538305631637577, "rouge1_precision_stderr": 0.002118137735001832, "rouge1_recall": 0.2534164331356793, "rouge1_recall_stderr": 0.0027384226381832084, "rouge2_fmeasure": 0.03420510386300404, "rouge2_fmeasure_stderr": 0.0008827127864692935, "rouge2_precision": 0.030776430413685576, "rouge2_precision_stderr": 0.0009835988394066894, "rouge2_recall": 0.05014575490604494, "rouge2_recall_stderr": 0.0014072381348858034, "rougeL_fmeasure": 0.13274190810557143, "rougeL_fmeasure_stderr": 0.0013509301692574175, "rougeL_precision": 0.11549602941828332, "rougeL_precision_stderr": 0.0015598015652852524, "rougeL_recall": 0.19577461014108763, "rougeL_recall_stderr": 0.002133713428500096, "rougeLsum_fmeasure": 0.16461109581502553, "rougeLsum_fmeasure_stderr": 0.0017958278888830555, "rougeLsum_precision": 0.14403643967460825, "rougeLsum_precision_stderr": 0.0019822677694410597, "rougeLsum_recall": 0.23804944003176418, "rougeLsum_recall_stderr": 0.0025835044144481086}, "tldr_en": {"bleu": 2.807870663958701, "bleu_stderr": 0.08388649431195123, "rouge1_fmeasure": 0.21883314563862602, "rouge1_fmeasure_stderr": 0.001968035833203396, "rouge1_precision": 0.19104818364291273, "rouge1_precision_stderr": 0.0022212767005973058, "rouge1_recall": 0.3165545579856358, "rouge1_recall_stderr": 0.002818835296854103, "rouge2_fmeasure": 0.05423182118294372, "rouge2_fmeasure_stderr": 0.0010465141353409224, "rouge2_precision": 0.04737338234449901, "rouge2_precision_stderr": 0.0010449765003741809, "rouge2_recall": 0.081573363995006, "rouge2_recall_stderr": 0.0017297723073854117, "rougeL_fmeasure": 0.15600647601501733, "rougeL_fmeasure_stderr": 0.0013358035921943341, "rougeL_precision": 0.1352656471757331, "rougeL_precision_stderr": 0.0015360042939327185, "rougeL_recall": 0.23168675674466935, "rougeL_recall_stderr": 0.0022572191515858007, "rougeLsum_fmeasure": 0.20602296990458846, "rougeLsum_fmeasure_stderr": 0.0018407304080033926, "rougeLsum_precision": 0.17973117727215357, "rougeLsum_precision_stderr": 0.0020830282164021902, "rougeLsum_recall": 0.29889898054403574, "rougeLsum_recall_stderr": 0.0026791790500151685}, "write_abstract_en": {"bleu": 1.0004698111272452, "bleu_stderr": 0.061814579441424315, "rouge1_fmeasure": 0.11703392994984627, "rouge1_fmeasure_stderr": 0.0017519304821591252, "rouge1_precision": 0.10903330571413737, "rouge1_precision_stderr": 0.0018189243088536072, "rouge1_recall": 0.16131809640403347, "rouge1_recall_stderr": 0.002537055300872018, "rouge2_fmeasure": 0.015006978500477466, "rouge2_fmeasure_stderr": 0.0006493711106973352, "rouge2_precision": 0.013663796459003277, "rouge2_precision_stderr": 0.0006556047677219756, "rouge2_recall": 0.02186335600606571, "rouge2_recall_stderr": 0.0010597085086764856, "rougeL_fmeasure": 0.0890622266980778, "rougeL_fmeasure_stderr": 0.0011999850822209978, "rougeL_precision": 0.08286088730918939, "rougeL_precision_stderr": 0.0013035354194490178, "rougeL_recall": 0.12527349544952351, "rougeL_recall_stderr": 0.001902974449846937, "rougeLsum_fmeasure": 0.11027318467498079, "rougeLsum_fmeasure_stderr": 0.0016371611741646582, "rougeLsum_precision": 0.10274455693959166, "rougeLsum_precision_stderr": 0.0017065171168041343, "rougeLsum_recall": 0.15212866545305415, "rougeLsum_recall_stderr": 0.0023755694730805688}}, "2": {"article_summary_en": {"bleu": 2.5286650412737517, "bleu_stderr": 0.08014901866692561, "rouge1_fmeasure": 0.210239996789526, "rouge1_fmeasure_stderr": 0.0019255181063177478, "rouge1_precision": 0.202956767711254, "rouge1_precision_stderr": 0.002676187150983512, "rouge1_recall": 0.29159713975018653, "rouge1_recall_stderr": 0.0028298980612215777, "rouge2_fmeasure": 0.05097602932656367, "rouge2_fmeasure_stderr": 0.0010520475342058347, "rouge2_precision": 0.052380042676589786, "rouge2_precision_stderr": 0.0015473198207191262, "rouge2_recall": 0.07214674692201985, "rouge2_recall_stderr": 0.001657658589837987, "rougeL_fmeasure": 0.15265446172906474, "rougeL_fmeasure_stderr": 0.001367842815236307, "rougeL_precision": 0.14900247000453432, "rougeL_precision_stderr": 0.002165214590894441, "rougeL_recall": 0.21545359034531394, "rougeL_recall_stderr": 0.00221033373558327, "rougeLsum_fmeasure": 0.19702119815111355, "rougeLsum_fmeasure_stderr": 0.0018063906023379486, "rougeLsum_precision": 0.1905565529645814, "rougeLsum_precision_stderr": 0.0025620703427353705, "rougeLsum_recall": 0.27385801709096713, "rougeLsum_recall_stderr": 0.0026819224687665406}, "rephrase_en": {"bleu": 2.6275735827595286, "bleu_stderr": 0.07295788379483721, "rouge1_fmeasure": 0.17865057487457384, "rouge1_fmeasure_stderr": 0.0019832213339117484, "rouge1_precision": 0.1554086212544319, "rouge1_precision_stderr": 0.002067211936039912, "rouge1_recall": 0.2544322573521774, "rouge1_recall_stderr": 0.002810922596892263, "rouge2_fmeasure": 0.04342414284752008, "rouge2_fmeasure_stderr": 0.001003062599440997, "rouge2_precision": 0.037714132074904975, "rouge2_precision_stderr": 0.0009374766307825127, "rouge2_recall": 0.06361946862872042, "rouge2_recall_stderr": 0.001591682210752721, "rougeL_fmeasure": 0.1435215312656172, "rougeL_fmeasure_stderr": 0.0014687451140009584, "rougeL_precision": 0.12334722471039417, "rougeL_precision_stderr": 0.0015067280447067734, "rougeL_recall": 0.2091785533285848, "rougeL_recall_stderr": 0.002327818378218966, "rougeLsum_fmeasure": 0.16537912003845628, "rougeLsum_fmeasure_stderr": 0.0018481029569662097, "rougeLsum_precision": 0.14357511331593745, "rougeLsum_precision_stderr": 0.001917549831974918, "rougeLsum_recall": 0.23666987395605485, "rougeLsum_recall_stderr": 0.00267335114823891}, "summarize_above_en": {"bleu": 2.552370721219516, "bleu_stderr": 0.056456617033753535, "rouge1_fmeasure": 0.1952096816483372, "rouge1_fmeasure_stderr": 0.001995746852085018, "rouge1_precision": 0.18530745879024665, "rouge1_precision_stderr": 0.0026984526710165596, "rouge1_recall": 0.27372904579227686, "rouge1_recall_stderr": 0.002825062477476198, "rouge2_fmeasure": 0.04558123756721243, "rouge2_fmeasure_stderr": 0.0010186767133771723, "rouge2_precision": 0.0457473960215223, "rouge2_precision_stderr": 0.0014209292446018405, "rouge2_recall": 0.0647678078130054, "rouge2_recall_stderr": 0.001616150713511116, "rougeL_fmeasure": 0.1516146243997318, "rougeL_fmeasure_stderr": 0.0014416996193126431, "rougeL_precision": 0.14324298206218397, "rougeL_precision_stderr": 0.0020861451808508974, "rougeL_recall": 0.2174336811042295, "rougeL_recall_stderr": 0.002306599328778544, "rougeLsum_fmeasure": 0.18225288248589572, "rougeLsum_fmeasure_stderr": 0.0018597154608296434, "rougeLsum_precision": 0.17310048096253244, "rougeLsum_precision_stderr": 0.002540880696334963, "rougeLsum_recall": 0.2564277940851817, "rougeLsum_recall_stderr": 0.002676735414961789}, "tldr_en": {"bleu": 3.0758182963274967, "bleu_stderr": 0.07269865886048295, "rouge1_fmeasure": 0.213319730404769, "rouge1_fmeasure_stderr": 0.001947942927707738, "rouge1_precision": 0.22091940935835946, "rouge1_precision_stderr": 0.003036544780507811, "rouge1_recall": 0.28592540020450335, "rouge1_recall_stderr": 0.0028268458929152375, "rouge2_fmeasure": 0.05619324678157442, "rouge2_fmeasure_stderr": 0.0011378174233996204, "rouge2_precision": 0.0625668775391695, "rouge2_precision_stderr": 0.001863481800778566, "rouge2_recall": 0.07636442527094099, "rouge2_recall_stderr": 0.0016705370533154184, "rougeL_fmeasure": 0.15707830961714367, "rougeL_fmeasure_stderr": 0.001424704542577375, "rougeL_precision": 0.16524622277086715, "rougeL_precision_stderr": 0.002504977446892061, "rougeL_recall": 0.21312172953880804, "rougeL_recall_stderr": 0.002230572391304009, "rougeLsum_fmeasure": 0.20133540483239756, "rougeLsum_fmeasure_stderr": 0.0018432855451969799, "rougeLsum_precision": 0.2088533869488438, "rougeLsum_precision_stderr": 0.0029209986594073956, "rougeLsum_recall": 0.27031759901723496, "rougeLsum_recall_stderr": 0.00269091513844755}, "write_abstract_en": {"bleu": 1.2275559960359324, "bleu_stderr": 0.055602030188377644, "rouge1_fmeasure": 0.08849504604578734, "rouge1_fmeasure_stderr": 0.0017680512462351341, "rouge1_precision": 0.09690581008488926, "rouge1_precision_stderr": 0.0024248475400055754, "rouge1_recall": 0.11326850686300136, "rouge1_recall_stderr": 0.002336894022718304, "rouge2_fmeasure": 0.015011655554296155, "rouge2_fmeasure_stderr": 0.0007047911945416481, "rouge2_precision": 0.018300502482703724, "rouge2_precision_stderr": 0.001212414444454405, "rouge2_recall": 0.01947166619266159, "rouge2_recall_stderr": 0.0009643040730582587, "rougeL_fmeasure": 0.07359670365804201, "rougeL_fmeasure_stderr": 0.0013609375455583069, "rougeL_precision": 0.08087122864775938, "rougeL_precision_stderr": 0.002027221626489267, "rougeL_recall": 0.09562923022814045, "rougeL_recall_stderr": 0.00189683385928486, "rougeLsum_fmeasure": 0.0827072308194733, "rougeLsum_fmeasure_stderr": 0.0016679545909741406, "rougeLsum_precision": 0.09100233183422309, "rougeLsum_precision_stderr": 0.0023243523379837595, "rougeLsum_recall": 0.1060183257047995, "rougeLsum_recall_stderr": 0.002210133291756926}}, "3": {"article_summary_en": {"bleu": 2.6775327228175545, "bleu_stderr": 0.09702171839636106, "rouge1_fmeasure": 0.1757806794088883, "rouge1_fmeasure_stderr": 0.002256613151873409, "rouge1_precision": 0.17231067692889554, "rouge1_precision_stderr": 0.0027956027935857304, "rouge1_recall": 0.243172171476554, "rouge1_recall_stderr": 0.0032858246005163156, "rouge2_fmeasure": 0.042273822347749235, "rouge2_fmeasure_stderr": 0.0010073987728216936, "rouge2_precision": 0.04314429570273796, "rouge2_precision_stderr": 0.0013561190191307, "rouge2_recall": 0.05984664442402973, "rouge2_recall_stderr": 0.0015879641807220839, "rougeL_fmeasure": 0.12839540685323708, "rougeL_fmeasure_stderr": 0.00161856743478718, "rougeL_precision": 0.1269442197755899, "rougeL_precision_stderr": 0.0021738281830229454, "rougeL_recall": 0.18134356725730427, "rougeL_recall_stderr": 0.0025688638483573395, "rougeLsum_fmeasure": 0.16451248533027701, "rougeLsum_fmeasure_stderr": 0.002109355534586791, "rougeLsum_precision": 0.16152443369273212, "rougeLsum_precision_stderr": 0.0026463298498459026, "rougeLsum_recall": 0.22829827365100974, "rougeLsum_recall_stderr": 0.0031167189145814398}, "rephrase_en": {"bleu": 2.696717061739458, "bleu_stderr": 0.10004414748713579, "rouge1_fmeasure": 0.1487528972942182, "rouge1_fmeasure_stderr": 0.0021573858174289036, "rouge1_precision": 0.13637703415923158, "rouge1_precision_stderr": 0.0023528280402658097, "rouge1_recall": 0.2101854175375818, "rouge1_recall_stderr": 0.003134663411489244, "rouge2_fmeasure": 0.03703687600072442, "rouge2_fmeasure_stderr": 0.0009586366255869859, "rouge2_precision": 0.03442882399419072, "rouge2_precision_stderr": 0.001085658422803244, "rouge2_recall": 0.05416049539107172, "rouge2_recall_stderr": 0.0015569213296257193, "rougeL_fmeasure": 0.12065017013720424, "rougeL_fmeasure_stderr": 0.0016584003927085527, "rougeL_precision": 0.10957513679331089, "rougeL_precision_stderr": 0.0018197153562858968, "rougeL_recall": 0.17454640678104377, "rougeL_recall_stderr": 0.002643517915284805, "rougeLsum_fmeasure": 0.13759161249237567, "rougeLsum_fmeasure_stderr": 0.0020057670100919424, "rougeLsum_precision": 0.12614818245454176, "rougeLsum_precision_stderr": 0.0022002630882711474, "rougeLsum_recall": 0.1951306252568764, "rougeLsum_recall_stderr": 0.0029575901472317177}, "summarize_above_en": {"bleu": 2.8903876098590975, "bleu_stderr": 0.08033021470087338, "rouge1_fmeasure": 0.16311913030851707, "rouge1_fmeasure_stderr": 0.002334208239517657, "rouge1_precision": 0.16655273761856884, "rouge1_precision_stderr": 0.0030693164785274346, "rouge1_recall": 0.22239594480968503, "rouge1_recall_stderr": 0.0032442335330023385, "rouge2_fmeasure": 0.04051754398323109, "rouge2_fmeasure_stderr": 0.0010700958454151245, "rouge2_precision": 0.04426024560468626, "rouge2_precision_stderr": 0.0015913171237693896, "rouge2_recall": 0.05557686275427169, "rouge2_recall_stderr": 0.001569025220107771, "rougeL_fmeasure": 0.1277212531982069, "rougeL_fmeasure_stderr": 0.0017662200246993489, "rougeL_precision": 0.1312478956516053, "rougeL_precision_stderr": 0.0025056286449919598, "rougeL_recall": 0.17774125142496916, "rougeL_recall_stderr": 0.0026625038105766233, "rougeLsum_fmeasure": 0.1523177208194968, "rougeLsum_fmeasure_stderr": 0.002187292192524935, "rougeLsum_precision": 0.15605200382139137, "rougeLsum_precision_stderr": 0.002922128458943474, "rougeLsum_recall": 0.20818681432495342, "rougeLsum_recall_stderr": 0.003071577904437599}, "tldr_en": {"bleu": 3.2676357062273516, "bleu_stderr": 0.12290136548273946, "rouge1_fmeasure": 0.17690311487463797, "rouge1_fmeasure_stderr": 0.0023173541475324253, "rouge1_precision": 0.21388175415191102, "rouge1_precision_stderr": 0.0037807201763198858, "rouge1_recall": 0.22109179833036308, "rouge1_recall_stderr": 0.00317048229219639, "rouge2_fmeasure": 0.04775424867054453, "rouge2_fmeasure_stderr": 0.0011657814934947282, "rouge2_precision": 0.06310561248802647, "rouge2_precision_stderr": 0.0021350241076741575, "rouge2_recall": 0.05983363177094545, "rouge2_recall_stderr": 0.0015915281660158853, "rougeL_fmeasure": 0.13377606619329538, "rougeL_fmeasure_stderr": 0.0017521905460361976, "rougeL_precision": 0.16656647324261953, "rougeL_precision_stderr": 0.0032154715784240336, "rougeL_recall": 0.16788106886771248, "rougeL_recall_stderr": 0.0024768275501905303, "rougeLsum_fmeasure": 0.16765421945999662, "rougeLsum_fmeasure_stderr": 0.002202375233222233, "rougeLsum_precision": 0.2032660883469589, "rougeLsum_precision_stderr": 0.003645991164048245, "rougeLsum_recall": 0.20988404520976878, "rougeLsum_recall_stderr": 0.003033569663633178}, "write_abstract_en": {"bleu": 1.4434859465762546, "bleu_stderr": 0.07239736975664855, "rouge1_fmeasure": 0.07581549153450809, "rouge1_fmeasure_stderr": 0.0018567329894756907, "rouge1_precision": 0.08727403229824231, "rouge1_precision_stderr": 0.002754747362815702, "rouge1_recall": 0.10038330296542393, "rouge1_recall_stderr": 0.0025674911073591644, "rouge2_fmeasure": 0.015436277594316166, "rouge2_fmeasure_stderr": 0.00075744970662451, "rouge2_precision": 0.022062717078419312, "rouge2_precision_stderr": 0.0016202827842642569, "rouge2_recall": 0.020684116383698558, "rouge2_recall_stderr": 0.0010865171767872877, "rougeL_fmeasure": 0.0633376000334696, "rougeL_fmeasure_stderr": 0.0014673941274429858, "rougeL_precision": 0.07371123650658079, "rougeL_precision_stderr": 0.0023770899696980465, "rougeL_recall": 0.08513597314680194, "rougeL_recall_stderr": 0.00212812432731865, "rougeLsum_fmeasure": 0.07094827851387892, "rougeLsum_fmeasure_stderr": 0.0017436711590059298, "rougeLsum_precision": 0.08218273785211457, "rougeLsum_precision_stderr": 0.0026325580514724206, "rougeLsum_recall": 0.09419184346305906, "rougeLsum_recall_stderr": 0.0024294175505879165}}, "4": {"article_summary_en": {"bleu": 0.4585803694189491, "bleu_stderr": 0.042449805324812594, "rouge1_fmeasure": 0.05586170258027261, "rouge1_fmeasure_stderr": 0.0019271475347510442, "rouge1_precision": 0.05610293921952886, "rouge1_precision_stderr": 0.0021934282933142807, "rouge1_recall": 0.08033002095461989, "rouge1_recall_stderr": 0.002892648072265376, "rouge2_fmeasure": 0.013928620381908459, "rouge2_fmeasure_stderr": 0.000708425052711742, "rouge2_precision": 0.014518361578080535, "rouge2_precision_stderr": 0.0009832377227663412, "rouge2_recall": 0.021124036864236993, "rouge2_recall_stderr": 0.0011841595966348834, "rougeL_fmeasure": 0.04182384962792516, "rougeL_fmeasure_stderr": 0.0014295239104789405, "rougeL_precision": 0.042762546108040304, "rougeL_precision_stderr": 0.0017529381402243264, "rougeL_recall": 0.06135293147334968, "rougeL_recall_stderr": 0.0022662727518870454, "rougeLsum_fmeasure": 0.05238777661289263, "rougeLsum_fmeasure_stderr": 0.0018093330643826854, "rougeLsum_precision": 0.05281014844060752, "rougeLsum_precision_stderr": 0.0020838090836962377, "rougeLsum_recall": 0.07549636242613741, "rougeLsum_recall_stderr": 0.002735416186516671}, "rephrase_en": {"bleu": 0.45244912498308465, "bleu_stderr": 0.04522890822214683, "rouge1_fmeasure": 0.0460595175107433, "rouge1_fmeasure_stderr": 0.001696044544780373, "rouge1_precision": 0.043639000260860615, "rouge1_precision_stderr": 0.0017644092166845472, "rouge1_recall": 0.06690857706863987, "rouge1_recall_stderr": 0.002513457134572579, "rouge2_fmeasure": 0.011604393750423206, "rouge2_fmeasure_stderr": 0.0006238601062608368, "rouge2_precision": 0.010977304219473067, "rouge2_precision_stderr": 0.0007111284641033093, "rouge2_recall": 0.017770704280928078, "rouge2_recall_stderr": 0.0010731197359821348, "rougeL_fmeasure": 0.038179249417304884, "rougeL_fmeasure_stderr": 0.0013818818185200492, "rougeL_precision": 0.03598281883079817, "rougeL_precision_stderr": 0.0014505603927416834, "rougeL_recall": 0.05685382605643658, "rougeL_recall_stderr": 0.002163845582643641, "rougeLsum_fmeasure": 0.04260911072473102, "rougeLsum_fmeasure_stderr": 0.001575090983128174, "rougeLsum_precision": 0.04044545116309038, "rougeLsum_precision_stderr": 0.0016519202271058225, "rougeLsum_recall": 0.062070973864341086, "rougeLsum_recall_stderr": 0.0023531363135469723}, "summarize_above_en": {"bleu": 0.3208877330233361, "bleu_stderr": 0.03494147074048822, "rouge1_fmeasure": 0.04834308907312041, "rouge1_fmeasure_stderr": 0.0018503707685198511, "rouge1_precision": 0.05445104964004579, "rouge1_precision_stderr": 0.002443079649660942, "rouge1_recall": 0.06600911297746138, "rouge1_recall_stderr": 0.0025520971880013655, "rouge2_fmeasure": 0.01222026826951555, "rouge2_fmeasure_stderr": 0.000704325225575219, "rouge2_precision": 0.015529728785536703, "rouge2_precision_stderr": 0.0011838329122922067, "rouge2_recall": 0.016747064756245943, "rouge2_recall_stderr": 0.0010305687534927282, "rougeL_fmeasure": 0.03857646230433444, "rougeL_fmeasure_stderr": 0.0014595106562382601, "rougeL_precision": 0.04394332257272588, "rougeL_precision_stderr": 0.00201438218193949, "rougeL_recall": 0.05360636578416732, "rougeL_recall_stderr": 0.002104351294157986, "rougeLsum_fmeasure": 0.04516151790202869, "rougeLsum_fmeasure_stderr": 0.0017340898610069717, "rougeLsum_precision": 0.05112939118161437, "rougeLsum_precision_stderr": 0.0023090030630797185, "rougeLsum_recall": 0.06163586533175658, "rougeLsum_recall_stderr": 0.002391851276162087}, "tldr_en": {"bleu": 0.3207302415446247, "bleu_stderr": 0.03951812995159553, "rouge1_fmeasure": 0.05381265634572307, "rouge1_fmeasure_stderr": 0.0019615926895963105, "rouge1_precision": 0.07217489085431607, "rouge1_precision_stderr": 0.003030440715398035, "rouge1_recall": 0.06719640775054124, "rouge1_recall_stderr": 0.0025953918642397556, "rouge2_fmeasure": 0.014607128479951145, "rouge2_fmeasure_stderr": 0.0007893617726734669, "rouge2_precision": 0.021665967083708774, "rouge2_precision_stderr": 0.0015098116987345535, "rouge2_recall": 0.01860172806438717, "rouge2_recall_stderr": 0.0011134284906677621, "rougeL_fmeasure": 0.041897511040161206, "rougeL_fmeasure_stderr": 0.0015198623386063986, "rougeL_precision": 0.057955761496930415, "rougeL_precision_stderr": 0.002549676328226009, "rougeL_recall": 0.05288705994275681, "rougeL_recall_stderr": 0.0020802147270799964, "rougeLsum_fmeasure": 0.05091825334557307, "rougeLsum_fmeasure_stderr": 0.001857333758746474, "rougeLsum_precision": 0.06845672227370601, "rougeLsum_precision_stderr": 0.0028945029688266562, "rougeLsum_recall": 0.06371644081837961, "rougeLsum_recall_stderr": 0.0024692144864177654}, "write_abstract_en": {"bleu": 0.050241534910141376, "bleu_stderr": 0.00739529390509812, "rouge1_fmeasure": 0.019677720071019873, "rouge1_fmeasure_stderr": 0.0011247541319417723, "rouge1_precision": 0.02122229087394324, "rouge1_precision_stderr": 0.0014003737821723116, "rouge1_recall": 0.02652989886804251, "rouge1_recall_stderr": 0.001520146131752233, "rouge2_fmeasure": 0.003915565527437388, "rouge2_fmeasure_stderr": 0.0003687016348744532, "rouge2_precision": 0.004726681009885704, "rouge2_precision_stderr": 0.0006457139194226688, "rouge2_recall": 0.005360196615771528, "rouge2_recall_stderr": 0.0005369786057079111, "rougeL_fmeasure": 0.016278889399882698, "rougeL_fmeasure_stderr": 0.0008990662478189324, "rougeL_precision": 0.017594475833257826, "rougeL_precision_stderr": 0.0011648478992631079, "rougeL_recall": 0.022309854870142254, "rougeL_recall_stderr": 0.0012646138680947798, "rougeLsum_fmeasure": 0.018303637479182004, "rougeLsum_fmeasure_stderr": 0.0010484815874735873, "rougeLsum_precision": 0.019818945547900205, "rougeLsum_precision_stderr": 0.0013230955041361396, "rougeLsum_recall": 0.024765412779247827, "rougeLsum_recall_stderr": 0.0014222742525507647}}, "5": {"article_summary_en": {"bleu": 1.929822783994029e-07, "bleu_stderr": 6.454754027890242e-07, "rouge1_fmeasure": 0.008568589881060913, "rouge1_fmeasure_stderr": 0.0008425364496636647, "rouge1_precision": 0.01015187837282015, "rouge1_precision_stderr": 0.0012118368481013803, "rouge1_recall": 0.012597080447224141, "rouge1_recall_stderr": 0.0012614668952345483, "rouge2_fmeasure": 0.0020632447863588753, "rouge2_fmeasure_stderr": 0.0002724530157499788, "rouge2_precision": 0.002070478186206016, "rouge2_precision_stderr": 0.0003593417665592502, "rouge2_recall": 0.0031561784431225486, "rouge2_recall_stderr": 0.00042801548350165436, "rougeL_fmeasure": 0.006424819886854842, "rougeL_fmeasure_stderr": 0.0006247554238591577, "rougeL_precision": 0.008016061817972518, "rougeL_precision_stderr": 0.0010514974373518058, "rougeL_recall": 0.009704734876679203, "rougeL_recall_stderr": 0.0009871790542858633, "rougeLsum_fmeasure": 0.007978769124555215, "rougeLsum_fmeasure_stderr": 0.0007798817433535373, "rougeLsum_precision": 0.009477961645303942, "rougeLsum_precision_stderr": 0.001148548084981138, "rougeLsum_recall": 0.011862110948524358, "rougeLsum_recall_stderr": 0.00119353729888407}, "rephrase_en": {"bleu": 5.251000373867168e-08, "bleu_stderr": 1.1017248657586459e-07, "rouge1_fmeasure": 0.0070051523902687545, "rouge1_fmeasure_stderr": 0.000741818765951434, "rouge1_precision": 0.0070189779398907685, "rouge1_precision_stderr": 0.0008113662184147494, "rouge1_recall": 0.009756903947301174, "rouge1_recall_stderr": 0.0010491271958210223, "rouge2_fmeasure": 0.0018773141620031116, "rouge2_fmeasure_stderr": 0.0002924665783736417, "rouge2_precision": 0.0018834961678454644, "rouge2_precision_stderr": 0.0003137323371258015, "rouge2_recall": 0.0025948535559862265, "rouge2_recall_stderr": 0.0004145432652310791, "rougeL_fmeasure": 0.0055660827324147755, "rougeL_fmeasure_stderr": 0.0005823170803562475, "rougeL_precision": 0.005576197279787843, "rougeL_precision_stderr": 0.0006409300377598502, "rougeL_recall": 0.007827196064222617, "rougeL_recall_stderr": 0.0008343635995047826, "rougeLsum_fmeasure": 0.0064846745016825785, "rougeLsum_fmeasure_stderr": 0.0006821624022857435, "rougeLsum_precision": 0.006493130647527894, "rougeLsum_precision_stderr": 0.0007525468046300129, "rougeLsum_recall": 0.009066194317283121, "rougeLsum_recall_stderr": 0.0009684333729492796}, "summarize_above_en": {"bleu": 2.642808156949046e-10, "bleu_stderr": 1.0958818603819076e-09, "rouge1_fmeasure": 0.006459190800150568, "rouge1_fmeasure_stderr": 0.0007207860950963499, "rouge1_precision": 0.007010026455738011, "rouge1_precision_stderr": 0.0009017435483523394, "rouge1_recall": 0.00878867615147148, "rouge1_recall_stderr": 0.0010157525862754392, "rouge2_fmeasure": 0.0014447433493777688, "rouge2_fmeasure_stderr": 0.0002371563408671573, "rouge2_precision": 0.0017497370302994892, "rouge2_precision_stderr": 0.00034838975656659773, "rouge2_recall": 0.0017941002253548174, "rouge2_recall_stderr": 0.00029001602890075666, "rougeL_fmeasure": 0.005000926315966357, "rougeL_fmeasure_stderr": 0.0005492449452936106, "rougeL_precision": 0.005424480425576281, "rougeL_precision_stderr": 0.0007093535726255014, "rougeL_recall": 0.006810607613407806, "rougeL_recall_stderr": 0.0007726864270681841, "rougeLsum_fmeasure": 0.005981178381897325, "rougeLsum_fmeasure_stderr": 0.0006652187976853396, "rougeLsum_precision": 0.0065291661542634, "rougeLsum_precision_stderr": 0.0008431825007789647, "rougeLsum_recall": 0.008127880829555213, "rougeLsum_recall_stderr": 0.0009349057417684032}, "tldr_en": {"bleu": 2.2490869778774144e-09, "bleu_stderr": 4.579875964652809e-08, "rouge1_fmeasure": 0.008600982975105755, "rouge1_fmeasure_stderr": 0.000871586481245996, "rouge1_precision": 0.012746310259293975, "rouge1_precision_stderr": 0.001498370110228753, "rouge1_recall": 0.01036719903184163, "rouge1_recall_stderr": 0.0011041663951146658, "rouge2_fmeasure": 0.0027143726441978717, "rouge2_fmeasure_stderr": 0.0004021894165602589, "rouge2_precision": 0.0046837043167768724, "rouge2_precision_stderr": 0.000851048694914921, "rouge2_recall": 0.0032041325852383415, "rouge2_recall_stderr": 0.00048168285623642387, "rougeL_fmeasure": 0.006953217638177468, "rougeL_fmeasure_stderr": 0.0007188116843350712, "rougeL_precision": 0.010658331542732118, "rougeL_precision_stderr": 0.0013171585867806077, "rougeL_recall": 0.008285275201527513, "rougeL_recall_stderr": 0.0008892336234217646, "rougeLsum_fmeasure": 0.008236010332581721, "rougeLsum_fmeasure_stderr": 0.0008379564793264335, "rougeLsum_precision": 0.01225616498642844, "rougeLsum_precision_stderr": 0.0014511703007000516, "rougeLsum_recall": 0.009942438567258063, "rougeLsum_recall_stderr": 0.0010638347073576458}, "write_abstract_en": {"bleu": 1.862729574232818e-18, "bleu_stderr": 1.585352301614098e-16, "rouge1_fmeasure": 0.0015455675621237376, "rouge1_fmeasure_stderr": 0.00032572665041987446, "rouge1_precision": 0.0016940318129783684, "rouge1_precision_stderr": 0.00043807301732911447, "rouge1_recall": 0.0021116213008436154, "rouge1_recall_stderr": 0.0004157235401925989, "rouge2_fmeasure": 0.00024341445452042402, "rouge2_fmeasure_stderr": 9.580449462072127e-05, "rouge2_precision": 0.0003818107289535861, "rouge2_precision_stderr": 0.00019345670461531905, "rouge2_recall": 0.00026797247013965285, "rouge2_recall_stderr": 0.00010331614803977232, "rougeL_fmeasure": 0.0012780986139275944, "rougeL_fmeasure_stderr": 0.0002518502465672378, "rougeL_precision": 0.0013961176573495614, "rougeL_precision_stderr": 0.00034646394432675024, "rougeL_recall": 0.0017757153532734638, "rougeL_recall_stderr": 0.00034186622977328717, "rougeLsum_fmeasure": 0.0014218801930012729, "rougeLsum_fmeasure_stderr": 0.00031119324514029866, "rougeLsum_precision": 0.001588174661722081, "rougeLsum_precision_stderr": 0.0004292160855573092, "rougeLsum_recall": 0.0019250318967419596, "rougeLsum_recall_stderr": 0.00039002719041384033}}}, "anli_r1": {"0": {"GPT-3 style": {"acc": 0.331, "acc_norm": 0.314, "acc_norm_stderr": 0.014683991951087967, "acc_stderr": 0.014888272588203928, "subset": 1}, "MNLI crowdsource": {"acc": 0.333, "acc_norm": 0.34, "acc_norm_stderr": 0.014987482264363937, "acc_stderr": 0.014910846164229871, "subset": 1}, "can we infer": {"acc": 0.358, "acc_norm": 0.335, "acc_norm_stderr": 0.014933117490932572, "acc_stderr": 0.015167928865407559, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.327, "acc_norm": 0.341, "acc_norm_stderr": 0.0149981313484027, "acc_stderr": 0.014842213153411247, "subset": 1}, "justified in saying": {"acc": 0.356, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229857, "acc_stderr": 0.015149042659306625, "subset": 1}}, "1": {"GPT-3 style": {"acc": 0.327, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014842213153411244, "subset": 1}, "MNLI crowdsource": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "can we infer": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.332, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811482, "acc_stderr": 0.014899597242811482, "subset": 1}, "justified in saying": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229863, "acc_stderr": 0.014910846164229863, "subset": 1}}, "2": {"GPT-3 style": {"acc": 0.335, "acc_norm": 0.325, "acc_norm_stderr": 0.014818724459095526, "acc_stderr": 0.014933117490932577, "subset": 1}, "MNLI crowdsource": {"acc": 0.358, "acc_norm": 0.351, "acc_norm_stderr": 0.015100563798316407, "acc_stderr": 0.015167928865407557, "subset": 1}, "can we infer": {"acc": 0.361, "acc_norm": 0.347, "acc_norm_stderr": 0.01506047203170662, "acc_stderr": 0.015195720118175115, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.329, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928366, "acc_stderr": 0.014865395385928366, "subset": 1}, "justified in saying": {"acc": 0.358, "acc_norm": 0.338, "acc_norm_stderr": 0.014965960710224473, "acc_stderr": 0.015167928865407555, "subset": 1}}, "3": {"GPT-3 style": {"acc": 0.347, "acc_norm": 0.341, "acc_norm_stderr": 0.014998131348402707, "acc_stderr": 0.01506047203170662, "subset": 1}, "MNLI crowdsource": {"acc": 0.358, "acc_norm": 0.358, "acc_norm_stderr": 0.015167928865407557, "acc_stderr": 0.015167928865407557, "subset": 1}, "can we infer": {"acc": 0.35, "acc_norm": 0.343, "acc_norm_stderr": 0.015019206922356953, "acc_stderr": 0.015090650341444233, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.328, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348639, "acc_stderr": 0.014853842487270334, "subset": 1}, "justified in saying": {"acc": 0.355, "acc_norm": 0.341, "acc_norm_stderr": 0.014998131348402709, "acc_stderr": 0.015139491543780529, "subset": 1}}, "4": {"GPT-3 style": {"acc": 0.329, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732958, "acc_stderr": 0.014865395385928373, "subset": 1}, "MNLI crowdsource": {"acc": 0.354, "acc_norm": 0.343, "acc_norm_stderr": 0.015019206922356953, "acc_stderr": 0.015129868238451775, "subset": 1}, "can we infer": {"acc": 0.344, "acc_norm": 0.327, "acc_norm_stderr": 0.014842213153411237, "acc_stderr": 0.015029633724408947, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.328, "acc_norm": 0.335, "acc_norm_stderr": 0.014933117490932573, "acc_stderr": 0.014853842487270334, "subset": 1}, "justified in saying": {"acc": 0.336, "acc_norm": 0.326, "acc_norm_stderr": 0.01483050720454103, "acc_stderr": 0.014944140233795027, "subset": 1}}, "5": {"GPT-3 style": {"acc": 0.339, "acc_norm": 0.326, "acc_norm_stderr": 0.01483050720454103, "acc_stderr": 0.014976758771620335, "subset": 1}, "MNLI crowdsource": {"acc": 0.345, "acc_norm": 0.332, "acc_norm_stderr": 0.014899597242811478, "acc_stderr": 0.015039986742055235, "subset": 1}, "can we infer": {"acc": 0.329, "acc_norm": 0.316, "acc_norm_stderr": 0.014709193056057128, "acc_stderr": 0.014865395385928362, "subset": 1}, "guaranteed/possible/impossible": {"acc": 0.331, "acc_norm": 0.334, "acc_norm_stderr": 0.014922019523732967, "acc_stderr": 0.01488827258820394, "subset": 1}, "justified in saying": {"acc": 0.337, "acc_norm": 0.329, "acc_norm_stderr": 0.01486539538592837, "acc_stderr": 0.014955087918653605, "subset": 1}}}, "anli_r2": {"0": {"GPT-3 style": {"acc": 0.334, "acc_norm": 0.357, "acc_norm_stderr": 0.015158521721486767, "acc_stderr": 0.014922019523732958, "subset": 2}, "MNLI crowdsource": {"acc": 0.333, "acc_norm": 0.333, "acc_norm_stderr": 0.014910846164229859, "acc_stderr": 0.014910846164229871, "subset": 2}, "can we infer": {"acc": 0.35, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.015090650341444233, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.34, "acc_norm": 0.331, "acc_norm_stderr": 0.014888272588203948, "acc_stderr": 0.014987482264363935, "subset": 2}, "justified in saying": {"acc": 0.339, "acc_norm": 0.333, "acc_norm_stderr": 0.01491084616422987, "acc_stderr": 0.014976758771620342, "subset": 2}}, "1": {"GPT-3 style": {"acc": 0.313, "acc_norm": 0.322, "acc_norm_stderr": 0.01478291360099668, "acc_stderr": 0.01467127282297788, "subset": 2}, "MNLI crowdsource": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "can we infer": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.314, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014683991951087973, "subset": 2}, "justified in saying": {"acc": 0.315, "acc_norm": 0.315, "acc_norm_stderr": 0.014696631960792506, "acc_stderr": 0.014696631960792506, "subset": 2}}, "2": {"GPT-3 style": {"acc": 0.333, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928369, "acc_stderr": 0.014910846164229873, "subset": 2}, "MNLI crowdsource": {"acc": 0.329, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348628, "acc_stderr": 0.014865395385928374, "subset": 2}, "can we infer": {"acc": 0.323, "acc_norm": 0.322, "acc_norm_stderr": 0.014782913600996692, "acc_stderr": 0.01479492784334863, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.31, "acc_norm": 0.319, "acc_norm_stderr": 0.014746404865473477, "acc_stderr": 0.014632638658632895, "subset": 2}, "justified in saying": {"acc": 0.323, "acc_norm": 0.32, "acc_norm_stderr": 0.014758652303574893, "acc_stderr": 0.014794927843348632, "subset": 2}}, "3": {"GPT-3 style": {"acc": 0.337, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456734, "acc_stderr": 0.01495508791865359, "subset": 2}, "MNLI crowdsource": {"acc": 0.317, "acc_norm": 0.33, "acc_norm_stderr": 0.014876872027456736, "acc_stderr": 0.014721675438880215, "subset": 2}, "can we infer": {"acc": 0.338, "acc_norm": 0.313, "acc_norm_stderr": 0.014671272822977881, "acc_stderr": 0.014965960710224479, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.32, "acc_norm": 0.323, "acc_norm_stderr": 0.014794927843348633, "acc_stderr": 0.014758652303574885, "subset": 2}, "justified in saying": {"acc": 0.329, "acc_norm": 0.317, "acc_norm_stderr": 0.014721675438880213, "acc_stderr": 0.014865395385928364, "subset": 2}}, "4": {"GPT-3 style": {"acc": 0.336, "acc_norm": 0.307, "acc_norm_stderr": 0.01459328489285263, "acc_stderr": 0.014944140233795021, "subset": 2}, "MNLI crowdsource": {"acc": 0.314, "acc_norm": 0.304, "acc_norm_stderr": 0.014553205687950436, "acc_stderr": 0.014683991951087964, "subset": 2}, "can we infer": {"acc": 0.334, "acc_norm": 0.32, "acc_norm_stderr": 0.01475865230357489, "acc_stderr": 0.014922019523732956, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.326, "acc_norm": 0.321, "acc_norm_stderr": 0.014770821817934645, "acc_stderr": 0.014830507204541038, "subset": 2}, "justified in saying": {"acc": 0.329, "acc_norm": 0.316, "acc_norm_stderr": 0.014709193056057137, "acc_stderr": 0.014865395385928367, "subset": 2}}, "5": {"GPT-3 style": {"acc": 0.342, "acc_norm": 0.312, "acc_norm_stderr": 0.014658474370509012, "acc_stderr": 0.015008706182121731, "subset": 2}, "MNLI crowdsource": {"acc": 0.304, "acc_norm": 0.324, "acc_norm_stderr": 0.014806864733738857, "acc_stderr": 0.01455320568795043, "subset": 2}, "can we infer": {"acc": 0.324, "acc_norm": 0.329, "acc_norm_stderr": 0.01486539538592837, "acc_stderr": 0.014806864733738863, "subset": 2}, "guaranteed/possible/impossible": {"acc": 0.332, "acc_norm": 0.329, "acc_norm_stderr": 0.014865395385928367, "acc_stderr": 0.014899597242811485, "subset": 2}, "justified in saying": {"acc": 0.317, "acc_norm": 0.322, "acc_norm_stderr": 0.014782913600996676, "acc_stderr": 0.014721675438880219, "subset": 2}}}, "anli_r3": {"0": {"GPT-3 style": {"acc": 0.335, "acc_norm": 0.35583333333333333, "acc_norm_stderr": 0.013826518748493324, "acc_stderr": 0.013630871843821476, "subset": 3}, "MNLI crowdsource": {"acc": 0.3358333333333333, "acc_norm": 0.3233333333333333, "acc_norm_stderr": 0.013508372867300219, "acc_stderr": 0.013639261190932877, "subset": 3}, "can we infer": {"acc": 0.3333333333333333, "acc_norm": 0.335, "acc_norm_stderr": 0.013630871843821469, "acc_stderr": 0.013613950010225603, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.32083333333333336, "acc_norm": 0.3225, "acc_norm_stderr": 0.013499258621103247, "acc_stderr": 0.013480882752851555, "subset": 3}, "justified in saying": {"acc": 0.3416666666666667, "acc_norm": 0.33416666666666667, "acc_norm_stderr": 0.013622434813136774, "acc_stderr": 0.013696658778002524, "subset": 3}}, "1": {"GPT-3 style": {"acc": 0.335, "acc_norm": 0.345, "acc_norm_stderr": 0.013728421539454872, "acc_stderr": 0.01363087184382147, "subset": 3}, "MNLI crowdsource": {"acc": 0.33666666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.013647602942406393, "acc_stderr": 0.013647602942406393, "subset": 3}, "can we infer": {"acc": 0.33666666666666667, "acc_norm": 0.3375, "acc_norm_stderr": 0.013655897185463653, "acc_stderr": 0.013647602942406393, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3358333333333333, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.01364760294240639, "acc_stderr": 0.013639261190932887, "subset": 3}, "justified in saying": {"acc": 0.33666666666666667, "acc_norm": 0.33666666666666667, "acc_norm_stderr": 0.01364760294240639, "acc_stderr": 0.013647602942406393, "subset": 3}}, "2": {"GPT-3 style": {"acc": 0.33, "acc_norm": 0.33916666666666667, "acc_norm_stderr": 0.013672343491681822, "acc_stderr": 0.01357953127780092, "subset": 3}, "MNLI crowdsource": {"acc": 0.3233333333333333, "acc_norm": 0.30916666666666665, "acc_norm_stderr": 0.013346684134591948, "acc_stderr": 0.013508372867300219, "subset": 3}, "can we infer": {"acc": 0.325, "acc_norm": 0.33, "acc_norm_stderr": 0.01357953127780092, "acc_stderr": 0.013526454480351021, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.32, "acc_norm": 0.32666666666666666, "acc_norm_stderr": 0.013544340907003665, "acc_stderr": 0.013471620929769144, "subset": 3}, "justified in saying": {"acc": 0.325, "acc_norm": 0.3225, "acc_norm_stderr": 0.013499258621103247, "acc_stderr": 0.013526454480351021, "subset": 3}}, "3": {"GPT-3 style": {"acc": 0.33166666666666667, "acc_norm": 0.3433333333333333, "acc_norm_stderr": 0.01371263383046586, "acc_stderr": 0.013596836729485164, "subset": 3}, "MNLI crowdsource": {"acc": 0.31833333333333336, "acc_norm": 0.3225, "acc_norm_stderr": 0.013499258621103247, "acc_stderr": 0.013452948996996296, "subset": 3}, "can we infer": {"acc": 0.3325, "acc_norm": 0.3258333333333333, "acc_norm_stderr": 0.01353542204341746, "acc_stderr": 0.013605417345710526, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3308333333333333, "acc_norm": 0.33, "acc_norm_stderr": 0.013579531277800923, "acc_stderr": 0.013588208070709007, "subset": 3}, "justified in saying": {"acc": 0.3408333333333333, "acc_norm": 0.32416666666666666, "acc_norm_stderr": 0.013517438120881629, "acc_stderr": 0.013688600793296936, "subset": 3}}, "4": {"GPT-3 style": {"acc": 0.32666666666666666, "acc_norm": 0.3383333333333333, "acc_norm_stderr": 0.013664144006618271, "acc_stderr": 0.013544340907003663, "subset": 3}, "MNLI crowdsource": {"acc": 0.31583333333333335, "acc_norm": 0.30333333333333334, "acc_norm_stderr": 0.013275870057740436, "acc_stderr": 0.01342456883035645, "subset": 3}, "can we infer": {"acc": 0.31583333333333335, "acc_norm": 0.3233333333333333, "acc_norm_stderr": 0.013508372867300222, "acc_stderr": 0.013424568830356453, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.33666666666666667, "acc_norm": 0.3416666666666667, "acc_norm_stderr": 0.013696658778002515, "acc_stderr": 0.013647602942406394, "subset": 3}, "justified in saying": {"acc": 0.3175, "acc_norm": 0.3275, "acc_norm_stderr": 0.01355321116725195, "acc_stderr": 0.013443538681348054, "subset": 3}}, "5": {"GPT-3 style": {"acc": 0.31916666666666665, "acc_norm": 0.3275, "acc_norm_stderr": 0.013553211167251939, "acc_stderr": 0.01346230971200513, "subset": 3}, "MNLI crowdsource": {"acc": 0.31, "acc_norm": 0.31333333333333335, "acc_norm_stderr": 0.013395739415639082, "acc_stderr": 0.01335659633120026, "subset": 3}, "can we infer": {"acc": 0.31166666666666665, "acc_norm": 0.32083333333333336, "acc_norm_stderr": 0.013480882752851555, "acc_stderr": 0.013376268790982096, "subset": 3}, "guaranteed/possible/impossible": {"acc": 0.3383333333333333, "acc_norm": 0.34, "acc_norm_stderr": 0.013680495725767792, "acc_stderr": 0.013664144006618263, "subset": 3}, "justified in saying": {"acc": 0.30833333333333335, "acc_norm": 0.31916666666666665, "acc_norm_stderr": 0.013462309712005129, "acc_stderr": 0.013336721143136467, "subset": 3}}}, "arc_easy": {"0": {"heres_a_problem": {"acc": 0.25, "acc_norm": 0.25, "acc_norm_stderr": 0.008885233166386385, "acc_stderr": 0.008885233166386385}, "i_am_hesitating": {"acc": 0.35395622895622897, "acc_norm": 0.3253367003367003, "acc_norm_stderr": 0.009613427708996196, "acc_stderr": 0.009812370644174421}, "multiple_choice": {"acc": 0.23378839590443687, "acc_norm": 0.26621160409556316, "acc_norm_stderr": 0.012915774781523224, "acc_stderr": 0.012368225378507135}, "pick_the_most_correct_option": {"acc": 0.24705387205387205, "acc_norm": 0.24705387205387205, "acc_norm_stderr": 0.008850055161459239, "acc_stderr": 0.008850055161459239}, "qa_options": {"acc": 0.26023890784982934, "acc_norm": 0.2935153583617747, "acc_norm_stderr": 0.01330725044494113, "acc_stderr": 0.012821930225112563}}, "1": {"heres_a_problem": {"acc": 0.24368686868686867, "acc_norm": 0.24368686868686867, "acc_norm_stderr": 0.00880917174472056, "acc_stderr": 0.00880917174472056}, "i_am_hesitating": {"acc": 0.3468013468013468, "acc_norm": 0.32112794612794615, "acc_norm_stderr": 0.009580787536986797, "acc_stderr": 0.009766326091716005}, "multiple_choice": {"acc": 0.3253367003367003, "acc_norm": 0.335016835016835, "acc_norm_stderr": 0.009685160765932363, "acc_stderr": 0.009613427708996185}, "pick_the_most_correct_option": {"acc": 0.2295221843003413, "acc_norm": 0.2295221843003413, "acc_norm_stderr": 0.012288926760890797, "acc_stderr": 0.012288926760890797}, "qa_options": {"acc": 0.3425925925925926, "acc_norm": 0.31523569023569026, "acc_norm_stderr": 0.009533589368505848, "acc_stderr": 0.009738105469984187}}, "2": {"heres_a_problem": {"acc": 0.2508532423208191, "acc_norm": 0.2508532423208191, "acc_norm_stderr": 0.01266819862131543, "acc_stderr": 0.01266819862131543}, "i_am_hesitating": {"acc": 0.3383838383838384, "acc_norm": 0.3223905723905724, "acc_norm_stderr": 0.009590672908157438, "acc_stderr": 0.009709034670525097}, "multiple_choice": {"acc": 0.351010101010101, "acc_norm": 0.35563973063973064, "acc_norm_stderr": 0.009822854395535489, "acc_stderr": 0.009793703885101047}, "pick_the_most_correct_option": {"acc": 0.24829351535836178, "acc_norm": 0.24829351535836178, "acc_norm_stderr": 0.012624912868089769, "acc_stderr": 0.012624912868089769}, "qa_options": {"acc": 0.335016835016835, "acc_norm": 0.3202861952861953, "acc_norm_stderr": 0.009574152668739424, "acc_stderr": 0.009685160765932357}}, "3": {"heres_a_problem": {"acc": 0.24915824915824916, "acc_norm": 0.24915824915824916, "acc_norm_stderr": 0.008875238553583164, "acc_stderr": 0.008875238553583164}, "i_am_hesitating": {"acc": 0.25170648464163825, "acc_norm": 0.2738907849829352, "acc_norm_stderr": 0.013032004972989501, "acc_stderr": 0.012682496334042967}, "multiple_choice": {"acc": 0.2380546075085324, "acc_norm": 0.25853242320819114, "acc_norm_stderr": 0.012794553754288679, "acc_stderr": 0.012445770028026206}, "pick_the_most_correct_option": {"acc": 0.25170648464163825, "acc_norm": 0.25170648464163825, "acc_norm_stderr": 0.012682496334042961, "acc_stderr": 0.012682496334042961}, "qa_options": {"acc": 0.3400673400673401, "acc_norm": 0.31902356902356904, "acc_norm_stderr": 0.009564133249441085, "acc_stderr": 0.009720765494805276}}, "4": {"heres_a_problem": {"acc": 0.24284511784511784, "acc_norm": 0.24284511784511784, "acc_norm_stderr": 0.008798836444222042, "acc_stderr": 0.008798836444222042}, "i_am_hesitating": {"acc": 0.3480639730639731, "acc_norm": 0.31986531986531985, "acc_norm_stderr": 0.009570821820573587, "acc_stderr": 0.009774627600259014}, "multiple_choice": {"acc": 0.24146757679180889, "acc_norm": 0.25341296928327645, "acc_norm_stderr": 0.012710896778378607, "acc_stderr": 0.012506564839739432}, "pick_the_most_correct_option": {"acc": 0.24284511784511784, "acc_norm": 0.24284511784511784, "acc_norm_stderr": 0.008798836444222037, "acc_stderr": 0.008798836444222037}, "qa_options": {"acc": 0.3367003367003367, "acc_norm": 0.3164983164983165, "acc_norm_stderr": 0.009543851857323888, "acc_stderr": 0.009697166595752472}}, "5": {"heres_a_problem": {"acc": 0.2431740614334471, "acc_norm": 0.2431740614334471, "acc_norm_stderr": 0.012536554144587092, "acc_stderr": 0.012536554144587092}, "i_am_hesitating": {"acc": 0.33880471380471383, "acc_norm": 0.3253367003367003, "acc_norm_stderr": 0.009613427708996189, "acc_stderr": 0.009711980224301643}, "multiple_choice": {"acc": 0.33796296296296297, "acc_norm": 0.3480639730639731, "acc_norm_stderr": 0.009774627600259012, "acc_stderr": 0.00970608053863286}, "pick_the_most_correct_option": {"acc": 0.25, "acc_norm": 0.25, "acc_norm_stderr": 0.008885233166386385, "acc_stderr": 0.008885233166386385}, "qa_options": {"acc": 0.25170648464163825, "acc_norm": 0.2627986348122867, "acc_norm_stderr": 0.012862523175351331, "acc_stderr": 0.01268249633404297}}}, "boolq": {"0": {"GPT-3 Style": {"acc": 0.5143333333333333, "acc_norm": 0.6296666666666667, "acc_norm_stderr": 0.008817866528166162, "acc_stderr": 0.009126478842204577}, "after_reading": {"acc": 0.6233333333333333, "acc_norm": 0.5203333333333333, "acc_norm_stderr": 0.009122678313140908, "acc_stderr": 0.00884811049411477}, "exercise": {"acc": 0.6236666666666667, "acc_norm": 0.6216666666666667, "acc_norm_stderr": 0.008855801251873015, "acc_stderr": 0.008846558976258922}, "valid_binary": {"acc": 0.5753333333333334, "acc_norm": 0.411, "acc_norm_stderr": 0.008984425782182318, "acc_stderr": 0.009026006087500425}, "yes_no_question": {"acc": 0.5276666666666666, "acc_norm": 0.6236666666666667, "acc_norm_stderr": 0.008846558976258922, "acc_stderr": 0.009116243039079383}}, "1": {"GPT-3 Style": {"acc": 0.493, "acc_norm": 0.5726666666666667, "acc_norm_stderr": 0.009033293159951217, "acc_stderr": 0.009129336317272389}, "after_reading": {"acc": 0.546, "acc_norm": 0.5413333333333333, "acc_norm_stderr": 0.009098980657278165, "acc_stderr": 0.009091509877386513}, "exercise": {"acc": 0.6096666666666667, "acc_norm": 0.5866666666666667, "acc_norm_stderr": 0.008992028793524417, "acc_stderr": 0.008907909838637953}, "valid_binary": {"acc": 0.5676666666666667, "acc_norm": 0.5483333333333333, "acc_norm_stderr": 0.009087472531749428, "acc_stderr": 0.009046234144187917}, "yes_no_question": {"acc": 0.5406666666666666, "acc_norm": 0.5406666666666666, "acc_norm_stderr": 0.009099982269204863, "acc_stderr": 0.009099982269204863}}, "2": {"GPT-3 Style": {"acc": 0.5063333333333333, "acc_norm": 0.5886666666666667, "acc_norm_stderr": 0.008985524690229492, "acc_stderr": 0.009129498646958133}, "after_reading": {"acc": 0.5836666666666667, "acc_norm": 0.5663333333333334, "acc_norm_stderr": 0.009049526374650793, "acc_stderr": 0.00900149831714761}, "exercise": {"acc": 0.6033333333333334, "acc_norm": 0.5933333333333334, "acc_norm_stderr": 0.008969751860881005, "acc_stderr": 0.008933122315228996}, "valid_binary": {"acc": 0.593, "acc_norm": 0.57, "acc_norm_stderr": 0.009040312075041279, "acc_stderr": 0.008970906255948529}, "yes_no_question": {"acc": 0.5303333333333333, "acc_norm": 0.5826666666666667, "acc_norm_stderr": 0.009004578551254038, "acc_stderr": 0.009113413981658816}}, "3": {"GPT-3 Style": {"acc": 0.528, "acc_norm": 0.5966666666666667, "acc_norm_stderr": 0.008957972256087354, "acc_stderr": 0.009115903679831517}, "after_reading": {"acc": 0.6116666666666667, "acc_norm": 0.5953333333333334, "acc_norm_stderr": 0.008962735560535848, "acc_stderr": 0.008899620943397692}, "exercise": {"acc": 0.6083333333333333, "acc_norm": 0.601, "acc_norm_stderr": 0.008942016171856509, "acc_stderr": 0.008913348354532979}, "valid_binary": {"acc": 0.6066666666666667, "acc_norm": 0.592, "acc_norm_stderr": 0.008974343780026192, "acc_stderr": 0.008920048383377177}, "yes_no_question": {"acc": 0.5283333333333333, "acc_norm": 0.5786666666666667, "acc_norm_stderr": 0.009016519157880409, "acc_stderr": 0.009115560243539187}}, "4": {"GPT-3 Style": {"acc": 0.531, "acc_norm": 0.6066666666666667, "acc_norm_stderr": 0.008920048383377182, "acc_stderr": 0.009112665923139411}, "after_reading": {"acc": 0.6136666666666667, "acc_norm": 0.6006666666666667, "acc_norm_stderr": 0.008943269429955153, "acc_stderr": 0.008891174310695492}, "exercise": {"acc": 0.6133333333333333, "acc_norm": 0.607, "acc_norm_stderr": 0.00891871708850756, "acc_stderr": 0.008892593055774285}, "valid_binary": {"acc": 0.614, "acc_norm": 0.6006666666666667, "acc_norm_stderr": 0.008943269429955157, "acc_stderr": 0.008889751171543848}, "yes_no_question": {"acc": 0.5186666666666667, "acc_norm": 0.5753333333333334, "acc_norm_stderr": 0.009026006087500427, "acc_stderr": 0.009123866148533357}}, "5": {"GPT-3 Style": {"acc": 0.5486666666666666, "acc_norm": 0.6083333333333333, "acc_norm_stderr": 0.008913348354532974, "acc_stderr": 0.00908687931270849}, "after_reading": {"acc": 0.6126666666666667, "acc_norm": 0.603, "acc_norm_stderr": 0.00893440584870012, "acc_stderr": 0.008895417372116209}, "exercise": {"acc": 0.6183333333333333, "acc_norm": 0.606, "acc_norm_stderr": 0.00892269792043816, "acc_stderr": 0.008870849530787627}, "valid_binary": {"acc": 0.6123333333333333, "acc_norm": 0.601, "acc_norm_stderr": 0.008942016171856509, "acc_stderr": 0.008896822947561613}, "yes_no_question": {"acc": 0.5196666666666667, "acc_norm": 0.582, "acc_norm_stderr": 0.009006610887558775, "acc_stderr": 0.00912316564893404}}}, "cb": {"0": {"GPT-3 style": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.22456964006259783}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359538, "f1": 0.1940928270042194}, "can we infer": {"acc": 0.5357142857142857, "acc_stderr": 0.06724777654937658, "f1": 0.4156746031746032}, "guaranteed/possible/impossible": {"acc": 0.10714285714285714, "acc_stderr": 0.0417053005800816, "f1": 0.10352728047740835}, "justified in saying": {"acc": 0.5178571428571429, "acc_stderr": 0.06737697508644648, "f1": 0.4046085858585858}}, "1": {"GPT-3 style": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27045454545454545}, "MNLI crowdsource": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}, "guaranteed/possible/impossible": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.27858293075684376}, "justified in saying": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2842025699168556}}, "2": {"GPT-3 style": {"acc": 0.35714285714285715, "acc_stderr": 0.0646095738380922, "f1": 0.2613756613756614}, "MNLI crowdsource": {"acc": 0.4642857142857143, "acc_stderr": 0.0672477765493766, "f1": 0.31979092421002614}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2794380587484036}, "guaranteed/possible/impossible": {"acc": 0.25, "acc_stderr": 0.058387420812114225, "f1": 0.21626712849026222}, "justified in saying": {"acc": 0.39285714285714285, "acc_stderr": 0.0658538889806635, "f1": 0.2794380587484036}}, "3": {"GPT-3 style": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2489177489177489}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359541, "f1": 0.2887426900584795}, "can we infer": {"acc": 0.39285714285714285, "acc_stderr": 0.06585388898066351, "f1": 0.2772108843537415}, "guaranteed/possible/impossible": {"acc": 0.14285714285714285, "acc_stderr": 0.0471841613625583, "f1": 0.13505848989719957}, "justified in saying": {"acc": 0.375, "acc_stderr": 0.06527912098338669, "f1": 0.2596413657577991}}, "4": {"GPT-3 style": {"acc": 0.32142857142857145, "acc_stderr": 0.06297362289056341, "f1": 0.23462970093697855}, "MNLI crowdsource": {"acc": 0.42857142857142855, "acc_stderr": 0.06672848092813058, "f1": 0.301994301994302}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.31333333333333335}, "guaranteed/possible/impossible": {"acc": 0.10714285714285714, "acc_stderr": 0.0417053005800816, "f1": 0.09963985594237694}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3173681664247702}}, "5": {"GPT-3 style": {"acc": 0.2857142857142857, "acc_stderr": 0.06091449038731725, "f1": 0.25051020408163266}, "MNLI crowdsource": {"acc": 0.4107142857142857, "acc_stderr": 0.06633634150359542, "f1": 0.28651292802236195}, "can we infer": {"acc": 0.44642857142857145, "acc_stderr": 0.06703189227942398, "f1": 0.3113354970549345}, "guaranteed/possible/impossible": {"acc": 0.14285714285714285, "acc_stderr": 0.04718416136255829, "f1": 0.14384662956091526}, "justified in saying": {"acc": 0.44642857142857145, "acc_stderr": 0.067031892279424, "f1": 0.3143399810066477}}}, "copa": {"0": {"best_option": {"acc": 0.6, "acc_norm": 0.49, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.049236596391733084}, "cause_effect": {"acc": 0.54, "acc_norm": 0.48, "acc_norm_stderr": 0.050211673156867795, "acc_stderr": 0.05009082659620332}, "choose": {"acc": 0.58, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.049604496374885836}, "i_am_hesitating": {"acc": 0.54, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.05009082659620333}, "plausible_alternatives": {"acc": 0.54, "acc_norm": 0.51, "acc_norm_stderr": 0.05024183937956912, "acc_stderr": 0.05009082659620333}}, "1": {"best_option": {"acc": 0.53, "acc_norm": 0.45, "acc_norm_stderr": 0.05, "acc_stderr": 0.05016135580465919}, "cause_effect": {"acc": 0.42, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.049604496374885836}, "choose": {"acc": 0.44, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102, "acc_stderr": 0.04988876515698589}, "i_am_hesitating": {"acc": 0.43, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102, "acc_stderr": 0.04975698519562428}, "plausible_alternatives": {"acc": 0.45, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.04999999999999999}}, "2": {"best_option": {"acc": 0.63, "acc_norm": 0.58, "acc_norm_stderr": 0.04960449637488584, "acc_stderr": 0.04852365870939099}, "cause_effect": {"acc": 0.44, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.04988876515698589}, "choose": {"acc": 0.4, "acc_norm": 0.39, "acc_norm_stderr": 0.04902071300001974, "acc_stderr": 0.049236596391733084}, "i_am_hesitating": {"acc": 0.41, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.04943110704237102}, "plausible_alternatives": {"acc": 0.42, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.049604496374885836}}, "3": {"best_option": {"acc": 0.6, "acc_norm": 0.62, "acc_norm_stderr": 0.048783173121456316, "acc_stderr": 0.049236596391733084}, "cause_effect": {"acc": 0.44, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.04988876515698589}, "choose": {"acc": 0.39, "acc_norm": 0.41, "acc_norm_stderr": 0.04943110704237102, "acc_stderr": 0.04902071300001974}, "i_am_hesitating": {"acc": 0.44, "acc_norm": 0.43, "acc_norm_stderr": 0.049756985195624284, "acc_stderr": 0.04988876515698589}, "plausible_alternatives": {"acc": 0.44, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.04988876515698589}}, "4": {"best_option": {"acc": 0.62, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102, "acc_stderr": 0.04878317312145632}, "cause_effect": {"acc": 0.45, "acc_norm": 0.46, "acc_norm_stderr": 0.05009082659620332, "acc_stderr": 0.04999999999999999}, "choose": {"acc": 0.41, "acc_norm": 0.42, "acc_norm_stderr": 0.049604496374885836, "acc_stderr": 0.04943110704237102}, "i_am_hesitating": {"acc": 0.43, "acc_norm": 0.45, "acc_norm_stderr": 0.04999999999999999, "acc_stderr": 0.049756985195624284}, "plausible_alternatives": {"acc": 0.44, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.04988876515698589}}, "5": {"best_option": {"acc": 0.58, "acc_norm": 0.59, "acc_norm_stderr": 0.04943110704237102, "acc_stderr": 0.049604496374885836}, "cause_effect": {"acc": 0.47, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.05016135580465919}, "choose": {"acc": 0.44, "acc_norm": 0.44, "acc_norm_stderr": 0.04988876515698589, "acc_stderr": 0.04988876515698589}, "i_am_hesitating": {"acc": 0.48, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.050211673156867795}, "plausible_alternatives": {"acc": 0.46, "acc_norm": 0.47, "acc_norm_stderr": 0.05016135580465919, "acc_stderr": 0.05009082659620332}}}, "e2e_nlg_cleaned": {"0": {"coherent_text": {"bleu": 2.7427887623503127, "bleu_stderr": 0.05495550525224015, "rouge1_fmeasure": 0.22723646490231306, "rouge1_fmeasure_stderr": 0.0015666486705066473, "rouge1_precision": 0.18437430718819275, "rouge1_precision_stderr": 0.0016081919163123087, "rouge1_recall": 0.3251983558888638, "rouge1_recall_stderr": 0.0020138546152268445, "rouge2_fmeasure": 0.06946399430025461, "rouge2_fmeasure_stderr": 0.0009869119968554082, "rouge2_precision": 0.056050363270124, "rouge2_precision_stderr": 0.0008590677097817941, "rouge2_recall": 0.10050481850962295, "rouge2_recall_stderr": 0.0014187911826345318, "rougeL_fmeasure": 0.18067137808834505, "rougeL_fmeasure_stderr": 0.0012648308698162281, "rougeL_precision": 0.1456094405733479, "rougeL_precision_stderr": 0.0012316335261671133, "rougeL_recall": 0.26113451142460736, "rougeL_recall_stderr": 0.0017752376482140586, "rougeLsum_fmeasure": 0.19654380094534718, "rougeLsum_fmeasure_stderr": 0.0013999134084878718, "rougeLsum_precision": 0.15934149267059367, "rougeLsum_precision_stderr": 0.0014098113104567104, "rougeLsum_recall": 0.2813363867963323, "rougeLsum_recall_stderr": 0.0018136711023303616}, "create_text_for_me": {"bleu": 0.8753334130290518, "bleu_stderr": 0.0348620294223961, "rouge1_fmeasure": 0.16280348551957885, "rouge1_fmeasure_stderr": 0.001309447781869726, "rouge1_precision": 0.12180299903557583, "rouge1_precision_stderr": 0.0011144789606423104, "rouge1_recall": 0.2607151270005577, "rouge1_recall_stderr": 0.001833065327541976, "rouge2_fmeasure": 0.022387020564367744, "rouge2_fmeasure_stderr": 0.0006945772086659809, "rouge2_precision": 0.01683964413978754, "rouge2_precision_stderr": 0.0005356789210692077, "rouge2_recall": 0.0356320865210295, "rouge2_recall_stderr": 0.0011364294225061037, "rougeL_fmeasure": 0.140452983092435, "rougeL_fmeasure_stderr": 0.0010520758669989665, "rougeL_precision": 0.1047466751047272, "rougeL_precision_stderr": 0.0008761170348141068, "rougeL_recall": 0.226582446043322, "rougeL_recall_stderr": 0.0015936684448292061, "rougeLsum_fmeasure": 0.13916001785403956, "rougeLsum_fmeasure_stderr": 0.0011384861820446248, "rougeLsum_precision": 0.10382518824250553, "rougeLsum_precision_stderr": 0.0009416399281524375, "rougeLsum_recall": 0.2242636517948043, "rougeLsum_recall_stderr": 0.0017031410291911053}, "generate_gramatically_correct_text": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0015622118728288864, "rouge1_fmeasure_stderr": 0.00025395262482497686, "rouge1_precision": 0.013, "rouge1_precision_stderr": 0.0020684356751050448, "rouge1_recall": 0.0008334849399141362, "rouge1_recall_stderr": 0.0001358503424917359, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.0015622118728288864, "rougeL_fmeasure_stderr": 0.00025395262482497686, "rougeL_precision": 0.013, "rougeL_precision_stderr": 0.0020684356751050448, "rougeL_recall": 0.0008334849399141362, "rougeL_recall_stderr": 0.0001358503424917359, "rougeLsum_fmeasure": 0.0015622118728288864, "rougeLsum_fmeasure_stderr": 0.00025395262482497686, "rougeLsum_precision": 0.013, "rougeLsum_precision_stderr": 0.0020684356751050448, "rougeLsum_recall": 0.0008334849399141362, "rougeLsum_recall_stderr": 0.0001358503424917359}, "generate_text_restaurant": {"bleu": 3.3868925634521596, "bleu_stderr": 0.07415023515490869, "rouge1_fmeasure": 0.16968795558812888, "rouge1_fmeasure_stderr": 0.0020520636735140885, "rouge1_precision": 0.12781588961197554, "rouge1_precision_stderr": 0.0016392738386146906, "rouge1_recall": 0.26686211195039367, "rouge1_recall_stderr": 0.0030010672113457175, "rouge2_fmeasure": 0.06289750165250287, "rouge2_fmeasure_stderr": 0.0013193385142581298, "rouge2_precision": 0.04754240795799523, "rouge2_precision_stderr": 0.001014281868832704, "rouge2_recall": 0.09732440743600192, "rouge2_recall_stderr": 0.002035877095135508, "rougeL_fmeasure": 0.15745602394794062, "rougeL_fmeasure_stderr": 0.0017868944105710683, "rougeL_precision": 0.11843660338175163, "rougeL_precision_stderr": 0.0014270824435811313, "rougeL_recall": 0.24837983245494402, "rougeL_recall_stderr": 0.0026307193382184542, "rougeLsum_fmeasure": 0.14802696104042146, "rougeLsum_fmeasure_stderr": 0.0018361037263776232, "rougeLsum_precision": 0.11138415788379845, "rougeLsum_precision_stderr": 0.0014602010171592388, "rougeLsum_recall": 0.23362654109870873, "rougeLsum_recall_stderr": 0.0027185132812130603}, "text": {"bleu": 2.0677053326576096, "bleu_stderr": 0.06271134039957105, "rouge1_fmeasure": 0.1395687520228349, "rouge1_fmeasure_stderr": 0.0026646833399520944, "rouge1_precision": 0.1095707231662833, "rouge1_precision_stderr": 0.0021591140734212957, "rouge1_recall": 0.20802103992679305, "rouge1_recall_stderr": 0.003979629734649372, "rouge2_fmeasure": 0.05547805696954945, "rouge2_fmeasure_stderr": 0.001212976335572, "rouge2_precision": 0.043273009183308724, "rouge2_precision_stderr": 0.0009648056549439121, "rouge2_recall": 0.084551914403829, "rouge2_recall_stderr": 0.0019148621747591432, "rougeL_fmeasure": 0.11040718684668492, "rougeL_fmeasure_stderr": 0.002066401071200146, "rougeL_precision": 0.08666935739550846, "rougeL_precision_stderr": 0.0016576469738929047, "rougeL_recall": 0.16514211361615944, "rougeL_recall_stderr": 0.0031559470820256225, "rougeLsum_fmeasure": 0.12059646907618253, "rougeLsum_fmeasure_stderr": 0.0023117453399075457, "rougeLsum_precision": 0.09469111770984258, "rougeLsum_precision_stderr": 0.0018586644683897386, "rougeLsum_recall": 0.17996098817894363, "rougeLsum_recall_stderr": 0.003488975907287554}}, "1": {"coherent_text": {"bleu": 5.501660579699312, "bleu_stderr": 0.059490555517424666, "rouge1_fmeasure": 0.3871051829172928, "rouge1_fmeasure_stderr": 0.001953296778601808, "rouge1_precision": 0.3112584786759358, "rouge1_precision_stderr": 0.001907448658666166, "rouge1_recall": 0.5491290672753819, "rouge1_recall_stderr": 0.002745962690555782, "rouge2_fmeasure": 0.1605728054943973, "rouge2_fmeasure_stderr": 0.001412706883124633, "rouge2_precision": 0.12796428764311324, "rouge2_precision_stderr": 0.0012052147710974312, "rouge2_recall": 0.2325200457741587, "rouge2_recall_stderr": 0.002130744747415604, "rougeL_fmeasure": 0.27100478266727807, "rougeL_fmeasure_stderr": 0.0013681493938654708, "rougeL_precision": 0.21648135721974734, "rougeL_precision_stderr": 0.0012587276826098216, "rougeL_recall": 0.38988924141191994, "rougeL_recall_stderr": 0.002309000377521498, "rougeLsum_fmeasure": 0.31806770177523574, "rougeLsum_fmeasure_stderr": 0.0018173275489533744, "rougeLsum_precision": 0.25574830221838724, "rougeLsum_precision_stderr": 0.0017155621473719167, "rougeLsum_recall": 0.45162452980561607, "rougeLsum_recall_stderr": 0.002604535523838573}, "create_text_for_me": {"bleu": 5.721366546161991, "bleu_stderr": 0.059879890672636354, "rouge1_fmeasure": 0.37872230003166957, "rouge1_fmeasure_stderr": 0.0017386604468687269, "rouge1_precision": 0.29934120585636687, "rouge1_precision_stderr": 0.00167831396994261, "rouge1_recall": 0.5469683642207921, "rouge1_recall_stderr": 0.0025082833435656686, "rouge2_fmeasure": 0.15944824307809596, "rouge2_fmeasure_stderr": 0.0013532088062737225, "rouge2_precision": 0.12496074032642647, "rouge2_precision_stderr": 0.00112594043655694, "rouge2_recall": 0.23529069358454732, "rouge2_recall_stderr": 0.0021079015900675175, "rougeL_fmeasure": 0.2590184966950011, "rougeL_fmeasure_stderr": 0.0013377464931078383, "rougeL_precision": 0.2036609095228398, "rougeL_precision_stderr": 0.0011853406607828087, "rougeL_recall": 0.37837767528899835, "rougeL_recall_stderr": 0.0022449611252738045, "rougeLsum_fmeasure": 0.31560678490975924, "rougeLsum_fmeasure_stderr": 0.0016501630567473267, "rougeLsum_precision": 0.24947318226454393, "rougeLsum_precision_stderr": 0.0015365533884932646, "rougeLsum_recall": 0.4560568485237131, "rougeLsum_recall_stderr": 0.0024137023011478282}, "generate_gramatically_correct_text": {"bleu": 0.5565042233022994, "bleu_stderr": 0.06597968521354808, "rouge1_fmeasure": 0.07140698262139507, "rouge1_fmeasure_stderr": 0.0028371334110872675, "rouge1_precision": 0.08593898665872673, "rouge1_precision_stderr": 0.003701585604317789, "rouge1_recall": 0.0887838872066671, "rouge1_recall_stderr": 0.003666707231082007, "rouge2_fmeasure": 0.028616212681722628, "rouge2_fmeasure_stderr": 0.0012853116962859368, "rouge2_precision": 0.024892824112996058, "rouge2_precision_stderr": 0.0011425344587644847, "rouge2_recall": 0.0371764329986735, "rouge2_recall_stderr": 0.0017241156866971928, "rougeL_fmeasure": 0.048987083153452016, "rougeL_fmeasure_stderr": 0.0019444092164217727, "rougeL_precision": 0.06606011638251028, "rougeL_precision_stderr": 0.0032423797227769642, "rougeL_recall": 0.06093221366523217, "rougeL_recall_stderr": 0.002567923224143668, "rougeLsum_fmeasure": 0.059745065640798484, "rougeLsum_fmeasure_stderr": 0.0023782780189300344, "rougeLsum_precision": 0.07579156901424648, "rougeLsum_precision_stderr": 0.0034585652796348012, "rougeLsum_recall": 0.07390893983809439, "rougeLsum_recall_stderr": 0.003072194012262202}, "generate_text_restaurant": {"bleu": 5.656637998489558, "bleu_stderr": 0.06138640019754124, "rouge1_fmeasure": 0.29857293628895076, "rouge1_fmeasure_stderr": 0.0017883517178428313, "rouge1_precision": 0.22448431958439818, "rouge1_precision_stderr": 0.0015774519438982097, "rouge1_recall": 0.475937695786901, "rouge1_recall_stderr": 0.002823854240730373, "rouge2_fmeasure": 0.12449829406834531, "rouge2_fmeasure_stderr": 0.0012625501065363337, "rouge2_precision": 0.09278798103068853, "rouge2_precision_stderr": 0.0010182785337770847, "rouge2_recall": 0.2039364568886483, "rouge2_recall_stderr": 0.0021426616098330804, "rougeL_fmeasure": 0.2478615356783164, "rougeL_fmeasure_stderr": 0.0013407338809586194, "rougeL_precision": 0.18571244490415678, "rougeL_precision_stderr": 0.0011675141052374309, "rougeL_recall": 0.3980265502028155, "rougeL_recall_stderr": 0.0023515010622215645, "rougeLsum_fmeasure": 0.24278851384210426, "rougeLsum_fmeasure_stderr": 0.0016843478489220692, "rougeLsum_precision": 0.18244634491896064, "rougeLsum_precision_stderr": 0.001437571751990845, "rougeLsum_recall": 0.3876575760439037, "rougeLsum_recall_stderr": 0.002699802672015727}, "text": {"bleu": 6.776339014367035, "bleu_stderr": 0.08648488550482673, "rouge1_fmeasure": 0.4455306771983514, "rouge1_fmeasure_stderr": 0.00201463745384298, "rouge1_precision": 0.3689526387645257, "rouge1_precision_stderr": 0.0021333449754014004, "rouge1_recall": 0.5958464926870726, "rouge1_recall_stderr": 0.002476964048840286, "rouge2_fmeasure": 0.19999728868621525, "rouge2_fmeasure_stderr": 0.0015176461844321897, "rouge2_precision": 0.16443602410248034, "rouge2_precision_stderr": 0.001373572653978035, "rouge2_recall": 0.2723125733003977, "rouge2_recall_stderr": 0.002120174112140588, "rougeL_fmeasure": 0.31215943868340223, "rougeL_fmeasure_stderr": 0.0015312730015625377, "rougeL_precision": 0.25666663991049066, "rougeL_precision_stderr": 0.00146996338508828, "rougeL_recall": 0.423240490787575, "rougeL_recall_stderr": 0.0023419654403491713, "rougeLsum_fmeasure": 0.37009049888168155, "rougeLsum_fmeasure_stderr": 0.0019204542565398291, "rougeLsum_precision": 0.3062178505891934, "rougeLsum_precision_stderr": 0.001925082430561453, "rougeLsum_recall": 0.4957583957629278, "rougeLsum_recall_stderr": 0.0024755678177244595}}, "2": {"coherent_text": {"bleu": 6.324394303872813, "bleu_stderr": 0.10867005579792703, "rouge1_fmeasure": 0.4051242312515321, "rouge1_fmeasure_stderr": 0.0018087304910981952, "rouge1_precision": 0.33094397240746043, "rouge1_precision_stderr": 0.0018564220392525137, "rouge1_recall": 0.5631788704398346, "rouge1_recall_stderr": 0.002636056309828146, "rouge2_fmeasure": 0.177921888015793, "rouge2_fmeasure_stderr": 0.0014368818296271887, "rouge2_precision": 0.14418060422905016, "rouge2_precision_stderr": 0.0012689215486412471, "rouge2_recall": 0.25265308621087024, "rouge2_recall_stderr": 0.0021790130347415517, "rougeL_fmeasure": 0.2822165267320512, "rougeL_fmeasure_stderr": 0.0013886080174190089, "rougeL_precision": 0.2297030175226026, "rougeL_precision_stderr": 0.001353128997875971, "rougeL_recall": 0.39650377304869283, "rougeL_recall_stderr": 0.0023226005472347976, "rougeLsum_fmeasure": 0.34056029442652097, "rougeLsum_fmeasure_stderr": 0.0017186859749549793, "rougeLsum_precision": 0.278170389520455, "rougeLsum_precision_stderr": 0.0017013879950176962, "rougeLsum_recall": 0.47403743423884753, "rougeLsum_recall_stderr": 0.0025306580301756973}, "create_text_for_me": {"bleu": 6.535547229648224, "bleu_stderr": 0.07998247245957349, "rouge1_fmeasure": 0.38702008716988656, "rouge1_fmeasure_stderr": 0.0017370872647159401, "rouge1_precision": 0.3043393031178695, "rouge1_precision_stderr": 0.0016987372631724537, "rouge1_recall": 0.564230482567467, "rouge1_recall_stderr": 0.0024769176130864756, "rouge2_fmeasure": 0.17045094780052067, "rouge2_fmeasure_stderr": 0.0014026421153863727, "rouge2_precision": 0.1329353387981719, "rouge2_precision_stderr": 0.0011973152327962053, "rouge2_recall": 0.2547108199869813, "rouge2_recall_stderr": 0.002222655965723188, "rougeL_fmeasure": 0.2694273591107714, "rougeL_fmeasure_stderr": 0.0013460707667909964, "rougeL_precision": 0.21068603957785997, "rougeL_precision_stderr": 0.001216562219549965, "rougeL_recall": 0.3978790249309968, "rougeL_recall_stderr": 0.002297088221037071, "rougeLsum_fmeasure": 0.3262488238812611, "rougeLsum_fmeasure_stderr": 0.0016716617044544048, "rougeLsum_precision": 0.2565183996017009, "rougeLsum_precision_stderr": 0.0015725570964798953, "rougeLsum_recall": 0.47610471055377285, "rougeLsum_recall_stderr": 0.0024377798420915993}, "generate_gramatically_correct_text": {"bleu": 4.952955446585565, "bleu_stderr": 0.14703261112875554, "rouge1_fmeasure": 0.16985011321499935, "rouge1_fmeasure_stderr": 0.003824921158615819, "rouge1_precision": 0.1775561426910298, "rouge1_precision_stderr": 0.004354374307061146, "rouge1_recall": 0.2112636167055307, "rouge1_recall_stderr": 0.004916752832733977, "rouge2_fmeasure": 0.07112804192230661, "rouge2_fmeasure_stderr": 0.0018368523862338794, "rouge2_precision": 0.061741721519856914, "rouge2_precision_stderr": 0.0016305676991503655, "rouge2_recall": 0.09131710409720983, "rouge2_recall_stderr": 0.00243568525413484, "rougeL_fmeasure": 0.11698574747654279, "rougeL_fmeasure_stderr": 0.002667973913528021, "rougeL_precision": 0.13080469299969233, "rougeL_precision_stderr": 0.003698218373012542, "rougeL_recall": 0.14612619148208336, "rougeL_recall_stderr": 0.003515094714410421, "rougeLsum_fmeasure": 0.14129352369422737, "rougeLsum_fmeasure_stderr": 0.0032206138029846256, "rougeLsum_precision": 0.15254844160782838, "rougeLsum_precision_stderr": 0.0040016749319373825, "rougeLsum_recall": 0.175513921363672, "rougeLsum_recall_stderr": 0.0041508619445660075}, "generate_text_restaurant": {"bleu": 6.856901680068561, "bleu_stderr": 0.08124041054587171, "rouge1_fmeasure": 0.32161895837988735, "rouge1_fmeasure_stderr": 0.00168146427395452, "rouge1_precision": 0.2401800477102809, "rouge1_precision_stderr": 0.0014473576335462571, "rouge1_recall": 0.5163816757468545, "rouge1_recall_stderr": 0.0027490552702758896, "rouge2_fmeasure": 0.14591205568832014, "rouge2_fmeasure_stderr": 0.001294685527449689, "rouge2_precision": 0.10780300493629737, "rouge2_precision_stderr": 0.0010069472486850257, "rouge2_recall": 0.2413921319510364, "rouge2_recall_stderr": 0.002257493837016372, "rougeL_fmeasure": 0.2704562548681343, "rougeL_fmeasure_stderr": 0.0013038206676934103, "rougeL_precision": 0.20140298531801745, "rougeL_precision_stderr": 0.0011006514010170898, "rougeL_recall": 0.43694912141494435, "rougeL_recall_stderr": 0.002368493512084033, "rougeLsum_fmeasure": 0.2651360409168281, "rougeLsum_fmeasure_stderr": 0.00165326573625161, "rougeLsum_precision": 0.19780238487026972, "rougeLsum_precision_stderr": 0.0013643352859564666, "rougeLsum_recall": 0.42666941865224894, "rougeLsum_recall_stderr": 0.0027363306527661384}, "text": {"bleu": 6.956869478198272, "bleu_stderr": 0.08706326939132922, "rouge1_fmeasure": 0.4357140603298011, "rouge1_fmeasure_stderr": 0.0020471026571017327, "rouge1_precision": 0.35860576976732106, "rouge1_precision_stderr": 0.0021204385851447834, "rouge1_recall": 0.5877143657823972, "rouge1_recall_stderr": 0.0025365074511194768, "rouge2_fmeasure": 0.19727207437417654, "rouge2_fmeasure_stderr": 0.0015716203234345902, "rouge2_precision": 0.1610875918591629, "rouge2_precision_stderr": 0.0013996599831001433, "rouge2_recall": 0.2714598427748779, "rouge2_recall_stderr": 0.0022353847705593125, "rougeL_fmeasure": 0.30752457963236757, "rougeL_fmeasure_stderr": 0.0015398007381628844, "rougeL_precision": 0.25147400899840344, "rougeL_precision_stderr": 0.0014714723008184948, "rougeL_recall": 0.42019871172862305, "rougeL_recall_stderr": 0.0023315626390538035, "rougeLsum_fmeasure": 0.3671539275155354, "rougeLsum_fmeasure_stderr": 0.0019677103053781135, "rougeLsum_precision": 0.30198591705675015, "rougeLsum_precision_stderr": 0.001941444722568633, "rougeLsum_recall": 0.49589524688238196, "rougeLsum_recall_stderr": 0.002550990119421592}}, "3": {"coherent_text": {"bleu": 7.023795509429225, "bleu_stderr": 0.1045133219407453, "rouge1_fmeasure": 0.40121542494211165, "rouge1_fmeasure_stderr": 0.001826481389337569, "rouge1_precision": 0.3281077630214524, "rouge1_precision_stderr": 0.0018892313549911306, "rouge1_recall": 0.5565110906097839, "rouge1_recall_stderr": 0.002632763132900043, "rouge2_fmeasure": 0.1797184980766115, "rouge2_fmeasure_stderr": 0.0014978605293431552, "rouge2_precision": 0.14595032198106483, "rouge2_precision_stderr": 0.001332168833443709, "rouge2_recall": 0.25433155482778347, "rouge2_recall_stderr": 0.0022419507977446606, "rougeL_fmeasure": 0.28212439353399954, "rougeL_fmeasure_stderr": 0.0014319999333914978, "rougeL_precision": 0.23006830678751594, "rougeL_precision_stderr": 0.001412084408622655, "rougeL_recall": 0.394784182200868, "rougeL_recall_stderr": 0.0023087386618001874, "rougeLsum_fmeasure": 0.3404036668863611, "rougeLsum_fmeasure_stderr": 0.0017688250393426252, "rougeLsum_precision": 0.2784043771811358, "rougeLsum_precision_stderr": 0.0017584082669023667, "rougeLsum_recall": 0.4725249414865908, "rougeLsum_recall_stderr": 0.002566203128742103}, "create_text_for_me": {"bleu": 6.816658602745218, "bleu_stderr": 0.04583016968843249, "rouge1_fmeasure": 0.3878861536787146, "rouge1_fmeasure_stderr": 0.0017017932936630189, "rouge1_precision": 0.30377319756142107, "rouge1_precision_stderr": 0.0016392467233435767, "rouge1_recall": 0.5686273285167529, "rouge1_recall_stderr": 0.002510804564764257, "rouge2_fmeasure": 0.1725725568885113, "rouge2_fmeasure_stderr": 0.0014322789151160747, "rouge2_precision": 0.13391657548983904, "rouge2_precision_stderr": 0.0011760375900396684, "rouge2_recall": 0.25896717520218415, "rouge2_recall_stderr": 0.0022974650442523993, "rougeL_fmeasure": 0.2705133936819741, "rougeL_fmeasure_stderr": 0.001354231331623015, "rougeL_precision": 0.2108908717196471, "rougeL_precision_stderr": 0.0011962828612559094, "rougeL_recall": 0.4006206317458079, "rougeL_recall_stderr": 0.002304730683692073, "rougeLsum_fmeasure": 0.3286981074887459, "rougeLsum_fmeasure_stderr": 0.0016837415400218188, "rougeLsum_precision": 0.25737446644019807, "rougeLsum_precision_stderr": 0.0015454274177506696, "rougeLsum_recall": 0.4820507894942288, "rougeLsum_recall_stderr": 0.002515972922796953}, "generate_gramatically_correct_text": {"bleu": 8.035933183438738, "bleu_stderr": 0.16111218297605656, "rouge1_fmeasure": 0.23676635522395745, "rouge1_fmeasure_stderr": 0.003969984523240725, "rouge1_precision": 0.23042023299400893, "rouge1_precision_stderr": 0.004205272460982702, "rouge1_recall": 0.2950691553359754, "rouge1_recall_stderr": 0.005135544471290643, "rouge2_fmeasure": 0.10309288089148716, "rouge2_fmeasure_stderr": 0.0020209463742805306, "rouge2_precision": 0.08901699667028115, "rouge2_precision_stderr": 0.0017692731972765087, "rouge2_recall": 0.13239068456120945, "rouge2_recall_stderr": 0.002720773239274303, "rougeL_fmeasure": 0.16447844130188313, "rougeL_fmeasure_stderr": 0.0028207459523148914, "rougeL_precision": 0.16676542398506278, "rougeL_precision_stderr": 0.0035028714409443473, "rougeL_recall": 0.20592751042876095, "rougeL_recall_stderr": 0.0037381203986915166, "rougeLsum_fmeasure": 0.19647057857599984, "rougeLsum_fmeasure_stderr": 0.00337544239864232, "rougeLsum_precision": 0.19511953619785463, "rougeLsum_precision_stderr": 0.003830263451296712, "rougeLsum_recall": 0.24484238681845968, "rougeLsum_recall_stderr": 0.004384904615208511}, "generate_text_restaurant": {"bleu": 7.067980420028392, "bleu_stderr": 0.07674666618798165, "rouge1_fmeasure": 0.3234918087831591, "rouge1_fmeasure_stderr": 0.0016866852215600267, "rouge1_precision": 0.24119330473410452, "rouge1_precision_stderr": 0.001436265091489619, "rouge1_recall": 0.520447176584629, "rouge1_recall_stderr": 0.0027841731171524635, "rouge2_fmeasure": 0.14908085018598377, "rouge2_fmeasure_stderr": 0.001325863965517803, "rouge2_precision": 0.1098856604345649, "rouge2_precision_stderr": 0.0010230282133462645, "rouge2_recall": 0.24765707981063853, "rouge2_recall_stderr": 0.002344687079258296, "rougeL_fmeasure": 0.2728226991547497, "rougeL_fmeasure_stderr": 0.001319038570084664, "rougeL_precision": 0.2028877822846739, "rougeL_precision_stderr": 0.00110597560698619, "rougeL_recall": 0.44168375010036015, "rougeL_recall_stderr": 0.0024210245845319865, "rougeLsum_fmeasure": 0.2684669151387224, "rougeLsum_fmeasure_stderr": 0.0016553181450139245, "rougeLsum_precision": 0.19995751457276323, "rougeLsum_precision_stderr": 0.0013597786817193271, "rougeLsum_recall": 0.4331317668864426, "rougeLsum_recall_stderr": 0.002771288232621466}, "text": {"bleu": 7.009191295391601, "bleu_stderr": 0.07880823868019797, "rouge1_fmeasure": 0.43148059444634873, "rouge1_fmeasure_stderr": 0.0020169750963498836, "rouge1_precision": 0.3545298481996187, "rouge1_precision_stderr": 0.002080164442210043, "rouge1_recall": 0.5832652468397247, "rouge1_recall_stderr": 0.0025434678918276176, "rouge2_fmeasure": 0.1964238350803286, "rouge2_fmeasure_stderr": 0.0015986177875609523, "rouge2_precision": 0.16017594753225423, "rouge2_precision_stderr": 0.0014169660528143824, "rouge2_recall": 0.2708031087813346, "rouge2_recall_stderr": 0.0022903011042423914, "rougeL_fmeasure": 0.30412703782905015, "rougeL_fmeasure_stderr": 0.0015615998587158336, "rougeL_precision": 0.24852387761819955, "rougeL_precision_stderr": 0.001485017547336716, "rougeL_recall": 0.4157032927811326, "rougeL_recall_stderr": 0.002345334628732105, "rougeLsum_fmeasure": 0.3640982827070694, "rougeLsum_fmeasure_stderr": 0.0019717220049021546, "rougeLsum_precision": 0.299053998385982, "rougeLsum_precision_stderr": 0.0019322475227694653, "rougeLsum_recall": 0.4926202062191711, "rougeLsum_recall_stderr": 0.002581909625016066}}, "4": {"coherent_text": {"bleu": 7.241056271417146, "bleu_stderr": 0.06584261856760189, "rouge1_fmeasure": 0.3975971671274264, "rouge1_fmeasure_stderr": 0.001836450632172448, "rouge1_precision": 0.32573683579768875, "rouge1_precision_stderr": 0.001877843583143733, "rouge1_recall": 0.548887629470057, "rouge1_recall_stderr": 0.0026299920384990267, "rouge2_fmeasure": 0.17842808573274627, "rouge2_fmeasure_stderr": 0.0015395035598836696, "rouge2_precision": 0.14525291203480517, "rouge2_precision_stderr": 0.0013572915734951672, "rouge2_recall": 0.2508152581679459, "rouge2_recall_stderr": 0.002279211546178668, "rougeL_fmeasure": 0.2810484863937827, "rougeL_fmeasure_stderr": 0.0014637888600224375, "rougeL_precision": 0.22958828009634885, "rougeL_precision_stderr": 0.0014143451756674962, "rougeL_recall": 0.391229276944502, "rougeL_recall_stderr": 0.002334722588500979, "rougeLsum_fmeasure": 0.3398221063833657, "rougeLsum_fmeasure_stderr": 0.0017989152716216044, "rougeLsum_precision": 0.2784229970473634, "rougeLsum_precision_stderr": 0.0017664712981204373, "rougeLsum_recall": 0.4693077141628463, "rougeLsum_recall_stderr": 0.002584144223809568}, "create_text_for_me": {"bleu": 6.979957732229111, "bleu_stderr": 0.0659505224006796, "rouge1_fmeasure": 0.3884017275288488, "rouge1_fmeasure_stderr": 0.0017060422782565476, "rouge1_precision": 0.30391913123253583, "rouge1_precision_stderr": 0.0016517877163181051, "rouge1_recall": 0.5698197681808488, "rouge1_recall_stderr": 0.0024983427107223084, "rouge2_fmeasure": 0.1732811817482548, "rouge2_fmeasure_stderr": 0.0014176809818272833, "rouge2_precision": 0.1344501382831743, "rouge2_precision_stderr": 0.0011696066678081112, "rouge2_recall": 0.25994005857695457, "rouge2_recall_stderr": 0.002276014994019727, "rougeL_fmeasure": 0.2699817385866637, "rougeL_fmeasure_stderr": 0.0013509422774341565, "rougeL_precision": 0.21017112712931915, "rougeL_precision_stderr": 0.0011880019144919782, "rougeL_recall": 0.4004756665207533, "rougeL_recall_stderr": 0.0023172780379295812, "rougeLsum_fmeasure": 0.3295357685903064, "rougeLsum_fmeasure_stderr": 0.0016750203066771487, "rougeLsum_precision": 0.2577902147906641, "rougeLsum_precision_stderr": 0.0015458181802897012, "rougeLsum_recall": 0.48387236758259095, "rougeLsum_recall_stderr": 0.0025093856044876017}, "generate_gramatically_correct_text": {"bleu": 9.838539717642748, "bleu_stderr": 0.1542199121945329, "rouge1_fmeasure": 0.2846908650582164, "rouge1_fmeasure_stderr": 0.003829987590059196, "rouge1_precision": 0.27122631402577446, "rouge1_precision_stderr": 0.004019032209864867, "rouge1_recall": 0.3543225457053986, "rouge1_recall_stderr": 0.004967042595033049, "rouge2_fmeasure": 0.1257055751954671, "rouge2_fmeasure_stderr": 0.002028879799449096, "rouge2_precision": 0.10895746152226353, "rouge2_precision_stderr": 0.0017929065525348804, "rouge2_recall": 0.16028390936179449, "rouge2_recall_stderr": 0.0027130809496109776, "rougeL_fmeasure": 0.19965383022630442, "rougeL_fmeasure_stderr": 0.0027610441762478733, "rougeL_precision": 0.19623445722856525, "rougeL_precision_stderr": 0.0033607871400964796, "rougeL_recall": 0.2494589917112239, "rougeL_recall_stderr": 0.0036616398264334195, "rougeLsum_fmeasure": 0.23705688169423808, "rougeLsum_fmeasure_stderr": 0.0032905873586870126, "rougeLsum_precision": 0.22931849058794615, "rougeLsum_precision_stderr": 0.003672760310574185, "rougeLsum_recall": 0.2951186979389869, "rougeLsum_recall_stderr": 0.004285864671929924}, "generate_text_restaurant": {"bleu": 7.301331414189049, "bleu_stderr": 0.07282835065460666, "rouge1_fmeasure": 0.3281997540286874, "rouge1_fmeasure_stderr": 0.0016842914784987548, "rouge1_precision": 0.24454894011699824, "rouge1_precision_stderr": 0.0014458985189821688, "rouge1_recall": 0.5282893998840518, "rouge1_recall_stderr": 0.0027239476703194553, "rouge2_fmeasure": 0.15250785714191883, "rouge2_fmeasure_stderr": 0.0013289295083122636, "rouge2_precision": 0.11241096101927037, "rouge2_precision_stderr": 0.0010296837842975505, "rouge2_recall": 0.25292679532334217, "rouge2_recall_stderr": 0.0023135968641909677, "rougeL_fmeasure": 0.27576809325399554, "rougeL_fmeasure_stderr": 0.001332292317195587, "rougeL_precision": 0.20493346554385022, "rougeL_precision_stderr": 0.0011217276272122712, "rougeL_recall": 0.44669665680269593, "rougeL_recall_stderr": 0.0023872195322647088, "rougeLsum_fmeasure": 0.2720277049182356, "rougeLsum_fmeasure_stderr": 0.0016797663432208906, "rougeLsum_precision": 0.20253099841039804, "rougeLsum_precision_stderr": 0.0013836573195200987, "rougeLsum_recall": 0.4389119231936933, "rougeLsum_recall_stderr": 0.0027715255759945875}, "text": {"bleu": 7.000198019935731, "bleu_stderr": 0.06913407290704363, "rouge1_fmeasure": 0.42692314967926737, "rouge1_fmeasure_stderr": 0.001988659000618427, "rouge1_precision": 0.35024781130506905, "rouge1_precision_stderr": 0.0020285351396598593, "rouge1_recall": 0.5776691917985523, "rouge1_recall_stderr": 0.0025294416589362444, "rouge2_fmeasure": 0.19301601907405783, "rouge2_fmeasure_stderr": 0.0015901538810711403, "rouge2_precision": 0.15717110741288884, "rouge2_precision_stderr": 0.0013996429298285594, "rouge2_recall": 0.266320618349341, "rouge2_recall_stderr": 0.002283184726774742, "rougeL_fmeasure": 0.3001609211346324, "rougeL_fmeasure_stderr": 0.0015693837295482464, "rougeL_precision": 0.24489748566928496, "rougeL_precision_stderr": 0.0014705299982066997, "rougeL_recall": 0.41079255911678236, "rougeL_recall_stderr": 0.0023718685348987417, "rougeLsum_fmeasure": 0.3613177972378212, "rougeLsum_fmeasure_stderr": 0.0019560581480454974, "rougeLsum_precision": 0.29636957517330675, "rougeLsum_precision_stderr": 0.0019035502093200786, "rougeLsum_recall": 0.48931428544569894, "rougeLsum_recall_stderr": 0.0025795333772663286}}, "5": {"coherent_text": {"bleu": 7.2465196884878145, "bleu_stderr": 0.07468916464119668, "rouge1_fmeasure": 0.3966890040503588, "rouge1_fmeasure_stderr": 0.001815803597852792, "rouge1_precision": 0.3255815725397693, "rouge1_precision_stderr": 0.0019032883471518422, "rouge1_recall": 0.546540731713107, "rouge1_recall_stderr": 0.0025733832831453347, "rouge2_fmeasure": 0.17801029501851723, "rouge2_fmeasure_stderr": 0.0014940190331615374, "rouge2_precision": 0.14519398169659237, "rouge2_precision_stderr": 0.0013353231399095156, "rouge2_recall": 0.24987277938742591, "rouge2_recall_stderr": 0.0022140552438889268, "rougeL_fmeasure": 0.28146458367847593, "rougeL_fmeasure_stderr": 0.0014472139059049296, "rougeL_precision": 0.2301383140529255, "rougeL_precision_stderr": 0.001418389584367908, "rougeL_recall": 0.39137629197532353, "rougeL_recall_stderr": 0.0023042337566443194, "rougeLsum_fmeasure": 0.34001582427002575, "rougeLsum_fmeasure_stderr": 0.0017807422041094688, "rougeLsum_precision": 0.2791614306561805, "rougeLsum_precision_stderr": 0.0017928244822471586, "rougeLsum_recall": 0.4685579869769028, "rougeLsum_recall_stderr": 0.002529144473225649}, "create_text_for_me": {"bleu": 6.9467832258458815, "bleu_stderr": 0.0713556533267485, "rouge1_fmeasure": 0.38816794389969306, "rouge1_fmeasure_stderr": 0.001684513494480907, "rouge1_precision": 0.3040190125185005, "rouge1_precision_stderr": 0.0016439994643967335, "rouge1_recall": 0.5691357371244624, "rouge1_recall_stderr": 0.002460959843459368, "rouge2_fmeasure": 0.17347865411768018, "rouge2_fmeasure_stderr": 0.0014147739784681232, "rouge2_precision": 0.1346910777243502, "rouge2_precision_stderr": 0.0011732332291231678, "rouge2_recall": 0.2605212984297539, "rouge2_recall_stderr": 0.002269174033571433, "rougeL_fmeasure": 0.26961120657660587, "rougeL_fmeasure_stderr": 0.00134699971864286, "rougeL_precision": 0.21001184801420653, "rougeL_precision_stderr": 0.0011889650722311973, "rougeL_recall": 0.39991518737149667, "rougeL_recall_stderr": 0.002307549465530525, "rougeLsum_fmeasure": 0.33036410242618386, "rougeLsum_fmeasure_stderr": 0.0016552917619089278, "rougeLsum_precision": 0.2587096095734276, "rougeLsum_precision_stderr": 0.0015374119276061612, "rougeLsum_recall": 0.4845073574529188, "rougeLsum_recall_stderr": 0.002456173501378085}, "generate_gramatically_correct_text": {"bleu": 9.887554411273673, "bleu_stderr": 0.07716295936766063, "rouge1_fmeasure": 0.3191894028386597, "rouge1_fmeasure_stderr": 0.003572075569447729, "rouge1_precision": 0.29889933654446116, "rouge1_precision_stderr": 0.0037338421535844956, "rouge1_recall": 0.3963217358852519, "rouge1_recall_stderr": 0.004651910319820262, "rouge2_fmeasure": 0.14176701175164574, "rouge2_fmeasure_stderr": 0.0019854849759513705, "rouge2_precision": 0.12313981096867667, "rouge2_precision_stderr": 0.001764254452955877, "rouge2_recall": 0.1798873724588336, "rouge2_recall_stderr": 0.002654894317462101, "rougeL_fmeasure": 0.2252484886653337, "rougeL_fmeasure_stderr": 0.002627200495231524, "rougeL_precision": 0.21599407087504968, "rougeL_precision_stderr": 0.0031436009861130674, "rougeL_recall": 0.28082966722369046, "rougeL_recall_stderr": 0.00350910000314051, "rougeLsum_fmeasure": 0.26634567471821236, "rougeLsum_fmeasure_stderr": 0.0031018728642803, "rougeLsum_precision": 0.2524224681188342, "rougeLsum_precision_stderr": 0.0034299667817147893, "rougeLsum_recall": 0.33082396406528924, "rougeLsum_recall_stderr": 0.004058856021277282}, "generate_text_restaurant": {"bleu": 7.391640493190723, "bleu_stderr": 0.07696132388022625, "rouge1_fmeasure": 0.33316361794611193, "rouge1_fmeasure_stderr": 0.0016721318654132375, "rouge1_precision": 0.248561550528162, "rouge1_precision_stderr": 0.0014842009296995168, "rouge1_recall": 0.5361907676530253, "rouge1_recall_stderr": 0.0026923692664488426, "rouge2_fmeasure": 0.15567663749325128, "rouge2_fmeasure_stderr": 0.001316099001704392, "rouge2_precision": 0.11498051543992278, "rouge2_precision_stderr": 0.0010486619107346665, "rouge2_recall": 0.2578870983510302, "rouge2_recall_stderr": 0.0022815014259324113, "rougeL_fmeasure": 0.27881193769275936, "rougeL_fmeasure_stderr": 0.0013297414200118412, "rougeL_precision": 0.2074081838133947, "rougeL_precision_stderr": 0.0011575590389617165, "rougeL_recall": 0.45169489347455444, "rougeL_recall_stderr": 0.0023905395426844995, "rougeLsum_fmeasure": 0.27704641367351746, "rougeLsum_fmeasure_stderr": 0.001654188879883823, "rougeLsum_precision": 0.20657408552804282, "rougeLsum_precision_stderr": 0.0014132706167399262, "rougeLsum_recall": 0.4467577031168348, "rougeLsum_recall_stderr": 0.0027053770792765386}, "text": {"bleu": 6.9767120337505135, "bleu_stderr": 0.07800248993311952, "rouge1_fmeasure": 0.4274281275013867, "rouge1_fmeasure_stderr": 0.0019410392555796734, "rouge1_precision": 0.3507462370081385, "rouge1_precision_stderr": 0.002007947032661163, "rouge1_recall": 0.5785590837053719, "rouge1_recall_stderr": 0.0024745176796048766, "rouge2_fmeasure": 0.1925812091012645, "rouge2_fmeasure_stderr": 0.0015855872884769345, "rouge2_precision": 0.15691758500190972, "rouge2_precision_stderr": 0.0014061231147467989, "rouge2_recall": 0.2657298946460687, "rouge2_recall_stderr": 0.002263553464788525, "rougeL_fmeasure": 0.3006040774381334, "rougeL_fmeasure_stderr": 0.001530792855406818, "rougeL_precision": 0.24541799797892744, "rougeL_precision_stderr": 0.0014553330784770901, "rougeL_recall": 0.41122561852623524, "rougeL_recall_stderr": 0.002317891502947824, "rougeLsum_fmeasure": 0.3606676599309041, "rougeLsum_fmeasure_stderr": 0.0019100284921251059, "rougeLsum_precision": 0.2960131787945338, "rougeLsum_precision_stderr": 0.00188423815157528, "rougeLsum_recall": 0.4883301232467587, "rougeLsum_recall_stderr": 0.002513006325165267}}}, "gem_xsum": {"0": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.7256315560298398, "bleu_stderr": 0.054268166719320964, "rouge1_fmeasure": 0.13852188755221997, "rouge1_fmeasure_stderr": 0.0021293763458264075, "rouge1_precision": 0.09769813350875045, "rouge1_precision_stderr": 0.0015529769050519056, "rouge1_recall": 0.24854008316454318, "rouge1_recall_stderr": 0.003822284525903826, "rouge2_fmeasure": 0.021404604329843858, "rouge2_fmeasure_stderr": 0.000978061112594587, "rouge2_precision": 0.01493502693762796, "rouge2_precision_stderr": 0.000687168087320552, "rouge2_recall": 0.03970637215388994, "rouge2_recall_stderr": 0.0018488604775761157, "rougeL_fmeasure": 0.11349412170506364, "rougeL_fmeasure_stderr": 0.0015962350151948002, "rougeL_precision": 0.07997070021861699, "rougeL_precision_stderr": 0.001165505436023522, "rougeL_recall": 0.204221486314787, "rougeL_recall_stderr": 0.0029072542956042127, "rougeLsum_fmeasure": 0.11211976668519616, "rougeLsum_fmeasure_stderr": 0.0017813295955859596, "rougeLsum_precision": 0.07895320097795921, "rougeLsum_precision_stderr": 0.0012925709790883738, "rougeLsum_recall": 0.20223663136840953, "rougeLsum_recall_stderr": 0.0032525187208582097}, "DOC_tldr": {"bleu": 1.5074370696814168, "bleu_stderr": 0.06306583053985478, "rouge1_fmeasure": 0.17862356070756422, "rouge1_fmeasure_stderr": 0.0025190444667790237, "rouge1_precision": 0.13056123327549546, "rouge1_precision_stderr": 0.00211653085075359, "rouge1_recall": 0.30744582623088057, "rouge1_recall_stderr": 0.004345708260459012, "rouge2_fmeasure": 0.04267694660641669, "rouge2_fmeasure_stderr": 0.0014272559106211188, "rouge2_precision": 0.031215859261324586, "rouge2_precision_stderr": 0.0011455957695466357, "rouge2_recall": 0.0753592824533711, "rouge2_recall_stderr": 0.0025928926122515596, "rougeL_fmeasure": 0.14532363973135456, "rougeL_fmeasure_stderr": 0.001945356869811387, "rougeL_precision": 0.10607783393120825, "rougeL_precision_stderr": 0.0016604953804639634, "rougeL_recall": 0.2510352242981599, "rougeL_recall_stderr": 0.0034186066493379894, "rougeLsum_fmeasure": 0.14077407394066704, "rougeLsum_fmeasure_stderr": 0.002139215859454691, "rougeLsum_precision": 0.10275001625696571, "rougeLsum_precision_stderr": 0.0017792008566591565, "rougeLsum_recall": 0.243889829015575, "rougeLsum_recall_stderr": 0.0038153653094762778}, "article_DOC_summary": {"bleu": 1.8973746751821576, "bleu_stderr": 0.07374608873039461, "rouge1_fmeasure": 0.20448305051370513, "rouge1_fmeasure_stderr": 0.0024499517599214865, "rouge1_precision": 0.14714404834539455, "rouge1_precision_stderr": 0.0018923895784899547, "rouge1_recall": 0.35332808198249066, "rouge1_recall_stderr": 0.004277854613117222, "rouge2_fmeasure": 0.04639360161894793, "rouge2_fmeasure_stderr": 0.0015361212037212923, "rouge2_precision": 0.03290270141635618, "rouge2_precision_stderr": 0.0010989269503369368, "rouge2_recall": 0.08287973270989689, "rouge2_recall_stderr": 0.002833032880998302, "rougeL_fmeasure": 0.1522143356903949, "rougeL_fmeasure_stderr": 0.0018364801701113477, "rougeL_precision": 0.10929554255441323, "rougeL_precision_stderr": 0.001387362394565063, "rougeL_recall": 0.26469649902339626, "rougeL_recall_stderr": 0.0033616815736178403, "rougeLsum_fmeasure": 0.1627172387554665, "rougeLsum_fmeasure_stderr": 0.0020740152357133783, "rougeLsum_precision": 0.11672705515751206, "rougeLsum_precision_stderr": 0.001546854617689576, "rougeLsum_recall": 0.28315180080173324, "rougeLsum_recall_stderr": 0.0037966298013545237}, "summarize_DOC": {"bleu": 1.3629401242051673, "bleu_stderr": 0.07163139222356285, "rouge1_fmeasure": 0.20343816500554804, "rouge1_fmeasure_stderr": 0.0022810928631204065, "rouge1_precision": 0.15006207382743458, "rouge1_precision_stderr": 0.001987657336844973, "rouge1_recall": 0.3454637430656072, "rouge1_recall_stderr": 0.004018757653016808, "rouge2_fmeasure": 0.037047223818116884, "rouge2_fmeasure_stderr": 0.0012769054344671355, "rouge2_precision": 0.027682031685392947, "rouge2_precision_stderr": 0.0012572133596850813, "rouge2_recall": 0.0647058637194844, "rouge2_recall_stderr": 0.0023098436259221776, "rougeL_fmeasure": 0.14427043435851508, "rougeL_fmeasure_stderr": 0.001641927464053827, "rougeL_precision": 0.10668249292401177, "rougeL_precision_stderr": 0.0015581101621468096, "rougeL_recall": 0.2459446989428375, "rougeL_recall_stderr": 0.0029903868489755438, "rougeLsum_fmeasure": 0.15967622365586978, "rougeLsum_fmeasure_stderr": 0.0019165619775066089, "rougeLsum_precision": 0.11765973495308749, "rougeLsum_precision_stderr": 0.0017048462642842042, "rougeLsum_recall": 0.27281684754355495, "rougeLsum_recall_stderr": 0.0034923333313323017}, "summarize_this_DOC_summary": {"bleu": 1.9932437983297846, "bleu_stderr": 0.05005508121283258, "rouge1_fmeasure": 0.2056827728723364, "rouge1_fmeasure_stderr": 0.002442623835139831, "rouge1_precision": 0.14761415510887813, "rouge1_precision_stderr": 0.0018460271861890638, "rouge1_recall": 0.35644215533066814, "rouge1_recall_stderr": 0.004313928291972872, "rouge2_fmeasure": 0.048478969239621084, "rouge2_fmeasure_stderr": 0.0016015359697998386, "rouge2_precision": 0.03437672503730759, "rouge2_precision_stderr": 0.0011443426120385147, "rouge2_recall": 0.08689221376038479, "rouge2_recall_stderr": 0.0029525081998479433, "rougeL_fmeasure": 0.1546552506639713, "rougeL_fmeasure_stderr": 0.0018693227706026462, "rougeL_precision": 0.11084459960593879, "rougeL_precision_stderr": 0.0014045626956033276, "rougeL_recall": 0.2695965057726157, "rougeL_recall_stderr": 0.003458083185065001, "rougeLsum_fmeasure": 0.16171184786817824, "rougeLsum_fmeasure_stderr": 0.0020719541841424964, "rougeLsum_precision": 0.11575748816387031, "rougeLsum_precision_stderr": 0.0015363157261191857, "rougeLsum_recall": 0.2824907234086633, "rougeLsum_recall_stderr": 0.003845732411905414}}, "1": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.7100208650541717, "bleu_stderr": 0.06307666643323927, "rouge1_fmeasure": 0.15918378891591406, "rouge1_fmeasure_stderr": 0.0021744946842033627, "rouge1_precision": 0.11318588173143941, "rouge1_precision_stderr": 0.0016141770639632976, "rouge1_recall": 0.279979949784874, "rouge1_recall_stderr": 0.0037134168551931756, "rouge2_fmeasure": 0.019363039161219217, "rouge2_fmeasure_stderr": 0.0009344774229308126, "rouge2_precision": 0.013696212725928195, "rouge2_precision_stderr": 0.0006678764989725612, "rouge2_recall": 0.03434190326540072, "rouge2_recall_stderr": 0.0016508020659010145, "rougeL_fmeasure": 0.11153061433705051, "rougeL_fmeasure_stderr": 0.001401528481365617, "rougeL_precision": 0.07914360184018943, "rougeL_precision_stderr": 0.00103668752345318, "rougeL_recall": 0.19748984092630087, "rougeL_recall_stderr": 0.002485099284856001, "rougeLsum_fmeasure": 0.1276700935379806, "rougeLsum_fmeasure_stderr": 0.001741533787976053, "rougeLsum_precision": 0.09056691007854044, "rougeLsum_precision_stderr": 0.0012802454821530644, "rougeLsum_recall": 0.22609222107906188, "rougeLsum_recall_stderr": 0.0030831336083997113}, "DOC_tldr": {"bleu": 1.9294507192370047, "bleu_stderr": 0.06927174390558496, "rouge1_fmeasure": 0.19355866105950706, "rouge1_fmeasure_stderr": 0.0024904888130049163, "rouge1_precision": 0.137488328967968, "rouge1_precision_stderr": 0.001854761861679906, "rouge1_recall": 0.3409552422180664, "rouge1_recall_stderr": 0.004347354842228533, "rouge2_fmeasure": 0.046583919666909064, "rouge2_fmeasure_stderr": 0.0015343154616235192, "rouge2_precision": 0.032790830434572774, "rouge2_precision_stderr": 0.00109127458636922, "rouge2_recall": 0.0841464589274841, "rouge2_recall_stderr": 0.0028228926118902585, "rougeL_fmeasure": 0.15350551415370856, "rougeL_fmeasure_stderr": 0.0019452968584178893, "rougeL_precision": 0.10887517550153661, "rougeL_precision_stderr": 0.0014322908116765737, "rougeL_recall": 0.27169151726551716, "rougeL_recall_stderr": 0.003535082608448959, "rougeLsum_fmeasure": 0.15376034054280813, "rougeLsum_fmeasure_stderr": 0.0021049152368113177, "rougeLsum_precision": 0.10904170474313231, "rougeLsum_precision_stderr": 0.0015480655523710234, "rougeLsum_recall": 0.27225025414903886, "rougeLsum_recall_stderr": 0.003786898530655058}, "article_DOC_summary": {"bleu": 1.6712614198546765, "bleu_stderr": 0.08029090892350033, "rouge1_fmeasure": 0.18885391905418497, "rouge1_fmeasure_stderr": 0.0025584792794476653, "rouge1_precision": 0.1344416388362992, "rouge1_precision_stderr": 0.0019127327308320389, "rouge1_recall": 0.33053933219336484, "rouge1_recall_stderr": 0.00435936977132835, "rouge2_fmeasure": 0.041352212313184845, "rouge2_fmeasure_stderr": 0.0015256985849036882, "rouge2_precision": 0.02919021495874484, "rouge2_precision_stderr": 0.001083579043412117, "rouge2_recall": 0.07395642387380598, "rouge2_recall_stderr": 0.0027833547930418757, "rougeL_fmeasure": 0.1447818747001178, "rougeL_fmeasure_stderr": 0.001931305342910804, "rougeL_precision": 0.10283551822207533, "rougeL_precision_stderr": 0.0014281321188522025, "rougeL_recall": 0.25511273674645885, "rougeL_recall_stderr": 0.0034307162021354593, "rougeLsum_fmeasure": 0.14973596762001254, "rougeLsum_fmeasure_stderr": 0.002121716066142779, "rougeLsum_precision": 0.10637101855903888, "rougeLsum_precision_stderr": 0.0015675521812090556, "rougeLsum_recall": 0.26365396988763345, "rougeLsum_recall_stderr": 0.003725007857599503}, "summarize_DOC": {"bleu": 1.8090419671658906, "bleu_stderr": 0.06243960286799727, "rouge1_fmeasure": 0.20959848507445628, "rouge1_fmeasure_stderr": 0.002442544487345855, "rouge1_precision": 0.1491508085903834, "rouge1_precision_stderr": 0.0018269519621024542, "rouge1_recall": 0.3664832216419509, "rouge1_recall_stderr": 0.0042114762385522956, "rouge2_fmeasure": 0.04746052242779793, "rouge2_fmeasure_stderr": 0.001497983093227225, "rouge2_precision": 0.03343924101857058, "rouge2_precision_stderr": 0.001064909350142866, "rouge2_recall": 0.08498394458221385, "rouge2_recall_stderr": 0.0027112134192700124, "rougeL_fmeasure": 0.15491949613982683, "rougeL_fmeasure_stderr": 0.0018314818566152655, "rougeL_precision": 0.11001044604391327, "rougeL_precision_stderr": 0.0013469472913403972, "rougeL_recall": 0.27255320287378415, "rougeL_recall_stderr": 0.003314920441365678, "rougeLsum_fmeasure": 0.16551777637641962, "rougeLsum_fmeasure_stderr": 0.002095777574542374, "rougeLsum_precision": 0.11751544431488214, "rougeLsum_precision_stderr": 0.0015368040243137976, "rougeLsum_recall": 0.2910075794259318, "rougeLsum_recall_stderr": 0.0037311617285632286}, "summarize_this_DOC_summary": {"bleu": 1.483994717023367, "bleu_stderr": 0.0583433944934504, "rouge1_fmeasure": 0.18107269784715221, "rouge1_fmeasure_stderr": 0.0025610519024537653, "rouge1_precision": 0.1290438512870586, "rouge1_precision_stderr": 0.001908907445184339, "rouge1_recall": 0.31582870770710825, "rouge1_recall_stderr": 0.00430942059053028, "rouge2_fmeasure": 0.03724797091171915, "rouge2_fmeasure_stderr": 0.0014590126800901643, "rouge2_precision": 0.026404127167916132, "rouge2_precision_stderr": 0.0010432396448414494, "rouge2_recall": 0.06561202026843391, "rouge2_recall_stderr": 0.002586868170319681, "rougeL_fmeasure": 0.13967411856886675, "rougeL_fmeasure_stderr": 0.0019003725867380186, "rougeL_precision": 0.09933917048841584, "rougeL_precision_stderr": 0.001409747027873095, "rougeL_recall": 0.2451300217584245, "rougeL_recall_stderr": 0.0032869960015931565, "rougeLsum_fmeasure": 0.1437650081283148, "rougeLsum_fmeasure_stderr": 0.002115707145967448, "rougeLsum_precision": 0.10225509404962359, "rougeLsum_precision_stderr": 0.001564036497709352, "rougeLsum_recall": 0.25220739881743365, "rougeLsum_recall_stderr": 0.003637572454499514}}, "2": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.9602615670933534, "bleu_stderr": 0.06975113829339996, "rouge1_fmeasure": 0.1629955337480239, "rouge1_fmeasure_stderr": 0.0023615726845261148, "rouge1_precision": 0.1160701827729658, "rouge1_precision_stderr": 0.0017439664665625302, "rouge1_recall": 0.2860944708906913, "rouge1_recall_stderr": 0.004085639279103821, "rouge2_fmeasure": 0.024834458548132245, "rouge2_fmeasure_stderr": 0.0011433201739787905, "rouge2_precision": 0.017447949224103376, "rouge2_precision_stderr": 0.0008032811020562298, "rouge2_recall": 0.04485569053060954, "rouge2_recall_stderr": 0.002114380505243146, "rougeL_fmeasure": 0.11904854566150291, "rougeL_fmeasure_stderr": 0.0016169606046750717, "rougeL_precision": 0.08456270084600391, "rougeL_precision_stderr": 0.0011815898217163434, "rougeL_recall": 0.21064825146724378, "rougeL_recall_stderr": 0.002957758437750362, "rougeLsum_fmeasure": 0.12880725110187866, "rougeLsum_fmeasure_stderr": 0.0018796612939592603, "rougeLsum_precision": 0.09152186527385546, "rougeLsum_precision_stderr": 0.0013729069191967156, "rougeLsum_recall": 0.22753083497124593, "rougeLsum_recall_stderr": 0.0033757963655885394}, "DOC_tldr": {"bleu": 2.0334951949441202, "bleu_stderr": 0.0861569471135935, "rouge1_fmeasure": 0.20561129767344294, "rouge1_fmeasure_stderr": 0.00245643585677957, "rouge1_precision": 0.14622869832638033, "rouge1_precision_stderr": 0.001840271628617099, "rouge1_recall": 0.36110939972212, "rouge1_recall_stderr": 0.004317171977659118, "rouge2_fmeasure": 0.05089418045053158, "rouge2_fmeasure_stderr": 0.0015644827567191525, "rouge2_precision": 0.03579409350172391, "rouge2_precision_stderr": 0.0011122812276210662, "rouge2_recall": 0.09230985292859638, "rouge2_recall_stderr": 0.002925197294466604, "rougeL_fmeasure": 0.1614717754951783, "rougeL_fmeasure_stderr": 0.0018726384968348116, "rougeL_precision": 0.11460227671669204, "rougeL_precision_stderr": 0.0013793463537122247, "rougeL_recall": 0.28536047123610525, "rougeL_recall_stderr": 0.0035023837864113175, "rougeLsum_fmeasure": 0.163228640222356, "rougeLsum_fmeasure_stderr": 0.002103524354064436, "rougeLsum_precision": 0.11587559985347855, "rougeLsum_precision_stderr": 0.0015497939952997533, "rougeLsum_recall": 0.28811845556237264, "rougeLsum_recall_stderr": 0.003818249039685}, "article_DOC_summary": {"bleu": 1.7895958187086474, "bleu_stderr": 0.04787428909067152, "rouge1_fmeasure": 0.19729124660372196, "rouge1_fmeasure_stderr": 0.0024865996635105967, "rouge1_precision": 0.14045160525988198, "rouge1_precision_stderr": 0.0018675726895835078, "rouge1_recall": 0.3448073202698317, "rouge1_recall_stderr": 0.004206053313996407, "rouge2_fmeasure": 0.04614704751240165, "rouge2_fmeasure_stderr": 0.001558612721206191, "rouge2_precision": 0.03250319865239163, "rouge2_precision_stderr": 0.001104582027493848, "rouge2_recall": 0.08285013633430294, "rouge2_recall_stderr": 0.0028679116547561338, "rougeL_fmeasure": 0.15159285154298055, "rougeL_fmeasure_stderr": 0.0018751369925771642, "rougeL_precision": 0.1076421486644076, "rougeL_precision_stderr": 0.00138815532226034, "rougeL_recall": 0.2669756747819945, "rougeL_recall_stderr": 0.0033509483772327935, "rougeLsum_fmeasure": 0.1571758204423751, "rougeLsum_fmeasure_stderr": 0.002082270276853359, "rougeLsum_precision": 0.11165266260752228, "rougeLsum_precision_stderr": 0.0015420215686911798, "rougeLsum_recall": 0.2762348157117063, "rougeLsum_recall_stderr": 0.0036302813408504828}, "summarize_DOC": {"bleu": 1.9591273158820803, "bleu_stderr": 0.044543479812057196, "rouge1_fmeasure": 0.21227676754700187, "rouge1_fmeasure_stderr": 0.0024708186366684537, "rouge1_precision": 0.15135142269847585, "rouge1_precision_stderr": 0.0018503020885729993, "rouge1_recall": 0.36992385810715606, "rouge1_recall_stderr": 0.00428697893431127, "rouge2_fmeasure": 0.05137980859666271, "rouge2_fmeasure_stderr": 0.0015384305388393403, "rouge2_precision": 0.03621955126854001, "rouge2_precision_stderr": 0.0010923924587675196, "rouge2_recall": 0.09215185931049906, "rouge2_recall_stderr": 0.0028353381841421765, "rougeL_fmeasure": 0.16024351750916332, "rougeL_fmeasure_stderr": 0.001828916419211741, "rougeL_precision": 0.1140477375940746, "rougeL_precision_stderr": 0.001355907302607709, "rougeL_recall": 0.2809365376305075, "rougeL_recall_stderr": 0.0033434191813504033, "rougeLsum_fmeasure": 0.1682190350645718, "rougeLsum_fmeasure_stderr": 0.0020795057799479827, "rougeLsum_precision": 0.11966566066238304, "rougeLsum_precision_stderr": 0.0015275000983393555, "rougeLsum_recall": 0.2949558808752265, "rougeLsum_recall_stderr": 0.0037725507061813924}, "summarize_this_DOC_summary": {"bleu": 1.6369889601746683, "bleu_stderr": 0.08510169629278132, "rouge1_fmeasure": 0.18909993339896647, "rouge1_fmeasure_stderr": 0.0025443533012134306, "rouge1_precision": 0.13513852151062536, "rouge1_precision_stderr": 0.0019145000289811317, "rouge1_recall": 0.3276171253130558, "rouge1_recall_stderr": 0.004253252892642702, "rouge2_fmeasure": 0.04154990869291492, "rouge2_fmeasure_stderr": 0.001535468901684112, "rouge2_precision": 0.029483119025250706, "rouge2_precision_stderr": 0.0010991930679911828, "rouge2_recall": 0.07320552894804701, "rouge2_recall_stderr": 0.0027333394649254345, "rougeL_fmeasure": 0.14575285141182545, "rougeL_fmeasure_stderr": 0.0019148076940627124, "rougeL_precision": 0.10394340309946604, "rougeL_precision_stderr": 0.0014280355461601787, "rougeL_recall": 0.25410078113161033, "rougeL_recall_stderr": 0.0033207296201257777, "rougeLsum_fmeasure": 0.1484681379100036, "rougeLsum_fmeasure_stderr": 0.0020754922487701485, "rougeLsum_precision": 0.10579391735030412, "rougeLsum_precision_stderr": 0.0015362690884521812, "rougeLsum_recall": 0.25908485264591374, "rougeLsum_recall_stderr": 0.0036064198067757756}}, "3": {"DOC_boils_down_to_simple_idea_that": {"bleu": 1.213942331424008, "bleu_stderr": 0.06983125587361878, "rouge1_fmeasure": 0.16536255544133652, "rouge1_fmeasure_stderr": 0.002782591039843851, "rouge1_precision": 0.1198315859407343, "rouge1_precision_stderr": 0.0021313635031272324, "rouge1_recall": 0.2845973554839675, "rouge1_recall_stderr": 0.004807019416971591, "rouge2_fmeasure": 0.0308541164205269, "rouge2_fmeasure_stderr": 0.0013433559082160224, "rouge2_precision": 0.021995282393962437, "rouge2_precision_stderr": 0.0009659870502554929, "rouge2_recall": 0.05477147588190337, "rouge2_recall_stderr": 0.0024254506123566537, "rougeL_fmeasure": 0.12329381135423298, "rougeL_fmeasure_stderr": 0.002007770555031566, "rougeL_precision": 0.0891431800173679, "rougeL_precision_stderr": 0.001518317618261569, "rougeL_recall": 0.2133363419019677, "rougeL_recall_stderr": 0.0035492022408266455, "rougeLsum_fmeasure": 0.13115287292645436, "rougeLsum_fmeasure_stderr": 0.0022631518633725732, "rougeLsum_precision": 0.09477716470429574, "rougeLsum_precision_stderr": 0.0017017716066415637, "rougeLsum_recall": 0.22706163311880304, "rougeLsum_recall_stderr": 0.003999321665699719}, "DOC_tldr": {"bleu": 1.9326408991617487, "bleu_stderr": 0.05911662841116225, "rouge1_fmeasure": 0.20045136953668377, "rouge1_fmeasure_stderr": 0.002718747872622448, "rouge1_precision": 0.14592947816165613, "rouge1_precision_stderr": 0.0021611659038272, "rouge1_recall": 0.3443812486249235, "rouge1_recall_stderr": 0.004671315297787991, "rouge2_fmeasure": 0.05035265573315919, "rouge2_fmeasure_stderr": 0.0015735834442296916, "rouge2_precision": 0.03602991518922341, "rouge2_precision_stderr": 0.0011524705369005208, "rouge2_recall": 0.08958291369771845, "rouge2_recall_stderr": 0.0028952310760534993, "rougeL_fmeasure": 0.1571872470032892, "rougeL_fmeasure_stderr": 0.0020963391106268445, "rougeL_precision": 0.1140823083670884, "rougeL_precision_stderr": 0.0016349867638249449, "rougeL_recall": 0.2720512676890386, "rougeL_recall_stderr": 0.0037892985318394535, "rougeLsum_fmeasure": 0.15729367100488104, "rougeLsum_fmeasure_stderr": 0.002267612514707712, "rougeLsum_precision": 0.11426494976351477, "rougeLsum_precision_stderr": 0.0017631615041811524, "rougeLsum_recall": 0.2716320363223784, "rougeLsum_recall_stderr": 0.004008514578347172}, "article_DOC_summary": {"bleu": 1.799886255448494, "bleu_stderr": 0.06241605343512123, "rouge1_fmeasure": 0.19247676178691825, "rouge1_fmeasure_stderr": 0.0027783344646224338, "rouge1_precision": 0.1397482550494329, "rouge1_precision_stderr": 0.0021614980521941947, "rouge1_recall": 0.3314206756572503, "rouge1_recall_stderr": 0.0048258842534713575, "rouge2_fmeasure": 0.04492583126972709, "rouge2_fmeasure_stderr": 0.0015630404482977962, "rouge2_precision": 0.032034815897723355, "rouge2_precision_stderr": 0.0011234176352706835, "rouge2_recall": 0.08021224571691461, "rouge2_recall_stderr": 0.00290937059770342, "rougeL_fmeasure": 0.14710040508558128, "rougeL_fmeasure_stderr": 0.0020844906061187624, "rougeL_precision": 0.10664386827077653, "rougeL_precision_stderr": 0.0016219261205998713, "rougeL_recall": 0.25469242016766824, "rougeL_recall_stderr": 0.003747468811937111, "rougeLsum_fmeasure": 0.1523907734490472, "rougeLsum_fmeasure_stderr": 0.0023066662456799336, "rougeLsum_precision": 0.11038994072863482, "rougeLsum_precision_stderr": 0.001775109847716677, "rougeLsum_recall": 0.2637144833686383, "rougeLsum_recall_stderr": 0.004087246305150146}, "summarize_DOC": {"bleu": 2.0596089820557593, "bleu_stderr": 0.08930377223048487, "rouge1_fmeasure": 0.20846450018241194, "rouge1_fmeasure_stderr": 0.0027571228316079694, "rouge1_precision": 0.1520556889414388, "rouge1_precision_stderr": 0.002189742662234435, "rouge1_recall": 0.3562962414780344, "rouge1_recall_stderr": 0.004749166123906837, "rouge2_fmeasure": 0.05134777699396957, "rouge2_fmeasure_stderr": 0.001609365371866251, "rouge2_precision": 0.03687572904819467, "rouge2_precision_stderr": 0.0011771782163178155, "rouge2_recall": 0.0901761315164544, "rouge2_recall_stderr": 0.0028796580054493573, "rougeL_fmeasure": 0.15665407959636418, "rougeL_fmeasure_stderr": 0.002098634389081189, "rougeL_precision": 0.11417177031962449, "rougeL_precision_stderr": 0.0016577260631727522, "rougeL_recall": 0.26888326992906886, "rougeL_recall_stderr": 0.0037215896036639656, "rougeLsum_fmeasure": 0.16513176347393482, "rougeLsum_fmeasure_stderr": 0.0023093147777262708, "rougeLsum_precision": 0.12014388751084557, "rougeLsum_precision_stderr": 0.0017911422977108812, "rougeLsum_recall": 0.28401937552529777, "rougeLsum_recall_stderr": 0.004092634223313844}, "summarize_this_DOC_summary": {"bleu": 1.6288699845696568, "bleu_stderr": 0.07351917145628349, "rouge1_fmeasure": 0.17983877354806166, "rouge1_fmeasure_stderr": 0.0028344653807296284, "rouge1_precision": 0.13077969013065976, "rouge1_precision_stderr": 0.002174976181761563, "rouge1_recall": 0.3051790928330733, "rouge1_recall_stderr": 0.004806410522034264, "rouge2_fmeasure": 0.040167379424322004, "rouge2_fmeasure_stderr": 0.0014918819374576107, "rouge2_precision": 0.028769652888890242, "rouge2_precision_stderr": 0.001066351955086345, "rouge2_recall": 0.07052778069631396, "rouge2_recall_stderr": 0.0027420302852526044, "rougeL_fmeasure": 0.1399605239628858, "rougeL_fmeasure_stderr": 0.002175089499821927, "rougeL_precision": 0.1015239172914668, "rougeL_precision_stderr": 0.001644583430262152, "rougeL_recall": 0.23886024900651934, "rougeL_recall_stderr": 0.0037908298592866764, "rougeLsum_fmeasure": 0.14082010524518954, "rougeLsum_fmeasure_stderr": 0.0022782275382841146, "rougeLsum_precision": 0.10201260281036947, "rougeLsum_precision_stderr": 0.0017070040757652396, "rougeLsum_recall": 0.24078075446433067, "rougeLsum_recall_stderr": 0.003985094366002531}}, "4": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.6027229140523703, "bleu_stderr": 0.11533577622824943, "rouge1_fmeasure": 0.044055315863565106, "rouge1_fmeasure_stderr": 0.0026368956007959852, "rouge1_precision": 0.03617972980376136, "rouge1_precision_stderr": 0.0022835108300555376, "rouge1_recall": 0.06939836439110722, "rouge1_recall_stderr": 0.004267803204475849, "rouge2_fmeasure": 0.008953373288301222, "rouge2_fmeasure_stderr": 0.0008618162172086633, "rouge2_precision": 0.006768492713667434, "rouge2_precision_stderr": 0.0006816043308723117, "rouge2_recall": 0.015027042518545813, "rouge2_recall_stderr": 0.0014854463791600376, "rougeL_fmeasure": 0.033494270940581335, "rougeL_fmeasure_stderr": 0.0019921856030734055, "rougeL_precision": 0.027808549592201456, "rougeL_precision_stderr": 0.0017616362314983645, "rougeL_recall": 0.05275279200412055, "rougeL_recall_stderr": 0.0032412133339056147, "rougeLsum_fmeasure": 0.035448110651920155, "rougeLsum_fmeasure_stderr": 0.0021385233474543013, "rougeLsum_precision": 0.029343763312784644, "rougeLsum_precision_stderr": 0.0018717600410582465, "rougeLsum_recall": 0.05584466177175486, "rougeLsum_recall_stderr": 0.0034859449912314064}, "DOC_tldr": {"bleu": 0.9888155443961298, "bleu_stderr": 0.16386874187258033, "rouge1_fmeasure": 0.0553051713195741, "rouge1_fmeasure_stderr": 0.0030382631257604253, "rouge1_precision": 0.04775403576214989, "rouge1_precision_stderr": 0.0029942578637535498, "rouge1_recall": 0.08769716230460457, "rouge1_recall_stderr": 0.004942918692076916, "rouge2_fmeasure": 0.013792609834707413, "rouge2_fmeasure_stderr": 0.0011026959034726492, "rouge2_precision": 0.010407613906275788, "rouge2_precision_stderr": 0.0008600375403691117, "rouge2_recall": 0.023456621445988514, "rouge2_recall_stderr": 0.001909353475291992, "rougeL_fmeasure": 0.042863987037903804, "rougeL_fmeasure_stderr": 0.0023336709067174025, "rougeL_precision": 0.03800524902986652, "rougeL_precision_stderr": 0.0025635931438879765, "rougeL_recall": 0.06826516249123325, "rougeL_recall_stderr": 0.003864253776835998, "rougeLsum_fmeasure": 0.043714690958694166, "rougeLsum_fmeasure_stderr": 0.002416760693299736, "rougeLsum_precision": 0.03874591139206328, "rougeLsum_precision_stderr": 0.002614949735399864, "rougeLsum_recall": 0.06947543218015861, "rougeLsum_recall_stderr": 0.003996515839046432}, "article_DOC_summary": {"bleu": 0.912241430159217, "bleu_stderr": 0.1508779538575514, "rouge1_fmeasure": 0.052109895215743926, "rouge1_fmeasure_stderr": 0.0029325961974541494, "rouge1_precision": 0.04293551268263482, "rouge1_precision_stderr": 0.0025300915700631186, "rouge1_recall": 0.08251909123638941, "rouge1_recall_stderr": 0.0047552295627311435, "rouge2_fmeasure": 0.01203821663060757, "rouge2_fmeasure_stderr": 0.001058756941479336, "rouge2_precision": 0.008902116582691397, "rouge2_precision_stderr": 0.0007882814434287245, "rouge2_recall": 0.020355209333257882, "rouge2_recall_stderr": 0.0018399757489791825, "rougeL_fmeasure": 0.039428572953528446, "rougeL_fmeasure_stderr": 0.002210624192813922, "rougeL_precision": 0.03291018535700006, "rougeL_precision_stderr": 0.001988611171780747, "rougeL_recall": 0.06245660895956642, "rougeL_recall_stderr": 0.0036021201516783473, "rougeLsum_fmeasure": 0.0423468971865807, "rougeLsum_fmeasure_stderr": 0.002398760878941016, "rougeLsum_precision": 0.03518063964656488, "rougeLsum_precision_stderr": 0.0021155739741956746, "rougeLsum_recall": 0.0671088243853192, "rougeLsum_recall_stderr": 0.003920638757312349}, "summarize_DOC": {"bleu": 0.9295235714400739, "bleu_stderr": 0.17952600712489128, "rouge1_fmeasure": 0.057092747254164154, "rouge1_fmeasure_stderr": 0.003080646206696148, "rouge1_precision": 0.047023957094332526, "rouge1_precision_stderr": 0.0027799550540700086, "rouge1_recall": 0.09027695540385026, "rouge1_recall_stderr": 0.0049774928615611445, "rouge2_fmeasure": 0.012186294567403028, "rouge2_fmeasure_stderr": 0.000974923421275362, "rouge2_precision": 0.00908866042073288, "rouge2_precision_stderr": 0.0007468809595545078, "rouge2_recall": 0.020385013066777098, "rouge2_recall_stderr": 0.0016479365749736059, "rougeL_fmeasure": 0.041702960752974014, "rougeL_fmeasure_stderr": 0.0022604615815070856, "rougeL_precision": 0.03498091370776939, "rougeL_precision_stderr": 0.0022036086527599703, "rougeL_recall": 0.06597700098824039, "rougeL_recall_stderr": 0.003697132762313373, "rougeLsum_fmeasure": 0.0453763496307077, "rougeLsum_fmeasure_stderr": 0.0024814034302527927, "rougeLsum_precision": 0.037710892572660165, "rougeLsum_precision_stderr": 0.0023299806643745616, "rougeLsum_recall": 0.071991225804228, "rougeLsum_recall_stderr": 0.0040607831831936175}, "summarize_this_DOC_summary": {"bleu": 0.7016713676020814, "bleu_stderr": 0.12854976408337526, "rouge1_fmeasure": 0.04710206848660278, "rouge1_fmeasure_stderr": 0.002827851220284823, "rouge1_precision": 0.03885731681664587, "rouge1_precision_stderr": 0.0024465188352010866, "rouge1_recall": 0.072218223256745, "rouge1_recall_stderr": 0.004400630629119156, "rouge2_fmeasure": 0.010750703825901103, "rouge2_fmeasure_stderr": 0.0009638910126157537, "rouge2_precision": 0.008293369893348976, "rouge2_precision_stderr": 0.0007674824532485735, "rouge2_recall": 0.017409607367553894, "rouge2_recall_stderr": 0.0016283584197287304, "rougeL_fmeasure": 0.03580387652237057, "rougeL_fmeasure_stderr": 0.002145060472538106, "rougeL_precision": 0.029640111134550683, "rougeL_precision_stderr": 0.0018709888056480106, "rougeL_recall": 0.05506280693983692, "rougeL_recall_stderr": 0.003373140482326596, "rougeLsum_fmeasure": 0.037509929346318714, "rougeLsum_fmeasure_stderr": 0.0022590153287918896, "rougeLsum_precision": 0.031047920466470008, "rougeLsum_precision_stderr": 0.0019660483285081367, "rougeLsum_recall": 0.05761387256991218, "rougeLsum_recall_stderr": 0.003550284822808244}}, "5": {"DOC_boils_down_to_simple_idea_that": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.00033237715251661107, "rouge1_fmeasure_stderr": 0.00015078706434605712, "rouge1_precision": 0.004288164665523156, "rouge1_precision_stderr": 0.0019144304746682786, "rouge1_recall": 0.0001731252011799053, "rouge1_recall_stderr": 7.864942490494626e-05, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.00033237715251661107, "rougeL_fmeasure_stderr": 0.00015078706434605712, "rougeL_precision": 0.004288164665523156, "rougeL_precision_stderr": 0.0019144304746682786, "rougeL_recall": 0.0001731252011799053, "rougeL_recall_stderr": 7.864942490494626e-05, "rougeLsum_fmeasure": 0.00033237715251661107, "rougeLsum_fmeasure_stderr": 0.00015078706434605712, "rougeLsum_precision": 0.004288164665523156, "rougeLsum_precision_stderr": 0.0019144304746682786, "rougeLsum_recall": 0.0001731252011799053, "rougeLsum_recall_stderr": 7.864942490494626e-05}, "DOC_tldr": {"bleu": 3.9891071758683426e-44, "bleu_stderr": 9.981986929231053e-37, "rouge1_fmeasure": 0.002720852866414494, "rouge1_fmeasure_stderr": 0.0007709104718885456, "rouge1_precision": 0.003193068638626207, "rouge1_precision_stderr": 0.0009043418737167503, "rouge1_recall": 0.0025114151662593677, "rouge1_recall_stderr": 0.0007268593279769246, "rouge2_fmeasure": 0.0004211068403029026, "rouge2_fmeasure_stderr": 0.00018200691938578806, "rouge2_precision": 0.00047578207955566436, "rouge2_precision_stderr": 0.00021103651915959466, "rouge2_recall": 0.00041802798878270574, "rouge2_recall_stderr": 0.00018654044401303826, "rougeL_fmeasure": 0.0018716501706524194, "rougeL_fmeasure_stderr": 0.0005361379726101486, "rougeL_precision": 0.0021986780281657314, "rougeL_precision_stderr": 0.0006286463155410547, "rougeL_recall": 0.001725510678386032, "rougeL_recall_stderr": 0.0005031104040557216, "rougeLsum_fmeasure": 0.0020513446899695803, "rougeLsum_fmeasure_stderr": 0.0005879898596125877, "rougeLsum_precision": 0.002403627074679706, "rougeLsum_precision_stderr": 0.0006873477461862152, "rougeLsum_recall": 0.0018880944258754403, "rougeLsum_recall_stderr": 0.0005483050471105171}, "article_DOC_summary": {"bleu": 3.097815638153428e-39, "bleu_stderr": 5.018895149426352e-34, "rouge1_fmeasure": 0.0023684978392920267, "rouge1_fmeasure_stderr": 0.000684399718764436, "rouge1_precision": 0.002739968547331711, "rouge1_precision_stderr": 0.0008278012212952025, "rouge1_recall": 0.0022134599272398445, "rouge1_recall_stderr": 0.0006273422111807437, "rouge2_fmeasure": 0.0003256051958251534, "rouge2_fmeasure_stderr": 0.00013752393773970521, "rouge2_precision": 0.000376696230153845, "rouge2_precision_stderr": 0.00016337393027254123, "rouge2_recall": 0.00030314025597044463, "rouge2_recall_stderr": 0.00012776302473939982, "rougeL_fmeasure": 0.0019272015023736237, "rougeL_fmeasure_stderr": 0.0005653230599258551, "rougeL_precision": 0.0021741866307845633, "rougeL_precision_stderr": 0.0006503820290684123, "rougeL_recall": 0.0018275205470825698, "rougeL_recall_stderr": 0.0005304151694392347, "rougeLsum_fmeasure": 0.0020216316744952494, "rougeLsum_fmeasure_stderr": 0.0005921597049542807, "rougeLsum_precision": 0.002299258100195655, "rougeLsum_precision_stderr": 0.0006916877058747434, "rougeLsum_recall": 0.0019069310038515171, "rougeLsum_recall_stderr": 0.0005514157386454834}, "summarize_DOC": {"bleu": 1.3118412132647612e-38, "bleu_stderr": 5.142891951955452e-33, "rouge1_fmeasure": 0.0026690932382273276, "rouge1_fmeasure_stderr": 0.0007150299022018968, "rouge1_precision": 0.0029488555074161714, "rouge1_precision_stderr": 0.0007930006513468652, "rouge1_recall": 0.0025816173879792048, "rouge1_recall_stderr": 0.0007065729972344169, "rouge2_fmeasure": 0.00026998206537521717, "rouge2_fmeasure_stderr": 0.00011079170761441466, "rouge2_precision": 0.00030619737889438225, "rouge2_precision_stderr": 0.00012483905412694334, "rouge2_recall": 0.00024479974951673066, "rouge2_recall_stderr": 0.00010164474700168012, "rougeL_fmeasure": 0.0020707423522561583, "rougeL_fmeasure_stderr": 0.0005514160671606608, "rougeL_precision": 0.0022740359120988827, "rougeL_precision_stderr": 0.0006037958962838299, "rougeL_recall": 0.0020309527255223335, "rougeL_recall_stderr": 0.0005682477636303094, "rougeLsum_fmeasure": 0.0019001191955137153, "rougeLsum_fmeasure_stderr": 0.0005013736076589153, "rougeLsum_precision": 0.0020831557924846085, "rougeLsum_precision_stderr": 0.0005457471442594348, "rougeLsum_recall": 0.0018757125016714748, "rougeLsum_recall_stderr": 0.0005287065797899205}, "summarize_this_DOC_summary": {"bleu": 0.0, "bleu_stderr": 0.0, "rouge1_fmeasure": 0.0006709563634213615, "rouge1_fmeasure_stderr": 0.00024206325506310295, "rouge1_precision": 0.0032875929102344197, "rouge1_precision_stderr": 0.0012833920426189144, "rouge1_recall": 0.0003777789691102385, "rouge1_recall_stderr": 0.0001360677657980004, "rouge2_fmeasure": 0.0, "rouge2_fmeasure_stderr": 0.0, "rouge2_precision": 0.0, "rouge2_precision_stderr": 0.0, "rouge2_recall": 0.0, "rouge2_recall_stderr": 0.0, "rougeL_fmeasure": 0.000609696868199602, "rougeL_fmeasure_stderr": 0.0002177019049863885, "rougeL_precision": 0.0030017152658662095, "rougeL_precision_stderr": 0.0011845885037617687, "rougeL_recall": 0.0003434736517860532, "rougeL_recall_stderr": 0.00012248298015115857, "rougeLsum_fmeasure": 0.000609696868199602, "rougeLsum_fmeasure_stderr": 0.0002177019049863885, "rougeLsum_precision": 0.0030017152658662095, "rougeLsum_precision_stderr": 0.0011845885037617687, "rougeLsum_recall": 0.0003434736517860532, "rougeLsum_recall_stderr": 0.00012248298015115857}}}, "piqa": {"0": {"Correct the solution": {"bleu": 8.293650196551072, "bleu_stderr": 0.3291933211979279, "rouge1_fmeasure": 0.26518149573262756, "rouge1_fmeasure_stderr": 0.00598353213568122, "rouge1_precision": 0.20985881282206706, "rouge1_precision_stderr": 0.005678392457961964, "rouge1_recall": 0.6760010324649197, "rouge1_recall_stderr": 0.007109960469106136, "rouge2_fmeasure": 0.20168519353681674, "rouge2_fmeasure_stderr": 0.005583925104466641, "rouge2_precision": 0.1581319955963504, "rouge2_precision_stderr": 0.005136391219179097, "rouge2_recall": 0.5233448090942575, "rouge2_recall_stderr": 0.008115996413199544, "rougeL_fmeasure": 0.2574325851884139, "rougeL_fmeasure_stderr": 0.005932612011453031, "rougeL_precision": 0.2034802697348527, "rougeL_precision_stderr": 0.005608894909532193, "rougeL_recall": 0.6584424374867024, "rougeL_recall_stderr": 0.007242236248876609, "rougeLsum_fmeasure": 0.2586740738981541, "rougeLsum_fmeasure_stderr": 0.005969487363499531, "rougeLsum_precision": 0.20469461558102828, "rougeLsum_precision_stderr": 0.005642747231844393, "rougeLsum_recall": 0.6591156896691646, "rougeLsum_recall_stderr": 0.0072783852307659385}, "choose the most appropriate solution": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "no prompt needed": {"bleu": 0.1743540081034935, "bleu_stderr": 0.010323609186140817, "rouge1_fmeasure": 0.035178147983051675, "rouge1_fmeasure_stderr": 0.0008727304917432003, "rouge1_precision": 0.020414014144919227, "rouge1_precision_stderr": 0.0005848929509747042, "rouge1_recall": 0.22265270386663025, "rouge1_recall_stderr": 0.004060591384362697, "rouge2_fmeasure": 0.005612668710216912, "rouge2_fmeasure_stderr": 0.0002723517397472548, "rouge2_precision": 0.003228525003290761, "rouge2_precision_stderr": 0.00016737474198048977, "rouge2_recall": 0.0383496959158395, "rouge2_recall_stderr": 0.0019867052472860103, "rougeL_fmeasure": 0.03213626849465759, "rougeL_fmeasure_stderr": 0.0007335055493238853, "rougeL_precision": 0.018559243344714177, "rougeL_precision_stderr": 0.00048315200547689713, "rougeL_recall": 0.20821746365510307, "rougeL_recall_stderr": 0.003757747342281253, "rougeLsum_fmeasure": 0.029311011598519118, "rougeLsum_fmeasure_stderr": 0.0007339751223464163, "rougeLsum_precision": 0.016991539384663735, "rougeLsum_precision_stderr": 0.0004971662800825557, "rougeLsum_recall": 0.19247194188686473, "rougeLsum_recall_stderr": 0.0037031262038145878}, "pick_correct_choice_index": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "what_is_the_correct_ending": {"acc": 0.5609357997823722, "acc_norm": 0.5625680087051143, "acc_norm_stderr": 0.011574126069682387, "acc_stderr": 0.011578865649321299}}, "1": {"Correct the solution": {"bleu": 6.9708485559775974, "bleu_stderr": 0.34463867046580154, "rouge1_fmeasure": 0.22252428799090185, "rouge1_fmeasure_stderr": 0.005166919209222038, "rouge1_precision": 0.16692070561892322, "rouge1_precision_stderr": 0.004836046744236672, "rouge1_recall": 0.6789384494865478, "rouge1_recall_stderr": 0.006554803195463939, "rouge2_fmeasure": 0.16670313208229318, "rouge2_fmeasure_stderr": 0.004842857551789808, "rouge2_precision": 0.12325642385963735, "rouge2_precision_stderr": 0.004315397965867194, "rouge2_recall": 0.5132917664492825, "rouge2_recall_stderr": 0.008124723992810777, "rougeL_fmeasure": 0.21392670473431238, "rougeL_fmeasure_stderr": 0.005120862001741173, "rougeL_precision": 0.1603231731890274, "rougeL_precision_stderr": 0.004760330118066573, "rougeL_recall": 0.6553730557520384, "rougeL_recall_stderr": 0.0068118148422351065, "rougeLsum_fmeasure": 0.2165274990499404, "rougeLsum_fmeasure_stderr": 0.005151253631478041, "rougeLsum_precision": 0.16247005134061046, "rougeLsum_precision_stderr": 0.004797762841990119, "rougeLsum_recall": 0.6612423664976972, "rougeLsum_recall_stderr": 0.0068194387242296725}, "choose the most appropriate solution": {"acc": 0.49510337323177367, "acc_norm": 0.49510337323177367, "acc_norm_stderr": 0.01166526473007815, "acc_stderr": 0.01166526473007815}, "no prompt needed": {"bleu": 0.174213094733269, "bleu_stderr": 0.02197451232409411, "rouge1_fmeasure": 0.03403289853539769, "rouge1_fmeasure_stderr": 0.0009305982872464937, "rouge1_precision": 0.020403702655232312, "rouge1_precision_stderr": 0.0007707685830758424, "rouge1_recall": 0.214395939532725, "rouge1_recall_stderr": 0.004149318989451621, "rouge2_fmeasure": 0.005467793325653137, "rouge2_fmeasure_stderr": 0.00031728961152867736, "rouge2_precision": 0.003189729472705606, "rouge2_precision_stderr": 0.00021737905154245153, "rouge2_recall": 0.037318744091622835, "rouge2_recall_stderr": 0.00206867790744111, "rougeL_fmeasure": 0.03171656567240596, "rougeL_fmeasure_stderr": 0.0008276871670439616, "rougeL_precision": 0.018933871870268792, "rougeL_precision_stderr": 0.0006694389007203683, "rougeL_recall": 0.202411719381226, "rougeL_recall_stderr": 0.0039026013675624565, "rougeLsum_fmeasure": 0.02830120340253178, "rougeLsum_fmeasure_stderr": 0.0007928221584109298, "rougeLsum_precision": 0.01699166069756899, "rougeLsum_precision_stderr": 0.000659008917435192, "rougeLsum_recall": 0.18391954436557392, "rougeLsum_recall_stderr": 0.0037126321626407112}, "pick_correct_choice_index": {"acc": 0.4967355821545158, "acc_norm": 0.4967355821545158, "acc_norm_stderr": 0.01166557553076037, "acc_stderr": 0.01166557553076037}, "what_is_the_correct_ending": {"acc": 0.5680087051142546, "acc_norm": 0.5674646354733406, "acc_norm_stderr": 0.011559142916063143, "acc_stderr": 0.011557407210100255}}, "2": {"Correct the solution": {"bleu": 6.9957771713059085, "bleu_stderr": 0.3238947499660514, "rouge1_fmeasure": 0.21337595032188442, "rouge1_fmeasure_stderr": 0.005144022732419237, "rouge1_precision": 0.16352590100073616, "rouge1_precision_stderr": 0.0051809522811463865, "rouge1_recall": 0.6961503957283254, "rouge1_recall_stderr": 0.006309156641782605, "rouge2_fmeasure": 0.1635128839739126, "rouge2_fmeasure_stderr": 0.004978071911619273, "rouge2_precision": 0.12411789818339344, "rouge2_precision_stderr": 0.004723425320608877, "rouge2_recall": 0.5262901845589097, "rouge2_recall_stderr": 0.008088441034716209, "rougeL_fmeasure": 0.2054038225613471, "rougeL_fmeasure_stderr": 0.005126617953639098, "rougeL_precision": 0.15682105604362034, "rougeL_precision_stderr": 0.005078191461050391, "rougeL_recall": 0.6706778746147497, "rougeL_recall_stderr": 0.006640724268201555, "rougeLsum_fmeasure": 0.20828419170971285, "rougeLsum_fmeasure_stderr": 0.005133723055482889, "rougeLsum_precision": 0.15931759159992323, "rougeLsum_precision_stderr": 0.005115274381138, "rougeLsum_recall": 0.680473606884987, "rougeLsum_recall_stderr": 0.006549559772955488}, "choose the most appropriate solution": {"acc": 0.5108813928182807, "acc_norm": 0.5108813928182807, "acc_norm_stderr": 0.011663061261117748, "acc_stderr": 0.011663061261117748}, "no prompt needed": {"bleu": 0.1491691772441667, "bleu_stderr": 0.016463506105742935, "rouge1_fmeasure": 0.03119020870747495, "rouge1_fmeasure_stderr": 0.0007848115949067071, "rouge1_precision": 0.018743075278271827, "rouge1_precision_stderr": 0.0006585934745600448, "rouge1_recall": 0.20113968945623895, "rouge1_recall_stderr": 0.004069048528819971, "rouge2_fmeasure": 0.004589484275527073, "rouge2_fmeasure_stderr": 0.00024840528981741393, "rouge2_precision": 0.0026023719861491207, "rouge2_precision_stderr": 0.00014730866868978165, "rouge2_recall": 0.03304396129526227, "rouge2_recall_stderr": 0.0019097087061322854, "rougeL_fmeasure": 0.029387188096479906, "rougeL_fmeasure_stderr": 0.000717400002936173, "rougeL_precision": 0.017648037515718574, "rougeL_precision_stderr": 0.000612945231693746, "rougeL_recall": 0.191148004137635, "rougeL_recall_stderr": 0.003801093134569005, "rougeLsum_fmeasure": 0.025633495048222255, "rougeLsum_fmeasure_stderr": 0.0006527133219539642, "rougeLsum_precision": 0.015525163139534474, "rougeLsum_precision_stderr": 0.0006012960856164704, "rougeLsum_recall": 0.17186503534549524, "rougeLsum_recall_stderr": 0.003612669608221627}, "pick_correct_choice_index": {"acc": 0.4836779107725789, "acc_norm": 0.4836779107725789, "acc_norm_stderr": 0.011659606710151779, "acc_stderr": 0.011659606710151779}, "what_is_the_correct_ending": {"acc": 0.5516866158868335, "acc_norm": 0.5505984766050055, "acc_norm_stderr": 0.011605936624156083, "acc_stderr": 0.011603326108334514}}, "3": {"Correct the solution": {"bleu": 6.990071211182333, "bleu_stderr": 0.2967253826874956, "rouge1_fmeasure": 0.20922245282570967, "rouge1_fmeasure_stderr": 0.0051976113847955795, "rouge1_precision": 0.15871004605077652, "rouge1_precision_stderr": 0.00515504545306226, "rouge1_recall": 0.7034466308669174, "rouge1_recall_stderr": 0.006207428258406954, "rouge2_fmeasure": 0.16228271943343794, "rouge2_fmeasure_stderr": 0.004992233069443837, "rouge2_precision": 0.12184134221668164, "rouge2_precision_stderr": 0.004666177846152433, "rouge2_recall": 0.535375224176284, "rouge2_recall_stderr": 0.008001057333235266, "rougeL_fmeasure": 0.20121070269350022, "rougeL_fmeasure_stderr": 0.005165478141129265, "rougeL_precision": 0.15189168360996885, "rougeL_precision_stderr": 0.00504068577523969, "rougeL_recall": 0.6780782501114493, "rougeL_recall_stderr": 0.006547450748075444, "rougeLsum_fmeasure": 0.20417200076191475, "rougeLsum_fmeasure_stderr": 0.0051750936785322785, "rougeLsum_precision": 0.1543756601976579, "rougeLsum_precision_stderr": 0.005083274387419325, "rougeLsum_recall": 0.688408073065803, "rougeLsum_recall_stderr": 0.006448637747839622}, "choose the most appropriate solution": {"acc": 0.5016322089227421, "acc_norm": 0.5016322089227421, "acc_norm_stderr": 0.011665762007194866, "acc_stderr": 0.011665762007194866}, "no prompt needed": {"bleu": 0.12911173855917576, "bleu_stderr": 0.011212622020653384, "rouge1_fmeasure": 0.030624519846734758, "rouge1_fmeasure_stderr": 0.0008233916974397996, "rouge1_precision": 0.018353557289208323, "rouge1_precision_stderr": 0.0006164592823400039, "rouge1_recall": 0.19679724157173895, "rouge1_recall_stderr": 0.004065759031603333, "rouge2_fmeasure": 0.004446131485933507, "rouge2_fmeasure_stderr": 0.0002459591462609964, "rouge2_precision": 0.0025089231703953195, "rouge2_precision_stderr": 0.00014314849944617542, "rouge2_recall": 0.03367952377754561, "rouge2_recall_stderr": 0.0020381019021143807, "rougeL_fmeasure": 0.02816065241160017, "rougeL_fmeasure_stderr": 0.0007063876575393347, "rougeL_precision": 0.016850481271058308, "rougeL_precision_stderr": 0.0005360133862246574, "rougeL_recall": 0.1837392053172025, "rougeL_recall_stderr": 0.0037458785538945077, "rougeLsum_fmeasure": 0.025134463700752695, "rougeLsum_fmeasure_stderr": 0.0006674889527922072, "rougeLsum_precision": 0.015093144272162332, "rougeLsum_precision_stderr": 0.0005171501945296606, "rougeLsum_recall": 0.16891544570433364, "rougeLsum_recall_stderr": 0.003653462651005743}, "pick_correct_choice_index": {"acc": 0.4766050054406964, "acc_norm": 0.4766050054406964, "acc_norm_stderr": 0.011653047155927788, "acc_stderr": 0.011653047155927788}, "what_is_the_correct_ending": {"acc": 0.5625680087051143, "acc_norm": 0.5625680087051143, "acc_norm_stderr": 0.011574126069682387, "acc_stderr": 0.011574126069682387}}, "4": {"Correct the solution": {"bleu": 7.240744535616564, "bleu_stderr": 0.2586062831369915, "rouge1_fmeasure": 0.2205327621041109, "rouge1_fmeasure_stderr": 0.005523926029284187, "rouge1_precision": 0.17254149538622868, "rouge1_precision_stderr": 0.005654525592751173, "rouge1_recall": 0.7169800690761489, "rouge1_recall_stderr": 0.0060374304726635245, "rouge2_fmeasure": 0.17392590467314775, "rouge2_fmeasure_stderr": 0.005274573095809241, "rouge2_precision": 0.13569869881237576, "rouge2_precision_stderr": 0.005133251060824273, "rouge2_recall": 0.5506147150470818, "rouge2_recall_stderr": 0.007924999642538401, "rougeL_fmeasure": 0.21292785321625532, "rougeL_fmeasure_stderr": 0.0054688898695651805, "rougeL_precision": 0.16613299736339213, "rougeL_precision_stderr": 0.005531569846691979, "rougeL_recall": 0.6938260684470078, "rougeL_recall_stderr": 0.006371003512990666, "rougeLsum_fmeasure": 0.2160261905713099, "rougeLsum_fmeasure_stderr": 0.005483109104579196, "rougeLsum_precision": 0.16847957385780346, "rougeLsum_precision_stderr": 0.0055567140187079665, "rougeLsum_recall": 0.7040155922937625, "rougeLsum_recall_stderr": 0.006248279598171387}, "choose the most appropriate solution": {"acc": 0.5021762785636561, "acc_norm": 0.5021762785636561, "acc_norm_stderr": 0.011665713661738878, "acc_stderr": 0.011665713661738878}, "no prompt needed": {"bleu": 0.11653255801457632, "bleu_stderr": 0.00937865353959567, "rouge1_fmeasure": 0.029037401002642696, "rouge1_fmeasure_stderr": 0.0007664957090889966, "rouge1_precision": 0.01764227588810449, "rouge1_precision_stderr": 0.0006403823122550869, "rouge1_recall": 0.18185072302335697, "rouge1_recall_stderr": 0.0038480944045729292, "rouge2_fmeasure": 0.004018163648220329, "rouge2_fmeasure_stderr": 0.00022597062969223787, "rouge2_precision": 0.0023525543276751216, "rouge2_precision_stderr": 0.00014914943547875338, "rouge2_recall": 0.028175098812063096, "rouge2_recall_stderr": 0.0016880276568031164, "rougeL_fmeasure": 0.027024334508787648, "rougeL_fmeasure_stderr": 0.0006867914982061877, "rougeL_precision": 0.01642546114280103, "rougeL_precision_stderr": 0.0005899392695445675, "rougeL_recall": 0.17098066504375273, "rougeL_recall_stderr": 0.003570562801659878, "rougeLsum_fmeasure": 0.023723025806798186, "rougeLsum_fmeasure_stderr": 0.000614747502166158, "rougeLsum_precision": 0.014394564094771958, "rougeLsum_precision_stderr": 0.0005138074352751769, "rougeLsum_recall": 0.15527067254188423, "rougeLsum_recall_stderr": 0.0034206077082441716}, "pick_correct_choice_index": {"acc": 0.4896626768226333, "acc_norm": 0.4896626768226333, "acc_norm_stderr": 0.011663330673075898, "acc_stderr": 0.011663330673075898}, "what_is_the_correct_ending": {"acc": 0.5489662676822633, "acc_norm": 0.5527747551686616, "acc_norm_stderr": 0.011600659443292926, "acc_stderr": 0.01160974720073308}}, "5": {"Correct the solution": {"bleu": 7.501952156457839, "bleu_stderr": 0.39271130030770846, "rouge1_fmeasure": 0.23639707996315365, "rouge1_fmeasure_stderr": 0.006047125566678411, "rouge1_precision": 0.1907119925251693, "rouge1_precision_stderr": 0.006242576921723348, "rouge1_recall": 0.7229868834154785, "rouge1_recall_stderr": 0.005996824073230608, "rouge2_fmeasure": 0.18898088840112187, "rouge2_fmeasure_stderr": 0.0058122997504793505, "rouge2_precision": 0.15227920059782085, "rouge2_precision_stderr": 0.005751542723918666, "rouge2_recall": 0.5588359253387518, "rouge2_recall_stderr": 0.007903814632285042, "rougeL_fmeasure": 0.2293218614443035, "rougeL_fmeasure_stderr": 0.006023695302966697, "rougeL_precision": 0.1847082573134137, "rougeL_precision_stderr": 0.00614731549204923, "rougeL_recall": 0.7001578119554788, "rougeL_recall_stderr": 0.006345471739520215, "rougeLsum_fmeasure": 0.2322126006841029, "rougeLsum_fmeasure_stderr": 0.0060236108326316, "rougeLsum_precision": 0.1869491193721871, "rougeLsum_precision_stderr": 0.0061683914136742134, "rougeLsum_recall": 0.7106290738998889, "rougeLsum_recall_stderr": 0.0062144876381464535}, "choose the most appropriate solution": {"acc": 0.499455930359086, "acc_norm": 0.499455930359086, "acc_norm_stderr": 0.011665817258899182, "acc_stderr": 0.011665817258899182}, "no prompt needed": {"bleu": 0.10185172940494028, "bleu_stderr": 0.007051902591792091, "rouge1_fmeasure": 0.028066473296435548, "rouge1_fmeasure_stderr": 0.000754026689838101, "rouge1_precision": 0.018013100442129417, "rouge1_precision_stderr": 0.0009190145899227636, "rouge1_recall": 0.17846455901442868, "rouge1_recall_stderr": 0.003868171470663707, "rouge2_fmeasure": 0.0037514567534632703, "rouge2_fmeasure_stderr": 0.0002162496835966985, "rouge2_precision": 0.002344978071093655, "rouge2_precision_stderr": 0.00021411077145171528, "rouge2_recall": 0.02798475436220232, "rouge2_recall_stderr": 0.0018363804191775573, "rougeL_fmeasure": 0.02582308765153566, "rougeL_fmeasure_stderr": 0.0006671470754149905, "rougeL_precision": 0.016632311383909484, "rougeL_precision_stderr": 0.0008812891847948634, "rougeL_recall": 0.16638495652422183, "rougeL_recall_stderr": 0.0035613541742668953, "rougeLsum_fmeasure": 0.023227601838740366, "rougeLsum_fmeasure_stderr": 0.0006168362129579054, "rougeLsum_precision": 0.01508965511135499, "rougeLsum_precision_stderr": 0.0008651573938015946, "rougeLsum_recall": 0.15460309329405014, "rougeLsum_recall_stderr": 0.0035069550811540557}, "pick_correct_choice_index": {"acc": 0.4885745375408052, "acc_norm": 0.4885745375408052, "acc_norm_stderr": 0.011662778026451675, "acc_stderr": 0.011662778026451675}, "what_is_the_correct_ending": {"acc": 0.5576713819368879, "acc_norm": 0.5609357997823722, "acc_norm_stderr": 0.011578865649321297, "acc_stderr": 0.01158796354550718}}}, "sciq": {"0": {"Direct Question": {"acc": 0.862, "acc_norm": 0.808, "acc_norm_stderr": 0.012461592646659966, "acc_stderr": 0.010912152632504411}, "Direct Question (Closed Book)": {"acc": 0.498, "acc_norm": 0.465, "acc_norm_stderr": 0.015780495050030156, "acc_stderr": 0.015819173374302706}, "Multiple Choice": {"acc": 0.569, "acc_norm": 0.498, "acc_norm_stderr": 0.015819173374302702, "acc_stderr": 0.0156679444881735}, "Multiple Choice (Closed Book)": {"acc": 0.422, "acc_norm": 0.395, "acc_norm_stderr": 0.015466551464829344, "acc_stderr": 0.015625625112620667}, "Multiple Choice Question First": {"acc": 0.571, "acc_norm": 0.506, "acc_norm_stderr": 0.015818160898606715, "acc_stderr": 0.015658997547870243}}, "1": {"Direct Question": {"acc": 0.896, "acc_norm": 0.895, "acc_norm_stderr": 0.009698921026024947, "acc_stderr": 0.009658016218524277}, "Direct Question (Closed Book)": {"acc": 0.65, "acc_norm": 0.621, "acc_norm_stderr": 0.01534909100222535, "acc_stderr": 0.015090650341444233}, "Multiple Choice": {"acc": 0.55, "acc_norm": 0.511, "acc_norm_stderr": 0.01581547119529269, "acc_stderr": 0.01574000469338385}, "Multiple Choice (Closed Book)": {"acc": 0.43, "acc_norm": 0.425, "acc_norm_stderr": 0.01564032031704011, "acc_stderr": 0.015663503610155283}, "Multiple Choice Question First": {"acc": 0.427, "acc_norm": 0.412, "acc_norm_stderr": 0.015572363292015093, "acc_stderr": 0.015649789644462217}}, "2": {"Direct Question": {"acc": 0.917, "acc_norm": 0.905, "acc_norm_stderr": 0.009276910103103317, "acc_stderr": 0.008728527206074787}, "Direct Question (Closed Book)": {"acc": 0.664, "acc_norm": 0.649, "acc_norm_stderr": 0.015100563798316405, "acc_stderr": 0.014944140233795023}, "Multiple Choice": {"acc": 0.565, "acc_norm": 0.543, "acc_norm_stderr": 0.015760691590136384, "acc_stderr": 0.0156850572527172}, "Multiple Choice (Closed Book)": {"acc": 0.441, "acc_norm": 0.436, "acc_norm_stderr": 0.015689173023144064, "acc_stderr": 0.015708779894242676}, "Multiple Choice Question First": {"acc": 0.431, "acc_norm": 0.414, "acc_norm_stderr": 0.015583544104177522, "acc_stderr": 0.015667944488173498}}, "3": {"Direct Question": {"acc": 0.921, "acc_norm": 0.91, "acc_norm_stderr": 0.00905439020486644, "acc_stderr": 0.008534156773333443}, "Direct Question (Closed Book)": {"acc": 0.681, "acc_norm": 0.663, "acc_norm_stderr": 0.0149550879186536, "acc_stderr": 0.01474640486547348}, "Multiple Choice": {"acc": 0.571, "acc_norm": 0.553, "acc_norm_stderr": 0.015730176046009063, "acc_stderr": 0.01565899754787025}, "Multiple Choice (Closed Book)": {"acc": 0.481, "acc_norm": 0.452, "acc_norm_stderr": 0.015746235865880677, "acc_stderr": 0.01580787426850585}, "Multiple Choice Question First": {"acc": 0.441, "acc_norm": 0.426, "acc_norm_stderr": 0.015645087688113814, "acc_stderr": 0.015708779894242676}}, "4": {"Direct Question": {"acc": 0.918, "acc_norm": 0.913, "acc_norm_stderr": 0.008916866630745892, "acc_stderr": 0.008680515615523715}, "Direct Question (Closed Book)": {"acc": 0.686, "acc_norm": 0.69, "acc_norm_stderr": 0.014632638658632905, "acc_stderr": 0.01468399195108797}, "Multiple Choice": {"acc": 0.588, "acc_norm": 0.563, "acc_norm_stderr": 0.015693223928730377, "acc_stderr": 0.015572363292015098}, "Multiple Choice (Closed Book)": {"acc": 0.501, "acc_norm": 0.484, "acc_norm_stderr": 0.01581119837311488, "acc_stderr": 0.015819268290576817}, "Multiple Choice Question First": {"acc": 0.448, "acc_norm": 0.435, "acc_norm_stderr": 0.0156850572527172, "acc_stderr": 0.015733516566347836}}, "5": {"Direct Question": {"acc": 0.923, "acc_norm": 0.915, "acc_norm_stderr": 0.008823426366942293, "acc_stderr": 0.008434580140240669}, "Direct Question (Closed Book)": {"acc": 0.708, "acc_norm": 0.707, "acc_norm_stderr": 0.014399942998441271, "acc_stderr": 0.014385511563477345}, "Multiple Choice": {"acc": 0.599, "acc_norm": 0.57, "acc_norm_stderr": 0.01566350361015528, "acc_stderr": 0.015506109745498325}, "Multiple Choice (Closed Book)": {"acc": 0.524, "acc_norm": 0.5, "acc_norm_stderr": 0.015819299929208316, "acc_stderr": 0.015801065586651755}, "Multiple Choice Question First": {"acc": 0.451, "acc_norm": 0.438, "acc_norm_stderr": 0.01569721001969469, "acc_stderr": 0.01574315237958554}}}, "story_cloze_2016": {"0": {"Answer Given options": {"acc": 0.4719401389631213, "acc_norm": 0.49545697487974344, "acc_norm_stderr": 0.011561954965856516, "acc_stderr": 0.011544210396951669}, "Choose Story Ending": {"acc": 0.484233030464992, "acc_norm": 0.518439337252806, "acc_norm_stderr": 0.011554566910658105, "acc_stderr": 0.01155668204219638}, "Novel Correct Ending": {"acc": 0.48583645109567075, "acc_norm": 0.4991982896846606, "acc_norm_stderr": 0.01156241738830021, "acc_stderr": 0.01155779233130167}, "Story Continuation and Options": {"acc": 0.4804917156600748, "acc_norm": 0.5114911811865313, "acc_norm_stderr": 0.011559378273599126, "acc_stderr": 0.011553628196999314}}, "1": {"Answer Given options": {"acc": 0.4730090860502405, "acc_norm": 0.4927846071619455, "acc_norm_stderr": 0.01156122826464673, "acc_stderr": 0.011545573278697235}, "Choose Story Ending": {"acc": 0.4794227685729556, "acc_norm": 0.5104222340994121, "acc_norm_stderr": 0.011559920087347776, "acc_stderr": 0.011552636515221862}, "Novel Correct Ending": {"acc": 0.47835382148583644, "acc_norm": 0.504008551576697, "acc_norm_stderr": 0.011562060664045727, "acc_stderr": 0.011551591851683338}, "Story Continuation and Options": {"acc": 0.4681988241582042, "acc_norm": 0.49812934259754144, "acc_norm_stderr": 0.011562351329083266, "acc_stderr": 0.011539022035111226}}, "2": {"Answer Given options": {"acc": 0.46018172100481025, "acc_norm": 0.48690539818278994, "acc_norm_stderr": 0.011558466383367182, "acc_stderr": 0.011525709570367521}, "Choose Story Ending": {"acc": 0.4596472474612507, "acc_norm": 0.481560662747194, "acc_norm_stderr": 0.011554566910658103, "acc_stderr": 0.01152471548624065}, "Novel Correct Ending": {"acc": 0.47140566541956175, "acc_norm": 0.4820951362907536, "acc_norm_stderr": 0.011555016408505476, "acc_stderr": 0.011543509045585206}, "Story Continuation and Options": {"acc": 0.4494922501336184, "acc_norm": 0.4735435595938001, "acc_norm_stderr": 0.011546234813777399, "acc_stderr": 0.011503288699799179}}, "3": {"Answer Given options": {"acc": 0.46178514163548906, "acc_norm": 0.4778193479422769, "acc_norm_stderr": 0.011551049647290312, "acc_stderr": 0.011528611805439893}, "Choose Story Ending": {"acc": 0.46873329770176375, "acc_norm": 0.4751469802244789, "acc_norm_stderr": 0.011548139823074772, "acc_stderr": 0.011539803085637733}, "Novel Correct Ending": {"acc": 0.4607161945483699, "acc_norm": 0.4681988241582042, "acc_norm_stderr": 0.011539022035111226, "acc_stderr": 0.011526690316014589}, "Story Continuation and Options": {"acc": 0.4580438268305719, "acc_norm": 0.4623196151790486, "acc_norm_stderr": 0.011529552555884575, "acc_stderr": 0.011521653168224729}}, "4": {"Answer Given options": {"acc": 0.4607161945483699, "acc_norm": 0.46980224478888294, "acc_norm_stderr": 0.011541325320336616, "acc_stderr": 0.01152669031601459}, "Choose Story Ending": {"acc": 0.46018172100481025, "acc_norm": 0.4735435595938001, "acc_norm_stderr": 0.0115462348137774, "acc_stderr": 0.011525709570367509}, "Novel Correct Ending": {"acc": 0.4537680384820951, "acc_norm": 0.46285408872260825, "acc_norm_stderr": 0.011530479981182624, "acc_stderr": 0.011512899199863032}, "Story Continuation and Options": {"acc": 0.4569748797434527, "acc_norm": 0.4665954035275254, "acc_norm_stderr": 0.011536599118298168, "acc_stderr": 0.011519544865928062}}, "5": {"Answer Given options": {"acc": 0.467129877071085, "acc_norm": 0.4756814537680385, "acc_norm_stderr": 0.011548748301487317, "acc_stderr": 0.011537420054210303}, "Choose Story Ending": {"acc": 0.4580438268305719, "acc_norm": 0.46125066809192944, "acc_norm_stderr": 0.011527657726586461, "acc_stderr": 0.011521653168224729}, "Novel Correct Ending": {"acc": 0.4548369855692143, "acc_norm": 0.4548369855692143, "acc_norm_stderr": 0.011515167912227987, "acc_stderr": 0.011515167912227987}, "Story Continuation and Options": {"acc": 0.45056119722073756, "acc_norm": 0.45537145911277394, "acc_norm_stderr": 0.011516282203726655, "acc_stderr": 0.01150577173876986}}}, "superglue_rte": {"0": {"GPT-3 style": {"acc": 0.516245487364621, "acc_norm": 0.47653429602888087, "acc_norm_stderr": 0.030063300411902652, "acc_stderr": 0.030080573208738064}, "MNLI crowdsource": {"acc": 0.48375451263537905, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030080573208738064}, "does it follow that": {"acc": 0.48375451263537905, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030080573208738064}, "guaranteed true": {"acc": 0.5379061371841155, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030009848912529113}, "should assume": {"acc": 0.5018050541516246, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030096267148976633}}, "1": {"GPT-3 style": {"acc": 0.51985559566787, "acc_norm": 0.48375451263537905, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030072723167317177}, "MNLI crowdsource": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "does it follow that": {"acc": 0.49097472924187724, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "guaranteed true": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}, "should assume": {"acc": 0.49097472924187724, "acc_norm": 0.49097472924187724, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030091559826331334}}, "2": {"GPT-3 style": {"acc": 0.51985559566787, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030072723167317177}, "MNLI crowdsource": {"acc": 0.5018050541516246, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030096267148976626}, "does it follow that": {"acc": 0.51985559566787, "acc_norm": 0.516245487364621, "acc_norm_stderr": 0.030080573208738064, "acc_stderr": 0.030072723167317177}, "guaranteed true": {"acc": 0.5018050541516246, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.030096267148976626}, "should assume": {"acc": 0.5090252707581228, "acc_norm": 0.5415162454873647, "acc_norm_stderr": 0.029992535385373314, "acc_stderr": 0.030091559826331334}}, "3": {"GPT-3 style": {"acc": 0.5234657039711191, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.03006330041190266}, "MNLI crowdsource": {"acc": 0.49458483754512633, "acc_norm": 0.49458483754512633, "acc_norm_stderr": 0.03009469812323996, "acc_stderr": 0.030094698123239966}, "does it follow that": {"acc": 0.516245487364621, "acc_norm": 0.5523465703971119, "acc_norm_stderr": 0.02993107036293953, "acc_stderr": 0.030080573208738064}, "guaranteed true": {"acc": 0.516245487364621, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.03003973059219781, "acc_stderr": 0.030080573208738064}, "should assume": {"acc": 0.5270758122743683, "acc_norm": 0.5234657039711191, "acc_norm_stderr": 0.030063300411902652, "acc_stderr": 0.0300523034631437}}, "4": {"GPT-3 style": {"acc": 0.5234657039711191, "acc_norm": 0.5306859205776173, "acc_norm_stderr": 0.03003973059219781, "acc_stderr": 0.03006330041190266}, "MNLI crowdsource": {"acc": 0.4584837545126354, "acc_norm": 0.5090252707581228, "acc_norm_stderr": 0.030091559826331334, "acc_stderr": 0.029992535385373314}, "does it follow that": {"acc": 0.516245487364621, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.030080573208738064}, "guaranteed true": {"acc": 0.49458483754512633, "acc_norm": 0.5487364620938628, "acc_norm_stderr": 0.029953149241808946, "acc_stderr": 0.03009469812323996}, "should assume": {"acc": 0.516245487364621, "acc_norm": 0.5379061371841155, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.030080573208738064}}, "5": {"GPT-3 style": {"acc": 0.5270758122743683, "acc_norm": 0.5379061371841155, "acc_norm_stderr": 0.030009848912529113, "acc_stderr": 0.030052303463143706}, "MNLI crowdsource": {"acc": 0.44765342960288806, "acc_norm": 0.5270758122743683, "acc_norm_stderr": 0.030052303463143706, "acc_stderr": 0.02993107036293953}, "does it follow that": {"acc": 0.4981949458483754, "acc_norm": 0.5342960288808665, "acc_norm_stderr": 0.030025579819366426, "acc_stderr": 0.030096267148976633}, "guaranteed true": {"acc": 0.47653429602888087, "acc_norm": 0.5595667870036101, "acc_norm_stderr": 0.029882123363118726, "acc_stderr": 0.03006330041190266}, "should assume": {"acc": 0.51985559566787, "acc_norm": 0.5487364620938628, "acc_norm_stderr": 0.029953149241808943, "acc_stderr": 0.030072723167317184}}}, "winogrande": {"0": {"Replace": {"acc": 0.5090765588003157, "acc_norm": 0.5059194948697711, "acc_norm_stderr": 0.014051500838485807, "acc_stderr": 0.014050170094497697}, "True or False": {"acc": 0.4956590370955012, "acc_norm": 0.4956590370955012, "acc_norm_stderr": 0.014051956064076896, "acc_stderr": 0.014051956064076896}, "does underscore refer to": {"acc": 0.5082872928176796, "acc_norm": 0.4980268350434096, "acc_norm_stderr": 0.014052376259225636, "acc_stderr": 0.014050555322824192}, "stand for": {"acc": 0.5082872928176796, "acc_norm": 0.4964483030781373, "acc_norm_stderr": 0.014052131146915857, "acc_stderr": 0.014050555322824189}, "underscore refer to": {"acc": 0.4956590370955012, "acc_norm": 0.48539857932123126, "acc_norm_stderr": 0.014046492383275835, "acc_stderr": 0.0140519560640769}}, "1": {"Replace": {"acc": 0.4964483030781373, "acc_norm": 0.4940805051302289, "acc_norm_stderr": 0.01405150083848581, "acc_stderr": 0.014052131146915845}, "True or False": {"acc": 0.5082872928176796, "acc_norm": 0.4925019731649566, "acc_norm_stderr": 0.01405090552122858, "acc_stderr": 0.014050555322824192}, "does underscore refer to": {"acc": 0.5074980268350434, "acc_norm": 0.4964483030781373, "acc_norm_stderr": 0.01405213114691586, "acc_stderr": 0.014050905521228573}, "stand for": {"acc": 0.4996053670086819, "acc_norm": 0.5098658247829518, "acc_norm_stderr": 0.014049749833367582, "acc_stderr": 0.014052481306049512}, "underscore refer to": {"acc": 0.4980268350434096, "acc_norm": 0.5043409629044988, "acc_norm_stderr": 0.014051956064076892, "acc_stderr": 0.014052376259225636}}, "2": {"Replace": {"acc": 0.4846093133385951, "acc_norm": 0.48855564325177586, "acc_norm_stderr": 0.014048804199859325, "acc_stderr": 0.014045826789783668}, "True or False": {"acc": 0.489344909234412, "acc_norm": 0.5067087608524072, "acc_norm_stderr": 0.014051220692330346, "acc_stderr": 0.0140492945362904}, "does underscore refer to": {"acc": 0.5122336227308603, "acc_norm": 0.49329123914759276, "acc_norm_stderr": 0.014051220692330352, "acc_stderr": 0.014048278820405621}, "stand for": {"acc": 0.5043409629044988, "acc_norm": 0.4996053670086819, "acc_norm_stderr": 0.014052481306049512, "acc_stderr": 0.01405195606407689}, "underscore refer to": {"acc": 0.4988161010260458, "acc_norm": 0.5122336227308603, "acc_norm_stderr": 0.014048278820405621, "acc_stderr": 0.014052446290529024}}, "3": {"Replace": {"acc": 0.5019731649565904, "acc_norm": 0.4846093133385951, "acc_norm_stderr": 0.014045826789783668, "acc_stderr": 0.014052376259225632}, "True or False": {"acc": 0.49013417521704816, "acc_norm": 0.500394632991318, "acc_norm_stderr": 0.01405248130604952, "acc_stderr": 0.014049749833367589}, "does underscore refer to": {"acc": 0.5240726124704025, "acc_norm": 0.4988161010260458, "acc_norm_stderr": 0.014052446290529015, "acc_stderr": 0.014036189665395136}, "stand for": {"acc": 0.4940805051302289, "acc_norm": 0.4988161010260458, "acc_norm_stderr": 0.014052446290529019, "acc_stderr": 0.014051500838485807}, "underscore refer to": {"acc": 0.5153906866614049, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290396, "acc_stderr": 0.014045826789783668}}, "4": {"Replace": {"acc": 0.4996053670086819, "acc_norm": 0.48224151539068666, "acc_norm_stderr": 0.014043619596174964, "acc_stderr": 0.014052481306049516}, "True or False": {"acc": 0.5035516969218626, "acc_norm": 0.510655090765588, "acc_norm_stderr": 0.014049294536290393, "acc_stderr": 0.014052131146915853}, "does underscore refer to": {"acc": 0.5169692186266772, "acc_norm": 0.4925019731649566, "acc_norm_stderr": 0.014050905521228573, "acc_stderr": 0.014044390401612976}, "stand for": {"acc": 0.505130228887135, "acc_norm": 0.4972375690607735, "acc_norm_stderr": 0.014052271211616438, "acc_stderr": 0.014051745961790513}, "underscore refer to": {"acc": 0.5256511444356748, "acc_norm": 0.5303867403314917, "acc_norm_stderr": 0.014026510839428743, "acc_stderr": 0.014033980956108553}}, "5": {"Replace": {"acc": 0.5035516969218626, "acc_norm": 0.48697711128650356, "acc_norm_stderr": 0.014047718393997663, "acc_stderr": 0.014052131146915869}, "True or False": {"acc": 0.505130228887135, "acc_norm": 0.5027624309392266, "acc_norm_stderr": 0.014052271211616441, "acc_stderr": 0.01405174596179051}, "does underscore refer to": {"acc": 0.5169692186266772, "acc_norm": 0.5090765588003157, "acc_norm_stderr": 0.0140501700944977, "acc_stderr": 0.014044390401612976}, "stand for": {"acc": 0.5327545382794001, "acc_norm": 0.5059194948697711, "acc_norm_stderr": 0.014051500838485807, "acc_stderr": 0.014022300570434137}, "underscore refer to": {"acc": 0.5256511444356748, "acc_norm": 0.5272296764009471, "acc_norm_stderr": 0.014031631629827696, "acc_stderr": 0.01403398095610855}}}} \ No newline at end of file