Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_religion-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl +3 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hellaswag-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-loglikelihood +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-greedy_until +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-res.json +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-greedy_until +1 -0
- scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-res.json +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v2.0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"arc_challenge": {"acc": 0.26621160409556316, "acc_norm": 0.28242320819112626, "acc_norm_stderr": 0.01315545688409722, "acc_stderr": 0.01291577478152323}}, "versions": {"arc_challenge": "2.0"}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_1dc-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
04c3a63a6b3c579bd3775d92b3076ba9130041d5ce7cf9244d3f86e95c804387
|
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
49edb1e735660631ea6cc309721e6c0b80b7106a613a6959514852ca48f1130e
|
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
2888d6d098a5ef8c1e7f0d8295ba80826e2e04e431f57508dfb71d53e1cd4604
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
064c38fcd072b8bd12f54ea4f8e41599ed4e11dc386e93b77e1fc07967d1f960
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_complex_NP_island-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
f46cfcc7e43050a235fd2a6b989cabbfbcce76786df74db9f0d4a9cd1caa1628
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_coordinate_structure_constraint_complex_left_branch": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_complex_left_branch": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_2-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
123e2acd00fbba60aba1fbae607c79a062e512c9e79c7d8dfafff63e30111d76
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relative_clause-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
bf78e2b53c0f3531303c668c96bd3897a0a35e960da37439e63724ecba4e371a
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
d14e4b7fcdd68991eb39b9cf3ade4b37dee9ddd39b688f861d81a327e47a969f
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_quantifiers_2-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
6e6add7baff4217f383425bef58288202018e041b24084edcaa5df8af08f820c
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_existential_there_subject_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_subject_raising": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_expletive_it_object_raising-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_expletive_it_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_expletive_it_object_raising": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
7084358b1b7dd7fb5ead1a58f4b499d6f7610eca897bfac25a986d0f9a91aa5d
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_left_branch_island_echo_question-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
9852b38612db8c6adf938a5d8a7a9e5ce9e655259d6cc806b142506fcaff0ed4
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_only_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_only_npi_scope": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_passive_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_1": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_2-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_principle_A_case_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_2": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_3-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
38454befedcf1f3f6ef27d3bef9ccfdfb3e94a7ab32d86a63493a920d2d50093
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
80f5f98fad26240de2767fe58c4b18d864df41cbfa76f06c84c3fce9f14f4833
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_superlative_quantifiers_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_superlative_quantifiers_2": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
a142cc2a6fcd93230b650927b07367cad957b8f3f42cb4072151da53dea301df
|
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap_long_distance-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"blimp_wh_vs_that_no_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_no_gap_long_distance": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
77b11f4348eb8a7f57faf95c531fda01ab4bf0e729f91a82451ed8e71ec8e66d
|
scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-greedy_until
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
4a8605d5deed0423ec095700251ed93325b45d320aca35d4ce1e94702094435e
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_nationality-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
b85bc849811ccfa9971a6ee3fca7342752c314c0cb6f126e10d9ec4d0450c541
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_race_color-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
0a750596d77cd96502dc414ff699a399b1b91c2078adeec1d3dd982b3d591089
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"crows_pairs_french_race_color": {"likelihood_difference": 0.33233909422443764, "likelihood_difference_stderr": 0.010623405969915857, "pct_stereotype": 0.4782608695652174, "pct_stereotype_stderr": 0.023315932363473738}}, "versions": {"crows_pairs_french_race_color": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_religion-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
8af6445eeb634dad5f0723e40615afe993e1e3f129a4f314fe4117e633c2efd3
|
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_socioeconomic-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
8ba0a525c65f795c99f6416e70c998e75e4b6cc43bf9a4bd7ccacd3c3591e9cb
|
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
8021db8de46850090ddae6e6ec2d382029c3027b7c69884607503f916d09b709
|
scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_f307d52964c295e2005c5e782b688c24388e0cecadf29f1e6fc7f394236ea9c0.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f11de4b3d45d1590ba78935e824ae86ef75bbc370df500f89dde2c397d11c01a
|
3 |
+
size 1297
|
scripts/yans/lm-evaluation-harness/tests/testdata/hellaswag-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
abb808c97d6529eda6c11067837a132c62d25cba0394d720f80cca6df9f7196e
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-college_computer_science": {"acc": 0.22, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909282, "acc_stderr": 0.041633319989322695}}, "versions": {"hendrycksTest-college_computer_science": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-college_mathematics": {"acc": 0.18, "acc_norm": 0.2, "acc_norm_stderr": 0.04020151261036844, "acc_stderr": 0.038612291966536955}}, "versions": {"hendrycksTest-college_mathematics": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
dd6e0a9be1407890e9f8cd4434fb6aa4752ab3d2473837fd465ad99f60ad685e
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_medicine-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-college_medicine": {"acc": 0.27167630057803466, "acc_norm": 0.2543352601156069, "acc_norm_stderr": 0.0332055644308557, "acc_stderr": 0.03391750322321659}}, "versions": {"hendrycksTest-college_medicine": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-college_physics": {"acc": 0.23529411764705882, "acc_norm": 0.23529411764705882, "acc_norm_stderr": 0.04220773659171453, "acc_stderr": 0.04220773659171452}}, "versions": {"hendrycksTest-college_physics": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-global_facts": {"acc": 0.23, "acc_norm": 0.23, "acc_norm_stderr": 0.04229525846816507, "acc_stderr": 0.04229525846816507}}, "versions": {"hendrycksTest-global_facts": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_macroeconomics-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-high_school_macroeconomics": {"acc": 0.2230769230769231, "acc_norm": 0.22564102564102564, "acc_norm_stderr": 0.021193632525148522, "acc_stderr": 0.021107730127244}}, "versions": {"hendrycksTest-high_school_macroeconomics": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-human_aging": {"acc": 0.21524663677130046, "acc_norm": 0.17937219730941703, "acc_norm_stderr": 0.025749819569192804, "acc_stderr": 0.02758406660220827}}, "versions": {"hendrycksTest-human_aging": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"hendrycksTest-machine_learning": {"acc": 0.23214285714285715, "acc_norm": 0.22321428571428573, "acc_norm_stderr": 0.039523019677025116, "acc_stderr": 0.04007341809755806}}, "versions": {"hendrycksTest-machine_learning": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
92a5fad6e9ec700f84946faeccd399dda3569fb71837c9fb0c5c87f5ec29c43e
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"lambada_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_cloze": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_it-v0-loglikelihood
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a
|
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_standard_cloze-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"lambada_standard_cloze": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_standard_cloze": 0}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-greedy_until
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad
|
scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v1-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"math_counting_and_prob": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_counting_and_prob": 1}}
|
scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-greedy_until
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
b920ccb507afdcf3ef6f4c04891913731e9f32ec914801791c6d9f8abf6e1897
|
scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-res.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"results": {"math_prealgebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_prealgebra": 0}}
|