diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..4321db2604d4ef8d992f587841264964acfb065f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_4ds-v0-res.json @@ -0,0 +1 @@ +{"results": {"arithmetic_4ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_4ds": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..c7773f373de3cf44b0d750454a45c4cb581a9957 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5ds-v0-res.json @@ -0,0 +1 @@ +{"results": {"arithmetic_5ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5ds": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..32b700ea9e48728cbf99c82ae417261e53698bb3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_gender_agreement-v0-loglikelihood @@ -0,0 +1 @@ +2d8964e56a17661502ecf3f09c0befba63915360ddf2145b0bd845816950515d \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..8970b32aff4c8e5f815453c87bc241e8ca2f01e5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_complex_left_branch-v0-loglikelihood @@ -0,0 +1 @@ +7e1cc5b9f71abfbe56c4bdf343a1e5632785b66a986b8e904a41ed8f45a2c33e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..defc3560d98de3c640d8e7f41e5bf9bf95d34aa4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_determiner_noun_agreement_with_adj_irregular_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_irregular_1": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..d8ce0672c29ac799339056d0464c733e3f169745 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relational_noun-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_distractor_agreement_relational_noun": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_distractor_agreement_relational_noun": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5b721ca1529d4fe03bb77f8f581411a6fccbfc92 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_2-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_ellipsis_n_bar_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..8d1b39c2d44fc9651099252fbb4c5d4e37c4668d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_inchoative-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_inchoative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_inchoative": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_intransitive-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_intransitive-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..b16238545d5e94fa8c1c8e3166bf0d00863dbf89 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_intransitive-v0-loglikelihood @@ -0,0 +1 @@ +6469ae3b0d46b008846b5fd132f2d2b26ea2858745d056df1470b89aa97a790f \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..d70bd8bad3bdbb6d000939f1cf57261a9351a00a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_1-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_irregular_plural_subject_verb_agreement_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_1": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..910e490a982ab520346e71df2a3de6369db05dd3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-loglikelihood @@ -0,0 +1 @@ +3ef532a85e0ee8f8ff779bc7ddc873d515969a708da84a4eb4a85b7c843cf244 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..f325c2e3e34f2d07f90e32517bf236339bd63b48 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_case_1-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_principle_A_case_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_principle_A_case_1": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..0e201fe3c840152dbb271ba82c794f5ab5c9d5b5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_2-v0-loglikelihood @@ -0,0 +1 @@ +eb5ddf0a97982373ab1a4e58267cfcdebdecdb86c376dfd5ebf46737c9d3ee12 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..f8d1d1f87fb4347f4261920ccb2f12fdda14b7fb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_reconstruction-v0-loglikelihood @@ -0,0 +1 @@ +894efedfd8750d5b8de6157f9b2ed2b51b5290d3a78ea9b041fc62d34e96efbc \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..b7d2819cb3b61b90bd5efee98e890b486fc02f39 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_1-v0-loglikelihood @@ -0,0 +1 @@ +8a01f6a5ea87a01c0c9b0c7b3bc4de4711bf0ff050976976651182b9ed34a0d4 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..4a8317f0b3ac61c3e677a5caa03bd47223a3fb7b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_superlative_quantifiers_2-v0-loglikelihood @@ -0,0 +1 @@ +59c20ff0f632cf42afc74ecc682cf92e5e740417b01e6cf9a610a3bc544d2ea5 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..c9b8c7d06179f5427a99dda5e6b24245e2ea0dbb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_tough_vs_raising_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_tough_vs_raising_2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..4b21da71d54c0a8fe09f204dd4a78f0841c6ae85 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap-v0-res.json @@ -0,0 +1 @@ +{"results": {"blimp_wh_questions_subject_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2b459d8b28901b531dc0068425a4583747ac552d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v0-res.json @@ -0,0 +1 @@ +{"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v1-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..7811121c9fda0c7ec33c2c36639c8ed8febccb05 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v1-loglikelihood @@ -0,0 +1 @@ +6577e0d88572772ef08e64f624c0e3df0953286ae1f118ccef15623b59ffeabf \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/copa-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/copa-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..ebe4c6512a5a4befba815e4ab3b52a3732600607 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/copa-v0-loglikelihood @@ -0,0 +1 @@ +66276b9045b5300cba4b81340db06f674f031fa0b8883714ad0d03be464cd799 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_gender-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_gender-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..44a4c513e5ba88fe7ed54dcb35021b709bc407e2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_gender-v0-loglikelihood @@ -0,0 +1 @@ +2bf62b7cc678f64ffad4a6e6715ff76a2b984bfe8d1165da4b76b3b4dfafb2f9 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_socioeconomic-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_socioeconomic-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..32065ff76227a91aa5631f95b66ff1ce19490800 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_socioeconomic-v0-loglikelihood @@ -0,0 +1 @@ +c309eabfd247a702e32efc4e08211f9a72693d38995be5dd444d497b476396bd \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..d0fc86b66760ef71b6791ffeb9b9061e4cb49720 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french-v0-loglikelihood @@ -0,0 +1 @@ +4fb61dcf4d2c59d6470b297a01d5f429ee442864e225e1760fbf191b2a0901cd \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_age-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_age-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..dbec353c35db547d54e918c718164a0788abc569 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_age-v0-loglikelihood @@ -0,0 +1 @@ +b14a5769f415a234abe89063a1b546aa4a990c84217e5d4a697874cd7f85af35 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v1-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..3b2b697c91962eb160da3950bb22e45889c265e6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/drop-v1-greedy_until @@ -0,0 +1 @@ +a670f911ab2999d72db15f534b22703d19e7837edbda4f9f199ad587f7aae6b2 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..0c01f548806c747150690d942f7def8b2d98f2a2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism-v0-loglikelihood @@ -0,0 +1 @@ +88872f1ed1b203f9649a4ced4fb4627d18c17af455d713de6e17c05eced4ec60 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/gsm8k-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/gsm8k-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..d49400007f95ecd048628bb2f1cadf92132bef24 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/gsm8k-v0-greedy_until @@ -0,0 +1 @@ +e7292dbdd7fd8419ba954f2e0701e04c8d0e8842fe053dbf2fe47d926630e35e \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..86f54245d557e0091d1166b7ffb2029520e566e9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-clinical_knowledge-v0-loglikelihood @@ -0,0 +1 @@ +fbcb7ce507e0675d811e71e10a67c8d05a6605e29036f46776e04a6588cefbda \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_chemistry-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_chemistry-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..4dc95a151ac2da73b3c5eb23e3fe24a7ccc8024d --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_chemistry-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-college_chemistry": {"acc": 0.28, "acc_norm": 0.26, "acc_norm_stderr": 0.04408440022768078, "acc_stderr": 0.04512608598542127}}, "versions": {"hendrycksTest-college_chemistry": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..a840b6b6420053c343787f08d8d723ab5ba5c1d3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_mathematics-v0-loglikelihood @@ -0,0 +1 @@ +e9fe80752686527281f834d2397875b4580581434b94799f9de6aaa450bd73ff \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..a4751fdbfad7b614f9ec059a78130426e1d8a39c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-global_facts-v0-loglikelihood @@ -0,0 +1 @@ +9fdc85240b8170839278b1e883ee0868611d84dce202cb8aa037c841ec76d089 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2d81594963cefe41f139a813fcdc16c0f247f9ed --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_chemistry-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_chemistry": {"acc": 0.2857142857142857, "acc_norm": 0.2660098522167488, "acc_norm_stderr": 0.031089826002937523, "acc_stderr": 0.031785297106427496}}, "versions": {"hendrycksTest-high_school_chemistry": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..ac80d178809ddbacc3aeb8ff368d8a68605a6430 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-loglikelihood @@ -0,0 +1 @@ +add45970ea3865be7c7a31f788a835949f6937ac73f699b122ca56a3431e95f8 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..16cc02ff0a897dda3a6c6dc97e9b7815ea120fc2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_government_and_politics": {"acc": 0.24352331606217617, "acc_norm": 0.23834196891191708, "acc_norm_stderr": 0.03074890536390988, "acc_stderr": 0.030975436386845436}}, "versions": {"hendrycksTest-high_school_government_and_politics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..dc86769fa93781e03ca8f7e7b3493b39338bcdaa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_mathematics-v0-loglikelihood @@ -0,0 +1 @@ +ab368d16fc4648ad27940f71abd266366663f51db612f732a0b9b0eea28de9f8 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..cf698d181c95ef88e774204df6f92622116d690c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_microeconomics-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_microeconomics": {"acc": 0.24369747899159663, "acc_norm": 0.22268907563025211, "acc_norm_stderr": 0.027025433498882378, "acc_stderr": 0.027886828078380558}}, "versions": {"hendrycksTest-high_school_microeconomics": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..bd4edd2394a4ccaf2d75c578a9f45ad614657dd8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-international_law": {"acc": 0.2396694214876033, "acc_norm": 0.3140495867768595, "acc_norm_stderr": 0.042369647530410164, "acc_stderr": 0.03896878985070417}}, "versions": {"hendrycksTest-international_law": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..571873985758661a0feed74e164f606318e58d8c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-loglikelihood @@ -0,0 +1 @@ +355489f4bd176ab84db5ef4c03d56ddeeeb1b0ad69827122b2d800e1cdc7e5f0 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..7a84623fabf793b7748d34c18f4c358649f31a97 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-management-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-management": {"acc": 0.24271844660194175, "acc_norm": 0.2621359223300971, "acc_norm_stderr": 0.043546310772605956, "acc_stderr": 0.04245022486384495}}, "versions": {"hendrycksTest-management": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..8f7b30ba8823a0a0d8fc94f69ef64d362835e0db --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-loglikelihood @@ -0,0 +1 @@ +ab70f500cf24e876f6ae6bdc27525a1d6074fa9b6ea97770255d9fc2559b36ff \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-security_studies-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-security_studies-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2c9de8886a29e0479074513470594c9266c5d0ac --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-security_studies-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-security_studies": {"acc": 0.2979591836734694, "acc_norm": 0.2693877551020408, "acc_norm_stderr": 0.02840125202902294, "acc_stderr": 0.029279567411065674}}, "versions": {"hendrycksTest-security_studies": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..d3f581c9f256191c2c0403a582fd72696150b34a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-sociology-v0-loglikelihood @@ -0,0 +1 @@ +f99a3caece11169f2a5cc951001f92027104afd25d29b2a399883bd4bf118605 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..0f414a928b62a8f8eefc939d693c944dd2521733 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"iwslt17-ar-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015049895477752772, "chrf_stderr": 0.0002940315671893584, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-ar-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..ae19de0e6951bd90cd1e713d14816767496044e8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_de-v0-loglikelihood @@ -0,0 +1 @@ +5ad125e1708499832b2cee8c3388f89f9c0277010fd96fbd3359039ce8105984 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..efd450a8f2a4ca067f7380af809fdda48d1ee465 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-loglikelihood @@ -0,0 +1 @@ +6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_es-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_es-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..df895fe6d6bf04fc51c1633d26fb835941176534 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_es-v0-loglikelihood @@ -0,0 +1 @@ +4a88f4b316c72fe0396c382d6cbb33568ac4d0ad225150d3536635c085359fc9 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..2e7f6ef516e5e59af82f1768cfde132d57c1a1ec --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-res.json @@ -0,0 +1 @@ +{"results": {"lambada_openai_mt_it": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_mt_it": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v1-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..3ab10de26a038019a18699e20887de6da66981c4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v1-greedy_until @@ -0,0 +1 @@ +d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..71bbd8d9c221ca484d517bda46c109b2610f79f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v0-greedy_until @@ -0,0 +1 @@ +bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mathqa-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mathqa-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..dabd07c07cbbad2886b20acb25189b111676bbcd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mathqa-v0-res.json @@ -0,0 +1 @@ +{"results": {"mathqa": {"acc": 0.20770519262981574, "acc_norm": 0.2050251256281407, "acc_norm_stderr": 0.007390619359738901, "acc_stderr": 0.007426217631188539}}, "versions": {"mathqa": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt b/scripts/yans/lm-evaluation-harness/tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6c56c1261f69f89dff5ac169fd75d16a5a9f43 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt @@ -0,0 +1,22 @@ +| Tasks |Version|Filter|n-shot|Metric| |Value | |Stderr| +|-------------------------------|------:|------|-----:|------|---|-----:|---|------| +|stem | 2|none | |acc |↑ |0.2474|± | N/A| +| - abstract_algebra | 1|none | 0|acc |↑ |0.2000|± | N/A| +| - anatomy | 1|none | 0|acc |↑ |0.3000|± | N/A| +| - astronomy | 1|none | 0|acc |↑ |0.1000|± | N/A| +| - college_biology | 1|none | 0|acc |↑ |0.3000|± | N/A| +| - college_chemistry | 1|none | 0|acc |↑ |0.1000|± | N/A| +| - college_computer_science | 1|none | 0|acc |↑ |0.2000|± | N/A| +| - college_mathematics | 1|none | 0|acc |↑ |0.2000|± | N/A| +| - college_physics | 1|none | 0|acc |↑ |0.3000|± | N/A| +| - computer_security | 1|none | 0|acc |↑ |0.5000|± | N/A| +| - conceptual_physics | 1|none | 0|acc |↑ |0.3000|± | N/A| +| - electrical_engineering | 1|none | 0|acc |↑ |0.4000|± | N/A| +| - elementary_mathematics | 1|none | 0|acc |↑ |0.0000|± | N/A| +| - high_school_biology | 1|none | 0|acc |↑ |0.3000|± | N/A| +| - high_school_chemistry | 1|none | 0|acc |↑ |0.4000|± | N/A| +| - high_school_computer_science| 1|none | 0|acc |↑ |0.3000|± | N/A| +| - high_school_mathematics | 1|none | 0|acc |↑ |0.2000|± | N/A| +| - high_school_physics | 1|none | 0|acc |↑ |0.3000|± | N/A| +| - high_school_statistics | 1|none | 0|acc |↑ |0.0000|± | N/A| +| - machine_learning | 1|none | 0|acc |↑ |0.3000|± | N/A| \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..d9dada7a0244534c35d86efb71a03fbd90217328 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/mnli-v0-res.json @@ -0,0 +1 @@ +{"results": {"mnli": {"acc": 0.32868059093224655, "acc_stderr": 0.004741640290753859}}, "versions": {"mnli": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v1-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..52a89c6f9eaf1cece362cf2d4bd114f8ae3cbdda --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v1-loglikelihood @@ -0,0 +1 @@ +0e793bd6f637a70a04c6f2cda080188fc037961b2f909095fe63f7bdbc4a90c6 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..698b03e8b3b437f94f22744ffe12ba2fff9285f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_bookcorpus2": {"bits_per_byte": 1.1631037706429144e-06, "byte_perplexity": 1.000001163104447, "word_perplexity": 1.0000066499426599}}, "versions": {"pile_bookcorpus2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..757ef06f79c938e6968583eb58a135fad23c897e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_gutenberg": {"bits_per_byte": 1.2443606332351536e-06, "byte_perplexity": 1.0000012443614075, "word_perplexity": 1.0000072174665404}}, "versions": {"pile_gutenberg": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..bd7b15927f717baab5b7ce2e9d659dda6d681769 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v1-loglikelihood_rolling @@ -0,0 +1 @@ +02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v0-loglikelihood_rolling new file mode 100644 index 0000000000000000000000000000000000000000..4fbbc241ba9487c2513cdf46dbb76e004e401418 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_philpapers-v0-loglikelihood_rolling @@ -0,0 +1 @@ +339ba5d8c044c4a3ff9b9a8eaa24da1d6c01b72972074eb671a7da049eeb7047 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..dff51cba766a795077324ffe9bf71d786dbb695a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_ubuntu-irc": {"bits_per_byte": 1.6298315496830533e-06, "byte_perplexity": 1.0000016298328778, "word_perplexity": 1.0000108866656874}}, "versions": {"pile_ubuntu-irc": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..0e3b1b25977cc5c7eba81358df76e7ed45d1b04a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_ubuntu-irc-v1-res.json @@ -0,0 +1 @@ +{"results": {"pile_ubuntu-irc": {"bits_per_byte": 2.3513498942121155e-06, "byte_perplexity": 1.0000016298328778, "word_perplexity": 1.0000108866656874}}, "versions": {"pile_ubuntu-irc": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_wikipedia-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_wikipedia-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..bfffde9938833ae29f5665130d844630c7fb9735 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_wikipedia-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_wikipedia": {"bits_per_byte": 0.00016834722287561703, "byte_perplexity": 1.0001683613940646, "word_perplexity": 1.001084677949439}}, "versions": {"pile_wikipedia": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/prost-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/prost-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..a94b8cdec9e45e5b236703414fbcc6a3ed74f7ff --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/prost-v0-loglikelihood @@ -0,0 +1 @@ +7c475f5b36a8b79f94c2be035441e7fd59dac021b0713b1fc72d256424c70b0b \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/rte-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/rte-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..10314dd047e4d7202c755fe8cfc55bc9b1edd5f8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/rte-v0-res.json @@ -0,0 +1 @@ +{"results": {"rte": {"acc": 0.5379061371841155, "acc_stderr": 0.030009848912529117}}, "versions": {"rte": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..024652e0a39ed0298f8f6f67453f644a68f3a367 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v0-greedy_until @@ -0,0 +1 @@ +b261e8885c11750ce6911bb11e8693de03d53758297c26fb14cfc1ef508862cb \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-de-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-de-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..826e0382abb32da67ac0dd0b271527b40fd3b6ae --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-de-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt16-de-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.013700416764482968, "chrf_stderr": 0.00016071651360909355, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-de-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..267763793d5fa5a16c41cbcdd9eb7b134cd34cea --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-ro-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt16-ro-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01262029828861831, "chrf_stderr": 0.00014507496111350828, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-ro-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-cs-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-cs-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..70c80afe5bd10baabdcb507faa385db124c1f42e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-cs-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-cs-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.006212086270964023, "chrf_stderr": 0.0001119165191795531, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-cs-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ps-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ps-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..77b600c49afa12cf988280e337a9d4747195f95f --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ps-v0-greedy_until @@ -0,0 +1 @@ +8411c2cb73114cbd0c6e0f17eab2625d486cc3a601105deb0ea1338a401df689 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ru-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ru-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..d21d39ac9f9616bda1e21d1ba5bd63fdb542a7aa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ru-v0-greedy_until @@ -0,0 +1 @@ +a1613831f69c1679a54670092af40ce76617b79d7cc837984803b0fc52bb8bde \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..b04f968d76055c494762ec6cd2a84327c8351742 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ta-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-en-ta": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-en-ta": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..27c60fb72194325e51647ce0fe137710df6dff86 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ru-en-v0-greedy_until @@ -0,0 +1 @@ +1477ab6542c26bd0222cc1aded174f33bf8d04d1cf6a1c0959aeca4ff3779adc \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-zh-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-zh-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..11b8df7f8739d9e4a459636640af6ebb2b7b868a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-zh-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-zh-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.008438201290981157, "chrf_stderr": 0.0001109053964076822, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-zh-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..8841cb74d16977645c1c7399d8b58de094bafef1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v0-res.json @@ -0,0 +1 @@ +{"results": {"wnli": {"acc": 0.3380281690140845, "acc_stderr": 0.05653887739133514}}, "versions": {"wnli": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v1-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v1-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..cbf4ad3777eebbcee7c1ccf1c4a4cac64829ad2b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v1-loglikelihood @@ -0,0 +1 @@ +8a0f81661d2ab2334bbc8031fac31c0c8882f1d9271dd51599d21dfdbb726dea \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wsc273-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/wsc273-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..9d592917bd96783c290237446891eef56083a693 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wsc273-v0-loglikelihood @@ -0,0 +1 @@ +26450d414c4581feb51a09882080e7a9b95882e7eab47b1751a4a6024b5a60ee \ No newline at end of file