koichi12 committed
Commit f3d9fcb · verified · 1 Parent(s): b98e04a

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. scripts/yans/lm-evaluation-harness/tests/testdata/anagrams2-v0-greedy_until +1 -0
  2. scripts/yans/lm-evaluation-harness/tests/testdata/anli_r1-v0-loglikelihood +1 -0
  3. scripts/yans/lm-evaluation-harness/tests/testdata/anli_r2-v0-loglikelihood +1 -0
  4. scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v0-loglikelihood +1 -0
  5. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2da-v0-res.json +1 -0
  6. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2ds-v0-res.json +1 -0
  7. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-res.json +1 -0
  8. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_gender_agreement-v0-res.json +1 -0
  9. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-res.json +1 -0
  10. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood +1 -0
  11. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-res.json +1 -0
  12. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood +1 -0
  13. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-res.json +1 -0
  14. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-res.json +1 -0
  15. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-loglikelihood +1 -0
  16. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood +1 -0
  17. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_scope-v0-res.json +1 -0
  18. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_object_gap-v0-res.json +1 -0
  19. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood +1 -0
  20. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood +1 -0
  21. scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v1-res.json +1 -0
  22. scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-res.json +1 -0
  23. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_gender-v0-res.json +1 -0
  24. scripts/yans/lm-evaluation-harness/tests/testdata/drop-v1-res.json +1 -0
  25. scripts/yans/lm-evaluation-harness/tests/testdata/ethics_cm-v0-res.json +1 -0
  26. scripts/yans/lm-evaluation-harness/tests/testdata/ethics_deontology-v0-loglikelihood +1 -0
  27. scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-loglikelihood +1 -0
  28. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood +1 -0
  29. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-conceptual_physics-v0-res.json +1 -0
  30. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_sexuality-v0-res.json +1 -0
  31. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-res.json +1 -0
  32. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood +1 -0
  33. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-prehistory-v0-res.json +1 -0
  34. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood +1 -0
  35. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood +1 -0
  36. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_en-v0-loglikelihood +1 -0
  37. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_fr-v0-res.json +1 -0
  38. scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v0-greedy_until +1 -0
  39. scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-res.json +1 -0
  40. scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-greedy_until +1 -0
  41. scripts/yans/lm-evaluation-harness/tests/testdata/mc_taco-v0-res.json +1 -0
  42. scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v0-loglikelihood_rolling +1 -0
  43. scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v1-loglikelihood_rolling +1 -0
  44. scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling +1 -0
  45. scripts/yans/lm-evaluation-harness/tests/testdata/pile_dm-mathematics-v1-res.json +1 -0
  46. scripts/yans/lm-evaluation-harness/tests/testdata/pile_europarl-v0-res.json +1 -0
  47. scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v1-loglikelihood_rolling +1 -0
  48. scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v1-res.json +1 -0
  49. scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-loglikelihood_rolling +1 -0
  50. scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/anagrams2-v0-greedy_until ADDED
@@ -0,0 +1 @@
+ 6700a3c44e48abe8337238dcbe3b54cf4abafe0c204c52d921e590872fbd05e7
scripts/yans/lm-evaluation-harness/tests/testdata/anli_r1-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 3a84baf2f170e138c6ce0bc9f06f905def35d705fa2b8781f10c87aef404c4cb
scripts/yans/lm-evaluation-harness/tests/testdata/anli_r2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ d0ea3c3e09d533982c15b4c034439896d6af4bbafb2254d305e20215534a251d
scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 41c34c96cca8ace661911d0033d630c554b283f5a3953bcdc50720ae6b00a9c1
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2da-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"arithmetic_2da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2da": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2ds-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"arithmetic_2ds": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_2ds": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_5da-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"arithmetic_5da": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"arithmetic_5da": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_gender_agreement-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_anaphor_gender_agreement": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_anaphor_gender_agreement": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_animate_subject_passive-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_animate_subject_passive": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_animate_subject_passive": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 23ddafdff7b1ebe331b146e23b2c21aa109fe57aa1ce8ca201a0d239fcbdd166
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_coordinate_structure_constraint_object_extraction-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_coordinate_structure_constraint_object_extraction": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_coordinate_structure_constraint_object_extraction": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_adjectives-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 47c56f336df11924d8b97feb46339ce55bea4b216b6fd13946cc999ea36a4a95
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_past_participle_verbs-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_irregular_past_participle_verbs": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_past_participle_verbs": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_npi_present_1-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_npi_present_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_npi_present_1": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_scope-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ fc0be817478c212327050fa297ef61ad214f4847dbff61d4e0fe7914c06a1691
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ e6666c5657215ff4bfd646b8ee3ae6df956e71c0be9ab1c287fb1b68291dd0d1
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_scope-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_sentential_negation_npi_scope": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_scope": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_object_gap-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"blimp_wh_questions_object_gap": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_object_gap": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ d5486ffcc075cad4302e37ece9bbf5b2063c0b5a48e76c8e1dd365e22a5a48fc
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ eed67491bdf493a1dad8f1d9766bc7bd0e79946365b833c0f7eb81ac998e3dca
scripts/yans/lm-evaluation-harness/tests/testdata/boolq-v1-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"boolq": {"acc": 0.5048929663608562, "acc_stderr": 0.00874463623355505}}, "versions": {"boolq": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/coqa-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"coqa": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"coqa": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_gender-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"crows_pairs_english_gender": {"likelihood_difference": 0.3361377482385407, "likelihood_difference_stderr": 0.012853081126751691, "pct_stereotype": 0.478125, "pct_stereotype_stderr": 0.027967820983765136}}, "versions": {"crows_pairs_english_gender": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/drop-v1-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_cm-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"ethics_cm": {"acc": 0.49987129987129986, "acc_stderr": 0.008022881531793336}}, "versions": {"ethics_cm": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_deontology-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 74ecebe322457d70afc16fde848978410a09b854dc65c47f428d100bd1593248
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_justice-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ d7dfc44fea507b5c5c3a8218f79ed8197da8599ebb396d85feb91c25512126b6
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-abstract_algebra-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ e35d1eeb356ac1084d4e9773f028cb3c81ba1c6e5574d598ac4a78aa467cd797
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-conceptual_physics-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-conceptual_physics": {"acc": 0.2680851063829787, "acc_norm": 0.2553191489361702, "acc_norm_stderr": 0.028504856470514185, "acc_stderr": 0.028957342788342347}}, "versions": {"hendrycksTest-conceptual_physics": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_sexuality-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-human_sexuality": {"acc": 0.22137404580152673, "acc_norm": 0.22900763358778625, "acc_norm_stderr": 0.036853466317118506, "acc_stderr": 0.0364129708131373}}, "versions": {"hendrycksTest-human_sexuality": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-jurisprudence-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-jurisprudence": {"acc": 0.25, "acc_norm": 0.3148148148148148, "acc_norm_stderr": 0.04489931073591312, "acc_stderr": 0.04186091791394607}}, "versions": {"hendrycksTest-jurisprudence": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-logical_fallacies-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 2e9449dd803f9e2334dc562d9f04031fd013ed36b883b44ab500533a5dbbface
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-prehistory-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"hendrycksTest-prehistory": {"acc": 0.2623456790123457, "acc_norm": 0.26851851851851855, "acc_norm_stderr": 0.024659685185967277, "acc_stderr": 0.02447722285613511}}, "versions": {"hendrycksTest-prehistory": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_accounting-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 847418f7b22cd9b499e95fd73c40a2fbc40076895280cc2c560199c0c4c4f433
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-us_foreign_policy-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ a1a338d0083a21054f74d36a296d6bd8e2e457327c0fd630bebcc61ed758044d
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_en-v0-loglikelihood ADDED
@@ -0,0 +1 @@
+ 6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_fr-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"lambada_openai_mt_fr": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_mt_fr": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v0-greedy_until ADDED
@@ -0,0 +1 @@
+ 46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42
scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_intermediate_algebra": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/math_precalc-v1-greedy_until ADDED
@@ -0,0 +1 @@
+ bc834b06fd79473ca6fe38a51b714aad0bf0478c1b0eec787eca34dbdf69cb71
scripts/yans/lm-evaluation-harness/tests/testdata/mc_taco-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"mc_taco": {"em": 0.07732732732732733, "f1": 0.41600515965511614}}, "versions": {"mc_taco": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v0-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
+ 814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec
scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v1-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
+ 814f9954e44368559602c00f7e85fa3971acdfd0315f508ec7df6318a79c55ec
scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v1-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
+ 5c17ddfebeab8c41dabadb6fc216ceda91e3fe5dc95aaf1b2c843d7f11828b03
scripts/yans/lm-evaluation-harness/tests/testdata/pile_dm-mathematics-v1-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"pile_dm-mathematics": {"bits_per_byte": 8.910951449933553e-05, "byte_perplexity": 1.0000617679162955, "word_perplexity": 1.0002875035042451}}, "versions": {"pile_dm-mathematics": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/pile_europarl-v0-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"pile_europarl": {"bits_per_byte": 8.648858203555344e-06, "byte_perplexity": 1.000008648895605, "word_perplexity": 1.000063506523818}}, "versions": {"pile_europarl": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v1-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
+ d77f3f68aadd6cbf1290c2f6737b2ed5d5c2a60e4c81a65c280f207783caabe1
scripts/yans/lm-evaluation-harness/tests/testdata/pile_freelaw-v1-res.json ADDED
@@ -0,0 +1 @@
+ {"results": {"pile_freelaw": {"bits_per_byte": 4.5623635481434923e-05, "byte_perplexity": 1.0000316243943415, "word_perplexity": 1.000203169094218}}, "versions": {"pile_freelaw": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v0-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
+ df384c3df3d8f53273e97127c5bb84c17e638acad7d6bc9c91f6dee96d43b639
scripts/yans/lm-evaluation-harness/tests/testdata/pile_opensubtitles-v0-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
+ 0f1c23a1f4ddec0c2b1ff34de8d1505b0eb9e2868d8edbcc1b6de13d02f32036