koichi12 committed on
Commit
b66510d
·
verified ·
1 Parent(s): 5701701

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. scripts/yans/lm-evaluation-harness/tests/testdata/anagrams1-v0-greedy_until +1 -0
  2. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2da-v0-loglikelihood +1 -0
  3. scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2dm-v0-loglikelihood +1 -0
  4. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood +1 -0
  5. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood +1 -0
  6. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-loglikelihood +1 -0
  7. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json +1 -0
  8. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_object_raising-v0-res.json +1 -0
  9. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood +1 -0
  10. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood +1 -0
  11. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-loglikelihood +1 -0
  12. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood +1 -0
  13. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood +1 -0
  14. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-res.json +1 -0
  15. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood +1 -0
  16. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood +1 -0
  17. scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-res.json +1 -0
  18. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-res.json +1 -0
  19. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_autre-v0-res.json +1 -0
  20. scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-loglikelihood +1 -0
  21. scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-res.json +1 -0
  22. scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-res.json +1 -0
  23. scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_8fcf3f2f52afeb2acd7c8e02c2cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl +3 -0
  24. scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_0deb8e9bde8e8327bbc48157f638ff3ba06b0cd816dad2beb8ad90f7fbe795c7.pkl +3 -0
  25. scripts/yans/lm-evaluation-harness/tests/testdata/headqa-v0-loglikelihood +1 -0
  26. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood +1 -0
  27. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood +1 -0
  28. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-econometrics-v0-res.json +1 -0
  29. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-electrical_engineering-v0-res.json +1 -0
  30. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood +1 -0
  31. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-formal_logic-v0-res.json +1 -0
  32. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood +1 -0
  33. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_psychology-v0-res.json +1 -0
  34. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood +1 -0
  35. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood +1 -0
  36. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood +1 -0
  37. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-marketing-v0-res.json +1 -0
  38. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood +1 -0
  39. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-res.json +1 -0
  40. scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-res.json +1 -0
  41. scripts/yans/lm-evaluation-harness/tests/testdata/lambada-v0-loglikelihood +1 -0
  42. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-loglikelihood +1 -0
  43. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_en-v0-res.json +1 -0
  44. scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt +4 -0
  45. scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v1-greedy_until +1 -0
  46. scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-res.json +1 -0
  47. scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-res.json +1 -0
  48. scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v1-res.json +1 -0
  49. scripts/yans/lm-evaluation-harness/tests/testdata/pile_books3-v1-res.json +1 -0
  50. scripts/yans/lm-evaluation-harness/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling +1 -0
scripts/yans/lm-evaluation-harness/tests/testdata/anagrams1-v0-greedy_until ADDED
@@ -0,0 +1 @@
 
 
1
+ 7c0c5246d3f751f39119a5629ac1d4b2c6fd2a315f78d6de9b2c387e24e3fef1
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2da-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 6ca1ca6ebd7cac4420d5005f7f35b0edbc921377f5e4f8874cc176e4fb6d79d4
scripts/yans/lm-evaluation-harness/tests/testdata/arithmetic_2dm-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 14ac5e510cdf82967d6827a9ca059906ee1db2e347be1b17f36403a157e73552
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ ccc64b4d5e80c081d5161aae5828212ba49d277ca8c5a4281f181744727a6a99
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_distractor_agreement_relational_noun-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 8aab641bd5933f84f46a14f5c1208a3c855cace7e67b44abcd5aff8fec96717d
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_drop_argument-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 616109e63f162dcd31a632943e7ef0c9e0431afeb179e83e9b04b39007b16f5b
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_ellipsis_n_bar_1-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_ellipsis_n_bar_1": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_ellipsis_n_bar_1": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_object_raising-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_existential_there_object_raising": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_existential_there_object_raising": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_subject_raising-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 9b324b28ae3e1b5d49ecf4b7b2a16c7bbc8ff38d000cf216fab75df633da2084
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_only_npi_licensor_present-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ d2d0711611b5b218c6fa8c7278494749252b7868c396451919b761303556bd66
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_1-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ fa4addddd8e380031b8e0871776cabcb707c0f21dcaf5d8b3defec66cce55043
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_principle_A_domain_1-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 290e7eddacea4ec16989af697f2ee3373fdd9aef4b452bf887184c6e2f6e7d9d
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_scope-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 32fcbd0a1c6e664af2751bad552587b5ca3911973b07f4fb2cf0a2acd3de5349
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_subject_island-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_sentential_subject_island": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_subject_island": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_tough_vs_raising_2-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ d255a10a34f14d77d9526604a17b0f6747d32f62fc2e3a09e9ab10054535fd45
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_no_gap-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ d1d3e439b2020ef5ed232bfebbcc9634adc5117e9eb61e38fdbbe2c8ea128d54
scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_vs_that_with_gap_long_distance-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"blimp_wh_vs_that_with_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_vs_that_with_gap_long_distance": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"crows_pairs_english": {"likelihood_difference": 0.3367363060632734, "likelihood_difference_stderr": 0.005827747024053628, "pct_stereotype": 0.5062611806797853, "pct_stereotype_stderr": 0.012212341600228745}}, "versions": {"crows_pairs_english": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_autre-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"crows_pairs_english_autre": {"likelihood_difference": 0.3424336593343321, "likelihood_difference_stderr": 0.08588068996335849, "pct_stereotype": 0.2727272727272727, "pct_stereotype_stderr": 0.14083575804390605}}, "versions": {"crows_pairs_english_autre": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 90c1bcfdeec0ff51d891ee8cf00ae2a5ec61bab6739faea9865809b8ffed2cdb
scripts/yans/lm-evaluation-harness/tests/testdata/drop-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"drop": {"em": 0.0, "em_stderr": 0.0, "f1": 0.0, "f1_stderr": 0.0}}, "versions": {"drop": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/ethics_virtue-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"ethics_virtue": {"acc": 0.5035175879396985, "acc_stderr": 0.0070893491553555765, "em": 0.036180904522613064}}, "versions": {"ethics_virtue": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/gguf_test_8fcf3f2f52afeb2acd7c8e02c2cc3ce31a691b665d295f6c4e4bbd71c7caa1a2.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f79475c06a8800d8abef183b690409f304e0a6963681965f6caba1ca985b243
3
+ size 532
scripts/yans/lm-evaluation-harness/tests/testdata/gpt3_test_0deb8e9bde8e8327bbc48157f638ff3ba06b0cd816dad2beb8ad90f7fbe795c7.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f7a190d338d1ef03f209a8a3340c0d282c73723633b8f5a71a8dc8ee94b9535
3
+ size 570
scripts/yans/lm-evaluation-harness/tests/testdata/headqa-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 767ca34d9714edd9fb030ddbcc35a64e5180d1e247b0cb557fbb22fdf971ad1f
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ a8a1892d1906cc3e7ffd321043f0a60f3b8b69ef76e5c6ff03c6ea41dc87d0cb
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-conceptual_physics-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 622f191ccfc7a597d99f39897ebe3f95a9ddce0e662fcfb411aa554b289bb355
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-econometrics-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-econometrics": {"acc": 0.24561403508771928, "acc_norm": 0.24561403508771928, "acc_norm_stderr": 0.04049339297748142, "acc_stderr": 0.040493392977481425}}, "versions": {"hendrycksTest-econometrics": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-electrical_engineering-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-electrical_engineering": {"acc": 0.2689655172413793, "acc_norm": 0.2827586206896552, "acc_norm_stderr": 0.037528339580033376, "acc_stderr": 0.036951833116502325}}, "versions": {"hendrycksTest-electrical_engineering": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-formal_logic-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ c0d0f0c008a5f3faf2f6f4268d87bbc09c40bb66ae08cf38eea0bf2e519c5a59
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-formal_logic-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-formal_logic": {"acc": 0.25396825396825395, "acc_norm": 0.2698412698412698, "acc_norm_stderr": 0.03970158273235172, "acc_stderr": 0.03893259610604674}}, "versions": {"hendrycksTest-formal_logic": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_psychology-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 0e4c8d13806d3696167e40544d2d114c557c10c74bc61fcb9c51bbfced0266ef
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_psychology-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-high_school_psychology": {"acc": 0.24587155963302754, "acc_norm": 0.23302752293577983, "acc_norm_stderr": 0.018125669180861493, "acc_stderr": 0.018461940968708436}}, "versions": {"hendrycksTest-high_school_psychology": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_aging-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 0880b3a78f8d7b17ffc612031427b9085367cf65dabe2a68c4b64e3171d17e88
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-human_sexuality-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 4b07922fa1d549b655c21440b13d869263ce7dd9771d8147c450f11c91d26c10
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-machine_learning-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 7a7138821a66ef946e427b40344cf7f1a916a2926995a85ef731a3bee40cb7ce
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-marketing-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-marketing": {"acc": 0.2863247863247863, "acc_norm": 0.2905982905982906, "acc_norm_stderr": 0.029745048572674043, "acc_stderr": 0.029614323690456648}}, "versions": {"hendrycksTest-marketing": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-medical_genetics-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ db6141246889a19dd3f6b9109f314d49c1a70f7a98795858804378b095c4a2fe
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-moral_scenarios": {"acc": 0.2547486033519553, "acc_norm": 0.25251396648044694, "acc_norm_stderr": 0.014530330201468654, "acc_stderr": 0.014572650383409158}}, "versions": {"hendrycksTest-moral_scenarios": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_psychology-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"hendrycksTest-professional_psychology": {"acc": 0.27124183006535946, "acc_norm": 0.2826797385620915, "acc_norm_stderr": 0.01821726955205344, "acc_stderr": 0.01798661530403031}}, "versions": {"hendrycksTest-professional_psychology": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/lambada-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 6829e6a8aa5922e6c92dd31403cc060f242dc0ede4a775e085a70da095ab2e20
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_cloze-v0-loglikelihood ADDED
@@ -0,0 +1 @@
 
 
1
+ 7655e748b63ae7e9911411d2d2a2577221d6c861ca4448509992541294d689f3
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_en-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"lambada_mt_en": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_en": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ | Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
2
+ |--------------|------:|------|-----:|----------|---|-------:|---|------|
3
+ |lambada_openai| 1|none | 0|acc |↑ | 0.1000|± | N/A|
4
+ | | |none | 0|perplexity|↓ |605.3866|± | N/A|
scripts/yans/lm-evaluation-harness/tests/testdata/math_geometry-v1-greedy_until ADDED
@@ -0,0 +1 @@
 
 
1
+ 46bc4cb219b6903397da782699a684bdbb982c0c954ff82e6beeed5c84878f42
scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/mrpc-v0-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"mrpc": {"acc": 0.5392156862745098, "acc_stderr": 0.024707732873723128, "f1": 0.5982905982905982, "f1_stderr": 0.028928325246283727}}, "versions": {"mrpc": 0}}
scripts/yans/lm-evaluation-harness/tests/testdata/multirc-v1-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"multirc": {"acc": 0.046169989506820566, "acc_stderr": 0.006801377886208738}}, "versions": {"multirc": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/pile_books3-v1-res.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"results": {"pile_books3": {"bits_per_byte": 1.2901280503011222e-06, "byte_perplexity": 1.0000008942490204, "word_perplexity": 1.0000052870063607}}, "versions": {"pile_books3": 1}}
scripts/yans/lm-evaluation-harness/tests/testdata/pile_dm-mathematics-v0-loglikelihood_rolling ADDED
@@ -0,0 +1 @@
 
 
1
+ d5b7967c0ece8b816f3921a8bd0fad23365349e935b491595e2ad1135af42da6