diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt b/scripts/yans/lm-evaluation-harness/tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6ba01f21d5711e90b22bafd9979f7a517b98b0ef
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt
@@ -0,0 +1,6 @@
+|    Tasks    |Version|Filter|n-shot| Metric |   |Value|   |Stderr|
+|-------------|------:|------|-----:|--------|---|----:|---|------|
+|arc_challenge|      1|none  |     0|acc     |↑  |  0.0|±  |   N/A|
+|             |       |none  |     0|acc_norm|↑  |  0.0|±  |   N/A|
+|arc_easy     |      1|none  |     0|acc     |↑  |  0.3|±  |   N/A|
+|             |       |none  |     0|acc_norm|↑  |  0.1|±  |   N/A|
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..c89528892ae2cb5dfc87cf28f587062a18323d87
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/anagrams1-v0-res.json
@@ -0,0 +1 @@
+{"results": {"anagrams1": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"anagrams1": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r3-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r3-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..548dea1e2285461362f32707937ff84f37572957
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r3-v0-res.json
@@ -0,0 +1 @@
+{"results": {"anli_r3": {"acc": 0.31916666666666665, "acc_stderr": 0.01346230971200514}}, "versions": {"anli_r3": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..347570f3a6912d8f556eec252867f26777516506
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-loglikelihood
@@ -0,0 +1 @@
+0bdad31c974ba064e1f1ba931841ec2ba7461e8b0ca54ea5f79f08b6bae0bab5
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..68bbe21379d0d6326ce5cc07b0a2bc1589ed73df
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_anaphor_number_agreement-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_anaphor_number_agreement": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_anaphor_number_agreement": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_causative-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_causative-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..5a0f6a35590db43e610a0550607dd7ab5e382f5f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_causative-v0-loglikelihood
@@ -0,0 +1 @@
+3d67ad025185dbb0808ebd7f508edcb5750c18fc3c01ad91f20fda80780c916c
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..90dc95da8116c38d2ff3bec041973004b7f5703b
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_causative-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_causative": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_causative": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..5fe9e64bc639f3fdf1521cd6f71b8019c987f09e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_1-v0-loglikelihood
@@ -0,0 +1 @@
+2df8cc7f17089f7e8c7d974dcb324c809d30ef059a5be22aed6b69f44230809f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..a260838746d5405e89cba4147101e9194f93b88e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-loglikelihood
@@ -0,0 +1 @@
+95acb74fac7d57ae2c9d208361a5f8ad36b0b19a055f02e648ed8e99505f4b43
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..67ea47559d248f90cc66870a37fdecd850ba4c79
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_determiner_noun_agreement_with_adj_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_determiner_noun_agreement_with_adj_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..6756cc4020c8016b08fb43470dcdfcc4d1d5b374
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adj_irregular_1-v0-loglikelihood
@@ -0,0 +1 @@
+ad61c619aa79433d02f1aeacde2ab87291fd5d5c370032c24d41c4f0065ed1f9
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d765bb590653a5c4eb3e2517f9b3788cdefc7fa5
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_determiner_noun_agreement_with_adjective_1-v0-loglikelihood
@@ -0,0 +1 @@
+007c47e5fbf88119c5180feef75e1345d448e56adcd4c7ab2d52fb8d67350d34
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d23fba902ae50f259bed6e5fb5f33083dc1bf5fc
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_existential_there_object_raising-v0-loglikelihood
@@ -0,0 +1 @@
+63567712076256f373131971676c1c6d711efef73cd0e4de3cc639bc631a2413
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..b0289b9dea483e58b56403fdfa30575b61fdfbd1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_irregular_plural_subject_verb_agreement_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_irregular_plural_subject_verb_agreement_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_irregular_plural_subject_verb_agreement_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_2-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..5a4dd092c4a82b59d702c027e16c684c634649e1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_passive_2-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_passive_2": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_passive_2": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4305bb313c67880a0e4ebf7827c29a2aa2df6d66
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_sentential_negation_npi_licensor_present-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_sentential_negation_npi_licensor_present": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_sentential_negation_npi_licensor_present": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d27f1316dc96be401dee9392f973e9bbd799a409
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_island-v0-loglikelihood
@@ -0,0 +1 @@
+91a9e4b60b0f3572a7fdbd7648d0e69f36e5eb34db715315b0082558d7ed8b65
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..fe6bbf95e5406ad38d4894bf5d4609beeaa05f9a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/blimp_wh_questions_subject_gap_long_distance-v0-res.json
@@ -0,0 +1 @@
+{"results": {"blimp_wh_questions_subject_gap_long_distance": {"acc": 0.485, "acc_stderr": 0.0158121796418149}}, "versions": {"blimp_wh_questions_subject_gap_long_distance": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ba386fd6c7e67c5048d2f4a4240e1b308dca7db5
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v0-res.json
@@ -0,0 +1 @@
+{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1cff410b2c35a16b457d163d95ac7cbd8eb704e2
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cb-v1-res.json
@@ -0,0 +1 @@
+{"results": {"cb": {"acc": 0.3392857142857143, "acc_stderr": 0.06384226561930825, "f1": 0.2819143819143819}}, "versions": {"cb": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..63749433f1703a4c81965e6c04fec04177631bae
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english-v0-loglikelihood
@@ -0,0 +1 @@
+ee3ce1ddb8071d4189e5b06e7f3c618a434221ac52935d0f434c4d183f01458a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..14510a13a1c390adfbb9c73149b88e5b8a2c4f64
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_disability-v0-res.json
@@ -0,0 +1 @@
+{"results": {"crows_pairs_english_disability": {"likelihood_difference": 0.3148684792547637, "likelihood_difference_stderr": 0.02800803147051987, "pct_stereotype": 0.36923076923076925, "pct_stereotype_stderr": 0.06032456592830047}}, "versions": {"crows_pairs_english_disability": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..fedfdac52d966f6edcdb229456858da1959b24d1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_physical_appearance-v0-loglikelihood
@@ -0,0 +1 @@
+d1823f5038afafa7a5338e42531720480c8ccf4e177789526caf294d52d56e89
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..9068a24ef5af549a13fe5b4362c2b5afc741bd29
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/cycle_letters-v0-greedy_until
@@ -0,0 +1 @@
+eb23f7d5de7528eefd8ed5f8054c402ff947319cccfef7195995946f99389201
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..bd3ff6c459c5a5739b233dd86c5434f64bbc1b16
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-loglikelihood
@@ -0,0 +1 @@
+5b42ba1faf5ece6a6ec9a3976ce79c1fac8df5b98272aab85457188c2142693c
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..16940c8f5a7dd9ebb1d73298346ab1d19811ec90
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/ethics_utilitarianism_original-v0-res.json
@@ -0,0 +1 @@
+{"results": {"ethics_utilitarianism_original": {"acc": 0.5214226289517471, "acc_stderr": 0.007204999520618661}}, "versions": {"ethics_utilitarianism_original": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..a0f8b7c09b3b6307123f1328c51c1dcfb797aed2
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-loglikelihood
@@ -0,0 +1 @@
+b3b27e9dbad587377d3c8cab1072782de883e245da93a563bd8b3099017b1fc0
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..dcc5116204283941b74dfea97e3a1ce5edd9dc27
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-business_ethics-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-business_ethics": {"acc": 0.29, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.045604802157206845}}, "versions": {"hendrycksTest-business_ethics": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..7f665ef4a1bd06ecfd30d999ae6880c00ba849cf
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_biology-v0-loglikelihood
@@ -0,0 +1 @@
+c29e4e67ff91af29b9434884874414d1b1b32ccc32903c6b1639469b19907419
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..695bc8c31592a4c33d70d5d07a8c5b523d9bd3cc
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_computer_science-v0-loglikelihood
@@ -0,0 +1 @@
+4ea26ad780290429ac5a3317559c154848d662bd40532c966458ba6f2a32d0a3
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..7c2e2f4bf73266d532c7514c98defcba0133f231
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-college_physics-v0-loglikelihood
@@ -0,0 +1 @@
+704a7671ef981fb95594782bc446dd632e87ebdbe89436a0603b714fb5786c75
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..60f02eba9cb04602d8b67d67269d8b82e0930721
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-computer_security-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-computer_security": {"acc": 0.24, "acc_norm": 0.27, "acc_norm_stderr": 0.044619604333847394, "acc_stderr": 0.042923469599092816}}, "versions": {"hendrycksTest-computer_security": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..84cd983ee9d33f831ee397ffd8b11990b70a4b60
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-elementary_mathematics-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-elementary_mathematics": {"acc": 0.2724867724867725, "acc_norm": 0.2830687830687831, "acc_norm_stderr": 0.023201392938194978, "acc_stderr": 0.022930973071633345}}, "versions": {"hendrycksTest-elementary_mathematics": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..2b6aa8d605765b06a262877dec34cd156d0a66f9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-international_law-v0-loglikelihood
@@ -0,0 +1 @@
+ea9b2cefd27959db564168f6ad1169a5eaa012fc5a5d5b8faf9e34d94e335dc1
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..b09e99721b8ec71dc85c7ed0798d55a6e0274860
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-miscellaneous-v0-loglikelihood
@@ -0,0 +1 @@
+972dd88dbbaf09d14766e243cfc233425e7c01a26dbc61bdb9eeefa788822331
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d5ea0d8156ae4efaa0f7568ae8fd3a8ed3992d37
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-moral_scenarios-v0-loglikelihood
@@ -0,0 +1 @@
+a8e1882e77728b53c8b86312254d08320d8363fb606d746a8dd145b812f62cf5
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..e2838f880581f7cf743d83ba99a26827c18a09de
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-nutrition-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-nutrition": {"acc": 0.24509803921568626, "acc_norm": 0.28104575163398693, "acc_norm_stderr": 0.025738854797818723, "acc_stderr": 0.02463004897982476}}, "versions": {"hendrycksTest-nutrition": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ec9c1e79c117c88246fa596ca90821025c9786af
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-philosophy-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-philosophy": {"acc": 0.26366559485530544, "acc_norm": 0.2733118971061093, "acc_norm_stderr": 0.02531176597542612, "acc_stderr": 0.02502553850053234}}, "versions": {"hendrycksTest-philosophy": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..23fbfcf78e79595a64037311668042a1ec7f637f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-professional_law-v0-loglikelihood
@@ -0,0 +1 @@
+c38c9d5d84eeb7a5f3c4a34d6e70d7e15847b3c38f26e4b119c982bb935e118f
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..9ba711cca75cfa5f22bb2dc52e68839ac3820b88
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-public_relations-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-public_relations": {"acc": 0.3090909090909091, "acc_norm": 0.2636363636363636, "acc_norm_stderr": 0.04220224692971987, "acc_stderr": 0.044262946482000985}}, "versions": {"hendrycksTest-public_relations": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..1077380de88cb9ce23894ce31fbbeceea90f2079
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-us_foreign_policy-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-us_foreign_policy": {"acc": 0.2, "acc_norm": 0.24, "acc_norm_stderr": 0.04292346959909283, "acc_stderr": 0.040201512610368445}}, "versions": {"hendrycksTest-us_foreign_policy": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..0004b194049a5dce0266002b4a19882fbb8c6bfa
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-virology-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-virology": {"acc": 0.27710843373493976, "acc_norm": 0.2710843373493976, "acc_norm_stderr": 0.03460579907553027, "acc_stderr": 0.034843315926805875}}, "versions": {"hendrycksTest-virology": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..0fff75a7eaf2e0773a7e3dcda446f59a59dad878
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-world_religions-v0-res.json
@@ -0,0 +1 @@
+{"results": {"hendrycksTest-world_religions": {"acc": 0.21637426900584794, "acc_norm": 0.22807017543859648, "acc_norm_stderr": 0.03218093795602357, "acc_stderr": 0.03158149539338734}}, "versions": {"hendrycksTest-world_religions": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..82921d1db066020f53d61c21d46498a512144b37
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-ar-en-v0-greedy_until
@@ -0,0 +1 @@
+e94d310de91fad7ce36f4cf3305552020221482c5588f2efcefaa019893504f1
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..a22fa9036c790cb48e142bd05a59da7824a9c83f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/iwslt17-en-ar-v0-res.json
@@ -0,0 +1 @@
+{"results": {"iwslt17-en-ar": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.0, "chrf_stderr": 0.0, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"iwslt17-en-ar": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..5f95957324e138bb424e71ff93f81a0c0a11f2cb
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_mt_es-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_mt_es": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_mt_es": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..688e67a5534f801d5b256905a0d05a60c0adf8fc
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-loglikelihood
@@ -0,0 +1 @@
+9ca5643bbaafed2f027eab5b68cc438e9e268f6df9a678e956e61726a985cf0b
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..12e7f527bde7683ea74111603618c1e99cdd93a6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai-v2.0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_openai": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai": "2.0"}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..f7fdfc9c2d5c6d5d4abb7d6e932454615c095ea1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_en-v0-res.json
@@ -0,0 +1 @@
+{"results": {"lambada_openai_mt_en": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_mt_en": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/logiqa-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/logiqa-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..7a80c24d1b3e57ffca8ca89252d3c9b01b506f49
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/logiqa-v0-res.json
@@ -0,0 +1 @@
+{"results": {"logiqa": {"acc": 0.25806451612903225, "acc_norm": 0.2764976958525346, "acc_norm_stderr": 0.017543209075825194, "acc_stderr": 0.017162894755127077}}, "versions": {"logiqa": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..10d18c2f864117ae56fe56ba1191f6cde4bec7b3
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_algebra-v1-res.json
@@ -0,0 +1 @@
+{"results": {"math_algebra": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_algebra": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..6f49557ecf42758d64d1297c5569f3d4d95dd9c1
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_counting_and_prob-v0-greedy_until
@@ -0,0 +1 @@
+2aa9ae43ee9dbb2457525247d7b65358632c5eaa9cbfc40cf95a4f17f5d942ad
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..3ab10de26a038019a18699e20887de6da66981c4
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_intermediate_algebra-v0-greedy_until
@@ -0,0 +1 @@
+d53c699de272d517ed7ad783b4e692302be9f9f97a8d4ac7a6541e538a7cabe0
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..00917b90ddb0602c62c8a9fef959b9e91eb45c2e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_num_theory-v1-res.json
@@ -0,0 +1 @@
+{"results": {"math_num_theory": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"math_num_theory": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..5200f4cfa9ed3a735661e987791bf1434555db6e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/math_prealgebra-v0-greedy_until
@@ -0,0 +1 @@
+752cdf343d7152e476b0273065024f6ea0e0f47ea385c6bdf9067736cb39724a
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..d19d0c6fee7f47af1ad3f5af9ff1d7a1544e2e98
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_arxiv-v0-res.json
@@ -0,0 +1 @@
+{"results": {"pile_arxiv": {"bits_per_byte": 1.0750412350569374e-05, "byte_perplexity": 1.0000107504701365, "word_perplexity": 1.0000819333090385}}, "versions": {"pile_arxiv": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..967c14934b81e0880063c4239593fb74cd99cd8d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_bookcorpus2-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_bookcorpus2": {"bits_per_byte": 1.6780040419457868e-06, "byte_perplexity": 1.000001163104447, "word_perplexity": 1.0000066499426599}}, "versions": {"pile_bookcorpus2": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..abe7b45f9aff9b6427068ceb1ba39977fa843c38
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_enron-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_enron": {"bits_per_byte": 0.0004564546920781453, "byte_perplexity": 1.000316440339552, "word_perplexity": 1.00224668051869}}, "versions": {"pile_enron": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..bd7b15927f717baab5b7ce2e9d659dda6d681769
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_gutenberg-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+02a559f74a9105145e7d4d9c5ddea372b5b4938f5368dc8ffafc39cbe3b4c7ef
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ea135278b720703540187531afb0ef82e7d6a1ce
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_hackernews-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_hackernews": {"bits_per_byte": 0.00014672607267878518, "byte_perplexity": 1.0001017079354932, "word_perplexity": 1.0006273924348839}}, "versions": {"pile_hackernews": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..0e40fc8268a77618471344585bc1a1586fd69e0f
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_nih-exporter-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_nih-exporter": {"bits_per_byte": 0.00035193728014978225, "byte_perplexity": 1.0002439740903082, "word_perplexity": 1.0016712202288802}}, "versions": {"pile_nih-exporter": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..ca433e3c854780d034839c8e4d029cb6b5bfca1a
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_openwebtext2-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_openwebtext2": {"bits_per_byte": 0.000184802319359215, "byte_perplexity": 1.000128103411166, "word_perplexity": 1.0007951516532847}}, "versions": {"pile_openwebtext2": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..4d4a241ace01e28f15cd7bd88d3f855b1bf5372d
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v1-res.json
@@ -0,0 +1 @@
+{"results": {"pile_pubmed-central": {"bits_per_byte": 2.2812488135667854e-05, "byte_perplexity": 1.0000158125368497, "word_perplexity": 1.000123107107861}}, "versions": {"pile_pubmed-central": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/qnli-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/qnli-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..883202c385fdfcbdb3e362737691ee0343adc430
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/qnli-v0-loglikelihood
@@ -0,0 +1 @@
+4281d4ff5cf1244358b0ea0220c67863c69fbade850696b43e8ff05138e01e12
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/race-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/race-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..017b00669b8b60dc06947e4e78428fb429734df5
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/race-v0-res.json
@@ -0,0 +1 @@
+{"results": {"race": {"acc": 0.23253588516746412, "acc_stderr": 0.013074460615265295}}, "versions": {"race": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..ae564e39b35d5553aac83af33c0dd126e3acf53e
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_e7ad1e9f52a39e1ddd1e50f3c57ffa4546728dd150a67c0a0ddc8675c04e15d1.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a052215e2933be5a3d5eb709fbf0d6136c25bd1bfef9ff0448165445bbce2afd
+size 1848
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..03838a9bcf5b523ebbea661a3265fd5d05867143
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_f4bfe4beb605bd52a8ab6be3c9293639e7e2261d98de58159d15ccb83131bf4e.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:beb2dc04ba4d834a434169fa0fe6dc5b6a20bb2b1144f25caf48703edb821ce5
+size 1911
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..985f64c8e0eb3bc1dd563becf0cdf186baa172cd
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/triviaqa-v1-res.json
@@ -0,0 +1 @@
+{"results": {"triviaqa": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"triviaqa": 1}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..d27430a9a2eab0a6a5e265e249237201a4a56061
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wic-v0-loglikelihood
@@ -0,0 +1 @@
+403a08da05e4c44d7e3dd3358382a7ba489c41d223e24cd1a9ed82ef1a2d004b
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wikitext-v0-loglikelihood_rolling b/scripts/yans/lm-evaluation-harness/tests/testdata/wikitext-v0-loglikelihood_rolling
new file mode 100644
index 0000000000000000000000000000000000000000..f09af45a38c0de097358c587420858c7a53a10aa
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wikitext-v0-loglikelihood_rolling
@@ -0,0 +1 @@
+b6f83e6cf7535ee41b0057c3e2ec2cf7f2fa5a9119b305c479a83091d1142b2c
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..45eaaaca8c5892944b1b9c9af0c469e3c63e4881
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-de-v0-greedy_until
@@ -0,0 +1 @@
+d71e2074af3770e9b29ac561caf2e1c29ad6b0dc50ec2e7bcc5501747b11f0da
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-res.json
new file mode 100644
index 0000000000000000000000000000000000000000..babb8d2d74fb5585cf9578f8b1dc8be3dde43f63
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt16-en-ro-v0-res.json
@@ -0,0 +1 @@
+{"results": {"wmt16-en-ro": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.012004814364156886, "chrf_stderr": 6.424423961332661e-05, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt16-en-ro": 0}}
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..7cb9424082836f0d56afe809cf44c78fc844d993
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-de-fr-v0-greedy_until
@@ -0,0 +1 @@
+7f197bc281d6dbf9425900ef0dee7175021c43e355050f149f43b161c52bf0b0
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..d14fc4939aecb7bb40458c34954c1242d9f20501
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-cs-v0-greedy_until
@@ -0,0 +1 @@
+5a34e6863bf6965afd31653de50bac5fecf58db65dbaba46921504a2b7463786
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..9777002c79830918a3939ec6978d606ae967ffe6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v0-greedy_until
@@ -0,0 +1 @@
+7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until
new file mode 100644
index 0000000000000000000000000000000000000000..9777002c79830918a3939ec6978d606ae967ffe6
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-en-ja-v1-greedy_until
@@ -0,0 +1 @@
+7fe61f5847a51e93e97c84b39f4420978727754e4b6cf636a27851c615857530
\ No newline at end of file
diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v0-loglikelihood
new file mode 100644
index 0000000000000000000000000000000000000000..0c5c0b8ceb64a158bd57294d432b2186f3a0fdf9
--- /dev/null
+++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wnli-v0-loglikelihood
@@ -0,0 +1 @@
+2ffd304d6096416eb29607e2e7642b1d6043163624967bcf4c4fc00fddc6c721
\ No newline at end of file