Spaces:
Running
Running
Upload compile_log_files.py
Browse files- compile_log_files.py +308 -0
compile_log_files.py
ADDED
@@ -0,0 +1,308 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Author: Martin Fajcik
|
2 |
+
|
3 |
+
import argparse
|
4 |
+
import copy
|
5 |
+
import glob
|
6 |
+
import hashlib
|
7 |
+
import os
|
8 |
+
import json
|
9 |
+
import re
|
10 |
+
|
11 |
+
import jsonlines
|
12 |
+
from tqdm import tqdm
|
13 |
+
|
14 |
+
# Metric names recognised in harness results. The order is a priority order:
# the first entry present in a task's results is the one used to pick the
# best prompt of that task.
SUPPORTED_METRICS = [
    "avg_mcauroc",  # for classification tasks
    "exact_match",  # for QA tasks
    "acc",  # for multichoice tasks
    "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
    "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for back compatibility
    "word_perplexity",  # for language modeling tasks
]
|
22 |
+
# Per-sample fields that are kept in the released predictions in addition to
# the metric values listed in SUPPORTED_METRICS.
EXTRA_INFO_RELEASE_KEYS = [
    'filtered_resps',
    'doc_id',
]
|
26 |
+
|
27 |
+
# Leaderboard metadata, loaded once at import time. The path is relative to
# the current working directory. JSON is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default.
with open("leaderboard/metadata.json", "r", encoding="utf-8") as f:
    METADATA = json.load(f)
|
29 |
+
|
30 |
+
# TASK MAP
# from promptname to taskname
MAP = {
    'benchmark_agree': 'benczechmark_agree',
    'benchmark_belebele': 'benczechmark_belebele',
    'benchmark_czechnews': 'benczechmark_czechnews',
    'benchmark_subjectivity': 'benczechmark_subjectivity',
    'benczechmark_snli': 'benczechmark_snli',
    'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
    'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
    'propaganda_nazor': 'benczechmark_propaganda_nazor',
    'propaganda_strach': 'benczechmark_propaganda_strach',
    'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
    'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
    'propaganda_lokace': 'benczechmark_propaganda_lokace',
    'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
    'propaganda_vina': 'benczechmark_propaganda_vina',
    'propaganda_zanr': 'benczechmark_propaganda_zanr',
    'propaganda_emoce': 'benczechmark_propaganda_emoce',
    'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
    'propaganda_rusko': 'benczechmark_propaganda_rusko',
    'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
    'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
    'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
    'benczechmark_summarization': 'benczechmark_summarization',
    'gec': 'benczechmark_grammarerrorcorrection',
    'cs_nq_open': 'benczechmark_cs_naturalquestions',
    'cs_sqad_open': 'benczechmark_cs_sqad32',
    'cs_triviaqa': 'benczechmark_cs_triviaQA',
    'csfever': 'benczechmark_csfever_nli',
    'ctkfacts': 'benczechmark_ctkfacts_nli',
    'cnec_ner': 'benczechmark_cs_ner',
    'cdec_ner': 'benczechmark_cs_court_decisions_ner',
    'klokan_qa': 'benczechmark_klokan_qa',
    'umimeto_biology': 'benczechmark_umimeto_biology',
    'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
    'umimeto_czech': 'benczechmark_umimeto_czech',
    'umimeto_history': 'benczechmark_umimeto_history',
    'umimeto_informatics': 'benczechmark_umimeto_informatics',
    'umimeto_math': 'benczechmark_umimeto_math',
    'umimeto_physics': 'benczechmark_umimeto_physics',
    'cermat_czech_open': 'benczechmark_cermat_czech_open',
    'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
    'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
    'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
    'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
    'history_ir': 'benczechmark_history_ir',
    # Tasks evaluated without prompt variants map to themselves.
    'benczechmark_histcorpus': 'benczechmark_histcorpus',
    'benczechmark_hellaswag': 'benczechmark_hellaswag',
    'benczechmark_essay': 'benczechmark_essay',
    'benczechmark_fiction': 'benczechmark_fiction',
    'benczechmark_capek': 'benczechmark_capek',
    'benczechmark_correspondence': 'benczechmark_correspondence',
    'benczechmark_havlicek': 'benczechmark_havlicek',
    'benczechmark_speeches': 'benczechmark_speeches',
    'benczechmark_spoken': 'benczechmark_spoken',
    'benczechmark_dialect': 'benczechmark_dialect',
}
|
88 |
+
|
89 |
+
# Tasks whose harness results are taken as-is, i.e. without selecting the
# best among several prompt variants.
NO_PROMPT_TASKS = [
    "benczechmark_histcorpus",
    "benczechmark_hellaswag",
    "benczechmark_essay",
    "benczechmark_fiction",
    "benczechmark_capek",
    "benczechmark_correspondence",
    "benczechmark_havlicek",
    "benczechmark_speeches",
    "benczechmark_spoken",
    "benczechmark_dialect",
]
|
99 |
+
|
100 |
+
|
101 |
+
def resolve_taskname(taskname):
    """Translate a prompt name into the task name registered in ``MAP``.

    Args:
        taskname: Key to look up in ``MAP``.

    Returns:
        The value stored in ``MAP`` under ``taskname``.

    Raises:
        ValueError: if ``taskname`` is not a key of ``MAP``.
    """
    try:
        return MAP[taskname]
    except KeyError:
        raise ValueError(f"Taskname {taskname} not found.") from None
|
105 |
+
|
106 |
+
|
107 |
+
def rename_keys(d, resolve_taskname):
    """Rename every key of ``d`` in place using ``resolve_taskname``.

    The renamed mapping is built separately and swapped in at the end, so a
    new key that equals another, not yet processed, old key cannot overwrite
    that entry, and ``d`` is left untouched if resolving any key fails.

    Args:
        d: Dictionary to modify in place. Values and their order are kept.
        resolve_taskname: Callable that maps an old key to its new key.

    Raises:
        ValueError: if two keys resolve to the same new key, because one of
            the entries would be lost.
    """
    renamed = {}
    for old_key, value in d.items():
        new_key = resolve_taskname(old_key)
        if new_key in renamed:
            raise ValueError(
                f"Several keys resolve to {new_key!r}; renaming would drop an entry."
            )
        renamed[new_key] = value

    # Mutate the caller's dictionary rather than rebinding a local name.
    d.clear()
    d.update(renamed)
|
115 |
+
|
116 |
+
|
117 |
+
def process_harness_logs(input_folders, output_file):
    """Compile raw lm-harness logs into one leaderboard submission file.

    For every matched run folder the function:

    - reads the ``results*`` file found in the first sub-folder,
    - selects the best prompt for each task that has several prompts, using
      the first metric of ``SUPPORTED_METRICS`` present in the results,
    - extracts the per-sample data of that prompt, necessary for the target
      metrics.

    Predictions, results and run metadata are then written to ``output_file``
    as JSON.

    Args:
        input_folders: Path of one folder, or a pattern containing ``*`` or
            ``?`` that matches several folders, holding unprocessed harness
            output.
        output_file: Path of the JSON file to write.

    Raises:
        ValueError: if no folder matches ``input_folders``, a task has no
            supported metric, a sample file name carries no prompt index, or
            predictions and results do not cover the same tasks.
        Exception: if the compiled tasks differ from ``METADATA["tasks"]``.
    """

    def expand_input_folders(input_folders):
        # A wildcard pattern expands to every matching directory.
        if '*' in input_folders or '?' in input_folders:
            return [f for f in glob.glob(input_folders) if os.path.isdir(f)]
        # A plain path is accepted only when it is an existing directory.
        return [input_folders] if os.path.isdir(input_folders) else []

    def strip_key_suffixes(result):
        # Keys of the form "<name>,<suffix>" are replaced by "<name>".
        for key, value in list(result.items()):
            if "," in key:
                del result[key]
                result[key.split(",", 1)[0]] = value

    folder_pattern = input_folders
    input_folders = expand_input_folders(folder_pattern)
    if not input_folders:
        # Fail early: the metadata step below needs at least one loaded run.
        raise ValueError(f"No input folder matches {folder_pattern!r}.")

    per_task_results = {}
    predictions = {}
    # Built once; the only per-sample fields that are released.
    released_keys = set(SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS)

    all_harness_results = dict()
    for input_folder in tqdm(input_folders, desc="Loading files"):
        # consider first folder within this folder
        # NOTE(review): os.listdir order is arbitrary; presumably each run
        # folder holds exactly one sub-folder - confirm.
        input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
        # find file which starts with results... prefix in the input_folder
        result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
        with open(os.path.join(input_folder, result_file), "r", encoding="utf-8") as f:
            harness_results = json.load(f)
        # Runs are indexed by the alias of their first result entry.
        all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results

        current_multipleprompt_tasknames = []
        for name, result in harness_results['results'].items():
            if name in NO_PROMPT_TASKS:
                # no prompts: the single result is the task result
                strip_key_suffixes(result)
                per_task_results[name] = result

            if result['alias'].strip().startswith('- prompt-'):
                # Drop the trailing prompt index (one character) and the
                # underscore in front of it to get the task name.
                taskname = name[:-1]
                if taskname.endswith("_"):
                    taskname = taskname[:-1]

                strip_key_suffixes(result)

                if taskname not in per_task_results:
                    per_task_results[taskname] = [result]
                    current_multipleprompt_tasknames.append(taskname)
                else:
                    per_task_results[taskname].append(result)

        # get best result according to metric priority given in SUPPORTED_METRICS list
        for taskname in current_multipleprompt_tasknames:
            results = per_task_results[taskname]
            target_metric = next((m for m in SUPPORTED_METRICS if m in results[0]), None)
            if target_metric is None:
                raise ValueError(f"No supported metric found in {taskname}")

            # max() keeps the first of several equally good prompts.
            best_result = max(results, key=lambda r: r[target_metric])

            # Compute max-centered variance (values scaled by 100)
            max_value = best_result[target_metric]
            squared_diffs = [(r[target_metric] * 100.0 - max_value * 100.0) ** 2 for r in results]
            if len(squared_diffs) > 1:
                max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
            else:
                # A single prompt has no spread; avoids dividing by zero.
                max_centered_variance = 0.0
            best_result['max_centered_variance'] = max_centered_variance

            per_task_results[taskname] = best_result

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            for taskname in per_task_results.keys():
                if taskname not in file:
                    continue
                print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
                if taskname not in NO_PROMPT_TASKS:
                    # check this file corresponds to the winning prompt;
                    # the prompt index is the last character of the alias
                    winning_prompt = per_task_results[taskname]['alias'][-1]
                    match = re.search(rf"{re.escape(taskname)}_(\d+)_", file)
                    if match is None:
                        raise ValueError(f"Prompt not found in {file}")
                    if match.group(1) != winning_prompt:
                        continue
                # load file contents, only keep data necessary for metrics
                with jsonlines.open(os.path.join(input_folder, file)) as reader:
                    predictions[taskname] = [
                        {key: value for key, value in prediction.items() if key in released_keys}
                        for prediction in reader
                    ]

    # rename keys (tasknames) using resolve_taskname
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # keys in predictions and results must be the same
    prediction_tasks = set(predictions)
    result_tasks = set(per_task_results)
    if prediction_tasks != result_tasks:
        print("Missing keys in predictions:")
        print(result_tasks - prediction_tasks)
        print("Extra keys in predictions:")
        print(prediction_tasks - result_tasks)
        raise ValueError("Keys in predictions and results are not the same")

    # Run-level fields are copied from the last loaded results file.
    metadata = {
        key: harness_results[key]
        for key in (
            'git_hash',
            'transformers_version',
            'tokenizer_pad_token',
            'tokenizer_eos_token',
            'tokenizer_bos_token',
            'eot_token_id',
            'max_length',
            'task_hashes',
            'model_source',
            'model_name',
            'model_name_sanitized',
            'system_instruction',
            'system_instruction_sha',
            'fewshot_as_multiturn',
            'chat_template',
            'chat_template_sha',
        )
    }
    metadata['total_evaluation_time_seconds'] = {
        k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()
    }
    # The shot count is read from one fixed run/task entry.
    metadata['n-shot'] = all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']

    aggregated_predictions = {
        "predictions": predictions,
        "results": per_task_results,
        "metadata": metadata,
    }

    # make sure all tasks are present
    expected_tasks = set(METADATA["tasks"].keys())
    missing_tasks = expected_tasks - result_tasks_after_rename(per_task_results)
    extra_tasks = result_tasks_after_rename(per_task_results) - expected_tasks
    if missing_tasks:
        raise Exception("Missing tasks: " + "\n".join(sorted(missing_tasks)))
    if extra_tasks:
        raise Exception("Extra tasks: " + "\n".join(sorted(extra_tasks)))

    with open(output_file, "w") as f:
        json.dump(aggregated_predictions, f)
    print("Success!")
    print("Output saved to", output_file)


def result_tasks_after_rename(per_task_results):
    """Return the set of task names present in ``per_task_results``."""
    return set(per_task_results.keys())
|
293 |
+
|
294 |
+
|
295 |
+
def main():
    """Read the command-line options and compile the harness logs."""
    cli = argparse.ArgumentParser(
        description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.",
    )
    cli.add_argument(
        "-i", "-f", "--input_folder", "--folder",
        required=True,
        help="Folder with unprocessed results from lm harness.",
    )
    cli.add_argument(
        "-o", "--output_file",
        required=True,
        help="File to save processed results.",
    )
    options = cli.parse_args()

    process_harness_logs(options.input_folder, options.output_file)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|
308 |
+
|