mfajcik committed on
Commit
f8e9924
β€’
1 Parent(s): 8864264

Delete compile_log_files.py

Files changed (1)
  1. compile_log_files.py +0 -308
compile_log_files.py DELETED
@@ -1,308 +0,0 @@
- # Author: Martin Fajcik
-
- import argparse
- import copy
- import glob
- import hashlib
- import os
- import json
- import re
-
- import jsonlines
- from tqdm import tqdm
-
- SUPPORTED_METRICS = [
-     "avg_mcauroc",  # for classification tasks
-     "exact_match",  # for QA tasks
-     "acc",  # for multichoice tasks
-     "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
-     "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for backward compatibility
-     "word_perplexity",  # for language modeling tasks
- ]
- EXTRA_INFO_RELEASE_KEYS = [
-     'filtered_resps',
-     'doc_id',
- ]
-
- with open("leaderboard/metadata.json", "r") as f:
-     METADATA = json.load(f)
-
- # TASK MAP
- # from promptname to taskname
- MAP = {
-     'benchmark_agree': 'benczechmark_agree',
-     'benchmark_belebele': 'benczechmark_belebele',
-     'benchmark_czechnews': 'benczechmark_czechnews',
-     'benchmark_subjectivity': 'benczechmark_subjectivity',
-     'benczechmark_snli': 'benczechmark_snli',
-     'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
-     'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
-     'propaganda_nazor': 'benczechmark_propaganda_nazor',
-     'propaganda_strach': 'benczechmark_propaganda_strach',
-     'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
-     'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
-     'propaganda_lokace': 'benczechmark_propaganda_lokace',
-     'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
-     'propaganda_vina': 'benczechmark_propaganda_vina',
-     'propaganda_zanr': 'benczechmark_propaganda_zanr',
-     'propaganda_emoce': 'benczechmark_propaganda_emoce',
-     'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
-     'propaganda_rusko': 'benczechmark_propaganda_rusko',
-     'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
-     'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
-     'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
-     'benczechmark_summarization': 'benczechmark_summarization',
-     'gec': 'benczechmark_grammarerrorcorrection',
-     'cs_nq_open': 'benczechmark_cs_naturalquestions',
-     'cs_sqad_open': 'benczechmark_cs_sqad32',
-     'cs_triviaqa': 'benczechmark_cs_triviaQA',
-     'csfever': 'benczechmark_csfever_nli',
-     'ctkfacts': 'benczechmark_ctkfacts_nli',
-     'cnec_ner': 'benczechmark_cs_ner',
-     'cdec_ner': 'benczechmark_cs_court_decisions_ner',
-     'klokan_qa': 'benczechmark_klokan_qa',
-     'umimeto_biology': 'benczechmark_umimeto_biology',
-     'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
-     'umimeto_czech': 'benczechmark_umimeto_czech',
-     'umimeto_history': 'benczechmark_umimeto_history',
-     'umimeto_informatics': 'benczechmark_umimeto_informatics',
-     'umimeto_math': 'benczechmark_umimeto_math',
-     'umimeto_physics': 'benczechmark_umimeto_physics',
-     'cermat_czech_open': 'benczechmark_cermat_czech_open',
-     'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
-     'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
-     'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
-     'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
-     'history_ir': 'benczechmark_history_ir',
-     'benczechmark_histcorpus': "benczechmark_histcorpus",
-     'benczechmark_hellaswag': "benczechmark_hellaswag",
-     'benczechmark_essay': 'benczechmark_essay',
-     'benczechmark_fiction': 'benczechmark_fiction',
-     'benczechmark_capek': 'benczechmark_capek',
-     'benczechmark_correspondence': 'benczechmark_correspondence',
-     'benczechmark_havlicek': 'benczechmark_havlicek',
-     'benczechmark_speeches': 'benczechmark_speeches',
-     'benczechmark_spoken': 'benczechmark_spoken',
-     'benczechmark_dialect': 'benczechmark_dialect'
- }
-
- NO_PROMPT_TASKS = ["benczechmark_histcorpus",
-                    "benczechmark_hellaswag",
-                    "benczechmark_essay",
-                    "benczechmark_fiction",
-                    "benczechmark_capek",
-                    "benczechmark_correspondence",
-                    "benczechmark_havlicek",
-                    "benczechmark_speeches",
-                    "benczechmark_spoken",
-                    "benczechmark_dialect"]
-
-
- def resolve_taskname(taskname):
-     if taskname not in MAP:
-         raise ValueError(f"Taskname {taskname} not found.")
-     return MAP[taskname]
-
-
- def rename_keys(d, resolve_taskname):
-     orig_len = len(d)
-     for k, v in list(d.items()):
-         new_key = resolve_taskname(k)
-         d[new_key] = d.pop(k)
-
-     # make sure the dict length didn't change
-     assert len(d) == orig_len
-
-
- def process_harness_logs(input_folders, output_file):
-     """
-     - Selects best prompt for each task
-     - Extracts data for that prompt, necessary for target lm harness metrics
-     """
-
-     def expand_input_folders(input_folders):
-         # Check if input_folders is a wildcard pattern
-         if '*' in input_folders or '?' in input_folders:
-             # Expand the wildcard into a list of matching directories
-             matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)]
-             return matching_directories
-         else:
-             # If it's not a wildcard, return the input as a single-item list if it's a valid directory
-             if os.path.isdir(input_folders):
-                 return [input_folders]
-             else:
-                 return []
-
-     input_folders = expand_input_folders(input_folders)
-
-     per_task_results = {}
-     metric_per_task = {}
-     predictions = {}
-
-     all_harness_results = dict()
-     for input_folder in tqdm(input_folders, desc="Loading files"):
-         # read all files in input_folder
-         # consider first folder within this folder
-         input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
-         # find file which starts with results... prefix in the input_folder
-         result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
-         with open(os.path.join(input_folder, result_file), "r") as f:
-             harness_results = json.load(f)
-         all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results
-         current_multipleprompt_tasknames = []
-         for name, result in harness_results['results'].items():
-             if name in NO_PROMPT_TASKS:
-                 # no prompts
-                 taskname = name
-                 # process metric names
-                 for k, v in copy.deepcopy(result).items():
-                     if "," in k:
-                         name, _ = k.split(",")
-                         del result[k]
-                         result[name] = v
-                 per_task_results[taskname] = result
-
-             if result['alias'].strip().startswith('- prompt-'):
-                 # process taskname
-                 taskname = name[:-1]
-                 if taskname.endswith("_"):
-                     taskname = taskname[:-1]
-
-                 # process metric names
-                 for k, v in copy.deepcopy(result).items():
-                     if "," in k:
-                         name, key = k.split(",")
-                         del result[k]
-                         result[name] = v
-
-                 if taskname not in per_task_results:
-                     per_task_results[taskname] = [result]
-                     current_multipleprompt_tasknames.append(taskname)
-                 else:
-                     per_task_results[taskname].append(result)
-
-         # get best result according to metric priority given in SUPPORTED_METRICS list
-         for taskname, results in per_task_results.items():
-             if not taskname in current_multipleprompt_tasknames:
-                 continue
-             best_result = None
-             target_metric = None
-             for m in SUPPORTED_METRICS:
-                 if m in results[0]:
-                     target_metric = m
-                     break
-             if target_metric is None:
-                 raise ValueError(f"No supported metric found in {taskname}")
-             metric_per_task[taskname] = target_metric
-
-             all_measured_results = []
-             for result in results:
-                 all_measured_results.append(result[target_metric])
-                 if best_result is None:
-                     best_result = result
-                 else:
-                     if result[target_metric] > best_result[target_metric]:
-                         best_result = result
-             # Compute max-centered variance
-             max_value = best_result[target_metric]
-             squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results]
-             max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
-             best_result['max_centered_variance'] = max_centered_variance
-
-             per_task_results[taskname] = best_result
-
-         for file in os.listdir(input_folder):
-             if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
-                 continue
-             for taskname in per_task_results.keys():
-                 if taskname in file:
-                     print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
-                     # check this file corresponds to same prompt
-                     winning_prompt = per_task_results[taskname]['alias'][-1]
-                     if taskname in NO_PROMPT_TASKS:
-                         current_prompt = "-1"
-                     else:
-                         try:
-                             current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
-                         except AttributeError:
-                             raise ValueError(f"Prompt not found in {file}")
-                     if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
-                         # load file contents
-                         predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))
-                         # only keep data necessary for metrics
-                         for prediction in predictions[taskname]:
-                             for key in list(prediction.keys()):
-                                 if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
-                                     del prediction[key]
-
-     # rename keys (tasknames) using resolve_taskname:
-     rename_keys(predictions, resolve_taskname)
-     rename_keys(per_task_results, resolve_taskname)
-
-     # assert keys in predictions and results are the same
-     # assert set(predictions.keys()) == set(per_task_results.keys())
-     if not set(predictions.keys()) == set(per_task_results.keys()):
-         # print missing keys
-         print("Missing keys in predictions:")
-         print(set(predictions.keys()) - set(per_task_results.keys()))
-         # print extra keys
-         print("Extra keys in predictions:")
-         print(set(per_task_results.keys()) - set(predictions.keys()))
-         raise ValueError("Keys in predictions and results are not the same")
-
-     aggregated_predictions = dict()
-     aggregated_predictions["predictions"] = predictions
-     aggregated_predictions["results"] = per_task_results
-     aggregated_predictions["metadata"] = {
-         'git_hash': harness_results['git_hash'],
-         'transformers_version': harness_results['transformers_version'],
-         'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
-         'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
-         'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
-         'eot_token_id': harness_results['eot_token_id'],
-         'max_length': harness_results['max_length'],
-         'task_hashes': harness_results['task_hashes'],
-         'model_source': harness_results['model_source'],
-         'model_name': harness_results['model_name'],
-         'model_name_sanitized': harness_results['model_name_sanitized'],
-         'system_instruction': harness_results['system_instruction'],
-         'system_instruction_sha': harness_results['system_instruction_sha'],
-         'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
-         'chat_template': harness_results['chat_template'],
-         'chat_template_sha': harness_results['chat_template_sha'],
-         'total_evaluation_time_seconds': {k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()},
-         'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']
-     }
-
-     # make sure all tasks are present
-     all_tasks = set(METADATA["tasks"].keys())
-     all_expected_tasks = set(per_task_results.keys())
-     all_missing_tasks = all_tasks - all_expected_tasks
-     all_extra_tasks = all_expected_tasks - all_tasks
-     if len(all_missing_tasks) > 0:
-         EOLN = "\n"
-         # print(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
-         raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")  # TODO: uncomment
-     if len(all_extra_tasks) > 0:
-         EOLN = "\n"
-         raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")
-     with open(output_file, "w") as f:
-         json.dump(aggregated_predictions, f)
-     print("Success!")
-     print("Output saved to", output_file)
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.")
-     parser.add_argument("-i", "-f", "--input_folder", "--folder",
-                         help="Folder with unprocessed results from lm harness.", required=True)
-     parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True)
-     args = parser.parse_args()
-
-     process_harness_logs(args.input_folder, args.output_file)
-
-
- if __name__ == "__main__":
-     main()
-
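For reference, the deleted script exposed a small CLI (see the argparse definition above). A minimal usage sketch follows; the folder and file names are placeholders for illustration, not values taken from this commit:

# Hypothetical invocation of the deleted script; paths are illustrative only.
# python compile_log_files.py --input_folder "results/my_model_*" --output_file submission.json

# Equivalent programmatic call, assuming the module is still importable locally:
from compile_log_files import process_harness_logs

process_harness_logs("results/my_model_*", "submission.json")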