Spaces:
Running
Running
File size: 13,670 Bytes
e60cafc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 |
# Author: Martin Fajcik
import argparse
import copy
import glob
import hashlib
import os
import json
import re
import jsonlines
from tqdm import tqdm
# Metric names recognised by this script. The order is significant: the first
# entry found in a task's results is the metric used to pick the best prompt.
SUPPORTED_METRICS = [
    # classification tasks
    'avg_mcauroc',
    # QA tasks
    'exact_match',
    # multichoice tasks
    'acc',
    # summarization tasks
    'rouge_raw_r2_mid_f_without_bootstrap',
    # summarization tasks (older metric version, kept for backward compatibility)
    'rouge_raw_r2_mid_f',
    # language modeling tasks
    'word_perplexity',
]

# Non-metric fields of a sample record that are kept in the released predictions.
EXTRA_INFO_RELEASE_KEYS = ["filtered_resps", "doc_id"]
# Leaderboard metadata, loaded once at import time. The path is relative to the
# current working directory. The encoding is given explicitly so that decoding
# does not depend on the platform's locale default.
with open("leaderboard/metadata.json", "r", encoding="utf-8") as f:
    METADATA = json.load(f)
# TASK MAP
# Translates a prompt name (as it appears in the harness output) to the task
# name used by the leaderboard.
MAP = {
    # keys with the "benchmark_" prefix
    "benchmark_agree": "benczechmark_agree",
    "benchmark_belebele": "benczechmark_belebele",
    "benchmark_czechnews": "benczechmark_czechnews",
    "benchmark_subjectivity": "benczechmark_subjectivity",
    "benczechmark_snli": "benczechmark_snli",
    # propaganda_* keys
    "propaganda_argumentace": "benczechmark_propaganda_argumentace",
    "propaganda_fabulace": "benczechmark_propaganda_fabulace",
    "propaganda_nazor": "benczechmark_propaganda_nazor",
    "propaganda_strach": "benczechmark_propaganda_strach",
    "propaganda_zamereni": "benczechmark_propaganda_zamereni",
    "propaganda_demonizace": "benczechmark_propaganda_demonizace",
    "propaganda_lokace": "benczechmark_propaganda_lokace",
    "propaganda_relativizace": "benczechmark_propaganda_relativizace",
    "propaganda_vina": "benczechmark_propaganda_vina",
    "propaganda_zanr": "benczechmark_propaganda_zanr",
    "propaganda_emoce": "benczechmark_propaganda_emoce",
    "propaganda_nalepkovani": "benczechmark_propaganda_nalepkovani",
    "propaganda_rusko": "benczechmark_propaganda_rusko",
    # sentiment and summarization keys (already equal to the task name)
    "benczechmark_sentiment_mall": "benczechmark_sentiment_mall",
    "benczechmark_sentiment_fb": "benczechmark_sentiment_fb",
    "benczechmark_sentiment_csfd": "benczechmark_sentiment_csfd",
    "benczechmark_summarization": "benczechmark_summarization",
    # short keys whose task name differs by more than a prefix
    "gec": "benczechmark_grammarerrorcorrection",
    "cs_nq_open": "benczechmark_cs_naturalquestions",
    "cs_sqad_open": "benczechmark_cs_sqad32",
    "cs_triviaqa": "benczechmark_cs_triviaQA",
    "csfever": "benczechmark_csfever_nli",
    "ctkfacts": "benczechmark_ctkfacts_nli",
    "cnec_ner": "benczechmark_cs_ner",
    "cdec_ner": "benczechmark_cs_court_decisions_ner",
    "klokan_qa": "benczechmark_klokan_qa",
    # umimeto_* keys
    "umimeto_biology": "benczechmark_umimeto_biology",
    "umimeto_chemistry": "benczechmark_umimeto_chemistry",
    "umimeto_czech": "benczechmark_umimeto_czech",
    "umimeto_history": "benczechmark_umimeto_history",
    "umimeto_informatics": "benczechmark_umimeto_informatics",
    "umimeto_math": "benczechmark_umimeto_math",
    "umimeto_physics": "benczechmark_umimeto_physics",
    # cermat_* keys
    "cermat_czech_open": "benczechmark_cermat_czech_open",
    "cermat_czech_mc": "benczechmark_cermat_czech_mc",
    "cermat_czech_tf": "benczechmark_cermat_czech_tf",
    "cermat_czmath_open": "benczechmark_cermat_czmath_open",
    "cermat_czmath_mc": "benczechmark_cermat_czmath_mc",
    "history_ir": "benczechmark_history_ir",
    # keys that map to themselves; the same names are listed in NO_PROMPT_TASKS
    "benczechmark_histcorpus": "benczechmark_histcorpus",
    "benczechmark_hellaswag": "benczechmark_hellaswag",
    "benczechmark_essay": "benczechmark_essay",
    "benczechmark_fiction": "benczechmark_fiction",
    "benczechmark_capek": "benczechmark_capek",
    "benczechmark_correspondence": "benczechmark_correspondence",
    "benczechmark_havlicek": "benczechmark_havlicek",
    "benczechmark_speeches": "benczechmark_speeches",
    "benczechmark_spoken": "benczechmark_spoken",
    "benczechmark_dialect": "benczechmark_dialect",
}
# Tasks that are reported under their own name, without per-prompt variants.
NO_PROMPT_TASKS = [
    'benczechmark_histcorpus',
    'benczechmark_hellaswag',
    'benczechmark_essay',
    'benczechmark_fiction',
    'benczechmark_capek',
    'benczechmark_correspondence',
    'benczechmark_havlicek',
    'benczechmark_speeches',
    'benczechmark_spoken',
    'benczechmark_dialect',
]
def resolve_taskname(taskname):
    """Return the leaderboard task name registered in ``MAP`` for ``taskname``.

    Raises:
        ValueError: if ``taskname`` is not a key of ``MAP``.
    """
    if taskname in MAP:
        return MAP[taskname]
    raise ValueError(f"Taskname {taskname} not found.")
def rename_keys(d, resolve_taskname):
    """Rename every key of ``d`` in place using ``resolve_taskname``.

    The renamed mapping is built completely before ``d`` is touched, so:

    - a failing resolver leaves ``d`` unchanged,
    - chained renames (``a -> b`` while ``b -> c``) cannot overwrite an entry
      that has not been moved yet,
    - the original entry order is preserved.

    Args:
        d: dictionary to modify in place.
        resolve_taskname: callable mapping an old key to its new key.

    Raises:
        ValueError: if two keys resolve to the same new key, which would
            silently drop an entry. ``d`` is left unchanged in that case.
    """
    renamed = {resolve_taskname(key): value for key, value in d.items()}
    # A shorter result means at least two old keys collapsed into one new key.
    if len(renamed) != len(d):
        raise ValueError("Renaming keys produced duplicate keys; entries would be lost.")
    d.clear()
    d.update(renamed)
def _strip_metric_filter_suffixes(result):
    """Rename every ``"<metric>,<filter>"`` key of ``result`` to ``"<metric>"`` in place."""
    for key in list(result):
        if "," in key:
            result[key.split(",")[0]] = result.pop(key)


def _select_best_prompt(taskname, results):
    """Return the best-scoring entry of ``results`` and annotate it with its variance.

    The metric is the first entry of ``SUPPORTED_METRICS`` present in
    ``results[0]``; the entry with the highest value wins (first one on ties).
    The winner gets a ``'max_centered_variance'`` key: the sample variance of
    all scores (scaled by 100) around the winning score.

    Raises:
        ValueError: if ``results[0]`` contains no supported metric.
    """
    target_metric = next((m for m in SUPPORTED_METRICS if m in results[0]), None)
    if target_metric is None:
        raise ValueError(f"No supported metric found in {taskname}")

    scores = [result[target_metric] for result in results]
    # max() keeps the first maximum, i.e. a later prompt only wins if strictly better.
    best_result = max(results, key=lambda result: result[target_metric])

    max_value = best_result[target_metric]
    squared_diffs = [(score * 100.0 - max_value * 100.0) ** 2 for score in scores]
    # With a single prompt there is no spread (and n - 1 would be zero).
    if len(squared_diffs) > 1:
        max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
    else:
        max_centered_variance = 0.0
    best_result['max_centered_variance'] = max_centered_variance
    return best_result


def process_harness_logs(input_folders, output_file):
    """
    Compile raw lm-harness outputs into one JSON file for leaderboard submission.

    - Selects the best prompt for each task
    - Extracts the data of that prompt which is necessary for the target metrics

    Args:
        input_folders: path of a single run folder, or a glob pattern
            (containing ``*`` or ``?``) matching several run folders. Inside
            each run folder the first listed entry is expected to be the
            directory holding the ``results*`` file and ``samples*.jsonl`` files.
        output_file: path of the JSON file to write.

    Raises:
        ValueError: if no input folder matches, a run folder is empty or has
            no results file, a task has no supported metric, a sample file
            name carries no prompt index, a task name is unknown, or the
            tasks with predictions differ from the tasks with results.
        Exception: if the compiled tasks differ from ``METADATA["tasks"]``.
    """

    def expand_input_folders(pattern):
        # A wildcard pattern expands to all matching directories; a plain path
        # yields itself when it is a directory and nothing otherwise.
        if '*' in pattern or '?' in pattern:
            return [f for f in glob.glob(pattern) if os.path.isdir(f)]
        return [pattern] if os.path.isdir(pattern) else []

    requested_folders = input_folders
    input_folders = expand_input_folders(input_folders)
    if not input_folders:
        # Without any run there is nothing to read the metadata from.
        raise ValueError(f"No input folders found for {requested_folders}")

    per_task_results = {}
    predictions = {}
    all_harness_results = dict()
    # Fields of a sample record that are released; built once, not per key.
    released_keys = set(SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS)

    for input_folder in tqdm(input_folders, desc="Loading files"):
        # consider first folder within this folder
        entries = os.listdir(input_folder)
        if not entries:
            raise ValueError(f"Input folder {input_folder} is empty")
        input_folder = os.path.join(input_folder, entries[0])

        # find file which starts with results... prefix in the input_folder
        result_files = [f for f in os.listdir(input_folder) if f.startswith("results")]
        if not result_files:
            raise ValueError(f"No results file found in {input_folder}")
        result_file = result_files[0]
        with open(os.path.join(input_folder, result_file), "r", encoding="utf-8") as f:
            harness_results = json.load(f)

        # Runs are indexed by the alias of their first result entry.
        first_alias = list(harness_results['results'].values())[0]['alias']
        all_harness_results[first_alias] = harness_results

        current_multipleprompt_tasknames = []
        for name, result in harness_results['results'].items():
            if name in NO_PROMPT_TASKS:
                # no prompts: the result is stored under the task's own name
                _strip_metric_filter_suffixes(result)
                per_task_results[name] = result
                continue

            if result['alias'].strip().startswith('- prompt-'):
                # NOTE(review): drops exactly one trailing character (plus an
                # optional "_"), so the prompt index is assumed to be a single
                # character - confirm before running more than 10 prompts.
                taskname = name[:-1]
                if taskname.endswith("_"):
                    taskname = taskname[:-1]

                _strip_metric_filter_suffixes(result)

                if taskname not in per_task_results:
                    per_task_results[taskname] = [result]
                    current_multipleprompt_tasknames.append(taskname)
                else:
                    per_task_results[taskname].append(result)

        # get best result according to metric priority given in SUPPORTED_METRICS list
        for taskname in current_multipleprompt_tasknames:
            per_task_results[taskname] = _select_best_prompt(taskname, per_task_results[taskname])

        for file in os.listdir(input_folder):
            if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
                continue
            for taskname in per_task_results.keys():
                if taskname not in file:
                    continue
                print(f"Processing {os.path.join(input_folder, file)} for {taskname}")

                # check this file corresponds to the winning prompt
                if taskname in NO_PROMPT_TASKS:
                    is_winning_file = True
                else:
                    # Last character of the alias, see the single-character note above.
                    winning_prompt = per_task_results[taskname]['alias'][-1]
                    prompt_match = re.search(rf"{re.escape(taskname)}_(\d+)_", file)
                    if prompt_match is None:
                        raise ValueError(f"Prompt not found in {file}")
                    is_winning_file = prompt_match.group(1) == winning_prompt

                if is_winning_file:
                    # load file contents, only keeping data necessary for metrics;
                    # the context manager closes the reader when done
                    with jsonlines.open(os.path.join(input_folder, file)) as reader:
                        predictions[taskname] = [
                            {key: value for key, value in prediction.items() if key in released_keys}
                            for prediction in reader
                        ]

    # rename keys (tasknames) using resolve_taskname
    rename_keys(predictions, resolve_taskname)
    rename_keys(per_task_results, resolve_taskname)

    # keys in predictions and results must be the same
    prediction_tasks = set(predictions.keys())
    result_tasks = set(per_task_results.keys())
    if prediction_tasks != result_tasks:
        # tasks that have results but no predictions
        print("Missing keys in predictions:")
        print(result_tasks - prediction_tasks)
        # tasks that have predictions but no results
        print("Extra keys in predictions:")
        print(prediction_tasks - result_tasks)
        raise ValueError("Keys in predictions and results are not the same")

    # Run-level fields are copied from the results file that was loaded last.
    copied_metadata_keys = [
        'git_hash',
        'transformers_version',
        'tokenizer_pad_token',
        'tokenizer_eos_token',
        'tokenizer_bos_token',
        'eot_token_id',
        'max_length',
        'task_hashes',
        'model_source',
        'model_name',
        'model_name_sanitized',
        'system_instruction',
        'system_instruction_sha',
        'fewshot_as_multiturn',
        'chat_template',
        'chat_template_sha',
    ]
    metadata = {key: harness_results[key] for key in copied_metadata_keys}
    metadata['total_evaluation_time_seconds'] = {
        k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()
    }
    # NOTE(review): requires a run indexed under the alias 'CTKFacts NLI' that
    # contains the prompt 'ctkfacts_0'; raises KeyError otherwise.
    metadata['n-shot'] = all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']

    aggregated_predictions = dict()
    aggregated_predictions["predictions"] = predictions
    aggregated_predictions["results"] = per_task_results
    aggregated_predictions["metadata"] = metadata

    # make sure all tasks are present
    expected_tasks = set(METADATA["tasks"].keys())
    compiled_tasks = set(per_task_results.keys())
    missing_tasks = expected_tasks - compiled_tasks
    extra_tasks = compiled_tasks - expected_tasks
    if missing_tasks:
        raise Exception("Missing tasks: " + "\n".join(sorted(missing_tasks)))
    if extra_tasks:
        raise Exception("Extra tasks: " + "\n".join(sorted(extra_tasks)))

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(aggregated_predictions, f)
    print("Success!")
    print("Output saved to", output_file)
def main():
    """Read the input folder and output file from the command line and compile the logs."""
    cli = argparse.ArgumentParser(
        description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.",
    )
    # All four spellings store their value in ``input_folder``.
    cli.add_argument(
        "-i", "-f", "--input_folder", "--folder",
        required=True,
        help="Folder with unprocessed results from lm harness.",
    )
    cli.add_argument(
        "-o", "--output_file",
        required=True,
        help="File to save processed results.",
    )
    options = cli.parse_args()
    process_harness_logs(options.input_folder, options.output_file)


if __name__ == "__main__":
    main()
|