mfajcik committed on
Commit
f8e9924
β€’
1 Parent(s): 8864264

Delete compile_log_files.py

Files changed (1)
  1. compile_log_files.py +0 -308
compile_log_files.py DELETED
@@ -1,308 +0,0 @@
- # Author: Martin Fajcik
-
- import argparse
- import copy
- import glob
- import hashlib
- import os
- import json
- import re
-
- import jsonlines
- from tqdm import tqdm
-
- SUPPORTED_METRICS = [
-     "avg_mcauroc",  # for classification tasks
-     "exact_match",  # for QA tasks
-     "acc",  # for multichoice tasks
-     "rouge_raw_r2_mid_f_without_bootstrap",  # for summarization tasks
-     "rouge_raw_r2_mid_f",  # for summarization tasks, older metric version for backward compatibility
-     "word_perplexity",  # for language modeling tasks
- ]
- EXTRA_INFO_RELEASE_KEYS = [
-     'filtered_resps',
-     'doc_id',
- ]
-
- with open("leaderboard/metadata.json", "r") as f:
-     METADATA = json.load(f)
-
- # TASK MAP
- # from promptname to taskname
- MAP = {
-     'benchmark_agree': 'benczechmark_agree',
-     'benchmark_belebele': 'benczechmark_belebele',
-     'benchmark_czechnews': 'benczechmark_czechnews',
-     'benchmark_subjectivity': 'benczechmark_subjectivity',
-     'benczechmark_snli': 'benczechmark_snli',
-     'propaganda_argumentace': 'benczechmark_propaganda_argumentace',
-     'propaganda_fabulace': 'benczechmark_propaganda_fabulace',
-     'propaganda_nazor': 'benczechmark_propaganda_nazor',
-     'propaganda_strach': 'benczechmark_propaganda_strach',
-     'propaganda_zamereni': 'benczechmark_propaganda_zamereni',
-     'propaganda_demonizace': 'benczechmark_propaganda_demonizace',
-     'propaganda_lokace': 'benczechmark_propaganda_lokace',
-     'propaganda_relativizace': 'benczechmark_propaganda_relativizace',
-     'propaganda_vina': 'benczechmark_propaganda_vina',
-     'propaganda_zanr': 'benczechmark_propaganda_zanr',
-     'propaganda_emoce': 'benczechmark_propaganda_emoce',
-     'propaganda_nalepkovani': 'benczechmark_propaganda_nalepkovani',
-     'propaganda_rusko': 'benczechmark_propaganda_rusko',
-     'benczechmark_sentiment_mall': 'benczechmark_sentiment_mall',
-     'benczechmark_sentiment_fb': 'benczechmark_sentiment_fb',
-     'benczechmark_sentiment_csfd': 'benczechmark_sentiment_csfd',
-     'benczechmark_summarization': 'benczechmark_summarization',
-     'gec': 'benczechmark_grammarerrorcorrection',
-     'cs_nq_open': 'benczechmark_cs_naturalquestions',
-     'cs_sqad_open': 'benczechmark_cs_sqad32',
-     'cs_triviaqa': 'benczechmark_cs_triviaQA',
-     'csfever': 'benczechmark_csfever_nli',
-     'ctkfacts': 'benczechmark_ctkfacts_nli',
-     'cnec_ner': 'benczechmark_cs_ner',
-     'cdec_ner': 'benczechmark_cs_court_decisions_ner',
-     'klokan_qa': 'benczechmark_klokan_qa',
-     'umimeto_biology': 'benczechmark_umimeto_biology',
-     'umimeto_chemistry': 'benczechmark_umimeto_chemistry',
-     'umimeto_czech': 'benczechmark_umimeto_czech',
-     'umimeto_history': 'benczechmark_umimeto_history',
-     'umimeto_informatics': 'benczechmark_umimeto_informatics',
-     'umimeto_math': 'benczechmark_umimeto_math',
-     'umimeto_physics': 'benczechmark_umimeto_physics',
-     'cermat_czech_open': 'benczechmark_cermat_czech_open',
-     'cermat_czech_mc': 'benczechmark_cermat_czech_mc',
-     'cermat_czech_tf': 'benczechmark_cermat_czech_tf',
-     'cermat_czmath_open': 'benczechmark_cermat_czmath_open',
-     'cermat_czmath_mc': 'benczechmark_cermat_czmath_mc',
-     'history_ir': 'benczechmark_history_ir',
-     'benczechmark_histcorpus': "benczechmark_histcorpus",
-     'benczechmark_hellaswag': "benczechmark_hellaswag",
-     'benczechmark_essay': 'benczechmark_essay',
-     'benczechmark_fiction': 'benczechmark_fiction',
-     'benczechmark_capek': 'benczechmark_capek',
-     'benczechmark_correspondence': 'benczechmark_correspondence',
-     'benczechmark_havlicek': 'benczechmark_havlicek',
-     'benczechmark_speeches': 'benczechmark_speeches',
-     'benczechmark_spoken': 'benczechmark_spoken',
-     'benczechmark_dialect': 'benczechmark_dialect'
- }
-
- NO_PROMPT_TASKS = ["benczechmark_histcorpus",
-                    "benczechmark_hellaswag",
-                    "benczechmark_essay",
-                    "benczechmark_fiction",
-                    "benczechmark_capek",
-                    "benczechmark_correspondence",
-                    "benczechmark_havlicek",
-                    "benczechmark_speeches",
-                    "benczechmark_spoken",
-                    "benczechmark_dialect"]
-
-
- def resolve_taskname(taskname):
-     if taskname not in MAP:
-         raise ValueError(f"Taskname {taskname} not found.")
-     return MAP[taskname]
-
-
- def rename_keys(d, resolve_taskname):
-     orig_len = len(d)
-     for k, v in list(d.items()):
-         new_key = resolve_taskname(k)
-         d[new_key] = d.pop(k)
-
-     # make sure the dict length didn't change
-     assert len(d) == orig_len
-
-
- def process_harness_logs(input_folders, output_file):
-     """
-     - Selects best prompt for each task
-     - Extracts data for that prompt, necessary for target lm harness metrics
-     """
-
-     def expand_input_folders(input_folders):
-         # Check if input_folders is a wildcard pattern
-         if '*' in input_folders or '?' in input_folders:
-             # Expand the wildcard into a list of matching directories
-             matching_directories = [f for f in glob.glob(input_folders) if os.path.isdir(f)]
-             return matching_directories
-         else:
-             # If it's not a wildcard, return the input as a single-item list if it's a valid directory
-             if os.path.isdir(input_folders):
-                 return [input_folders]
-             else:
-                 return []
-
-     input_folders = expand_input_folders(input_folders)
-
-     per_task_results = {}
-     metric_per_task = {}
-     predictions = {}
-
-     all_harness_results = dict()
-     for input_folder in tqdm(input_folders, desc="Loading files"):
-         # read all files in input_folder
-         # consider first folder within this folder
-         input_folder = os.path.join(input_folder, os.listdir(input_folder)[0])
-         # find file which starts with results... prefix in the input_folder
-         result_file = [f for f in os.listdir(input_folder) if f.startswith("results")][0]
-         with open(os.path.join(input_folder, result_file), "r") as f:
-             harness_results = json.load(f)
-         all_harness_results[list(harness_results['results'].values())[0]['alias']] = harness_results
-         current_multipleprompt_tasknames = []
-         for name, result in harness_results['results'].items():
-             if name in NO_PROMPT_TASKS:
-                 # no prompts
-                 taskname = name
-                 # process metric names
-                 for k, v in copy.deepcopy(result).items():
-                     if "," in k:
-                         name, _ = k.split(",")
-                         del result[k]
-                         result[name] = v
-                 per_task_results[taskname] = result
-
-             if result['alias'].strip().startswith('- prompt-'):
-                 # process taskname
-                 taskname = name[:-1]
-                 if taskname.endswith("_"):
-                     taskname = taskname[:-1]
-
-                 # process metric names
-                 for k, v in copy.deepcopy(result).items():
-                     if "," in k:
-                         name, key = k.split(",")
-                         del result[k]
-                         result[name] = v
-
-                 if taskname not in per_task_results:
-                     per_task_results[taskname] = [result]
-                     current_multipleprompt_tasknames.append(taskname)
-                 else:
-                     per_task_results[taskname].append(result)
-
-         # get best result according to metric priority given in SUPPORTED_METRICS list
-         for taskname, results in per_task_results.items():
-             if not taskname in current_multipleprompt_tasknames:
-                 continue
-             best_result = None
-             target_metric = None
-             for m in SUPPORTED_METRICS:
-                 if m in results[0]:
-                     target_metric = m
-                     break
-             if target_metric is None:
-                 raise ValueError(f"No supported metric found in {taskname}")
-             metric_per_task[taskname] = target_metric
-
-             all_measured_results = []
-             for result in results:
-                 all_measured_results.append(result[target_metric])
-                 if best_result is None:
-                     best_result = result
-                 else:
-                     if result[target_metric] > best_result[target_metric]:
-                         best_result = result
-             # Compute max-centered variance
-             max_value = best_result[target_metric]
-             squared_diffs = [(x * 100.0 - max_value * 100.0) ** 2 for x in all_measured_results]
-             max_centered_variance = sum(squared_diffs) / (len(squared_diffs) - 1)
-             best_result['max_centered_variance'] = max_centered_variance
-
-             per_task_results[taskname] = best_result
-
-         for file in os.listdir(input_folder):
-             if file == result_file or not file.startswith("samples") or not file.endswith(".jsonl"):
-                 continue
-             for taskname in per_task_results.keys():
-                 if taskname in file:
-                     print(f"Processing {os.path.join(input_folder, file)} for {taskname}")
-                     # check this file corresponds to same prompt
-                     winning_prompt = per_task_results[taskname]['alias'][-1]
-                     if taskname in NO_PROMPT_TASKS:
-                         current_prompt = "-1"
-                     else:
-                         try:
-                             current_prompt = re.search(rf"{taskname}_(\d+)_", file).group(1)
-                         except AttributeError:
-                             raise ValueError(f"Prompt not found in {file}")
-                     if winning_prompt == current_prompt or taskname in NO_PROMPT_TASKS:
-                         # load file contents
-                         predictions[taskname] = list(jsonlines.open(os.path.join(input_folder, file)))
-                         # only keep data necessary for metrics
-                         for prediction in predictions[taskname]:
-                             for key in list(prediction.keys()):
-                                 if key not in SUPPORTED_METRICS + EXTRA_INFO_RELEASE_KEYS:
-                                     del prediction[key]
-
-     # rename keys (tasknames) using resolve_taskname:
-     rename_keys(predictions, resolve_taskname)
-     rename_keys(per_task_results, resolve_taskname)
-
-     # assert keys in predictions and results are the same
-     # assert set(predictions.keys()) == set(per_task_results.keys())
-     if not set(predictions.keys()) == set(per_task_results.keys()):
-         # print missing keys
-         print("Missing keys in predictions:")
-         print(set(predictions.keys()) - set(per_task_results.keys()))
-         # print extra keys
-         print("Extra keys in predictions:")
-         print(set(per_task_results.keys()) - set(predictions.keys()))
-         raise ValueError("Keys in predictions and results are not the same")
-
-     aggregated_predictions = dict()
-     aggregated_predictions["predictions"] = predictions
-     aggregated_predictions["results"] = per_task_results
-     aggregated_predictions["metadata"] = {
-         'git_hash': harness_results['git_hash'],
-         'transformers_version': harness_results['transformers_version'],
-         'tokenizer_pad_token': harness_results['tokenizer_pad_token'],
-         'tokenizer_eos_token': harness_results['tokenizer_eos_token'],
-         'tokenizer_bos_token': harness_results['tokenizer_bos_token'],
-         'eot_token_id': harness_results['eot_token_id'],
-         'max_length': harness_results['max_length'],
-         'task_hashes': harness_results['task_hashes'],
-         'model_source': harness_results['model_source'],
-         'model_name': harness_results['model_name'],
-         'model_name_sanitized': harness_results['model_name_sanitized'],
-         'system_instruction': harness_results['system_instruction'],
-         'system_instruction_sha': harness_results['system_instruction_sha'],
-         'fewshot_as_multiturn': harness_results['fewshot_as_multiturn'],
-         'chat_template': harness_results['chat_template'],
-         'chat_template_sha': harness_results['chat_template_sha'],
-         'total_evaluation_time_seconds': {k: v['total_evaluation_time_seconds'] for k, v in all_harness_results.items()},
-         'n-shot': all_harness_results['CTKFacts NLI']['n-shot']['ctkfacts_0']
-     }
-
-     # make sure all tasks are present
-     all_tasks = set(METADATA["tasks"].keys())
-     all_expected_tasks = set(per_task_results.keys())
-     all_missing_tasks = all_tasks - all_expected_tasks
-     all_extra_tasks = all_expected_tasks - all_tasks
-     if len(all_missing_tasks) > 0:
-         EOLN = "\n"
-         # print(f"Missing tasks: {EOLN.join(all_missing_tasks)}")
-         raise Exception(f"Missing tasks: {EOLN.join(all_missing_tasks)}")  # TODO: uncomment
-     if len(all_extra_tasks) > 0:
-         EOLN = "\n"
-         raise Exception(f"Extra tasks: {EOLN.join(all_extra_tasks)}")
-     with open(output_file, "w") as f:
-         json.dump(aggregated_predictions, f)
-     print("Success!")
-     print("Output saved to", output_file)
-
-
- def main():
-     parser = argparse.ArgumentParser(
-         description="Process outputs of lm harness into minimum compatible format necessary for leaderboard submission.")
-     parser.add_argument("-i", "-f", "--input_folder", "--folder",
-                         help="Folder with unprocessed results from lm harness.", required=True)
-     parser.add_argument("-o", "--output_file", help="File to save processed results.", required=True)
-     args = parser.parse_args()
-
-     process_harness_logs(args.input_folder, args.output_file)
-
-
- if __name__ == "__main__":
-     main()
-
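For reference, the deleted script exposed a small CLI (see the argparse definition above). A minimal usage sketch follows; the folder and file names are placeholders for illustration, not values taken from this commit:

# Hypothetical invocation of the deleted script; paths are illustrative only.
# python compile_log_files.py --input_folder "results/my_model_*" --output_file submission.json

# Equivalent programmatic call, assuming the module is still importable locally:
from compile_log_files import process_harness_logs

process_harness_logs("results/my_model_*", "submission.json")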