root committed on
Commit dfdc6c0
Parent(s): aba3fe3
add long_32k_eval

Files changed:
- evaluation/long_32k_eval/dataset_evaluator_retro.py +178 -0
- evaluation/long_32k_eval/dataset_evaluator_retro_longbench.py +203 -0
- evaluation/long_32k_eval/dataset_evaluator_retro_nv.py +181 -0
- evaluation/long_32k_eval/eval_retro_vllm.sh +118 -0
- evaluation/long_32k_eval/extract_log.py +88 -0
- evaluation/long_32k_eval/longbench/__pycache__/eval.cpython-310.pyc +0 -0
- evaluation/long_32k_eval/longbench/__pycache__/metrics.cpython-310.pyc +0 -0
- evaluation/long_32k_eval/longbench/eval.py +127 -0
- evaluation/long_32k_eval/longbench/metrics.py +154 -0
- evaluation/long_32k_eval/run_eval_vllm.sh +10 -0
evaluation/long_32k_eval/dataset_evaluator_retro.py
ADDED
@@ -0,0 +1,178 @@
import os
import argparse
import json
import shutil
import re

from datasets import load_dataset, load_metric
from huggingface_hub import hf_hub_download

DATASETS = [
    "gov_report",
    "summ_screen_fd",
    "qmsum",
    "qasper",
    "narrative_qa",
    "quality",
    "quality_hard",
    "contract_nli",
]

PATTERN = re.compile(r'\b[A-D]\b')

def find_answer(s):
    match = PATTERN.search(s)
    if match is None:
        return None  # None is a signal of not find! NOTE
    return match.group()

def read_json_data(data_path):
    references = []
    questions = []
    id_to_labels = dict()
    id_list = list()
    idx = 0
    with open(data_path, "r") as f:
        examples = json.load(f)
        for data_item in examples:  # dict_keys(['source', 'paragraph_id', 'question', 'answer', 'sub-paragraphs', 'word_count', 'id', 'ctxs'])
            idx_str = str(idx) if 'id' not in data_item else str(data_item['id'])
            idx += 1
            id_list.append(idx_str)

            questions.append(data_item['question'])
            if "answers" in data_item:
                references.append(data_item['answers'][0])
                answer_list = [answer_str for answer_str in data_item['answers']]
                id_to_labels[idx_str] = answer_list

            elif "answer" in data_item:
                references.append(data_item['answer'])  # take the single answer
                id_to_labels[idx_str] = [data_item['answer']]
            else:
                raise ValueError("need answer or answers from input json")
    return id_to_labels, id_list, questions

def convert_to_seq(aquestion, apred):
    if apred is None:
        apred = ""

    matched_pred = find_answer(apred)
    if matched_pred is None:
        matched_pred = apred

    apred = '({})'.format(matched_pred)

    alist = aquestion.split('\n')
    for aitem in alist:
        aitem = aitem.strip()
        if aitem.startswith(apred):
            pred_out = ' '.join(aitem.split(' ')[1:])
            print('from {} to [{}]'.format(apred, pred_out))
            return pred_out
    print('Warning: could not find ({}) from question {}'.format(apred, aquestion))

    return apred

# 500 -> 100
def load_prediction(test_file, id_list, id_to_labels, questions, dataset_name):
    predictions = []
    with open(test_file, "r") as f:
        for line in f.readlines():
            predictions.append(line.strip())
    if len(predictions) != len(id_list):
        print("NOTE: different number of samples, {} in prediction, yet {} in reference".format(
            len(predictions), len(id_list)))
        id_list = id_list[0: len(predictions)]

    id_to_prediction = dict()
    for aid, apred in zip(id_list, predictions):
        id_to_prediction[aid] = apred

    if dataset_name.startswith('quality'):
        print('quality dataset, and rewriting the prediction to the full textual sequence...')
        questions = questions[0: len(predictions)]
        id_to_prediction = dict()
        for aid, aquestion, apred in zip(id_list, questions, predictions):
            apred_seq = convert_to_seq(aquestion, apred)
            id_to_prediction[aid] = apred_seq

    return id_to_prediction, id_list

def main(args, raise_on_errors=False):
    datasets = [args.dataset] if args.dataset in DATASETS else DATASETS
    for dataset_name in datasets:
        print(dataset_name)
        scrolls_metric = load_metric(download_metric(), dataset_name)  # TODO cost time to load ! NOTE

        id_to_labels, id_list, questions = read_json_data(args.datapath)
        id_to_pred, id_list = load_prediction(args.gen_test_file,
                                              id_list, id_to_labels, questions,
                                              dataset_name)

        if len(id_to_labels) > len(id_list):
            print('NOTE: prune the reference set from {} to {}'.format(
                len(id_to_labels), len(id_list)))
            id_to_labels = {aid: id_to_labels[aid] for aid in id_list}

        errors, details = verify(id_to_pred, id_to_labels)

        if len(errors) == 0:
            metrics = scrolls_metric.compute(**scrolls_metric.convert_from_map_format(id_to_pred, id_to_labels))
            print(json.dumps(metrics, indent=4))
            dislist = [str(item) for item in metrics['display']]
            print('final display:', dataset_name, ' '.join(dislist))
        elif len(errors) > 0:
            errors_msg = errors[0] if len(errors) == 1 else " ".join(f"{i}: {err}" for i, err in enumerate(errors))
            print(json.dumps(errors, indent=4))
            raise ValueError(f"Failed to evaluate due to: {errors_msg}")


def download_metric():
    scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", filename="metrics/scrolls.py", repo_type="dataset")
    updated_scrolls_metric_path = (
        os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path

def verify(id_to_pred, id_to_labels):
    errors = []
    details = {"missing_keys": [], "redundant_keys": []}
    if not isinstance(id_to_pred, dict):
        errors.append('The predictions must be saved a JSON object: {"id1": "prediction1", "id2": "prediction2", ...}')
    else:
        if not all(isinstance(key, str) for key in id_to_pred.keys()):
            errors.append("All keys of the predictions dictionary must be strings")
        if not all(isinstance(value, str) for value in id_to_pred.values()):
            errors.append("All values of the predictions dictionary must be strings")
        if len(errors) == 0:
            predictions_keys, reference_keys = set(id_to_pred.keys()), set(id_to_labels.keys())
            missing_keys = reference_keys - predictions_keys
            redundant_keys = predictions_keys - reference_keys

            if len(missing_keys) > 0:
                details["missing_keys"] = list(missing_keys)
                errors.append(f"There are missing example IDs.")
            else:
                del details["missing_keys"]

            if len(redundant_keys) > 0:
                details["redundant_keys"] = list(redundant_keys)
                errors.append(f"There are redundant example IDs.")
            else:
                del details["redundant_keys"]

    return errors, details

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate SCROLLS predictions per dataset")

    parser.add_argument("--datapath", type=str,
                        default=None, help="datapath for test json file [reference]")
    parser.add_argument("--gen_test_file", type=str,
                        default=None, help="generations for test file [system prediction]")
    parser.add_argument("--dataset", type=str,
                        default=None, help="name of the dataset used in scrolls: {}".format(DATASETS))

    args = parser.parse_args()
    main(args)
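For reference, a minimal sketch (with a made-up question and prediction) of how find_answer and convert_to_seq above rewrite a multiple-choice letter into the full option text for the quality datasets:

    from dataset_evaluator_retro import find_answer, convert_to_seq

    # Made-up QuALITY-style question and raw model prediction:
    question = "Which planet is largest?\n(A) Mars\n(B) Jupiter\n(C) Venus\n(D) Mercury"
    prediction = "The answer is B"

    print(find_answer(prediction))               # "B"  (first standalone A-D letter)
    print(convert_to_seq(question, prediction))  # "Jupiter" (also logs: from (B) to [Jupiter])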
evaluation/long_32k_eval/dataset_evaluator_retro_longbench.py
ADDED
@@ -0,0 +1,203 @@
import os
import argparse
import json
import shutil
import re

from datasets import load_dataset, load_metric
from huggingface_hub import hf_hub_download

from longbench.eval import scorer

LONGBENCH_DATASETS = [
    'musique',  # NOTE TODO to add other 20 datasets
    'hotpotqa',
    'multifieldqa_en'
]

PATTERN = re.compile(r'\b[A-D]\b')

def find_answer(s):
    match = PATTERN.search(s)
    if match is None:
        return None  # None is a signal of not find! NOTE
    return match.group()

def read_json_data(data_path):
    references = []
    questions = []
    id_to_labels = dict()
    id_list = list()
    idx = 0
    with open(data_path, "r") as f:
        examples = json.load(f)
        for data_item in examples:  # dict_keys(['source', 'paragraph_id', 'question', 'answer', 'sub-paragraphs', 'word_count', 'id', 'ctxs'])
            idx_str = str(idx) if 'id' not in data_item else str(data_item['id'])
            idx += 1
            id_list.append(idx_str)

            questions.append(data_item['question'])
            if "answers" in data_item:
                references.append(data_item['answers'])  # NOTE take all the answers!
                answer_list = [answer_str for answer_str in data_item['answers']]
                id_to_labels[idx_str] = answer_list

            elif "answer" in data_item:
                references.append([data_item['answer']])  # take the single answer, as a list
                id_to_labels[idx_str] = [data_item['answer']]
            else:
                raise ValueError("need answer or answers from input json")
    return id_to_labels, id_list, questions, references  # answers

def convert_to_seq(aquestion, apred):
    if apred is None:
        apred = ""

    matched_pred = find_answer(apred)
    if matched_pred is None:
        matched_pred = apred

    apred = '({})'.format(matched_pred)

    alist = aquestion.split('\n')
    for aitem in alist:
        aitem = aitem.strip()
        if aitem.startswith(apred):
            pred_out = ' '.join(aitem.split(' ')[1:])
            print('from {} to [{}]'.format(apred, pred_out))
            return pred_out
    print('Warning: could not find ({}) from question {}'.format(apred, aquestion))

    return apred

def load_prediction_openai(test_file):
    predictions = []
    with open(test_file, "r") as f:
        apred_list = list()
        for aline in f.readlines():
            if aline.startswith('assistant: '):
                if len(apred_list) > 0:
                    print('\n'.join(apred_list))
                    predictions.append('\n'.join(apred_list))
                    apred_list = list()
                apred_list.append(aline[len('assistant: '):].strip())
            else:
                apred_list.append(aline.strip())

    if len(apred_list) > 0:
        predictions.append('\n'.join(apred_list))
    print(len(predictions))
    return predictions


# 500 -> 100
def load_prediction(test_file, id_list, id_to_labels,
                    questions, dataset_name, is_openai_assistant=False):
    if is_openai_assistant:
        predictions = load_prediction_openai(test_file)
    else:
        predictions = []
        with open(test_file, "r") as f:
            for line in f.readlines():
                predictions.append(line.strip())

    if len(predictions) != len(id_list):
        print("NOTE: different number of samples, {} in prediction, yet {} in reference".format(
            len(predictions), len(id_list)))
        id_list = id_list[0: len(predictions)]

    id_to_prediction = dict()
    for aid, apred in zip(id_list, predictions):
        id_to_prediction[aid] = apred

    if dataset_name.startswith('quality'):
        print('quality dataset, and rewriting the prediction to the full textual sequence...')
        questions = questions[0: len(predictions)]
        id_to_prediction = dict()
        for aid, aquestion, apred in zip(id_list, questions, predictions):
            apred_seq = convert_to_seq(aquestion, apred)
            id_to_prediction[aid] = apred_seq

    return id_to_prediction, id_list, predictions

def main(args, raise_on_errors=False):
    datasets = [args.dataset] if args.dataset in LONGBENCH_DATASETS else LONGBENCH_DATASETS
    for dataset_name in datasets:
        print(dataset_name)

        id_to_labels, id_list, questions, answers = read_json_data(args.datapath)
        id_to_pred, id_list, predictions = load_prediction(args.gen_test_file,
                                                           id_list, id_to_labels, questions,
                                                           dataset_name, args.is_openai_assistant)

        if len(id_to_labels) > len(id_list):
            print('NOTE: prune the reference set from {} to {}'.format(
                len(id_to_labels), len(id_list)))
            id_to_labels = {aid: id_to_labels[aid] for aid in id_list}

        errors, details = verify(id_to_pred, id_to_labels)

        if len(errors) == 0:
            score = scorer(dataset_name, predictions, answers, all_classes=None)
            print('final display:', dataset_name, score, "\n", args.gen_test_file)
        elif len(errors) > 0:
            errors_msg = errors[0] if len(errors) == 1 else " ".join(f"{i}: {err}" for i, err in enumerate(errors))
            print(json.dumps(errors, indent=4))
            raise ValueError(f"Failed to evaluate due to: {errors_msg}")


def download_metric():
    scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", filename="metrics/scrolls.py", repo_type="dataset")
    updated_scrolls_metric_path = (
        os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path

def verify(id_to_pred, id_to_labels):
    errors = []
    details = {"missing_keys": [], "redundant_keys": []}
    if not isinstance(id_to_pred, dict):
        errors.append('The predictions must be saved a JSON object: {"id1": "prediction1", "id2": "prediction2", ...}')
    else:
        if not all(isinstance(key, str) for key in id_to_pred.keys()):
            errors.append("All keys of the predictions dictionary must be strings")
        if not all(isinstance(value, str) for value in id_to_pred.values()):
            errors.append("All values of the predictions dictionary must be strings")
        if len(errors) == 0:
            predictions_keys, reference_keys = set(id_to_pred.keys()), set(id_to_labels.keys())
            missing_keys = reference_keys - predictions_keys
            redundant_keys = predictions_keys - reference_keys

            if len(missing_keys) > 0:
                details["missing_keys"] = list(missing_keys)
                errors.append(f"There are missing example IDs.")
            else:
                del details["missing_keys"]

            if len(redundant_keys) > 0:
                details["redundant_keys"] = list(redundant_keys)
                errors.append(f"There are redundant example IDs.")
            else:
                del details["redundant_keys"]

    return errors, details

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate SCROLLS predictions per dataset")

    dataset_help = "name of the dataset used in longbench: {}".format(LONGBENCH_DATASETS)
    parser.add_argument("--datapath", type=str, required=True,
                        default=None, help="datapath for test json file [reference]")
    parser.add_argument("--gen_test_file", type=str, required=True,
                        default=None, help="generations for test file [system prediction]")
    parser.add_argument("--dataset", type=str, required=True,
                        default=None, help=dataset_help)
    parser.add_argument("--is_openai_assistant", type=bool, required=False,
                        default=False,
                        help='if openai assistant, then combine multiple lines and the 1st-line starts with assistant:')

    args = parser.parse_args()
    print(args)
    main(args)
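A minimal sketch of the --is_openai_assistant path, assuming the longbench dependencies are installed and the code is run from this directory; the file contents below are made up. A new record starts at every line beginning with "assistant: ", and following lines are appended to the same record:

    import tempfile
    from dataset_evaluator_retro_longbench import load_prediction_openai

    # Made-up generation file in the OpenAI-assistant layout:
    sample = ("assistant: first answer line\n"
              "more of the first answer\n"
              "assistant: second answer\n")
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
        tmp.write(sample)

    print(load_prediction_openai(tmp.name))
    # ['first answer line\nmore of the first answer', 'second answer']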
evaluation/long_32k_eval/dataset_evaluator_retro_nv.py
ADDED
@@ -0,0 +1,181 @@
import os
import argparse
import json
import shutil
import re

from datasets import load_dataset, load_metric
from huggingface_hub import hf_hub_download

from nv.evaluate_f1_sft_zeroshot import evaluate_f1

DATASETS = [
    'doc2dial_full_dialogue',
]

PATTERN = re.compile(r'\b[A-D]\b')

def find_answer(s):
    match = PATTERN.search(s)
    if match is None:
        return None  # None is a signal of not find! NOTE
    return match.group()

def read_json_data(data_path):
    references = []
    questions = []
    id_to_labels = dict()
    id_list = list()
    idx = 0
    with open(data_path, "r") as f:
        examples = json.load(f)
        for data_item in examples:  # dict_keys(['source', 'paragraph_id', 'question', 'answer', 'sub-paragraphs', 'word_count', 'id', 'ctxs'])
            idx_str = str(idx) if 'id' not in data_item else str(data_item['id'])
            idx += 1
            id_list.append(idx_str)

            questions.append(data_item['question'])
            if "answers" in data_item:
                references.append(data_item['answers'])  # NOTE take all the answers!
                answer_list = [answer_str for answer_str in data_item['answers']]
                id_to_labels[idx_str] = answer_list

            elif "answer" in data_item:
                references.append([data_item['answer']])  # take the single answer, as a list
                id_to_labels[idx_str] = [data_item['answer']]
            else:
                raise ValueError("need answer or answers from input json")
    return id_to_labels, id_list, questions, references  # answers

def convert_to_seq(aquestion, apred):
    if apred is None:
        apred = ""

    matched_pred = find_answer(apred)
    if matched_pred is None:
        matched_pred = apred

    apred = '({})'.format(matched_pred)

    alist = aquestion.split('\n')
    for aitem in alist:
        aitem = aitem.strip()
        if aitem.startswith(apred):
            pred_out = ' '.join(aitem.split(' ')[1:])
            print('from {} to [{}]'.format(apred, pred_out))
            return pred_out
    print('Warning: could not find ({}) from question {}'.format(apred, aquestion))

    return apred

# 500 -> 100
def load_prediction(test_file, id_list, id_to_labels, questions, dataset_name):
    predictions = []
    with open(test_file, "r") as f:
        for line in f.readlines():
            predictions.append(line.strip())
    if len(predictions) != len(id_list):
        print("NOTE: different number of samples, {} in prediction, yet {} in reference".format(
            len(predictions), len(id_list)))
        id_list = id_list[0: len(predictions)]

    id_to_prediction = dict()
    for aid, apred in zip(id_list, predictions):
        id_to_prediction[aid] = apred

    if dataset_name.startswith('quality'):
        print('quality dataset, and rewriting the prediction to the full textual sequence...')
        questions = questions[0: len(predictions)]
        id_to_prediction = dict()
        for aid, aquestion, apred in zip(id_list, questions, predictions):
            apred_seq = convert_to_seq(aquestion, apred)
            id_to_prediction[aid] = apred_seq

    return id_to_prediction, id_list, predictions

def main(args):
    datasets = [args.dataset] if args.dataset in DATASETS else DATASETS
    for dataset_name in datasets:
        print(dataset_name)

        ground_truth_file = args.datapath
        prediction_file = args.gen_test_file

        evaluate_f1(ground_truth_file, prediction_file, dataset_name)


def main_orig(args, raise_on_errors=False):
    datasets = [args.dataset] if args.dataset in DATASETS else DATASETS
    for dataset_name in datasets:
        print(dataset_name)

        id_to_labels, id_list, questions, answers = read_json_data(args.datapath)
        id_to_pred, id_list, predictions = load_prediction(args.gen_test_file,
                                                           id_list, id_to_labels, questions,
                                                           dataset_name)

        if len(id_to_labels) > len(id_list):
            print('NOTE: prune the reference set from {} to {}'.format(
                len(id_to_labels), len(id_list)))
            id_to_labels = {aid: id_to_labels[aid] for aid in id_list}

        errors, details = verify(id_to_pred, id_to_labels)

        if len(errors) == 0:
            score = scorer(dataset_name, predictions, answers, all_classes=None)
            print('final display:', dataset_name, score)
        elif len(errors) > 0:
            errors_msg = errors[0] if len(errors) == 1 else " ".join(f"{i}: {err}" for i, err in enumerate(errors))
            print(json.dumps(errors, indent=4))
            raise ValueError(f"Failed to evaluate due to: {errors_msg}")


def download_metric():
    scrolls_metric_path = hf_hub_download(repo_id="tau/scrolls", filename="metrics/scrolls.py", repo_type="dataset")
    updated_scrolls_metric_path = (
        os.path.dirname(scrolls_metric_path) + os.path.basename(scrolls_metric_path).replace(".", "_") + ".py"
    )
    shutil.copy(scrolls_metric_path, updated_scrolls_metric_path)
    return updated_scrolls_metric_path

def verify(id_to_pred, id_to_labels):
    errors = []
    details = {"missing_keys": [], "redundant_keys": []}
    if not isinstance(id_to_pred, dict):
        errors.append('The predictions must be saved a JSON object: {"id1": "prediction1", "id2": "prediction2", ...}')
    else:
        if not all(isinstance(key, str) for key in id_to_pred.keys()):
            errors.append("All keys of the predictions dictionary must be strings")
        if not all(isinstance(value, str) for value in id_to_pred.values()):
            errors.append("All values of the predictions dictionary must be strings")
        if len(errors) == 0:
            predictions_keys, reference_keys = set(id_to_pred.keys()), set(id_to_labels.keys())
            missing_keys = reference_keys - predictions_keys
            redundant_keys = predictions_keys - reference_keys

            if len(missing_keys) > 0:
                details["missing_keys"] = list(missing_keys)
                errors.append(f"There are missing example IDs.")
            else:
                del details["missing_keys"]

            if len(redundant_keys) > 0:
                details["redundant_keys"] = list(redundant_keys)
                errors.append(f"There are redundant example IDs.")
            else:
                del details["redundant_keys"]

    return errors, details

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate SCROLLS predictions per dataset")

    parser.add_argument("--datapath", type=str,
                        default=None, help="datapath for test json file [reference]")
    parser.add_argument("--gen_test_file", type=str,
                        default=None, help="generations for test file [system prediction]")
    parser.add_argument("--dataset", type=str,
                        default=None, help="name of the dataset used in scrolls: {}".format(DATASETS))

    args = parser.parse_args()
    main(args)
evaluation/long_32k_eval/eval_retro_vllm.sh
ADDED
@@ -0,0 +1,118 @@
#########################################################################
# File Name: eval.sh
# Author: Xianchao Wu, Peng Xu
# mail: [email protected], [email protected]
# Created Time: Mon Sep 4 07:33:40 2024
#########################################################################
#!/bin/bash

# TODO change this to your reference file dir:
REFDIR="" # data_home https://huggingface.co/nvidia/Llama3-ChatQA-2-70B/tree/main/data


# TODO change to your tstdir
model_path=""
TSTDIR="${model_path}/outputs/"

model_size=70b # TODO change this
retriever="e5_mistral_retriever_chunkbysents1200"

adir=$TSTDIR
echo $adir

declare -A dataset2num_samples
dataset2num_samples["gov_report"]=200
dataset2num_samples["narrative_qa"]=2000
dataset2num_samples["qasper"]=2000
dataset2num_samples["qmsum"]=200
dataset2num_samples["quality"]=2000
dataset2num_samples["summ_screen_fd"]=200

dataset2num_samples["musique"]=200
dataset2num_samples["hotpotqa"]=200
dataset2num_samples["multifieldqa_en"]=200

dataset2num_samples["squality"]=200

dataset2num_samples["doc2dial_full_dialogue"]=1000

echo "ref path = $REFDIR"
echo "tst out path = $TSTDIR"

declare -A sys2name
sys2name["baseline"]=""
sys2name["ret"]="_ctx5"

for system in "baseline" "ret"
do
    suffix=${sys2name[${system}]}

    echo "--final display----$system----"
    for adataset in "qmsum" "qasper" "quality"
    do
        echo $adataset

        ref_fn="${REFDIR}/${adataset}.${retriever}/test.json"
        tst_fn="${adir}/${adataset}.e5_mistral_retriever_chunkbysents1200_output_0to${dataset2num_samples[${adataset}]}${suffix}.txt"

        echo "ref for ${adataset}", ${ref_fn}
        echo "tstout for ${adataset}", ${tst_fn}
        if [[ ! -e ${tst_fn} ]]; then
            echo "Error: tst_fn=${tst_fn} not exist!"
        fi
        if [[ ! -e ${ref_fn} ]]; then
            echo "Error: ref_fn=${ref_fn} not exist!"
        fi
        #continue

        if [[ -e ${tst_fn} && -e ${ref_fn} ]]
        then
            python3 dataset_evaluator_retro.py \
                --datapath ${ref_fn} \
                --gen_test_file ${tst_fn} \
                --dataset $adataset
        else
            if [[ ! -e ${tst_fn} ]]; then
                echo "Error: tst_fn=${tst_fn} not exist!"
            fi
            if [[ ! -e ${ref_fn} ]]; then
                echo "Error: ref_fn=${ref_fn} not exist!"
            fi
        fi
    done

    for adataset in "musique" "hotpotqa" "multifieldqa_en"
    do
        echo $adataset

        ref_fn="${REFDIR}/${adataset}.${retriever}/test.json"
        # TODO change this if necessary, model's prediction output
        tst_fn="${adir}/${adataset}.e5_mistral_retriever_chunkbysents1200_output_0to${dataset2num_samples[${adataset}]}${suffix}.txt"

        echo "ref for ${adataset}", ${ref_fn}
        echo "tstout for ${adataset}", ${tst_fn}
        if [[ ! -e ${tst_fn} ]]; then
            echo "Error: tst_fn=${tst_fn} not exist!"
        fi
        if [[ ! -e ${ref_fn} ]]; then
            echo "Error: ref_fn=${ref_fn} not exist!"
        fi
        #continue

        if [[ -e ${tst_fn} && -e ${ref_fn} ]]
        then
            python3 dataset_evaluator_retro_longbench.py \
                --datapath ${ref_fn} \
                --gen_test_file ${tst_fn} \
                --dataset $adataset
        else
            if [[ ! -e ${tst_fn} ]]; then
                echo "Error: tst_fn=${tst_fn} not exist!"
            fi
            if [[ ! -e ${ref_fn} ]]; then
                echo "Error: ref_fn=${ref_fn} not exist!"
            fi
        fi
    done
done
evaluation/long_32k_eval/extract_log.py
ADDED
@@ -0,0 +1,88 @@
import sys
import numpy as np

DATASETS1 = [
    "qmsum",
    "qasper",
    "quality",
    'musique',
    'hotpotqa',
    'multifieldqa_en'
]

DATASETS = [
    "qmsum",
    "qasper",
    "quality",
    'musique',
    'hotpotqa',
    'multifieldqa_en',
]

outrow = ''
data2res = dict()

def average(data2res):
    sumvalue = 0.0
    sumnum = 0.0
    for adata in data2res:
        avalue = data2res[adata]
        sumvalue += avalue
        sumnum += 1

    assert sumnum > 0.0
    return sumvalue/sumnum

def collect(value_list, outrow, data2res):
    #print(value_list)
    # first add the single avg score:
    avg = round(np.mean(value_list), 4)
    outrow += str(avg) + ' '

    avg2 = average(data2res)
    avg2 = round(avg2, 4)
    outrow += str(avg2) + ' '

    for adata in DATASETS:
        ares = data2res[adata] if adata in data2res else "NA"
        outrow += str(ares) + " "
    print(outrow.strip())

print('system avg6 avg6 ' + ' '.join(DATASETS))

#infn = "eval_retro_2.sh.log.2"
#with open(infn) as br:
#for aline in br.readlines():
value_list = list()
for aline in sys.stdin:
    #import ipdb; ipdb.set_trace()
    aline = aline.strip()
    if 'final display' in aline:
        if '-baseline-' in aline or '-ret-' in aline:
            if len(outrow) > 0 and len(data2res) > 0:
                collect(value_list, outrow, data2res)

            outrow = ""  # reset
            data2res = dict()
            value_list = list()

            aline2 = aline.replace('-', '')
            aline2 = aline2.replace('final display', '')
            outrow += aline2 + ' '
            continue

        cols = aline.split(' ')
        adata = cols[2]
        ares = '/'.join(cols[3:])  # NOTE use one geometric_mean instead
        scores = cols[3:]
        # for R1/R2/RL geometric_mean:
        if len(scores) == 3:
            scores = [float(item) for item in scores]
            geo_mean = (scores[0] * scores[1] * scores[2]) ** (1.0 / 3.0)
            ares = str(round(geo_mean, 4))

        data2res[adata] = float(ares)
        value_list.append(float(ares))

collect(value_list, outrow, data2res)
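A worked check (scores made up) of how extract_log.py collapses a three-value "final display" line such as "final display: qmsum 0.25 0.08 0.2" into a single number via the geometric mean:

    # Geometric mean of the three ROUGE scores, as computed in the loop above:
    scores = [0.25, 0.08, 0.2]
    geo_mean = (scores[0] * scores[1] * scores[2]) ** (1.0 / 3.0)
    print(round(geo_mean, 4))  # 0.1587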
evaluation/long_32k_eval/longbench/__pycache__/eval.cpython-310.pyc
ADDED
Binary file (3.23 kB)
evaluation/long_32k_eval/longbench/__pycache__/metrics.cpython-310.pyc
ADDED
Binary file (5.93 kB)
evaluation/long_32k_eval/longbench/eval.py
ADDED
@@ -0,0 +1,127 @@
import os
import json
import argparse
import numpy as np

from .metrics import (
    qa_f1_score,
    rouge_zh_score,
    qa_f1_zh_score,
    rouge_score,
    classification_score,
    retrieval_score,
    retrieval_zh_score,
    count_score,
    code_sim_score,
)

dataset2metric = {
    "narrativeqa": qa_f1_score,
    "qasper": qa_f1_score,
    "multifieldqa_en": qa_f1_score,  # NOTE
    "multifieldqa_zh": qa_f1_zh_score,
    "hotpotqa": qa_f1_score,  # NOTE
    "2wikimqa": qa_f1_score,
    "musique": qa_f1_score,  # NOTE
    "dureader": rouge_zh_score,
    "gov_report": rouge_score,
    "qmsum": rouge_score,
    "multi_news": rouge_score,
    "vcsum": rouge_zh_score,
    "trec": classification_score,
    "triviaqa": qa_f1_score,
    "samsum": rouge_score,
    "lsht": classification_score,
    "passage_retrieval_en": retrieval_score,
    "passage_count": count_score,
    "passage_retrieval_zh": retrieval_zh_score,
    "lcc": code_sim_score,
    "repobench-p": code_sim_score,
}

def parse_args(args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default=None)
    parser.add_argument('--e', action='store_true', help="Evaluate on LongBench-E")
    return parser.parse_args(args)

def scorer_e(dataset, predictions, answers, lengths, all_classes):
    scores = {"0-4k": [], "4-8k": [], "8k+": []}
    for (prediction, ground_truths, length) in zip(predictions, answers, lengths):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        if length < 4000:
            scores["0-4k"].append(score)
        elif length < 8000:
            scores["4-8k"].append(score)
        else:
            scores["8k+"].append(score)
    for key in scores.keys():
        scores[key] = round(100 * np.mean(scores[key]), 2)
    return scores

def scorer(dataset, predictions, answers, all_classes):
    # dataset = 'hotpotqa', 'musique', 'multifieldqa_en'
    # predictions = [pred.str, ..., ]
    # answers = [ [answer.str, ...], ... ]
    # all_classes = None

    #import ipdb; ipdb.set_trace()  # all_classes=None for 'hotpotqa' dataset NOTE
    total_score = 0.
    for (prediction, ground_truths) in zip(predictions, answers):
        score = 0.
        if dataset in ["trec", "triviaqa", "samsum", "lsht"]:
            prediction = prediction.lstrip('\n').split('\n')[0]
        for ground_truth in ground_truths:
            score = max(score, dataset2metric[dataset](prediction, ground_truth, all_classes=all_classes))
        total_score += score

    outscore = round(100 * total_score / len(predictions), 2)
    print(dataset, outscore)
    return outscore

if __name__ == '__main__':
    #import ipdb; ipdb.set_trace()
    args = parse_args()
    scores = dict()
    if args.e:
        path = f"pred_e/{args.model}/"
    else:
        path = f"pred/{args.model}/"  # 'pred/chatglm2-6b-32k/' NOTE
    all_files = os.listdir(path)  # 21 files
    print("Evaluating on:", all_files)

    for filename in all_files:
        #import ipdb; ipdb.set_trace()
        if not filename.endswith("jsonl"):
            continue
        predictions, answers, lengths = [], [], []
        dataset = filename.split('.')[0]  # get the dataset name
        if not dataset in ['musique', 'hotpotqa', 'multifieldqa_en']:
            continue  # TODO debug only
        with open(f"{path}{filename}", "r", encoding="utf-8") as f:
            for line in f:  # parse each line as one JSON record
                data = json.loads(line)
                predictions.append(data["pred"])
                answers.append(data["answers"])
                all_classes = data["all_classes"]  # this gets re-assigned on every line
                if "length" in data:
                    lengths.append(data["length"])
        if args.e:
            score = scorer_e(dataset, predictions, answers, lengths, all_classes)
        else:
            score = scorer(dataset, predictions, answers, all_classes)  # NOTE the main scoring entry point. TODO 1. dataset = the concrete dataset name; predictions = list of str (model outputs); answers = list of lists (reference answers); all_classes comes with the original test data
        scores[dataset] = score
    if args.e:
        out_path = f"pred_e/{args.model}/result.json"
    else:
        out_path = f"pred/{args.model}/result.json"

    print(scores)

    with open(out_path, "w") as f:
        json.dump(scores, f, ensure_ascii=False, indent=4)
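A minimal sketch of calling scorer directly with made-up predictions and multi-reference answers; for these QA sets it dispatches to qa_f1_score and averages the per-example maximum over references:

    from longbench.eval import scorer

    # Made-up predictions and references (all_classes is None for these QA sets):
    preds = ["Paris", "the moon"]
    refs = [["Paris"], ["Mars", "the red planet"]]
    scorer("hotpotqa", preds, refs, all_classes=None)
    # per-example best F1 is 1.0 and 0.0 -> prints "hotpotqa 50.0" and returns 50.0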
evaluation/long_32k_eval/longbench/metrics.py
ADDED
@@ -0,0 +1,154 @@
import re
import string

import jieba
from fuzzywuzzy import fuzz
import difflib

from typing import List
from collections import Counter
from rouge import Rouge

def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def normalize_zh_answer(s):
    """Lower text and remove punctuation, extra whitespace."""

    def white_space_fix(text):
        return "".join(text.split())

    def remove_punc(text):
        cn_punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
        all_punctuation = set(string.punctuation + cn_punctuation)
        return "".join(ch for ch in text if ch not in all_punctuation)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_punc(lower(s)))

def count_score(prediction, ground_truth, **kwargs):
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def retrieval_score(prediction, ground_truth, **kwargs):
    pattern = r'Paragraph (\d+)'
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def retrieval_zh_score(prediction, ground_truth, **kwargs):
    pattern = r'段落(\d+)'
    matches = re.findall(pattern, ground_truth)
    ground_truth_id = matches[0]
    numbers = re.findall(r"\d+", prediction)
    right_num = 0
    for number in numbers:
        if str(number) == str(ground_truth_id):
            right_num += 1
    final_score = 0.0 if len(numbers) == 0 else right_num / len(numbers)
    return float(final_score)

def code_sim_score(prediction, ground_truth, **kwargs):
    all_lines = prediction.lstrip('\n').split('\n')
    prediction = ""
    for line in all_lines:
        if ('`' not in line) and ('#' not in line) and ('//' not in line):
            prediction = line
            break
    return (fuzz.ratio(prediction, ground_truth) / 100)

def classification_score(prediction, ground_truth, **kwargs):
    em_match_list = []
    all_classes = kwargs["all_classes"]
    for class_name in all_classes:
        if class_name in prediction:
            em_match_list.append(class_name)
    for match_term in em_match_list:
        if match_term in ground_truth and match_term != ground_truth:
            em_match_list.remove(match_term)
    if em_match_list != 0:
        if ground_truth in em_match_list:
            score = (1.0 / len(em_match_list))
        else:
            score = 0.0
    else:
        best_match = None
        highest_similarity = 0
        for string in all_classes:
            similarity = difflib.SequenceMatcher(None, string, prediction).ratio()
            if similarity > highest_similarity:
                highest_similarity = similarity
                best_match = string
        score = float(best_match == ground_truth)
    return score

def rouge_score(prediction, ground_truth, **kwargs):
    rouge = Rouge()
    try:
        scores = rouge.get_scores([prediction], [ground_truth], avg=True)
    except:
        return 0.0
    return scores["rouge-l"]["f"]

def rouge_zh_score(prediction, ground_truth, **kwargs):
    prediction = " ".join(list(jieba.cut(prediction, cut_all=False)))
    ground_truth = " ".join(list(jieba.cut(ground_truth, cut_all=False)))
    score = rouge_score(prediction, ground_truth)
    return score

def f1_score(prediction, ground_truth, **kwargs):
    common = Counter(prediction) & Counter(ground_truth)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction)
    recall = 1.0 * num_same / len(ground_truth)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def qa_f1_score(prediction, ground_truth, **kwargs):
    normalized_prediction = normalize_answer(prediction)
    normalized_ground_truth = normalize_answer(ground_truth)

    prediction_tokens = normalized_prediction.split()
    ground_truth_tokens = normalized_ground_truth.split()
    return f1_score(prediction_tokens, ground_truth_tokens)


def qa_f1_zh_score(prediction, ground_truth, **kwargs):
    prediction_tokens = list(jieba.cut(prediction, cut_all=False))
    ground_truth_tokens = list(jieba.cut(ground_truth, cut_all=False))
    prediction_tokens = [normalize_zh_answer(token) for token in prediction_tokens]
    ground_truth_tokens = [normalize_zh_answer(token) for token in ground_truth_tokens]
    prediction_tokens = [token for token in prediction_tokens if len(token) > 0]
    ground_truth_tokens = [token for token in ground_truth_tokens if len(token) > 0]
    return f1_score(prediction_tokens, ground_truth_tokens)
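A worked example (made-up strings) of the token-level F1 defined above:

    from longbench.metrics import qa_f1_score

    # normalize_answer drops articles and punctuation: "The capital is Paris." -> "capital is paris"
    # token overlap with "paris": precision 1/3, recall 1 -> F1 = 0.5
    print(qa_f1_score("The capital is Paris.", "Paris"))  # 0.5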
evaluation/long_32k_eval/run_eval_vllm.sh
ADDED
@@ -0,0 +1,10 @@
#!/bin/bash

adate=`date +%Y_%m_%d_%H_%M_%S`
outlog="eval_retro_vllm.sh.log.6_4setsplus.$adate"

echo "out log = $outlog"

bash eval_retro_vllm.sh > $outlog 2>&1

grep "final display" $outlog | python3 extract_log.py