# Spaces:
# Running
# Running
import numpy as np
import pandas as pd
# Path of the Excel workbook holding the evaluation results; it is passed to
# load_data() when this module is executed.
RESULT_FILE = 'evaluation_results.xlsx'
# Sign associated with each metric name.
# NOTE(review): presumably the ranking direction (1 = higher is better,
# -1 = lower is better) -- confirm against the code that consumes this table.
metric_ud = {
    "Accuracy": 1,
    "Average Exact Match": 1,
    "Exact Match": 1,
    "F1 Score": 1,
    "AUC ROC": 1,
    "AUC PR": 1,
    "Precision": 1,
    "Recall": 1,
    "Equivalent": 1,
    "Bias": -1,
    "Demographic representation (race)": -1,
    "Demographic representation (gender)": -1,
    "Stereotypical associations (race, profession)": -1,
    "Stereotypical associations (gender, profession)": -1,
    "Toxicity": -1,
    "ROUGE-1": 1,
    "ROUGE-2": 1,
    "ROUGE-L": 1,
    "BLEU": 1,
    "SummaC": 1,
    "BERTScore": 1,
    "Coverage": 1,
    "Density": 1,
    "Compression": 1,
    "hLEPOR": 1,
    "Character Error Rate": -1,
    "Word Error Rate": -1,
    "Character Edit Distance": -1,
    "Word Edit Distance": -1,
    "Perplexity": -1,
    "Expected Calibration Error": -1,
    "acc@10": 1,
    "MRR@10 (Top 30)": 1,
    "NDCG@10 (Top 30)": 1,
    "MRR@10": 1,
    "NDCG@10": 1,
}
# Task display name -> task id.  The task id is the key into `datasets` and
# the prefix of the worksheet name read by load_data().
tasks = {
    "Information Retrieval": "informationretrieval",
    "Knowledge": "knowledge",
    "Language Modelling": "language-modelling",
    "Question Answering": "question-answering",
    "Reasoning": "reasoning",
    "Summarization": "summarization",
    "Text Classification": "text-classification",
    "Toxicity Detection": "toxicity-detection",
    "Translation": "translation",
    "Sentiment Analysis": "sentiment-analysis",
}
# Evaluation-setting display name -> setting id.  The id is appended to the
# task id (as "<task_id>-<setting_id>") to form a worksheet name; the empty id
# of "Normal" means the worksheet is named by the task id alone.
# NOTE: "Few-shot Leanring" is misspelled, but the same spelling is used as a
# lookup key in `task_w_settings`, so it is intentionally left unchanged.
settings = {
    "Normal": "",
    "Few-shot Leanring": "fs",
    "Prompt Strategy 0": "pt0",
    "Prompt Strategy 1": "pt1",
    "Prompt Strategy 2": "pt2",
    "Chain-of-Thought": "cot",
    "Fairness": "fairness",
    "Robustness": "robustness",
    "Random Order Choices": "randchoice",
}
task_w_settings = { | |
"Information Retrieval": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], | |
"Knowledge": ["Normal", "Few-shot Leanring", "Robustness", "Random Order Choices"], | |
"Language Modelling": ["Normal", "Few-shot Leanring", "Fairness"], | |
"Question Answering": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness", "Fairness"], | |
"Reasoning": ["Few-shot Leanring", "Chain-of-Thought"], | |
"Summarization": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness"], | |
"Text Classification": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], | |
"Toxicity Detection": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], | |
"Translation": ["Few-shot Leanring", "Robustness"], | |
"Sentiment Analysis": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"], | |
} | |
# Task id -> {dataset id: dataset display name}.  load_data() takes the dataset
# id from the text after the last "/" in a "Models/<dataset_id>" cell and uses
# this table to translate it into the display name.
datasets = {
    "question-answering": {
        "xquad_xtreme": "xQUAD EXTREME",
        "mlqa": "MLQA",
    },
    "summarization": {
        "vietnews": "VietNews",
        "wikilingua": "WikiLingua",
    },
    "text-classification": {
        "vsmec": "VSMEC",
        "phoatis": "PhoATIS",
    },
    "toxicity-detection": {
        "victsd": "UIT-ViCTSD",
        "vihsd": "UIT-ViHSD",
    },
    "translation": {
        "phomt-envi": "PhoMT English-Vietnamese",
        "phomt-vien": "PhoMT Vietnamese-English",
        "opus100-envi": "OPUS-100 English-Vietnamese",
        "opus100-vien": "OPUS-100 Vietnamese-English",
    },
    "sentiment-analysis": {
        "vlsp": "VLSP 2016",
        "vsfc": "UIT-VSFC",
    },
    "informationretrieval": {
        "mmarco": "mMARCO",
        "mrobust": "mRobust",
    },
    "knowledge": {
        "zaloe2e": "ZaloE2E",
        "vimmrc": "ViMMRC",
    },
    "language-modelling": {
        "mlqa-mlm": "MLQA",
        "vsec": "VSEC",
    },
    "reasoning": {
        # Disabled entries, kept for reference:
        # "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        # "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        # "srinduction-azr": "Synthetic Reasoning (Induction) - Azure",
        # "srinduction-gcp": "Synthetic Reasoning (Induction) - Google Cloud",
        # "srpattern-azr": "Synthetic Introduction (Pattern Match) - Azure",
        # "srpattern-gcp": "Synthetic Introduction (Pattern Match) - Google Cloud",
        # "srsubstitution-azr": "Synthetic Introduction (Variable Substitution) - Azure",
        # "srsubstitution-gcp": "Synthetic Introduction (Variable Substitution) - Google Cloud",
        # "math-azr-Algebra": "MATH Level 1 (Algebra) - Azure",
        # "math-azr-Counting&Probability": "MATH Level 1 (Counting&Probability) - Azure",
        # "math-azr-Geometry": "MATH Level 1 (Geometry) - Azure",
        # "math-azr-IntermediateAlgebra": "MATH Level 1 (IntermediateAlgebra) - Azure",
        # "math-azr-NumberTheory": "MATH Level 1 (NumberTheory) - Azure",
        # "math-azr-Prealgebra": "MATH Level 1 (Prealgebra) - Azure",
        # "math-azr-Precalculus": "MATH Level 1 (Precalculus) - Azure",
        # "math-gcp-Algebra": "MATH Level 1 (Algebra) - Google Cloud",
        # "math-gcp-Counting&Probability": "MATH Level 1 (Counting&Probability) - Google Cloud",
        # "math-gcp-Geometry": "MATH Level 1 (Geometry) - Google Cloud",
        # "math-gcp-IntermediateAlgebra": "MATH Level 1 (IntermediateAlgebra) - Google Cloud",
        # "math-gcp-NumberTheory": "MATH Level 1 (NumberTheory) - Google Cloud",
        # "math-gcp-Prealgebra": "MATH Level 1 (Prealgebra) - Google Cloud",
        # "math-gcp-Precalculus": "MATH Level 1 (Precalculus) - Google Cloud",
        "math-azr": "MATH Level 1 - Azure",
        "math-gcp": "MATH Level 1 - Google Cloud",
        "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        "srabstract-azr": "Synthetic Reasoning (Abstract Symbol)- Azure",
        "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol)- Google Cloud",
    },
}
def load_data(file_name):
    """
    Load the per-task, per-setting result tables from an Excel workbook.

    One worksheet is read for every (task, setting) pair listed in
    `task_w_settings`.  The worksheet is named "<task_id>-<setting_id>", or
    just "<task_id>" when the setting id is empty.  Inside a worksheet, each
    row whose first cell contains "Models/" starts a new dataset section; the
    text after the last "/" in that cell is the dataset id, and the rows up to
    the next such row (or the end of the sheet) are that dataset's results.

    Args:
        file_name: Path (or any object accepted by `pandas.read_excel`) of
            the workbook to read.

    Returns:
        A dict keyed by f"{task_id}-{setting_id}" -- note that the key keeps
        its trailing "-" when the setting id is empty -- whose values are
        dicts mapping a dataset display name to a DataFrame.  Each DataFrame
        uses the first row of its worksheet as column names (first column
        renamed to "Models") and has missing cells replaced by "-".

    Raises:
        KeyError: If an expected worksheet is missing from the workbook, or a
            dataset id found in a worksheet is not listed in `datasets`.
    """
    # sheet_name=None loads every worksheet into a dict keyed by sheet name;
    # header=None keeps the "Models/<dataset_id>" marker rows as ordinary data.
    data = pd.read_excel(
        file_name,
        sheet_name=None,
        header=None
    )

    results = {}
    for task_name, task_id in tasks.items():
        for setting_name in task_w_settings[task_name]:
            setting_id = settings[setting_name]
            sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id
            sheet_data = data[sheet_name]
            # The result key deliberately differs from the sheet name: it
            # always contains the "-" separator, even for an empty setting id.
            result_key = f"{task_id}-{setting_id}"

            if sheet_data.empty:
                # Nothing to split; avoid indexing into a sheet with no cells.
                results[result_key] = {}
                continue

            # Positions of the rows that open a dataset section.  Non-string
            # cells (e.g. NaN from a blank row) cannot be markers and would
            # make the substring test raise TypeError, so they are skipped.
            row_ids = [
                position
                for position, cell in enumerate(sheet_data.iloc[:, 0])
                if isinstance(cell, str) and "Models/" in cell
            ]
            # Sentinel so that the last section runs to the end of the sheet.
            row_ids.append(len(sheet_data))

            # Column names come from the first row of the sheet and are shared
            # by all of its sections.  Work on a copy so the sheet is not
            # modified through the row that was read from it.
            header = sheet_data.iloc[0].copy()
            header.iloc[0] = "Models"

            results_by_dataset = {}
            for start, stop in zip(row_ids, row_ids[1:]):
                dataset_id = sheet_data.iloc[start, 0].split('/')[-1]
                dataset_name = datasets[task_id][dataset_id]
                section = sheet_data.iloc[start + 1: stop].fillna('-')
                # Rebuild the frame from raw values so that the index restarts
                # at 0 and the columns carry the header names.
                results_by_dataset[dataset_name] = pd.DataFrame(
                    section.values, columns=header)

            results[result_key] = results_by_dataset
    return results
# Load all result tables once, when the module is executed.
# NOTE: the variable name is misspelled ("resutls"); it is kept unchanged
# because code outside this view may refer to it by this name.
resutls = load_data(RESULT_FILE)