# NOTE(review): the lines "Spaces: / Running / Running" here were extraction
# artifacts from the hosting page (Hugging Face Spaces header), not code.
import numpy as np
import pandas as pd
RESULT_FILE = 'evaluation_results.xlsx' | |
metric_ud = { | |
"Accuracy": 1, | |
"Average Exact Match": 1, | |
"Exact Match": 1, | |
"F1 Score": 1, | |
"AUC ROC": 1, | |
"AUC PR": 1, | |
"Precision": 1, | |
"Recall": 1, | |
"Equivalent": 1, | |
"Bias": -1, | |
"Toxicity": -1, | |
"ROUGE-1": 1, | |
"ROUGE-2": 1, | |
"ROUGE-L": 1, | |
"BLEU": 1, | |
"SummaC": 1, | |
"BERTScore": 1, | |
"Coverage": 1, | |
"Density": 1, | |
"Compression": 1, | |
"hLEPOR": 1, | |
"Character Error Rate": -1, | |
"Word Error Rate": -1, | |
"Character Edit Distance": -1, | |
"Word Edit Distance": -1, | |
"Perplexity": -1, | |
"Expected Calibration Error": -1, | |
"acc@10": 1, | |
"MRR@10 (Top 30)": 1, | |
"NDCG@10 (Top 30)": 1, | |
"MRR@10": 1, | |
"NDCG@10": 1, | |
} | |
# Display name -> task id; the task id is used to build sheet names in the
# results workbook and as the first-level key into `datasets`.
tasks = {
    "Information Retrieval": "informationretrieval",
    "Knowledge": "knowledge",
    "Language Modelling": "language-modelling",
    "Question Answering": "question-answering",
    "Reasoning": "reasoning",
    "Summarization": "summarization",
    "Text Classification": "text-classification",
    "Toxicity Detection": "toxicity-detection",
    "Translation": "translation",
    "Sentiment Analysis": "sentiment-analysis",
}
# Display name -> sheet-name suffix for each evaluation setting.  An empty
# suffix ("Normal") means the sheet is named by the task id alone.
# NOTE(review): "Few-shot Leanring" is a typo for "Few-shot Learning", but the
# misspelled string is used consistently as a key here and in
# `task_w_settings` (and possibly by code outside this file), so it is kept
# as-is; fixing it requires a coordinated rename.
settings = {
    "Normal": "",
    "Few-shot Leanring": "fs",
    "Prompt Strategy 0": "pt0",
    "Prompt Strategy 1": "pt1",
    "Prompt Strategy 2": "pt2",
    "Chain-of-Thought": "cot",
    "Fairness": "fairness",
    "Robustness": "robustness",
}

# Task display name -> list of setting display names evaluated for that task.
# Every entry must be a key of `settings`.
task_w_settings = {
    "Information Retrieval": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"],
    "Knowledge": ["Normal", "Few-shot Leanring", "Robustness"],
    "Language Modelling": ["Normal", "Few-shot Leanring", "Fairness"],
    "Question Answering": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness", "Fairness"],
    "Reasoning": ["Few-shot Leanring", "Chain-of-Thought"],
    "Summarization": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness"],
    "Text Classification": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"],
    "Toxicity Detection": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"],
    "Translation": ["Few-shot Leanring", "Robustness"],
    "Sentiment Analysis": ["Normal", "Few-shot Leanring", "Robustness", "Fairness"],
}
# task id -> {dataset id -> dataset display name}.  Dataset ids are the part
# after "Models/" in the separator rows of each result sheet.
datasets = {
    "question-answering": {
        "xquad_xtreme": "xQUAD EXTREME",
        "mlqa": "MLQA",
    },
    "summarization": {
        "vietnews": "VietNews",
        "wikilingua": "WikiLingua",
    },
    "text-classification": {
        "vsmec": "VSMEC",
        "phoatis": "PhoATIS",
    },
    "toxicity-detection": {
        "victsd": "UIT-ViCTSD",
        "vihsd": "UIT-ViHSD",
    },
    "translation": {
        "phomt-envi": "PhoMT English-Vietnamese",
        "phomt-vien": "PhoMT Vietnamese-English",
        "opus100-envi": "OPUS-100 English-Vietnamese",
        "opus100-vien": "OPUS-100 Vietnamese-English",
    },
    "sentiment-analysis": {
        "vlsp": "VLSP 2016",
        "vsfc": "UIT-VSFC",
    },
    "informationretrieval": {
        "mmarco": "mMARCO",
        "mrobust": "mRobust",
    },
    "knowledge": {
        "zaloe2e": "ZaloE2E",
        "vimmrc": "ViMMRC",
    },
    "language-modelling": {
        "mlqa": "MLQA",
        "vsec": "VSEC",
    },
    "reasoning": {
        "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        "srabstract-azr": "Synthetic Reasoning (Abstract Symbol)- Azure",
        "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol)- Google Cloud",
        "math-azr": "MATH Level 1 - Azure",
        "math-gcp": "MATH Level 1 - Google Cloud",
    },
}
def load_data(file_name):
    """Load evaluation results from an Excel workbook.

    The workbook has one sheet per (task, setting) pair, named
    ``"<task_id>-<setting_id>"`` (or just ``"<task_id>"`` when the setting
    suffix is empty).  Inside a sheet, rows whose first cell contains
    ``"Models/<dataset_id>"`` act as section separators: each separator row
    starts the block of result rows for that dataset.

    Parameters
    ----------
    file_name : str or path-like
        Path to the results workbook (e.g. ``RESULT_FILE``).

    Returns
    -------
    dict
        Maps ``"<task_id>-<setting_id>"`` (note: a trailing ``"-"`` when the
        setting suffix is empty — kept for compatibility with existing
        consumers) to ``{dataset display name: DataFrame}``, where each
        DataFrame holds that dataset's rows with NaNs replaced by ``''`` and
        columns taken from the sheet's first row (first column renamed to
        ``"Models"``).
    """
    # sheet_name=None loads every sheet into a {name: DataFrame} dict;
    # header=None keeps row 0 as data so we can use it as the header below.
    data = pd.read_excel(file_name, sheet_name=None, header=None)
    results = {}
    for task_name, task_id in tasks.items():
        for setting_name in task_w_settings[task_name]:
            setting_id = settings[setting_name]
            sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id
            sheet_data = data[sheet_name]
            # Separator rows: first cell contains "Models/<dataset_id>".
            # The isinstance guard skips NaN cells, which are floats and
            # would make the `in` test raise TypeError.
            row_ids = [
                i for i, cell in sheet_data[0].items()
                if isinstance(cell, str) and "Models/" in cell
            ]
            row_ids.append(len(sheet_data))
            # Build the shared column header once per sheet.  .copy() avoids
            # mutating the cached sheet in `data` (the original chained
            # assignment wrote "Models" back into the sheet itself).
            header = sheet_data.iloc[0].copy()
            header[0] = "Models"
            results_by_dataset = {}
            # Each dataset block spans (separator row, next separator row).
            for start, end in zip(row_ids, row_ids[1:]):
                dataset_id = sheet_data.iloc[start][0].split('/')[-1]
                dataset_name = datasets[task_id][dataset_id]
                dataset_data = sheet_data.iloc[start + 1:end].fillna('')
                results_by_dataset[dataset_name] = pd.DataFrame(
                    dataset_data.values, columns=header)
            results[f"{task_id}-{setting_id}"] = results_by_dataset
    return results
resutls = load_data(RESULT_FILE) | |