Spaces:
Running
Running
martinakaduc
committed on
Commit
•
4af8ee7
1
Parent(s):
ef06837
Update code
Browse files- Logo BK.png +0 -0
- Logo Stanford.png +0 -0
- Logo VNU-HCM.png +0 -0
- app.py +45 -0
- data_loader.py +175 -0
- evaluation_results.xlsx +0 -0
- requirements.txt +1 -0
Logo BK.png
ADDED
Logo Stanford.png
ADDED
Logo VNU-HCM.png
ADDED
app.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from data_loader import (
    resutls,
    metric_ud,
    tasks,
    settings,
    task_w_settings,
    datasets
)

if __name__ == "__main__":
    # Page chrome must be configured before any other Streamlit call.
    st.set_page_config(
        page_title="URA-LLaMa Evaluation Dashboard",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    # Institutional logos, then the dashboard title and a short blurb.
    st.image(
        ["Logo BK.png", "Logo VNU-HCM.png", "Logo Stanford.png"], width=120)
    st.title("URA-LLaMa Evaluation Dashboard")
    st.write(
        "This dashboard is used to visualize the results of the URA-LLaMa evaluation.")

    # Sidebar controls: pick a task, then one of its settings, then a dataset.
    chosen_task = st.sidebar.selectbox(
        "Select Task",
        list(tasks)
    )

    chosen_setting = st.sidebar.selectbox(
        "Select Setting",
        task_w_settings[chosen_task]
    )

    task_key = tasks[chosen_task]
    chosen_dataset = st.sidebar.selectbox(
        "Select Dataset",
        list(datasets[task_key].values())
    )

    # Sheets are keyed "<task id>-<setting id>" by the loader.
    sheet_key = "-".join([task_key, settings[chosen_setting]])
    sheet = resutls[sheet_key][chosen_dataset]

    # Render the selected result table (a pandas DataFrame).
    st.dataframe(sheet)
|
data_loader.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import numpy as np
|
3 |
+
|
4 |
+
# Source workbook: one sheet per (task, setting) combination.
RESULT_FILE = 'evaluation_results.xlsx'

# Direction of improvement for each metric:
# +1 means higher is better, -1 means lower is better.
metric_ud = {
    "Accuracy": 1,
    "Average Exact Match": 1,
    "Exact Match": 1,
    "F1 Score": 1,
    "AUC ROC": 1,
    "AUC PR": 1,
    "Precision": 1,
    "Recall": 1,
    "Equivalent": 1,
    "Bias": -1,
    "Toxicity": -1,
    "ROUGE-1": 1,
    "ROUGE-2": 1,
    "ROUGE-L": 1,
    "BLEU": 1,
    "SummaC": 1,
    "BERTScore": 1,
    "Coverage": 1,
    "Density": 1,
    "Compression": 1,
    "hLEPOR": 1,
    "Character Error Rate": -1,
    "Word Error Rate": -1,
    "Character Edit Distance": -1,
    "Word Edit Distance": -1,
    "Perplexity": -1,
    "Expected Calibration Error": -1,
    "acc@10": 1,
    "MRR@10 (Top 30)": 1,
    "NDCG@10 (Top 30)": 1,
    "MRR@10": 1,
    "NDCG@10": 1,
}

# Human-readable task name -> task id (sheet-name prefix and `datasets` key).
tasks = {
    "Information Retrieval": "informationretrieval",
    "Knowledge": "knowledge",
    "Language Modelling": "language-modelling",
    "Question Answering": "question-answering",
    "Reasoning": "reasoning",
    "Summarization": "summarization",
    "Text Classification": "text-classification",
    "Toxicity Detection": "toxicity-detection",
    "Translation": "translation",
    "Sentiment Analysis": "sentiment-analysis",
}

# Human-readable setting name -> sheet-name suffix ("" means no suffix,
# i.e. the sheet is named after the task id alone).
settings = {
    "Normal": "",
    "Few-shot Learning": "fs",
    "Prompt Strategy 0": "pt0",
    "Prompt Strategy 1": "pt1",
    "Prompt Strategy 2": "pt2",
    "Chain-of-Thought": "cot",
    "Fairness": "fairness",
    "Robustness": "robustness",
}

# Settings evaluated for each task; every entry must be a key of `settings`.
task_w_settings = {
    "Information Retrieval": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Knowledge": ["Normal", "Few-shot Learning", "Robustness"],
    "Language Modelling": ["Normal", "Few-shot Learning", "Fairness"],
    "Question Answering": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness", "Fairness"],
    "Reasoning": ["Few-shot Learning", "Chain-of-Thought"],
    "Summarization": ["Prompt Strategy 0", "Prompt Strategy 1", "Prompt Strategy 2", "Robustness"],
    "Text Classification": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Toxicity Detection": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
    "Translation": ["Few-shot Learning", "Robustness"],
    "Sentiment Analysis": ["Normal", "Few-shot Learning", "Robustness", "Fairness"],
}

# task id -> {dataset id (the part after "Models/" in a sheet's section
# header row) -> display name shown in the dashboard}.
datasets = {
    "question-answering": {
        "xquad_xtreme": "xQUAD EXTREME",
        "mlqa": "MLQA",
    },
    "summarization": {
        "vietnews": "VietNews",
        "wikilingua": "WikiLingua",
    },
    "text-classification": {
        "vsmec": "VSMEC",
        "phoatis": "PhoATIS",
    },
    "toxicity-detection": {
        "victsd": "UIT-ViCTSD",
        "vihsd": "UIT-ViHSD",
    },
    "translation": {
        "phomt-envi": "PhoMT English-Vietnamese",
        "phomt-vien": "PhoMT Vietnamese-English",
        "opus100-envi": "OPUS-100 English-Vietnamese",
        "opus100-vien": "OPUS-100 Vietnamese-English",
    },
    "sentiment-analysis": {
        "vlsp": "VLSP 2016",
        "vsfc": "UIT-VSFC",
    },
    "informationretrieval": {
        "mmarco": "mMARCO",
        "mrobust": "mRobust",
    },
    "knowledge": {
        "zaloe2e": "ZaloE2E",
        "vimmrc": "ViMMRC",
    },
    "language-modelling": {
        "mlqa": "MLQA",
        "vsec": "VSEC",
    },
    "reasoning": {
        "srnatural-azr": "Synthetic Reasoning (Natural) - Azure",
        "srnatural-gcp": "Synthetic Reasoning (Natural) - Google Cloud",
        "srabstract-azr": "Synthetic Reasoning (Abstract Symbol) - Azure",
        "srabstract-gcp": "Synthetic Reasoning (Abstract Symbol) - Google Cloud",
        "math-azr": "MATH Level 1 - Azure",
        "math-gcp": "MATH Level 1 - Google Cloud",
    },
}
|
126 |
+
|
127 |
+
|
128 |
+
def load_data(file_name):
    """Load every evaluation sheet from the Excel workbook at *file_name*.

    Each workbook sheet is named ``"<task_id>-<setting_id>"`` (or just
    ``"<task_id>"`` when the setting has no suffix) and contains one or
    more dataset sections.  A section starts with a row whose first cell
    reads ``"Models/<dataset_id>"`` and runs until the next such row.

    Returns:
        dict: ``{"<task_id>-<setting_id>": {dataset display name:
        pd.DataFrame}}`` where each DataFrame holds the section's rows
        under the sheet's first-row header (first column renamed
        ``"Models"``).
    """
    # sheet_name=None loads all sheets; header=None keeps row 0 as data
    # because it doubles as the first dataset's section marker.
    data = pd.read_excel(
        file_name,
        sheet_name=None,
        header=None
    )
    results = {}
    for task_name, task_id in tasks.items():
        for setting_name in task_w_settings[task_name]:
            setting_id = settings[setting_name]
            sheet_name = f"{task_id}-{setting_id}" if setting_id else task_id
            sheet_data = data[sheet_name]

            # Rows whose first cell contains "Models/" mark the start of a
            # dataset section.  Guard against non-string cells (NaN) which
            # would make the `in` test raise TypeError.
            row_ids = [
                i for i, row in sheet_data.iterrows()
                if isinstance(row[0], str) and "Models/" in row[0]
            ]
            # Sentinel so the last section extends to the end of the sheet.
            row_ids.append(len(sheet_data))

            # The column header is shared by every section in this sheet;
            # copy it so renaming the first column does not mutate the
            # underlying sheet frame through a view.
            header = sheet_data.iloc[0].copy()
            header[0] = "Models"

            results_by_dataset = {}
            for start, end in zip(row_ids, row_ids[1:]):
                # "Models/<dataset_id>" -> "<dataset_id>"
                dataset_id = sheet_data.iloc[start][0].split('/')[-1]
                dataset_name = datasets[task_id][dataset_id]

                section = sheet_data.iloc[start + 1: end].fillna('')
                results_by_dataset[dataset_name] = pd.DataFrame(
                    section.values, columns=header)

            results[f"{task_id}-{setting_id}"] = results_by_dataset

    return results
|
173 |
+
|
174 |
+
|
175 |
+
# Results loaded once at import time.  NOTE: the misspelled name "resutls"
# is kept because app.py imports it; prefer the correctly spelled alias.
resutls = load_data(RESULT_FILE)
results = resutls
|
evaluation_results.xlsx
ADDED
Binary file (141 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
openpyxl
|