Adam Jirkovsky committed
Commit · b2e7d0b
Parent(s): 54b05ee

Rename headers during data loading

Files changed:
- src/display/utils.py +40 -18
- src/populate.py +2 -1
src/display/utils.py CHANGED

@@ -47,30 +47,52 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
 """
 
-auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("
-auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["eval_name", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
+auto_eval_column_dict.append(["hf_model_id", ColumnContent, ColumnContent("Model link (temporary)", "str", True)])
+auto_eval_column_dict.append(["agree_cs", ColumnContent, ColumnContent("AGREE", "number", True)])
+auto_eval_column_dict.append(["anli_cs", ColumnContent, ColumnContent("ANLI", "number", True)])
+auto_eval_column_dict.append(["arc_challenge_cs", ColumnContent, ColumnContent("ARC-Challenge", "number", True)])
+auto_eval_column_dict.append(["arc_easy_cs", ColumnContent, ColumnContent("ARC-Easy", "number", True)])
+auto_eval_column_dict.append(["belebele_cs", ColumnContent, ColumnContent("Belebele", "number", True)])
+auto_eval_column_dict.append(["ctkfacts_cs", ColumnContent, ColumnContent("CTKFacts", "number", True)])
+auto_eval_column_dict.append(["czechnews_cs", ColumnContent, ColumnContent("Czech News", "number", True)])
+auto_eval_column_dict.append(["fb_comments_cs", ColumnContent, ColumnContent("Facebook Comments", "number", True)])
+auto_eval_column_dict.append(["gsm8k_cs", ColumnContent, ColumnContent("GSM8K", "number", True)])
+auto_eval_column_dict.append(["klokanek_cs", ColumnContent, ColumnContent("Klokanek", "number", True)])
+auto_eval_column_dict.append(["mall_reviews_cs", ColumnContent, ColumnContent("Mall Reviews", "number", True)])
+auto_eval_column_dict.append(["mmlu_cs", ColumnContent, ColumnContent("MMLU", "number", True)])
+auto_eval_column_dict.append(["sqad_cs", ColumnContent, ColumnContent("SQAD", "number", True)])
+auto_eval_column_dict.append(["subjectivity_cs", ColumnContent, ColumnContent("Subjectivity", "number", True)])
+auto_eval_column_dict.append(["truthfulqa_cs", ColumnContent, ColumnContent("TruthfulQA", "number", True)])
 
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 
+HEADER_MAP = {
+    "eval_name": "Model",
+    "precision": "Precision",
+    "hf_model_id": "Model link (temporary)",
+    "agree_cs": "AGREE",
+    "anli_cs": "ANLI",
+    "arc_challenge_cs": "ARC-Challenge",
+    "arc_easy_cs": "ARC-Easy",
+    "belebele_cs": "Belebele",
+    "ctkfacts_cs": "CTKFacts",
+    "czechnews_cs": "Czech News",
+    "fb_comments_cs": "Facebook Comments",
+    "gsm8k_cs": "GSM8K",
+    "klokanek_cs": "Klokanek",
+    "mall_reviews_cs": "Mall Reviews",
+    "mmlu_cs": "MMLU",
+    "sqad_cs": "SQAD",
+    "subjectivity_cs": "Subjectivity",
+    "truthfulqa_cs": "TruthfulQA",
+}
+
+
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
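For reference, a minimal runnable sketch (not part of the commit) of how make_dataclass consumes the auto_eval_column_dict entries above; the ColumnContent class here is a simplified stand-in, since its real definition sits earlier in src/display/utils.py and is not shown in this diff:

# Minimal sketch, assuming a simplified ColumnContent definition.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
    dummy: bool = False

# Two representative entries from the list built in the diff above.
auto_eval_column_dict = [
    ["eval_name", ColumnContent, ColumnContent("Model", "str", True, never_hidden=True)],
    ["mmlu_cs", ColumnContent, ColumnContent("MMLU", "number", True)],
]

# make_dataclass exposes each entry as a class attribute whose default is the
# ColumnContent instance, so display headers are read as AutoEvalColumn.<key>.name.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.eval_name.name)  # Model
print(AutoEvalColumn.mmlu_cs.name)    # MMLU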
src/populate.py CHANGED

@@ -4,7 +4,7 @@ import numpy as np
 import pandas as pd
 
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import AutoEvalColumn, EvalQueueColumn, HEADER_MAP
 from src.leaderboard.read_evals import get_raw_eval_results
 
 
@@ -13,6 +13,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     #all_data_json = [v.to_dict() for v in raw_data]
     df = pd.DataFrame.from_records(raw_data)
     #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    df = df.rename(columns=HEADER_MAP)
     df = df[cols].round(decimals=2)
     df.replace(r'\s+', np.nan, regex=True)
     # filter out if any of the benchmarks have not been produced
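The net effect of the populate.py change: raw result records arrive keyed by the internal task names, and the new rename step relabels them to the display headers before the df[cols] selection. A minimal sketch with made-up values (the model name and scores are illustrative, not real leaderboard data):

# Minimal sketch of the new rename-during-loading step.
import pandas as pd

HEADER_MAP = {"eval_name": "Model", "mmlu_cs": "MMLU", "truthfulqa_cs": "TruthfulQA"}

raw_data = [{"eval_name": "org/model-7b", "mmlu_cs": 41.237, "truthfulqa_cs": 38.905}]

df = pd.DataFrame.from_records(raw_data)
df = df.rename(columns=HEADER_MAP)                           # headers renamed during loading
df = df[["Model", "MMLU", "TruthfulQA"]].round(decimals=2)   # selection now uses display names
print(list(df.columns))    # ['Model', 'MMLU', 'TruthfulQA']
print(df.iloc[0]["MMLU"])  # 41.24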