Calculate results directly from the source
functions.py  CHANGED  (+121 -27)
@@ -1,5 +1,7 @@
 import gradio as gr
-import
+import numpy as np
+import urllib3
+from bs4 import BeautifulSoup
 from datasets import load_dataset
 from huggingface_hub import (
     CommitOperationAdd,
@@ -11,16 +13,114 @@ from huggingface_hub import (
 from huggingface_hub.repocard_data import eval_results_to_model_index
 from pytablewriter import MarkdownTableWriter
 
-COMMIT_DESCRIPTION = """This is an automated PR created with https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard
+COMMIT_DESCRIPTION = """This is an automated PR created with [this space](https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard)!
 
 The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
 
 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""
 
 
-def
-
-
+def normalize_within_range(value, lower_bound=0, higher_bound=1):
+    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
+
+
+def calculate_results(repo: str, pool: urllib3.PoolManager):
+    try:
+        base_url = f"https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/{repo}"
+        html = pool.request("GET", base_url).data
+        soup = BeautifulSoup(html, "html.parser")
+        dl_link = soup.find_all(title="Download file")[-1]["href"]
+        data = pool.request("GET", f"https://huggingface.co{dl_link}").json()
+
+        del base_url
+        del html
+        del soup
+        del dl_link
+
+        model_name = data["model_name"]
+        precision = data["config"]["model_dtype"]
+        revision = data["config"]["model_revision"]
+
+        # Normalize BBH subtasks scores
+        bbh_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_bbh"]:
+            num_choices = len(data["configs"][subtask_key]["doc_to_choice"])
+            if subtask_key in data["results"]:
+                bbh_raw_score = data["results"][subtask_key]["acc_norm,none"]
+                lower_bound = 1 / num_choices
+                normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
+                bbh_scores.append(normalized_score)
+
+        # Average BBH score
+        bbh_score = sum(bbh_scores) / len(bbh_scores)
+
+        # Calculate the MATH score
+        math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
+        math_score = normalize_within_range(math_raw_score, 0, 1.0)
+
+        # Normalize GPQA scores
+        gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
+        gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+
+        # Normalize MMLU PRO scores
+        mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+        mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+
+        # Compute IFEval
+        ifeval_inst_score = (
+            data["results"]["leaderboard_ifeval"]["inst_level_strict_acc,none"] * 100
+        )
+        ifeval_prompt_score = (
+            data["results"]["leaderboard_ifeval"]["prompt_level_strict_acc,none"] * 100
+        )
+
+        # Average IFEval scores
+        ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+
+        # Normalize MUSR scores
+        musr_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_musr"]:
+            subtask_config = data["configs"][subtask_key]
+            dataset = load_dataset(subtask_config["dataset_path"], split=subtask_config["test_split"])
+            num_choices = max(len(eval(question["choices"])) for question in dataset)
+            musr_raw_score = data["results"][subtask_key]["acc_norm,none"]
+            lower_bound = 1 / num_choices
+            normalized_score = normalize_within_range(musr_raw_score, lower_bound, 1.0)
+
+            musr_scores.append(normalized_score)
+            del dataset
+
+        musr_score = sum(musr_scores) / len(musr_scores)
+
+        # Calculate overall score
+        overall_score = (
+            bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+        ) / 6
+
+        # Round all scores to 2 decimal places
+        bbh_score = float(round(bbh_score, 2))
+        math_score = float(round(math_score, 2))
+        gpqa_score = float(round(gpqa_score, 2))
+        mmlu_pro_score = float(round(mmlu_pro_score, 2))
+        musr_score = float(round(musr_score, 2))
+        ifeval_score = float(round(ifeval_score, 2))
+        overall_score = float(round(overall_score, 2))
+        results = {
+            "Model": model_name,
+            "Precision": precision,
+            "Revision": revision,
+            "Average": overall_score,
+            "IFEval": ifeval_score,
+            "BBH": bbh_score,
+            "MATH Lvl 5": math_score,
+            "GPQA": gpqa_score,
+            "MUSR": musr_score,
+            "MMLU-PRO": mmlu_pro_score,
+        }
+        # pprint(results, sort_dicts=False)
+        return results
+    except Exception:  # likely will be from no results being available
+        return None
 
 
 def get_details_url(repo):
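normalize_within_range rescales a raw accuracy so the task's random-guess floor maps to 0 and a perfect score maps to 100, clipping below-chance results to 0. A minimal standalone sketch, assuming a 4-choice task whose floor is 1/4 = 0.25:

import numpy as np

def normalize_within_range(value, lower_bound=0, higher_bound=1):
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100

print(normalize_within_range(0.25, 0.25, 1.0))   # 0.0  -> chance-level accuracy scores zero
print(normalize_within_range(0.625, 0.25, 1.0))  # 50.0 -> halfway between chance and perfect
print(normalize_within_range(0.10, 0.25, 1.0))   # 0.0  -> below-chance results are clipped

calculate_results applies floors of 0.25 for GPQA, 0.1 for MMLU-PRO, and per-subtask floors of 1 / number of answer choices for BBH and MuSR, then averages the six normalized scores into "Average".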
@@ -42,10 +142,9 @@ def get_task_summary(results):
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value":
+            "metric_value": results["IFEval"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
@@ -53,10 +152,9 @@ def get_task_summary(results):
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["BBH"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
@@ -64,10 +162,9 @@ def get_task_summary(results):
             "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
-            "metric_value":
+            "metric_value": results["MATH Lvl 5"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
         },
@@ -75,10 +172,9 @@ def get_task_summary(results):
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["GPQA"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
@@ -86,7 +182,7 @@ def get_task_summary(results):
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["MUSR"],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
@@ -96,7 +192,7 @@ def get_task_summary(results):
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value":
+            "metric_value": results["MMLU-PRO"],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
@@ -105,12 +201,11 @@ def get_task_summary(results):
     }
 
 
-def get_eval_results(
-    results = search(df, repo)
+def get_eval_results(repo: str, results: dict):
     task_summary = get_task_summary(results)
     table = MarkdownTableWriter()
-    table.headers = ["Metric", "%
-    table.value_matrix = [["
+    table.headers = ["Metric", "Value (%)"]
+    table.value_matrix = [["**Average**", f"**{results['Average']}**"]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]
 
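get_eval_results renders those values with pytablewriter. A small sketch of the same table construction, using made-up scores shaped like the dict calculate_results returns (every number here is a placeholder):

from pytablewriter import MarkdownTableWriter

results = {"Average": 30.12, "IFEval": 45.3, "BBH": 34.6, "MATH Lvl 5": 18.4}  # placeholder scores

table = MarkdownTableWriter()
table.headers = ["Metric", "Value (%)"]
table.value_matrix = [["**Average**", f"**{results['Average']}**"]] + [
    [name, score] for name, score in results.items() if name != "Average"
]
print(table.dumps())  # Markdown table text, ready to append to the model card body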
@@ -123,15 +218,14 @@ Summarized results can be found [here]({get_contents_url(repo)})!
     return text
 
 
-def get_edited_yaml_readme(
+def get_edited_yaml_readme(repo: str, results: dict, token: str | None):
     card = ModelCard.load(repo, token=token)
-    results = search(df, repo)
 
     common = {
         "task_type": "text-generation",
         "task_name": "Text Generation",
         "source_name": "Open LLM Leaderboard",
-        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard
+        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search={repo.replace('/', '%2F')}",
     }
 
     tasks_results = get_task_summary(results)
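get_edited_yaml_readme feeds the task summaries into eval_results_to_model_index to build the card's model-index metadata. A rough sketch of that step with a single IFEval entry, assuming huggingface_hub's EvalResult dataclass; the model name and metric value are placeholders, not real results:

from huggingface_hub import EvalResult
from huggingface_hub.repocard_data import eval_results_to_model_index

# One IFEval entry from get_task_summary(), rebuilt as an EvalResult (placeholder value).
eval_results = [
    EvalResult(
        task_type="text-generation",
        task_name="Text Generation",
        dataset_type="wis-k/instruction-following-eval",
        dataset_name="IFEval (0-Shot)",
        dataset_split="train",
        dataset_args={"num_few_shot": 0},
        metric_type="inst_level_strict_acc and prompt_level_strict_acc",
        metric_name="averaged accuracy",
        metric_value=45.3,
        source_name="Open LLM Leaderboard",
        source_url="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard",
    )
]

# Produces the list stored under `model-index:` in the card's YAML front matter.
print(eval_results_to_model_index("my-org/my-model", eval_results))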
@@ -167,8 +261,8 @@ def commit(
     else:
         token = oauth_token
 
-
-
+    with urllib3.PoolManager() as pool:
+        results = calculate_results(repo, pool)
 
     if repo.startswith("https://huggingface.co/"):
         try:
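commit() now computes the scores once and hands them to both readme helpers. The same lookup works on its own; a sketch, assuming functions.py is importable as a module and using a placeholder repo id:

import urllib3

from functions import calculate_results  # assumes this file is importable as `functions`

repo = "some-org/some-model"  # placeholder repo id

with urllib3.PoolManager() as pool:
    results = calculate_results(repo, pool)

if results is None:  # calculate_results() returns None when no leaderboard results are found
    print(f"No Open LLM Leaderboard results found for {repo}")
else:
    print(results["Average"], results["IFEval"], results["BBH"])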
@@ -181,11 +275,11 @@ def commit(
     try:
         try: # check if there is a readme already
             readme_text = get_edited_yaml_readme(
-
-            ) + get_eval_results(
+                repo, results, token=token
+            ) + get_eval_results(repo, results)
         except Exception as e:
             if "Repo card metadata block was not found." in str(e): # There is no readme
-                readme_text = get_edited_yaml_readme(
+                readme_text = get_edited_yaml_readme(repo, results, token=token)
             else:
                 print(f"Something went wrong: {e}")
 