T145 committed on
Commit 15e129d · 1 Parent(s): 4f90373

Calculate results directly from the source
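At a glance: instead of loading the pre-aggregated open-llm-leaderboard/contents dataset into pandas and looking the model up there, the Space now downloads the model's latest results file from the open-llm-leaderboard/results dataset and recomputes the normalized scores itself. A minimal usage sketch of the new entry point defined in the diff below (assuming functions.py is importable as a module; the repo id is purely illustrative):

import urllib3

from functions import calculate_results  # the module changed in this commit

with urllib3.PoolManager() as pool:
    # Hypothetical repo id, used only for illustration
    results = calculate_results("example-org/example-model", pool)

# calculate_results returns None when no leaderboard results exist for the repo
if results is not None:
    print(results["Average"], results["IFEval"], results["MMLU-PRO"])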

Files changed (1)
  1. functions.py +121 -27
functions.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
-import pandas as pd
+import numpy as np
+import urllib3
+from bs4 import BeautifulSoup
 from datasets import load_dataset
 from huggingface_hub import (
     CommitOperationAdd,
@@ -11,16 +13,114 @@ from huggingface_hub import (
 from huggingface_hub.repocard_data import eval_results_to_model_index
 from pytablewriter import MarkdownTableWriter

-COMMIT_DESCRIPTION = """This is an automated PR created with https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard
+COMMIT_DESCRIPTION = """This is an automated PR created with [this space](https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard)!

 The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.

 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""


-def search(df, value):
-    result_df = df[df["fullname"] == value]
-    return result_df.iloc[0].to_dict() if not result_df.empty else None
+def normalize_within_range(value, lower_bound=0, higher_bound=1):
+    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
+
+
+def calculate_results(repo: str, pool: urllib3.PoolManager):
+    try:
+        base_url = f"https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/{repo}"
+        html = pool.request("GET", base_url).data
+        soup = BeautifulSoup(html, "html.parser")
+        dl_link = soup.find_all(title="Download file")[-1]["href"]
+        data = pool.request("GET", f"https://huggingface.co{dl_link}").json()
+
+        del base_url
+        del html
+        del soup
+        del dl_link
+
+        model_name = data["model_name"]
+        precision = data["config"]["model_dtype"]
+        revision = data["config"]["model_revision"]
+
+        # Normalize BBH subtasks scores
+        bbh_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_bbh"]:
+            num_choices = len(data["configs"][subtask_key]["doc_to_choice"])
+            if subtask_key in data["results"]:
+                bbh_raw_score = data["results"][subtask_key]["acc_norm,none"]
+                lower_bound = 1 / num_choices
+                normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
+                bbh_scores.append(normalized_score)
+
+        # Average BBH score
+        bbh_score = sum(bbh_scores) / len(bbh_scores)
+
+        # Calculate the MATH score
+        math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
+        math_score = normalize_within_range(math_raw_score, 0, 1.0)
+
+        # Normalize GPQA scores
+        gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
+        gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+
+        # Normalize MMLU PRO scores
+        mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+        mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+
+        # Compute IFEval
+        ifeval_inst_score = (
+            data["results"]["leaderboard_ifeval"]["inst_level_strict_acc,none"] * 100
+        )
+        ifeval_prompt_score = (
+            data["results"]["leaderboard_ifeval"]["prompt_level_strict_acc,none"] * 100
+        )
+
+        # Average IFEval scores
+        ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+
+        # Normalize MUSR scores
+        musr_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_musr"]:
+            subtask_config = data["configs"][subtask_key]
+            dataset = load_dataset(subtask_config["dataset_path"], split=subtask_config["test_split"])
+            num_choices = max(len(eval(question["choices"])) for question in dataset)
+            musr_raw_score = data["results"][subtask_key]["acc_norm,none"]
+            lower_bound = 1 / num_choices
+            normalized_score = normalize_within_range(musr_raw_score, lower_bound, 1.0)
+
+            musr_scores.append(normalized_score)
+            del dataset
+
+        musr_score = sum(musr_scores) / len(musr_scores)
+
+        # Calculate overall score
+        overall_score = (
+            bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+        ) / 6
+
+        # Round all scores to 2 decimal places
+        bbh_score = float(round(bbh_score, 2))
+        math_score = float(round(math_score, 2))
+        gpqa_score = float(round(gpqa_score, 2))
+        mmlu_pro_score = float(round(mmlu_pro_score, 2))
+        musr_score = float(round(musr_score, 2))
+        ifeval_score = float(round(ifeval_score, 2))
+        overall_score = float(round(overall_score, 2))
+        results = {
+            "Model": model_name,
+            "Precision": precision,
+            "Revision": revision,
+            "Average": overall_score,
+            "IFEval": ifeval_score,
+            "BBH": bbh_score,
+            "MATH Lvl 5": math_score,
+            "GPQA": gpqa_score,
+            "MUSR": musr_score,
+            "MMLU-PRO": mmlu_pro_score,
+        }
+        # pprint(results, sort_dicts=False)
+        return results
+    except Exception: # likely will be from no results being available
+        return None


 def get_details_url(repo):
@@ -42,10 +142,9 @@ def get_task_summary(results):
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value": round(results["IFEval"], 2),
+            "metric_value": results["IFEval"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
@@ -53,10 +152,9 @@ def get_task_summary(results):
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value": round(results["BBH"], 2),
+            "metric_value": results["BBH"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
@@ -64,10 +162,9 @@ def get_task_summary(results):
             "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
            "metric_type": "exact_match",
-            "metric_value": round(results["MATH Lvl 5"], 2),
+            "metric_value": results["MATH Lvl 5"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
         },
@@ -75,10 +172,9 @@ def get_task_summary(results):
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": round(results["GPQA"], 2),
+            "metric_value": results["GPQA"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
@@ -86,7 +182,7 @@ def get_task_summary(results):
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value": round(results["MUSR"], 2),
+            "metric_value": results["MUSR"],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
@@ -96,7 +192,7 @@ def get_task_summary(results):
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value": round(results["MMLU-PRO"], 2),
+            "metric_value": results["MMLU-PRO"],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
@@ -105,12 +201,11 @@ def get_task_summary(results):
     }


-def get_eval_results(df, repo):
-    results = search(df, repo)
+def get_eval_results(repo: str, results: dict):
     task_summary = get_task_summary(results)
     table = MarkdownTableWriter()
-    table.headers = ["Metric", "% Value"]
-    table.value_matrix = [["Avg.", round(results["Average ⬆️"], 2)]] + [
+    table.headers = ["Metric", "Value (%)"]
+    table.value_matrix = [["**Average**", f"**{results["Average"]}**"]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]

@@ -123,15 +218,14 @@ Summarized results can be found [here]({get_contents_url(repo)})!
     return text


-def get_edited_yaml_readme(df, repo, token: str | None):
+def get_edited_yaml_readme(repo: str, results: dict, token: str | None):
     card = ModelCard.load(repo, token=token)
-    results = search(df, repo)

     common = {
         "task_type": "text-generation",
         "task_name": "Text Generation",
         "source_name": "Open LLM Leaderboard",
-        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard?query={repo}",
+        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search={repo.replace("/", "%2F")}",
     }

     tasks_results = get_task_summary(results)
@@ -167,8 +261,8 @@ def commit(
     else:
         token = oauth_token

-    data = load_dataset("open-llm-leaderboard/contents", split="train")
-    df = pd.DataFrame(data)
+    with urllib3.PoolManager() as pool:
+        results = calculate_results(repo, pool)

     if repo.startswith("https://huggingface.co/"):
         try:
@@ -181,11 +275,11 @@
     try:
         try: # check if there is a readme already
             readme_text = get_edited_yaml_readme(
-                df, repo, token=token
-            ) + get_eval_results(df, repo)
+                repo, results, token=token
+            ) + get_eval_results(repo, results)
         except Exception as e:
             if "Repo card metadata block was not found." in str(e): # There is no readme
-                readme_text = get_edited_yaml_readme(df, repo, token=token)
+                readme_text = get_edited_yaml_readme(repo, results, token=token)
             else:
                 print(f"Something went wrong: {e}")

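For reference, normalize_within_range rescales a raw accuracy onto 0–100 with the random-chance baseline treated as 0, which is why GPQA uses a lower bound of 0.25 (four answer choices) and MMLU-Pro uses 0.1 (ten choices). A quick self-contained check of that arithmetic, using made-up raw scores rather than values from any real model:

import numpy as np

def normalize_within_range(value, lower_bound=0, higher_bound=1):
    # Same formula as in functions.py above
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100

# GPQA: chance level 0.25, so a raw acc_norm of 0.40 becomes (0.40 - 0.25) / 0.75 * 100
print(round(normalize_within_range(0.40, 0.25, 1.0), 2))  # 20.0
# MMLU-Pro: chance level 0.1
print(round(normalize_within_range(0.55, 0.1, 1.0), 2))   # 50.0
# Raw scores at or below chance clip to 0 instead of going negative
print(round(normalize_within_range(0.20, 0.25, 1.0), 2))  # 0.0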