Calculate results directly from the source
functions.py  CHANGED  (+121 -27)
@@ -1,5 +1,7 @@
 import gradio as gr
-import
+import numpy as np
+import urllib3
+from bs4 import BeautifulSoup
 from datasets import load_dataset
 from huggingface_hub import (
     CommitOperationAdd,
@@ -11,16 +13,114 @@ from huggingface_hub import (
 from huggingface_hub.repocard_data import eval_results_to_model_index
 from pytablewriter import MarkdownTableWriter
 
-COMMIT_DESCRIPTION = """This is an automated PR created with https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard
+COMMIT_DESCRIPTION = """This is an automated PR created with [this space](https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard)!
 
 The purpose of this PR is to add evaluation results from the Open LLM Leaderboard to your model card.
 
 Please report any issues here: https://huggingface.co/spaces/T145/open-llm-leaderboard-results-to-modelcard/discussions"""
 
 
-def
-
-
+def normalize_within_range(value, lower_bound=0, higher_bound=1):
+    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100
+
+
+def calculate_results(repo: str, pool: urllib3.PoolManager):
+    try:
+        base_url = f"https://huggingface.co/datasets/open-llm-leaderboard/results/tree/main/{repo}"
+        html = pool.request("GET", base_url).data
+        soup = BeautifulSoup(html, "html.parser")
+        dl_link = soup.find_all(title="Download file")[-1]["href"]
+        data = pool.request("GET", f"https://huggingface.co{dl_link}").json()
+
+        del base_url
+        del html
+        del soup
+        del dl_link
+
+        model_name = data["model_name"]
+        precision = data["config"]["model_dtype"]
+        revision = data["config"]["model_revision"]
+
+        # Normalize BBH subtasks scores
+        bbh_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_bbh"]:
+            num_choices = len(data["configs"][subtask_key]["doc_to_choice"])
+            if subtask_key in data["results"]:
+                bbh_raw_score = data["results"][subtask_key]["acc_norm,none"]
+                lower_bound = 1 / num_choices
+                normalized_score = normalize_within_range(bbh_raw_score, lower_bound, 1.0)
+                bbh_scores.append(normalized_score)
+
+        # Average BBH score
+        bbh_score = sum(bbh_scores) / len(bbh_scores)
+
+        # Calculate the MATH score
+        math_raw_score = data["results"]["leaderboard_math_hard"]["exact_match,none"]
+        math_score = normalize_within_range(math_raw_score, 0, 1.0)
+
+        # Normalize GPQA scores
+        gpqa_raw_score = data["results"]["leaderboard_gpqa"]["acc_norm,none"]
+        gpqa_score = normalize_within_range(gpqa_raw_score, 0.25, 1.0)
+
+        # Normalize MMLU PRO scores
+        mmlu_pro_raw_score = data["results"]["leaderboard_mmlu_pro"]["acc,none"]
+        mmlu_pro_score = normalize_within_range(mmlu_pro_raw_score, 0.1, 1.0)
+
+        # Compute IFEval
+        ifeval_inst_score = (
+            data["results"]["leaderboard_ifeval"]["inst_level_strict_acc,none"] * 100
+        )
+        ifeval_prompt_score = (
+            data["results"]["leaderboard_ifeval"]["prompt_level_strict_acc,none"] * 100
+        )
+
+        # Average IFEval scores
+        ifeval_score = (ifeval_inst_score + ifeval_prompt_score) / 2
+
+        # Normalize MUSR scores
+        musr_scores = []
+        for subtask_key in data["group_subtasks"]["leaderboard_musr"]:
+            subtask_config = data["configs"][subtask_key]
+            dataset = load_dataset(subtask_config["dataset_path"], split=subtask_config["test_split"])
+            num_choices = max(len(eval(question["choices"])) for question in dataset)
+            musr_raw_score = data["results"][subtask_key]["acc_norm,none"]
+            lower_bound = 1 / num_choices
+            normalized_score = normalize_within_range(musr_raw_score, lower_bound, 1.0)
+
+            musr_scores.append(normalized_score)
+            del dataset
+
+        musr_score = sum(musr_scores) / len(musr_scores)
+
+        # Calculate overall score
+        overall_score = (
+            bbh_score + math_score + gpqa_score + mmlu_pro_score + musr_score + ifeval_score
+        ) / 6
+
+        # Round all scores to 2 decimal places
+        bbh_score = float(round(bbh_score, 2))
+        math_score = float(round(math_score, 2))
+        gpqa_score = float(round(gpqa_score, 2))
+        mmlu_pro_score = float(round(mmlu_pro_score, 2))
+        musr_score = float(round(musr_score, 2))
+        ifeval_score = float(round(ifeval_score, 2))
+        overall_score = float(round(overall_score, 2))
+        results = {
+            "Model": model_name,
+            "Precision": precision,
+            "Revision": revision,
+            "Average": overall_score,
+            "IFEval": ifeval_score,
+            "BBH": bbh_score,
+            "MATH Lvl 5": math_score,
+            "GPQA": gpqa_score,
+            "MUSR": musr_score,
+            "MMLU-PRO": mmlu_pro_score,
+        }
+        # pprint(results, sort_dicts=False)
+        return results
+    except Exception:  # likely will be from no results being available
+        return None
 
 
 def get_details_url(repo):
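normalize_within_range rescales a raw accuracy so the task's random-guess floor maps to 0 and a perfect score maps to 100, clipping below-chance results to 0. A minimal standalone sketch, assuming a 4-choice task whose floor is 1/4 = 0.25:

import numpy as np

def normalize_within_range(value, lower_bound=0, higher_bound=1):
    return (np.clip(value - lower_bound, 0, None)) / (higher_bound - lower_bound) * 100

print(normalize_within_range(0.25, 0.25, 1.0))   # 0.0  -> chance-level accuracy scores zero
print(normalize_within_range(0.625, 0.25, 1.0))  # 50.0 -> halfway between chance and perfect
print(normalize_within_range(0.10, 0.25, 1.0))   # 0.0  -> below-chance results are clipped

calculate_results applies floors of 0.25 for GPQA, 0.1 for MMLU-PRO, and per-subtask floors of 1 / number of answer choices for BBH and MuSR, then averages the six normalized scores into "Average".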
@@ -42,10 +142,9 @@ def get_task_summary(results):
             "dataset_type": "wis-k/instruction-following-eval",
             "dataset_name": "IFEval (0-Shot)",
             "metric_type": "inst_level_strict_acc and prompt_level_strict_acc",
-            "metric_value":
+            "metric_value": results["IFEval"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "averaged accuracy",
         },
@@ -53,10 +152,9 @@ def get_task_summary(results):
             "dataset_type": "SaylorTwift/bbh",
             "dataset_name": "BBH (3-Shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["BBH"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 3},
             "metric_name": "normalized accuracy",
         },
@@ -64,10 +162,9 @@ def get_task_summary(results):
             "dataset_type": "lighteval/MATH-Hard",
             "dataset_name": "MATH Lvl 5 (4-Shot)",
             "metric_type": "exact_match",
-            "metric_value":
+            "metric_value": results["MATH Lvl 5"],
             "dataset_config": None,
             "dataset_split": "test",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 4},
             "metric_name": "exact match",
         },
@@ -75,10 +172,9 @@ def get_task_summary(results):
             "dataset_type": "Idavidrein/gpqa",
             "dataset_name": "GPQA (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["GPQA"],
             "dataset_config": None,
             "dataset_split": "train",
-            #"dataset_revision": None,
             "dataset_args": {"num_few_shot": 0},
             "metric_name": "acc_norm",
         },
@@ -86,7 +182,7 @@ def get_task_summary(results):
             "dataset_type": "TAUR-Lab/MuSR",
             "dataset_name": "MuSR (0-shot)",
             "metric_type": "acc_norm",
-            "metric_value":
+            "metric_value": results["MUSR"],
             "dataset_config": None,
             "dataset_split": None, # three test splits
             "dataset_args": {"num_few_shot": 0},
@@ -96,7 +192,7 @@ def get_task_summary(results):
             "dataset_type": "TIGER-Lab/MMLU-Pro",
             "dataset_name": "MMLU-PRO (5-shot)",
             "metric_type": "acc",
-            "metric_value":
+            "metric_value": results["MMLU-PRO"],
             "dataset_config": "main",
             "dataset_split": "test",
             "dataset_args": {"num_few_shot": 5},
@@ -105,12 +201,11 @@ def get_task_summary(results):
     }
 
 
-def get_eval_results(
-    results = search(df, repo)
+def get_eval_results(repo: str, results: dict):
     task_summary = get_task_summary(results)
     table = MarkdownTableWriter()
-    table.headers = ["Metric", "%
-    table.value_matrix = [["
+    table.headers = ["Metric", "Value (%)"]
+    table.value_matrix = [["**Average**", f"**{results['Average']}**"]] + [
         [v["dataset_name"], v["metric_value"]] for v in task_summary.values()
     ]
 
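get_eval_results renders those values with pytablewriter. A small sketch of the same table construction, using made-up scores shaped like the dict calculate_results returns (every number here is a placeholder):

from pytablewriter import MarkdownTableWriter

results = {"Average": 30.12, "IFEval": 45.3, "BBH": 34.6, "MATH Lvl 5": 18.4}  # placeholder scores

table = MarkdownTableWriter()
table.headers = ["Metric", "Value (%)"]
table.value_matrix = [["**Average**", f"**{results['Average']}**"]] + [
    [name, score] for name, score in results.items() if name != "Average"
]
print(table.dumps())  # Markdown table text, ready to append to the model card body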
@@ -123,15 +218,14 @@ Summarized results can be found [here]({get_contents_url(repo)})!
     return text
 
 
-def get_edited_yaml_readme(
+def get_edited_yaml_readme(repo: str, results: dict, token: str | None):
     card = ModelCard.load(repo, token=token)
-    results = search(df, repo)
 
     common = {
         "task_type": "text-generation",
         "task_name": "Text Generation",
         "source_name": "Open LLM Leaderboard",
-        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard
+        "source_url": f"https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard#/?search={repo.replace('/', '%2F')}",
     }
 
     tasks_results = get_task_summary(results)
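get_edited_yaml_readme feeds the task summaries into eval_results_to_model_index to build the card's model-index metadata. A rough sketch of that step with a single IFEval entry, assuming huggingface_hub's EvalResult dataclass; the model name and metric value are placeholders, not real results:

from huggingface_hub import EvalResult
from huggingface_hub.repocard_data import eval_results_to_model_index

# One IFEval entry from get_task_summary(), rebuilt as an EvalResult (placeholder value).
eval_results = [
    EvalResult(
        task_type="text-generation",
        task_name="Text Generation",
        dataset_type="wis-k/instruction-following-eval",
        dataset_name="IFEval (0-Shot)",
        dataset_split="train",
        dataset_args={"num_few_shot": 0},
        metric_type="inst_level_strict_acc and prompt_level_strict_acc",
        metric_name="averaged accuracy",
        metric_value=45.3,
        source_name="Open LLM Leaderboard",
        source_url="https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard",
    )
]

# Produces the list stored under `model-index:` in the card's YAML front matter.
print(eval_results_to_model_index("my-org/my-model", eval_results))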
@@ -167,8 +261,8 @@ def commit(
     else:
         token = oauth_token
 
-
-
+    with urllib3.PoolManager() as pool:
+        results = calculate_results(repo, pool)
 
     if repo.startswith("https://huggingface.co/"):
         try:
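commit() now computes the scores once and hands them to both readme helpers. The same lookup works on its own; a sketch, assuming functions.py is importable as a module and using a placeholder repo id:

import urllib3

from functions import calculate_results  # assumes this file is importable as `functions`

repo = "some-org/some-model"  # placeholder repo id

with urllib3.PoolManager() as pool:
    results = calculate_results(repo, pool)

if results is None:  # calculate_results() returns None when no leaderboard results are found
    print(f"No Open LLM Leaderboard results found for {repo}")
else:
    print(results["Average"], results["IFEval"], results["BBH"])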
@@ -181,11 +275,11 @@ def commit(
     try:
         try: # check if there is a readme already
             readme_text = get_edited_yaml_readme(
-
-            ) + get_eval_results(
+                repo, results, token=token
+            ) + get_eval_results(repo, results)
         except Exception as e:
             if "Repo card metadata block was not found." in str(e): # There is no readme
-                readme_text = get_edited_yaml_readme(
+                readme_text = get_edited_yaml_readme(repo, results, token=token)
             else:
                 print(f"Something went wrong: {e}")
 