Aaron Mueller committed · Commit de60bd6 · 1 Parent(s): b166dfb

update leaderboard

Files changed:
- app.py +23 -33
- src/about.py +25 -28
- src/display/utils.py +3 -44
- src/envs.py +4 -4
- src/leaderboard/read_evals.py +8 -2
- src/populate.py +1 -2
- src/submission/submit.py +15 -5
app.py CHANGED

@@ -15,6 +15,7 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
+    BENCHMARK_COLS_MULTIMODAL,
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
@@ -50,6 +51,7 @@ except Exception:
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS_MULTIMODAL)
 
 (
     finished_eval_queue_df,
@@ -57,9 +59,11 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    # filter for correct track
+    dataframe = dataframe.loc[dataframe["track"] == track]
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -95,13 +99,17 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("…
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-…
+        with gr.TabItem("Strict Leaderboard", elem_id="strict-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
+        with gr.TabItem("Strict-small Leaderboard", elem_id="strict-small-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
+        with gr.TabItem("Multimodal Leaderboard", elem_id="multimodal-benchmark-tab-table", id=2):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=…
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -142,36 +150,20 @@ with demo:
                                 row_count=5,
                             )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your …
+                gr.Markdown("# ✉️✨ Submit your predictions here!", elem_classes="markdown-text")
 
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    predictions_path_textbox = gr.Textbox(label="URL to predictions file")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    …
-                        choices=[…
-                        label=…
+                    track_name = gr.Dropdown(
+                        choices = ["Strict", "Strict-small", "Multimodal"],
+                        label = "Track",
                         multiselect=False,
                         value=None,
-                        interactive=True
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
+                        interactive=True
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
                     submit_button = gr.Button("Submit Eval")
                     submission_result = gr.Markdown()
@@ -179,11 +171,9 @@
         add_new_eval,
         [
             model_name_textbox,
-            …
+            predictions_path_textbox,
             revision_name_textbox,
-            …
-            weight_type,
-            model_type,
+            track_name
         ],
         submission_result,
     )
@@ -201,4 +191,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
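The reworked `init_leaderboard` takes a `track` argument and keeps only the rows of the shared results DataFrame whose `track` column matches the tab being built, so the three leaderboard tabs can be driven from the same data. A minimal sketch of that filter, assuming the DataFrame has a `track` column; the example rows are made up for illustration:

```python
# Minimal sketch of the per-track filtering done in init_leaderboard.
# The rows below are made-up examples; only the "track" column matters here.
import pandas as pd

df = pd.DataFrame(
    {
        "model": ["model-a", "model-b", "model-c"],
        "track": ["strict", "strict-small", "multimodal"],
        "BLiMP": [0.72, 0.69, 0.70],
    }
)

strict_df = df.loc[df["track"] == "strict"]  # rows shown in the "Strict Leaderboard" tab
print(strict_df)
```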
src/about.py CHANGED

@@ -12,8 +12,19 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("…
-    task1 = Task("…
+    task0 = Task("blimp", "acc", "BLiMP")
+    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
+    task2 = Task("glue", "acc", "(Super)GLUE")
+    task3 = Task("ewok", "acc", "EWoK")
+
+class TasksMultimodal(Enum):
+    task0 = Task("blimp", "acc", "BLiMP")
+    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
+    task2 = Task("glue", "acc", "(Super)GLUE")
+    task3 = Task("ewok", "acc", "EWoK")
+    task4 = Task("vqa", "acc", "VQA")
+    task5 = Task("winoground", "acc", "Winoground")
+    task6 = Task("devbench", "acc", "DevBench")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,52 +32,38 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">…
+TITLE = """<h1 align="center" id="space-title">BabyLM 2024 Leaderboards</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-…
+The leaderboards for each track of the 2024 BabyLM Challenge.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-…
-## Reproducibility
-To reproduce our results, here is the commands you can run:
+This leaderboard accepts predictions files as input, and uploads the results to the leaderboard. The logic is the same as in the `score_predictions.py` script from the BabyLM 2024 evaluation pipeline repository.
 
 """
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
 
-### 1) Make sure you can …
-```
-…
-…
-…
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+### 1) Make sure you can get scores from your prediction using the `score_predictions.py` script.
+```bash
+git clone https://github.com/babylm/evaluation-pipeline-2024/
+cd evaluation-pipeline-2024
+python score_predictions.py path/to/your/predictions.json.gz
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely …
-…
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+If this step fails, follow the error messages to debug your model before submitting it. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
 
 ### 3) Make sure your model has an open license!
-This is a leaderboard …
+This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
 
 ### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-…
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
 """
 
-CITATION_BUTTON_LABEL = "…
+CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
 """
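The new `Tasks` and `TasksMultimodal` enums are what `src/display/utils.py` turns into leaderboard columns via `t.value.col_name`. A short sketch of that mapping, assuming the template's `Task` dataclass with `benchmark`, `metric`, and `col_name` fields (only a subset of the tasks is repeated here):

```python
# Sketch of how the task enums become column names; Task's field names are assumed
# to follow the leaderboard template (benchmark, metric, col_name).
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results JSON
    metric: str      # metric key in the results JSON
    col_name: str    # column name shown on the leaderboard

class TasksMultimodal(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task4 = Task("vqa", "acc", "VQA")
    task6 = Task("devbench", "acc", "DevBench")

BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
print(BENCHMARK_COLS_MULTIMODAL)  # ['BLiMP', 'VQA', 'DevBench']
```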
src/display/utils.py CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks, TasksMultimodal
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -47,10 +47,9 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
+    track = ColumnContent("track", "str", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
 ## All the model information that we might need
@@ -60,46 +59,6 @@ class ModelDetails:
     display_name: str = ""
     symbol: str = "" # emoji
 
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +66,4 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-…
+BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
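Because `EVAL_COLS` and `EVAL_TYPES` are derived from `EvalQueueColumn` through `fields()`, the new `track` column shows up in the request queue tables without further changes. A self-contained sketch, using a simplified `ColumnContent` (the template's real class carries extra flags such as `hidden`):

```python
# Sketch of how fields() turns EvalQueueColumn into column name/type lists.
# ColumnContent is simplified here to the three attributes actually used.
from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    track = ColumnContent("track", "str", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    status = ColumnContent("status", "str", True)

def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
print(EVAL_COLS)   # ['model', 'track', 'revision', 'private', 'status']
print(EVAL_TYPES)  # ['markdown', 'str', 'str', 'bool', 'str']
```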
src/envs.py CHANGED

@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "…
+OWNER = "babylm" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/leaderboard-2024"
+QUEUE_REPO = f"{OWNER}/requests-2024"
+RESULTS_REPO = f"{OWNER}/results-2024"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
src/leaderboard/read_evals.py CHANGED

@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, TasksMultimodal, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -39,6 +39,7 @@ class EvalResult:
         data = json.load(fp)
 
         config = data.get("config")
+        track = data.get("track")
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -154,7 +155,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, track: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -174,6 +175,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+        with open(model_result_filepath, 'r') as f:
+            this_track = f["track"]
+            if this_track != track:
+                continue
+
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
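In the new loop, `f["track"]` subscripts the open file handle rather than the parsed JSON, which would raise a `TypeError` at runtime. A hedged sketch of the presumably intended filter, assuming each results file stores a top-level `"track"` key:

```python
# Hedged sketch of the per-track filter; the committed hunk indexes the file object
# directly (f["track"]), so json.load is used here to read the key instead.
import json

def result_track(result_path: str):
    """Return the 'track' recorded in a results JSON file, or None if absent."""
    with open(result_path, "r") as f:
        return json.load(f).get("track")

def keep_for_track(result_path: str, track: str) -> bool:
    """True if this results file belongs to the requested track."""
    return result_track(result_path) == track
```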
src/populate.py CHANGED

@@ -10,7 +10,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, track)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
@@ -21,7 +21,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
-
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
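Note that `track` is used inside `get_leaderboard_df` here without being one of its parameters, and `app.py` still calls the function with four arguments. A hedged sketch of one way the track could be threaded through explicitly; the stubbed `get_raw_eval_results` below is only a placeholder standing in for the real function in `src/leaderboard/read_evals.py`:

```python
# Hedged sketch: pass the track down to get_raw_eval_results as an explicit parameter.
# get_raw_eval_results is stubbed here only so the sketch is self-contained.
import pandas as pd

def get_raw_eval_results(results_path: str, requests_path: str, track: str) -> list:
    # placeholder; the real implementation lives in src/leaderboard/read_evals.py
    return []

def get_leaderboard_df(results_path: str, requests_path: str, cols: list,
                       benchmark_cols: list, track: str) -> pd.DataFrame:
    """Creates a dataframe from the results of the requested track."""
    raw_data = get_raw_eval_results(results_path, requests_path, track)
    all_data_json = [v.to_dict() for v in raw_data]
    return pd.DataFrame.from_records(all_data_json)
```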
src/submission/submit.py CHANGED

@@ -15,7 +15,9 @@ REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
-    …
+    model_name: str,
+    preds_path: str,
+    track: str,
     base_model: str,
     revision: str,
     precision: str,
@@ -28,10 +30,10 @@ def add_new_eval(
     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
     user_name = ""
-    model_path = …
+    model_path = model_name
     if "/" in model:
-        user_name = …
-        model_path = …
+        user_name = model_name.split("/")[0]
+        model_path = model_name.split("/")[1]
 
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -39,6 +41,12 @@ def add_new_eval(
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
 
+    if preds_path is None or preds_path == "":
+        return styled_error("Please enter a URL where your predictions file can be downloaded.")
+
+    if track is None:
+        return styled_error("Please select a track.")
+
     # Does the model actually exist?
     if revision == "":
         revision = "main"
@@ -76,7 +84,9 @@ def add_new_eval(
     print("Adding new eval")
 
     eval_entry = {
-        "…
+        "model_name": model_name,
+        "preds_path": preds_path,
+        "track": track,
         "base_model": base_model,
         "revision": revision,
         "precision": precision,
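The rewritten name-splitting now works off `model_name` (the surrounding context still checks `"/" in model`, so `model_name` is assumed to be the value actually being tested). A minimal runnable sketch of that split, using a made-up example value:

```python
# Sketch of the user/model split as rewritten in this hunk; the value of
# model_name is a made-up example.
model_name = "my-org/my-babylm-model"

user_name = ""
model_path = model_name
if "/" in model_name:
    user_name = model_name.split("/")[0]   # "my-org"
    model_path = model_name.split("/")[1]  # "my-babylm-model"

print(user_name, model_path)
```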