Commit 6a3b9c1 ("submission page")
Aaron Mueller committed · 1 parent: 80e4e0d

Files changed:
- app.py: +26 -8
- src/about.py: +3 -7
- src/display/utils.py: +3 -2
app.py

@@ -1,8 +1,11 @@
+import json
+import gzip
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from io import StringIO
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -78,6 +81,18 @@ def init_leaderboard(dataframe, track):
         interactive=False,
     )
 
+def process_json(temp_file):
+    if isinstance(temp_file, str):
+        obj = json.loads(temp_file)
+    else:
+        try:
+            with gzip.open(temp_file, 'rt') as header:
+                obj = json.loads(header)
+        except:
+            with open(temp_file, 'r') as header:
+                obj = json.loads(header)
+    return obj
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
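A note on the `process_json` handler added above: `json.loads` accepts only a string or bytes, so calling it on an open file handle raises a `TypeError`; `json.load` is the variant that reads from a file object. A minimal sketch of the same handler with that substitution, assuming Gradio's `UploadButton` hands the callback a temporary file path (older Gradio versions pass a tempfile wrapper exposing a `.name` attribute), not code from this commit:

```python
import gzip
import json


def process_json(temp_file):
    """Load an uploaded predictions file that may be plain or gzip-compressed JSON.

    Assumes `temp_file` is the temporary file path (or a tempfile wrapper)
    that Gradio's UploadButton passes to its callback.
    """
    path = temp_file.name if hasattr(temp_file, "name") else temp_file
    try:
        # .json.gz uploads: open the gzip stream in text mode and parse it
        with gzip.open(path, "rt") as fh:
            return json.load(fh)
    except OSError:
        # Not gzip-compressed: fall back to reading it as plain JSON
        with open(path, "r") as fh:
            return json.load(fh)
```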
@@ -85,11 +100,11 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Strict
+        with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
-        with gr.TabItem("Strict-small
+        with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
-        with gr.TabItem("Multimodal
+        with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
             leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
@@ -141,25 +156,28 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
-
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    revision_name_textbox = gr.Textbox(label="Model revision commit", placeholder="main")
                     track_name = gr.Dropdown(
-                        choices = ["
+                        choices = ["strict", "strict-small", "multimodal"],
                         label = "Track",
                         multiselect=False,
                         value=None,
                         interactive=True
                     )
 
+                    upload_button = gr.UploadButton(label="Upload predictions", file_types = ['.json', '.json.gz'], live=True, file_count = "single")
+                    predictions = {}
+                    upload_button.upload(fn=process_json, inputs=upload_button, outputs=predictions, api_name="upload_json")
+
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
                 add_new_eval,
                 [
                     model_name_textbox,
-                    predictions_path_textbox,
                     revision_name_textbox,
-                    track_name
+                    track_name,
+                    upload_button,
                 ],
                 submission_result,
             )
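After this change, `submit_button.click` wires four inputs to the handler in order: the model name textbox, the revision textbox, the track dropdown, and the upload button. `add_new_eval` itself lives elsewhere in the repository and is not part of this diff; purely to illustrate the calling convention implied by those inputs, a handler with that shape might look like the following (the body and messages are hypothetical):

```python
def add_new_eval(model_name, revision, track, predictions_file):
    """Hypothetical handler matching the four inputs wired to submit_button.click.

    `predictions_file` is whatever gr.UploadButton yields when used as an input
    component (typically the uploaded file's temporary path).
    """
    if not model_name or not track:
        return "Please fill in the model name and select a track."
    # Reuse the upload parser sketched above to read the predictions file.
    predictions = process_json(predictions_file)
    # ... validate `predictions`, score or enqueue the submission, store results ...
    return f"Queued {model_name} (revision {revision or 'main'}) for the {track} track."
```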
src/about.py

@@ -46,9 +46,9 @@ This leaderboard accepts predictions files as input, and uploads the results to
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
+## Some good practices before submitting a model:
 
-
+Make sure you can get scores from your prediction using the `score_predictions.py` script.
 ```bash
 git clone https://github.com/babylm/evaluation-pipeline-2024/
 cd evaluation-pipeline-2024
@@ -56,11 +56,7 @@ python score_predictions.py path/to/your/predictions.json.gz
 ```
 If this step fails, follow the error messages to debug your model before submitting it. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
 
-
-This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
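Since the submission form accepts either `.json` or `.json.gz`, one way to sanity-check a predictions file locally and compress it before uploading is sketched below; the file names are placeholders, and the actual scoring still goes through `score_predictions.py` as described in the queue text above:

```python
import gzip
import json
import shutil

src = "predictions.json"     # placeholder: your uncompressed predictions file
dst = "predictions.json.gz"  # placeholder: the file you upload on the form

# Fail early if the file is not valid JSON.
with open(src, "r") as fh:
    json.load(fh)

# Compress it for upload.
with open(src, "rb") as fin, gzip.open(dst, "wb") as fout:
    shutil.copyfileobj(fin, fout)
```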
src/display/utils.py

@@ -27,18 +27,19 @@ auto_eval_column_dict_multimodal = []
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
+auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-auto_eval_column_dict_multimodal.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in TasksMultimodal:
     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
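These column definitions replace the single "Average ⬆️" column with per-track "Text Average" and "Vision Average" columns, which have to be filled in when the leaderboard dataframe is built. A rough sketch of how those averages could be computed, assuming the dataframe has one numeric column per task and that the lists of text and vision task columns are known (both assumptions, not code from this repository):

```python
import pandas as pd


def add_track_averages(df: pd.DataFrame, text_cols: list[str], vision_cols: list[str]) -> pd.DataFrame:
    """Add 'Text Average' and 'Vision Average' columns as row-wise means.

    `text_cols` and `vision_cols` are assumed lists of per-task score columns;
    they are not defined in this diff.
    """
    df = df.copy()
    df["Text Average"] = df[text_cols].mean(axis=1)
    if vision_cols:  # only the multimodal track has vision tasks
        df["Vision Average"] = df[vision_cols].mean(axis=1)
    return df
```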