Commit 6a3b9c1 ("submission page")
Aaron Mueller committed · 1 parent: 80e4e0d

Files changed:
- app.py: +26 -8
- src/about.py: +3 -7
- src/display/utils.py: +3 -2
app.py

@@ -1,8 +1,11 @@
+import json
+import gzip
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from io import StringIO
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -78,6 +81,18 @@ def init_leaderboard(dataframe, track):
         interactive=False,
     )
 
+def process_json(temp_file):
+    if isinstance(temp_file, str):
+        obj = json.loads(temp_file)
+    else:
+        try:
+            with gzip.open(temp_file, 'rt') as header:
+                obj = json.loads(header)
+        except:
+            with open(temp_file, 'r') as header:
+                obj = json.loads(header)
+    return obj
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
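A note on the `process_json` handler added above: `json.loads` accepts only a string or bytes, so calling it on an open file handle raises a `TypeError`; `json.load` is the variant that reads from a file object. A minimal sketch of the same handler with that substitution, assuming Gradio's `UploadButton` hands the callback a temporary file path (older Gradio versions pass a tempfile wrapper exposing a `.name` attribute), not code from this commit:

```python
import gzip
import json


def process_json(temp_file):
    """Load an uploaded predictions file that may be plain or gzip-compressed JSON.

    Assumes `temp_file` is the temporary file path (or a tempfile wrapper)
    that Gradio's UploadButton passes to its callback.
    """
    path = temp_file.name if hasattr(temp_file, "name") else temp_file
    try:
        # .json.gz uploads: open the gzip stream in text mode and parse it
        with gzip.open(path, "rt") as fh:
            return json.load(fh)
    except OSError:
        # Not gzip-compressed: fall back to reading it as plain JSON
        with open(path, "r") as fh:
            return json.load(fh)
```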
@@ -85,11 +100,11 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Strict
+        with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
-        with gr.TabItem("Strict-small
+        with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
-        with gr.TabItem("Multimodal
+        with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
             leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
@@ -141,25 +156,28 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
-
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    revision_name_textbox = gr.Textbox(label="Model revision commit", placeholder="main")
                     track_name = gr.Dropdown(
-                        choices = ["
+                        choices = ["strict", "strict-small", "multimodal"],
                         label = "Track",
                         multiselect=False,
                         value=None,
                         interactive=True
                     )
 
+                    upload_button = gr.UploadButton(label="Upload predictions", file_types = ['.json', '.json.gz'], live=True, file_count = "single")
+                    predictions = {}
+                    upload_button.upload(fn=process_json, inputs=upload_button, outputs=predictions, api_name="upload_json")
+
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
                 add_new_eval,
                 [
                     model_name_textbox,
-                    predictions_path_textbox,
                     revision_name_textbox,
-                    track_name
+                    track_name,
+                    upload_button,
                 ],
                 submission_result,
             )
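After this change, `submit_button.click` wires four inputs to the handler in order: the model name textbox, the revision textbox, the track dropdown, and the upload button. `add_new_eval` itself lives elsewhere in the repository and is not part of this diff; purely to illustrate the calling convention implied by those inputs, a handler with that shape might look like the following (the body and messages are hypothetical):

```python
def add_new_eval(model_name, revision, track, predictions_file):
    """Hypothetical handler matching the four inputs wired to submit_button.click.

    `predictions_file` is whatever gr.UploadButton yields when used as an input
    component (typically the uploaded file's temporary path).
    """
    if not model_name or not track:
        return "Please fill in the model name and select a track."
    # Reuse the upload parser sketched above to read the predictions file.
    predictions = process_json(predictions_file)
    # ... validate `predictions`, score or enqueue the submission, store results ...
    return f"Queued {model_name} (revision {revision or 'main'}) for the {track} track."
```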
src/about.py

@@ -46,9 +46,9 @@ This leaderboard accepts predictions files as input, and uploads the results to
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
+## Some good practices before submitting a model:
 
-
+Make sure you can get scores from your prediction using the `score_predictions.py` script.
 ```bash
 git clone https://github.com/babylm/evaluation-pipeline-2024/
 cd evaluation-pipeline-2024
@@ -56,11 +56,7 @@ python score_predictions.py path/to/your/predictions.json.gz
 ```
 If this step fails, follow the error messages to debug your model before submitting it. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
 
-
-This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
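Since the submission form accepts either `.json` or `.json.gz`, one way to sanity-check a predictions file locally and compress it before uploading is sketched below; the file names are placeholders, and the actual scoring still goes through `score_predictions.py` as described in the queue text above:

```python
import gzip
import json
import shutil

src = "predictions.json"     # placeholder: your uncompressed predictions file
dst = "predictions.json.gz"  # placeholder: the file you upload on the form

# Fail early if the file is not valid JSON.
with open(src, "r") as fh:
    json.load(fh)

# Compress it for upload.
with open(src, "rb") as fin, gzip.open(dst, "wb") as fout:
    shutil.copyfileobj(fin, fout)
```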
src/display/utils.py

@@ -27,18 +27,19 @@ auto_eval_column_dict_multimodal = []
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
+auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-auto_eval_column_dict_multimodal.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in TasksMultimodal:
     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
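These column definitions replace the single "Average ⬆️" column with per-track "Text Average" and "Vision Average" columns, which have to be filled in when the leaderboard dataframe is built. A rough sketch of how those averages could be computed, assuming the dataframe has one numeric column per task and that the lists of text and vision task columns are known (both assumptions, not code from this repository):

```python
import pandas as pd


def add_track_averages(df: pd.DataFrame, text_cols: list[str], vision_cols: list[str]) -> pd.DataFrame:
    """Add 'Text Average' and 'Vision Average' columns as row-wise means.

    `text_cols` and `vision_cols` are assumed lists of per-task score columns;
    they are not defined in this diff.
    """
    df = df.copy()
    df["Text Average"] = df[text_cols].mean(axis=1)
    if vision_cols:  # only the multimodal track has vision tasks
        df["Vision Average"] = df[vision_cols].mean(axis=1)
    return df
```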