Aaron Mueller committed on
Commit 6a3b9c1
1 Parent(s): 80e4e0d

submission page

Files changed (3):
1. app.py +26 -8
2. src/about.py +3 -7
3. src/display/utils.py +3 -2
app.py CHANGED
@@ -1,8 +1,11 @@
+import json
+import gzip
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
+from io import StringIO
 
 from src.about import (
     CITATION_BUTTON_LABEL,
@@ -78,6 +81,18 @@ def init_leaderboard(dataframe, track):
         interactive=False,
     )
 
+def process_json(temp_file):
+    if isinstance(temp_file, str):
+        obj = json.loads(temp_file)
+    else:
+        try:
+            with gzip.open(temp_file, 'rt') as header:
+                obj = json.loads(header)
+        except:
+            with open(temp_file, 'r') as header:
+                obj = json.loads(header)
+    return obj
+
 
 demo = gr.Blocks(css=custom_css)
 with demo:
@@ -85,11 +100,11 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("Strict Leaderboard", elem_id="strict-benchmark-tab-table", id=0):
+        with gr.TabItem("Strict", elem_id="strict-benchmark-tab-table", id=0):
             leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
-        with gr.TabItem("Strict-small Leaderboard", elem_id="strict-small-benchmark-tab-table", id=1):
+        with gr.TabItem("Strict-small", elem_id="strict-small-benchmark-tab-table", id=1):
             leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
-        with gr.TabItem("Multimodal Leaderboard", elem_id="multimodal-benchmark-tab-table", id=2):
+        with gr.TabItem("Multimodal", elem_id="multimodal-benchmark-tab-table", id=2):
             leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
@@ -141,25 +156,28 @@ with demo:
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
-                    predictions_path_textbox = gr.Textbox(label="URL to predictions file")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                    revision_name_textbox = gr.Textbox(label="Model revision commit", placeholder="main")
                     track_name = gr.Dropdown(
-                        choices = ["Strict", "Strict-small", "Multimodal"],
+                        choices = ["strict", "strict-small", "multimodal"],
                         label = "Track",
                         multiselect=False,
                         value=None,
                         interactive=True
                     )
 
+                    upload_button = gr.UploadButton(label="Upload predictions", file_types = ['.json', '.json.gz'], live=True, file_count = "single")
+                    predictions = {}
+                    upload_button.upload(fn=process_json, inputs=upload_button, outputs=predictions, api_name="upload_json")
+
             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
             submit_button.click(
                 add_new_eval,
                 [
                     model_name_textbox,
                     revision_name_textbox,
-                    track_name
+                    track_name,
+                    upload_button,
                 ],
                 submission_result,
             )
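
For orientation, here is a minimal standalone sketch of the file handling the new `process_json` callback is aiming for, assuming the `UploadButton` hands the callback a local file path; the helper name `load_predictions` and the sample filename are illustrative only, not part of this commit. (When reading from an open file handle, the sketch uses `json.load`, which accepts a file object, rather than `json.loads`, which expects a string.)

```python
import gzip
import json

def load_predictions(path):
    """Hypothetical helper (not the committed code): parse an uploaded
    .json or .json.gz predictions file into a Python object."""
    try:
        # .json.gz uploads: decompress transparently and parse.
        with gzip.open(path, "rt") as fh:
            return json.load(fh)
    except OSError:
        # Plain .json uploads: gzip.open raises BadGzipFile (an OSError),
        # so fall back to reading the file as uncompressed JSON.
        with open(path, "r") as fh:
            return json.load(fh)

# Illustrative usage with an assumed local file:
# predictions = load_predictions("predictions.json.gz")
```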
src/about.py CHANGED
@@ -46,9 +46,9 @@ This leaderboard accepts predictions files as input, and uploads the results to
 """
 
 EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
+## Some good practices before submitting a model:
 
-### 1) Make sure you can get scores from your prediction using the `score_predictions.py` script.
+Make sure you can get scores from your prediction using the `score_predictions.py` script.
 ```bash
 git clone https://github.com/babylm/evaluation-pipeline-2024/
 cd evaluation-pipeline-2024
@@ -56,11 +56,7 @@ python score_predictions.py path/to/your/predictions.json.gz
 ```
 If this step fails, follow the error messages to debug your model before submitting it. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
 
-### 3) Make sure your model has an open license!
-This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
+Make sure your model has an open license! This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
 """
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
src/display/utils.py CHANGED
@@ -27,18 +27,19 @@ auto_eval_column_dict_multimodal = []
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
 # Model information
+auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
 auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
-auto_eval_column_dict_multimodal.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
 for task in TasksMultimodal:
     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
+auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
 auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
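
For context, a rough sketch of the structure these entries feed, assuming the repository follows the usual Hugging Face leaderboard-template pattern in which each `[attribute_name, type, default]` triple becomes a field of a generated dataclass; the `ColumnContent` field layout and the `make_dataclass` call below are assumptions based on that template, not code shown in this diff.

```python
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    # Assumed field layout, mirroring the common leaderboard template.
    name: str                   # header shown in the UI, e.g. "Text Average"
    type: str                   # "markdown", "number", "str", or "bool"
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

auto_eval_column_dict = []
auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])

# Each triple becomes a field with a ColumnContent default on the generated class.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)

print(AutoEvalColumn.text_average.name)  # -> Text Average
```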