Commit e1db744 · Aaron Mueller committed
Parent(s): 3d10b83

update leaderboard logic
Files changed:
- app.py (+7 -7)
- src/about.py (+1 -0)
- src/display/formatting.py (+2 -2)
- src/display/utils.py (+2 -0)
- src/leaderboard/read_evals.py (+15 -5)
- src/populate.py (+6 -2)
- src/submission/submit.py (+7 -3)
app.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import gzip
+import shutil
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
@@ -81,7 +82,6 @@ def init_leaderboard(dataframe, track):
         interactive=False,
     )
 
-submitted_predictions = {}
 def process_json(temp_file):
     if temp_file is None:
         return {}
@@ -92,11 +92,9 @@ def process_json(temp_file):
         if file_path.endswith('.gz'):
             with gzip.open(file_path, 'rt') as f:
                 data = json.load(f)
-                submitted_predictions.update(data)
         else:
             with open(file_path, 'r') as f:
                 data = json.load(f)
-                submitted_predictions.update(data)
     except Exception as e:
         raise gr.Error(f"Error processing file: {str(e)}")
 
@@ -164,7 +162,8 @@ with demo:
 
         with gr.Row():
             with gr.Column():
-                model_name_textbox = gr.Textbox(label="Model name")
+                model_name_textbox = gr.Textbox(label="Model name. This will be displayed on the leaderboard.")
+                model_id_textbox = gr.Textbox(label="Huggingface model ID (if applicable). This looks like `owner/repo_id`, not like a URL.", placeholder="")
                 revision_name_textbox = gr.Textbox(label="Model revision commit", placeholder="main")
                 track_name = gr.Dropdown(
                     choices = ["strict", "strict-small", "multimodal"],
@@ -174,12 +173,12 @@ with demo:
                     interactive=True
                 )
 
+                predictions_data = gr.State()
                 upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
-                output_json = gr.JSON(label="Processed JSON")
                 upload_button.upload(
                     fn=process_json,
                     inputs=upload_button,
-                    outputs=output_json,
+                    outputs=predictions_data,
                     api_name="upload_json"
                 )
 
@@ -189,9 +188,10 @@ with demo:
             add_new_eval,
             [
                 model_name_textbox,
+                model_id_textbox,
                 revision_name_textbox,
                 track_name,
-                output_json,
+                predictions_data,
             ],
             submission_result,
         )
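For reference, a minimal standalone sketch of the upload flow that process_json implements after this change; the sample path is hypothetical and the gr.Error handling is omitted so the snippet runs outside Gradio. The returned dict is what the new predictions_data gr.State holds and what gets passed on to add_new_eval.

import gzip
import json

def load_predictions(file_path):
    # Gzipped submissions are opened in text mode so json.load receives str data.
    if file_path.endswith(".gz"):
        with gzip.open(file_path, "rt") as f:
            return json.load(f)
    with open(file_path, "r") as f:
        return json.load(f)

# Hypothetical usage:
# preds = load_predictions("predictions.json.gz")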
src/about.py CHANGED
@@ -61,4 +61,5 @@ Make sure your model has an open license! This is a leaderboard that is meant to
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
+Stay tuned!
 """
src/display/formatting.py CHANGED
@@ -2,8 +2,8 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
+def make_clickable_model(model_repo, model_name):
+    link = f"https://huggingface.co/{model_repo}"
     return model_hyperlink(link, model_name)
 
 
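A hedged usage sketch of the updated helper; the repo id and display name below are placeholders, not values from this Space.

from src.display.formatting import make_clickable_model

# Hypothetical inputs: an HF repo id plus the name shown on the leaderboard.
html = make_clickable_model("some-org/some-model", "My BabyLM Model")
# html is an <a target="_blank" href="https://huggingface.co/some-org/some-model" ...> tag
# whose visible text is "My BabyLM Model".
print(html)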
src/display/utils.py CHANGED
@@ -25,6 +25,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict_multimodal = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
 for task in Tasks:
@@ -35,6 +36,7 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 for task in TasksMultimodal:
     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
src/leaderboard/read_evals.py CHANGED
@@ -17,7 +17,8 @@ class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
     """
     eval_name: str # org_model_track (uid)
-    full_model: str # org/model (path on hub)
+    full_model: str # org/model (name of model)
+    repo_id: str # org/model (path to model on HF)
     track: str
     org: str
     model: str
@@ -37,6 +38,7 @@ class EvalResult:
 
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
+        repo_id = config.get("hf_repo", None)
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -49,7 +51,7 @@ class EvalResult:
         eval_name = "_".join(org_and_model) + f"_{track}"
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            repo_id, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
 
         def _get_task_results(task):
@@ -80,6 +82,7 @@ class EvalResult:
         return self(
             eval_name=eval_name,
             full_model=full_model,
+            repo_id=repo_id,
             track=track,
             org=org,
             model=model,
@@ -103,10 +106,16 @@ class EvalResult:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         eval_column = AutoEvalColumnMultimodal if self.track.lower() == "multimodal" else AutoEvalColumn
         vision_tasks = ("VQA", "Winoground", "DevBench", "vqa", "winoground", "devbench")
-        text_average = sum([v for k, v in self.results.items() if v is not None and k not in vision_tasks]) / len(Tasks)
+        num_text_tasks = len(Tasks)
+        text_average = sum([v for k, v in self.results.items() if v is not None and k not in vision_tasks]) / num_text_tasks
+        if self.still_on_hub:
+            model_display_name = make_clickable_model(self.repo_id, self.full_model)
+        else:
+            model_display_name = self.full_model
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            eval_column.model.name: make_clickable_model(self.full_model),
+            eval_column.model.name: model_display_name,
+            eval_column.hf_repo.name: self.repo_id,
             eval_column.revision.name: self.revision,
             eval_column.text_average.name: text_average,
             eval_column.still_on_hub.name: self.still_on_hub,
@@ -114,7 +123,8 @@ class EvalResult:
 
         if self.track.lower() == "multimodal":
             taskset = TasksMultimodal
-            vision_average = sum([v for k, v in self.results.items() if v is not None and k in vision_tasks]) / (len(TasksMultimodal) - len(Tasks))
+            num_vision_tasks = len(TasksMultimodal) - len(Tasks)
+            vision_average = sum([v for k, v in self.results.items() if v is not None and k in vision_tasks]) / num_vision_tasks
            data_dict[eval_column.vision_average.name] = vision_average
         else:
             taskset = Tasks
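A minimal sketch of the averaging arithmetic added to to_dict(); the task names, scores, and counts below are made-up stand-ins for Tasks and TasksMultimodal, used only to show how the text and vision averages are formed.

results = {"blimp": 0.70, "ewok": 0.50, "vqa": 0.40}  # hypothetical per-task scores
vision_tasks = ("VQA", "Winoground", "DevBench", "vqa", "winoground", "devbench")

num_text_tasks = 2    # stand-in for len(Tasks)
num_vision_tasks = 1  # stand-in for len(TasksMultimodal) - len(Tasks)

text_average = sum(v for k, v in results.items()
                   if v is not None and k not in vision_tasks) / num_text_tasks
vision_average = sum(v for k, v in results.items()
                     if v is not None and k in vision_tasks) / num_vision_tasks

print(text_average, vision_average)  # 0.6 0.4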
src/populate.py CHANGED
@@ -41,8 +41,12 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            if data["still_on_hub"]:
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
+                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            else:
+                data[EvalQueueColumn.model.name] = data["model"]
+                data[EvalQueueColumn.revision.name] = "N/A"
 
             all_evals.append(data)
         elif ".md" not in entry:
src/submission/submit.py CHANGED
@@ -16,6 +16,7 @@ USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
     model_name: str,
+    model_id: str,
     revision: str,
     track: str,
     predictions: dict,
@@ -25,6 +26,8 @@ def add_new_eval(
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+    out_message = ""
+
     user_name = ""
     model_path = model_name
     if "/" in model_name:
@@ -42,19 +45,20 @@ def add_new_eval(
 
     # Is the model info correctly filled?
     try:
-        model_info = API.model_info(repo_id=model_name, revision=revision)
+        model_info = API.model_info(repo_id=model_id, revision=revision)
     except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
+        return styled_warning("Could not get your model information. Please fill it up properly.")
 
     modelcard_OK, error_msg = check_model_card(model_name)
     if not modelcard_OK:
-        return styled_error(error_msg)
+        return styled_warning(error_msg)
 
     # Seems good, creating the eval
     print("Adding new eval")
 
     eval_entry = {
         "model_name": model_name,
+        "hf_repo": model_id,
         "revision": revision,
         "track": track,
         "predictions": predictions,
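A hedged sketch of the submission-time check; only model_info(repo_id=..., revision=...) is taken from this diff, while the client construction, repo id, and message handling are assumptions.

from huggingface_hub import HfApi

api = HfApi()  # a token may be needed for gated or private repos
try:
    info = api.model_info(repo_id="some-org/some-model", revision="main")  # placeholder repo
    print("Found model:", info.id)
except Exception:
    # Mirrors the styled_warning branch: the caller gets a warning message instead of a crash.
    print("Could not get your model information. Please fill it up properly.")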