Commit e1db744 · Aaron Mueller committed · 1 Parent(s): 3d10b83

update leaderboard logic
app.py CHANGED
@@ -1,5 +1,6 @@
 import json
 import gzip
+import shutils
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 import pandas as pd
@@ -81,7 +82,6 @@ def init_leaderboard(dataframe, track):
         interactive=False,
     )
 
-submitted_predictions = {}
 def process_json(temp_file):
     if temp_file is None:
         return {}
@@ -92,11 +92,9 @@ def process_json(temp_file):
         if file_path.endswith('.gz'):
             with gzip.open(file_path, 'rt') as f:
                 data = json.load(f)
-                submitted_predictions.update(data)
         else:
             with open(file_path, 'r') as f:
                 data = json.load(f)
-                submitted_predictions.update(data)
     except Exception as e:
         raise gr.Error(f"Error processing file: {str(e)}")
 
@@ -164,7 +162,8 @@ with demo:
 
         with gr.Row():
             with gr.Column():
-                model_name_textbox = gr.Textbox(label="Model name")
+                model_name_textbox = gr.Textbox(label="Model name. This will be displayed on the leaderboard.")
+                model_id_textbox = gr.Textbox(label="Huggingface model ID (if applicable). This looks like `owner/repo_id`, not like a URL.", placeholder="")
                 revision_name_textbox = gr.Textbox(label="Model revision commit", placeholder="main")
                 track_name = gr.Dropdown(
                     choices = ["strict", "strict-small", "multimodal"],
@@ -174,12 +173,12 @@ with demo:
                     interactive=True
                 )
 
+                predictions_data = gr.State()
                 upload_button = gr.UploadButton(label="Upload predictions", file_types=[".json", ".gz"], file_count="single")
-                output_json = gr.JSON(label="Processed JSON")
                 upload_button.upload(
                     fn=process_json,
                     inputs=upload_button,
-                    outputs=output_json,
+                    outputs=predictions_data,
                     api_name="upload_json"
                 )
 
@@ -189,9 +188,10 @@ with demo:
            add_new_eval,
            [
                model_name_textbox,
+               model_id_textbox,
                revision_name_textbox,
                track_name,
-               upload_button,
+               predictions_data,
            ],
            submission_result,
        )
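The net effect of the app.py changes is that uploaded predictions no longer accumulate in a module-level dict shared by every visitor: `process_json` now returns the parsed payload into a per-session `gr.State`, which is then passed to `add_new_eval` alongside the new model-ID textbox. Below is a minimal, self-contained sketch of that wiring; the helper name `load_predictions` and its file handling are illustrative, not the app's exact code.

import json, gzip
import gradio as gr

def load_predictions(temp_file):
    # Parse the uploaded .json / .json.gz file; the returned dict lands in the
    # gr.State below, scoped to the current session rather than shared globally.
    if temp_file is None:
        return {}
    path = temp_file if isinstance(temp_file, str) else temp_file.name
    opener = gzip.open if path.endswith(".gz") else open
    with opener(path, "rt") as f:
        return json.load(f)

with gr.Blocks() as demo:
    predictions_data = gr.State()  # holds the parsed predictions for this session only
    upload_button = gr.UploadButton("Upload predictions", file_types=[".json", ".gz"], file_count="single")
    upload_button.upload(fn=load_predictions, inputs=upload_button, outputs=predictions_data)

Because the parsed predictions live in `gr.State`, two users uploading at the same time can no longer overwrite each other's submissions, which was a risk with the old module-level `submitted_predictions` dict.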
src/about.py CHANGED
@@ -61,4 +61,5 @@ Make sure your model has an open license! This is a leaderboard that is meant to
 
 CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
+Stay tuned!
 """
src/display/formatting.py CHANGED
@@ -2,8 +2,8 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def make_clickable_model(model_name):
-    link = f"https://huggingface.co/{model_name}"
+def make_clickable_model(model_repo, model_name):
+    link = f"https://huggingface.co/{model_repo}"
     return model_hyperlink(link, model_name)
 
 
src/display/utils.py CHANGED
@@ -25,6 +25,7 @@ auto_eval_column_dict = []
 auto_eval_column_dict_multimodal = []
 # Init
 auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 #Scores
 for task in Tasks:
@@ -35,6 +36,7 @@ auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Avai
 auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
 auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
 auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
 for task in TasksMultimodal:
     auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
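For readers unfamiliar with the leaderboard template, the positional arguments in the new `ColumnContent("HF Repo", "str", False)` entries correspond, in the stock template, roughly to (name, type, displayed_by_default, hidden, never_hidden). The dataclass itself is not part of this diff, so the sketch below is an assumption based on the common Hugging Face leaderboard template, not this repo's exact definition.

from dataclasses import dataclass

# Assumed field layout (mirrors the widely used leaderboard template); under
# this reading, "HF Repo" is a string column that is not displayed by default.
@dataclass(frozen=True)
class ColumnContent:
    name: str                    # column header shown in the UI
    type: str                    # "str", "number", "markdown", ...
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False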
src/leaderboard/read_evals.py CHANGED
@@ -17,7 +17,8 @@ class EvalResult:
     """Represents one full evaluation. Built from a combination of the result and request file for a given run.
     """
     eval_name: str # org_model_track (uid)
-    full_model: str # org/model (path on hub)
+    full_model: str # org/model (name of model)
+    repo_id: str # org/model (path to model on HF)
     track: str
     org: str
     model: str
@@ -37,6 +38,7 @@ class EvalResult:
 
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
+        repo_id = config.get("hf_repo", config.get("hf_repo", None))
         org_and_model = org_and_model.split("/", 1)
 
         if len(org_and_model) == 1:
@@ -49,7 +51,7 @@ class EvalResult:
         eval_name = "_".join(org_and_model) + f"_{track}"
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            repo_id, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
 
         def _get_task_results(task):
@@ -80,6 +82,7 @@ class EvalResult:
         return self(
             eval_name=eval_name,
             full_model=full_model,
+            repo_id=repo_id,
             track=track,
             org=org,
             model=model,
@@ -103,10 +106,16 @@ class EvalResult:
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         eval_column = AutoEvalColumnMultimodal if self.track.lower() == "multimodal" else AutoEvalColumn
         vision_tasks = ("VQA", "Winoground", "DevBench", "vqa", "winoground", "devbench")
-        text_average = sum([v for k, v in self.results.items() if v is not None and k not in vision_tasks]) / len(Tasks)
+        num_text_tasks = len(Tasks)
+        text_average = sum([v for k, v in self.results.items() if v is not None and k not in vision_tasks]) / num_text_tasks
+        if self.still_on_hub:
+            model_display_name = make_clickable_model(self.full_model)
+        else:
+            model_display_name = self.full_model
         data_dict = {
             "eval_name": self.eval_name, # not a column, just a save name,
-            eval_column.model.name: make_clickable_model(self.full_model),
+            eval_column.model.name: model_display_name,
+            eval_column.hf_repo.name: self.repo_id,
             eval_column.revision.name: self.revision,
             eval_column.text_average.name: text_average,
             eval_column.still_on_hub.name: self.still_on_hub,
@@ -114,7 +123,8 @@ class EvalResult:
 
         if self.track.lower() == "multimodal":
             taskset = TasksMultimodal
-            vision_average = sum([v for k, v in self.results.items() if v is not None and k in vision_tasks]) / len(Tasks)
+            num_vision_tasks = len(TasksMultimodal) - len(Tasks)
+            vision_average = sum([v for k, v in self.results.items() if v is not None and k in vision_tasks]) / num_vision_tasks
             data_dict[eval_column.vision_average.name] = vision_average
         else:
             taskset = Tasks
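The averaging fix is the substantive change here: the text average still divides by `len(Tasks)`, but the vision average now divides by the number of vision-only tasks, `len(TasksMultimodal) - len(Tasks)`, instead of `len(Tasks)`. A worked toy example with made-up scores and a hypothetical 4-text / 3-vision task split:

# Hypothetical scores; only the denominators illustrate the fix.
vision_tasks = ("vqa", "winoground", "devbench")
results = {"blimp": 0.8, "ewok": 0.6, "glue": 0.7, "blimp_supplement": 0.5,
           "vqa": 0.4, "winoground": 0.6, "devbench": 0.5}

num_text_tasks = 4        # stands in for len(Tasks)
num_vision_tasks = 7 - 4  # stands in for len(TasksMultimodal) - len(Tasks)

text_average = sum(v for k, v in results.items() if k not in vision_tasks) / num_text_tasks    # 2.6 / 4 = 0.65
vision_average = sum(v for k, v in results.items() if k in vision_tasks) / num_vision_tasks    # 1.5 / 3 = 0.50
# The old code divided the vision sum by len(Tasks) (4 here), giving 0.375 instead of 0.50.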
src/populate.py CHANGED
@@ -41,8 +41,12 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            if data["still_on_hub"]:
+                data[EvalQueueColumn.model.name] = make_clickable_model(data["hf_repo"], data["model"])
+                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            else:
+                data[EvalQueueColumn.model.name] = data["model"]
+                data[EvalQueueColumn.revision.name] = "N/A"
 
             all_evals.append(data)
         elif ".md" not in entry:
src/submission/submit.py CHANGED
@@ -16,6 +16,7 @@ USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
     model_name: str,
+    model_id: str,
     revision: str,
     track: str,
     predictions: dict,
@@ -25,6 +26,8 @@
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+    out_message = ""
+
     user_name = ""
     model_path = model_name
     if "/" in model_name:
@@ -42,19 +45,20 @@
 
     # Is the model info correctly filled?
     try:
-        model_info = API.model_info(repo_id=model_name, revision=revision)
+        model_info = API.model_info(repo_id=model_id, revision=revision)
     except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
+        return styled_warning("Could not get your model information. Please fill it up properly.")
 
     modelcard_OK, error_msg = check_model_card(model_name)
     if not modelcard_OK:
-        return styled_error(error_msg)
+        return styled_warning(error_msg)
 
     # Seems good, creating the eval
     print("Adding new eval")
 
     eval_entry = {
         "model_name": model_name,
+        "hf_repo": model_id,
         "revision": revision,
         "track": track,
         "predictions": predictions,