Aaron Mueller committed
Commit de60bd6 • 1 Parent(s): b166dfb

update leaderboard

app.py CHANGED
@@ -15,6 +15,7 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
+    BENCHMARK_COLS_MULTIMODAL,
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
@@ -50,6 +51,7 @@ except Exception:


 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS_MULTIMODAL)

 (
     finished_eval_queue_df,
@@ -57,9 +59,11 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    # filter for correct track
+    dataframe = dataframe.loc[dataframe["track"] == track]
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -95,13 +99,17 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+        with gr.TabItem("Strict Leaderboard", elem_id="strict-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
+        with gr.TabItem("Strict-small Leaderboard", elem_id="strict-small-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
+        with gr.TabItem("Multimodal Leaderboard", elem_id="multimodal-benchmark-tab-table", id=2):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -142,36 +150,20 @@ with demo:
                                 row_count=5,
                             )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+                gr.Markdown("# ✉️✨ Submit your predictions here!", elem_classes="markdown-text")

             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    predictions_path_textbox = gr.Textbox(label="URL to predictions file")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
+                    track_name = gr.Dropdown(
+                        choices = ["Strict", "Strict-small", "Multimodal"],
+                        label = "Track",
                         multiselect=False,
                         value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
+                        interactive=True
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

             submit_button = gr.Button("Submit Eval")
             submission_result = gr.Markdown()
@@ -179,11 +171,9 @@ with demo:
                 add_new_eval,
                 [
                     model_name_textbox,
-                    base_model_name_textbox,
+                    predictions_path_textbox,
                     revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
+                    track_name
                 ],
                 submission_result,
             )
@@ -201,4 +191,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
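A note on the new track plumbing: the submit-tab dropdown offers the capitalized labels "Strict", "Strict-small" and "Multimodal", while `init_leaderboard` filters on the lowercase values "strict", "strict-small" and "multimodal". Below is a minimal sketch, assuming the results store lowercase track names as the filter suggests; the `normalize_track` helper is hypothetical and not part of this commit.

```python
# Sketch only (not part of this commit): bridge the capitalized dropdown labels
# and the lowercase track values used when filtering the leaderboard DataFrame.
import pandas as pd


def normalize_track(label: str) -> str:
    """Map a dropdown label such as 'Strict-small' to the stored value 'strict-small'."""
    return label.strip().lower()


def filter_by_track(dataframe: pd.DataFrame, track: str) -> pd.DataFrame:
    """Keep only rows whose 'track' column matches the requested track (case-insensitive)."""
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return dataframe.loc[dataframe["track"].str.lower() == normalize_track(track)]


if __name__ == "__main__":
    demo_df = pd.DataFrame(
        {"model": ["a", "b", "c"], "track": ["strict", "strict-small", "multimodal"]}
    )
    print(filter_by_track(demo_df, "Strict-small"))  # keeps only model "b"
```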
src/about.py CHANGED
@@ -12,8 +12,19 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
+    task0 = Task("blimp", "acc", "BLiMP")
+    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
+    task2 = Task("glue", "acc", "(Super)GLUE")
+    task3 = Task("ewok", "acc", "EWoK")
+
+class TasksMultimodal(Enum):
+    task0 = Task("blimp", "acc", "BLiMP")
+    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
+    task2 = Task("glue", "acc", "(Super)GLUE")
+    task3 = Task("ewok", "acc", "EWoK")
+    task4 = Task("vqa", "acc", "VQA")
+    task5 = Task("winoground", "acc", "Winoground")
+    task6 = Task("devbench", "acc", "DevBench")

 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,52 +32,38 @@ NUM_FEWSHOT = 0 # Change with your few shot


 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">BabyLM 2024 Leaderboards</h1>"""

 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-Intro text
+The leaderboards for each track of the 2024 BabyLM Challenge.
 """

 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
+This leaderboard accepts predictions files as input, and uploads the results to the leaderboard. The logic is the same as in the `score_predictions.py` script from the BabyLM 2024 evaluation pipeline repository.

 """

 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model

-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+### 1) Make sure you can get scores from your predictions using the `score_predictions.py` script.
+```bash
+git clone https://github.com/babylm/evaluation-pipeline-2024/
+cd evaluation-pipeline-2024
+python score_predictions.py path/to/your/predictions.json.gz
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+If this step fails, follow the error messages to debug your model before submitting it. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.

 ### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!

 ### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
 """

-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
 """
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum

 import pandas as pd

-from src.about import Tasks
+from src.about import Tasks, TasksMultimodal

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -47,10 +47,9 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
     model = ColumnContent("model", "markdown", True)
+    track = ColumnContent("track", "str", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)

 ## All the model information that we might need
@@ -60,46 +59,6 @@ class ModelDetails:
     display_name: str = ""
     symbol: str = "" # emoji

-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

@@ -107,4 +66,4 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
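For reference, the benchmark column lists are derived directly from the task enums via their `col_name` field. The self-contained sketch below mirrors that derivation; the `Task` dataclass fields follow the template's (benchmark, metric, col_name) order.

```python
# Sketch mirroring how BENCHMARK_COLS / BENCHMARK_COLS_MULTIMODAL are built from
# the task enums: each enum member wraps a Task whose col_name is the display column.
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


class Tasks(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")


class TasksMultimodal(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
    task2 = Task("glue", "acc", "(Super)GLUE")
    task3 = Task("ewok", "acc", "EWoK")
    task4 = Task("vqa", "acc", "VQA")
    task5 = Task("winoground", "acc", "Winoground")
    task6 = Task("devbench", "acc", "DevBench")


BENCHMARK_COLS = [t.value.col_name for t in Tasks]
BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]

if __name__ == "__main__":
    print(BENCHMARK_COLS)             # ['BLiMP', 'BLiMP Supplement', '(Super)GLUE', 'EWoK']
    print(BENCHMARK_COLS_MULTIMODAL)  # ...plus 'VQA', 'Winoground', 'DevBench'
```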
src/envs.py CHANGED
@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "babylm" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/leaderboard-2024"
+QUEUE_REPO = f"{OWNER}/requests-2024"
+RESULTS_REPO = f"{OWNER}/results-2024"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
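These settings determine which Hub repositories the Space reads from. A hedged sketch of how they are typically consumed in this leaderboard template follows: the requests and results repos are dataset repositories synced locally with `snapshot_download`. The local directory names `eval-queue` and `eval-results` are template defaults assumed here, not part of this diff.

```python
# Illustrative sketch, not part of this commit: sync the request and result
# dataset repos locally, as the leaderboard template does at startup.
import os

from huggingface_hub import snapshot_download

TOKEN = os.environ.get("HF_TOKEN")
OWNER = "babylm"
QUEUE_REPO = f"{OWNER}/requests-2024"
RESULTS_REPO = f"{OWNER}/results-2024"
CACHE_PATH = os.getenv("HF_HOME", ".")
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")   # assumed template default
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")  # assumed template default

if __name__ == "__main__":
    # Both downloads require a token with read access to the datasets.
    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH,
                      repo_type="dataset", token=TOKEN)
    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH,
                      repo_type="dataset", token=TOKEN)
```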
src/leaderboard/read_evals.py CHANGED
@@ -8,7 +8,7 @@ import dateutil
 import numpy as np

 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, TasksMultimodal, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub


@@ -39,6 +39,7 @@ class EvalResult:
             data = json.load(fp)

         config = data.get("config")
+        track = data.get("track")

         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -154,7 +155,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file


-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, track: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []

@@ -174,6 +175,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu

     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+        with open(model_result_filepath, 'r') as f:
+            this_track = f["track"]
+        if this_track != track:
+            continue
+
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
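One caveat in the new filtering loop: `f` is an open file handle, so `f["track"]` will raise a `TypeError`; the track has to be read from the parsed JSON. A minimal corrected sketch, assuming each results file stores `track` at the top level as the `data.get("track")` line above implies (the helper name is hypothetical, not the committed code):

```python
# Corrected sketch of the per-file track lookup used for filtering.
import json
from typing import Optional


def result_file_track(path: str) -> Optional[str]:
    """Return the 'track' field of a results JSON file, or None if absent."""
    with open(path, "r") as f:
        return json.load(f).get("track")


# Inside get_raw_eval_results, the per-file filter would then read:
#
#     for model_result_filepath in model_result_filepaths:
#         if result_file_track(model_result_filepath) != track:
#             continue
#         ...
```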
src/populate.py CHANGED
@@ -10,7 +10,7 @@ from src.leaderboard.read_evals import get_raw_eval_results

 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, track)
     all_data_json = [v.to_dict() for v in raw_data]

     df = pd.DataFrame.from_records(all_data_json)
@@ -21,7 +21,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df

-
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
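The updated call passes a `track` argument that `get_leaderboard_df` does not currently accept, so as written it would raise a `NameError`. The sketch below (an assumption, not the committed code) shows how the argument could be threaded through, with the matching per-track call from app.py in a comment.

```python
# Sketch only: thread `track` from app.py down to get_raw_eval_results.
import pandas as pd

from src.leaderboard.read_evals import get_raw_eval_results


def get_leaderboard_df(results_path: str, requests_path: str, cols: list,
                       benchmark_cols: list, track: str) -> pd.DataFrame:
    """Creates a dataframe from the experiment results of a single track."""
    raw_data = get_raw_eval_results(results_path, requests_path, track)
    all_data_json = [v.to_dict() for v in raw_data]
    df = pd.DataFrame.from_records(all_data_json)
    # column selection and NaN filtering stay as in the existing function body
    return df


# Matching per-track call in app.py (hypothetical):
# LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH,
#                                     COLS, BENCHMARK_COLS, track="strict")
```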
src/submission/submit.py CHANGED
@@ -15,7 +15,9 @@ REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None

 def add_new_eval(
-    model: str,
+    model_name: str,
+    preds_path: str,
+    track: str,
     base_model: str,
     revision: str,
     precision: str,
@@ -28,10 +30,10 @@
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)

     user_name = ""
-    model_path = model
+    model_path = model_name
     if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
+        user_name = model_name.split("/")[0]
+        model_path = model_name.split("/")[1]

     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -39,6 +41,12 @@
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")

+    if preds_path is None or preds_path == "":
+        return styled_error("Please enter a URL where your predictions file can be downloaded.")
+
+    if track is None:
+        return styled_error("Please select a track.")
+
     # Does the model actually exist?
     if revision == "":
         revision = "main"
@@ -76,7 +84,9 @@
     print("Adding new eval")

     eval_entry = {
-        "model": model,
+        "model_name": model_name,
+        "preds_path": preds_path,
+        "track": track,
         "base_model": base_model,
         "revision": revision,
         "precision": precision,