cdminix committed
Commit 9c2d40e • 1 Parent(s): 60c520b
app.py CHANGED
@@ -22,7 +22,7 @@ from src.display.utils import (
     ModelType,
     fields,
     WeightType,
-    Precision
+    Precision,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,18 +32,29 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
@@ -57,6 +68,7 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
+
 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
@@ -80,125 +92,137 @@ def init_leaderboard(dataframe):
                 max=150,
                 label="Select the number of parameters (B)",
             ),
-            ColumnFilter(
-                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-            ),
+            ColumnFilter(AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True),
         ],
         bool_checkboxgroup_label="Hide models",
         interactive=False,
     )
 
 
-demo = gr.Blocks(css=custom_css)
-with demo:
-    gr.HTML(TITLE)
-    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-
-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
+def show_leaderboard(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None):
+    global demo
+
+    if profile or True:
+        print(f"Logged in as {profile.name}")
+        with demo:
+            gr.HTML(TITLE)
+            gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+
+            with gr.Tabs(elem_classes="tab-buttons") as tabs:
+                with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+                    leaderboard = init_leaderboard(LEADERBOARD_DF)
+
+                with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
+                    gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+                with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
+                    with gr.Column():
                         with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
+                            gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
+
+                        with gr.Column():
+                            with gr.Accordion(
+                                f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
+                                open=False,
+                            ):
+                                with gr.Row():
+                                    finished_eval_table = gr.components.Dataframe(
+                                        value=finished_eval_queue_df,
+                                        headers=EVAL_COLS,
+                                        datatype=EVAL_TYPES,
+                                        row_count=5,
+                                    )
+                            with gr.Accordion(
+                                f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
+                                open=False,
+                            ):
+                                with gr.Row():
+                                    running_eval_table = gr.components.Dataframe(
+                                        value=running_eval_queue_df,
+                                        headers=EVAL_COLS,
+                                        datatype=EVAL_TYPES,
+                                        row_count=5,
+                                    )
+
+                            with gr.Accordion(
+                                f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
+                                open=False,
+                            ):
+                                with gr.Row():
+                                    pending_eval_table = gr.components.Dataframe(
+                                        value=pending_eval_queue_df,
+                                        headers=EVAL_COLS,
+                                        datatype=EVAL_TYPES,
+                                        row_count=5,
+                                    )
+                    with gr.Row():
+                        gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+
+                    with gr.Row():
+                        with gr.Column():
+                            model_name_textbox = gr.Textbox(label="Model name")
+                            revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
+                            model_type = gr.Dropdown(
+                                choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
+                                label="Model type",
+                                multiselect=False,
+                                value=None,
+                                interactive=True,
                             )
 
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
+                        with gr.Column():
+                            precision = gr.Dropdown(
+                                choices=[i.value.name for i in Precision if i != Precision.Unknown],
+                                label="Precision",
+                                multiselect=False,
+                                value="float16",
+                                interactive=True,
                             )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
+                            weight_type = gr.Dropdown(
+                                choices=[i.value.name for i in WeightType],
+                                label="Weights type",
+                                multiselect=False,
+                                value="Original",
+                                interactive=True,
+                            )
+                            base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
+
+                    submit_button = gr.Button("Submit Eval")
+                    submission_result = gr.Markdown()
+                    # submit_button.click(
+                    #     add_new_eval,
+                    #     [
+                    #         model_name_textbox,
+                    #         base_model_name_textbox,
+                    #         revision_name_textbox,
+                    #         precision,
+                    #         weight_type,
+                    #         model_type,
+                    #     ],
+                    #     submission_result,
+                    # )
 
             with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
+                with gr.Accordion("📙 Citation", open=False):
+                    citation_button = gr.Textbox(
+                        value=CITATION_BUTTON_TEXT,
+                        label=CITATION_BUTTON_LABEL,
+                        lines=20,
+                        elem_id="citation-button",
+                        show_copy_button=True,
                     )
 
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
-
-    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False):
-            citation_button = gr.Textbox(
-                value=CITATION_BUTTON_TEXT,
-                label=CITATION_BUTTON_LABEL,
-                lines=20,
-                elem_id="citation-button",
-                show_copy_button=True,
-            )
+
+demo = gr.Blocks(css=custom_css)
+
+with demo:
+    gr.LoginButton()
+    m1 = gr.Markdown("Please login to see the leaderboard.")
+    demo.load(show_leaderboard, inputs=None, outputs=m1)
+
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+# demo.queue(default_concurrency_limit=40).launch()
+demo.launch()
src/display/utils.py CHANGED
@@ -5,6 +5,7 @@ import pandas as pd
 
 from src.about import Tasks
 
+
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
@@ -20,28 +21,24 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
-## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
+@dataclass
+class AutoEvalColumn:
+    model_type_symbol = ColumnContent("model_type_symbol", "str", True, never_hidden=True)
+    model = ColumnContent("model", "markdown", True, never_hidden=True)
+    average = ColumnContent("average", "number", True)
+    anli = ColumnContent("ANLI", "number", True)
+    logiqa = ColumnContent("LogiQA", "number", True)
+    model_type = ColumnContent("model_type", "str", False)
+    architecture = ColumnContent("architecture", "str", False)
+    weight_type = ColumnContent("weight_type", "str", False, True)
+    precision = ColumnContent("precision", "str", False)
+    license = ColumnContent("license", "str", False)
+    params = ColumnContent("#Params (B)", "number", False)
+    likes = ColumnContent("Hub ❤️", "number", False)
+    still_on_hub = ColumnContent("Available on the hub", "bool", False)
+    revision = ColumnContent("Model sha", "str", False, False)
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -53,12 +50,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,18 +81,20 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
     float32 = ModelDetails("float32")
-    #qt_8bit = ModelDetails("8bit")
-    #qt_4bit = ModelDetails("4bit")
-    #qt_GPTQ = ModelDetails("GPTQ")
+    # qt_8bit = ModelDetails("8bit")
+    # qt_4bit = ModelDetails("4bit")
+    # qt_GPTQ = ModelDetails("GPTQ")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -104,14 +104,15 @@ class Precision(Enum):
             return Precision.bfloat16
         if precision in ["float32"]:
            return Precision.float32
-        #if precision in ["8bit"]:
+        # if precision in ["8bit"]:
        #     return Precision.qt_8bit
-        #if precision in ["4bit"]:
+        # if precision in ["4bit"]:
        #     return Precision.qt_4bit
-        #if precision in ["GPTQ", "None"]:
+        # if precision in ["GPTQ", "None"]:
        #     return Precision.qt_GPTQ
        return Precision.Unknown
 
+
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -119,4 +120,3 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
src/envs.py CHANGED
@@ -4,9 +4,9 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
-OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "ttsds"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
 REPO_ID = f"{OWNER}/leaderboard"
@@ -14,7 +14,7 @@ QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/leaderboard/read_evals.py CHANGED
@@ -9,27 +9,26 @@ import numpy as np
 
 from src.display.formatting import make_clickable_model
 from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
-from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
    license: str = "?"
    likes: int = 0
    num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
    still_on_hub: bool = False
 
    @classmethod
@@ -57,15 +56,6 @@ class EvalResult:
         result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
-        still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
-        )
-        architecture = "?"
-        if model_config is not None:
-            architectures = getattr(model_config, "architectures", None)
-            if architectures:
-                architecture = ";".join(architectures)
-
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
@@ -85,10 +75,8 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
-            still_on_hub=still_on_hub,
-            architecture=architecture
+            precision=precision,
+            revision=config.get("model_sha", ""),
         )
 
     def update_with_request_file(self, requests_path):
@@ -105,7 +93,9 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
     def to_dict(self):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
@@ -146,10 +136,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     for tmp_request_file in request_files:
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
@@ -188,7 +175,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
     results = []
     for v in eval_results.values():
         try:
-            v.to_dict() # we test if the dict version is complete
+            v.to_dict()  # we test if the dict version is complete
             results.append(v)
         except KeyError: # not all eval values present
             continue
src/submission/check_validity.py CHANGED
@@ -10,69 +10,6 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
-def check_model_card(repo_id: str) -> tuple[bool, str]:
-    """Checks if the model card and license exist and have been filled"""
-    try:
-        card = ModelCard.load(repo_id)
-    except huggingface_hub.utils.EntryNotFoundError:
-        return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
-
-    # Enforce license metadata
-    if card.data.license is None:
-        if not ("license_name" in card.data and "license_link" in card.data):
-            return False, (
-                "License not found. Please add a license to your model card using the `license` metadata or a"
-                " `license_name`/`license_link` pair."
-            )
-
-    # Enforce card content
-    if len(card.text) < 200:
-        return False, "Please add a description to your model card, it is too short."
-
-    return True, ""
-
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
-    """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
-    try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-        if test_tokenizer:
-            try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
-            except ValueError as e:
-                return (
-                    False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
-                )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
-        return True, None, config
-
-    except ValueError:
-        return (
-            False,
-            "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
-        )
-
-    except Exception as e:
-        return False, "was not found on hub!", None
-
-
-def get_model_size(model_info: ModelInfo, precision: str):
-    """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
-    try:
-        model_size = round(model_info.safetensors["total"] / 1e9, 3)
-    except (AttributeError, TypeError):
-        return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
-
-    size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
-    model_size = size_factor * model_size
-    return model_size
-
-def get_model_arch(model_info: ModelInfo):
-    """Gets the model architecture from the configuration"""
-    return model_info.config.get("architectures", "Unknown")
 
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
@@ -88,7 +25,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
                 continue
             with open(os.path.join(root, file), "r") as f:
                 info = json.load(f)
-                file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
+                file_names.append(f"{info['model']}_{info['revision']}")
 
                 # Select organisation
                 if info["model"].count("/") == 0 or "submitted_time" not in info:
src/submission/submit.py CHANGED
@@ -4,23 +4,16 @@ from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
 from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.submission.check_validity import already_submitted_models
+
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
-    base_model: str,
     revision: str,
-    precision: str,
-    weight_type: str,
-    model_type: str,
 ):
     global REQUESTED_MODELS
     global USERS_TO_SUBMISSION_DATES
@@ -28,76 +21,33 @@ def add_new_eval(
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
     user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
+    model_name = model
 
-    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
-    if model_type is None or model_type == "":
-        return styled_error("Please select a model type.")
-
     # Does the model actually exist?
     if revision == "":
         revision = "main"
 
-    # Is the model on the hub?
-    if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not base_model_on_hub:
-            return styled_error(f'Base model "{base_model}" {error}')
-
-    if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
-        if not model_on_hub:
-            return styled_error(f'Model "{model}" {error}')
-
-    # Is the model info correctly filled?
-    try:
-        model_info = API.model_info(repo_id=model, revision=revision)
-    except Exception:
-        return styled_error("Could not get your model information. Please fill it up properly.")
-
-    model_size = get_model_size(model_info=model_info, precision=precision)
-
-    # Were the model card and license filled?
-    try:
-        license = model_info.cardData["license"]
-    except Exception:
-        return styled_error("Please select a license for your model")
-
-    modelcard_OK, error_msg = check_model_card(model)
-    if not modelcard_OK:
-        return styled_error(error_msg)
-
     # Seems good, creating the eval
     print("Adding new eval")
 
     eval_entry = {
         "model": model,
-        "base_model": base_model,
         "revision": revision,
-        "precision": precision,
-        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
-        "model_type": model_type,
-        "likes": model_info.likes,
-        "params": model_size,
-        "license": license,
        "private": False,
    }
 
    # Check for duplicate submission
-    if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
+    if f"{model}_{revision}" in REQUESTED_MODELS:
        return styled_warning("This model has been already submitted.")
 
    print("Creating eval file")
    OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
    os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
+    out_path = f"{OUT_DIR}/{model_name}_eval_request_False_{precision}_{weight_type}.json"
 
    with open(out_path, "w") as f:
        f.write(json.dumps(eval_entry))