Clémentine committed
Commit b323764
1 Parent(s): 217b585

Added icons for types + fixed pending queue

app.py CHANGED
@@ -99,7 +99,6 @@ def get_leaderboard_df():
 
 
 def get_evaluation_queue_df():
-    # todo @saylortwift: replace the repo by the one you created for the eval queue
     if eval_queue:
         print("Pulling changes for the evaluation queue.")
         eval_queue.git_pull()
@@ -141,7 +140,7 @@ def get_evaluation_queue_df():
         data["model"] = make_clickable_model(data["model"])
         all_evals.append(data)
 
-    pending_list = [e for e in all_evals if e["status"] == "PENDING"]
+    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED")]
    df_pending = pd.DataFrame.from_records(pending_list, columns=EVAL_COLS)
@@ -388,6 +387,14 @@ with demo:
                     private = gr.Checkbox(
                         False, label="Private", visible=not IS_PUBLIC
                     )
+                    model_type = gr.Dropdown(
+                        choices=["pretrained", "fine-tuned", "with RL"],
+                        label="Model type",
+                        multiselect=False,
+                        value="pretrained",
+                        max_choices=1,
+                        interactive=True,
+                    )
 
                 with gr.Column():
                     precision = gr.Dropdown(
@@ -398,14 +405,6 @@ with demo:
                         max_choices=1,
                         interactive=True,
                     )
-                    model_type = gr.Dropdown(
-                        choices=["pretrained", "fine-tuned", "with RL"],
-                        label="Model type",
-                        multiselect=False,
-                        value="pretrained",
-                        max_choices=1,
-                        interactive=True,
-                    )
                     weight_type = gr.Dropdown(
                         choices=["Original", "Delta", "Adapter"],
                         label="Weights type",
src/assets/hardcoded_evals.py CHANGED
@@ -10,6 +10,7 @@ gpt4_values = {
     AutoEvalColumn.mmlu.name: 86.4,
     AutoEvalColumn.truthfulqa.name: 59.0,
     AutoEvalColumn.dummy.name: "GPT-4",
+    AutoEvalColumn.model_type.name: "",
 }
 
 gpt35_values = {
@@ -22,6 +23,7 @@ gpt35_values = {
     AutoEvalColumn.mmlu.name: 70.0,
     AutoEvalColumn.truthfulqa.name: 47.0,
     AutoEvalColumn.dummy.name: "GPT-3.5",
+    AutoEvalColumn.model_type.name: "",
 }
 
 baseline = {
@@ -34,5 +36,6 @@ baseline = {
     AutoEvalColumn.mmlu.name: 25.0,
     AutoEvalColumn.truthfulqa.name: 25.0,
     AutoEvalColumn.dummy.name: "baseline",
+    AutoEvalColumn.model_type.name: "",
 }
 
src/assets/text_content.py CHANGED
@@ -128,6 +128,13 @@ To get more information about quantization, see:
 - 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
 - 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
 
+### Icons
+🟢 means that the model is pretrained,
+🔶 that it is fine-tuned,
+🟦 that it was trained with RL.
+If there is no icon, we have not uploaded the model's information yet; feel free to open an issue with the model information!
+
+
 # In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
src/auto_leaderboard/load_results.py CHANGED
@@ -26,6 +26,8 @@ class EvalResult:
     revision: str
     results: dict
     precision: str = "16bit"
+    model_type: str = ""
+    weight_type: str = ""
 
     def to_dict(self):
         if self.org is not None:
@@ -35,7 +37,9 @@ class EvalResult:
         data_dict = {}
 
         data_dict["eval_name"] = self.eval_name  # not a column, just a save name
+        data_dict["weight_type"] = self.weight_type  # not a column, just a save name
         data_dict[AutoEvalColumn.precision.name] = self.precision
+        data_dict[AutoEvalColumn.model_type.name] = self.model_type
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
@@ -92,7 +96,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
             continue
         mean_acc = round(np.mean(accs) * 100.0, 1)
         eval_results.append(EvalResult(
-            result_key, org, model, model_sha, {benchmark: mean_acc}
+            eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc},  # todo: model_type=, weight_type=
         ))
 
     return result_key, eval_results
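
A hedged usage sketch of the extended EvalResult: the import path matches this repo's layout, but the field values are invented and the dataclass's leading fields (eval_name, org, model) are assumed from the constructor call above.

    from src.auto_leaderboard.load_results import EvalResult

    result = EvalResult(
        eval_name="org_model_main_16bit",  # invented save name
        org="org",
        model="model",
        revision="main",
        results={"ARC": 45.0},
        model_type="pretrained",   # new field, defaults to ""
        weight_type="Original",    # new field, defaults to ""
    )

    row = result.to_dict()
    # row now carries the type under AutoEvalColumn.model_type.name, and the
    # weight type under the plain "weight_type" key (a save field, not a column)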
src/auto_leaderboard/model_metadata_type.py CHANGED
@@ -2,6 +2,8 @@ from dataclasses import dataclass
 from enum import Enum
 from typing import Dict, List
 
+from ..utils_display import AutoEvalColumn
+
 @dataclass
 class ModelInfo:
     name: str
@@ -167,23 +169,24 @@ TYPE_METADATA: Dict[str, ModelType] = {
 
 def get_model_type(leaderboard_data: List[dict]):
     for model_data in leaderboard_data:
-        # Init
-        model_data["Type name"] = "N/A"
-        model_data["Type"] = ""
-
+        # Todo @clefourrier once requests are connected with results
+        is_delta = False  # (model_data["weight_type"] != "Original")
         # Stored information
         if model_data["model_name_for_query"] in TYPE_METADATA:
-            model_data["Type name"] = TYPE_METADATA[model_data["model_name_for_query"]].value.name
-            model_data["Type"] = TYPE_METADATA[model_data["model_name_for_query"]].value.symbol
-        else:  # Supposed from the name
-            if any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
-                model_data["Type name"] = ModelType.SFT.value.name
-                model_data["Type"] = ModelType.SFT.value.symbol
-            elif any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
-                model_data["Type name"] = ModelType.PT.value.name
-                model_data["Type"] = ModelType.PT.value.symbol
-            elif any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
-                model_data["Type name"] = ModelType.RL.value.name
-                model_data["Type"] = ModelType.RL.value.symbol
+            model_data[AutoEvalColumn.model_type.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = TYPE_METADATA[model_data["model_name_for_query"]].value.symbol + ("🔺" if is_delta else "")
+        # Inferred from the name or the selected type
+        elif model_data[AutoEvalColumn.model_type.name] == "pretrained" or any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
+            model_data[AutoEvalColumn.model_type.name] = ModelType.PT.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.PT.value.symbol + ("🔺" if is_delta else "")
+        elif model_data[AutoEvalColumn.model_type.name] == "finetuned" or any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
+            model_data[AutoEvalColumn.model_type.name] = ModelType.SFT.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.SFT.value.symbol + ("🔺" if is_delta else "")
+        elif model_data[AutoEvalColumn.model_type.name] == "with RL" or any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
+            model_data[AutoEvalColumn.model_type.name] = ModelType.RL.value.name
+            model_data[AutoEvalColumn.model_type_symbol.name] = ModelType.RL.value.symbol + ("🔺" if is_delta else "")
+        else:
+            model_data[AutoEvalColumn.model_type.name] = "N/A"
+            model_data[AutoEvalColumn.model_type_symbol.name] = ("🔺" if is_delta else "")
 
 
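For context, a minimal sketch of the ModelInfo/ModelType pair that get_model_type() relies on (the enum itself is unchanged by this commit, so it does not appear in the diff); the symbols follow the Icons section added to text_content.py, and the exact member names are assumptions:

    from dataclasses import dataclass
    from enum import Enum

    @dataclass
    class ModelInfo:
        name: str
        symbol: str  # emoji rendered in the leaderboard's "T" column

    class ModelType(Enum):
        # Assumed mapping, per the Icons docs: 🟢 pretrained, 🔶 fine-tuned, 🟦 RL
        PT = ModelInfo(name="pretrained", symbol="🟢")
        SFT = ModelInfo(name="fine-tuned", symbol="🔶")
        RL = ModelInfo(name="with RL", symbol="🟦")

    print(ModelType.PT.value.symbol)  # 🟢
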
src/utils_display.py CHANGED
@@ -14,14 +14,14 @@ def fields(raw_class):
 
 @dataclass(frozen=True)
 class AutoEvalColumn:  # Auto evals column
-    model_type_symbol = ColumnContent("Type", "str", True)
+    model_type_symbol = ColumnContent("T", "str", True)
     model = ColumnContent("Model", "markdown", True)
     average = ColumnContent("Average ⬆️", "number", True)
     arc = ColumnContent("ARC", "number", True)
     hellaswag = ColumnContent("HellaSwag", "number", True)
     mmlu = ColumnContent("MMLU", "number", True)
     truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
-    model_type = ColumnContent("Type name", "str", False)
+    model_type = ColumnContent("Type", "str", False)
     precision = ColumnContent("Precision", "str", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
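
ColumnContent itself is not part of this diff; a hedged guess at the container behind these declarations, with field names inferred from how the positional arguments are used above:

    from dataclasses import dataclass

    @dataclass
    class ColumnContent:
        name: str                   # header shown in the table ("T", "Model", ...)
        type: str                   # dataframe datatype ("str", "markdown", "number")
        displayed_by_default: bool  # shown without the user selecting it
        hidden: bool = False        # fourth flag, set only for precision above

The renames keep the always-displayed symbol column narrow ("T") while the full type name presumably moves to the hidden "Type" column.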