Clémentine committed on
Commit
314f91a
1 Parent(s): 1257fc3
src/display/about.py CHANGED
@@ -1,6 +1,5 @@
1
- from src.display.utils import ModelType
2
- from enum import Enum
3
  from dataclasses import dataclass
 
4
 
5
  @dataclass
6
  class Task:
@@ -8,6 +7,7 @@ class Task:
8
  metric: str
9
  col_name: str
10
 
 
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
  task0 = Task("Key in the harness", "metric in the harness", "Display name 1")
 
 
 
1
  from dataclasses import dataclass
2
+ from enum import Enum
3
 
4
  @dataclass
5
  class Task:
 
7
  metric: str
8
  col_name: str
9
 
10
+
11
  # Init: to update with your specific keys
12
  class Tasks(Enum):
13
  task0 = Task("Key in the harness", "metric in the harness", "Display name 1")
src/display/utils.py CHANGED
@@ -8,6 +8,7 @@ from src.display.about import Tasks
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
 
 
11
  # These classes are for user facing column names,
12
  # to avoid having to change them all around the code
13
  # when a modif is needed
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
10
 
11
+
12
  # These classes are for user facing column names,
13
  # to avoid having to change them all around the code
14
  # when a modif is needed
src/leaderboard/filter_models.py DELETED
@@ -1,50 +0,0 @@
1
- from src.display.formatting import model_hyperlink
2
- from src.display.utils import AutoEvalColumn
3
-
4
- # Models which have been flagged by users as being problematic for a reason or another
5
- # (Model name to forum discussion link)
6
- FLAGGED_MODELS = {
7
- "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
8
- "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
9
- "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
10
- "Fredithefish/ReasonixPajama-3B-HF": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/236",
11
- "TigerResearch/tigerbot-7b-sft-v1": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/237",
12
- "gaodrew/gaodrew-gorgonzola-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/215",
13
- "AIDC-ai-business/Marcoroni-70B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
14
- "AIDC-ai-business/Marcoroni-13B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
15
- "AIDC-ai-business/Marcoroni-7B": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/287",
16
- }
17
-
18
- # Models which have been requested by orgs to not be submitted on the leaderboard
19
- DO_NOT_SUBMIT_MODELS = [
20
- "Voicelab/trurl-2-13b", # trained on MMLU
21
- ]
22
-
23
-
24
- def flag_models(leaderboard_data: list[dict]):
25
- for model_data in leaderboard_data:
26
- if model_data["model_name_for_query"] in FLAGGED_MODELS:
27
- issue_num = FLAGGED_MODELS[model_data["model_name_for_query"]].split("/")[-1]
28
- issue_link = model_hyperlink(
29
- FLAGGED_MODELS[model_data["model_name_for_query"]],
30
- f"See discussion #{issue_num}",
31
- )
32
- model_data[
33
- AutoEvalColumn.model.name
34
- ] = f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
35
-
36
-
37
- def remove_forbidden_models(leaderboard_data: list[dict]):
38
- indices_to_remove = []
39
- for ix, model in enumerate(leaderboard_data):
40
- if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
41
- indices_to_remove.append(ix)
42
-
43
- for ix in reversed(indices_to_remove):
44
- leaderboard_data.pop(ix)
45
- return leaderboard_data
46
-
47
-
48
- def filter_models(leaderboard_data: list[dict]):
49
- leaderboard_data = remove_forbidden_models(leaderboard_data)
50
- flag_models(leaderboard_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/populate.py CHANGED
@@ -4,16 +4,13 @@ import os
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
8
- from src.leaderboard.filter_models import filter_models
9
  from src.leaderboard.read_evals import get_raw_eval_results
10
 
11
 
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
- all_data_json.append(baseline_row)
16
- filter_models(all_data_json)
17
 
18
  df = pd.DataFrame.from_records(all_data_json)
19
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
 
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
7
+ from src.display.utils import AutoEvalColumn, EvalQueueColumn
 
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  raw_data = get_raw_eval_results(results_path, requests_path)
13
  all_data_json = [v.to_dict() for v in raw_data]
 
 
14
 
15
  df = pd.DataFrame.from_records(all_data_json)
16
  df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
src/submission/submit.py CHANGED
@@ -3,7 +3,7 @@ import os
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, H4_TOKEN, QUEUE_REPO
7
  from src.submission.check_validity import (
8
  already_submitted_models,
9
  check_model_card,
@@ -45,7 +45,7 @@ def add_new_eval(
45
 
46
  # Is the model on the hub?
47
  if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
49
  if not base_model_on_hub:
50
  return styled_error(f'Base model "{base_model}" {error}')
51
 
 
3
  from datetime import datetime, timezone
4
 
5
  from src.display.formatting import styled_error, styled_message, styled_warning
6
+ from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
  from src.submission.check_validity import (
8
  already_submitted_models,
9
  check_model_card,
 
45
 
46
  # Is the model on the hub?
47
  if weight_type in ["Delta", "Adapter"]:
48
+ base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
  if not base_model_on_hub:
50
  return styled_error(f'Base model "{base_model}" {error}')
51