yangheng committed · Commit e23643e · 1 Parent(s): a89f84e
.gitignore CHANGED
@@ -5,7 +5,6 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/
-.idea/
 
 eval-queue/
 eval-results/
app.py CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 
 from src.about import (
@@ -14,15 +13,17 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
-    COLS,
+    RGB_BENCHMARK_COLS, PGB_BENCHMARK_COLS,
+    GUE_BENCHMARK_COLS, GB_BENCHMARK_COLS,
+    RGB_COLS, PGB_COLS, GUE_COLS, GB_COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    AutoEvalColumn,
+    AutoEvalColumnRGB, AutoEvalColumnPGB,
+    AutoEvalColumnGUE, AutoEvalColumnGB,
     ModelType,
-    fields,
+    Precision,
     WeightType,
-    Precision
+    fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,24 +33,39 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
+"""
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
+"""
 
-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+RGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/RGB/", EVAL_REQUESTS_PATH+"/RGB/", RGB_COLS, RGB_BENCHMARK_COLS)
+PGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/PGB/", EVAL_REQUESTS_PATH+"/PGB/", PGB_COLS, PGB_BENCHMARK_COLS)
+GUE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/GUE/", EVAL_REQUESTS_PATH+"/GUE/", GUE_COLS, GUE_BENCHMARK_COLS)
+GB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/GB/", EVAL_REQUESTS_PATH+"/GB/", GB_COLS, GB_BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -57,7 +73,8 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+
+def init_leaderboard(dataframe, AutoEvalColumn):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -95,18 +112,22 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("RGB Benchmark", elem_id="rgb-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-        # with gr.TabItem("PGB Benchmark", elem_id="pgb-benchmark-tab-table", id=0):
-        #     leaderboard1 = init_leaderboard(LEADERBOARD_DF)
-        # with gr.TabItem("GUE Benchmark", elem_id="gue-benchmark-tab-table", id=0):
-        #     leaderboard2 = init_leaderboard(LEADERBOARD_DF)
-        # with gr.TabItem("GB Benchmark", elem_id="gb-benchmark-tab-table", id=0):
-        #     leaderboard3 = init_leaderboard(LEADERBOARD_DF)
-        # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-        #     gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=3):
+        with gr.TabItem("RGB", elem_id="rgb-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(RGB_LEADERBOARD_DF, AutoEvalColumnRGB)
+
+        with gr.TabItem("PGB", elem_id="pgb-benchmark-tab-table", id=1):
+            leaderboard2 = init_leaderboard(PGB_LEADERBOARD_DF, AutoEvalColumnPGB)
+
+        with gr.TabItem("GUE", elem_id="gue-benchmark-tab-table", id=2):
+            leaderboard3 = init_leaderboard(GUE_LEADERBOARD_DF, AutoEvalColumnGUE)
+
+        with gr.TabItem("GB", elem_id="gb-benchmark-tab-table", id=3):
+            leaderboard4 = init_leaderboard(GB_LEADERBOARD_DF, AutoEvalColumnGB)
+
+        with gr.TabItem("📝 About", elem_id="rgb-benchmark-tab-table", id=4):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=5):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -160,6 +181,7 @@ with demo:
                         value=None,
                         interactive=True,
                     )
+
                 with gr.Column():
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
@@ -205,4 +227,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
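The tab block above repeats one pairing four times: a per-benchmark DataFrame plus its matching column schema, handed to `init_leaderboard`. A minimal sketch of that pattern as a loop, assuming the names this commit introduces in app.py; the loop itself is illustrative, not part of the commit.

```python
# Illustrative only, not part of the commit: the four tabs above follow one
# pattern, so the body of the existing `with gr.Tabs(...)` block in app.py
# could equally be generated from a table of (label, dataframe, column schema).
benchmark_tabs = [
    ("RGB", RGB_LEADERBOARD_DF, AutoEvalColumnRGB),
    ("PGB", PGB_LEADERBOARD_DF, AutoEvalColumnPGB),
    ("GUE", GUE_LEADERBOARD_DF, AutoEvalColumnGUE),
    ("GB", GB_LEADERBOARD_DF, AutoEvalColumnGB),
]
for tab_id, (label, df, column_schema) in enumerate(benchmark_tabs):
    with gr.TabItem(label, elem_id=f"{label.lower()}-benchmark-tab-table", id=tab_id):
        init_leaderboard(df, column_schema)
```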
src/about.py CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -10,8 +11,8 @@ class Task:
 
 # Select your tasks here
 # ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+class TasksRGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
     task1 = Task("SNMD", "AUC", "SNMD (AUC)")
     task2 = Task("SNMR", "F1", "SNMR (F1)")
@@ -19,72 +20,101 @@ class Tasks(Enum):
     task4 = Task("bpRNA", "F1", "bpRNA (F1)")
     task5 = Task("RNAStralign", "F1", "RNAStralign (F1)")
 
-NUM_FEWSHOT = 0 # Change with your few shot
+class TasksPGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("PolyA", "F1", "PolyA (F1)")
+    task1 = Task("LncRNA", "F1", "LncRNA (F1)")
+    task2 = Task("Chrom Acc", "F1", "Chrom Acc (F1)")
+    task3 = Task("Prom Str", "RMSE", "Prom Str (RMSE)")
+    task4 = Task("Term Str", "RMSE", "Term Str (RMSE)")
+    task5 = Task("Splice", "F1", "Splice (F1)")
+    task6 = Task("Gene Exp", "RMSE", "Gene Exp (RMSE)")
+    task7 = Task("Enhancer", "F1", "Enhancer (F1)")
+
+class TasksGUE(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("Yeast EMP", "F1", "Yeast EMP (F1)")
+    task1 = Task("Mouse TF-M", "F1", "Mouse TF-M (F1)")
+    task2 = Task("Virus CVC", "F1", "Virus CVC (F1)")
+    task3 = Task("Human TF-H", "F1", "Human TF-H (F1)")
+    task4 = Task("Human PD", "F1", "Human PD (F1)")
+    task5 = Task("Human CPD", "F1", "Human CPD (F1)")
+    task6 = Task("Human SSP", "F1", "Human SSP (F1)")
+
+class TasksGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("DEM", "F1", "DEM (F1)")
+    task1 = Task("DOW", "F1", "DOW (F1)")
+    task2 = Task("DRE", "F1", "DRE (F1)")
+    task3 = Task("DME", "F1", "DME (F1)")
+    task4 = Task("HCE", "F1", "HCE (F1)")
+    task5 = Task("HEE", "F1", "HEE (F1)")
+    task6 = Task("HRE", "F1", "HRE (F1)")
+    task7 = Task("HNP", "F1", "HNP (F1)")
+    task8 = Task("HOR", "F1", "HOR (F1)")
+
+
+NUM_FEWSHOT = 0  # Change with your few shot
 # ---------------------------------------------------
 
-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">OmniGenomeBench Leaderboard</h1>"""
+TITLE = """<h1 align="center" id="space-title">Genomic Modelling Leaderboard</h1>"""
 
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+"""
 
+# Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Why do we need this benchmark?
-Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.
-As key parts of biology, such as DNA, RNA sequences, and secondary structures, have a large effect on each other,
-the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.
+Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.
+As key parts of biology, such as DNA, RNA sequences, secondary structures, have a large effect on each other, the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.
 However, with this trend comes significant issues, the primary one being the difficulty to comprehensively evaluate these models and compare them fairly.
 Here, we refer to the specific lack of real-world data to reflect the true performance of the models, rather than in-silico experiments only.
 This issue forces repeated benchmark testing and models being trained and adapted for a specific task that may not have any real-world benefit.
-Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets,
-to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.
-
+Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets, to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.
 ## Evaluation Datasets
 TODO HERE
-
 ## Reported Scores and Ranking
 TODO HERE
-
 ## How it works
 Do we need this?
-
 ## Reproducibility
 To reproduce our results, here are the commands you can run:
 """
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
-
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-
-If this step fails, follow the error messages to debug your model before submitting it.
-It's likely your model has been improperly uploaded.
-Note: make sure your model is public! Note: if your model needs `use_remote_code=True',
-we do not support this option yet but we are working on adding it, stay posted!
-
 ```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
-CITATION_BUTTON_TEXT = """
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
 @article{Yang2024,
-  author = {Yang, Heng and Li, Ke},
-  title = {OmniGenome: Aligning {RNA} Sequences with Secondary Structures in Genomic Foundation Models},
-  journal= {CoRR},
-  volume = {abs/2407.11242},
-  year = {2024}
+  author = {Yang, Heng and Li, Ke},
+  title = {Foundation Models Work},
+  journal = {arXiv},
+  year = {2024},
+  note = {arXiv preprint arXiv:XXXX.XXXXX}
+  url = {https://arxiv.org/abs/XXXX.XXXXX}
 }
 """
-
-INTRODUCTION_TEXT = """
-## What does your leaderboard evaluate?
-The deciphering of RNA and DNA genomes has been ongoing for decades, with the aim of advancing genome analysis, including understanding and synthesizing genomes.
-Recently, Genomic Foundation Models (GFMs) have emerged as powerful tools for genome analysis and manipulation, leveraging advancements in natural language processing to model the "genomic language" encoded in genomes.
-However, GFMs face two significant challenges: the lack of benchmarking tools and open-source software for diverse genomics.
-This hinders progress in various genomic tasks, such as RNA design and structure prediction.
-"""
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -1,9 +1,9 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
-import pandas as pd
 
-from src.about import Tasks
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
+
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -20,28 +20,37 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
-auto_eval_column_dict = []
-# Init
-auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
-auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
-#Scores
-auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
-for task in Tasks:
-    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+auto_eval_columns = []
+for eval_col in [TasksRGB, TasksPGB, TasksGUE, TasksGB]:
+
+    auto_eval_column_dict = []
+    # Init
+    auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+    auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+    # Scores
+    auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
+    for task in eval_col:
+        auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # Model information
+    auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+    auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+    auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+    auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+    auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+    auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+    auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+    auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+    auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+    auto_eval_columns.append(auto_eval_column_dict)
 
 # We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+AutoEvalColumnRGB = make_dataclass("AutoEvalColumn", auto_eval_columns[0], frozen=True)
+AutoEvalColumnPGB = make_dataclass("AutoEvalColumn", auto_eval_columns[1], frozen=True)
+AutoEvalColumnGUE = make_dataclass("AutoEvalColumn", auto_eval_columns[2], frozen=True)
+AutoEvalColumnGB = make_dataclass("AutoEvalColumn", auto_eval_columns[3], frozen=True)
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -53,12 +62,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = "" # emoji
+    symbol: str = ""  # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +93,13 @@ class ModelType(Enum):
         return ModelType.IFT
     return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,11 +112,17 @@ class Precision(Enum):
         return Precision.bfloat16
     return Precision.Unknown
 
+
 # Column selection
-COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
+RGB_COLS = [c.name for c in fields(AutoEvalColumnRGB) if not c.hidden]
+PGB_COLS = [c.name for c in fields(AutoEvalColumnPGB) if not c.hidden]
+GUE_COLS = [c.name for c in fields(AutoEvalColumnGUE) if not c.hidden]
+GB_COLS = [c.name for c in fields(AutoEvalColumnGB) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-
+RGB_BENCHMARK_COLS = [t.value.col_name for t in TasksRGB]
+PGB_BENCHMARK_COLS = [t.value.col_name for t in TasksPGB]
+GUE_BENCHMARK_COLS = [t.value.col_name for t in TasksGUE]
+GB_BENCHMARK_COLS = [t.value.col_name for t in TasksGB]
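`make_dataclass` is what turns each list of `[attr_name, type, default]` triples into an `AutoEvalColumn*` class whose class attributes are `ColumnContent` instances, which `fields()` then reads back out of `__dict__`. A self-contained sketch of that mechanism; the frozen `ColumnContent` stand-in and the toy columns are illustrative only:

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:  # frozen stand-in, so instances can serve as defaults
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


# Each [attr_name, type, default] entry becomes a class attribute holding a
# ColumnContent instance, exactly as in the auto_eval_column_dict lists above.
DemoColumn = make_dataclass(
    "DemoColumn",
    [
        ["model", ColumnContent, ColumnContent("Model", "markdown", True)],
        ["average", ColumnContent, ColumnContent("Rank", "number", True)],
    ],
    frozen=True,
)

print(DemoColumn.model.name)    # "Model"
print(DemoColumn.average.type)  # "number"
```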
src/envs.py CHANGED
@@ -4,17 +4,19 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN") # A read/write token for your org
+TOKEN = os.environ.get("TOKEN")  # A read/write token for your org
 
-OWNER = "yangheng" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = (
+    "yangheng"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/OmniGenomeLeaderboard"
+REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
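With `HF_HOME` unset, `CACHE_PATH` falls back to the current directory, so the two local caches land next to the app, which is what the `.gitignore` entries above account for. A quick check of how the paths resolve:

```python
import os

# With HF_HOME unset, CACHE_PATH falls back to "." and both local caches resolve
# next to the app, matching the eval-queue/ and eval-results/ entries in .gitignore.
CACHE_PATH = os.getenv("HF_HOME", ".")
print(os.path.join(CACHE_PATH, "eval-queue"))    # ./eval-queue
print(os.path.join(CACHE_PATH, "eval-results"))  # ./eval-results
```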
src/leaderboard/read_evals.py CHANGED
@@ -1,39 +1,41 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
+import re
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumnRGB, AutoEvalColumnPGB,\
+    AutoEvalColumnGUE, AutoEvalColumnGB, ModelType, Precision, WeightType
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-    """
-    eval_name: str # org_model_precision (uid)
-    full_model: str # org/model (path on hub)
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str  # org_model_precision (uid)
+    full_model: str  # org/model (path on hub)
+    org: str
     model: str
-    revision: str # commit hash, "" if main
+    revision: str  # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
-    weight_type: WeightType = WeightType.Original # Original or Adapter
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = "" # submission date of request file
+    date: str = ""  # submission date of request file
     still_on_hub: bool = False
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, Tasks):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -75,7 +77,7 @@ class EvalResult:
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-            if task.benchmark == "mRNA":
+            if task.metric == "RMSE":
                 # Keep RMSE at original value
                 mean_acc = np.mean(accs)
             else:
@@ -88,10 +90,10 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision= config.get("model_sha", ""),
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
@@ -108,9 +110,11 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}")
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
-    def to_dict(self, rank):
+    def to_dict(self, rank, AutoEvalColumn, Tasks):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = rank
         # average = sorted(average, reverse=True)
@@ -154,10 +158,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             # print("Request File: ", tmp_request_file)
             # print("Req Content: ", req_content)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
@@ -168,6 +169,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
+        print(f"Files {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
@@ -176,14 +178,21 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
-
        for file in files:
            model_result_filepaths.append(os.path.join(root, file))
 
    eval_results = {}
+    print(f"Filepaths: {model_result_filepaths}")
    for model_result_filepath in model_result_filepaths:
        # Creation of result
-        eval_result = EvalResult.init_from_json_file(model_result_filepath)
+        if "RGB" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksRGB)
+        elif "PGB" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksPGB)
+        elif "GUE" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksGUE)
+        else:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksGB)
        eval_result.update_with_request_file(requests_path)
 
        # Store results of same eval together
@@ -197,10 +206,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for result in eval_results.values():
         result.average = np.mean(list(result.results.values()))
     sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
-
-    for i,v in enumerate(sorted_results):
+    print(f"SORTED RESULTS HERE: \n{sorted_results}")
+    for i, v in enumerate(sorted_results):
         try:
-            v.to_dict(i) # we test if the dict version is complete
+            # we test if the dict version is complete
+            if "RGB" in results_path:
+                v.to_dict(i, AutoEvalColumnRGB, TasksRGB)
+            elif "PGB" in results_path:
+                v.to_dict(i, AutoEvalColumnPGB, TasksPGB)
+            elif "GUE" in results_path:
+                v.to_dict(i, AutoEvalColumnGUE, TasksGUE)
+            else:
+                v.to_dict(i, AutoEvalColumnGB, TasksGB)
             results.append(v)
         except KeyError: # not all eval values present
             continue
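Both the result parsing and the `to_dict` calls above pick a `Tasks` enum and an `AutoEvalColumn*` schema from a substring of `results_path`. A hedged sketch of that selection written once as a helper, using the names this commit imports; the helper itself is not part of the commit:

```python
# Illustrative only, not part of the commit: the repeated if/elif chains above,
# expressed as a single lookup keyed on a substring of results_path.
from src.about import TasksGB, TasksGUE, TasksPGB, TasksRGB
from src.display.utils import (
    AutoEvalColumnGB,
    AutoEvalColumnGUE,
    AutoEvalColumnPGB,
    AutoEvalColumnRGB,
)

BENCHMARK_DISPATCH = {
    "RGB": (TasksRGB, AutoEvalColumnRGB),
    "PGB": (TasksPGB, AutoEvalColumnPGB),
    "GUE": (TasksGUE, AutoEvalColumnGUE),
}


def select_benchmark(results_path: str):
    for key, pair in BENCHMARK_DISPATCH.items():
        if key in results_path:
            return pair
    return TasksGB, AutoEvalColumnGB  # fallback, mirroring the else branches above
```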
src/populate.py CHANGED
@@ -1,16 +1,20 @@
 import json
 import os
+
 import numpy as np
 import pandas as pd
 
-
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
+from src.display.utils import EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import AutoEvalColumnRGB, AutoEvalColumnPGB,\
+    AutoEvalColumnGUE, AutoEvalColumnGB
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    print(f"RESULTS PATH: {results_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
     for result in raw_data:
         result.average = np.mean(list(result.results.values()))
@@ -18,10 +22,20 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     print(sorted_results)
     # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
     # rank = [rank+1 for rank, value in enumerate(average)]
-    all_data_json = [v.to_dict(i+1) for i, v in enumerate(raw_data)]
+    if "RGB" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnRGB, TasksRGB) for i, v in enumerate(raw_data)]
+    elif "PGB" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnPGB, TasksPGB) for i, v in enumerate(raw_data)]
+    elif "GUE" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnGUE, TasksGUE) for i, v in enumerate(raw_data)]
+    else:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnGB, TasksGB) for i, v in enumerate(raw_data)]
+    # all_data_json = [v.to_dict(i + 1) for i, v in enumerate(raw_data)]
 
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    print(f"Cols: {cols}")
+    print(f"DF: {df}")
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
@@ -34,8 +48,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
-
+    print(entries)
+    entries = [entry for entry in entries if not entry.startswith(".")]
+    print(entries)
     for entry in entries:
+        print(entries)
         if ".json" in entry:
             file_path = os.path.join(save_path, entry)
             with open(file_path) as fp:
@@ -47,15 +64,15 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
-            for sub_entry in sub_entries:
-                file_path = os.path.join(save_path, entry, sub_entry)
-                with open(file_path) as fp:
-                    data = json.load(fp)
+            entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            # for sub_entry in sub_entries:
+            #     file_path = os.path.join(save_path, entry, sub_entry)
+            #     with open(file_path) as fp:
+            #         data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-                all_evals.append(data)
+            # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # all_evals.append(data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
@@ -63,4 +80,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished[cols], df_running[cols], df_pending[cols]
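`get_leaderboard_df` ultimately feeds on one JSON file per run, read by `init_from_json_file` via `config.get("model_sha")` and `data["results"][<Task.benchmark>][<Task.metric>]`. A hypothetical RGB result file in that shape, assuming the standard leaderboard-template layout; the model id and all scores are made up:

```python
# Hypothetical contents of a result file under eval-results/RGB/ (values made up).
# The layout mirrors what init_from_json_file reads: a "config" block plus a
# "results" dict keyed by Task.benchmark and then Task.metric.
example_rgb_result = {
    "config": {
        "model_dtype": "torch.float16",
        "model_name": "some-org/some-model",  # hypothetical model id
        "model_sha": "main",
    },
    "results": {
        "mRNA": {"RMSE": 0.72},
        "SNMD": {"AUC": 0.64},
        "SNMR": {"F1": 0.58},
    },
}
```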
src/submission/check_validity.py CHANGED
@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
 from huggingface_hub import ModelCard
@@ -10,6 +8,7 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -31,31 +30,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+                tk = AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
                 return (
                     False,
-                    f"uses a tokenizer which is not in a transformers release: {e}",
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
        )
 
-    except Exception as e:
+    except Exception:
         return False, "was not found on hub!", None
 
 
@@ -70,10 +76,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
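`is_model_on_hub` returns a `(found, message, config)` triple whose message is meant to be appended after the model name. A short usage sketch mirroring `add_new_eval` in src/submission/submit.py; the model id is a placeholder:

```python
from src.envs import TOKEN
from src.submission.check_validity import is_model_on_hub

# The second element of the returned triple is an error fragment that callers
# append after the model name, as add_new_eval does for base models.
on_hub, error, _ = is_model_on_hub(
    model_name="some-org/some-model", revision="main", token=TOKEN, test_tokenizer=True
)
if not on_hub:
    print(f'Model "some-org/some-model" {error}')
```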
src/submission/submit.py CHANGED
@@ -3,17 +3,13 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
-from src.submission.check_validity import (
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -45,7 +41,9 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
+        base_model_on_hub, error, _ = is_model_on_hub(
+            model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
+        )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 