update
- .gitignore +0 -1
- app.py +47 -25
- src/about.py +67 -37
- src/display/utils.py +43 -25
- src/envs.py +6 -4
- src/leaderboard/read_evals.py +45 -28
- src/populate.py +30 -13
- src/submission/check_validity.py +19 -11
- src/submission/submit.py +6 -8
.gitignore
CHANGED
@@ -5,7 +5,6 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/
-.idea/
 
 eval-queue/
 eval-results/
app.py
CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 
 from src.about import (
@@ -14,15 +13,17 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-
-
+    RGB_BENCHMARK_COLS, PGB_BENCHMARK_COLS,
+    GUE_BENCHMARK_COLS, GB_BENCHMARK_COLS,
+    RGB_COLS, PGB_COLS, GUE_COLS, GB_COLS,
     EVAL_COLS,
     EVAL_TYPES,
-
+    AutoEvalColumnRGB, AutoEvalColumnPGB,
+    AutoEvalColumnGUE, AutoEvalColumnGB,
     ModelType,
-
+    Precision,
     WeightType,
-
+    fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,24 +33,39 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
+"""
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO,
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO,
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
+"""
 
-
-
+RGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/RGB/", EVAL_REQUESTS_PATH+"/RGB/", RGB_COLS, RGB_BENCHMARK_COLS)
+PGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/PGB/", EVAL_REQUESTS_PATH+"/PGB/", PGB_COLS, PGB_BENCHMARK_COLS)
+GUE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/GUE/", EVAL_REQUESTS_PATH+"/GUE/", GUE_COLS, GUE_BENCHMARK_COLS)
+GB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/GB/", EVAL_REQUESTS_PATH+"/GB/", GB_COLS, GB_BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -57,7 +73,8 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-
+
+def init_leaderboard(dataframe, AutoEvalColumn):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -95,18 +112,22 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("RGB
-            leaderboard = init_leaderboard(
-
-
-
-
-
-
-
-
-
+        with gr.TabItem("RGB", elem_id="rgb-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(RGB_LEADERBOARD_DF, AutoEvalColumnRGB)
+
+        with gr.TabItem("PGB", elem_id="pgb-benchmark-tab-table", id=1):
+            leaderboard2 = init_leaderboard(PGB_LEADERBOARD_DF, AutoEvalColumnPGB)
+
+        with gr.TabItem("GUE", elem_id="gue-benchmark-tab-table", id=2):
+            leaderboard3 = init_leaderboard(GUE_LEADERBOARD_DF, AutoEvalColumnGUE)
+
+        with gr.TabItem("GB", elem_id="gb-benchmark-tab-table", id=3):
+            leaderboard4 = init_leaderboard(GB_LEADERBOARD_DF, AutoEvalColumnGB)
+
+        with gr.TabItem("📝 About", elem_id="rgb-benchmark-tab-table", id=4):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=5):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -160,6 +181,7 @@ with demo:
                         value=None,
                         interactive=True,
                     )
+
                 with gr.Column():
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
@@ -205,4 +227,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
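The hunk above wraps the Space-initialisation `snapshot_download` calls in a triple-quoted string (disabling them for now) while keeping the usual keyword arguments. Below is a minimal, self-contained sketch of that download-or-restart pattern, assuming only `huggingface_hub`; the repo id and local directory are placeholders, and the real app calls `API.restart_space(...)` on failure instead of returning `None`.

```python
from huggingface_hub import snapshot_download


def mirror_dataset_repo(repo_id: str, local_dir: str, token: str | None = None):
    """Mirror a Hub dataset repo locally, as the disabled block above does."""
    try:
        return snapshot_download(
            repo_id=repo_id,        # e.g. QUEUE_REPO or RESULTS_REPO
            local_dir=local_dir,    # e.g. EVAL_REQUESTS_PATH or EVAL_RESULTS_PATH
            repo_type="dataset",
            etag_timeout=30,
            token=token,
        )
    except Exception:
        # app.py restarts the Space here; a library-style helper just reports failure.
        return None


if __name__ == "__main__":
    # Placeholder repo id, not part of this Space.
    print(mirror_dataset_repo("someuser/requests", "./eval-queue"))
```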
src/about.py
CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -10,8 +11,8 @@ class Task:
 
 # Select your tasks here
 # ---------------------------------------------------
-class
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+class TasksRGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
     task1 = Task("SNMD", "AUC", "SNMD (AUC)")
     task2 = Task("SNMR", "F1", "SNMR (F1)")
@@ -19,72 +20,101 @@ class Tasks(Enum):
     task4 = Task("bpRNA", "F1", "bpRNA (F1)")
     task5 = Task("RNAStralign", "F1", "RNAStralign (F1)")
 
-
+class TasksPGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("PolyA", "F1", "PolyA (F1)")
+    task1 = Task("LncRNA", "F1", "LncRNA (F1)")
+    task2 = Task("Chrom Acc", "F1", "Chrom Acc (F1)")
+    task3 = Task("Prom Str", "RMSE", "Prom Str (RMSE)")
+    task4 = Task("Term Str", "RMSE", "Term Str (RMSE)")
+    task5 = Task("Splice", "F1", "Splice (F1)")
+    task6 = Task("Gene Exp", "RMSE", "Gene Exp (RMSE)")
+    task7 = Task("Enhancer", "F1", "Enhancer (F1)")
+
+class TasksGUE(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("Yeast EMP", "F1", "Yeast EMP (F1)")
+    task1 = Task("Mouse TF-M", "F1", "Mouse TF-M (F1)")
+    task2 = Task("Virus CVC", "F1", "Virus CVC (F1)")
+    task3 = Task("Human TF-H", "F1", "Human TF-H (F1)")
+    task4 = Task("Human PD", "F1", "Human PD (F1)")
+    task5 = Task("Human CPD", "F1", "Human CPD (F1)")
+    task6 = Task("Human SSP", "F1", "Human SSP (F1)")
+
+class TasksGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("DEM", "F1", "DEM (F1)")
+    task1 = Task("DOW", "F1", "DOW (F1)")
+    task2 = Task("DRE", "F1", "DRE (F1)")
+    task3 = Task("DME", "F1", "DME (F1)")
+    task4 = Task("HCE", "F1", "HCE (F1)")
+    task5 = Task("HEE", "F1", "HEE (F1)")
+    task6 = Task("HRE", "F1", "HRE (F1)")
+    task7 = Task("HNP", "F1", "HNP (F1)")
+    task8 = Task("HOR", "F1", "HOR (F1)")
+
+
+NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Genomic Modelling Leaderboard</h1>"""
 
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+"""
 
+# Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Why do we need this benchmark?
-Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.
-As key parts of biology, such as DNA, RNA sequences,
-the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.
+Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.
+As key parts of biology, such as DNA, RNA sequences, secondary structures, have a large effect on each other, the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.
 However, with this trend comes significant issues, the primary one being the difficulty to comprehensively evaluate these models and compare them fairly.
 Here, we refer to the specific lack of real-world data to reflect the true performance of the models, rather than in-silico experiments only.
 This issue forces repeated benchmark testing and models being trained and adapted for a specific task that may not have any real-world benefit.
-Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets,
-to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.
-
+Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets, to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.
 ## Evaluation Datasets
 TODO HERE
-
 ## Reported Scores and Ranking
 TODO HERE
-
 ## How it works
 Do we need this?
-
 ## Reproducibility
 To reproduce our results, here are the commands you can run:
 """
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
-
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-
-If this step fails, follow the error messages to debug your model before submitting it.
-It's likely your model has been improperly uploaded.
-Note: make sure your model is public! Note: if your model needs `use_remote_code=True',
-we do not support this option yet but we are working on adding it, stay posted!
-
 ```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
-
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
 @article{Yang2024,
-
-
-
-
-
+  author = {Yang, Heng and Li, Ke},
+  title = {Foundation Models Work},
+  journal = {arXiv},
+  year = {2024},
+  note = {arXiv preprint arXiv:XXXX.XXXXX}
+  url = {https://arxiv.org/abs/XXXX.XXXXX}
 }
 """
-
-INTRODUCTION_TEXT = """
-## What does your leaderboard evaluate?
-The deciphering of RNA and DNA genomes has been ongoing for decades, with the aim of advancing genome analysis, including understanding and synthesizing genomes.
-Recently, Genomic Foundation Models (GFMs) have emerged as powerful tools for genome analysis and manipulation, leveraging advancements in natural language processing to model the "genomic language" encoded in genomes.
-However, GFMs face two significant challenges: the lack of benchmarking tools and open-source software for diverse genomics.
-This hinders progress in various genomic tasks, such as RNA design and structure prediction.
-"""
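Each enum member above wraps a `Task(benchmark, metric, col_name)` triple: judging by how the rest of this commit consumes it, `benchmark` and `metric` are the keys looked up in the results JSON, and `col_name` is the leaderboard column header. A small, self-contained sketch of that convention (the field names beyond `benchmark` are inferred from the other files in this commit):

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task_key in the results json
    metric: str     # metric_key in the results json
    col_name: str   # column header shown on the leaderboard


class TasksPGB(Enum):
    task0 = Task("PolyA", "F1", "PolyA (F1)")
    task3 = Task("Prom Str", "RMSE", "Prom Str (RMSE)")


# read_evals.py reads data["results"][task.benchmark][task.metric]-style values,
# and display/utils.py uses task.value.col_name as the visible column name.
for member in TasksPGB:
    task = member.value
    print(f"{member.name}: results['{task.benchmark}']['{task.metric}'] -> column '{task.col_name}'")
```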
src/display/utils.py
CHANGED
@@ -1,9 +1,9 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
-import pandas as pd
 
-from src.about import
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
+
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -20,28 +20,37 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
-
-
-
-auto_eval_column_dict
-#
-auto_eval_column_dict.append(["
-
-
-
-
-auto_eval_column_dict.append([
-
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
+auto_eval_columns = []
+for eval_col in [TasksRGB, TasksPGB, TasksGUE, TasksGB]:
+
+    auto_eval_column_dict = []
+    # Init
+    auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+    auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+    # Scores
+    auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
+    for task in eval_col:
+        auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # Model information
+    auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+    auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+    auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+    auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+    auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+    auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+    auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+    auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+    auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+    auto_eval_columns.append(auto_eval_column_dict)
 
 # We use make dataclass to dynamically fill the scores from Tasks
-
+AutoEvalColumnRGB = make_dataclass("AutoEvalColumn", auto_eval_columns[0], frozen=True)
+AutoEvalColumnPGB = make_dataclass("AutoEvalColumn", auto_eval_columns[1], frozen=True)
+AutoEvalColumnGUE = make_dataclass("AutoEvalColumn", auto_eval_columns[2], frozen=True)
+AutoEvalColumnGB = make_dataclass("AutoEvalColumn", auto_eval_columns[3], frozen=True)
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -53,12 +62,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""
+    symbol: str = "" # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +93,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,11 +112,17 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
-
+RGB_COLS = [c.name for c in fields(AutoEvalColumnRGB) if not c.hidden]
+PGB_COLS = [c.name for c in fields(AutoEvalColumnPGB) if not c.hidden]
+GUE_COLS = [c.name for c in fields(AutoEvalColumnGUE) if not c.hidden]
+GB_COLS = [c.name for c in fields(AutoEvalColumnGB) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-
-
+RGB_BENCHMARK_COLS = [t.value.col_name for t in TasksRGB]
+PGB_BENCHMARK_COLS = [t.value.col_name for t in TasksPGB]
+GUE_BENCHMARK_COLS = [t.value.col_name for t in TasksGUE]
+GB_BENCHMARK_COLS = [t.value.col_name for t in TasksGB]
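The loop above builds one `[name, type, default]` spec list per benchmark and hands it to `dataclasses.make_dataclass`; the module-level `fields()` helper then pulls the `ColumnContent` defaults back out of the generated class. A self-contained sketch of that round trip with a reduced column set (the `AutoEvalColumnDemo` name and the three columns are illustrative only):

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)  # frozen so instances are hashable and usable as field defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Same helper as in src/display/utils.py: return the ColumnContent defaults
    # stored on the generated class, skipping dunder attributes.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


column_spec = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Rank", "number", True)],
    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)],
]
AutoEvalColumnDemo = make_dataclass("AutoEvalColumn", column_spec, frozen=True)

# Mirrors how RGB_COLS etc. are derived at the end of the file.
print([c.name for c in fields(AutoEvalColumnDemo) if not c.hidden])
# -> ['Model', 'Rank', '#Params (B)']
```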
src/envs.py
CHANGED
@@ -4,17 +4,19 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN")
+TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER =
+OWNER = (
+    "yangheng"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/leaderboard/read_evals.py
CHANGED
@@ -1,39 +1,41 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
+import re
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import
+from src.display.utils import AutoEvalColumnRGB, AutoEvalColumnPGB,\
+    AutoEvalColumnGUE, AutoEvalColumnGB, ModelType, Precision, WeightType
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = "" # submission date of request file
    still_on_hub: bool = False
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, Tasks):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -75,7 +77,7 @@ class EvalResult:
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-            if task.
+            if task.metric == "RMSE":
                 # Keep RMSE at original value
                 mean_acc = np.mean(accs)
             else:
@@ -88,10 +90,10 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
@@ -108,9 +110,11 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
-    def to_dict(self, rank):
+    def to_dict(self, rank, AutoEvalColumn, Tasks):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = rank
         # average = sorted(average, reverse=True)
@@ -154,10 +158,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             # print("Request File: ", tmp_request_file)
             # print("Req Content: ", req_content)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
@@ -168,6 +169,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
+        print(f"Files {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
@@ -176,14 +178,21 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
-
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
+    print(f"Filepaths: {model_result_filepaths}")
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-
+        if "RGB" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksRGB)
+        elif "PGB" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksPGB)
+        elif "GUE" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksGUE)
+        else:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksGB)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -197,10 +206,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for result in eval_results.values():
         result.average = np.mean(list(result.results.values()))
     sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
-
-    for i,v in enumerate(sorted_results):
+    print(f"SORTED RESULTS HERE: \n{sorted_results}")
+    for i, v in enumerate(sorted_results):
         try:
-
+            # we test if the dict version is complete
+            if "RGB" in results_path:
+                v.to_dict(i, AutoEvalColumnRGB, TasksRGB)
+            elif "PGB" in results_path:
+                v.to_dict(i, AutoEvalColumnPGB, TasksPGB)
+            elif "GUE" in results_path:
+                v.to_dict(i, AutoEvalColumnGUE, TasksGUE)
+            else:
+                v.to_dict(i, AutoEvalColumnGB, TasksGB)
             results.append(v)
         except KeyError: # not all eval values present
            continue
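`get_raw_eval_results` now picks the task enum by checking which benchmark name appears in `results_path`, and repeats the same `if/elif` chain when calling `to_dict`. Not part of the commit, but the same dispatch can be written once as a lookup; a sketch with string stand-ins for the imported enums:

```python
# String stand-ins keep the sketch runnable on its own; in the repo these would
# be the TasksRGB/TasksPGB/TasksGUE/TasksGB enums imported at the top of the file.
BENCHMARK_TASKS = {
    "RGB": "TasksRGB",
    "PGB": "TasksPGB",
    "GUE": "TasksGUE",
}


def pick_tasks(results_path: str, default="TasksGB"):
    """Return the task set whose benchmark name occurs in results_path."""
    # GB stays the fallback because "GB" is also a substring of "RGB" paths,
    # which is presumably why the original if/elif chain ends in a bare else.
    for key, tasks in BENCHMARK_TASKS.items():
        if key in results_path:
            return tasks
    return default


print(pick_tasks("eval-results/PGB/some-model"))  # -> TasksPGB
print(pick_tasks("eval-results/GB/some-model"))   # -> TasksGB (fallback)
```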
src/populate.py
CHANGED
@@ -1,16 +1,20 @@
 import json
 import os
+
 import numpy as np
 import pandas as pd
 
-
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import
+from src.display.utils import EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import AutoEvalColumnRGB, AutoEvalColumnPGB,\
+    AutoEvalColumnGUE, AutoEvalColumnGB
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    print(f"RESULTS PATH: {results_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
     for result in raw_data:
         result.average = np.mean(list(result.results.values()))
@@ -18,10 +22,20 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     print(sorted_results)
     # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
     # rank = [rank+1 for rank, value in enumerate(average)]
-
+    if "RGB" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnRGB, TasksRGB) for i, v in enumerate(raw_data)]
+    elif "PGB" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnPGB, TasksPGB) for i, v in enumerate(raw_data)]
+    elif "GUE" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnGUE, TasksGUE) for i, v in enumerate(raw_data)]
+    else:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnGB, TasksGB) for i, v in enumerate(raw_data)]
+    # all_data_json = [v.to_dict(i + 1) for i, v in enumerate(raw_data)]
 
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    print(f"Cols: {cols}")
+    print(f"DF: {df}")
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
@@ -34,8 +48,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
-
+    print(entries)
+    entries = [entry for entry in entries if not entry.startswith(".")]
+    print(entries)
     for entry in entries:
+        print(entries)
         if ".json" in entry:
             file_path = os.path.join(save_path, entry)
             with open(file_path) as fp:
@@ -47,15 +64,15 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-
-            for sub_entry in sub_entries:
-
-
-
+            entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            # for sub_entry in sub_entries:
+            #     file_path = os.path.join(save_path, entry, sub_entry)
+            #     with open(file_path) as fp:
+            #         data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            all_evals.append(data)
+            # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # all_evals.append(data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
@@ -63,4 +80,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished[cols], df_running[cols], df_pending[cols]
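After the per-benchmark `to_dict` dispatch, `get_leaderboard_df` shapes the records the same way as before: frame them, keep the visible columns, round, and drop models with missing benchmark scores. A self-contained sketch of that tail end with made-up records and column names (the real code uses `has_no_nan_values`; `dropna` over the benchmark columns is the same idea):

```python
import pandas as pd

# Illustrative records standing in for EvalResult.to_dict() output.
all_data_json = [
    {"Model": "model-a", "Rank": 1, "PolyA (F1)": 0.9132, "Splice (F1)": 0.8779},
    {"Model": "model-b", "Rank": 2, "PolyA (F1)": 0.8541, "Splice (F1)": None},
]
cols = ["Model", "Rank", "PolyA (F1)", "Splice (F1)"]
benchmark_cols = ["PolyA (F1)", "Splice (F1)"]

df = pd.DataFrame.from_records(all_data_json)
df = df[cols].round(decimals=2)
# "filter out if any of the benchmarks have not been produced"
df = df.dropna(subset=benchmark_cols)
print(df)  # only model-a survives, with scores rounded to 2 decimals
```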
src/submission/check_validity.py
CHANGED
@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
 from huggingface_hub import ModelCard
@@ -10,6 +8,7 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -31,31 +30,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(
+                tk = AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
                 return (
                     False,
-
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )
 
-    except Exception
+    except Exception:
         return False, "was not found on hub!", None
 
 
@@ -70,10 +76,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
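The reformatted `is_model_on_hub` keeps its `(ok, message, config)` return shape, which is how `submit.py` consumes it. An illustrative call with a placeholder model name:

```python
from src.submission.check_validity import is_model_on_hub

ok, message, config = is_model_on_hub(
    model_name="someorg/some-model",  # placeholder, not a real submission
    revision="main",
    token=None,
    test_tokenizer=True,
)
if not ok:
    # submit.py wraps this as styled_error(f'Base model "..." {message}')
    print(f'Model "someorg/some-model" {message}')
else:
    # config is the loaded AutoConfig; the architectures field may be absent.
    print(getattr(config, "architectures", "Unknown"))
```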
src/submission/submit.py
CHANGED
@@ -3,17 +3,13 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH,
-from src.submission.check_validity import
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -45,7 +41,9 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(
+        base_model_on_hub, error, _ = is_model_on_hub(
+            model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
+        )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 