update
- .gitignore +0 -1
- app.py +47 -25
- src/about.py +67 -37
- src/display/utils.py +43 -25
- src/envs.py +6 -4
- src/leaderboard/read_evals.py +45 -28
- src/populate.py +30 -13
- src/submission/check_validity.py +19 -11
- src/submission/submit.py +6 -8
.gitignore
CHANGED
@@ -5,7 +5,6 @@ __pycache__/
 .ipynb_checkpoints
 *ipynb
 .vscode/
-.idea/
 
 eval-queue/
 eval-results/
app.py
CHANGED
@@ -1,7 +1,6 @@
 import gradio as gr
-from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
+from gradio_leaderboard import ColumnFilter, Leaderboard, SelectColumns
 from huggingface_hub import snapshot_download
 
 from src.about import (
@@ -14,15 +13,17 @@ from src.about import (
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-
-
+    RGB_BENCHMARK_COLS, PGB_BENCHMARK_COLS,
+    GUE_BENCHMARK_COLS, GB_BENCHMARK_COLS,
+    RGB_COLS, PGB_COLS, GUE_COLS, GB_COLS,
     EVAL_COLS,
     EVAL_TYPES,
-
+    AutoEvalColumnRGB, AutoEvalColumnPGB,
+    AutoEvalColumnGUE, AutoEvalColumnGB,
     ModelType,
-
+    Precision,
     WeightType,
-
+    fields,
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
@@ -32,24 +33,39 @@ from src.submission.submit import add_new_eval
 def restart_space():
     API.restart_space(repo_id=REPO_ID)
 
+
 ### Space initialisation
+"""
 try:
     print(EVAL_REQUESTS_PATH)
     snapshot_download(
-        repo_id=QUEUE_REPO,
+        repo_id=QUEUE_REPO,
+        local_dir=EVAL_REQUESTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
-        repo_id=RESULTS_REPO,
+        repo_id=RESULTS_REPO,
+        local_dir=EVAL_RESULTS_PATH,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        token=TOKEN,
     )
 except Exception:
     restart_space()
+"""
 
-
-
+RGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/RGB/", EVAL_REQUESTS_PATH+"/RGB/", RGB_COLS, RGB_BENCHMARK_COLS)
+PGB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/PGB/", EVAL_REQUESTS_PATH+"/PGB/", PGB_COLS, PGB_BENCHMARK_COLS)
+GUE_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/GUE/", EVAL_REQUESTS_PATH+"/GUE/", GUE_COLS, GUE_BENCHMARK_COLS)
+GB_LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH+"/GB/", EVAL_REQUESTS_PATH+"/GB/", GB_COLS, GB_BENCHMARK_COLS)
 
 (
     finished_eval_queue_df,
@@ -57,7 +73,8 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-
+
+def init_leaderboard(dataframe, AutoEvalColumn):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
     return Leaderboard(
@@ -95,18 +112,22 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("RGB
-            leaderboard = init_leaderboard(
-
-
-
-
-
-
-
-
-
+        with gr.TabItem("RGB", elem_id="rgb-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(RGB_LEADERBOARD_DF, AutoEvalColumnRGB)
+
+        with gr.TabItem("PGB", elem_id="pgb-benchmark-tab-table", id=1):
+            leaderboard2 = init_leaderboard(PGB_LEADERBOARD_DF, AutoEvalColumnPGB)
+
+        with gr.TabItem("GUE", elem_id="gue-benchmark-tab-table", id=2):
+            leaderboard3 = init_leaderboard(GUE_LEADERBOARD_DF, AutoEvalColumnGUE)
+
+        with gr.TabItem("GB", elem_id="gb-benchmark-tab-table", id=3):
+            leaderboard4 = init_leaderboard(GB_LEADERBOARD_DF, AutoEvalColumnGB)
+
+        with gr.TabItem("📝 About", elem_id="rgb-benchmark-tab-table", id=4):
+            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
+
+        with gr.TabItem("🚀 Submit here! ", elem_id="rgb-benchmark-tab-table", id=5):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -160,6 +181,7 @@ with demo:
                         value=None,
                         interactive=True,
                     )
+
                 with gr.Column():
                     precision = gr.Dropdown(
                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
@@ -205,4 +227,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
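The hunk above wraps the Space-initialisation `snapshot_download` calls in a triple-quoted string (disabling them for now) while keeping the usual keyword arguments. Below is a minimal, self-contained sketch of that download-or-restart pattern, assuming only `huggingface_hub`; the repo id and local directory are placeholders, and the real app calls `API.restart_space(...)` on failure instead of returning `None`.

```python
from huggingface_hub import snapshot_download


def mirror_dataset_repo(repo_id: str, local_dir: str, token: str | None = None):
    """Mirror a Hub dataset repo locally, as the disabled block above does."""
    try:
        return snapshot_download(
            repo_id=repo_id,        # e.g. QUEUE_REPO or RESULTS_REPO
            local_dir=local_dir,    # e.g. EVAL_REQUESTS_PATH or EVAL_RESULTS_PATH
            repo_type="dataset",
            etag_timeout=30,
            token=token,
        )
    except Exception:
        # app.py restarts the Space here; a library-style helper just reports failure.
        return None


if __name__ == "__main__":
    # Placeholder repo id, not part of this Space.
    print(mirror_dataset_repo("someuser/requests", "./eval-queue"))
```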
src/about.py
CHANGED
@@ -1,6 +1,7 @@
 from dataclasses import dataclass
 from enum import Enum
 
+
 @dataclass
 class Task:
     benchmark: str
@@ -10,8 +11,8 @@ class Task:
 
 # Select your tasks here
 # ---------------------------------------------------
-class
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+class TasksRGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
     task0 = Task("mRNA", "RMSE", "mRNA (RMSE)")
     task1 = Task("SNMD", "AUC", "SNMD (AUC)")
     task2 = Task("SNMR", "F1", "SNMR (F1)")
@@ -19,72 +20,101 @@ class Tasks(Enum):
     task4 = Task("bpRNA", "F1", "bpRNA (F1)")
     task5 = Task("RNAStralign", "F1", "RNAStralign (F1)")
 
-
+class TasksPGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("PolyA", "F1", "PolyA (F1)")
+    task1 = Task("LncRNA", "F1", "LncRNA (F1)")
+    task2 = Task("Chrom Acc", "F1", "Chrom Acc (F1)")
+    task3 = Task("Prom Str", "RMSE", "Prom Str (RMSE)")
+    task4 = Task("Term Str", "RMSE", "Term Str (RMSE)")
+    task5 = Task("Splice", "F1", "Splice (F1)")
+    task6 = Task("Gene Exp", "RMSE", "Gene Exp (RMSE)")
+    task7 = Task("Enhancer", "F1", "Enhancer (F1)")
+
+class TasksGUE(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("Yeast EMP", "F1", "Yeast EMP (F1)")
+    task1 = Task("Mouse TF-M", "F1", "Mouse TF-M (F1)")
+    task2 = Task("Virus CVC", "F1", "Virus CVC (F1)")
+    task3 = Task("Human TF-H", "F1", "Human TF-H (F1)")
+    task4 = Task("Human PD", "F1", "Human PD (F1)")
+    task5 = Task("Human CPD", "F1", "Human CPD (F1)")
+    task6 = Task("Human SSP", "F1", "Human SSP (F1)")
+
+class TasksGB(Enum):
+    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
+    task0 = Task("DEM", "F1", "DEM (F1)")
+    task1 = Task("DOW", "F1", "DOW (F1)")
+    task2 = Task("DRE", "F1", "DRE (F1)")
+    task3 = Task("DME", "F1", "DME (F1)")
+    task4 = Task("HCE", "F1", "HCE (F1)")
+    task5 = Task("HEE", "F1", "HEE (F1)")
+    task6 = Task("HRE", "F1", "HRE (F1)")
+    task7 = Task("HNP", "F1", "HNP (F1)")
+    task8 = Task("HOR", "F1", "HOR (F1)")
+
+
+NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
 
-
-
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">
+TITLE = """<h1 align="center" id="space-title">Genomic Modelling Leaderboard</h1>"""
 
+# What does your leaderboard evaluate?
+INTRODUCTION_TEXT = """
+"""
 
+# Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## Why do we need this benchmark?
-Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.
-As key parts of biology, such as DNA, RNA sequences,
-the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.
+Large-scale foundation models for molecular biology constitute a vital and rapidly developing change in the computational biology and AI4Science landscape.
+As key parts of biology, such as DNA, RNA sequences, secondary structures, have a large effect on each other, the usage of this information within large-scale models allows for foundation models to be adapted and suited to multiple key tasks.
 However, with this trend comes significant issues, the primary one being the difficulty to comprehensively evaluate these models and compare them fairly.
 Here, we refer to the specific lack of real-world data to reflect the true performance of the models, rather than in-silico experiments only.
 This issue forces repeated benchmark testing and models being trained and adapted for a specific task that may not have any real-world benefit.
-Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets,
-to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.
-
+Given the importance of this, we propose this genomic leaderboard on meticulously curated real-world datasets, to allow for a fair and comprehensive benchmark on the most important genomic downstream tasks.
 ## Evaluation Datasets
 TODO HERE
-
 ## Reported Scores and Ranking
 TODO HERE
-
 ## How it works
 Do we need this?
-
 ## Reproducibility
 To reproduce our results, here are the commands you can run:
 """
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
-
 ### 1) Make sure you can load your model and tokenizer using AutoClasses:
 ```python
 from transformers import AutoConfig, AutoModel, AutoTokenizer
 config = AutoConfig.from_pretrained("your model name", revision=revision)
 model = AutoModel.from_pretrained("your model name", revision=revision)
 tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-
-If this step fails, follow the error messages to debug your model before submitting it.
-It's likely your model has been improperly uploaded.
-Note: make sure your model is public! Note: if your model needs `use_remote_code=True',
-we do not support this option yet but we are working on adding it, stay posted!
-
 ```
+If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
+Note: make sure your model is public!
+Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
+### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
+It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+### 3) Make sure your model has an open license!
+This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
+### 4) Fill up your model card
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card
+## In case of model failure
+If your model is displayed in the `FAILED` category, its execution stopped.
+Make sure you have followed the above steps first.
+If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 """
-CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 
-
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""
 @article{Yang2024,
-
-
-
-
-
+  author = {Yang, Heng and Li, Ke},
+  title = {Foundation Models Work},
+  journal = {arXiv},
+  year = {2024},
+  note = {arXiv preprint arXiv:XXXX.XXXXX}
+  url = {https://arxiv.org/abs/XXXX.XXXXX}
 }
 """
-
-INTRODUCTION_TEXT = """
-## What does your leaderboard evaluate?
-The deciphering of RNA and DNA genomes has been ongoing for decades, with the aim of advancing genome analysis, including understanding and synthesizing genomes.
-Recently, Genomic Foundation Models (GFMs) have emerged as powerful tools for genome analysis and manipulation, leveraging advancements in natural language processing to model the "genomic language" encoded in genomes.
-However, GFMs face two significant challenges: the lack of benchmarking tools and open-source software for diverse genomics.
-This hinders progress in various genomic tasks, such as RNA design and structure prediction.
-"""
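Each enum member above wraps a `Task(benchmark, metric, col_name)` triple: judging by how the rest of this commit consumes it, `benchmark` and `metric` are the keys looked up in the results JSON, and `col_name` is the leaderboard column header. A small, self-contained sketch of that convention (the field names beyond `benchmark` are inferred from the other files in this commit):

```python
from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str  # task_key in the results json
    metric: str     # metric_key in the results json
    col_name: str   # column header shown on the leaderboard


class TasksPGB(Enum):
    task0 = Task("PolyA", "F1", "PolyA (F1)")
    task3 = Task("Prom Str", "RMSE", "Prom Str (RMSE)")


# read_evals.py reads data["results"][task.benchmark][task.metric]-style values,
# and display/utils.py uses task.value.col_name as the visible column name.
for member in TasksPGB:
    task = member.value
    print(f"{member.name}: results['{task.benchmark}']['{task.metric}'] -> column '{task.col_name}'")
```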
src/display/utils.py
CHANGED
@@ -1,9 +1,9 @@
 from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
-import pandas as pd
 
-from src.about import
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
+
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -20,28 +20,37 @@ class ColumnContent:
     hidden: bool = False
     never_hidden: bool = False
 
+
 ## Leaderboard columns
-
-
-
-auto_eval_column_dict
-#
-auto_eval_column_dict.append(["
-
-
-
-
-auto_eval_column_dict.append([
-
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
-auto_eval_column_dict.append(["
+auto_eval_columns = []
+for eval_col in [TasksRGB, TasksPGB, TasksGUE, TasksGB]:
+
+    auto_eval_column_dict = []
+    # Init
+    auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+    auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+    # Scores
+    auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Rank", "number", True)])
+    for task in eval_col:
+        auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+    # Model information
+    auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+    auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+    auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+    auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+    auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+    auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+    auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+    auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+    auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+    auto_eval_columns.append(auto_eval_column_dict)
 
 # We use make dataclass to dynamically fill the scores from Tasks
-
+AutoEvalColumnRGB = make_dataclass("AutoEvalColumn", auto_eval_columns[0], frozen=True)
+AutoEvalColumnPGB = make_dataclass("AutoEvalColumn", auto_eval_columns[1], frozen=True)
+AutoEvalColumnGUE = make_dataclass("AutoEvalColumn", auto_eval_columns[2], frozen=True)
+AutoEvalColumnGB = make_dataclass("AutoEvalColumn", auto_eval_columns[3], frozen=True)
+
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -53,12 +62,13 @@ class EvalQueueColumn: # Queue column
     weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
+
 ## All the model information that we might need
 @dataclass
 class ModelDetails:
     name: str
     display_name: str = ""
-    symbol: str = ""
+    symbol: str = "" # emoji
 
 
 class ModelType(Enum):
@@ -83,11 +93,13 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
+
 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
     Original = ModelDetails("Original")
     Delta = ModelDetails("Delta")
 
+
 class Precision(Enum):
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
@@ -100,11 +112,17 @@ class Precision(Enum):
             return Precision.bfloat16
         return Precision.Unknown
 
+
 # Column selection
-
+RGB_COLS = [c.name for c in fields(AutoEvalColumnRGB) if not c.hidden]
+PGB_COLS = [c.name for c in fields(AutoEvalColumnPGB) if not c.hidden]
+GUE_COLS = [c.name for c in fields(AutoEvalColumnGUE) if not c.hidden]
+GB_COLS = [c.name for c in fields(AutoEvalColumnGB) if not c.hidden]
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
-
-
+RGB_BENCHMARK_COLS = [t.value.col_name for t in TasksRGB]
+PGB_BENCHMARK_COLS = [t.value.col_name for t in TasksPGB]
+GUE_BENCHMARK_COLS = [t.value.col_name for t in TasksGUE]
+GB_BENCHMARK_COLS = [t.value.col_name for t in TasksGB]
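The loop above builds one `[name, type, default]` spec list per benchmark and hands it to `dataclasses.make_dataclass`; the module-level `fields()` helper then pulls the `ColumnContent` defaults back out of the generated class. A self-contained sketch of that round trip with a reduced column set (the `AutoEvalColumnDemo` name and the three columns are illustrative only):

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)  # frozen so instances are hashable and usable as field defaults
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Same helper as in src/display/utils.py: return the ColumnContent defaults
    # stored on the generated class, skipping dunder attributes.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


column_spec = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["average", ColumnContent, ColumnContent("Rank", "number", True)],
    ["params", ColumnContent, ColumnContent("#Params (B)", "number", False)],
]
AutoEvalColumnDemo = make_dataclass("AutoEvalColumn", column_spec, frozen=True)

# Mirrors how RGB_COLS etc. are derived at the end of the file.
print([c.name for c in fields(AutoEvalColumnDemo) if not c.hidden])
# -> ['Model', 'Rank', '#Params (B)']
```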
src/envs.py
CHANGED
@@ -4,17 +4,19 @@ from huggingface_hub import HfApi
 
 # Info to change for your repository
 # ----------------------------------
-TOKEN = os.environ.get("TOKEN")
+TOKEN = os.environ.get("TOKEN") # A read/write token for your org
 
-OWNER =
+OWNER = (
+    "yangheng"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
+)
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/
+REPO_ID = f"{OWNER}/leaderboard"
 QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"
 
 # If you setup a cache later, just change HF_HOME
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 
 # Local caches
 EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/leaderboard/read_evals.py
CHANGED
@@ -1,39 +1,41 @@
 import glob
 import json
-import math
 import os
 from dataclasses import dataclass
 
+import re
 import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import
+from src.display.utils import AutoEvalColumnRGB, AutoEvalColumnPGB,\
+    AutoEvalColumnGUE, AutoEvalColumnGB, ModelType, Precision, WeightType
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
 from src.submission.check_validity import is_model_on_hub
 
 
 @dataclass
 class EvalResult:
-    """Represents one full evaluation. Built from a combination of the result and request file for a given run.
-
-    eval_name: str
-    full_model: str
-    org: str
+    """Represents one full evaluation. Built from a combination of the result and request file for a given run."""
+
+    eval_name: str # org_model_precision (uid)
+    full_model: str # org/model (path on hub)
+    org: str
     model: str
-    revision: str
+    revision: str # commit hash, "" if main
     results: dict
     precision: Precision = Precision.Unknown
-    model_type: ModelType = ModelType.Unknown
-    weight_type: WeightType = WeightType.Original
-    architecture: str = "Unknown"
+    model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
+    weight_type: WeightType = WeightType.Original # Original or Adapter
+    architecture: str = "Unknown"
     license: str = "?"
     likes: int = 0
     num_params: int = 0
-    date: str = ""
+    date: str = "" # submission date of request file
    still_on_hub: bool = False
 
     @classmethod
-    def init_from_json_file(self, json_filepath):
+    def init_from_json_file(self, json_filepath, Tasks):
         """Inits the result from the specific model result file"""
         with open(json_filepath) as fp:
             data = json.load(fp)
@@ -75,7 +77,7 @@ class EvalResult:
             accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
             if accs.size == 0 or any([acc is None for acc in accs]):
                 continue
-            if task.
+            if task.metric == "RMSE":
                 # Keep RMSE at original value
                 mean_acc = np.mean(accs)
             else:
@@ -88,10 +90,10 @@ class EvalResult:
             org=org,
             model=model,
             results=results,
-            precision=precision,
-            revision=
+            precision=precision,
+            revision=config.get("model_sha", ""),
             still_on_hub=still_on_hub,
-            architecture=architecture
+            architecture=architecture,
         )
 
     def update_with_request_file(self, requests_path):
@@ -108,9 +110,11 @@ class EvalResult:
             self.num_params = request.get("params", 0)
             self.date = request.get("submitted_time", "")
         except Exception:
-            print(
+            print(
+                f"Could not find request file for {self.org}/{self.model} with precision {self.precision.value.name}"
+            )
 
-    def to_dict(self, rank):
+    def to_dict(self, rank, AutoEvalColumn, Tasks):
         """Converts the Eval Result to a dict compatible with our dataframe display"""
         average = rank
         # average = sorted(average, reverse=True)
@@ -154,10 +158,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
             req_content = json.load(f)
             # print("Request File: ", tmp_request_file)
             # print("Req Content: ", req_content)
-            if (
-                req_content["status"] in ["FINISHED"]
-                and req_content["precision"] == precision.split(".")[-1]
-            ):
+            if req_content["status"] in ["FINISHED"] and req_content["precision"] == precision.split(".")[-1]:
                 request_file = tmp_request_file
     return request_file
 
@@ -168,6 +169,7 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
 
     for root, _, files in os.walk(results_path):
         # We should only have json files in model results
+        print(f"Files {files}")
         if len(files) == 0 or any([not f.endswith(".json") for f in files]):
             continue
 
@@ -176,14 +178,21 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
             files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
         except dateutil.parser._parser.ParserError:
             files = [files[-1]]
-
         for file in files:
             model_result_filepaths.append(os.path.join(root, file))
 
     eval_results = {}
+    print(f"Filepaths: {model_result_filepaths}")
     for model_result_filepath in model_result_filepaths:
         # Creation of result
-
+        if "RGB" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksRGB)
+        elif "PGB" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksPGB)
+        elif "GUE" in results_path:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksGUE)
+        else:
+            eval_result = EvalResult.init_from_json_file(model_result_filepath, TasksGB)
         eval_result.update_with_request_file(requests_path)
 
         # Store results of same eval together
@@ -197,10 +206,18 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for result in eval_results.values():
         result.average = np.mean(list(result.results.values()))
     sorted_results = sorted(eval_results.values(), key=lambda r: r.average, reverse=True)
-
-    for i,v in enumerate(sorted_results):
+    print(f"SORTED RESULTS HERE: \n{sorted_results}")
+    for i, v in enumerate(sorted_results):
         try:
-
+            # we test if the dict version is complete
+            if "RGB" in results_path:
+                v.to_dict(i, AutoEvalColumnRGB, TasksRGB)
+            elif "PGB" in results_path:
+                v.to_dict(i, AutoEvalColumnPGB, TasksPGB)
+            elif "GUE" in results_path:
+                v.to_dict(i, AutoEvalColumnGUE, TasksGUE)
+            else:
+                v.to_dict(i, AutoEvalColumnGB, TasksGB)
             results.append(v)
         except KeyError: # not all eval values present
            continue
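`get_raw_eval_results` now picks the task enum by checking which benchmark name appears in `results_path`, and repeats the same `if/elif` chain when calling `to_dict`. Not part of the commit, but the same dispatch can be written once as a lookup; a sketch with string stand-ins for the imported enums:

```python
# String stand-ins keep the sketch runnable on its own; in the repo these would
# be the TasksRGB/TasksPGB/TasksGUE/TasksGB enums imported at the top of the file.
BENCHMARK_TASKS = {
    "RGB": "TasksRGB",
    "PGB": "TasksPGB",
    "GUE": "TasksGUE",
}


def pick_tasks(results_path: str, default="TasksGB"):
    """Return the task set whose benchmark name occurs in results_path."""
    # GB stays the fallback because "GB" is also a substring of "RGB" paths,
    # which is presumably why the original if/elif chain ends in a bare else.
    for key, tasks in BENCHMARK_TASKS.items():
        if key in results_path:
            return tasks
    return default


print(pick_tasks("eval-results/PGB/some-model"))  # -> TasksPGB
print(pick_tasks("eval-results/GB/some-model"))   # -> TasksGB (fallback)
```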
src/populate.py
CHANGED
@@ -1,16 +1,20 @@
 import json
 import os
+
 import numpy as np
 import pandas as pd
 
-
 from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import
+from src.display.utils import EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+from src.display.utils import AutoEvalColumnRGB, AutoEvalColumnPGB,\
+    AutoEvalColumnGUE, AutoEvalColumnGB
+from src.about import TasksRGB, TasksPGB, TasksGUE, TasksGB
 
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
+    print(f"RESULTS PATH: {results_path}")
     raw_data = get_raw_eval_results(results_path, requests_path)
     for result in raw_data:
         result.average = np.mean(list(result.results.values()))
@@ -18,10 +22,20 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     print(sorted_results)
     # ranks = [rank+1 for rank, value in enumerate(sorted_results)]
     # rank = [rank+1 for rank, value in enumerate(average)]
-
+    if "RGB" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnRGB, TasksRGB) for i, v in enumerate(raw_data)]
+    elif "PGB" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnPGB, TasksPGB) for i, v in enumerate(raw_data)]
+    elif "GUE" in results_path:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnGUE, TasksGUE) for i, v in enumerate(raw_data)]
+    else:
+        all_data_json = [v.to_dict(i+1, AutoEvalColumnGB, TasksGB) for i, v in enumerate(raw_data)]
+    # all_data_json = [v.to_dict(i + 1) for i, v in enumerate(raw_data)]
 
     df = pd.DataFrame.from_records(all_data_json)
     # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+    print(f"Cols: {cols}")
+    print(f"DF: {df}")
     df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
@@ -34,8 +48,11 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
     all_evals = []
-
+    print(entries)
+    entries = [entry for entry in entries if not entry.startswith(".")]
+    print(entries)
     for entry in entries:
+        print(entries)
         if ".json" in entry:
             file_path = os.path.join(save_path, entry)
             with open(file_path) as fp:
@@ -47,15 +64,15 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             all_evals.append(data)
         elif ".md" not in entry:
             # this is a folder
-
-            for sub_entry in sub_entries:
-
-
-
+            entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+            # for sub_entry in sub_entries:
+            #     file_path = os.path.join(save_path, entry, sub_entry)
+            #     with open(file_path) as fp:
+            #         data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
-            data[EvalQueueColumn.revision.name] = data.get("revision", "main")
-            all_evals.append(data)
+            # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+            # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+            # all_evals.append(data)
 
     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
@@ -63,4 +80,4 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
     df_running = pd.DataFrame.from_records(running_list, columns=cols)
     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
-    return df_finished[cols], df_running[cols], df_pending[cols]
+    return df_finished[cols], df_running[cols], df_pending[cols]
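After the per-benchmark `to_dict` dispatch, `get_leaderboard_df` shapes the records the same way as before: frame them, keep the visible columns, round, and drop models with missing benchmark scores. A self-contained sketch of that tail end with made-up records and column names (the real code uses `has_no_nan_values`; `dropna` over the benchmark columns is the same idea):

```python
import pandas as pd

# Illustrative records standing in for EvalResult.to_dict() output.
all_data_json = [
    {"Model": "model-a", "Rank": 1, "PolyA (F1)": 0.9132, "Splice (F1)": 0.8779},
    {"Model": "model-b", "Rank": 2, "PolyA (F1)": 0.8541, "Splice (F1)": None},
]
cols = ["Model", "Rank", "PolyA (F1)", "Splice (F1)"]
benchmark_cols = ["PolyA (F1)", "Splice (F1)"]

df = pd.DataFrame.from_records(all_data_json)
df = df[cols].round(decimals=2)
# "filter out if any of the benchmarks have not been produced"
df = df.dropna(subset=benchmark_cols)
print(df)  # only model-a survives, with scores rounded to 2 decimals
```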
src/submission/check_validity.py
CHANGED
@@ -1,8 +1,6 @@
 import json
 import os
-import re
 from collections import defaultdict
-from datetime import datetime, timedelta, timezone
 
 import huggingface_hub
 from huggingface_hub import ModelCard
@@ -10,6 +8,7 @@ from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 
+
 def check_model_card(repo_id: str) -> tuple[bool, str]:
     """Checks if the model card and license exist and have been filled"""
     try:
@@ -31,31 +30,38 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
 
     return True, ""
 
-
+
+def is_model_on_hub(
+    model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False
+) -> tuple[bool, str]:
     """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
     try:
-        config = AutoConfig.from_pretrained(
+        config = AutoConfig.from_pretrained(
+            model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+        )
         if test_tokenizer:
             try:
-                tk = AutoTokenizer.from_pretrained(
+                tk = AutoTokenizer.from_pretrained(
+                    model_name, revision=revision, trust_remote_code=trust_remote_code, token=token
+                )
             except ValueError as e:
+                return (False, f"uses a tokenizer which is not in a transformers release: {e}", None)
+            except Exception:
                 return (
                     False,
-
-                    None
+                    "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?",
+                    None,
                 )
-            except Exception as e:
-                return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
         return True, None, config
 
     except ValueError:
         return (
             False,
             "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
-            None
+            None,
         )
 
-    except Exception
+    except Exception:
         return False, "was not found on hub!", None
 
 
@@ -70,10 +76,12 @@ def get_model_size(model_info: ModelInfo, precision: str):
     model_size = size_factor * model_size
     return model_size
 
+
 def get_model_arch(model_info: ModelInfo):
     """Gets the model architecture from the configuration"""
     return model_info.config.get("architectures", "Unknown")
 
+
 def already_submitted_models(requested_models_dir: str) -> set[str]:
     """Gather a list of already submitted models to avoid duplicates"""
     depth = 1
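The reformatted `is_model_on_hub` keeps its `(ok, message, config)` return shape, which is how `submit.py` consumes it. An illustrative call with a placeholder model name:

```python
from src.submission.check_validity import is_model_on_hub

ok, message, config = is_model_on_hub(
    model_name="someorg/some-model",  # placeholder, not a real submission
    revision="main",
    token=None,
    test_tokenizer=True,
)
if not ok:
    # submit.py wraps this as styled_error(f'Base model "..." {message}')
    print(f'Model "someorg/some-model" {message}')
else:
    # config is the loaded AutoConfig; the architectures field may be absent.
    print(getattr(config, "architectures", "Unknown"))
```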
src/submission/submit.py
CHANGED
@@ -3,17 +3,13 @@ import os
 from datetime import datetime, timezone
 
 from src.display.formatting import styled_error, styled_message, styled_warning
-from src.envs import API, EVAL_REQUESTS_PATH,
-from src.submission.check_validity import
-    already_submitted_models,
-    check_model_card,
-    get_model_size,
-    is_model_on_hub,
-)
+from src.envs import API, EVAL_REQUESTS_PATH, QUEUE_REPO, TOKEN
+from src.submission.check_validity import already_submitted_models, check_model_card, get_model_size, is_model_on_hub
 
 REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
+
 def add_new_eval(
     model: str,
     base_model: str,
@@ -45,7 +41,9 @@ def add_new_eval(
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(
+        base_model_on_hub, error, _ = is_model_on_hub(
+            model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True
+        )
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
 