Aaron Mueller committed · Commit de60bd6 · 1 Parent(s): b166dfb

update leaderboard

Files changed:
- app.py +23 -33
- src/about.py +25 -28
- src/display/utils.py +3 -44
- src/envs.py +4 -4
- src/leaderboard/read_evals.py +8 -2
- src/populate.py +1 -2
- src/submission/submit.py +15 -5
app.py CHANGED

@@ -15,6 +15,7 @@ from src.about import (
 from src.display.css_html_js import custom_css
 from src.display.utils import (
     BENCHMARK_COLS,
+    BENCHMARK_COLS_MULTIMODAL,
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
@@ -50,6 +51,7 @@ except Exception:
 
 
 LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS_MULTIMODAL)
 
 (
     finished_eval_queue_df,
@@ -57,9 +59,11 @@ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS,
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
-def init_leaderboard(dataframe):
+def init_leaderboard(dataframe, track):
     if dataframe is None or dataframe.empty:
         raise ValueError("Leaderboard DataFrame is empty or None.")
+    # filter for correct track
+    dataframe = dataframe.loc[dataframe["track"] == track]
     return Leaderboard(
         value=dataframe,
         datatype=[c.type for c in fields(AutoEvalColumn)],
@@ -95,13 +99,17 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("…
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-…
+        with gr.TabItem("Strict Leaderboard", elem_id="strict-benchmark-tab-table", id=0):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict")
+        with gr.TabItem("Strict-small Leaderboard", elem_id="strict-small-benchmark-tab-table", id=1):
+            leaderboard = init_leaderboard(LEADERBOARD_DF, "strict-small")
+        with gr.TabItem("Multimodal Leaderboard", elem_id="multimodal-benchmark-tab-table", id=2):
+            leaderboard = init_leaderboard(LEADERBOARD_DF_MULTIMODAL, "multimodal")
+
+        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=…
+        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=5):
             with gr.Column():
                 with gr.Row():
                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
@@ -142,36 +150,20 @@ with demo:
                                 row_count=5,
                             )
             with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your …
+                gr.Markdown("# ✉️✨ Submit your predictions here!", elem_classes="markdown-text")
 
             with gr.Row():
                 with gr.Column():
                     model_name_textbox = gr.Textbox(label="Model name")
+                    predictions_path_textbox = gr.Textbox(label="URL to predictions file")
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    …
-                        choices=[…
-                        label=…
+                    track_name = gr.Dropdown(
+                        choices = ["Strict", "Strict-small", "Multimodal"],
+                        label = "Track",
                         multiselect=False,
                         value=None,
-                        interactive=True
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
+                        interactive=True
                     )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
 
                     submit_button = gr.Button("Submit Eval")
                     submission_result = gr.Markdown()
@@ -179,11 +171,9 @@
         add_new_eval,
         [
             model_name_textbox,
-            …
+            predictions_path_textbox,
             revision_name_textbox,
-            …
-            weight_type,
-            model_type,
+            track_name
         ],
         submission_result,
     )
@@ -201,4 +191,4 @@ with demo:
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800)
 scheduler.start()
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch()
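The reworked `init_leaderboard` takes a `track` argument and keeps only the rows of the shared results DataFrame whose `track` column matches the tab being built, so the three leaderboard tabs can be driven from the same data. A minimal sketch of that filter, assuming the DataFrame has a `track` column; the example rows are made up for illustration:

```python
# Minimal sketch of the per-track filtering done in init_leaderboard.
# The rows below are made-up examples; only the "track" column matters here.
import pandas as pd

df = pd.DataFrame(
    {
        "model": ["model-a", "model-b", "model-c"],
        "track": ["strict", "strict-small", "multimodal"],
        "BLiMP": [0.72, 0.69, 0.70],
    }
)

strict_df = df.loc[df["track"] == "strict"]  # rows shown in the "Strict Leaderboard" tab
print(strict_df)
```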
src/about.py CHANGED

@@ -12,8 +12,19 @@ class Task:
 # ---------------------------------------------------
 class Tasks(Enum):
     # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("…
-    task1 = Task("…
+    task0 = Task("blimp", "acc", "BLiMP")
+    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
+    task2 = Task("glue", "acc", "(Super)GLUE")
+    task3 = Task("ewok", "acc", "EWoK")
+
+class TasksMultimodal(Enum):
+    task0 = Task("blimp", "acc", "BLiMP")
+    task1 = Task("blimp_supplement", "acc", "BLiMP Supplement")
+    task2 = Task("glue", "acc", "(Super)GLUE")
+    task3 = Task("ewok", "acc", "EWoK")
+    task4 = Task("vqa", "acc", "VQA")
+    task5 = Task("winoground", "acc", "Winoground")
+    task6 = Task("devbench", "acc", "DevBench")
 
 NUM_FEWSHOT = 0 # Change with your few shot
 # ---------------------------------------------------
@@ -21,52 +32,38 @@ NUM_FEWSHOT = 0 # Change with your few shot
 
 
 # Your leaderboard name
-TITLE = """<h1 align="center" id="space-title">…
+TITLE = """<h1 align="center" id="space-title">BabyLM 2024 Leaderboards</h1>"""
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-…
+The leaderboards for each track of the 2024 BabyLM Challenge.
 """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 ## How it works
-…
-## Reproducibility
-To reproduce our results, here is the commands you can run:
+This leaderboard accepts predictions files as input, and uploads the results to the leaderboard. The logic is the same as in the `score_predictions.py` script from the BabyLM 2024 evaluation pipeline repository.
 
 """
 
 EVALUATION_QUEUE_TEXT = """
 ## Some good practices before submitting a model
 
-### 1) Make sure you can …
-```
-…
-…
-…
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
+### 1) Make sure you can get scores from your prediction using the `score_predictions.py` script.
+```bash
+git clone https://github.com/babylm/evaluation-pipeline-2024/
+cd evaluation-pipeline-2024
+python score_predictions.py path/to/your/predictions.json.gz
 ```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely …
-…
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
+If this step fails, follow the error messages to debug your model before submitting it. It's likely that either (i) some results are missing, or (ii) the results are incorrectly formatted.
 
 ### 3) Make sure your model has an open license!
-This is a leaderboard …
+This is a leaderboard that is meant to advance research on language modeling, and we'd love for as many people as possible to know they can use your model!
 
 ### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-…
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
+When we add extra information about models to the leaderboard, it will be automatically taken from the model card.
 """
 
-CITATION_BUTTON_LABEL = "…
+CITATION_BUTTON_LABEL = "If you would like to cite these results, please cite the 2024 BabyLM Findings paper, as well as the authors of the model(s) whose results you cite!"
 CITATION_BUTTON_TEXT = r"""
 """
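The new `Tasks` and `TasksMultimodal` enums are what `src/display/utils.py` turns into leaderboard columns via `t.value.col_name`. A short sketch of that mapping, assuming the template's `Task` dataclass with `benchmark`, `metric`, and `col_name` fields (only a subset of the tasks is repeated here):

```python
# Sketch of how the task enums become column names; Task's field names are assumed
# to follow the leaderboard template (benchmark, metric, col_name).
from dataclasses import dataclass
from enum import Enum

@dataclass
class Task:
    benchmark: str   # task key in the results JSON
    metric: str      # metric key in the results JSON
    col_name: str    # column name shown on the leaderboard

class TasksMultimodal(Enum):
    task0 = Task("blimp", "acc", "BLiMP")
    task4 = Task("vqa", "acc", "VQA")
    task6 = Task("devbench", "acc", "DevBench")

BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
print(BENCHMARK_COLS_MULTIMODAL)  # ['BLiMP', 'VQA', 'DevBench']
```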
src/display/utils.py CHANGED

@@ -3,7 +3,7 @@ from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
+from src.about import Tasks, TasksMultimodal
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -47,10 +47,9 @@ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=
 @dataclass(frozen=True)
 class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
+    track = ColumnContent("track", "str", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
 ## All the model information that we might need
@@ -60,46 +59,6 @@ class ModelDetails:
     display_name: str = ""
     symbol: str = "" # emoji
 
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        return Precision.Unknown
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
@@ -107,4 +66,4 @@ EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
 BENCHMARK_COLS = [t.value.col_name for t in Tasks]
-…
+BENCHMARK_COLS_MULTIMODAL = [t.value.col_name for t in TasksMultimodal]
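Because `EVAL_COLS` and `EVAL_TYPES` are derived from `EvalQueueColumn` through `fields()`, the new `track` column shows up in the request queue tables without further changes. A self-contained sketch, using a simplified `ColumnContent` (the template's real class carries extra flags such as `hidden`):

```python
# Sketch of how fields() turns EvalQueueColumn into column name/type lists.
# ColumnContent is simplified here to the three attributes actually used.
from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool

@dataclass(frozen=True)
class EvalQueueColumn:  # Queue column
    model = ColumnContent("model", "markdown", True)
    track = ColumnContent("track", "str", True)
    revision = ColumnContent("revision", "str", True)
    private = ColumnContent("private", "bool", True)
    status = ColumnContent("status", "str", True)

def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
print(EVAL_COLS)   # ['model', 'track', 'revision', 'private', 'status']
print(EVAL_TYPES)  # ['markdown', 'str', 'str', 'bool', 'str']
```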
src/envs.py CHANGED

@@ -6,12 +6,12 @@ from huggingface_hub import HfApi
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
 
-OWNER = "…
+OWNER = "babylm" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------
 
-REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
-RESULTS_REPO = f"{OWNER}/results"
+REPO_ID = f"{OWNER}/leaderboard-2024"
+QUEUE_REPO = f"{OWNER}/requests-2024"
+RESULTS_REPO = f"{OWNER}/results-2024"
 
 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")
src/leaderboard/read_evals.py CHANGED

@@ -8,7 +8,7 @@ import dateutil
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, TasksMultimodal, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -39,6 +39,7 @@ class EvalResult:
         data = json.load(fp)
 
         config = data.get("config")
+        track = data.get("track")
 
         # Precision
         precision = Precision.from_str(config.get("model_dtype"))
@@ -154,7 +155,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
     return request_file
 
 
-def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
+def get_raw_eval_results(results_path: str, requests_path: str, track: str) -> list[EvalResult]:
     """From the path of the results folder root, extract all needed info for results"""
     model_result_filepaths = []
 
@@ -174,6 +175,11 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
+        with open(model_result_filepath, 'r') as f:
+            this_track = f["track"]
+            if this_track != track:
+                continue
+
         # Creation of result
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
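In the new loop, `f["track"]` subscripts the open file handle rather than the parsed JSON, which would raise a `TypeError` at runtime. A hedged sketch of the presumably intended filter, assuming each results file stores a top-level `"track"` key:

```python
# Hedged sketch of the per-track filter; the committed hunk indexes the file object
# directly (f["track"]), so json.load is used here to read the key instead.
import json

def result_track(result_path: str):
    """Return the 'track' recorded in a results JSON file, or None if absent."""
    with open(result_path, "r") as f:
        return json.load(f).get("track")

def keep_for_track(result_path: str, track: str) -> bool:
    """True if this results file belongs to the requested track."""
    return result_track(result_path) == track
```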
src/populate.py CHANGED

@@ -10,7 +10,7 @@ from src.leaderboard.read_evals import get_raw_eval_results
 
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
-    raw_data = get_raw_eval_results(results_path, requests_path)
+    raw_data = get_raw_eval_results(results_path, requests_path, track)
     all_data_json = [v.to_dict() for v in raw_data]
 
     df = pd.DataFrame.from_records(all_data_json)
@@ -21,7 +21,6 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
     df = df[has_no_nan_values(df, benchmark_cols)]
     return df
 
-
 def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
     """Creates the different dataframes for the evaluation queues requestes"""
     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
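Note that `track` is used inside `get_leaderboard_df` here without being one of its parameters, and `app.py` still calls the function with four arguments. A hedged sketch of one way the track could be threaded through explicitly; the stubbed `get_raw_eval_results` below is only a placeholder standing in for the real function in `src/leaderboard/read_evals.py`:

```python
# Hedged sketch: pass the track down to get_raw_eval_results as an explicit parameter.
# get_raw_eval_results is stubbed here only so the sketch is self-contained.
import pandas as pd

def get_raw_eval_results(results_path: str, requests_path: str, track: str) -> list:
    # placeholder; the real implementation lives in src/leaderboard/read_evals.py
    return []

def get_leaderboard_df(results_path: str, requests_path: str, cols: list,
                       benchmark_cols: list, track: str) -> pd.DataFrame:
    """Creates a dataframe from the results of the requested track."""
    raw_data = get_raw_eval_results(results_path, requests_path, track)
    all_data_json = [v.to_dict() for v in raw_data]
    return pd.DataFrame.from_records(all_data_json)
```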
src/submission/submit.py CHANGED

@@ -15,7 +15,9 @@ REQUESTED_MODELS = None
 USERS_TO_SUBMISSION_DATES = None
 
 def add_new_eval(
-    …
+    model_name: str,
+    preds_path: str,
+    track: str,
     base_model: str,
     revision: str,
     precision: str,
@@ -28,10 +30,10 @@ def add_new_eval(
     REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
     user_name = ""
-    model_path = …
+    model_path = model_name
     if "/" in model:
-        user_name = …
-        model_path = …
+        user_name = model_name.split("/")[0]
+        model_path = model_name.split("/")[1]
 
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -39,6 +41,12 @@ def add_new_eval(
     if model_type is None or model_type == "":
         return styled_error("Please select a model type.")
 
+    if preds_path is None or preds_path == "":
+        return styled_error("Please enter a URL where your predictions file can be downloaded.")
+
+    if track is None:
+        return styled_error("Please select a track.")
+
     # Does the model actually exist?
     if revision == "":
         revision = "main"
@@ -76,7 +84,9 @@ def add_new_eval(
     print("Adding new eval")
 
     eval_entry = {
-        "…
+        "model_name": model_name,
+        "preds_path": preds_path,
+        "track": track,
         "base_model": base_model,
         "revision": revision,
         "precision": precision,
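The rewritten name-splitting now works off `model_name` (the surrounding context still checks `"/" in model`, so `model_name` is assumed to be the value actually being tested). A minimal runnable sketch of that split, using a made-up example value:

```python
# Sketch of the user/model split as rewritten in this hunk; the value of
# model_name is a made-up example.
model_name = "my-org/my-babylm-model"

user_name = ""
model_path = model_name
if "/" in model_name:
    user_name = model_name.split("/")[0]   # "my-org"
    model_path = model_name.split("/")[1]  # "my-babylm-model"

print(user_name, model_path)
```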