Spaces:
Runtime error
Runtime error
Support comparing model tree generations
Browse files
- app.py +43 -6
- src/constants.py +10 -0
- src/details.py +9 -2
- src/hub.py +22 -3
- src/model_tree.py +42 -0
- src/results.py +9 -6
app.py
CHANGED
@@ -11,6 +11,7 @@ from src.details import (
|
|
11 |
update_subtasks_component,
|
12 |
update_task_description_component,
|
13 |
)
|
|
|
14 |
from src.results import (
|
15 |
clear_results,
|
16 |
clear_results_file,
|
@@ -20,7 +21,6 @@ from src.results import (
|
|
20 |
load_result_paths_per_model,
|
21 |
load_results,
|
22 |
plot_results,
|
23 |
-
update_load_results_component,
|
24 |
update_tasks_component,
|
25 |
)
|
26 |
|
@@ -41,6 +41,18 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
41 |
model_ids = gr.Dropdown(label="Models", multiselect=True)
|
42 |
result_paths_per_model = gr.State()
|
43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
with gr.Row():
|
45 |
with gr.Tab("Results"):
|
46 |
load_results_btn = gr.Button("Load", interactive=False)
|
@@ -119,19 +131,25 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
119 |
outputs=model_ids,
|
120 |
)
|
121 |
|
122 |
-
#
|
123 |
gr.on(
|
124 |
triggers=[model_ids.input],
|
125 |
-
fn=
|
126 |
-
outputs=[load_results_btn, load_configs_btn],
|
127 |
)
|
|
|
|
|
128 |
gr.on(
|
129 |
triggers=[load_results_btn.click, load_configs_btn.click],
|
130 |
fn=display_loading_message_for_results,
|
131 |
outputs=[results, configs],
|
132 |
).then(
|
133 |
fn=load_results,
|
134 |
-
inputs=[
|
|
|
|
|
|
|
|
|
135 |
outputs=results_dataframe,
|
136 |
).then(
|
137 |
fn=update_tasks_component,
|
@@ -185,6 +203,12 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
185 |
results_task,
|
186 |
configs_task,
|
187 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
).then(
|
189 |
fn=clear_results_file,
|
190 |
outputs=results_file,
|
@@ -211,7 +235,11 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
211 |
outputs=details,
|
212 |
).then(
|
213 |
fn=load_details,
|
214 |
-
inputs=[
|
|
|
|
|
|
|
|
|
215 |
outputs=details_dataframe,
|
216 |
).then(
|
217 |
fn=update_sample_idx_component,
|
@@ -240,4 +268,13 @@ with gr.Blocks(fill_height=True, fill_width=True) as demo:
|
|
240 |
],
|
241 |
)
|
242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
243 |
demo.launch()
|
|
|
11 |
update_subtasks_component,
|
12 |
update_task_description_component,
|
13 |
)
|
14 |
+
from src.model_tree import load_model_tree
|
15 |
from src.results import (
|
16 |
clear_results,
|
17 |
clear_results_file,
|
|
|
21 |
load_result_paths_per_model,
|
22 |
load_results,
|
23 |
plot_results,
|
|
|
24 |
update_tasks_component,
|
25 |
)
|
26 |
|
|
|
41 |
model_ids = gr.Dropdown(label="Models", multiselect=True)
|
42 |
result_paths_per_model = gr.State()
|
43 |
|
44 |
+
with gr.Accordion("Model tree: Compare base and derived models", open=False):
|
45 |
+
load_model_tree_btn = gr.Button("Load Model Tree", interactive=False)
|
46 |
+
model_tree_labels = [constants.BASE_MODEL_TYPE[0]] + [
|
47 |
+
derived_model_type[0] for derived_model_type in constants.DERIVED_MODEL_TYPES
|
48 |
+
]
|
49 |
+
base_and_derived_models = [
|
50 |
+
gr.Dropdown(label=model_tree_labels[0], multiselect=True),
|
51 |
+
]
|
52 |
+
with gr.Row():
|
53 |
+
for label in model_tree_labels[1:]:
|
54 |
+
base_and_derived_models.append(gr.Dropdown(label=label, multiselect=True, interactive=False))
|
55 |
+
|
56 |
with gr.Row():
|
57 |
with gr.Tab("Results"):
|
58 |
load_results_btn = gr.Button("Load", interactive=False)
|
|
|
131 |
outputs=model_ids,
|
132 |
)
|
133 |
|
134 |
+
# Buttons:
|
135 |
gr.on(
|
136 |
triggers=[model_ids.input],
|
137 |
+
fn=lambda: (gr.Button(interactive=True),) * 3,
|
138 |
+
outputs=[load_model_tree_btn, load_results_btn, load_configs_btn],
|
139 |
)
|
140 |
+
|
141 |
+
# RESULTS:
|
142 |
gr.on(
|
143 |
triggers=[load_results_btn.click, load_configs_btn.click],
|
144 |
fn=display_loading_message_for_results,
|
145 |
outputs=[results, configs],
|
146 |
).then(
|
147 |
fn=load_results,
|
148 |
+
inputs=[
|
149 |
+
result_paths_per_model,
|
150 |
+
model_ids,
|
151 |
+
*base_and_derived_models,
|
152 |
+
],
|
153 |
outputs=results_dataframe,
|
154 |
).then(
|
155 |
fn=update_tasks_component,
|
|
|
203 |
results_task,
|
204 |
configs_task,
|
205 |
],
|
206 |
+
).then(
|
207 |
+
fn=lambda: gr.Button(interactive=False),
|
208 |
+
outputs=load_model_tree_btn,
|
209 |
+
).then(
|
210 |
+
fn=lambda: [gr.Dropdown(label=label, multiselect=True, interactive=False) for label in model_tree_labels],
|
211 |
+
outputs=[*base_and_derived_models],
|
212 |
).then(
|
213 |
fn=clear_results_file,
|
214 |
outputs=results_file,
|
|
|
235 |
outputs=details,
|
236 |
).then(
|
237 |
fn=load_details,
|
238 |
+
inputs=[
|
239 |
+
subtask,
|
240 |
+
model_ids,
|
241 |
+
*base_and_derived_models,
|
242 |
+
],
|
243 |
outputs=details_dataframe,
|
244 |
).then(
|
245 |
fn=update_sample_idx_component,
|
|
|
268 |
],
|
269 |
)
|
270 |
|
271 |
+
# MODEL TREE:
|
272 |
+
load_model_tree_btn.click(
|
273 |
+
fn=load_model_tree,
|
274 |
+
inputs=[result_paths_per_model, model_ids],
|
275 |
+
outputs=[
|
276 |
+
*base_and_derived_models,
|
277 |
+
],
|
278 |
+
)
|
279 |
+
|
280 |
demo.launch()
|
src/constants.py
CHANGED
@@ -72,3 +72,13 @@ TASK_DESCRIPTIONS = {
|
|
72 |
"leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
|
73 |
"leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
|
74 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
"leaderboard_mmlu_pro": "MMLU-Pro is a refined version of the MMLU dataset, which has been a standard for multiple-choice knowledge assessment. Recent research identified issues with the original MMLU, such as noisy data (some unanswerable questions) and decreasing difficulty due to advances in model capabilities and increased data contamination. MMLU-Pro addresses these issues by presenting models with 10 choices instead of 4, requiring reasoning on more questions, and undergoing expert review to reduce noise. As a result, MMLU-Pro is of higher quality and currently more challenging than the original.",
|
73 |
"leaderboard_musr": "MuSR is a new dataset consisting of algorithmically generated complex problems, each around 1,000 words in length. The problems include murder mysteries, object placement questions, and team allocation optimizations. Solving these problems requires models to integrate reasoning with long-range context parsing. Few models achieve better than random performance on this dataset.",
|
74 |
}
|
75 |
+
|
76 |
+
|
77 |
+
# Root URL of the Hugging Face Hub HTTP API.
HF_API_URL = "https://huggingface.co/api"

# Model-tree categories are (label, key) pairs: the first item is shown as a
# dropdown label, the second is the metadata/filter key for that category.
BASE_MODEL_TYPE = ("Base models", "base_model")
DERIVED_MODEL_TYPES = [
    ("Adapters", "adapter"),
    ("Finetunes", "finetune"),
    ("Merges", "merge"),
    ("Quantizations", "quantized"),
]
|
src/details.py
CHANGED
@@ -61,8 +61,15 @@ async def load_details_dataframe(model_id, subtask):
|
|
61 |
return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
|
62 |
|
63 |
|
64 |
-
async def load_details(
|
65 |
-
dfs = await asyncio.gather(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
if dfs:
|
67 |
return pd.concat(dfs)
|
68 |
|
|
|
61 |
return df.sort_values("doc_id").set_index("doc_id", drop=False).set_index("model_name", append=True)
|
62 |
|
63 |
|
64 |
+
async def load_details(subtask, *model_ids_lists):
    """Load and concatenate the details dataframes of every selected model.

    Args:
        subtask: Subtask identifier forwarded to ``load_details_dataframe``.
        *model_ids_lists: Any number of model-id collections; falsy entries
            (e.g. ``None`` or an empty list) are skipped.

    Returns:
        The concatenation of all loaded dataframes, or ``None`` when there is
        nothing to concatenate.
    """
    # The same model can be selected in several inputs; loading it twice would
    # duplicate its rows, so keep only the first occurrence (order preserved).
    unique_model_ids = dict.fromkeys(
        model_id for model_ids in model_ids_lists if model_ids for model_id in model_ids
    )
    dfs = await asyncio.gather(
        *(load_details_dataframe(model_id, subtask) for model_id in unique_model_ids)
    )
    # Mirror load_results: drop missing frames so pd.concat never receives
    # an all-None list (which it rejects).
    dfs = [df for df in dfs if df is not None]
    if dfs:
        return pd.concat(dfs)
    return None
|
75 |
|
src/hub.py
CHANGED
@@ -2,9 +2,11 @@ import io
|
|
2 |
import json
|
3 |
|
4 |
import httpx
|
5 |
-
from huggingface_hub import HfFileSystem, hf_hub_url
|
6 |
from huggingface_hub.utils import build_hf_headers
|
7 |
|
|
|
|
|
8 |
|
9 |
client = httpx.AsyncClient(follow_redirects=True)
|
10 |
fs = HfFileSystem()
|
@@ -29,5 +31,22 @@ async def load_jsonlines_file(path):
|
|
29 |
|
30 |
|
31 |
def to_url(path):
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import json
|
3 |
|
4 |
import httpx
|
5 |
+
from huggingface_hub import HfFileSystem, ModelCard, hf_hub_url
|
6 |
from huggingface_hub.utils import build_hf_headers
|
7 |
|
8 |
+
import src.constants as constants
|
9 |
+
|
10 |
|
11 |
client = httpx.AsyncClient(follow_redirects=True)
|
12 |
fs = HfFileSystem()
|
|
|
31 |
|
32 |
|
33 |
def to_url(path):
    """Build the Hub download URL for a filesystem-style repository path.

    Args:
        path: ``[<repo_type>s/]<org>/<repo>/<filename>``, where the optional
            leading segment is ``datasets``, ``spaces`` or ``models`` and
            ``<filename>`` may itself contain ``/``.

    Returns:
        The URL produced by ``hf_hub_url`` for that repository file.

    Raises:
        ValueError: If ``path`` lacks an org, a repository name or a filename.
    """
    segments = path.split("/")
    repo_type = None
    # Recognise the repo-type prefix by name instead of by position, so that a
    # prefix-less path with a nested filename ("org/model/dir/file") is not
    # misread as having repo type "or".
    if segments[0] in ("datasets", "spaces", "models"):
        repo_type = segments[0][:-1]  # "datasets" -> "dataset"
        segments = segments[1:]
    if len(segments) < 3:
        raise ValueError(f"Expected '[<repo_type>s/]<org>/<repo>/<filename>', got: {path!r}")
    org_name, repo_name = segments[:2]
    filename = "/".join(segments[2:])
    return hf_hub_url(repo_id=f"{org_name}/{repo_name}", filename=filename, repo_type=repo_type)
|
39 |
+
|
40 |
+
|
41 |
+
async def load_model_card(model_id):
    """Download a model's ``README.md`` and parse it into a ``ModelCard``.

    Metadata errors are ignored while parsing.
    NOTE(review): the HTTP status is not checked, so an error response body
    would be parsed like a card - confirm this is intended.
    """
    readme_url = to_url(f"{model_id}/README.md")
    response = await client.get(readme_url)
    card_text = response.text
    return ModelCard(card_text, ignore_metadata_errors=True)
|
45 |
+
|
46 |
+
|
47 |
+
async def list_models(filtering=None):
    """Query the Hub ``/models`` endpoint and return its decoded JSON body.

    Args:
        filtering: Optional value sent as the ``filter`` query parameter; the
            parameter is omitted when this is falsy.
    """
    query = {"filter": filtering} if filtering else {}
    response = await client.get(f"{constants.HF_API_URL}/models", params=query)
    return response.json()
|
src/model_tree.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import asyncio
|
2 |
+
|
3 |
+
import gradio as gr
|
4 |
+
|
5 |
+
import src.constants as constants
|
6 |
+
from src.hub import list_models, load_model_card
|
7 |
+
|
8 |
+
|
9 |
+
async def load_model_tree(result_paths_per_model, model_ids):
    """Build the dropdown updates for the model tree of the first selected model.

    Args:
        result_paths_per_model: Container of model ids; only used for
            membership tests, so related models absent from it are dropped.
            May be ``None``.
        model_ids: Currently selected model ids; only the first one is used.

    Returns:
        One ``gr.Dropdown`` per category (base models first, then every entry
        of ``constants.DERIVED_MODEL_TYPES`` in order). Each label carries the
        number of choices, and a dropdown is interactive only when it has any.
    """
    labels = [constants.BASE_MODEL_TYPE[0]] + [label for label, _ in constants.DERIVED_MODEL_TYPES]
    if model_ids:
        # TODO: Multiple models?
        root_model_id = model_ids[0]
        related_per_type = await asyncio.gather(
            load_base_models(root_model_id),
            *(
                load_derived_models_by_type(root_model_id, type_key)
                for _, type_key in constants.DERIVED_MODEL_TYPES
            ),
        )
    else:
        # The selection can be cleared while the button is still enabled;
        # render empty dropdowns instead of failing on model_ids[0].
        related_per_type = [[] for _ in labels]
    # The state holding the result paths may not have been filled yet.
    available = result_paths_per_model or {}
    choices_per_type = [
        [candidate for candidate in related if candidate in available]
        for related in related_per_type
    ]
    return [
        gr.Dropdown(choices=choices, label=f"{label} ({len(choices)})", interactive=bool(choices))
        for choices, label in zip(choices_per_type, labels)
    ]
|
29 |
+
|
30 |
+
|
31 |
+
async def load_base_models(model_id) -> list[str]:
    """Return the base model id(s) declared in a model's card metadata.

    The attribute named by ``constants.BASE_MODEL_TYPE[1]`` is read from the
    card data; a single value is wrapped in a list.

    Returns:
        A list of base model ids; empty when the card declares none.
    """
    card = await load_model_card(model_id)
    base_models = getattr(card.data, constants.BASE_MODEL_TYPE[1], None)
    if base_models is None:
        # No base model declared: return an empty list rather than [None],
        # which would violate the list[str] contract.
        return []
    if not isinstance(base_models, list):
        base_models = [base_models]
    return base_models
|
37 |
+
|
38 |
+
|
39 |
+
async def load_derived_models_by_type(model_id, derived_model_type):
    """Return the ids of the models listed for ``base_model:<type>:<model_id>``.

    Args:
        model_id: Id of the model whose derived models are requested.
        derived_model_type: Category key placed in the middle of the filter.
    """
    hub_filter = f"base_model:{derived_model_type}:{model_id}"
    listing = await list_models(filtering=hub_filter)
    return [entry["id"] for entry in listing]
|
src/results.py
CHANGED
@@ -29,10 +29,6 @@ def sort_result_paths_per_model(paths):
|
|
29 |
return {model_id: sorted(paths) for model_id, paths in d.items()}
|
30 |
|
31 |
|
32 |
-
def update_load_results_component():
|
33 |
-
return (gr.Button("Load", interactive=True),) * 2
|
34 |
-
|
35 |
-
|
36 |
async def load_results_dataframe(model_id, result_paths_per_model=None):
|
37 |
if not model_id or not result_paths_per_model:
|
38 |
return
|
@@ -48,8 +44,15 @@ async def load_results_dataframe(model_id, result_paths_per_model=None):
|
|
48 |
return df.set_index(pd.Index([model_name]))
|
49 |
|
50 |
|
51 |
-
async def load_results(
|
52 |
-
dfs = await asyncio.gather(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
dfs = [df for df in dfs if df is not None]
|
54 |
if dfs:
|
55 |
return pd.concat(dfs)
|
|
|
29 |
return {model_id: sorted(paths) for model_id, paths in d.items()}
|
30 |
|
31 |
|
|
|
|
|
|
|
|
|
32 |
async def load_results_dataframe(model_id, result_paths_per_model=None):
|
33 |
if not model_id or not result_paths_per_model:
|
34 |
return
|
|
|
44 |
return df.set_index(pd.Index([model_name]))
|
45 |
|
46 |
|
47 |
+
async def load_results(result_paths_per_model, *model_ids_lists):
    """Load and concatenate the results dataframes of every selected model.

    Args:
        result_paths_per_model: Forwarded unchanged to
            ``load_results_dataframe`` for each model.
        *model_ids_lists: Any number of model-id collections; falsy entries
            (e.g. ``None`` or an empty list) are skipped.

    Returns:
        The concatenation of all dataframes that were loaded, or ``None`` when
        none was.
    """
    # The same model can be selected in several inputs; loading it twice would
    # duplicate its row, so keep only the first occurrence (order preserved).
    unique_model_ids = dict.fromkeys(
        model_id for model_ids in model_ids_lists if model_ids for model_id in model_ids
    )
    dfs = await asyncio.gather(
        *(load_results_dataframe(model_id, result_paths_per_model) for model_id in unique_model_ids)
    )
    # load_results_dataframe returns None when it has nothing to load.
    dfs = [df for df in dfs if df is not None]
    if dfs:
        return pd.concat(dfs)
    return None
|