Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 17,764 Bytes
ad7917a c223932 ad7917a d4e77f2 ad7917a 188a5cb ad7917a 188a5cb ad7917a a22ef66 ad7917a 89233f9 ad7917a bd97257 ad7917a 947b7c2 ad7917a 3701087 188a5cb ad7917a d4e77f2 f3bd1e0 e4d2970 ad7917a d1935ca ad7917a bd97257 ad7917a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 |
# TODO:
# - Remove duplication in the code used to generate markdown.
# - Periodically re-check tracked models to confirm they are all still valid and public.
import os
import re
import sys
from functools import lru_cache
from pathlib import Path
from typing import Dict, List, Set, Union
import gradio as gr
from apscheduler.schedulers.background import BackgroundScheduler
from apscheduler.triggers.cron import CronTrigger
from cachetools import TTLCache, cached
from dotenv import load_dotenv
from huggingface_hub import (
HfApi,
comment_discussion,
create_discussion,
dataset_info,
get_repo_discussions,
)
from huggingface_hub.utils import HFValidationError, RepositoryNotFoundError
from sqlitedict import SqliteDict
from toolz import concat, count, unique
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map
# --- Runtime configuration ---------------------------------------------
# A macOS host is treated as a local run and uses relative paths; every
# other platform uses paths under /data.
local = sys.platform.startswith("darwin")
if local:
    cache_location = "cache/"
    save_dir = "test_data"
else:
    cache_location = "/data/cache"
    save_dir = "/data/"
# Make sure the directory that holds the SQLite files exists.
Path(save_dir).mkdir(parents=True, exist_ok=True)

# Load variables from a .env file (if any) before reading them below.
load_dotenv()
user_agent = os.getenv("USER_AGENT")
HF_TOKEN = os.getenv("HF_TOKEN")

REPO = "librarian-bots/dataset-to-model-monitor"  # where issues land
AUTHOR = "librarian-bot"  # who makes the issues

hf_api = HfApi(user_agent=user_agent)
# Shared cache for Hub lookups: at most 5,000 entries, each kept for 600 seconds.
ten_min_cache = TTLCache(maxsize=5_000, ttl=600)
@cached(cache=ten_min_cache)
def get_datasets_for_user(username: str) -> List[str]:
    """Return the IDs of the datasets the Hub lists for ``username``.

    Results are memoised in ``ten_min_cache``.

    Args:
        username: Hub user or organization name passed as ``author``.

    Returns:
        A list of dataset IDs.
    """
    # Build a real list: a generator stored in the cache would be consumed by
    # the first caller and every later cache hit would see it empty.
    return [dataset.id for dataset in hf_api.list_datasets(author=username)]
@cached(cache=ten_min_cache)
def get_models_for_dataset(dataset_id):
    """Look up the models the Hub lists under the ``dataset:<id>`` filter.

    Returns:
        A one-entry dict mapping ``dataset_id`` to a de-duplicated list of
        model IDs (an empty list when nothing matches).
    """
    matching_models = hf_api.list_models(filter=f"dataset:{dataset_id}")
    # A set drops duplicate IDs; the resulting list order is therefore arbitrary.
    unique_model_ids = {model.id for model in matching_models}
    return {dataset_id: list(unique_model_ids)}
def generate_dataset_model_map(
    dataset_ids: List[str],
) -> Dict[str, List[str]]:
    """Build a ``dataset_id -> [model_id, ...]`` map for ``dataset_ids``.

    Each dataset is looked up with ``get_models_for_dataset`` through
    ``thread_map``; the single-entry dicts it returns are merged into one.

    Args:
        dataset_ids: Hub dataset IDs to look up.

    Returns:
        A flat dict with one key per dataset ID.
    """
    per_dataset_results = thread_map(get_models_for_dataset, dataset_ids)
    merged: Dict[str, List[str]] = {}
    for single_dataset_map in per_dataset_results:
        merged.update(single_dataset_map)
    return merged
def maybe_update_datasets_to_model_map(dataset_id):
    """Start tracking ``dataset_id`` in the models database if it is new.

    Returns:
        The number of entries in the database after adding the dataset, or
        ``False`` when the dataset was already present.
    """
    db_path = f"{save_dir}/models_to_dataset.sqlite"
    with SqliteDict(db_path) as models_db:
        if dataset_id in models_db:
            return False
        # Store the current model list so later runs can compare against it.
        current_models = get_models_for_dataset(dataset_id)[dataset_id]
        models_db[dataset_id] = list(current_models)
        models_db.commit()
        return len(models_db)
def datasets_tracked_by_user(username):
    """Return the dataset IDs whose subscriber list contains ``username``."""
    db_path = f"{save_dir}/tracked_dataset_to_users.sqlite"
    tracked = []
    with SqliteDict(db_path) as tracking_db:
        for dataset_id, subscribers in tracking_db.items():
            if username in subscribers:
                tracked.append(dataset_id)
    return tracked
def update_tracked_dataset_to_users(dataset_id: str, username: str):
    """Subscribe ``username`` to ``dataset_id`` and return their tracked datasets.

    A new entry is created for a dataset nobody tracks yet; otherwise the user
    is added to the existing subscriber list unless already present.

    Returns:
        The list produced by ``datasets_tracked_by_user(username)``.
    """
    db_path = f"{save_dir}/tracked_dataset_to_users.sqlite"
    with SqliteDict(db_path) as tracking_db:
        if dataset_id not in tracking_db:
            tracking_db[dataset_id] = [username]
            tracking_db.commit()
        else:
            subscribers = tracking_db[dataset_id]
            # Only write when the user is not already subscribed.
            if username not in subscribers:
                subscribers.append(username)
                tracking_db[dataset_id] = list(set(subscribers))
                tracking_db.commit()
    return datasets_tracked_by_user(username)
# Matches "<name>/*" where <name> consists of letters, digits, "_" or "-".
HUB_ORG_OR_USERNAME_GLOB_PATTERN = re.compile(r"^([a-zA-Z0-9_-]+)\/\*$")


@lru_cache(maxsize=128)
def match_org_user_glob_pattern(hub_id):
    """Return the name part of a ``<name>/*`` entry, or ``None`` if it is not one."""
    found = HUB_ORG_OR_USERNAME_GLOB_PATTERN.match(hub_id)
    return found.group(1) if found is not None else None
@cached(cache=TTLCache(maxsize=100, ttl=60))
def grab_dataset_ids_for_user_or_org(hub_id: str) -> List[str]:
    """List the IDs of the non-private datasets authored by ``hub_id``.

    Results are cached for 60 seconds.
    """
    public_ids = []
    for dataset in hf_api.list_datasets(author=hub_id):
        # Only entries explicitly flagged ``private is False`` are kept.
        if dataset.private is False:
            public_ids.append(dataset.id)
    return public_ids
@cached(cache=TTLCache(maxsize=100, ttl=60))
def parse_hub_id_entry(hub_id: str) -> tuple[Union[str, List[str]], str | None]:
    """Resolve a user-supplied entry into dataset ID(s).

    Args:
        hub_id: Either a single dataset ID or a ``<user-or-org>/*`` glob.

    Returns:
        A 2-tuple ``(ids, owner)``:

        * for a glob entry, ``ids`` is the list of dataset IDs returned by
          ``grab_dataset_ids_for_user_or_org`` and ``owner`` is the matched
          user/org name;
        * for a single dataset, ``ids`` is ``hub_id`` itself and ``owner`` is
          ``None``.

    Raises:
        gr.Error: If ``dataset_info`` rejects the ID as malformed or the
            repository cannot be found.
    """
    if owner := match_org_user_glob_pattern(hub_id):
        return grab_dataset_ids_for_user_or_org(owner), owner
    try:
        # Called only for validation; the returned info is not used.
        dataset_info(hub_id)
    except HFValidationError as e:
        raise gr.Error(f"Invalid format for Hugging Face Hub dataset ID. {e}") from e
    except RepositoryNotFoundError as e:
        raise gr.Error(f"{hub_id}: Invalid Hugging Face Hub dataset ID") from e
    return hub_id, None
def remove_user_from_tracking_datasets(dataset_id, profile: gr.OAuthProfile | None):
    """Stop tracking a dataset (or every dataset of a user/org) for the logged-in user.

    Args:
        dataset_id: A single dataset ID or a ``<user-or-org>/*`` glob.
        profile: OAuth profile of the current user, or ``None`` when nobody
            is logged in.

    Returns:
        A status message for display in the UI.
    """
    if not profile and not local:
        return "You must be logged in to remove a dataset"
    # NOTE(review): if ``profile`` is None while ``local`` is true, the next
    # line raises AttributeError — confirm a profile is always supplied locally.
    username = profile.username
    dataset_id, match = parse_hub_id_entry(dataset_id)
    if isinstance(dataset_id, str):
        return _remove_user_from_tracking_datasets(dataset_id, username)
    # Glob entry: unsubscribe from each resolved dataset in turn.
    for dataset in dataset_id:
        _remove_user_from_tracking_datasets(dataset, username)
    return f"Stopped tracking datasets for username or org: {match}"
def _remove_user_from_tracking_datasets(dataset_id: str, username):
    """Remove ``username`` from the subscribers of ``dataset_id``.

    When the last subscriber is removed, the dataset is deleted from the
    tracking database and, if present, from the dataset-to-models database.

    Returns:
        A status message: the dataset was not tracked at all, the user was
        not subscribed to it, or the subscription was removed.
    """
    with SqliteDict(
        f"{save_dir}/tracked_dataset_to_users.sqlite"
    ) as tracked_dataset_to_users_db:
        users = tracked_dataset_to_users_db.get(dataset_id)
        if users is None:
            return "Dataset not being tracked"
        try:
            users.remove(username)
        except ValueError:
            return "No longer tracking dataset"
        if users:
            tracked_dataset_to_users_db[dataset_id] = users
        else:
            # Nobody is left: stop monitoring the dataset altogether.
            del tracked_dataset_to_users_db[dataset_id]
            with SqliteDict(
                f"{save_dir}/models_to_dataset.sqlite"
            ) as dataset_to_models_db:
                # Guard the delete so a missing entry cannot raise KeyError
                # and skip the commit of the tracking database below.
                if dataset_id in dataset_to_models_db:
                    del dataset_to_models_db[dataset_id]
                    dataset_to_models_db.commit()
        tracked_dataset_to_users_db.commit()
        return "Dataset no longer being tracked"
def user_unsubscribe_all(username):
    """Unsubscribe ``username`` from every dataset they are tracking.

    Returns:
        A message stating how many datasets were unsubscribed.
    """
    datasets_tracked = datasets_tracked_by_user(username)
    for dataset_id in datasets_tracked:
        # Call the private helper directly: it takes (dataset_id, username),
        # whereas the public wrapper expects an OAuth profile as 2nd argument.
        _remove_user_from_tracking_datasets(dataset_id, username)
    # Internal sanity check that nothing was left behind.
    assert len(datasets_tracked_by_user(username)) == 0
    return f"Unsubscribed from {len(datasets_tracked)} datasets"
def user_update(hub_id, profile: gr.OAuthProfile | None):
    """Handle a "track" request: subscribe the current user to ``hub_id``.

    ``hub_id`` may be a single dataset ID or a ``<user-or-org>/*`` glob.
    Returns the markdown/status text to display.
    """
    if not (profile or local):
        return "Please login to track a dataset"
    username = profile.username
    resolved, owner = parse_hub_id_entry(hub_id)
    if not isinstance(resolved, str):
        # Glob entry: ``resolved`` holds the resolved dataset IDs.
        return glob_update_tracked_datasets(resolved, username, owner)
    return _user_update(resolved, username)
def glob_update_tracked_datasets(hub_ids, username, match):
    """Subscribe ``username`` to every ID in ``hub_ids`` and build a summary.

    Args:
        hub_ids: Dataset IDs resolved from a ``<user-or-org>/*`` entry.
        username: User to subscribe.
        match: The user/org name the glob referred to (used in the message).

    Returns:
        Markdown text summarising what the user is now tracking.
    """
    for dataset_id in tqdm(hub_ids):
        _user_update(dataset_id, username)

    tracked = datasets_tracked_by_user(username)
    parts = [
        "## Dataset tracking summary \n\n",
        f"All datasets under the user or organization: {match} are being tracked \n\n",
        "You are currently tracking whether new models have been trained on"
        f" {len(tracked)} datasets.\n\n",
    ]
    if tracked:
        parts.append("### Datasets being tracked \n\n")
        parts.append(
            "You are currently monitoring whether new models have been trained on the"
            " following datasets:\n"
        )
        parts.extend(
            f"- [{dataset}](https://huggingface.co/datasets/{dataset})\n"
            for dataset in tracked
        )
    return "".join(parts)
def _user_update(hub_id: str, username: str) -> str:
    """Subscribe ``username`` to ``hub_id`` and describe the resulting state."""
    parts = []
    tracked_total = maybe_update_datasets_to_model_map(hub_id)
    if tracked_total:
        parts.append(
            "New dataset being tracked! Now tracking"
            f" {tracked_total} datasets \n\n"
        )
    else:
        parts.append(f"Dataset {hub_id} is already being tracked. \n\n")

    user_datasets = update_tracked_dataset_to_users(hub_id, username)
    parts.append(
        "You are currently tracking whether new models have been trained on"
        f" {len(user_datasets)} datasets."
    )
    if not user_datasets:
        parts.append("You are not currently tracking any datasets.")
    else:
        parts.append(
            "\nYou are currently monitoring whether new models have been trained on the"
            " following datasets:\n"
        )
        parts.extend(
            f"- [{dataset}](https://huggingface.co/datasets/{dataset})\n"
            for dataset in user_datasets
        )
    return "".join(parts)
def check_for_new_models_for_dataset_and_update() -> Dict[str, Set[str]]:
    """Refresh the stored model lists and report newly seen models.

    For every dataset in the models database, the current model list is
    fetched and compared against the stored one; the stored list is then
    replaced with the current one.

    Returns:
        A dict mapping dataset IDs to the set of model IDs that were not in
        the previously stored list. Datasets without new models are omitted.
    """
    with SqliteDict(f"{save_dir}/models_to_dataset.sqlite") as old_results_db:
        dataset_ids = list(old_results_db.keys())
        new_results = generate_dataset_model_map(dataset_ids)
        models_to_notify_about: Dict[str, Set[str]] = {}
        for dataset_id, models in new_results.items():
            # Compare by membership, not by length: a model can be added while
            # another one disappears, leaving the count unchanged.
            added = set(models).difference(old_results_db[dataset_id])
            if added:
                models_to_notify_about[dataset_id] = added
            old_results_db[dataset_id] = models
        old_results_db.commit()
    return models_to_notify_about
def get_repo_discussion_by_author_and_type(
    repo, author, token, repo_type="space", include_prs=False
):
    """Yield the discussions of ``repo`` that were opened by ``author``.

    Pull requests are skipped unless ``include_prs`` is true.
    """
    for discussion in get_repo_discussions(repo, repo_type=repo_type, token=token):
        if discussion.author != author:
            continue
        if include_prs or not discussion.is_pull_request:
            yield discussion
def create_discussion_text_body(dataset_id, new_models, users_to_notify):
    """Compose the markdown body announcing ``new_models`` for ``dataset_id``.

    The text mentions every user in ``users_to_notify`` with an ``@`` prefix,
    links the dataset, lists each new model as a bullet link and ends with a
    footer pointing back to the monitor Space.
    """
    mentions = ", ".join(f"@{username}" for username in users_to_notify)
    dataset_link = f"[{dataset_id}](https://huggingface.co/datasets/{dataset_id})"
    model_bullets = "\n".join(
        f"- {hub_id_to_huggingface_hub_url_markdown(model)}" for model in new_models
    )
    footer = """\n\n This discussion was created by the [Dataset to Model Monitor](https://huggingface.co/spaces/librarian-bots/dataset-to-model-monitor) Space. You can modify your alerts using this Space."""
    greeting = (
        f"Hey {mentions}! Librarian Bot found new models trained on the"
        f" {dataset_link} dataset!\n\n"
    )
    heading = f"New model trained on {dataset_id}:\n"
    return greeting + heading + model_bullets + footer
def maybe_create_discussion(
    repo: str,
    dataset_id: str,
    new_models: Union[List, str],
    users_to_notify: List[str],
    author: str,
    token: str,
):
    """Post a notification about ``new_models`` to the tracking discussion.

    If ``author`` already opened a discussion with the expected title in
    ``repo``, the notification is added as a comment; otherwise a new
    discussion is created.

    Args:
        repo: Space repository that hosts the discussions.
        dataset_id: Dataset the new models were trained on.
        new_models: Model IDs to announce.
        users_to_notify: Usernames to mention in the message.
        author: Account whose discussions are searched for an existing thread.
        token: Hub token used for listing, commenting and creating.
    """
    title = f"Discussion tracking new models trained on {dataset_id}"
    # The body is the same whether we comment or open a new discussion.
    description = create_discussion_text_body(dataset_id, new_models, users_to_notify)
    # Use the caller-supplied token rather than the module-level HF_TOKEN.
    discussions = get_repo_discussion_by_author_and_type(repo, author, token)
    existing = next(
        (discussion for discussion in discussions if title == discussion.title),
        None,
    )
    if existing is not None:
        comment_discussion(
            repo, existing.num, description, token=token, repo_type="space"
        )
    else:
        create_discussion(
            repo,
            title,
            token=token,
            description=description,
            repo_type="space",
        )
def hub_id_to_huggingface_hub_url_markdown(hub_id: str) -> str:
    """Format ``hub_id`` as a markdown link to its huggingface.co page."""
    url = f"https://huggingface.co/{hub_id}"
    return f"[{hub_id}]({url})"
def notify_about_new_models():
    """Check all tracked datasets for new models and notify their subscribers.

    For every dataset with new models, the subscribers are read from the
    tracking database and a discussion (or comment) is posted in ``REPO``.
    Datasets that currently have no subscribers are skipped.
    """
    print("running notifications")
    models_to_notify_about = check_for_new_models_for_dataset_and_update()
    if not models_to_notify_about:
        return
    # Read all subscriber lists in one pass instead of reopening the database
    # for every dataset.
    with SqliteDict(
        f"{save_dir}/tracked_dataset_to_users.sqlite"
    ) as tracked_dataset_to_users_db:
        users_by_dataset = {
            dataset_id: tracked_dataset_to_users_db.get(dataset_id)
            for dataset_id in models_to_notify_about
        }
    for dataset_id, new_models in models_to_notify_about.items():
        users_to_notify = users_by_dataset[dataset_id]
        if not users_to_notify:
            # ``.get`` returned None or an empty list: nobody to mention, and
            # building the message would fail on None.
            continue
        maybe_create_discussion(
            REPO, dataset_id, new_models, users_to_notify, AUTHOR, HF_TOKEN
        )
    print("notified about new models")
def number_of_users_tracking_datasets():
    """Count the distinct usernames found across all subscriber lists."""
    db_path = f"{save_dir}/tracked_dataset_to_users.sqlite"
    distinct_users = set()
    with SqliteDict(db_path) as tracking_db:
        for subscribers in tracking_db.values():
            distinct_users.update(subscribers)
    return len(distinct_users)
def number_of_datasets_tracked():
    """Return how many datasets are stored in the models database."""
    db_path = f"{save_dir}/models_to_dataset.sqlite"
    with SqliteDict(db_path) as datasets_to_models_db:
        total = len(datasets_to_models_db)
    return total
@cached(cache=TTLCache(maxsize=1, ttl=30))
def generate_summary_stats():
    """Return a one-sentence summary of app-wide usage (cached for 30 seconds)."""
    user_total = number_of_users_tracking_datasets()
    dataset_total = number_of_datasets_tracked()
    return (
        f"Currently there are {user_total} users tracking"
        f" datasets with a total of {dataset_total} datasets being"
        " tracked"
    )
def _user_stats(username: str):
    """Build the markdown listing of datasets ``username`` is tracking."""
    tracked = datasets_tracked_by_user(username)
    if not tracked:
        return "You are not currently tracking any datasets"
    header = (
        "You are currently tracking whether new models have been trained on"
        f" {len(tracked)} datasets.\n\n"
        "### Datasets being tracked \n\n"
        "You are currently monitoring whether new models have been trained on the"
        " following datasets:\n"
    )
    bullets = "".join(
        f"- [{dataset}](https://huggingface.co/datasets/{dataset})\n"
        for dataset in tracked
    )
    return header + bullets
def user_stats(profile: gr.OAuthProfile | None):
    """Return the tracked-dataset listing for the logged-in user."""
    if not (profile or local):
        return "You must be logged in to view datasets you are tracking"
    return _user_stats(profile.username)
# Introductory markdown rendered near the top of the page: explains what the
# app does, how to use it, and gives usage tips.
markdown_text = """
The Hugging Face Hub allows users to specify the dataset used to train a model in the model metadata.
This metadata allows you to find models trained on a particular dataset.
These links can be very powerful for finding models that might be suitable for a particular task.\n\n
This Gradio app allows you to track datasets hosted on the Hugging Face Hub and get a notification when new models are trained on the dataset you are tracking.
1. Submit the Hugging Face Hub ID for the dataset you are interested in tracking.
2. If a new model is listed as being trained on this dataset Librarian Bot will ping you in a discussion on the Hugging Face Hub to let you know.
3. Librarian Bot will check for new models for a particular dataset once a day.
**NOTE** This app is a proof of concept and is intended to validate how much interest there is for a feature like this.
If you have feedback please add it to this [discussion](https://huggingface.co/spaces/librarian-bots/dataset-to-model-monitor/discussions/2).
### Tips
- You might find the [Hugging Face Datasets Semantic Search](https://huggingface.co/spaces/librarian-bots/huggingface-datasets-semantic-search) Space useful for finding datasets to track.
- You can use a wildcard `*` to track all datasets for a user or organization on the hub. For example `biglam/*` will create alerts for all the datasets under the biglam Hugging Face Organization
- You need to be logged in to your Hugging Face account to use this app. If you don't have a Hugging Face Hub account you can get one <a href="https://huggingface.co/join">here</a>.
"""
# Gradio UI definition: page title, introduction, input box, action buttons,
# login/logout controls, and a markdown output area.
# NOTE(review): the nesting of the layout containers below cannot be determined
# from this flattened listing — restore the indentation from the original file.
with gr.Blocks() as demo:
# Centered page title and tagline.
gr.Markdown(
'<div style="text-align: center;"><h1> 🤖 Librarian Bot Dataset-to-Model'
' Monitor 🤖 </h1><i><p style="font-size: 20px;">✨ Get alerts when a new'
" model is created from a dataset you are interested in! ✨</p></i></div>"
)
with gr.Row():
gr.Markdown(markdown_text)
with gr.Row():
# Single-line input for the dataset ID (or "<user-or-org>/*" glob).
hub_id = gr.Textbox(
"i.e. biglam/brill_iconclass",
label="Hugging Face Hub ID for dataset to track",
max_lines=1,
)
with gr.Column():
track_button = gr.Button("Track new models for dataset")
with gr.Row():
remove_specific_datasets = gr.Button("Stop tracking dataset")
# NOTE(review): no click handler is attached to this button in the code below.
remove_all = gr.Button("⛔️ Unsubscribe from all datasets ⛔️")
with gr.Row(variant="compact"):
gr.LoginButton(size="sm")
gr.LogoutButton(size="sm")
summary_stats_btn = gr.Button(
"Summary stats for datasets being tracked by this app", size="sm"
)
user_stats_btn = gr.Button("List my tracked datasets", size="sm")
with gr.Row():
# All handlers below write their returned text into this component.
output = gr.Markdown()
# Event wiring: each button calls its handler and shows the result in `output`.
track_button.click(user_update, [hub_id], output)
remove_specific_datasets.click(
remove_user_from_tracking_datasets, [hub_id], output
)
summary_stats_btn.click(generate_summary_stats, [], output)
user_stats_btn.click(user_stats, [], output)
# Background job that periodically looks for new models and notifies users.
scheduler = BackgroundScheduler()
if not local:
    # Crontab "0 */12 * * *": at minute 0 of every 12th hour.
    scheduler.add_job(
        notify_about_new_models,
        CronTrigger.from_crontab("0 */12 * * *"),
    )
else:
    # Local runs use a short interval instead of the cron schedule.
    scheduler.add_job(notify_about_new_models, "interval", minutes=5)
scheduler.start()

# Enable the request queue (at most 5 pending requests), then start the app.
demo.queue(max_size=5)
demo.launch()
|