Spaces:
Running
Running
run garbage collector to delete empty models periodically
Browse files- app.py +20 -1
- clean_community_org.py +36 -0
app.py
CHANGED
@@ -3,6 +3,8 @@ import pathlib
|
|
3 |
import random
|
4 |
import string
|
5 |
import tempfile
|
|
|
|
|
6 |
from typing import Iterable, List
|
7 |
|
8 |
import gradio as gr
|
@@ -12,6 +14,8 @@ import yaml
|
|
12 |
from gradio_logsview.logsview import Log, LogsView, LogsViewRunner
|
13 |
from mergekit.config import MergeConfiguration
|
14 |
|
|
|
|
|
15 |
has_gpu = torch.cuda.is_available()
|
16 |
|
17 |
# Running directly from Python doesn't work well with Gradio+run_process because of:
|
@@ -164,7 +168,7 @@ def merge(yaml_config: str, hf_token: str, repo_name: str) -> Iterable[List[Log]
|
|
164 |
return
|
165 |
|
166 |
# Set tmp HF_HOME to avoid filling up disk Space
|
167 |
-
tmp_env = os.environ.copy()
|
168 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
169 |
yield from runner.run_command(cli.split(), cwd=merged_path, env=tmp_env)
|
170 |
|
@@ -215,4 +219,19 @@ with gr.Blocks() as demo:
|
|
215 |
|
216 |
button.click(fn=merge, inputs=[config, token, repo_name], outputs=[logs])
|
217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
218 |
demo.queue(default_concurrency_limit=1).launch()
|
|
|
3 |
import random
|
4 |
import string
|
5 |
import tempfile
|
6 |
+
import time
|
7 |
+
from concurrent.futures import ThreadPoolExecutor
|
8 |
from typing import Iterable, List
|
9 |
|
10 |
import gradio as gr
|
|
|
14 |
from gradio_logsview.logsview import Log, LogsView, LogsViewRunner
|
15 |
from mergekit.config import MergeConfiguration
|
16 |
|
17 |
+
from clean_community_org import garbage_collect_empty_models
|
18 |
+
|
19 |
has_gpu = torch.cuda.is_available()
|
20 |
|
21 |
# Running directly from Python doesn't work well with Gradio+run_process because of:
|
|
|
168 |
return
|
169 |
|
170 |
# Set tmp HF_HOME to avoid filling up disk Space
|
171 |
+
tmp_env = os.environ.copy() # taken from https://stackoverflow.com/a/4453495
|
172 |
tmp_env["HF_HOME"] = f"{tmpdirname}/.cache"
|
173 |
yield from runner.run_command(cli.split(), cwd=merged_path, env=tmp_env)
|
174 |
|
|
|
219 |
|
220 |
button.click(fn=merge, inputs=[config, token, repo_name], outputs=[logs])
|
221 |
|
222 |
+
|
223 |
+
# Run garbage collection every hour to keep the community org clean.
|
224 |
+
# Empty models might exist if the merge fails abruptly (e.g. if the user leaves the Space).
|
225 |
+
def _garbage_collect_every_hour(interval_seconds: float = 3600) -> None:
    """Run the community-org garbage collection in an endless loop.

    Each pass calls ``garbage_collect_empty_models`` with
    ``COMMUNITY_HF_TOKEN`` and then sleeps for ``interval_seconds``.
    This function never returns, so it is meant to run in a background
    thread.

    Args:
        interval_seconds: Pause between two passes, in seconds. Defaults to
            3600 (one hour).
    """
    while True:
        try:
            garbage_collect_empty_models(token=COMMUNITY_HF_TOKEN)
        except Exception as e:
            # Best effort: a failed pass must not end the loop. Report the
            # error and try again after the usual pause.
            print("Error running garbage collection", e)
        time.sleep(interval_seconds)
|
232 |
+
|
233 |
+
|
234 |
+
# Only one long-running task is ever submitted, so a single, explicitly named
# worker thread is enough.
# NOTE(review): the submitted function never returns, so this worker stays
# busy for the whole lifetime of the process.
pool = ThreadPoolExecutor(max_workers=1, thread_name_prefix="garbage-collector")
pool.submit(_garbage_collect_every_hour)

demo.queue(default_concurrency_limit=1).launch()
|
clean_community_org.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Script to delete empty models from the community org.
|
2 |
+
# Can be run manually or scheduled to run periodically in the Space.
|
3 |
+
# Usage: python clean_community_org.py
|
4 |
+
#
|
5 |
+
# 1. List models from https://huggingface.co/mergekit-community
|
6 |
+
# 2. Keep only the empty models (models that contain files are skipped).
|
7 |
+
# 3. Filter out models that are newer than 1 hour.
|
8 |
+
# 4. Delete the remaining models.
|
9 |
+
from datetime import datetime, timezone
|
10 |
+
|
11 |
+
from huggingface_hub import HfApi
|
12 |
+
|
13 |
+
|
14 |
+
def garbage_collect_empty_models(token: str | None = None) -> None:
    """Delete empty, stale model repos owned by ``mergekit-community``.

    Lists the models of the ``mergekit-community`` author and deletes each one
    that has at most one sibling file and was last modified at least one hour
    ago. Models with more than one file, recently modified models, and models
    whose modification date is unknown are left untouched. A failure to delete
    one repo is printed and does not stop the pass.

    Args:
        token: Token passed to ``HfApi``. ``None`` is forwarded as is.
    """
    api = HfApi(token=token)
    now = datetime.now(timezone.utc)
    print("Running garbage collection on mergekit-community.")
    for model in api.list_models(author="mergekit-community", full=True):
        if model.siblings and len(model.siblings) > 1:
            # If model has files, then it's not empty
            print("Skipping", model.modelId, "(not empty)")
            continue
        if model.last_modified is None:
            # Without a modification date the age check below cannot run;
            # keep the repo rather than crash the pass or delete blindly.
            print("Skipping", model.modelId, "(unknown last modification date)")
            continue
        if (now - model.last_modified).total_seconds() < 3600:
            # If model was updated in the last hour, then keep it
            # to avoid deleting models that are being uploaded
            print("Skipping", model.modelId, "(recently updated)")
            continue
        try:
            print(f"Deleting {model.modelId}")
            api.delete_repo(model.modelId, missing_ok=True)
        except Exception as e:
            # Best effort: report the failure and move on to the next model.
            print(f"Error deleting {model.modelId}: {e}")


# Allow running the clean-up manually: python clean_community_org.py
if __name__ == "__main__":
    garbage_collect_empty_models()
|