|
|
|
|
|
import gevent.monkey |
|
gevent.monkey.patch_all() |
|
|
|
import sqlite3 |
|
import huggingface_hub |
|
import sys |
|
import time |
|
|
|
from tqdm import tqdm |
|
|
|
SQLITE3_DB = "data/repos.sqlite3" |
|
|
|
def list_repos_from_hub():
    """Yield a prefixed id for every non-private dataset, model and space.

    Entries come from ``huggingface_hub.list_datasets``, ``list_models`` and
    ``list_spaces``, in that order.  Each yielded string is the entry's ``id``
    prefixed with ``"datasets/"``, ``"models/"`` or ``"spaces/"``.
    """
    listings = (
        ("datasets/", huggingface_hub.list_datasets),
        ("models/", huggingface_hub.list_models),
        ("spaces/", huggingface_hub.list_spaces),
    )
    for prefix, list_entries in listings:
        for entry in list_entries():
            # Skip anything flagged private; only public ids are emitted.
            if entry.private:
                continue
            yield prefix + entry.id
|
|
|
def write_repos_to_db(db_path=None):
    """Create the ``repos`` table if needed and insert every hub repo id.

    Each id produced by ``list_repos_from_hub()`` is inserted with
    ``last_updated_datetime`` set to 0 and ``last_enumerated_datetime`` set to
    NULL.  Ids already in the table are left untouched (``INSERT OR IGNORE``).
    Progress messages are written to stderr.

    Args:
        db_path: Path of the SQLite database file.  Defaults to the
            module-level ``SQLITE3_DB`` when omitted.
    """
    if db_path is None:
        db_path = SQLITE3_DB

    print("Opening database", db_path, file=sys.stderr)
    con = sqlite3.connect(db_path)
    try:
        cur = con.cursor()

        print("Creating repos table if not exists", file=sys.stderr)
        # Schema string kept as-is so existing databases stay compatible.
        cur.execute("CREATE TABLE IF NOT EXISTS repos (id TEXT PRIMARY KEY, last_updated_datetime INTEGER, last_enumerated_datetime INTEGER NULLABLE)")
        con.commit()

        print("Inserting rows from huggingface_hub query", file=sys.stderr)
        for repo in tqdm(list_repos_from_hub()):
            # Bound parameters: repo ids come from the hub and must never be
            # spliced into the SQL text (a quote in an id would break or
            # inject into the statement).
            cur.execute("INSERT OR IGNORE INTO repos VALUES (?, ?, NULL)", (repo, 0))
        # Single commit after the loop, as before: all inserts land together.
        con.commit()
    finally:
        # Always release the file handle, even if the hub listing fails midway.
        con.close()
|
|
|
def list_repos(limit=None, db_path=None):
    """Return repo ids ordered by ``last_updated_datetime`` ascending.

    Args:
        limit: Maximum number of ids to return; ``None`` returns every row.
        db_path: Path of the SQLite database file.  Defaults to the
            module-level ``SQLITE3_DB`` when omitted.

    Returns:
        A list of ``id`` strings from the ``repos`` table.
    """
    query = "SELECT id FROM repos ORDER BY last_updated_datetime ASC"
    params = ()
    if limit is not None:
        # Bind the limit instead of formatting it into the SQL text.
        query += " LIMIT ?"
        params = (limit,)

    con = sqlite3.connect(SQLITE3_DB if db_path is None else db_path)
    try:
        rows = con.execute(query, params).fetchall()
    finally:
        # The original leaked the connection; close it once rows are fetched.
        con.close()
    return [row[0] for row in rows]
|
|
|
def set_updated_datetime(repo, db_path=None):
    """Stamp ``repo`` with the current Unix time as its last update.

    Sets ``last_updated_datetime`` to ``int(time.time())`` for the row whose
    ``id`` equals ``repo``.  If no such row exists, nothing is changed.

    Args:
        repo: Repo id exactly as stored in the ``repos`` table.
        db_path: Path of the SQLite database file.  Defaults to the
            module-level ``SQLITE3_DB`` when omitted.
    """
    con = sqlite3.connect(SQLITE3_DB if db_path is None else db_path)
    try:
        # Bound parameters keep ids containing quotes from breaking (or
        # injecting into) the statement.
        con.execute(
            "UPDATE repos SET last_updated_datetime = ? WHERE id = ?",
            (int(time.time()), repo),
        )
        con.commit()
    finally:
        # The original leaked the connection; always close it.
        con.close()
|
|
|
if __name__ == "__main__":
    # Populate the table first, then print a small sample of ids as a check.
    write_repos_to_db()
    print("Done writing to DB. Sample of 5 rows:")
    sample_ids = list_repos(limit=5)
    for repo_id in sample_ids:
        print(repo_id)