"""Maintain a local SQLite index of public Hugging Face Hub repositories.

Running this module as a script enumerates the public datasets, models and
spaces on the Hub, records their ids in the ``repos`` table of
``SQLITE3_DB``, and prints a sample of five stored ids.
"""

import contextlib
import sqlite3
import sys
import time
from typing import Iterator, List, Optional

import huggingface_hub
from tqdm import tqdm

SQLITE3_DB = "data/repos.sqlite3"


def list_repos_from_hub() -> Iterator[str]:
    """Yield the prefixed id of every non-private repo listed by the Hub.

    Datasets come first, then models, then spaces. Each id is prefixed with
    its kind (``"datasets/"``, ``"models/"`` or ``"spaces/"``). The listing
    for a kind is only requested once iteration reaches it.
    """
    listers = (
        ("datasets/", huggingface_hub.list_datasets),
        ("models/", huggingface_hub.list_models),
        ("spaces/", huggingface_hub.list_spaces),
    )
    for prefix, lister in listers:
        for repo in lister():
            if not repo.private:
                yield prefix + repo.id


def write_repos_to_db() -> None:
    """Create the ``repos`` table if needed and insert every Hub repo id.

    Ids already present are left untouched (``INSERT OR IGNORE``); new rows
    get ``last_updated_datetime = 0`` and a NULL ``last_enumerated_datetime``.
    Progress messages are written to stderr.
    """
    print("Opening database", SQLITE3_DB, file=sys.stderr)
    # closing() guarantees the connection is released even if the Hub
    # enumeration or an SQL statement raises.
    with contextlib.closing(sqlite3.connect(SQLITE3_DB)) as con:
        cur = con.cursor()
        print("Creating repos table if not exists", file=sys.stderr)
        # NOTE(review): "NULLABLE" is not a constraint keyword; SQLite appears
        # to read it as part of the column's type name. The schema text is
        # kept unchanged so it matches databases that already exist.
        cur.execute(
            "CREATE TABLE IF NOT EXISTS repos ("
            "id TEXT PRIMARY KEY, "
            "last_updated_datetime INTEGER, "
            "last_enumerated_datetime INTEGER NULLABLE)"
        )
        con.commit()
        print("Inserting rows from huggingface_hub query", file=sys.stderr)
        try:
            for repo in tqdm(list_repos_from_hub()):
                # Bound parameters: repo ids are external data and must not
                # be spliced into the SQL text.
                cur.execute(
                    "INSERT OR IGNORE INTO repos VALUES (?, ?, NULL)",
                    (repo, 0),
                )
        finally:
            # Keep whatever was inserted even if enumeration stops midway.
            con.commit()


def list_repos(limit: Optional[int] = None) -> List[str]:
    """Return stored repo ids ordered by ascending ``last_updated_datetime``.

    Args:
        limit: Maximum number of ids to return; ``None`` returns all of them.

    Returns:
        The repo ids, least recently updated first.
    """
    query = "SELECT id FROM repos ORDER BY last_updated_datetime ASC"
    params: tuple = ()
    if limit is not None:
        query += " LIMIT ?"
        params = (limit,)
    with contextlib.closing(sqlite3.connect(SQLITE3_DB)) as con:
        rows = con.execute(query, params).fetchall()
    return [row[0] for row in rows]


def set_updated_datetime(repo: str) -> None:
    """Set ``last_updated_datetime`` of *repo* to the current Unix time.

    Args:
        repo: Prefixed repo id as stored in the ``id`` column.
    """
    with contextlib.closing(sqlite3.connect(SQLITE3_DB)) as con:
        con.execute(
            "UPDATE repos SET last_updated_datetime = ? WHERE id = ?",
            (int(time.time()), repo),
        )
        con.commit()


if __name__ == "__main__":
    write_repos_to_db()
    print("Done writing to DB. Sample of 5 rows:")
    for repo in list_repos(limit=5):
        print(repo)