# xet-repo-data-collection / list_repos.py
# Author: znation (HF staff)
# Commit 03a8d5e: ssl recursion fix in list_repos
# gevent's monkey-patching must happen at the very beginning of the script,
# before any other import, to avoid infinite recursion in ssl.
# Fix found at https://stackoverflow.com/a/52130355
import gevent.monkey
gevent.monkey.patch_all()
import sqlite3
import huggingface_hub
import sys
import time
from tqdm import tqdm
# Path of the SQLite database file opened by the functions in this module.
# It is a relative path, so it is resolved against the current working directory.
SQLITE3_DB = "data/repos.sqlite3"
def list_repos_from_hub():
    """Lazily yield an identifier for every non-private repo on the Hub.

    Datasets are enumerated first, then models, then spaces. Each yielded
    string is the repo id prefixed with its kind: ``"datasets/<id>"``,
    ``"models/<id>"`` or ``"spaces/<id>"``. Entries whose ``private``
    attribute is truthy are skipped.
    """
    # Each listing is only requested once the previous one is exhausted.
    yield from (
        "datasets/" + dataset.id
        for dataset in huggingface_hub.list_datasets()
        if not dataset.private
    )
    yield from (
        "models/" + model.id
        for model in huggingface_hub.list_models()
        if not model.private
    )
    yield from (
        "spaces/" + space.id
        for space in huggingface_hub.list_spaces()
        if not space.private
    )
def write_repos_to_db():
    """Create the ``repos`` table if needed and insert every repo from the Hub.

    Opens the database at ``SQLITE3_DB``, makes sure the ``repos`` table
    exists, then inserts one row per identifier produced by
    ``list_repos_from_hub()`` with ``last_updated_datetime`` set to 0 and
    ``last_enumerated_datetime`` set to NULL. Rows whose id is already
    present are left untouched (``INSERT OR IGNORE``). Progress messages go
    to stderr.
    """
    print("Opening database", SQLITE3_DB, file=sys.stderr)
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        print("Creating repos table if not exists", file=sys.stderr)
        cur.execute("CREATE TABLE IF NOT EXISTS repos (id TEXT PRIMARY KEY, last_updated_datetime INTEGER, last_enumerated_datetime INTEGER NULLABLE)")
        con.commit()
        print("Inserting rows from huggingface_hub query", file=sys.stderr)
        for repo in tqdm(list_repos_from_hub()):
            # Bind values instead of formatting them into the SQL text: repo
            # ids come from an external service and may contain quotes.
            cur.execute("INSERT OR IGNORE INTO repos VALUES (?, ?, NULL)", (repo, 0))
        # Single commit for the whole batch of inserts.
        con.commit()
    finally:
        # Always release the database handle, even if enumeration fails.
        con.close()
def list_repos(limit=None):
    """Return repo ids from the database, least recently updated first.

    Args:
        limit: Maximum number of ids to return. ``None`` (the default)
            returns every row of the ``repos`` table.

    Returns:
        A list of id strings ordered by ascending ``last_updated_datetime``.
    """
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        if limit is None:
            res = cur.execute("SELECT id FROM repos ORDER BY last_updated_datetime ASC")
        else:
            # Bind the limit as a parameter rather than formatting it into
            # the SQL text, so a non-numeric value cannot alter the query.
            res = cur.execute(
                "SELECT id FROM repos ORDER BY last_updated_datetime ASC LIMIT ?",
                (limit,),
            )
        return [row[0] for row in res.fetchall()]
    finally:
        # Close the connection instead of leaking one per call.
        con.close()
def set_updated_datetime(repo):
    """Stamp a repo row with the current time.

    Sets ``last_updated_datetime`` to the current Unix time (whole seconds)
    for the row of the ``repos`` table whose id equals ``repo``. If no row
    matches, nothing is changed.

    Args:
        repo: The repo id, as stored in the ``id`` column.
    """
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        # Bound parameters keep ids containing quotes from breaking (or
        # injecting into) the statement.
        cur.execute(
            "UPDATE repos SET last_updated_datetime = ? WHERE id = ?",
            (int(time.time()), repo),
        )
        con.commit()
    finally:
        # Close the connection instead of leaking one per call.
        con.close()
if __name__ == "__main__":
    # Script entry point: refresh the repos table from the Hub, then print
    # a small sample of the stored ids as a sanity check.
    write_repos_to_db()
    print("Done writing to DB. Sample of 5 rows:")
    sample_ids = list_repos(limit=5)
    for repo_id in sample_ids:
        print(repo_id)