File size: 1,844 Bytes
f624d68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import sqlite3
import huggingface_hub
import sys
import time

from tqdm import tqdm

# Path of the SQLite database file that the functions below open with
# sqlite3.connect(). It is a relative path, so it is resolved against the
# current working directory of the process.
SQLITE3_DB = "data/repos.sqlite3"

def list_repos_from_hub():
    """Yield a prefixed id for every non-private repo listed by huggingface_hub.

    Datasets are enumerated first, then models, then spaces. Each yielded
    string is the repo kind, a slash, and the repo's ``id`` attribute, e.g.
    ``"datasets/<id>"``, ``"models/<id>"`` or ``"spaces/<id>"``.

    Yields:
        One string per listed repo whose ``private`` attribute is falsy.
    """
    for kind in ("datasets", "models", "spaces"):
        # Resolve list_datasets / list_models / list_spaces only when this
        # kind is reached, so each listing call starts lazily in turn.
        lister = getattr(huggingface_hub, "list_" + kind)
        for entry in lister():
            if entry.private:
                continue
            yield kind + "/" + entry.id

def write_repos_to_db():
    """Create the ``repos`` table if needed and insert every repo id from the Hub.

    Each id produced by ``list_repos_from_hub()`` is inserted with
    ``last_updated_datetime`` set to 0 and ``last_enumerated_datetime`` set
    to NULL. ``INSERT OR IGNORE`` leaves rows whose id already exists
    untouched, so re-running the function does not reset their timestamps.

    Progress messages go to stderr. The connection is always closed, even if
    listing or inserting raises part-way through.
    """
    print("Opening database", SQLITE3_DB, file=sys.stderr)
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        print("Creating repos table if not exists", file=sys.stderr)
        cur.execute("CREATE TABLE IF NOT EXISTS repos (id TEXT PRIMARY KEY, last_updated_datetime INTEGER, last_enumerated_datetime INTEGER NULLABLE)")
        con.commit()
        print("Inserting rows from huggingface_hub query", file=sys.stderr)
        for repo in tqdm(list_repos_from_hub()):
            # Bound parameters instead of string formatting: repo ids come
            # from an external service and may contain quote characters.
            cur.execute("INSERT OR IGNORE INTO repos VALUES (?, ?, NULL)", (repo, 0))
            # Committing per row keeps everything inserted so far on disk if
            # the (long-running) enumeration is interrupted.
            con.commit()
    finally:
        con.close()

def list_repos(limit=None):
    """Return repo ids from the ``repos`` table, least recently updated first.

    Args:
        limit: Maximum number of ids to return, or ``None`` (the default) to
            return every row.

    Returns:
        A list of ``id`` values ordered by ascending
        ``last_updated_datetime``.
    """
    query = "SELECT id FROM repos ORDER BY last_updated_datetime ASC"
    params = ()
    if limit is not None:
        # Bind the limit as a parameter rather than formatting it into the
        # SQL text, so the value can never be interpreted as SQL.
        query += " LIMIT ?"
        params = (limit,)

    con = sqlite3.connect(SQLITE3_DB)
    try:
        rows = con.execute(query, params).fetchall()
    finally:
        # Close explicitly so repeated calls do not leak connections.
        con.close()
    return [row[0] for row in rows]

def set_updated_datetime(repo):
    """Stamp ``repo`` with the current time in the ``repos`` table.

    Sets ``last_updated_datetime`` to ``int(time.time())`` (whole seconds
    since the epoch) for the row whose ``id`` equals ``repo``. If no such row
    exists, nothing is changed.

    Args:
        repo: The repo id to update, as stored in the ``id`` column.
    """
    con = sqlite3.connect(SQLITE3_DB)
    try:
        # Bound parameters keep ids containing quotes (or SQL) from breaking
        # or altering the statement.
        con.execute(
            "UPDATE repos SET last_updated_datetime = ? WHERE id = ?",
            (int(time.time()), repo),
        )
        con.commit()
    finally:
        # Close explicitly so repeated calls do not leak connections.
        con.close()

if __name__ == "__main__":
    # Script entry point: refresh the repos table from the Hub, then print a
    # handful of stored ids as a quick sanity check.
    write_repos_to_db()
    print("Done writing to DB. Sample of 5 rows:")
    sample_ids = list_repos(limit=5)
    for sample_id in sample_ids:
        print(sample_id)