znation HF staff commited on
Commit
f624d68
·
1 Parent(s): c90151d

let's try this

Browse files
Files changed (6) hide show
  1. app.py +12 -0
  2. list_files.py +126 -0
  3. list_reconstructions.py +161 -0
  4. list_repos.py +52 -0
  5. list_xorbs.py +59 -0
  6. refresh_lists.py +27 -0
app.py CHANGED
@@ -1,7 +1,19 @@
1
  import gradio as gr
2
 
 
 
 
3
  def greet(name):
 
 
4
  return "Hello " + name + "!!"
5
 
 
 
 
 
 
 
 
6
  demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
  demo.launch()
 
1
  import gradio as gr
2
 
3
+ import list_repos
4
+ import refresh_lists
5
+
6
def greet(name):
    # NOTE(review): greet() first triggers a full data refresh.
    # refresh_files() below contains a `while True` loop, so this call
    # never returns and the greeting is unreachable — confirm intent.
    refresh_repos()
    refresh_files()
    return "Hello " + name + "!!"
10
 
11
def refresh_repos(progress=gr.Progress(track_tqdm=True)):
    # Full re-enumeration of public Hub repos into the repos DB.
    # `progress` is not used directly; track_tqdm=True presumably lets
    # Gradio mirror tqdm progress bars from the underlying call — confirm.
    list_repos.write_repos_to_db()
13
+
14
def refresh_files(progress=gr.Progress(track_tqdm=True)):
    """Continuously refresh file/reconstruction data, stalest repo first.

    NOTE(review): this loops forever and never returns, so any caller
    (here: greet()) blocks indefinitely. Confirm it is meant to run as a
    background worker rather than inside a request handler.
    """
    while True:
        refresh_lists.refresh_oldest_repo()
17
+
18
# Wire up the Gradio UI: a single text box whose submit calls greet().
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()
list_files.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # found on https://stackoverflow.com/a/52130355 to fix infinite recursion with ssl
3
+ # at the beginning of the script
4
+ import gevent.monkey
5
+ gevent.monkey.patch_all()
6
+
7
+ import json
8
+ from datetime import date, datetime
9
+ import sys
10
+ import time
11
+
12
+ import huggingface_hub
13
+ import sqlite3
14
+ from tqdm import tqdm
15
+
16
+ fs = huggingface_hub.HfFileSystem()
17
+
18
+ import list_repos
19
+
20
+ SQLITE3_DB = "data/files.sqlite3"
21
+
22
+
23
def json_serial(obj):
    """`json.dumps(default=...)` hook: encode date/datetime as ISO-8601."""
    if not isinstance(obj, (datetime, date)):
        raise TypeError("Type %s not serializable" % type(obj))
    return obj.isoformat()
27
+
28
+
29
def list_files_from_hub(repo, replace_model_in_url=True):
    """Recursively yield file entries for *repo* from the HfFileSystem.

    A leading "models/" is stripped on the top-level call only, since
    fs.ls treats a bare repo id as a model repo (it builds the URL as
    /api/models/<repo>). We recurse with one ls per directory rather
    than passing the undocumented recursive=True, which issues a single
    heavy request that can time out server-side and return a 500.
    """
    if replace_model_in_url and repo.startswith("models/"):
        repo = repo.replace("models/", "", 1)

    for entry in fs.ls(repo):
        if entry["type"] != "directory":
            yield entry
        else:
            yield from list_files_from_hub(entry["name"], replace_model_in_url=False)
47
+
48
+
49
def write_files_to_db(repo):
    """Replace the cached `files` rows for *repo* with a fresh Hub listing.

    Creates the table on first use, deletes the repo's previous rows,
    then inserts one row per entry yielded by list_files_from_hub().
    """
    print("Opening database", SQLITE3_DB, file=sys.stderr)
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        print("Creating files table if not exists", file=sys.stderr)
        cur.execute(
            "CREATE TABLE IF NOT EXISTS files (name TEXT PRIMARY KEY, last_updated_datetime INTEGER, repo TEXT, size INTEGER, type TEXT, blob_id TEXT, is_lfs INTEGER, lfs_size INTEGER, lfs_sha256 TEXT, lfs_pointer_size INTEGER, last_commit_oid TEXT, last_commit_title TEXT, last_commit_date TEXT)"
        )
        con.commit()
        print("Deleting existing rows for repo {}".format(repo), file=sys.stderr)
        # Parameterized queries: repo ids, file names and commit titles may
        # contain quotes; the previous string-built SQL broke on them and
        # was an injection hazard.
        cur.execute("DELETE FROM files WHERE repo = ?", (repo,))
        con.commit()
        print("Inserting new rows from HFFileSystem query for repo {}".format(repo), file=sys.stderr)
        for file in tqdm(list_files_from_hub(repo)):
            is_lfs = file["lfs"] is not None
            cur.execute(
                "INSERT INTO files VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                (
                    file["name"],
                    int(time.time()),
                    repo,
                    file["size"],
                    file["type"],
                    file["blob_id"],
                    1 if is_lfs else 0,
                    # Real SQL NULLs for non-LFS files; the old code stored
                    # the literal string 'NULL' in these columns.
                    file["lfs"]["size"] if is_lfs else None,
                    file["lfs"]["sha256"] if is_lfs else None,
                    file["lfs"]["pointer_size"] if is_lfs else None,
                    file["last_commit"]["oid"],
                    file["last_commit"]["title"],
                    # Stored as its string form, matching the previous
                    # TEXT-column behavior of '{}'.format(...).
                    str(file["last_commit"]["date"]),
                ),
            )
        con.commit()
    finally:
        con.close()
81
+
82
+
83
def is_lfs(file):
    """True when the file entry carries LFS metadata (non-None "lfs" key)."""
    lfs_info = file["lfs"]
    return lfs_info is not None
85
+
86
+
87
def list_lfs_files(repo):
    """Yield only the LFS-tracked entries from the cached file list for *repo*.

    Uses list_files() (the sqlite cache), not a live Hub query.
    """
    # Iterate directly instead of binding to a local named `list`,
    # which shadowed the builtin.
    for file in list_files(repo):
        if is_lfs(file):
            yield file
92
+
93
+
94
def list_files(repo, limit=None):
    """Return cached file entries for *repo*, shaped like the HfFileSystem
    listing (name / size / type / blob_id / lfs / last_commit).

    Args:
        repo: repo key, e.g. "datasets/org/name".
        limit: optional maximum number of rows to return.
    """
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        # Parameterized to avoid SQL injection via repo names.
        if limit is None:
            res = cur.execute("SELECT * FROM files WHERE repo = ?", (repo,))
        else:
            res = cur.execute(
                "SELECT * FROM files WHERE repo = ? LIMIT ?", (repo, limit)
            )
        # Column order matches CREATE TABLE in write_files_to_db:
        # 0 name, 1 last_updated_datetime, 2 repo, 3 size, 4 type,
        # 5 blob_id, 6 is_lfs, 7 lfs_size, 8 lfs_sha256,
        # 9 lfs_pointer_size, 10 oid, 11 title, 12 date.
        # The previous mapping skipped the `repo` column (index 2) and read
        # every later field off by one — e.g. "size" was the repo string.
        return [
            {
                "name": row[0],
                "last_updated_datetime": row[1],
                "size": row[3],
                "type": row[4],
                "blob_id": row[5],
                "lfs": (
                    {"size": row[7], "sha256": row[8], "pointer_size": row[9]}
                    if row[6]
                    else None
                ),
                "last_commit": {"oid": row[10], "title": row[11], "date": row[12]},
            }
            for row in res.fetchall()
        ]
    finally:
        con.close()
118
+
119
if __name__ == "__main__":
    # Full refresh of every known repo, then print a small sample
    # (3 repos x 3 files) for manual inspection.
    for repo in list_repos.list_repos():
        write_files_to_db(repo)
    print("Done writing to DB. Sample of 9 rows:")
    for repo in list_repos.list_repos(limit=3):
        for file in list_files(repo, limit=3):
            print(json.dumps(file, default=json_serial))
126
+
list_reconstructions.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import sys
4
+ import time
5
+
6
+ import grequests
7
+ import sqlite3
8
+ from tqdm import tqdm
9
+
10
+ import list_files
11
+ import list_repos
12
+
13
+ SQLITE3_DB = "data/reconstructions.sqlite3"
14
+
15
+ HF_ENDPOINT = os.getenv("HF_ENDPOINT", "https://huggingface.co")
16
+ XET_CAS_ENDPOINT = os.getenv("XET_CAS_ENDPOINT", "https://cas-server.xethub.hf.co")
17
+
18
+ RESOLVE_URL_TEMPLATE = HF_ENDPOINT + "/{}/resolve/main"
19
+
20
+
21
def exception_handler(req, exc):
    """grequests hook: report a failed request's exception and carry on."""
    print(str(exc), file=sys.stderr)
23
+
24
+
25
def list_reconstructions_from_hub(repo):
    """Fetch xorb reconstruction terms for every cached LFS file of *repo*.

    Two concurrent fan-out phases via grequests:
      1. HEAD <hub>/<repo>/resolve/main/<path> to read the xet hash and
         access token from the response headers;
      2. GET <cas>/reconstruction/<hash> to fetch the term list.

    Returns a list of dicts with start/end/file_path/xorb_id/unpacked_length.
    """
    print(
        "Listing reconstructions using:\nHF Hub Endpoint: {}\nXet CAS Endpoint: {}".format(
            HF_ENDPOINT, XET_CAS_ENDPOINT
        ),
        file=sys.stderr,
    )

    ret = []
    files = []
    resolve_reqs = []
    reconstruct_reqs = []
    err_count = 0

    print("Listing files for repo {}".format(repo), file=sys.stderr)
    total = 0
    for i, file in tqdm(enumerate(list_files.list_lfs_files(repo))):
        total += 1
        files.append(file["name"])
        # NOTE(review): `repo` is rewritten inside the loop (strip "models/"
        # so the resolve URL matches the Hub's model-URL scheme); after the
        # first iteration the replace below is a no-op.
        if repo.startswith("models/"):
            repo = repo.replace("models/", "", 1)
        # Turn the cached file name into a /resolve/main URL.
        url = file["name"].replace(repo, RESOLVE_URL_TEMPLATE.format(repo), 1)
        headers = {"Authorization": "Bearer {}".format(os.getenv("HF_TOKEN"))}
        resolve_reqs.append(
            grequests.head(url, headers=headers, allow_redirects=False)
        )

    print("", file=sys.stderr)
    print("Calling /resolve/ for repo {}".format(repo), file=sys.stderr)
    for i, resp in tqdm(
        grequests.imap_enumerated(
            resolve_reqs, size=4, exception_handler=exception_handler
        ),
        total=total,
    ):
        if resp is None:
            err_count += 1
            continue
        # todo: use refresh_route when access_token is expired
        refresh_route = resp.headers.get("x-xet-refresh-route")
        xet_hash = resp.headers.get("x-xet-hash")
        access_token = resp.headers.get("x-xet-access-token")
        # Only xet-backed files expose a hash; others are skipped entirely.
        if xet_hash is not None and xet_hash != "":
            url = "{}/reconstruction/{}".format(XET_CAS_ENDPOINT, xet_hash)
            headers = {"Authorization": "Bearer {}".format(access_token)}
            reconstruct_reqs.append(grequests.get(url, headers=headers))

    print("", file=sys.stderr)
    print(
        "Calling /reconstruct/ with grequests for repo {}".format(repo),
        file=sys.stderr,
    )

    for i, resp in tqdm(
        grequests.imap_enumerated(
            reconstruct_reqs, size=4, exception_handler=exception_handler
        ),
        total=total,
    ):
        if resp is None:
            continue
        if resp.status_code != 200:
            continue
        body = resp.json()
        for term in body["terms"]:
            entry = {
                "start": term["range"]["start"],
                "end": term["range"]["end"],
                # NOTE(review): maps reconstruct-request index i back to
                # files[] by offsetting with err_count. This is only correct
                # if every failed resolve precedes the successes and no
                # non-xet file was skipped in between — verify; otherwise
                # terms can be attributed to the wrong file.
                "file_path": files[i + err_count],
                "xorb_id": term["hash"],
                "unpacked_length": term["unpacked_length"]
            }
            ret.append(entry)

    return ret
100
+
101
+
102
def list_reconstructions(repos, limit=None):
    """Return cached reconstruction terms for each repo in *repos*.

    Each entry mirrors a row of the `reconstructions` table minus the
    autoincrement id. *limit* caps the rows fetched per repo.
    """
    ret = []
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        for repo in repos:
            # Parameterized queries: repo ids may contain quotes, and the
            # previous string-built SQL was an injection hazard.
            if limit is None:
                res = cur.execute(
                    "SELECT * FROM reconstructions WHERE repo = ?", (repo,)
                )
            else:
                res = cur.execute(
                    "SELECT * FROM reconstructions WHERE repo = ? LIMIT ?",
                    (repo, limit),
                )
            for row in res.fetchall():
                # Column order: 0 id, 1 xorb_id, 2 last_updated_datetime,
                # 3 repo, 4 file_path, 5 unpacked_length, 6 start, 7 end.
                ret.append(
                    {
                        "xorb_id": row[1],
                        "last_updated_timestamp": row[2],
                        "repo": row[3],
                        "file_path": row[4],
                        "unpacked_length": row[5],
                        "start": row[6],
                        "end": row[7],
                    }
                )
        return ret
    finally:
        con.close()
123
+
124
+
125
def write_files_to_db(repo):
    """Replace cached `reconstructions` rows for *repo* with fresh Hub data."""
    print("Opening database", SQLITE3_DB, file=sys.stderr)
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        print("Creating reconstructions table if not exists", file=sys.stderr)
        cur.execute(
            "CREATE TABLE IF NOT EXISTS reconstructions (id INTEGER PRIMARY KEY AUTOINCREMENT, xorb_id TEXT, last_updated_datetime INTEGER, repo TEXT, file_path TEXT, unpacked_length INTEGER, start INTEGER, end INTEGER)"
        )
        con.commit()
        print("Deleting existing rows for repo {}".format(repo), file=sys.stderr)
        # Parameterized: repo ids and file paths may contain quotes; the
        # previous string-built SQL broke on them and was an injection risk.
        cur.execute("DELETE FROM reconstructions WHERE repo = ?", (repo,))
        con.commit()
        print("Inserting rows from HFFileSystem query", file=sys.stderr)
        for reconstruction in list_reconstructions_from_hub(repo):
            # NULL lets the id column autoincrement.
            cur.execute(
                "INSERT INTO reconstructions VALUES (NULL, ?, ?, ?, ?, ?, ?, ?)",
                (
                    reconstruction["xorb_id"],
                    int(time.time()),
                    repo,
                    reconstruction["file_path"],
                    reconstruction["unpacked_length"],
                    reconstruction["start"],
                    reconstruction["end"],
                ),
            )
        con.commit()
    finally:
        con.close()
150
+
151
+
152
if __name__ == "__main__":
    # Refresh every repo's reconstruction cache, then dump a small sample.
    for repo in list_repos.list_repos():
        write_files_to_db(repo)
    print("Done writing to DB. Sample of 5 rows:")
    sample = list_reconstructions(list_repos.list_repos(), limit=5)
    json.dump(sample, sys.stdout, sort_keys=True, indent=4)
list_repos.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import huggingface_hub
3
+ import sys
4
+ import time
5
+
6
+ from tqdm import tqdm
7
+
8
+ SQLITE3_DB = "data/repos.sqlite3"
9
+
10
def list_repos_from_hub():
    """Yield "<kind>/<id>" keys for every public dataset, model and space.

    Enumeration order is datasets, then models, then spaces, matching the
    prefix each kind gets in the key.
    """
    listers = (
        ("datasets/", huggingface_hub.list_datasets),
        ("models/", huggingface_hub.list_models),
        ("spaces/", huggingface_hub.list_spaces),
    )
    for prefix, lister in listers:
        for repo in lister():
            if not repo.private:
                yield prefix + repo.id
20
+
21
def write_repos_to_db():
    """(Re)populate the repos table with every public repo on the Hub.

    Existing rows are preserved (INSERT OR IGNORE) so previously recorded
    refresh timestamps survive re-enumeration.
    """
    print("Opening database", SQLITE3_DB, file=sys.stderr)
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        print("Creating repos table if not exists", file=sys.stderr)
        cur.execute("CREATE TABLE IF NOT EXISTS repos (id TEXT PRIMARY KEY, last_updated_datetime INTEGER, last_enumerated_datetime INTEGER NULLABLE)")
        con.commit()
        print("Inserting rows from huggingface_hub query", file=sys.stderr)
        for repo in tqdm(list_repos_from_hub()):
            # Parameterized (repo ids may contain quotes); bind the
            # never-refreshed timestamp as integer 0, not the string '0'.
            cur.execute(
                "INSERT OR IGNORE INTO repos VALUES (?, ?, NULL)", (repo, 0)
            )
        con.commit()
    finally:
        con.close()
32
+
33
def list_repos(limit=None):
    """Return repo ids ordered least-recently-refreshed first.

    Args:
        limit: optional maximum number of ids to return.
    """
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        if limit is None:
            res = cur.execute("SELECT id FROM repos ORDER BY last_updated_datetime ASC")
        else:
            # Parameterized LIMIT instead of string formatting.
            res = cur.execute(
                "SELECT id FROM repos ORDER BY last_updated_datetime ASC LIMIT ?",
                (limit,),
            )
        return [row[0] for row in res.fetchall()]
    finally:
        con.close()
41
+
42
def set_updated_datetime(repo):
    """Stamp *repo* with the current time so it sorts last for refresh."""
    con = sqlite3.connect(SQLITE3_DB)
    try:
        cur = con.cursor()
        # Parameterized: repo ids may contain quotes (SQL injection risk
        # with the previous string-built statement).
        cur.execute(
            "UPDATE repos SET last_updated_datetime = ? WHERE id = ?",
            (int(time.time()), repo),
        )
        con.commit()
    finally:
        con.close()
47
+
48
if __name__ == "__main__":
    # Refresh the repo list, then show the five stalest entries.
    write_repos_to_db()
    print("Done writing to DB. Sample of 5 rows:")
    for repo in list_repos(limit=5):
        print(repo)
list_xorbs.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### A simpler rendition of reconstructions.
2
+ ### Lists only xorbs and a "dedupe factor" for the xorb
3
+ ### Where dedupe-factor 1 == no dedupe,
4
+ ### 2 == 1 chunk shared,
5
+ ### 3 == 2 chunks shared,
6
+ ### etc.
7
+
8
+ import json
9
+ import sys
10
+
11
+ import list_reconstructions
12
+ import list_repos
13
+
14
def list_xorbs(repos):
    """Summarize cached reconstruction terms into per-xorb dedupe stats.

    Returns a list of {"xorb_id", "dedupe_factor", "repo"} dicts, one per
    (xorb, referencing repo) pair found in the reconstructions for *repos*.
    """
    # first build up a mapping of {xorb_id: [(start, end), (start, end), ...]}
    xorbs = {}
    reconstructions = list_reconstructions.list_reconstructions(repos)
    for term in reconstructions:
        if not(term["xorb_id"] in xorbs):
            xorbs[term["xorb_id"]] = []
        path_parts = term["file_path"].split("/")
        if path_parts[0] != "datasets" and \
            path_parts[0] != "spaces":
            # models omit the "models" part from file path
            path_parts.insert(0, "models")
        # Repo key is the first three path components, e.g. "models/org/name".
        repo = "/".join(path_parts[:3])
        xorbs[term["xorb_id"]].append((term["start"], term["end"], repo))

    # then walk the lists and compute dedupe factor
    output = []
    for xorb_id,chunks in xorbs.items():
        # Overall [min, max) index range touched by any term of this xorb.
        min_chunk_idx = float("inf")
        max_chunk_idx = float("-inf")
        xorb_repos = set()
        dedupe_factor = 0
        for chunk in chunks:
            min_chunk_idx = min(min_chunk_idx, chunk[0])
            max_chunk_idx = max(max_chunk_idx, chunk[1])
            xorb_repos.add(chunk[2])
        xorb_repos = list(xorb_repos)
        # Sum, over each index in [min, max), how many terms reference it.
        # NOTE(review): assumes start/end are integer indices (range()
        # requires ints) — confirm against the reconstruction API.
        for i in range(min_chunk_idx, max_chunk_idx):
            ref_count = 0
            for chunk in chunks:
                if i >= chunk[0] and i < chunk[1]:
                    ref_count += 1
            dedupe_factor += ref_count
        # NOTE(review): divides by max_chunk_idx, not (max - min); this
        # matches the file header's "dedupe factor" description only when
        # ranges start at 0 — verify.
        if max_chunk_idx != 0:
            dedupe_factor /= float(max_chunk_idx)
        # Every repo that references the xorb gets the same factor.
        for repo in xorb_repos:
            output.append({
                "xorb_id": xorb_id,
                "dedupe_factor": dedupe_factor,
                "repo": repo
            })
    return output
56
+
57
+
58
if __name__ == "__main__":
    # Dump dedupe stats for every known repo as pretty-printed JSON.
    json.dump(list_xorbs(list_repos.list_repos()), sys.stdout, sort_keys=True, indent=4)
refresh_lists.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Refreshes the `files` and `reconstructions` tables for a single repo.
3
+ """
4
+
5
+ import sys
6
+
7
+ import list_files
8
+ import list_reconstructions
9
+ import list_repos
10
+
11
def refresh_oldest_repo():
    """Refresh the single repo whose cached data is most stale.

    Raises:
        RuntimeError: if the repos table is empty (nothing to refresh).
    """
    oldest_repo = list_repos.list_repos(limit=1)
    # Explicit check instead of `assert`, which is stripped under -O.
    if len(oldest_repo) != 1:
        raise RuntimeError(
            "repos table is empty; run list_repos.write_repos_to_db() first"
        )
    refresh_repo(oldest_repo[0])
15
+
16
def refresh_repo(repo):
    """Re-list files and reconstructions for *repo*, then stamp it fresh.

    Order matters: reconstructions are derived from the `files` table, and
    the timestamp is only advanced after both succeed.
    """
    print("Refreshing repo", repo, file=sys.stderr)
    print("Listing files", file=sys.stderr)
    list_files.write_files_to_db(repo)
    print("Listing reconstructions", file=sys.stderr)
    list_reconstructions.write_files_to_db(repo)
    print("Updating timestamp", file=sys.stderr)
    list_repos.set_updated_datetime(repo)
    print("Done", file=sys.stderr)
25
+
26
if __name__ == "__main__":
    # One-shot refresh of the single stalest repo.
    refresh_oldest_repo()