import json
import os
import re
from collections import defaultdict
from datetime import datetime, timedelta, timezone

import huggingface_hub
from huggingface_hub import ModelCard
from huggingface_hub.hf_api import ModelInfo
from transformers import AutoConfig
from transformers.models.auto.tokenization_auto import AutoTokenizer


def already_submitted_models(
    requested_models_dir: str,
) -> tuple[set[str], defaultdict[str, list]]:
    """Gather the already submitted models to avoid duplicates.

    Reads every ``*.json`` file located exactly one directory level below
    ``requested_models_dir`` (i.e. ``<requested_models_dir>/<subdir>/<file>.json``).
    Files directly inside ``requested_models_dir`` and files nested deeper
    are ignored.

    Args:
        requested_models_dir: Root directory holding the request files.
            A trailing path separator is accepted.

    Returns:
        A 2-tuple ``(submitted, users_to_submission_dates)``:

        * ``submitted`` -- set of ``"{model}_{revision}"`` strings, one per
          request file read.
        * ``users_to_submission_dates`` -- ``defaultdict(list)`` mapping the
          part of ``model`` before its first ``/`` to the ``submitted_time``
          values found for it (stored as loaded from JSON). Requests whose
          model has no ``/`` or that lack ``submitted_time`` are not
          recorded here.

    Raises:
        KeyError: If a request file has no ``"model"`` or ``"revision"`` key.
        json.JSONDecodeError: If a ``.json`` file does not contain valid JSON.
    """
    target_depth = 1
    submitted = set()
    users_to_submission_dates = defaultdict(list)

    for root, dirs, files in os.walk(requested_models_dir):
        # Measure depth from the relative path rather than by counting
        # separators in the raw strings: a trailing separator on
        # ``requested_models_dir`` would otherwise shift the count by one
        # and cause every request file to be skipped.
        relative = os.path.relpath(root, requested_models_dir)
        if relative == os.curdir:
            current_depth = 0
        else:
            current_depth = relative.count(os.sep) + 1

        if current_depth != target_depth:
            continue

        # Nothing below the target depth can match, so stop descending.
        dirs[:] = []

        for file in files:
            if not file.endswith(".json"):
                continue

            # JSON is UTF-8 by specification; do not depend on the locale.
            with open(os.path.join(root, file), "r", encoding="utf-8") as f:
                info = json.load(f)

            model = info["model"]
            submitted.add(f"{model}_{info['revision']}")

            # Select organisation. ``partition`` splits on the first "/"
            # only, so an id containing several slashes no longer raises
            # ValueError on unpacking.
            organisation, separator, _ = model.partition("/")
            if not separator or "submitted_time" not in info:
                continue
            users_to_submission_dates[organisation].append(info["submitted_time"])

    return submitted, users_to_submission_dates