load-balancer / hf_scrapper.py
ChandimaPrabath's picture
fix
a5ae33a
raw
history blame
2.49 kB
import os
import json
import aiohttp
import aiofiles
import asyncio
import urllib.request
from aiohttp import ClientSession, ClientTimeout
from aiohttp.client_exceptions import ClientError
from tqdm.asyncio import tqdm
# Directory used for cached artifacts, taken from the CACHE_DIR environment
# variable. When the variable is unset we fall back to the current directory:
# os.getenv() would otherwise return None and the os.path.join() below would
# raise TypeError at import time.
CACHE_DIR = os.getenv("CACHE_DIR", os.curdir)

# Location of the "cached_films.json" file inside CACHE_DIR.
CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")

# Module-level mutable dict, initially empty.
# NOTE(review): presumably tracks per-download progress; nothing in the
# visible part of this file reads or writes it -- confirm against callers.
download_progress = {}
def get_system_proxies():
    """
    Retrieves the system's HTTP and HTTPS proxies.

    The lookup is delegated to ``urllib.request.getproxies()`` and the
    discovered mapping is printed before being reduced to the two schemes
    of interest.

    Returns:
        dict: A dictionary with the keys ``"http"`` and ``"https"``; each
        value is the proxy URL for that scheme, or ``None`` when no proxy
        is configured for it. An empty dict is returned if the lookup
        raised an exception.
    """
    try:
        proxies = urllib.request.getproxies()
        print("System proxies:", proxies)
        return {
            "http": proxies.get("http"),
            # Each scheme reads its own entry so that an HTTPS-specific
            # proxy is not replaced by the HTTP one.
            "https": proxies.get("https"),
        }
    except Exception as e:
        # Best-effort: proxy discovery must never abort the caller, so any
        # failure is reported and treated as "no proxies configured".
        print(f"Error getting system proxies: {e}")
        return {}
async def get_file_structure(repo, token, path="", proxies=None):
    """
    Fetches the file structure of a specified Hugging Face repository.

    Issues a GET request to
    ``https://huggingface.co/api/models/{repo}/tree/main/{path}`` with a
    bearer-token ``Authorization`` header and a 10 second total timeout.

    Args:
        repo (str): The name of the repository.
        token (str): The authorization token for the request.
        path (str, optional): The specific path in the repository. Defaults to "".
        proxies (dict, optional): The proxies to use for the request. Only the
            ``"http"`` entry is consulted. Defaults to None (no proxy).

    Returns:
        list: The parsed JSON body of the response (file structure
        information), or an empty list if the request failed with an
        aiohttp client error (including a non-success HTTP status) or
        timed out.
    """
    api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
    headers = {'Authorization': f'Bearer {token}'}
    timeout = ClientTimeout(total=10)
    # `proxies` defaults to None, so only look up an entry when a mapping was
    # actually supplied; calling .get() on None would raise AttributeError.
    proxy = proxies.get("http") if proxies else None
    async with ClientSession(timeout=timeout) as session:
        print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
        try:
            async with session.get(api_url, headers=headers, proxy=proxy) as response:
                response.raise_for_status()
                return await response.json()
        except (ClientError, asyncio.TimeoutError) as e:
            # A total-timeout expiry surfaces as asyncio.TimeoutError, which is
            # not a ClientError; treat it like any other failed fetch.
            print(f"Error fetching file structure: {e}")
            return []
async def write_file_structure_to_json(file_structure, file_path):
    """
    Serializes the file structure as JSON and saves it to disk.

    The data is dumped with a two-space indent and written asynchronously
    through ``aiofiles``. A confirmation line is printed on success; an
    I/O failure is printed instead of being raised.

    Args:
        file_structure (list): The file structure data to serialize.
        file_path (str): Destination path of the JSON file (opened in
            write mode, so existing content is replaced).
    """
    try:
        async with aiofiles.open(file_path, 'w') as out:
            # Serialize only after the file is open, then write in one call.
            payload = json.dumps(file_structure, indent=2)
            await out.write(payload)
        print(f'File structure written to {file_path}')
    except OSError as e:
        print(f"Error writing file structure to JSON: {e}")