Spaces:

Unicone-Studio
/

instance2

Paused

App Files Files Community

instance2 / hf_scrapper.py

ChandimaPrabath

v0.1

e264f26 11 months ago

raw

history blame

5.75 kB

	import os
	import requests
	import json
	import urllib.request
	from requests.exceptions import RequestException
	from tqdm import tqdm

	CACHE_DIR = os.getenv("CACHE_DIR")
	CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")

	download_progress = {}

	def get_system_proxies():
	"""
	Retrieves the system's HTTP and HTTPS proxies.

	Returns:
	dict: A dictionary containing the proxies.
	"""
	try:
	proxies = urllib.request.getproxies()
	print("System proxies:", proxies)
	return {
	"http": proxies.get("http"),
	"https": proxies.get("http")
	}
	except Exception as e:
	print(f"Error getting system proxies: {e}")
	return {}

	def download_file(file_url, token, cache_path, proxies, film_id, chunk_size=50 * 1024 * 1024): # 50MB chunk size
	"""
	Downloads a file from the specified URL and saves it to the cache path.
	Tracks the download progress.

	Args:
	file_url (str): The URL of the file to download.
	token (str): The authorization token for the request.
	cache_path (str): The path to save the downloaded file.
	proxies (dict): Proxies for the request.
	film_id (str): Unique identifier for the film download.
	chunk_size (int): Size of each chunk to download.
	"""
	print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
	headers = {'Authorization': f'Bearer {token}'}
	try:
	response = requests.get(file_url, headers=headers, proxies=proxies, stream=True)
	response.raise_for_status()

	total_size = int(response.headers.get('content-length', 0))
	download_progress[film_id] = {"total": total_size, "downloaded": 0}

	os.makedirs(os.path.dirname(cache_path), exist_ok=True)
	with open(cache_path, 'wb') as file, tqdm(total=total_size, unit='B', unit_scale=True, desc=cache_path) as pbar:
	for data in response.iter_content(chunk_size=chunk_size):
	file.write(data)
	pbar.update(len(data))
	download_progress[film_id]["downloaded"] += len(data)

	print(f'File cached to {cache_path} successfully.')
	update_cache_json(file_url, cache_path)
	except RequestException as e:
	print(f"Error downloading file: {e}")
	except IOError as e:
	print(f"Error writing file {cache_path}: {e}")
	finally:
	del download_progress[film_id]


	def get_download_progress(film_id):
	"""
	Gets the download progress for a specific film.

	Args:
	film_id (str): The unique identifier for the film download.

	Returns:
	dict: A dictionary containing the total size, downloaded size, and progress percentage.
	"""
	if film_id in download_progress:
	total = download_progress[film_id]["total"]
	downloaded = download_progress[film_id]["downloaded"]
	progress = (downloaded / total) * 100
	return {"total": total, "downloaded": downloaded, "progress": progress}
	return {"total": 0, "downloaded": 0, "progress": 0}

	def update_cache_json(file_url, cache_path):
	"""
	Updates the cached films JSON with the new file.

	Args:
	file_url (str): The URL of the downloaded file.
	cache_path (str): The local path where the file is saved.
	"""
	cache_data = {}
	if os.path.exists(CACHE_JSON_PATH):
	with open(CACHE_JSON_PATH, 'r') as json_file:
	cache_data = json.load(json_file)

	film_title = os.path.basename(cache_path)
	cache_data[film_title] = cache_path

	with open(CACHE_JSON_PATH, 'w') as json_file:
	json.dump(cache_data, json_file, indent=2)
	print(f'Cache updated with {film_title}.')

	def get_file_structure(repo, token, path="", proxies=None):
	"""
	Fetches the file structure of a specified Hugging Face repository.

	Args:
	repo (str): The name of the repository.
	token (str): The authorization token for the request.
	path (str, optional): The specific path in the repository. Defaults to "".
	proxies (dict, optional): The proxies to use for the request. Defaults to None.

	Returns:
	list: A list of file structure information.
	"""
	api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
	headers = {'Authorization': f'Bearer {token}'}
	print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
	try:
	response = requests.get(api_url, headers=headers, proxies=proxies)
	response.raise_for_status()
	return response.json()
	except RequestException as e:
	print(f"Error fetching file structure: {e}")
	return []

	def write_file_structure_to_json(file_structure, file_path):
	"""
	Writes the file structure to a JSON file.

	Args:
	file_structure (list): The file structure data.
	file_path (str): The path where the JSON file will be saved.
	"""
	try:
	with open(file_path, 'w') as json_file:
	json.dump(file_structure, json_file, indent=2)
	print(f'File structure written to {file_path}')
	except IOError as e:
	print(f"Error writing file structure to JSON: {e}")

	if __name__ == "__main__":
	file_url = "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
	token = os.getenv("TOKEN")
	cache_path = os.path.join(CACHE_DIR, "films/Funky Monkey 2004/Funky Monkey (2004) Web-dl 1080p.mp4")
	proxies = get_system_proxies()
	film_id = "funky_monkey_2004" # Unique identifier for the film download
	download_file(file_url, token, cache_path, proxies=proxies, film_id=film_id)