Spaces:

Unicone-Studio
/

instance1

Paused

App Files Files Community

instance1 / hf_scrapper.py

ChandimaPrabath

update

aeed8f1 9 months ago

raw

history blame

4.12 kB

	import os
	import requests
	import json
	import urllib.request
	from requests.exceptions import RequestException
	from tqdm import tqdm
	from concurrent.futures import ThreadPoolExecutor, as_completed

	CACHE_DIR = os.getenv("CACHE_DIR")
	CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")
	MAX_WORKERS = 4 # Adjust the number of threads for concurrent downloads

	def get_system_proxies():
	try:
	proxies = urllib.request.getproxies()
	print("System proxies:", proxies)
	return {
	"http": proxies.get("http"),
	"https": proxies.get("http")
	}
	except Exception as e:
	print(f"Error getting system proxies: {e}")
	return {}

	def download_file_chunk(url, headers, proxies, start, end):
	headers['Range'] = f"bytes={start}-{end}"
	response = requests.get(url, headers=headers, proxies=proxies, stream=True)
	response.raise_for_status()
	return response.content

	def download_and_cache_file(file_url, token, cache_path, proxies=None):
	print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
	headers = {'Authorization': f'Bearer {token}'}
	try:
	response = requests.head(file_url, headers=headers, proxies=proxies)
	response.raise_for_status()

	total_size = int(response.headers.get('content-length', 0))
	os.makedirs(os.path.dirname(cache_path), exist_ok=True)

	chunk_size = total_size // MAX_WORKERS
	ranges = [(i, min(i + chunk_size, total_size) - 1) for i in range(0, total_size, chunk_size)]

	with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
	futures = [executor.submit(download_file_chunk, file_url, headers, proxies, start, end) for start, end in ranges]
	with open(cache_path, 'wb') as f, tqdm(total=total_size, unit='B', unit_scale=True, desc=cache_path) as pbar:
	for future in as_completed(futures):
	chunk = future.result()
	f.write(chunk)
	pbar.update(len(chunk))

	print(f'File cached to {cache_path} successfully.')
	update_cache_json(file_url, cache_path)
	return True
	except RequestException as e:
	print(f"Error downloading file: {e}")
	except IOError as e:
	print(f"Error writing file {cache_path}: {e}")
	return False

	def update_cache_json(file_url, cache_path):
	cache_data = {}
	if os.path.exists(CACHE_JSON_PATH):
	with open(CACHE_JSON_PATH, 'r') as json_file:
	cache_data = json.load(json_file)

	film_title = os.path.basename(cache_path)
	cache_data[film_title] = cache_path

	with open(CACHE_JSON_PATH, 'w') as json_file:
	json.dump(cache_data, json_file, indent=2)

	print(f'Updated cache JSON: {CACHE_JSON_PATH} with {film_title}: {cache_path}')

	def get_file_structure(repo, token, path="", proxies=None):
	api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
	headers = {'Authorization': f'Bearer {token}'}
	print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
	try:
	response = requests.get(api_url, headers=headers, proxies=proxies)
	response.raise_for_status()
	return response.json()
	except RequestException as e:
	print(f"Error fetching file structure: {e}")
	return []

	def write_file_structure_to_json(file_structure, file_path):
	try:
	with open(file_path, 'w') as json_file:
	json.dump(file_structure, json_file, indent=2)
	print(f'File structure written to {file_path}')
	except IOError as e:
	print(f"Error writing file structure to JSON: {e}")

	if __name__ == "__main__":
	file_url = "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
	token = os.getenv("TOKEN")
	cache_path = os.path.join(CACHE_DIR, "films/Funky Monkey 2004/Funky Monkey (2004) Web-dl 1080p.mp4")
	proxy = get_system_proxies()
	download_and_cache_file(file_url, token, cache_path, proxies=proxy)