Spaces:
Paused
Paused
Commit
·
b8f7a78
1
Parent(s):
9b708eb
update
Browse files
- hf_scrapper.py +13 -30
- requirements.txt +2 -1
hf_scrapper.py
CHANGED
@@ -3,7 +3,7 @@ import requests
|
|
3 |
import json
|
4 |
import urllib.request
|
5 |
from requests.exceptions import RequestException
|
6 |
-
from
|
7 |
|
8 |
def get_system_proxies():
|
9 |
try:
|
@@ -19,21 +19,20 @@ def get_system_proxies():
|
|
19 |
|
20 |
def download_and_cache_file(file_url, token, cache_path, proxies=None):
|
21 |
print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
|
22 |
-
|
23 |
-
# Create a requests session for better performance
|
24 |
-
session = requests.Session()
|
25 |
-
session.headers.update({'Authorization': f'Bearer {token}'})
|
26 |
-
session.proxies.update(proxies)
|
27 |
-
|
28 |
try:
|
29 |
-
response =
|
30 |
response.raise_for_status()
|
31 |
-
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
|
32 |
|
|
|
|
|
|
|
|
|
33 |
with open(cache_path, 'wb') as f:
|
34 |
-
|
|
|
35 |
if chunk:
|
36 |
f.write(chunk)
|
|
|
37 |
print(f'File cached to {cache_path} successfully.')
|
38 |
return True
|
39 |
except RequestException as e:
|
@@ -46,7 +45,6 @@ def get_file_structure(repo, token, path="", proxies=None):
|
|
46 |
api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
|
47 |
headers = {'Authorization': f'Bearer {token}'}
|
48 |
print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
|
49 |
-
|
50 |
try:
|
51 |
response = requests.get(api_url, headers=headers, proxies=proxies)
|
52 |
response.raise_for_status()
|
@@ -63,24 +61,9 @@ def write_file_structure_to_json(file_structure, file_path):
|
|
63 |
except IOError as e:
|
64 |
print(f"Error writing file structure to JSON: {e}")
|
65 |
|
66 |
-
# Function to download files in parallel
|
67 |
-
def parallel_downloads(file_urls, token, cache_dir, proxies=None):
|
68 |
-
with ThreadPoolExecutor() as executor:
|
69 |
-
futures = []
|
70 |
-
for file_url in file_urls:
|
71 |
-
filename = file_url.split("/")[-1]
|
72 |
-
cache_path = os.path.join(cache_dir, filename)
|
73 |
-
futures.append(executor.submit(download_and_cache_file, file_url, token, cache_path, proxies))
|
74 |
-
# Wait for all futures to complete
|
75 |
-
for future in futures:
|
76 |
-
future.result()
|
77 |
-
|
78 |
if __name__ == "__main__":
|
79 |
-
|
80 |
-
"https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
|
81 |
-
]
|
82 |
token = os.getenv("TOKEN")
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
parallel_downloads(file_urls, token, cache_dir, proxies)
|
|
|
3 |
import json
|
4 |
import urllib.request
|
5 |
from requests.exceptions import RequestException
|
6 |
+
from tqdm import tqdm
|
7 |
|
8 |
def get_system_proxies():
|
9 |
try:
|
|
|
19 |
|
20 |
def download_and_cache_file(file_url, token, cache_path, proxies=None):
|
21 |
print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
try:
|
23 |
+
response = requests.get(file_url, headers={'Authorization': f'Bearer {token}'}, proxies=proxies, stream=True)
|
24 |
response.raise_for_status()
|
|
|
25 |
|
26 |
+
# Get the total file size from the headers
|
27 |
+
total_size = int(response.headers.get('content-length', 0))
|
28 |
+
|
29 |
+
os.makedirs(os.path.dirname(cache_path), exist_ok=True)
|
30 |
with open(cache_path, 'wb') as f:
|
31 |
+
# Use tqdm to show download progress
|
32 |
+
for chunk in tqdm(response.iter_content(chunk_size=8192), total=total_size//8192, unit='KB', unit_scale=True, unit_divisor=1024):
|
33 |
if chunk:
|
34 |
f.write(chunk)
|
35 |
+
|
36 |
print(f'File cached to {cache_path} successfully.')
|
37 |
return True
|
38 |
except RequestException as e:
|
|
|
45 |
api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
|
46 |
headers = {'Authorization': f'Bearer {token}'}
|
47 |
print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
|
|
|
48 |
try:
|
49 |
response = requests.get(api_url, headers=headers, proxies=proxies)
|
50 |
response.raise_for_status()
|
|
|
61 |
except IOError as e:
|
62 |
print(f"Error writing file structure to JSON: {e}")
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
if __name__ == "__main__":
|
65 |
+
file_url = "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
|
|
|
|
|
66 |
token = os.getenv("TOKEN")
|
67 |
+
cache_path = "tmp/cache/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
|
68 |
+
proxy = get_system_proxies()
|
69 |
+
download_and_cache_file(file_url, token, cache_path, proxies=proxy)
|
|
requirements.txt
CHANGED
@@ -3,4 +3,5 @@ Flask-Cors
|
|
3 |
requests
|
4 |
python-dotenv
|
5 |
ffmpy
|
6 |
-
ffmpeg-python
|
|
|
|
3 |
requests
|
4 |
python-dotenv
|
5 |
ffmpy
|
6 |
+
ffmpeg-python
|
7 |
+
tqdm
|