ChandimaPrabath committed on
Commit
b8f7a78
·
1 Parent(s): 9b708eb
Files changed (2) hide show
  1. hf_scrapper.py +13 -30
  2. requirements.txt +2 -1
hf_scrapper.py CHANGED
@@ -3,7 +3,7 @@ import requests
3
  import json
4
  import urllib.request
5
  from requests.exceptions import RequestException
6
- from concurrent.futures import ThreadPoolExecutor
7
 
8
  def get_system_proxies():
9
  try:
@@ -19,21 +19,20 @@ def get_system_proxies():
19
 
20
  def download_and_cache_file(file_url, token, cache_path, proxies=None):
21
  print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
22
-
23
- # Create a requests session for better performance
24
- session = requests.Session()
25
- session.headers.update({'Authorization': f'Bearer {token}'})
26
- session.proxies.update(proxies)
27
-
28
  try:
29
- response = session.get(file_url, stream=True)
30
  response.raise_for_status()
31
- os.makedirs(os.path.dirname(cache_path), exist_ok=True)
32
 
 
 
 
 
33
  with open(cache_path, 'wb') as f:
34
- for chunk in response.iter_content(chunk_size=16384): # Larger chunk size
 
35
  if chunk:
36
  f.write(chunk)
 
37
  print(f'File cached to {cache_path} successfully.')
38
  return True
39
  except RequestException as e:
@@ -46,7 +45,6 @@ def get_file_structure(repo, token, path="", proxies=None):
46
  api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
47
  headers = {'Authorization': f'Bearer {token}'}
48
  print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
49
-
50
  try:
51
  response = requests.get(api_url, headers=headers, proxies=proxies)
52
  response.raise_for_status()
@@ -63,24 +61,9 @@ def write_file_structure_to_json(file_structure, file_path):
63
  except IOError as e:
64
  print(f"Error writing file structure to JSON: {e}")
65
 
66
- # Function to download files in parallel
67
- def parallel_downloads(file_urls, token, cache_dir, proxies=None):
68
- with ThreadPoolExecutor() as executor:
69
- futures = []
70
- for file_url in file_urls:
71
- filename = file_url.split("/")[-1]
72
- cache_path = os.path.join(cache_dir, filename)
73
- futures.append(executor.submit(download_and_cache_file, file_url, token, cache_path, proxies))
74
- # Wait for all futures to complete
75
- for future in futures:
76
- future.result()
77
-
78
  if __name__ == "__main__":
79
- file_urls = [
80
- "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
81
- ]
82
  token = os.getenv("TOKEN")
83
- cache_dir = "tmp/cache/films"
84
- proxies = get_system_proxies()
85
-
86
- parallel_downloads(file_urls, token, cache_dir, proxies)
 
3
  import json
4
  import urllib.request
5
  from requests.exceptions import RequestException
6
+ from tqdm import tqdm
7
 
8
  def get_system_proxies():
9
  try:
 
19
 
20
  def download_and_cache_file(file_url, token, cache_path, proxies=None):
21
  print(f"Downloading file from URL: {file_url} to {cache_path} with proxies: {proxies}")
 
 
 
 
 
 
22
  try:
23
+ response = requests.get(file_url, headers={'Authorization': f'Bearer {token}'}, proxies=proxies, stream=True)
24
  response.raise_for_status()
 
25
 
26
+ # Get the total file size from the headers
27
+ total_size = int(response.headers.get('content-length', 0))
28
+
29
+ os.makedirs(os.path.dirname(cache_path), exist_ok=True)
30
  with open(cache_path, 'wb') as f:
31
+ # Use tqdm to show download progress
32
+ for chunk in tqdm(response.iter_content(chunk_size=8192), total=total_size//8192, unit='KB', unit_scale=True, unit_divisor=1024):
33
  if chunk:
34
  f.write(chunk)
35
+
36
  print(f'File cached to {cache_path} successfully.')
37
  return True
38
  except RequestException as e:
 
45
  api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
46
  headers = {'Authorization': f'Bearer {token}'}
47
  print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
 
48
  try:
49
  response = requests.get(api_url, headers=headers, proxies=proxies)
50
  response.raise_for_status()
 
61
  except IOError as e:
62
  print(f"Error writing file structure to JSON: {e}")
63
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  if __name__ == "__main__":
65
+ file_url = "https://huggingface.co/Unicone-Studio/jellyfin_media/resolve/main/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
 
 
66
  token = os.getenv("TOKEN")
67
+ cache_path = "tmp/cache/films/Funky%20Monkey%202004/Funky%20Monkey%20(2004)%20Web-dl%201080p.mp4"
68
+ proxy = get_system_proxies()
69
+ download_and_cache_file(file_url, token, cache_path, proxies=proxy)
 
requirements.txt CHANGED
@@ -3,4 +3,5 @@ Flask-Cors
3
  requests
4
  python-dotenv
5
  ffmpy
6
- ffmpeg-python
 
 
3
  requests
4
  python-dotenv
5
  ffmpy
6
+ ffmpeg-python
7
+ tqdm