ChandimaPrabath committed on
Commit
dbe8689
·
1 Parent(s): a5ae33a
Files changed (1) hide show
  1. indexer.py +67 -38
indexer.py CHANGED
@@ -1,44 +1,73 @@
1
- import json
2
- import logging
3
- from hf_scrapper import get_system_proxies, get_file_structure, write_file_structure_to_json
4
- from dotenv import load_dotenv
5
  import os
 
 
 
 
 
 
 
 
 
 
 
6
 
7
- load_dotenv()
8
 
9
def index_repository(token, repo, current_path="", proxies=None):
    """
    Recursively builds a nested listing of a repository path.

    Args:
        token (str): The authorization token passed to get_file_structure.
        repo (str): The name of the repository.
        current_path (str, optional): The path to list. Defaults to "".
        proxies (dict, optional): Proxies forwarded to get_file_structure.

    Returns:
        list: One entry per item. Items whose 'type' is 'directory' become
        a dict with "type", "path" and "contents" (the recursive listing);
        every other item is kept as returned by get_file_structure.

    Raises:
        Exception: Any error is logged and then re-raised unchanged.
    """
    try:
        tree = []
        for entry in get_file_structure(repo, token, current_path, proxies):
            # Non-directories are passed through untouched.
            if entry['type'] != 'directory':
                tree.append(entry)
                continue
            children = index_repository(token, repo, entry['path'], proxies)
            tree.append({
                "type": "directory",
                "path": entry['path'],
                "contents": children,
            })
        return tree
    except Exception as e:
        logging.error(f"Error indexing repository: {e}")
        raise
27
-
28
def indexer():
    """
    Indexes the configured repository and saves the result as JSON.

    Reads TOKEN, REPO and INDEX_FILE from the environment. If any of them is
    missing or empty, an error is logged and the function returns without
    doing anything. Errors raised while indexing or writing are logged and
    not propagated.
    """
    token = os.getenv("TOKEN")
    repo = os.getenv("REPO")
    output_path = os.getenv("INDEX_FILE")

    # All three settings are required.
    if not (token and repo and output_path):
        logging.error("Environment variables TOKEN, REPO, or INDEX_FILE are not set.")
        return

    system_proxies = get_system_proxies()

    try:
        structure = index_repository(token, repo, "", system_proxies)
        write_file_structure_to_json(structure, output_path)
        logging.info(f"Full file structure for repository '{repo}' has been indexed and saved to {output_path}")
    except Exception as e:
        logging.error(f"Error during indexing: {e}")
 
 
 
 
 
1
  import os
2
+ import json
3
+ import aiohttp
4
+ import asyncio
5
+ import aiofiles
6
+ import urllib.request
7
+ from aiohttp import ClientSession, ClientTimeout
8
+ from aiohttp.client_exceptions import ClientError
9
+ from tqdm.asyncio import tqdm
10
+
11
# Cache directory, taken from the CACHE_DIR environment variable. A fallback
# is supplied because os.path.join(None, ...) raises TypeError, which would
# make this module fail at import time whenever the variable is unset.
CACHE_DIR = os.getenv("CACHE_DIR", "cache")
CACHE_JSON_PATH = os.path.join(CACHE_DIR, "cached_films.json")

# Module-level mapping, initially empty.
download_progress = {}
15
 
16
async def get_system_proxies():
    """
    Retrieves the system's HTTP and HTTPS proxies.

    Returns:
        dict: A dictionary with the keys "http" and "https". Each value is
        the proxy URL reported by urllib.request.getproxies() for that
        scheme, or None when no proxy is configured for it. An empty
        dictionary is returned if the lookup itself fails.
    """
    try:
        proxies = urllib.request.getproxies()
        print("System proxies:", proxies)
        return {
            "http": proxies.get("http"),
            # Read the HTTPS entry from its own key, not from "http".
            "https": proxies.get("https"),
        }
    except Exception as e:
        # Best effort: report the problem and continue without proxies.
        print(f"Error getting system proxies: {e}")
        return {}
33
+
34
async def get_file_structure(repo, token, path="", proxies=None):
    """
    Fetches the file structure of a specified Hugging Face repository.

    Args:
        repo (str): The name of the repository.
        token (str): The authorization token for the request.
        path (str, optional): The specific path in the repository. Defaults to "".
        proxies (dict, optional): The proxies to use for the request; only
            the "http" entry is used. Defaults to None (no proxy).

    Returns:
        list: The decoded JSON body of the response, or an empty list if
        the request fails or times out.
    """
    api_url = f"https://huggingface.co/api/models/{repo}/tree/main/{path}"
    headers = {'Authorization': f'Bearer {token}'}
    timeout = ClientTimeout(total=10)
    # proxies defaults to None, so it must not be dereferenced unconditionally.
    proxy = proxies.get("http") if proxies else None
    async with ClientSession(timeout=timeout) as session:
        print(f"Fetching file structure from URL: {api_url} with proxies: {proxies}")
        try:
            async with session.get(api_url, headers=headers, proxy=proxy) as response:
                response.raise_for_status()
                return await response.json()
        except (ClientError, asyncio.TimeoutError) as e:
            # Exceeding the total timeout raises asyncio.TimeoutError, which
            # is not a ClientError; treat it like any other failed request.
            print(f"Error fetching file structure: {e}")
            return []
59
+
60
async def write_file_structure_to_json(file_structure, file_path):
    """
    Writes the file structure to a JSON file.

    Args:
        file_structure (list): The file structure data.
        file_path (str): The path where the JSON file will be saved.

    Raises:
        TypeError: Propagated from json.dumps when file_structure is not
            JSON serializable; the target file is left untouched.
    """
    # Serialize before opening: mode 'w' truncates the target, so a
    # serialization failure must not be able to leave an empty file behind.
    payload = json.dumps(file_structure, indent=2)
    try:
        async with aiofiles.open(file_path, 'w') as json_file:
            await json_file.write(payload)
        print(f'File structure written to {file_path}')
    except IOError as e:
        print(f"Error writing file structure to JSON: {e}")