Hansimov committed on
Commit
62ee9e4
1 Parent(s): 9fb4731

:zap: [Enhance] Rename HTMLFetcher to WebpageFetcher, and add output_parent param

Browse files
apis/search_api.py CHANGED
@@ -10,7 +10,7 @@ from typing import Union
10
  from sse_starlette.sse import EventSourceResponse, ServerSentEvent
11
  from utils.logger import logger
12
  from networks.google_searcher import GoogleSearcher
13
- from networks.html_fetcher import HTMLFetcher
14
  from documents.query_results_extractor import QueryResultsExtractor
15
  from documents.webpage_content_extractor import WebpageContentExtractor
16
  from utils.logger import logger
@@ -74,14 +74,16 @@ class SearchAPIApp:
74
  logger.note(queries_search_results)
75
 
76
  if item.extract_content:
77
- html_fetcher = HTMLFetcher()
78
  webpage_content_extractor = WebpageContentExtractor()
79
  for query_idx, query_search_result in enumerate(queries_search_results):
80
  for query_result_idx, query_result in enumerate(
81
  query_search_result["query_results"]
82
  ):
83
- webpage_html_path = html_fetcher.fetch(
84
- query_result["url"], overwrite=item.overwrite_webpage_html
 
 
85
  )
86
  extracted_content = webpage_content_extractor.extract(
87
  webpage_html_path
 
10
  from sse_starlette.sse import EventSourceResponse, ServerSentEvent
11
  from utils.logger import logger
12
  from networks.google_searcher import GoogleSearcher
13
+ from networks.webpage_fetcher import WebpageFetcher
14
  from documents.query_results_extractor import QueryResultsExtractor
15
  from documents.webpage_content_extractor import WebpageContentExtractor
16
  from utils.logger import logger
 
74
  logger.note(queries_search_results)
75
 
76
  if item.extract_content:
77
+ webpage_fetcher = WebpageFetcher()
78
  webpage_content_extractor = WebpageContentExtractor()
79
  for query_idx, query_search_result in enumerate(queries_search_results):
80
  for query_result_idx, query_result in enumerate(
81
  query_search_result["query_results"]
82
  ):
83
+ webpage_html_path = webpage_fetcher.fetch(
84
+ query_result["url"],
85
+ overwrite=item.overwrite_webpage_html,
86
+ output_parent=query_search_result["query"],
87
  )
88
  extracted_content = webpage_content_extractor.extract(
89
  webpage_html_path
networks/filepath_converter.py CHANGED
@@ -1,7 +1,7 @@
1
  import platform
2
  import re
3
  from pathlib import Path
4
- from urllib.parse import quote
5
 
6
 
7
  # What characters are forbidden in Windows and Linux directory names?
@@ -42,6 +42,8 @@ class FilepathConverter:
42
  return input_string
43
 
44
  def validate(self, input_string):
 
 
45
  filename = input_string
46
  for char in INVALID_FILE_PATH_CHARS:
47
  filename = filename.replace(char, "_")
@@ -65,6 +67,7 @@ class FilepathConverter:
65
  filename = self.append_extension(filename)
66
 
67
  parent = parent or self.parent
 
68
  if parent:
69
  filepath = self.output_root / parent / filename
70
  else:
@@ -82,7 +85,7 @@ class UrlToFilepathConverter(FilepathConverter):
82
  self.output_root = self.output_root / "urls"
83
 
84
  def preprocess(self, url):
85
- filename = url.split("//")[1]
86
  return filename
87
 
88
 
 
1
  import platform
2
  import re
3
  from pathlib import Path
4
+ from urllib.parse import quote, unquote
5
 
6
 
7
  # What characters are forbidden in Windows and Linux directory names?
 
42
  return input_string
43
 
44
  def validate(self, input_string):
45
+ if not input_string:
46
+ return input_string
47
  filename = input_string
48
  for char in INVALID_FILE_PATH_CHARS:
49
  filename = filename.replace(char, "_")
 
67
  filename = self.append_extension(filename)
68
 
69
  parent = parent or self.parent
70
+ parent = self.validate(parent)
71
  if parent:
72
  filepath = self.output_root / parent / filename
73
  else:
 
85
  self.output_root = self.output_root / "urls"
86
 
87
  def preprocess(self, url):
88
+ filename = unquote(url.split("//")[1])
89
  return filename
90
 
91
 
networks/{html_fetcher.py → webpage_fetcher.py} RENAMED
@@ -7,7 +7,7 @@ from networks.filepath_converter import UrlToFilepathConverter
7
  from networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
8
 
9
 
10
- class HTMLFetcher:
11
  def __init__(self):
12
  self.enver = enver
13
  self.enver.set_envs(proxies=True)
@@ -34,10 +34,12 @@ class HTMLFetcher:
34
  with open(self.output_path, "wb") as wf:
35
  wf.write(self.request_response.content)
36
 
37
- def fetch(self, url, overwrite=False):
38
  self.url = url
39
  logger.note(f"Fetching: [{self.url}]")
40
- self.output_path = self.filepath_converter.convert(self.url)
 
 
41
 
42
  if self.is_ignored_host(self.url):
43
  logger.warn(f"Ignore host: [{self.host}]")
@@ -57,5 +59,5 @@ if __name__ == "__main__":
57
  # "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528"
58
  "https://docs.python.org/zh-cn/3/tutorial/interpreter.html"
59
  )
60
- fetcher = HTMLFetcher()
61
  fetcher.fetch(url)
 
7
  from networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS
8
 
9
 
10
+ class WebpageFetcher:
11
  def __init__(self):
12
  self.enver = enver
13
  self.enver.set_envs(proxies=True)
 
34
  with open(self.output_path, "wb") as wf:
35
  wf.write(self.request_response.content)
36
 
37
+ def fetch(self, url, overwrite=False, output_parent=None):
38
  self.url = url
39
  logger.note(f"Fetching: [{self.url}]")
40
+ self.output_path = self.filepath_converter.convert(
41
+ self.url, parent=output_parent
42
+ )
43
 
44
  if self.is_ignored_host(self.url):
45
  logger.warn(f"Ignore host: [{self.host}]")
 
59
  # "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528"
60
  "https://docs.python.org/zh-cn/3/tutorial/interpreter.html"
61
  )
62
+ fetcher = WebpageFetcher()
63
  fetcher.fetch(url)