:zap: [Enhance] Rename HTMLFetcher to WebpageFetcher, and add output_parent param
apis/search_api.py
CHANGED
```diff
@@ -10,7 +10,7 @@ from typing import Union
 from sse_starlette.sse import EventSourceResponse, ServerSentEvent
 from utils.logger import logger
 from networks.google_searcher import GoogleSearcher
-from networks.html_fetcher import HTMLFetcher
+from networks.webpage_fetcher import WebpageFetcher
 from documents.query_results_extractor import QueryResultsExtractor
 from documents.webpage_content_extractor import WebpageContentExtractor
 from utils.logger import logger
@@ -74,14 +74,16 @@ class SearchAPIApp:
             logger.note(queries_search_results)

             if item.extract_content:
-                html_fetcher = HTMLFetcher()
+                webpage_fetcher = WebpageFetcher()
                 webpage_content_extractor = WebpageContentExtractor()
                 for query_idx, query_search_result in enumerate(queries_search_results):
                     for query_result_idx, query_result in enumerate(
                         query_search_result["query_results"]
                     ):
-                        webpage_html_path = html_fetcher.fetch(
-                            query_result["url"],
+                        webpage_html_path = webpage_fetcher.fetch(
+                            query_result["url"],
+                            overwrite=item.overwrite_webpage_html,
+                            output_parent=query_search_result["query"],
                         )
                         extracted_content = webpage_content_extractor.extract(
                             webpage_html_path
```
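The practical effect of the new `output_parent` argument is that each fetched page is saved under a per-query subdirectory instead of a flat folder. Below is a minimal sketch of the call pattern in isolation; the sample `queries_search_results` data is made up for illustration, and the exact on-disk layout is an assumption based on `FilepathConverter.convert` shown further down.

```python
# Sketch of the updated fetch loop, against this repo's modules.
# The sample data below is illustrative only.
from networks.webpage_fetcher import WebpageFetcher

queries_search_results = [
    {
        "query": "python interpreter tutorial",
        "query_results": [
            {"url": "https://docs.python.org/zh-cn/3/tutorial/interpreter.html"},
        ],
    },
]

webpage_fetcher = WebpageFetcher()
for query_search_result in queries_search_results:
    for query_result in query_search_result["query_results"]:
        # output_parent nests the saved HTML under a folder named after
        # the query (assumed: <output_root>/urls/<query>/<url-derived>.html)
        webpage_html_path = webpage_fetcher.fetch(
            query_result["url"],
            overwrite=True,
            output_parent=query_search_result["query"],
        )
        print(webpage_html_path)
```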
networks/filepath_converter.py
CHANGED
```diff
@@ -1,7 +1,7 @@
 import platform
 import re
 from pathlib import Path
-from urllib.parse import quote
+from urllib.parse import quote, unquote


 # What characters are forbidden in Windows and Linux directory names?
@@ -42,6 +42,8 @@ class FilepathConverter:
         return input_string

     def validate(self, input_string):
+        if not input_string:
+            return input_string
         filename = input_string
         for char in INVALID_FILE_PATH_CHARS:
             filename = filename.replace(char, "_")
@@ -65,6 +67,7 @@ class FilepathConverter:
         filename = self.append_extension(filename)

         parent = parent or self.parent
+        parent = self.validate(parent)
         if parent:
             filepath = self.output_root / parent / filename
         else:
@@ -82,7 +85,7 @@ class UrlToFilepathConverter(FilepathConverter):
         self.output_root = self.output_root / "urls"

     def preprocess(self, url):
-        filename = url.split("//")[1]
+        filename = unquote(url.split("//")[1])
         return filename
```
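Taken together, these changes mean a parent directory derived from a raw query string now gets the same character sanitization as file names (with an early return so a `None` parent passes through), and percent-encoded URLs become readable file names. A self-contained sketch of that behavior, using an illustrative subset of `INVALID_FILE_PATH_CHARS` since the full list lives elsewhere in this file:

```python
from urllib.parse import unquote

# Illustrative subset; the real INVALID_FILE_PATH_CHARS in
# filepath_converter.py covers all characters forbidden in
# Windows and Linux directory names.
INVALID_FILE_PATH_CHARS = ["\\", "/", ":", "*", "?", '"', "<", ">", "|"]

def validate(input_string):
    # New guard: empty/None input passes through unchanged, so
    # convert() can still take its "no parent" branch.
    if not input_string:
        return input_string
    filename = input_string
    for char in INVALID_FILE_PATH_CHARS:
        filename = filename.replace(char, "_")
    return filename

# unquote() restores percent-escaped characters before sanitization:
url = "https://zh.wikipedia.org/wiki/%E7%BC%96%E7%A8%8B"
filename = unquote(url.split("//")[1])  # 'zh.wikipedia.org/wiki/编程'
print(validate(filename))               # 'zh.wikipedia.org_wiki_编程'
print(validate(None))                   # None passes through as-is
```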
networks/{html_fetcher.py → webpage_fetcher.py}
RENAMED
```diff
@@ -7,7 +7,7 @@ from networks.filepath_converter import UrlToFilepathConverter
 from networks.network_configs import IGNORE_HOSTS, REQUESTS_HEADERS


-class HTMLFetcher:
+class WebpageFetcher:
     def __init__(self):
         self.enver = enver
         self.enver.set_envs(proxies=True)
@@ -34,10 +34,12 @@ class HTMLFetcher:
         with open(self.output_path, "wb") as wf:
             wf.write(self.request_response.content)

-    def fetch(self, url, overwrite=False):
+    def fetch(self, url, overwrite=False, output_parent=None):
         self.url = url
         logger.note(f"Fetching: [{self.url}]")
-        self.output_path = self.filepath_converter.convert(self.url)
+        self.output_path = self.filepath_converter.convert(
+            self.url, parent=output_parent
+        )

         if self.is_ignored_host(self.url):
             logger.warn(f"Ignore host: [{self.host}]")
@@ -57,5 +59,5 @@ if __name__ == "__main__":
         # "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528"
         "https://docs.python.org/zh-cn/3/tutorial/interpreter.html"
     )
-    fetcher = HTMLFetcher()
+    fetcher = WebpageFetcher()
     fetcher.fetch(url)
```
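With the rename, the `__main__` block above doubles as the public usage pattern. A hedged usage sketch of the new keyword follows; the comment about where files land reflects the `output_root / "urls"` line in filepath_converter.py plus an assumed per-parent subfolder.

```python
from networks.webpage_fetcher import WebpageFetcher

fetcher = WebpageFetcher()

# Default behavior: output path derived from the URL alone,
# under <output_root>/urls/.
fetcher.fetch("https://docs.python.org/zh-cn/3/tutorial/interpreter.html")

# New parameters: force a re-download even if the HTML already exists,
# and nest the file under a caller-chosen parent directory, which
# FilepathConverter.convert() now sanitizes via validate().
fetcher.fetch(
    "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
    overwrite=True,
    output_parent="python interpreter tutorial",
)
```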