Spaces:
Sleeping
Sleeping
:recycle: [Refactor] Replace output_path with html_path to avoid confusion
Browse files
- apis/search_api.py +2 -2
- networks/google_searcher.py +8 -8
- networks/webpage_fetcher.py +13 -15
apis/search_api.py
CHANGED
@@ -94,7 +94,7 @@ class SearchAPIApp:
|
|
94 |
output_parent=query_search_results["query"],
|
95 |
)
|
96 |
html_paths = [
|
97 |
-
url_and_html_path["
|
98 |
for url_and_html_path in url_and_html_path_list
|
99 |
]
|
100 |
|
@@ -109,7 +109,7 @@ class SearchAPIApp:
|
|
109 |
|
110 |
for item in url_and_html_path_list:
|
111 |
url = item["url"]
|
112 |
-
html_path = item["
|
113 |
extracted_content = html_path_and_extracted_content_list[
|
114 |
html_paths.index(html_path)
|
115 |
]["extracted_content"]
|
|
|
94 |
output_parent=query_search_results["query"],
|
95 |
)
|
96 |
html_paths = [
|
97 |
+
str(url_and_html_path["html_path"])
|
98 |
for url_and_html_path in url_and_html_path_list
|
99 |
]
|
100 |
|
|
|
109 |
|
110 |
for item in url_and_html_path_list:
|
111 |
url = item["url"]
|
112 |
+
html_path = str(item["html_path"])
|
113 |
extracted_content = html_path_and_extracted_content_list[
|
114 |
html_paths.index(html_path)
|
115 |
]["extracted_content"]
|
networks/google_searcher.py
CHANGED
@@ -26,22 +26,22 @@ class GoogleSearcher:
|
|
26 |
)
|
27 |
|
28 |
def save_response(self):
|
29 |
-
if not self.
|
30 |
-
self.
|
31 |
-
logger.note(f"Saving to: [{self.
|
32 |
-
with open(self.
|
33 |
wf.write(self.request_response.content)
|
34 |
|
35 |
def search(self, query, result_num=10, safe=False, overwrite=False):
|
36 |
self.query = query
|
37 |
-
self.
|
38 |
logger.note(f"Searching: [{self.query}]")
|
39 |
-
if self.
|
40 |
-
logger.success(f"HTML existed: {self.
|
41 |
else:
|
42 |
self.send_request(result_num=result_num, safe=safe)
|
43 |
self.save_response()
|
44 |
-
return self.
|
45 |
|
46 |
|
47 |
if __name__ == "__main__":
|
|
|
26 |
)
|
27 |
|
28 |
def save_response(self):
|
29 |
+
if not self.html_path.exists():
|
30 |
+
self.html_path.parent.mkdir(parents=True, exist_ok=True)
|
31 |
+
logger.note(f"Saving to: [{self.html_path}]")
|
32 |
+
with open(self.html_path, "wb") as wf:
|
33 |
wf.write(self.request_response.content)
|
34 |
|
35 |
def search(self, query, result_num=10, safe=False, overwrite=False):
|
36 |
self.query = query
|
37 |
+
self.html_path = self.filepath_converter.convert(self.query)
|
38 |
logger.note(f"Searching: [{self.query}]")
|
39 |
+
if self.html_path.exists() and not overwrite:
|
40 |
+
logger.success(f"HTML existed: {self.html_path}")
|
41 |
else:
|
42 |
self.send_request(result_num=result_num, safe=safe)
|
43 |
self.save_response()
|
44 |
+
return self.html_path
|
45 |
|
46 |
|
47 |
if __name__ == "__main__":
|
networks/webpage_fetcher.py
CHANGED
@@ -34,47 +34,45 @@ class WebpageFetcher:
|
|
34 |
self.request_response = None
|
35 |
|
36 |
def save_response(self):
|
37 |
-
if not self.
|
38 |
-
self.
|
39 |
-
logger.success(f"Saving to: [{self.
|
40 |
|
41 |
if self.request_response is None:
|
42 |
return
|
43 |
else:
|
44 |
-
with open(self.
|
45 |
wf.write(self.request_response.content)
|
46 |
|
47 |
def fetch(self, url, overwrite=False, output_parent=None):
|
48 |
self.url = url
|
49 |
logger.note(f"Fetching: [{self.url}]")
|
50 |
-
self.
|
51 |
-
self.url, parent=output_parent
|
52 |
-
)
|
53 |
|
54 |
if self.is_ignored_host(self.url):
|
55 |
logger.warn(f"Ignore host: [{self.host}]")
|
56 |
-
return self.
|
57 |
|
58 |
-
if self.
|
59 |
-
logger.success(f"HTML existed: [{self.
|
60 |
else:
|
61 |
self.send_request()
|
62 |
self.save_response()
|
63 |
-
return self.
|
64 |
|
65 |
|
66 |
class BatchWebpageFetcher:
|
67 |
def __init__(self):
|
68 |
self.done_count = 0
|
69 |
self.total_count = 0
|
70 |
-
self.
|
71 |
|
72 |
def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
|
73 |
webpage_fetcher = WebpageFetcher()
|
74 |
-
|
75 |
url=url, overwrite=overwrite, output_parent=output_parent
|
76 |
)
|
77 |
-
self.
|
78 |
self.done_count += 1
|
79 |
logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
|
80 |
|
@@ -94,7 +92,7 @@ class BatchWebpageFetcher:
|
|
94 |
|
95 |
for idx, future in enumerate(concurrent.futures.as_completed(futures)):
|
96 |
result = future.result()
|
97 |
-
return self.
|
98 |
|
99 |
|
100 |
if __name__ == "__main__":
|
|
|
34 |
self.request_response = None
|
35 |
|
36 |
def save_response(self):
|
37 |
+
if not self.html_path.exists():
|
38 |
+
self.html_path.parent.mkdir(parents=True, exist_ok=True)
|
39 |
+
logger.success(f"Saving to: [{self.html_path}]")
|
40 |
|
41 |
if self.request_response is None:
|
42 |
return
|
43 |
else:
|
44 |
+
with open(self.html_path, "wb") as wf:
|
45 |
wf.write(self.request_response.content)
|
46 |
|
47 |
def fetch(self, url, overwrite=False, output_parent=None):
|
48 |
self.url = url
|
49 |
logger.note(f"Fetching: [{self.url}]")
|
50 |
+
self.html_path = self.filepath_converter.convert(self.url, parent=output_parent)
|
|
|
|
|
51 |
|
52 |
if self.is_ignored_host(self.url):
|
53 |
logger.warn(f"Ignore host: [{self.host}]")
|
54 |
+
return self.html_path
|
55 |
|
56 |
+
if self.html_path.exists() and not overwrite:
|
57 |
+
logger.success(f"HTML existed: [{self.html_path}]")
|
58 |
else:
|
59 |
self.send_request()
|
60 |
self.save_response()
|
61 |
+
return self.html_path
|
62 |
|
63 |
|
64 |
class BatchWebpageFetcher:
|
65 |
def __init__(self):
|
66 |
self.done_count = 0
|
67 |
self.total_count = 0
|
68 |
+
self.url_and_html_path_list = []
|
69 |
|
70 |
def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
|
71 |
webpage_fetcher = WebpageFetcher()
|
72 |
+
html_path = webpage_fetcher.fetch(
|
73 |
url=url, overwrite=overwrite, output_parent=output_parent
|
74 |
)
|
75 |
+
self.url_and_html_path_list.append({"url": url, "html_path": html_path})
|
76 |
self.done_count += 1
|
77 |
logger.success(f"> [{self.done_count}/{self.total_count}] Fetched: {url}")
|
78 |
|
|
|
92 |
|
93 |
for idx, future in enumerate(concurrent.futures.as_completed(futures)):
|
94 |
result = future.result()
|
95 |
+
return self.url_and_html_path_list
|
96 |
|
97 |
|
98 |
if __name__ == "__main__":
|