Spaces:
Sleeping
Sleeping
:zap: [Enhance] SearchAPIApp: overwrite param for query and webpage HTML
Browse files
- apis/search_api.py +18 -3
apis/search_api.py
CHANGED
@@ -47,6 +47,14 @@ class SearchAPIApp:
|
|
47 |
default=False,
|
48 |
description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
|
49 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
|
51 |
def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
|
52 |
google_searcher = GoogleSearcher()
|
@@ -56,7 +64,10 @@ class SearchAPIApp:
|
|
56 |
if not query.strip():
|
57 |
continue
|
58 |
query_html_path = google_searcher.search(
|
59 |
-
query=query,
|
|
|
|
|
|
|
60 |
)
|
61 |
query_search_results = query_results_extractor.extract(query_html_path)
|
62 |
queries_search_results.append(query_search_results)
|
@@ -69,8 +80,12 @@ class SearchAPIApp:
|
|
69 |
for query_result_idx, query_result in enumerate(
|
70 |
query_search_result["query_results"]
|
71 |
):
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
74 |
queries_search_results[query_idx]["query_results"][
|
75 |
query_result_idx
|
76 |
]["text"] = extracted_content
|
|
|
47 |
default=False,
|
48 |
description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
|
49 |
)
|
50 |
+
overwrite_query_html: bool = Field(
|
51 |
+
default=False,
|
52 |
+
description="(bool) Overwrite HTML file of query results",
|
53 |
+
)
|
54 |
+
overwrite_webpage_html: bool = Field(
|
55 |
+
default=False,
|
56 |
+
description="(bool) Overwrite HTML files of webpages from query results",
|
57 |
+
)
|
58 |
|
59 |
def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
|
60 |
google_searcher = GoogleSearcher()
|
|
|
64 |
if not query.strip():
|
65 |
continue
|
66 |
query_html_path = google_searcher.search(
|
67 |
+
query=query,
|
68 |
+
result_num=item.result_num,
|
69 |
+
safe=item.safe,
|
70 |
+
overwrite=item.overwrite_query_html,
|
71 |
)
|
72 |
query_search_results = query_results_extractor.extract(query_html_path)
|
73 |
queries_search_results.append(query_search_results)
|
|
|
80 |
for query_result_idx, query_result in enumerate(
|
81 |
query_search_result["query_results"]
|
82 |
):
|
83 |
+
webpage_html_path = html_fetcher.fetch(
|
84 |
+
query_result["url"], overwrite=item.overwrite_webpage_html
|
85 |
+
)
|
86 |
+
extracted_content = webpage_content_extractor.extract(
|
87 |
+
webpage_html_path
|
88 |
+
)
|
89 |
queries_search_results[query_idx]["query_results"][
|
90 |
query_result_idx
|
91 |
]["text"] = extracted_content
|