:boom: [Fix] SearchAPIApp: incorrect order of extracted contents to urls
apis/search_api.py  +15 -13  (CHANGED)
@@ -93,33 +93,35 @@ class SearchAPIApp:
                 overwrite=overwrite_webpage_html,
                 output_parent=query_search_results["query"],
             )
+
+            # Extract webpage contents from htmls
             html_paths = [
                 str(url_and_html_path["html_path"])
                 for url_and_html_path in url_and_html_path_list
             ]
-
-            # Extract webpage contents from htmls
             batch_webpage_content_extractor = BatchWebpageContentExtractor()
             html_path_and_extracted_content_list = (
                 batch_webpage_content_extractor.extract(html_paths)
             )
 
-            #
-
-
-
-
-
-
-
-                ]["extracted_content"]
-
+            # Build the map of url to extracted_content
+            html_path_to_url_dict = {
+                str(url_and_html_path["html_path"]): url_and_html_path["url"]
+                for url_and_html_path in url_and_html_path_list
+            }
+            url_to_extracted_content_dict = {
+                html_path_to_url_dict[
+                    html_path_and_extracted_content["html_path"]
+                ]: html_path_and_extracted_content["extracted_content"]
+                for html_path_and_extracted_content in html_path_and_extracted_content_list
+            }
 
+            # Write extracted contents (as 'text' field) to query_search_results
             for query_result_idx, query_result in enumerate(
                 query_search_results["query_results"]
             ):
                 url = query_result["url"]
-                extracted_content =
+                extracted_content = url_to_extracted_content_dict[url]
                 queries_search_results[query_idx]["query_results"][query_result_idx][
                     "text"
                 ] = extracted_content
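Why the url-keyed lookup fixes the ordering bug: the batch extractor is not guaranteed to return its results in the same order as the input html_paths, so pairing extracted contents with query results by position can attach the wrong text to a url. The two dict comprehensions added in this patch key everything by url instead. Below is a minimal, runnable sketch of that mapping; the URLs, paths, and extractor output here are made up for illustration and are not the app's real data.

# Hypothetical inputs: html for each url has already been downloaded to disk.
url_and_html_path_list = [
    {"url": "https://example.com/a", "html_path": "htmls/a.html"},
    {"url": "https://example.com/b", "html_path": "htmls/b.html"},
]

# Suppose the batch extractor finishes the pages out of order (e.g. it runs concurrently).
html_path_and_extracted_content_list = [
    {"html_path": "htmls/b.html", "extracted_content": "content of b"},
    {"html_path": "htmls/a.html", "extracted_content": "content of a"},
]

# Same two comprehensions as in the patch: html_path -> url, then url -> extracted_content.
html_path_to_url_dict = {
    str(item["html_path"]): item["url"]
    for item in url_and_html_path_list
}
url_to_extracted_content_dict = {
    html_path_to_url_dict[item["html_path"]]: item["extracted_content"]
    for item in html_path_and_extracted_content_list
}

# Each url gets its own content despite the shuffled extractor output.
assert url_to_extracted_content_dict["https://example.com/a"] == "content of a"
assert url_to_extracted_content_dict["https://example.com/b"] == "content of b"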