Hansimov committed on
Commit
f234ce3
1 Parent(s): bce51d4

:gem: [Feature] SearchAPIApp: Concurrent fetch urls and extract contents

Browse files
Files changed (1) hide show
  1. apis/search_api.py +54 -21
apis/search_api.py CHANGED
@@ -10,9 +10,9 @@ from typing import Union
10
  from sse_starlette.sse import EventSourceResponse, ServerSentEvent
11
  from utils.logger import logger
12
  from networks.google_searcher import GoogleSearcher
13
- from networks.webpage_fetcher import WebpageFetcher
14
  from documents.query_results_extractor import QueryResultsExtractor
15
- from documents.webpage_content_extractor import WebpageContentExtractor
16
  from utils.logger import logger
17
 
18
 
@@ -43,7 +43,7 @@ class SearchAPIApp:
43
  default=["web"],
44
  description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
45
  )
46
- extract_content: bool = Field(
47
  default=False,
48
  description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
49
  )
@@ -73,24 +73,57 @@ class SearchAPIApp:
73
  queries_search_results.append(query_search_results)
74
  logger.note(queries_search_results)
75
 
76
- if item.extract_content:
77
- webpage_fetcher = WebpageFetcher()
78
- webpage_content_extractor = WebpageContentExtractor()
79
- for query_idx, query_search_result in enumerate(queries_search_results):
80
- for query_result_idx, query_result in enumerate(
81
- query_search_result["query_results"]
82
- ):
83
- webpage_html_path = webpage_fetcher.fetch(
84
- query_result["url"],
85
- overwrite=item.overwrite_webpage_html,
86
- output_parent=query_search_result["query"],
87
- )
88
- extracted_content = webpage_content_extractor.extract(
89
- webpage_html_path
90
- )
91
- queries_search_results[query_idx]["query_results"][
92
- query_result_idx
93
- ]["text"] = extracted_content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  return queries_search_results
95
 
96
  def setup_routes(self):
 
10
  from sse_starlette.sse import EventSourceResponse, ServerSentEvent
11
  from utils.logger import logger
12
  from networks.google_searcher import GoogleSearcher
13
+ from networks.webpage_fetcher import BatchWebpageFetcher
14
  from documents.query_results_extractor import QueryResultsExtractor
15
+ from documents.webpage_content_extractor import BatchWebpageContentExtractor
16
  from utils.logger import logger
17
 
18
 
 
43
  default=["web"],
44
  description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
45
  )
46
+ extract_webpage: bool = Field(
47
  default=False,
48
  description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
49
  )
 
73
  queries_search_results.append(query_search_results)
74
  logger.note(queries_search_results)
75
 
76
+ if item.extract_webpage:
77
+ queries_search_results = self.extract_webpages(
78
+ queries_search_results,
79
+ overwrite_webpage_html=item.overwrite_webpage_html,
80
+ )
81
+ return queries_search_results
82
+
83
+ def extract_webpages(self, queries_search_results, overwrite_webpage_html=False):
84
+ for query_idx, query_search_results in enumerate(queries_search_results):
85
+ # Fetch webpages with urls
86
+ batch_webpage_fetcher = BatchWebpageFetcher()
87
+ urls = [
88
+ query_result["url"]
89
+ for query_result in query_search_results["query_results"]
90
+ ]
91
+ url_and_html_path_list = batch_webpage_fetcher.fetch(
92
+ urls,
93
+ overwrite=overwrite_webpage_html,
94
+ output_parent=query_search_results["query"],
95
+ )
96
+ html_paths = [
97
+ url_and_html_path["output_path"]
98
+ for url_and_html_path in url_and_html_path_list
99
+ ]
100
+
101
+ # Extract webpage contents from htmls
102
+ batch_webpage_content_extractor = BatchWebpageContentExtractor()
103
+ html_path_and_extracted_content_list = (
104
+ batch_webpage_content_extractor.extract(html_paths)
105
+ )
106
+
107
+ # Write extracted contents (as 'text' field) to query_search_results
108
+ url_and_extracted_content_dict = {}
109
+
110
+ for item in url_and_html_path_list:
111
+ url = item["url"]
112
+ html_path = item["output_path"]
113
+ extracted_content = html_path_and_extracted_content_list[
114
+ html_paths.index(html_path)
115
+ ]["extracted_content"]
116
+ url_and_extracted_content_dict[url] = extracted_content
117
+
118
+ for query_result_idx, query_result in enumerate(
119
+ query_search_results["query_results"]
120
+ ):
121
+ url = query_result["url"]
122
+ extracted_content = url_and_extracted_content_dict[url]
123
+ queries_search_results[query_idx]["query_results"][query_result_idx][
124
+ "text"
125
+ ] = extracted_content
126
+
127
  return queries_search_results
128
 
129
  def setup_routes(self):