Hansimov committed on
Commit
4d3e890
1 Parent(s): e773696

:gem: [Feature] SearchAPIApp: New extract_content param

Browse files
apis/search_api.py CHANGED
@@ -12,6 +12,7 @@ from utils.logger import logger
12
  from networks.google_searcher import GoogleSearcher
13
  from networks.html_fetcher import HTMLFetcher
14
  from documents.query_results_extractor import QueryResultsExtractor
 
15
  from utils.logger import logger
16
 
17
 
@@ -42,12 +43,18 @@ class SearchAPIApp:
42
  default=["web"],
43
  description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
44
  )
 
 
 
 
45
 
46
  def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
47
  google_searcher = GoogleSearcher()
48
  query_results_extractor = QueryResultsExtractor()
49
  queries_search_results = []
50
  for query in item.queries:
 
 
51
  query_html_path = google_searcher.search(
52
  query=query, result_num=item.result_num, safe=item.safe
53
  )
@@ -55,11 +62,18 @@ class SearchAPIApp:
55
  queries_search_results.append(query_search_results)
56
  logger.note(queries_search_results)
57
 
58
- # html_fetcher = HTMLFetcher()
59
- # for query_search_result in queries_search_results:
60
- # for query_result in query_search_result["query_results"]:
61
- # html_path = html_fetcher.fetch(query_result["url"])
62
- # query_result["html_path"] = str(html_path)
 
 
 
 
 
 
 
63
  return queries_search_results
64
 
65
  def setup_routes(self):
 
12
  from networks.google_searcher import GoogleSearcher
13
  from networks.html_fetcher import HTMLFetcher
14
  from documents.query_results_extractor import QueryResultsExtractor
15
+ from documents.webpage_content_extractor import WebpageContentExtractor
16
  from utils.logger import logger
17
 
18
 
 
43
  default=["web"],
44
  description="(list[str]) Types of search results: `web`, `image`, `videos`, `news`",
45
  )
46
+ extract_content: bool = Field(
47
+ default=False,
48
+ description="(bool) Enable extracting main text contents from webpage, will add `text` field in each `query_result` dict",
49
+ )
50
 
51
  def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
52
  google_searcher = GoogleSearcher()
53
  query_results_extractor = QueryResultsExtractor()
54
  queries_search_results = []
55
  for query in item.queries:
56
+ if not query.strip():
57
+ continue
58
  query_html_path = google_searcher.search(
59
  query=query, result_num=item.result_num, safe=item.safe
60
  )
 
62
  queries_search_results.append(query_search_results)
63
  logger.note(queries_search_results)
64
 
65
+ if item.extract_content:
66
+ html_fetcher = HTMLFetcher()
67
+ webpage_content_extractor = WebpageContentExtractor()
68
+ for query_idx, query_search_result in enumerate(queries_search_results):
69
+ for query_result_idx, query_result in enumerate(
70
+ query_search_result["query_results"]
71
+ ):
72
+ html_path = html_fetcher.fetch(query_result["url"])
73
+ extracted_content = webpage_content_extractor.extract(html_path)
74
+ queries_search_results[query_idx]["query_results"][
75
+ query_result_idx
76
+ ]["text"] = extracted_content
77
  return queries_search_results
78
 
79
  def setup_routes(self):
documents/webpage_content_extractor.py CHANGED
@@ -95,7 +95,7 @@ class WebpageContentExtractor:
95
 
96
  self.main_content = markdownify(html_str, strip="a")
97
  self.main_content = re.sub(r"\n{3,}", "\n\n", self.main_content)
98
- logger.line(self.main_content)
99
  # pprint(self.main_content)
100
  token_count = self.count_tokens(self.main_content)
101
  logger.note(f"Token Count: {token_count}")
 
95
 
96
  self.main_content = markdownify(html_str, strip="a")
97
  self.main_content = re.sub(r"\n{3,}", "\n\n", self.main_content)
98
+ # logger.line(self.main_content)
99
  # pprint(self.main_content)
100
  token_count = self.count_tokens(self.main_content)
101
  logger.note(f"Token Count: {token_count}")