Hansimov committed on
Commit
9fb4731
1 Parent(s): a636bcb

:zap: [Enhance] SearchAPIApp: overwrite param for query and webpage HTML

Browse files
Files changed (1) hide show
  1. apis/search_api.py +18 -3
apis/search_api.py CHANGED
@@ -47,6 +47,14 @@ class SearchAPIApp:
47
  default=False,
48
  description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
49
  )
 
 
 
 
 
 
 
 
50
 
51
  def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
52
  google_searcher = GoogleSearcher()
@@ -56,7 +64,10 @@ class SearchAPIApp:
56
  if not query.strip():
57
  continue
58
  query_html_path = google_searcher.search(
59
- query=query, result_num=item.result_num, safe=item.safe
 
 
 
60
  )
61
  query_search_results = query_results_extractor.extract(query_html_path)
62
  queries_search_results.append(query_search_results)
@@ -69,8 +80,12 @@ class SearchAPIApp:
69
  for query_result_idx, query_result in enumerate(
70
  query_search_result["query_results"]
71
  ):
72
- html_path = html_fetcher.fetch(query_result["url"])
73
- extracted_content = webpage_content_extractor.extract(html_path)
 
 
 
 
74
  queries_search_results[query_idx]["query_results"][
75
  query_result_idx
76
  ]["text"] = extracted_content
 
47
  default=False,
48
  description="(bool) Enable extracting main text contents from webpage, will add `text` filed in each `query_result` dict",
49
  )
50
+ overwrite_query_html: bool = Field(
51
+ default=False,
52
+ description="(bool) Overwrite HTML file of query results",
53
+ )
54
+ overwrite_webpage_html: bool = Field(
55
+ default=False,
56
+ description="(bool) Overwrite HTML files of webpages from query results",
57
+ )
58
 
59
  def queries_to_search_results(self, item: QueriesToSearchResultsPostItem):
60
  google_searcher = GoogleSearcher()
 
64
  if not query.strip():
65
  continue
66
  query_html_path = google_searcher.search(
67
+ query=query,
68
+ result_num=item.result_num,
69
+ safe=item.safe,
70
+ overwrite=item.overwrite_query_html,
71
  )
72
  query_search_results = query_results_extractor.extract(query_html_path)
73
  queries_search_results.append(query_search_results)
 
80
  for query_result_idx, query_result in enumerate(
81
  query_search_result["query_results"]
82
  ):
83
+ webpage_html_path = html_fetcher.fetch(
84
+ query_result["url"], overwrite=item.overwrite_webpage_html
85
+ )
86
+ extracted_content = webpage_content_extractor.extract(
87
+ webpage_html_path
88
+ )
89
  queries_search_results[query_idx]["query_results"][
90
  query_result_idx
91
  ]["text"] = extracted_content