Hansimov committed
Commit 0f6452f
1 Parent(s): f9c42cf

:recycle: [Refactor] Rename SearchResultsExtractor to QueryResultsExtractor, and store results

documents/{search_results_extractor.py → query_results_extractor.py} RENAMED
@@ -2,32 +2,40 @@ from bs4 import BeautifulSoup
 from pathlib import Path
 
 
-class SearchResultsExtractor:
+class QueryResultsExtractor:
     def __init__(self) -> None:
-        pass
+        self.query_results = []
+        self.related_questions = []
 
     def load_html(self, html_path):
         with open(html_path, "r", encoding="utf-8") as f:
             html = f.read()
         self.soup = BeautifulSoup(html, "html.parser")
 
-    def extract_search_results(self):
-        search_result_elements = self.soup.find_all("div", class_="g")
-
-        for result in search_result_elements:
-            site = result.find("cite").find_previous("span").text
-            link = result.find("a")["href"]
-            title = result.find("h3").text
+    def extract_query_results(self):
+        self.query = self.soup.find("textarea").text.strip()
+        query_result_elements = self.soup.find_all("div", class_="g")
+        for idx, result in enumerate(query_result_elements):
+            site = result.find("cite").find_previous("span").text.strip()
+            url = result.find("a")["href"]
+            title = result.find("h3").text.strip()
 
             abstract_element = result.find("div", {"data-sncf": "1"})
             if abstract_element is None:
                 abstract_element = result.find("div", class_="ITZIwc")
             abstract = abstract_element.text.strip()
-
-            print(
-                f"{title}\n" f" - {site}\n" f" - {link}\n" f" - {abstract}\n" f"\n"
+            print(f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n")
+            self.query_results.append(
+                {
+                    "title": title,
+                    "site": site,
+                    "url": url,
+                    "abstract": abstract,
+                    "index": idx,
+                    "type": "web",
+                }
             )
-        print(len(search_result_elements))
+        print(len(query_result_elements))
 
     def extract_related_questions(self):
         related_question_elements = self.soup.find_all(
@@ -36,12 +44,19 @@ class SearchResultsExtractor:
         for question_element in related_question_elements:
             question = question_element.find("span").text.strip()
             print(question)
+            self.related_questions.append(question)
         print(len(related_question_elements))
 
     def extract(self, html_path):
         self.load_html(html_path)
-        self.extract_search_results()
+        self.extract_query_results()
         self.extract_related_questions()
+        self.search_results = {
+            "query": self.query,
+            "query_results": self.query_results,
+            "related_questions": self.related_questions,
+        }
+        return self.search_results
 
 
 if __name__ == "__main__":
@@ -49,5 +64,5 @@ if __name__ == "__main__":
     # html_filename = "python教程"
     html_filename = "python_tutorials"
     html_path = html_path_root / f"{html_filename}.html"
-    extractor = SearchResultsExtractor()
+    extractor = QueryResultsExtractor()
     extractor.extract(html_path)
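
For context, a minimal usage sketch of the refactored class. It assumes the renamed module is importable as documents.query_results_extractor (per the file path above) and that a saved results page exists at the illustrative local path below; both are assumptions, not part of this commit. The key change is that extract() now stores and returns the accumulated search_results dict, so callers can consume the data instead of scraping printed output:

from pathlib import Path

from documents.query_results_extractor import QueryResultsExtractor

# Hypothetical location of a saved Google results page; adjust to your layout.
html_path = Path("files") / "python_tutorials.html"

extractor = QueryResultsExtractor()
# extract() returns: {"query": ..., "query_results": [...], "related_questions": [...]}
search_results = extractor.extract(html_path)

print(search_results["query"])
for item in search_results["query_results"]:
    # Each result dict carries title, site, url, abstract, index, and type.
    print(item["index"], item["site"], item["url"])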