Spaces:
Sleeping
Sleeping
:gem: [Feature] New SearchResultsExtractor: title, site, link, abstract
Browse files
documents/__init__.py
ADDED
File without changes
|
documents/search_results_extractor.py
ADDED
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from bs4 import BeautifulSoup
|
2 |
+
from pathlib import Path
|
3 |
+
|
4 |
+
|
5 |
+
class SearchResultsExtractor:
    """Pull title, site, link and abstract out of a saved search results page.

    The selectors used here (``div.g``, ``cite``, ``data-sncf``, ``ITZIwc``,
    ``related-question-pair``) look like Google result-page markup —
    NOTE(review): confirm against the pages stored for this project.

    Typical use is ``SearchResultsExtractor().extract(html_path)``.
    """

    def __init__(self) -> None:
        # Parsed document; stays None until load_html() has been called, so the
        # attribute always exists and an unloaded extractor is easy to detect.
        self.soup = None

    def load_html(self, html_path):
        """Read *html_path* as UTF-8 and parse it into ``self.soup``.

        Args:
            html_path: Path (``str`` or path-like) of the HTML file to parse.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    def extract_search_results(self):
        """Print one entry per ``div.g`` result container of the loaded page.

        Each entry shows the title (first ``h3``), the site (text of the
        ``span`` preceding the first ``cite``), the link (``href`` of the first
        anchor that has one) and the abstract (the ``div[data-sncf="1"]``, or
        the ``div.ITZIwc`` fallback).

        Containers without a title or a link are skipped; a missing site or
        abstract is printed as an empty string rather than aborting the run.

        Raises:
            AttributeError: If no page has been loaded yet (``self.soup`` is
                ``None``).
        """
        search_result_elements = self.soup.find_all("div", class_="g")

        for result in search_result_elements:
            heading = result.find("h3")
            # href=True only matches anchors that actually carry a link.
            anchor = result.find("a", href=True)
            if heading is None or anchor is None:
                # Without a title or a link there is nothing useful to report.
                continue
            title = heading.text
            link = anchor["href"]

            cite = result.find("cite")
            site_element = cite.find_previous("span") if cite is not None else None
            site = site_element.text if site_element is not None else ""

            abstract_element = result.find("div", {"data-sncf": "1"})
            if abstract_element is None:
                # Fallback container used when the primary one is absent.
                abstract_element = result.find("div", class_="ITZIwc")
            abstract = (
                abstract_element.text.strip() if abstract_element is not None else ""
            )

            print(
                f"{title}\n" f" - {site}\n" f" - {link}\n" f" - {abstract}\n" f"\n"
            )

    def extract_related_questions(self):
        """Print every ``div.related-question-pair`` element of the loaded page.

        Only the raw elements are dumped for now.

        Raises:
            AttributeError: If no page has been loaded yet (``self.soup`` is
                ``None``).
        """
        related_questions = self.soup.find_all("div", class_="related-question-pair")
        for question in related_questions:
            # TODO: report the question's link and text instead of raw markup.
            print(question)

    def extract(self, html_path):
        """Load the page at *html_path* and print its search results.

        Args:
            html_path: Path (``str`` or path-like) of the HTML file to process.
        """
        self.load_html(html_path)
        self.extract_search_results()
|
41 |
+
|
42 |
+
|
43 |
+
if __name__ == "__main__":
    # Manual smoke run: parse one saved results page and print what was found.
    # Sample pages are read from the "files" directory that sits next to this
    # file's parent directory (parents[1] / "files").
    # Another sample page name that can be swapped in: "python教程".
    sample_name = "python_tutorials"
    sample_page = Path(__file__).parents[1] / "files" / f"{sample_name}.html"
    SearchResultsExtractor().extract(sample_page)
|