Spaces:
Sleeping
Sleeping
File size: 2,624 Bytes
ef3de03 0acc824 ef3de03 0f6452f ef3de03 0f6452f ef3de03 0f6452f ef3de03 0acc824 0f6452f ef3de03 0acc824 ef3de03 f150f6b ef3de03 0f6452f 0acc824 ef3de03 0f6452f f150f6b 0f6452f ef3de03 0f6452f ef3de03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
from bs4 import BeautifulSoup
from pathlib import Path
from utils.logger import logger
class QueryResultsExtractor:
    """Extract web results and related questions from a saved results page.

    The page is parsed with BeautifulSoup and queried with fixed selectors
    (``div.g``, ``cite``, ``h3``, ``div[data-sncf="1"]``, ``div.ITZIwc``,
    ``div.related-question-pair``).
    NOTE(review): these selectors look like Google result-page markup --
    confirm against the pages stored by the caller.
    """

    def __init__(self) -> None:
        # Every attribute the methods write is declared here so that reading
        # one before extraction never fails with a missing-attribute error.
        self.soup = None
        self.query = ""
        self.query_results = []
        self.related_questions = []
        self.search_results = {}

    @staticmethod
    def _clean_text(element):
        """Return the stripped text of *element*, or "" when it is None."""
        return "" if element is None else element.text.strip()

    def load_html(self, html_path):
        """Read the UTF-8 file at *html_path* and parse it into ``self.soup``."""
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    def extract_query_results(self):
        """Fill ``self.query`` and ``self.query_results`` from the parsed page.

        The result list is rebuilt from scratch on every call, so repeated
        extraction never duplicates or mixes entries. A result block without
        a ``cite``, a linked ``a`` or an ``h3`` is skipped instead of raising;
        a missing site label or abstract becomes an empty string.
        """
        # The query is the text of the first <textarea>; "" when absent.
        self.query = self._clean_text(self.soup.find("textarea"))

        result_elements = self.soup.find_all("div", class_="g")
        results = []
        for idx, result in enumerate(result_elements):
            cite = result.find("cite")
            link = result.find("a")
            heading = result.find("h3")
            # Tag.get() returns None for a missing attribute instead of
            # raising KeyError like subscripting does.
            url = link.get("href") if link is not None else None
            if cite is None or heading is None or url is None:
                logger.mesg(f"Skipping result {idx}: missing cite, link or title\n")
                continue

            # The site label is the closest <span> preceding the <cite>.
            site = self._clean_text(cite.find_previous("span"))
            title = heading.text.strip()

            # Two abstract containers are tried; the class-based one is the
            # fallback when the data-attribute one is not present.
            abstract_element = result.find("div", {"data-sncf": "1"})
            if abstract_element is None:
                abstract_element = result.find("div", class_="ITZIwc")
            abstract = self._clean_text(abstract_element)

            logger.mesg(
                f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n"
            )
            results.append(
                {
                    "title": title,
                    "site": site,
                    "url": url,
                    "abstract": abstract,
                    # Position among all matched result blocks, so skipped
                    # blocks leave gaps rather than renumbering the rest.
                    "index": idx,
                    "type": "web",
                }
            )

        self.query_results = results
        # Report what was actually stored, not how many blocks were matched.
        logger.success(f"- {len(self.query_results)} query results")

    def extract_related_questions(self):
        """Fill ``self.related_questions`` with the related-question texts.

        The list is rebuilt on every call. A question block without a
        ``span`` is skipped.
        """
        questions = []
        for question_element in self.soup.find_all(
            "div", class_="related-question-pair"
        ):
            span = question_element.find("span")
            if span is None:
                continue
            question = span.text.strip()
            # Routed through the logger like every other message in this class.
            logger.mesg(question)
            questions.append(question)

        self.related_questions = questions
        logger.success(f"- {len(self.related_questions)} related questions")

    def extract(self, html_path):
        """Parse the page at *html_path* and return everything extracted.

        Returns:
            A dict with the keys ``"query"`` (str), ``"query_results"``
            (list of result dicts) and ``"related_questions"`` (list of str).
            The same dict is stored in ``self.search_results``.
        """
        self.load_html(html_path)
        self.extract_query_results()
        self.extract_related_questions()
        self.search_results = {
            "query": self.query,
            "query_results": self.query_results,
            "related_questions": self.related_questions,
        }
        return self.search_results
if __name__ == "__main__":
    # Manual run: extract from a sample page stored in the "files" directory
    # that sits next to this module's parent directory.
    # Another sample page name that has been used here: "python教程".
    sample_name = "python_tutorials"
    sample_dir = Path(__file__).parents[1] / "files"
    QueryResultsExtractor().extract(sample_dir / f"{sample_name}.html")
|