Spaces:
Sleeping
Sleeping
from bs4 import BeautifulSoup | |
from pathlib import Path | |
from utils.logger import logger | |
class QueryResultsExtractor:
    """Pull the query, web results and related questions out of a saved HTML page.

    The selectors used (``div.g``, ``data-sncf``, ``ITZIwc``,
    ``related-question-pair``) presumably target Google's result-page markup;
    verify against the pages stored for this project.

    Typical use is ``QueryResultsExtractor().extract(html_path)``. The parsed
    data is also kept on the instance as ``query``, ``query_results``,
    ``related_questions`` and ``search_results``.
    """

    # Attribute filters tried, in order, to locate a result's abstract <div>.
    # Inside an attrs dict the key must be "class": "class_" is only the
    # keyword-argument spelling and, as a dict key, would look for a literal
    # `class_` attribute that never exists.
    _ABSTRACT_ATTRS = (
        {"data-sncf": "1"},
        {"class": "ITZIwc"},
    )

    def __init__(self) -> None:
        self.soup = None  # set by load_html()
        self.query = ""
        self.query_results = []
        self.related_questions = []

    def load_html(self, html_path):
        """Read the UTF-8 file at *html_path* and parse it into ``self.soup``."""
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    @staticmethod
    def _text_or(element, default):
        """Return the stripped text of *element*, or *default* if it is None."""
        return default if element is None else element.text.strip()

    @classmethod
    def _parse_result(cls, result, position):
        """Turn one result element into a result dict.

        Args:
            result: element for a single result (a ``div.g`` tag).
            position: 1-based rank of the element on the page.

        Returns:
            A dict with ``title``, ``site``, ``url``, ``abstract``,
            ``position`` and ``type`` keys, or ``None`` when the element has
            no link (no ``<a>`` or an empty/missing ``href``), since such an
            entry cannot be used as a search result.
        """
        anchor = result.find("a")
        url = anchor.get("href") if anchor is not None else None
        if not url:
            return None

        # The site name is the <span> that precedes the <cite> tag.
        cite_tag = result.find("cite")
        site_element = cite_tag.find_previous("span") if cite_tag is not None else None
        site = cls._text_or(site_element, "Unknown")

        title = cls._text_or(result.find("h3"), "Unknown Title")

        abstract = ""
        for attrs in cls._ABSTRACT_ATTRS:
            abstract_element = result.find("div", attrs)
            if abstract_element is not None:
                abstract = abstract_element.text.strip()
                break

        return {
            "title": title,
            "site": site,
            "url": url,
            "abstract": abstract,
            "position": position,
            "type": "web",
        }

    def extract_query_results(self):
        """Fill ``self.query`` and ``self.query_results`` from the loaded page.

        ``self.query`` is the text of the first ``<textarea>`` (empty string
        when the page has none). ``self.query_results`` is rebuilt from
        scratch on every call so results from a previously parsed page never
        leak into the current one. Elements without a link are skipped.
        """
        self.query = self._text_or(self.soup.find("textarea"), "")
        self.query_results = []

        query_result_elements = self.soup.find_all("div", class_="g")
        for idx, result in enumerate(query_result_elements):
            parsed = self._parse_result(result, idx + 1)
            if parsed is None:
                continue
            logger.mesg(
                f"{parsed['title']}\n"
                f" - {parsed['site']}\n"
                f" - {parsed['url']}\n"
                f" - {parsed['abstract']}\n"
                f"\n"
            )
            self.query_results.append(parsed)
        logger.success(f"- {len(self.query_results)} query results")

    def extract_related_questions(self):
        """Fill ``self.related_questions`` from the loaded page.

        Each ``div.related-question-pair`` contributes the stripped text of
        its first ``<span>``; pairs without a ``<span>`` are skipped. The list
        is rebuilt from scratch on every call.
        """
        self.related_questions = []
        related_question_elements = self.soup.find_all(
            "div", class_="related-question-pair"
        )
        for question_element in related_question_elements:
            span = question_element.find("span")
            if span is None:
                continue
            question = span.text.strip()
            # Use the logger instead of a bare print, like the rest of the class.
            logger.mesg(question)
            self.related_questions.append(question)
        logger.success(f"- {len(self.related_questions)} related questions")

    def extract(self, html_path):
        """Parse the page at *html_path* and return everything extracted.

        Returns:
            A dict with keys ``query``, ``query_results`` and
            ``related_questions``; it is also stored as ``self.search_results``.
        """
        self.load_html(html_path)
        self.extract_query_results()
        self.extract_related_questions()
        self.search_results = {
            "query": self.query,
            "query_results": self.query_results,
            "related_questions": self.related_questions,
        }
        return self.search_results
if __name__ == "__main__":
    # Manual run: parse one saved page from the "files" directory located
    # under the parent of this module's directory.
    sample_stem = "python_tutorials"  # alternative stem used earlier: "python教程"
    sample_dir = Path(__file__).parents[1] / "files"
    sample_page = sample_dir / f"{sample_stem}.html"
    QueryResultsExtractor().extract(sample_page)