from bs4 import BeautifulSoup from pathlib import Path from utils.logger import logger class QueryResultsExtractor: def __init__(self) -> None: self.query_results = [] self.related_questions = [] def load_html(self, html_path): with open(html_path, "r", encoding="utf-8") as f: html = f.read() self.soup = BeautifulSoup(html, "html.parser") def extract_query_results(self): self.query = self.soup.find("textarea").text.strip() query_result_elements = self.soup.find_all("div", class_="g") for idx, result in enumerate(query_result_elements): cite_tag = result.find("cite") if cite_tag: site = cite_tag.find_previous("span").text.strip() else: site = "Unknown" url = result.find("a")["href"] title_element = result.find("h3") if title_element: title = title_element.text.strip() else: title = "Unknown Title" abstract_element_conditions = [ {"data-sncf": "1"}, {"class_": "ITZIwc"}, ] for condition in abstract_element_conditions: abstract_element = result.find("div", condition) if abstract_element is not None: abstract = abstract_element.text.strip() break else: abstract = "" logger.mesg( f"{title}\n" f" - {site}\n" f" - {url}\n" f" - {abstract}\n" f"\n" ) self.query_results.append( { "title": title, "site": site, "url": url, "abstract": abstract, "index": idx, "type": "web", } ) logger.success(f"- {len(query_result_elements)} query results") def extract_related_questions(self): related_question_elements = self.soup.find_all( "div", class_="related-question-pair" ) for question_element in related_question_elements: question = question_element.find("span").text.strip() print(question) self.related_questions.append(question) logger.success(f"- {len(self.related_questions)} related questions") def extract(self, html_path): self.load_html(html_path) self.extract_query_results() self.extract_related_questions() self.search_results = { "query": self.query, "query_results": self.query_results, "related_questions": self.related_questions, } return self.search_results if __name__ == "__main__": html_path_root = Path(__file__).parents[1] / "files" # html_filename = "python教程" html_filename = "python_tutorials" html_path = html_path_root / f"{html_filename}.html" extractor = QueryResultsExtractor() extractor.extract(html_path)