Spaces:
Sleeping
Sleeping
File size: 3,092 Bytes
ef3de03 0acc824 ef3de03 0f6452f ef3de03 0f6452f ef3de03 0f6452f fa9ef68 0f6452f e789120 fa9ef68 a315628 fa9ef68 0acc824 0f6452f 8aa7160 0f6452f ef3de03 0acc824 ef3de03 fa9ef68 e789120 ef3de03 f150f6b ef3de03 0f6452f 0acc824 ef3de03 0f6452f f150f6b 0f6452f ef3de03 0f6452f ef3de03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 |
from bs4 import BeautifulSoup
from pathlib import Path
from utils.logger import logger
class QueryResultsExtractor:
    """Parse a saved search-results HTML page into structured data.

    The extractor reads an HTML file, collects every ``<div class="g">``
    element as a web result (title, site, url, abstract, position) and every
    ``<div class="related-question-pair">`` element as a related question.

    NOTE(review): the selectors look like Google's result-page markup --
    confirm against the pages this class is actually fed.
    """

    # Attribute filters tried in order to locate a result's abstract <div>.
    # These are passed as BeautifulSoup ``attrs`` dicts, so the HTML attribute
    # name must be the literal "class" (``class_`` is only meaningful as a
    # keyword argument and would never match inside an attrs dict).
    _ABSTRACT_ATTRS = (
        {"data-sncf": "1"},
        {"class": "ITZIwc"},
    )

    def __init__(self) -> None:
        # Parsed document; set by load_html().
        self.soup = None
        # Text of the page's first <textarea>; set by extract_query_results().
        self.query = ""
        # One dict per extracted web result.
        self.query_results = []
        # Question strings taken from "related-question-pair" blocks.
        self.related_questions = []

    def load_html(self, html_path) -> None:
        """Read the UTF-8 file at ``html_path`` and parse it into ``self.soup``.

        Args:
            html_path: Path (``str`` or ``pathlib.Path``) of the HTML file.

        Raises:
            OSError: If the file cannot be opened.
        """
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    def _find_abstract(self, result) -> str:
        """Return the stripped abstract text of ``result``, or "" if none is found."""
        for attrs in self._ABSTRACT_ATTRS:
            abstract_element = result.find("div", attrs=attrs)
            if abstract_element is not None:
                return abstract_element.text.strip()
        return ""

    def _parse_result(self, result):
        """Build the result dict (without ``position``) for one ``div.g`` element.

        Returns ``None`` when the element contains no ``<a href=...>``, since a
        result without a link cannot be represented.
        """
        link = result.find("a", href=True)
        if link is None:
            return None

        # The site name is the <span> that precedes the <cite> tag, when present.
        site = "Unknown"
        cite_tag = result.find("cite")
        if cite_tag is not None:
            site_span = cite_tag.find_previous("span")
            if site_span is not None:
                site = site_span.text.strip()

        title_element = result.find("h3")
        title = (
            title_element.text.strip()
            if title_element is not None
            else "Unknown Title"
        )

        return {
            "title": title,
            "site": site,
            "url": link["href"],
            "abstract": self._find_abstract(result),
            "type": "web",
        }

    def extract_query_results(self) -> None:
        """Fill ``self.query`` and append web results to ``self.query_results``.

        ``self.query`` becomes the text of the first ``<textarea>`` (or "" when
        the page has none). Elements without a link are skipped; ``position``
        is the 1-based rank among the results extracted by this call.
        """
        query_element = self.soup.find("textarea")
        self.query = query_element.text.strip() if query_element is not None else ""

        position = 0
        for result in self.soup.find_all("div", class_="g"):
            parsed = self._parse_result(result)
            if parsed is None:
                continue
            position += 1

            logger.mesg(
                f"{parsed['title']}\n"
                f" - {parsed['site']}\n"
                f" - {parsed['url']}\n"
                f" - {parsed['abstract']}\n"
                f"\n"
            )
            self.query_results.append(
                {
                    "title": parsed["title"],
                    "site": parsed["site"],
                    "url": parsed["url"],
                    "abstract": parsed["abstract"],
                    "position": position,
                    "type": parsed["type"],
                }
            )
        logger.success(f"- {position} query results")

    def extract_related_questions(self) -> None:
        """Append the text of each related-question block to ``self.related_questions``.

        Blocks that contain no ``<span>`` are skipped.
        """
        related_question_elements = self.soup.find_all(
            "div", class_="related-question-pair"
        )
        for question_element in related_question_elements:
            question_span = question_element.find("span")
            if question_span is None:
                continue
            question = question_span.text.strip()
            # Use the module logger rather than a bare print, matching the
            # per-result logging in extract_query_results().
            logger.mesg(question)
            self.related_questions.append(question)
        logger.success(f"- {len(self.related_questions)} related questions")

    def extract(self, html_path) -> dict:
        """Load ``html_path`` and return everything extracted from it.

        Args:
            html_path: Path (``str`` or ``pathlib.Path``) of the HTML file.

        Returns:
            A dict with keys ``"query"``, ``"query_results"`` and
            ``"related_questions"``; it is also stored as ``self.search_results``.
        """
        # Start from fresh lists so a second call on the same instance neither
        # accumulates duplicates nor mutates a previously returned dict.
        self.query_results = []
        self.related_questions = []

        self.load_html(html_path)
        self.extract_query_results()
        self.extract_related_questions()
        self.search_results = {
            "query": self.query,
            "query_results": self.query_results,
            "related_questions": self.related_questions,
        }
        return self.search_results
if __name__ == "__main__":
    # Manual smoke run: parse one saved results page from the "files"
    # directory located one level above this file's own directory.
    # The original author also left an alternative sample name here: "python教程".
    html_filename = "python_tutorials"
    html_path_root = Path(__file__).parents[1] / "files"
    html_path = html_path_root / f"{html_filename}.html"

    extractor = QueryResultsExtractor()
    extractor.extract(html_path)
|