from bs4 import BeautifulSoup
from pathlib import Path
from utils.logger import logger


class QueryResultsExtractor:
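    """Extract structured results from a saved search-results HTML page.

    The CSS hooks used below (the "g" result blocks, "related-question-pair",
    the query textarea) match the Google results markup this script was
    written against and may change upstream.
    """
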
    def __init__(self) -> None:
        self.query_results = []
        self.related_questions = []

    def load_html(self, html_path):
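        """Read the saved HTML file and parse it into self.soup."""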
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        self.soup = BeautifulSoup(html, "html.parser")

    def extract_query_results(self):
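        """Collect title, site, url, abstract, and position for each result block."""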
        self.query = self.soup.find("textarea").text.strip()
        query_result_elements = self.soup.find_all("div", class_="g")
        for idx, result in enumerate(query_result_elements):
            cite_tag = result.find("cite")
            if cite_tag:
                site = cite_tag.find_previous("span").text.strip()
            else:
                site = "Unknown"
            link_element = result.find("a")
            url = link_element["href"] if link_element else ""
            title_element = result.find("h3")
            if title_element:
                title = title_element.text.strip()
            else:
                title = "Unknown Title"
    
            # Try known abstract containers in order; BeautifulSoup's attrs
            # dict expects the real attribute name "class", not "class_".
            abstract_element_conditions = [
                {"data-sncf": "1"},
                {"class": "ITZIwc"},
            ]
            for condition in abstract_element_conditions:
                abstract_element = result.find("div", condition)
                if abstract_element is not None:
                    abstract = abstract_element.text.strip()
                    break
            else:
                abstract = ""
    
            logger.mesg(f"{title}\n  - {site}\n  - {url}\n  - {abstract}\n\n")
            self.query_results.append(
                {
                    "title": title,
                    "site": site,
                    "url": url,
                    "abstract": abstract,
                    "position": idx + 1,
                    "type": "web",
                }
            )
        logger.success(f"- {len(query_result_elements)} query results")

    def extract_related_questions(self):
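        """Collect the "People also ask"-style related questions, if any."""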
        related_question_elements = self.soup.find_all(
            "div", class_="related-question-pair"
        )
        for question_element in related_question_elements:
            question = question_element.find("span").text.strip()
            logger.mesg(f"- {question}")
            self.related_questions.append(question)
        logger.success(f"- {len(self.related_questions)} related questions")

    def extract(self, html_path):
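        """Run the full extraction on one HTML file and return the combined dict."""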
        self.load_html(html_path)
        self.extract_query_results()
        self.extract_related_questions()
        self.search_results = {
            "query": self.query,
            "query_results": self.query_results,
            "related_questions": self.related_questions,
        }
        return self.search_results


if __name__ == "__main__":
    html_path_root = Path(__file__).parents[1] / "files"
    # html_filename = "python教程"  # Chinese query: "python tutorial"
    html_filename = "python_tutorials"
    html_path = html_path_root / f"{html_filename}.html"
    extractor = QueryResultsExtractor()
    extractor.extract(html_path)
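
    # Sketch of an optional follow-up step (not part of the original script):
    # persist the extracted results as JSON next to the input HTML.
    # The output filename is an assumption chosen for illustration.
    import json

    output_path = html_path_root / f"{html_filename}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(extractor.search_results, f, ensure_ascii=False, indent=2)
    logger.success(f"- Results saved to: {output_path}")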