File size: 3,962 Bytes
e773696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d3e890
e773696
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
from pathlib import Path
from pprint import pprint
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
from tiktoken import get_encoding as tiktoken_get_encoding
from utils.logger import logger
from markdownify import markdownify

# from trafilatura import extract as extract_text_from_html
# from inscriptis import get_text as extract_text_from_html
# from html_text import extract_text as extract_text_from_html
# from readabilipy import simple_json_from_html_string as extract_text_from_html


class WebpageContentExtractor:
    """Extract the main readable content of a webpage HTML file as Markdown.

    Noise elements (scripts, styles, sidebars, footers, ...) are stripped
    from the HTML first, then the remainder is converted to Markdown.
    A tiktoken tokenizer is kept to report the token count of the result.
    """

    def __init__(self):
        # cl100k_base is the tiktoken encoding used by GPT-3.5/GPT-4 models.
        self.tokenizer = tiktoken_get_encoding("cl100k_base")

    def count_tokens(self, text):
        """Return the number of cl100k_base tokens in `text`."""
        return len(self.tokenizer.encode(text))

    def filter_html_str(self, html_str):
        """Remove noise elements from an HTML string and return the result.

        An element is dropped when it has no visible text, its tag is in
        `ignore_tags`, or its class/id contains any of `ignore_classes`
        (case-insensitive substring match).
        """
        soup = BeautifulSoup(html_str, "html.parser")

        ignore_tags = ["script", "style", "button"]
        ignore_classes = [
            "sidebar",
            "footer",
            "related",
            "comment",
            "topbar",
            "menu",
            "offcanvas",
            "navbar",
        ]
        ignore_classes_pattern = "|".join(ignore_classes)

        removed_element_counts = 0
        for element in soup.find_all():
            # Children of an already-removed parent were decomposed with it;
            # bs4 clears their internals, so skip them instead of re-touching.
            if getattr(element, "decomposed", False):
                continue

            # BUG FIX: the original tested `id_str` BEFORE fetching the "id"
            # attribute, so the id was never merged into the class string.
            # Fetch both attributes first, then combine them for matching.
            try:
                class_attr = element.get("class", [])
            except Exception:
                class_attr = []
            try:
                id_str = element.get("id", "") or ""
            except Exception:
                id_str = ""

            class_str = " ".join(class_attr) if class_attr else ""
            if id_str:
                class_str = f"{class_str} {id_str}"

            if (
                (not element.text.strip())
                or (element.name in ignore_tags)
                or re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE)
                or re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE)
            ):
                element.decompose()
                removed_element_counts += 1

        logger.note(
            f"Elements Removed/Remained:  {removed_element_counts}/{len(soup.find_all())}"
        )
        return str(soup)

    def extract(self, html_path):
        """Read an HTML file, filter noise, and return its content as Markdown.

        Also stores the result on `self.main_content` and logs its token count.
        """
        logger.note(f"Extracting content from: {html_path}")
        with open(html_path, "r", encoding="utf-8") as f:
            html_str = f.read()

        html_str = self.filter_html_str(html_str)

        # strip="a" drops link markup, keeping only the link text.
        self.main_content = markdownify(html_str, strip="a")
        # Collapse runs of 3+ newlines into a single blank line.
        self.main_content = re.sub(r"\n{3,}", "\n\n", self.main_content)

        token_count = self.count_tokens(self.main_content)
        logger.note(f"Token Count: {token_count}")
        return self.main_content


if __name__ == "__main__":
    # Sample pages live under <repo>/files/urls; swap the filename to test others.
    urls_dir = Path(__file__).parents[1] / "files" / "urls"
    html_filename = "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html"
    # html_filename = "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html"
    # html_filename = "docs.python.org_zh-cn_3_tutorial_interpreter.html"
    html_path = urls_dir / html_filename
    extractor = WebpageContentExtractor()
    main_content = extractor.extract(html_path)