Spaces:
Sleeping
Sleeping
File size: 3,962 Bytes
e773696 4d3e890 e773696 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
import re
from pathlib import Path
from pprint import pprint
from bs4 import BeautifulSoup, Comment, NavigableString, Tag
from tiktoken import get_encoding as tiktoken_get_encoding
from utils.logger import logger
from markdownify import markdownify
# from trafilatura import extract as extract_text_from_html
# from inscriptis import get_text as extract_text_from_html
# from html_text import extract_text as extract_text_from_html
# from readabilipy import simple_json_from_html_string as extract_text_from_html
class WebpageContentExtractor:
    """Reduce a saved HTML page to Markdown text and report its token count.

    The pipeline is: remove boilerplate elements with BeautifulSoup, convert
    the remaining HTML to Markdown with ``markdownify``, collapse runs of
    blank lines, and count tokens with the ``cl100k_base`` tiktoken encoding.
    """

    # Tags that are removed regardless of their content.
    IGNORE_TAGS = ("script", "style", "button")
    # Substrings that mark an element as page chrome when they occur
    # (case-insensitively) anywhere in its ``class`` or ``id`` attribute.
    IGNORE_CLASSES = (
        "sidebar",
        "footer",
        "related",
        "comment",
        "topbar",
        "menu",
        "offcanvas",
        "navbar",
    )
    # Compiled once; the pattern is identical for every call and element.
    IGNORE_CLASSES_PATTERN = re.compile("|".join(IGNORE_CLASSES), flags=re.IGNORECASE)

    def __init__(self):
        """Load the tokenizer used by :meth:`count_tokens`."""
        self.tokenizer = tiktoken_get_encoding("cl100k_base")

    def count_tokens(self, text):
        """Return the number of tokens ``self.tokenizer`` produces for ``text``."""
        return len(self.tokenizer.encode(text))

    @staticmethod
    def _attr_text(element, attr_name):
        """Return attribute ``attr_name`` of ``element`` as one string ("" if absent).

        BeautifulSoup returns multi-valued attributes such as ``class`` as a
        list; those values are joined with single spaces.
        """
        value = element.get(attr_name)
        if not value:
            return ""
        if isinstance(value, str):
            return value
        return " ".join(value)

    def filter_html_str(self, html_str):
        """Strip boilerplate elements from ``html_str`` and return the new HTML.

        An element is removed (together with its whole subtree) when any of
        these holds:

        * it contains no non-whitespace text,
        * its tag name is listed in ``IGNORE_TAGS``,
        * its ``class`` or ``id`` attribute matches ``IGNORE_CLASSES_PATTERN``.

        The number of removed and remaining elements is logged.
        """
        soup = BeautifulSoup(html_str, "html.parser")
        pattern = self.IGNORE_CLASSES_PATTERN
        removed_element_counts = 0

        # ``find_all()`` returns a snapshot, so it still lists the descendants
        # of elements decomposed earlier in this loop. Those objects have been
        # destroyed by ``decompose()`` and must not be touched again.
        for element in soup.find_all():
            if element.decomposed:
                continue

            class_str = self._attr_text(element, "class")
            id_str = self._attr_text(element, "id")

            should_remove = (
                not element.text.strip()
                or element.name in self.IGNORE_TAGS
                or pattern.search(class_str)
                or pattern.search(id_str)
            )
            if not should_remove:
                continue

            # Count the element plus every tag inside it before destroying
            # the subtree, so the logged total covers all removed elements.
            removed_element_counts += 1 + len(element.find_all())
            element.decompose()

        logger.note(
            f"Elements Removed/Remained: {removed_element_counts}/{len(soup.find_all())}"
        )
        return str(soup)

    def extract(self, html_path):
        """Read the HTML file at ``html_path`` and return its content as Markdown.

        The result is also stored on ``self.main_content``. The token count of
        the result is logged.
        """
        logger.note(f"Extracing content from:{html_path}")
        with open(html_path, "r", encoding="utf-8") as f:
            html_str = f.read()

        html_str = self.filter_html_str(html_str)

        # ``strip`` takes a list of tag names; <a> tags are stripped so links
        # are not rendered as Markdown links.
        markdown = markdownify(html_str, strip=["a"])
        # Collapse three or more consecutive newlines into one blank line.
        self.main_content = re.sub(r"\n{3,}", "\n\n", markdown)

        token_count = self.count_tokens(self.main_content)
        logger.note(f"Token Count: {token_count}")
        return self.main_content
if __name__ == "__main__":
    # Manual smoke run against a saved sample page under <repo>/files/urls.
    # Other sample pages that have been used here:
    #   stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html
    #   docs.python.org_zh-cn_3_tutorial_interpreter.html
    html_path = Path(__file__).parents[1].joinpath(
        "files",
        "urls",
        "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html",
    )
    extractor = WebpageContentExtractor()
    main_content = extractor.extract(html_path)
|