Hansimov committed on
Commit
a636bcb
1 Parent(s): af2c647

:recycle: [Refactor] WebpageContentExtractor: Separate html and markdown processing

Browse files
documents/webpage_content_extractor.py CHANGED
@@ -1,16 +1,12 @@
1
  import re
2
  from pathlib import Path
3
  from pprint import pprint
4
- from bs4 import BeautifulSoup, Comment, NavigableString, Tag
5
  from tiktoken import get_encoding as tiktoken_get_encoding
6
  from utils.logger import logger
7
  from markdownify import markdownify
8
- from networks.network_configs import IGNORE_CLASSES
9
-
10
- # from trafilatura import extract as extract_text_from_html
11
- # from inscriptis import get_text as extract_text_from_html
12
- # from html_text import extract_text as extract_text_from_html
13
- # from readabilipy import simple_json_from_html_string as extract_text_from_html
14
 
15
 
16
  class WebpageContentExtractor:
@@ -22,11 +18,22 @@ class WebpageContentExtractor:
22
  token_count = len(tokens)
23
  return token_count
24
 
25
- def filter_html_str(self, html_str):
26
- soup = BeautifulSoup(html_str, "html.parser")
 
 
 
 
 
 
 
27
 
28
- ignore_tags = ["script", "style", "button"]
29
 
 
 
 
 
30
  ignore_classes_pattern = f'{"|".join(IGNORE_CLASSES)}'
31
  removed_element_counts = 0
32
  for element in soup.find_all():
@@ -48,25 +55,22 @@ class WebpageContentExtractor:
48
 
49
  if (
50
  (not element.text.strip())
51
- or (element.name in ignore_tags)
52
  or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
53
  or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
54
  ):
55
- try:
56
- logger.note(f"Removing:\n{element}")
57
- logger.warn(class_str)
58
- except:
59
- # logger.note(f"Removing unknown element")
60
- pass
61
  element.decompose()
62
  removed_element_counts += 1
63
 
64
- logger.note(
65
- f"Elements Removed/Remained: {removed_element_counts}/{len(soup.find_all())}"
 
66
  )
67
 
68
  html_str = str(soup)
69
- return html_str
 
 
70
 
71
  def extract(self, html_path):
72
  logger.note(f"Extracting content from: {html_path}")
@@ -78,26 +82,9 @@ class WebpageContentExtractor:
78
  with open(html_path, "r", encoding="utf-8") as rf:
79
  html_str = rf.read()
80
 
81
- html_str = self.filter_html_str(html_str)
82
-
83
- # self.main_content = extract_text_from_html(html_str)
84
-
85
- # # when using `readabilipy`
86
- # self.main_content = extract_text_from_html(html_str)["plain_content"]
87
- # self.main_content = "\n".join(
88
- # item["text"] for item in extract_text_from_html(html_str)["plain_text"]
89
- # )
90
- # self.main_content = markdownify(extract_text_from_html(html_str)["content"])
91
-
92
- # self.main_content = markdownify(extract_text_from_html(html_str))
93
-
94
- self.main_content = markdownify(html_str, strip="a")
95
- self.main_content = re.sub(r"\n{3,}", "\n\n", self.main_content)
96
- # logger.line(self.main_content)
97
- # pprint(self.main_content)
98
- token_count = self.count_tokens(self.main_content)
99
- logger.note(f"Token Count: {token_count}")
100
- return self.main_content
101
 
102
 
103
  if __name__ == "__main__":
 
1
  import re
2
  from pathlib import Path
3
  from pprint import pprint
4
+ from bs4 import BeautifulSoup
5
  from tiktoken import get_encoding as tiktoken_get_encoding
6
  from utils.logger import logger
7
  from markdownify import markdownify
8
+ from networks.network_configs import IGNORE_TAGS, IGNORE_CLASSES
9
+ from termcolor import colored
 
 
 
 
10
 
11
 
12
  class WebpageContentExtractor:
 
18
  token_count = len(tokens)
19
  return token_count
20
 
21
+ def html_to_markdown(self, html_str, ignore_links=True):
22
+ if ignore_links:
23
+ markdown_str = markdownify(html_str, strip="a")
24
+ else:
25
+ markdown_str = markdownify(html_str)
26
+ markdown_str = re.sub(r"\n{3,}", "\n\n", markdown_str)
27
+
28
+ self.markdown_token_count = self.count_tokens(markdown_str)
29
+ logger.mesg(f'- Tokens: {colored(self.markdown_token_count,"light_green")}')
30
 
31
+ self.markdown_str = markdown_str
32
 
33
+ return self.markdown_str
34
+
35
+ def remove_elements_from_html(self, html_str):
36
+ soup = BeautifulSoup(html_str, "html.parser")
37
  ignore_classes_pattern = f'{"|".join(IGNORE_CLASSES)}'
38
  removed_element_counts = 0
39
  for element in soup.find_all():
 
55
 
56
  if (
57
  (not element.text.strip())
58
+ or (element.name in IGNORE_TAGS)
59
  or (re.search(ignore_classes_pattern, class_str, flags=re.IGNORECASE))
60
  or (re.search(ignore_classes_pattern, id_str, flags=re.IGNORECASE))
61
  ):
 
 
 
 
 
 
62
  element.decompose()
63
  removed_element_counts += 1
64
 
65
+ logger.mesg(
66
+ f"- Elements: "
67
+ f'{colored(len(soup.find_all()),"light_green")} / {colored(removed_element_counts,"light_red")}'
68
  )
69
 
70
  html_str = str(soup)
71
+ self.html_str = html_str
72
+
73
+ return self.html_str
74
 
75
  def extract(self, html_path):
76
  logger.note(f"Extracting content from: {html_path}")
 
82
  with open(html_path, "r", encoding="utf-8") as rf:
83
  html_str = rf.read()
84
 
85
+ html_str = self.remove_elements_from_html(html_str)
86
+ markdown_str = self.html_to_markdown(html_str)
87
+ return markdown_str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
 
89
 
90
  if __name__ == "__main__":
networks/network_configs.py CHANGED
@@ -1,3 +1,4 @@
 
1
  IGNORE_CLASSES = [
2
  "sidebar",
3
  "footer",
 
1
+ IGNORE_TAGS = ["script", "style", "button"]
2
  IGNORE_CLASSES = [
3
  "sidebar",
4
  "footer",