document_loaders / toolbox /to_markdown /html_to_markdown.py
HoneyTian's picture
first commit
e94100d
raw
history blame
8.29 kB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import re
import shutil
import tempfile
import uuid
from markdownify import markdownify as md
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager, DriverCacheManager
from bs4 import BeautifulSoup
from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace with a single ``filename`` attribute: the path of
        the HTML file to convert. Defaults to the sample file
        ``data/files/html/nxcloud.html`` under ``project_path``.
    """
    # Another sample that has been used here: "data/files/html/nxlink.html".
    default_html = project_path / "data/files/html/nxcloud.html"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--filename", default=default_html.as_posix(), type=str)
    return arg_parser.parse_args()
class HtmlPreprocess(object):
    """Regex-based helpers that simplify an HTML string before conversion.

    Every helper is a static method that takes the HTML text and returns a new
    string; none of them parse the document, so they operate on the raw text.
    """

    @staticmethod
    def remove_comment(html_doc: str):
        """Drop ``<!-- ... -->`` comments, including multi-line ones."""
        return re.sub(r"<!--.*?-->", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_img(html_doc: str):
        """Drop ``<img ...>`` tags."""
        return re.sub(r"<img.*?>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_multiple_newlines(html_doc: str):
        """Collapse runs of blank (or whitespace-only) lines into one newline."""
        return re.sub(r"(\n\s*\n)+", "\n", html_doc)

    @staticmethod
    def remove_no_script(html_doc: str):
        """Drop ``<noscript>...</noscript>`` elements together with their content."""
        return re.sub(r"<noscript>.*?</noscript>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_script(html_doc: str):
        """Drop ``<script ...>...</script>`` elements together with their content."""
        return re.sub(r"<script.*?</script>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_style(html_doc: str):
        """Drop ``<style ...>...</style>`` elements together with their content."""
        return re.sub(r"<style.*?</style>", "", html_doc, flags=re.DOTALL)

    # NOTE: the attribute patterns below use ``[^"]*`` rather than ``.+?``.
    # ``.+?`` needs at least one character, so an empty value (``class=""``)
    # made the match run on to the next quote and swallow the following
    # attribute, corrupting the tag.

    @staticmethod
    def remove_class_property(html_doc: str):
        """Strip double-quoted ``class="..."`` attributes (empty values included)."""
        return re.sub(r' class="[^"]*"', "", html_doc)

    @staticmethod
    def remove_id_property(html_doc: str):
        """Strip double-quoted ``id="..."`` attributes (empty values included)."""
        return re.sub(r' id="[^"]*"', "", html_doc)

    @staticmethod
    def remove_onclick_property(html_doc: str):
        """Strip double-quoted ``onclick="..."`` attributes (empty values included)."""
        return re.sub(r' onclick="[^"]*"', "", html_doc)

    @staticmethod
    def remove_style_property(html_doc: str):
        """Strip ``style`` attributes written with double or single quotes."""
        html_doc = re.sub(r' style="[^"]*"', "", html_doc)
        html_doc = re.sub(r" style='[^']*'", "", html_doc)
        return html_doc

    @staticmethod
    def replace_a(html_doc: str):
        """Replace each ``<a ...>text</a>`` with its inner content."""
        return re.sub(r"<a\b[^>]*>(.*?)</a>", r"\1", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_br(html_doc: str):
        """Turn ``<br>``, ``<br/>`` and ``<br />`` into newline characters."""
        return re.sub(r"(<br>|<br/>|<br />)", "\n", html_doc)

    @staticmethod
    def replace_div(html_doc: str):
        """Unwrap ``<div>`` elements, keeping only their content.

        Opening and closing tags are removed independently. Pairing each
        opening tag with the first ``</div>`` breaks on nested divs and leaves
        stray, unbalanced tags behind.
        """
        return re.sub(r"</?div\b[^>]*>", "", html_doc)
@BaseToMarkdown.register("html_markdownify")
class HtmlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Convert a local HTML file to Markdown using regex clean-up + markdownify.

    Registered with ``BaseToMarkdown`` under the name ``"html_markdownify"``.
    """

    def __init__(self, filename: str):
        """Read ``filename`` as UTF-8 and store a prettified copy of its HTML.

        Args:
            filename: path of the HTML file to convert.
        """
        super().__init__(filename)

        # NOTE(review): self.filename is presumably set by
        # BaseToMarkdown.__init__ — confirm against the base class.
        with open(self.filename, "r", encoding="utf-8") as f:
            raw_html = f.read()

        # prettify() re-serializes the parsed document into normalized markup.
        self.html_doc = BeautifulSoup(raw_html, "html.parser").prettify()

    def get_md_text(self) -> str:
        """Return the Markdown rendering of the loaded HTML document."""
        options = {
            "strip": ["script"],
            "autolinks": False,
        }

        html_doc = self.html_doc
        # Un-escape angle brackets so escaped markup is handled like real tags
        # by the clean-up steps below.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # Order matters: elements are dropped first, then attributes, then the
        # remaining wrapper tags are unwrapped, and finally blank lines collapse.
        cleanup_steps = (
            self.remove_comment,
            self.remove_img,
            self.remove_no_script,
            self.remove_script,
            self.remove_style,
            self.remove_class_property,
            self.remove_id_property,
            self.remove_onclick_property,
            self.remove_style_property,
            self.replace_a,
            self.replace_br,
            self.replace_div,
            self.remove_multiple_newlines,
        )
        for step in cleanup_steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        return self.remove_multiple_newlines(md_text)

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temp directory and zip it into ``output_dir``.

        Args:
            output_dir: directory in which the ``<uuid>.zip`` archive is created.

        Returns:
            Path of the created zip file.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")

            # zip (zip_directory is not defined in this class; presumably
            # inherited from BaseToMarkdown)
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the scratch directory, even when conversion or
            # zipping fails, so failed runs do not pile up in the temp dir.
            shutil.rmtree(temp_dir)
        return output_zip_file
class UrlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Fetch a web page with Chrome (Selenium) and convert its HTML to Markdown."""

    def __init__(self, url: str):
        """Download ``url`` in a browser and store a prettified copy of its HTML.

        Args:
            url: address of the page to convert.
        """
        super().__init__(url)
        self.url = url

        raw_html = self.get_url_content(url)

        # prettify() re-serializes the parsed document into normalized markup.
        self.html_doc = BeautifulSoup(raw_html, "html.parser").prettify()

    def get_url_content(self, url: str):
        """Load ``url`` in Chrome, scroll to the bottom and return the page source.

        The chromedriver binary is installed through webdriver_manager, cached
        under ``project_path / "data"``.
        """
        chrome_driver_manager = ChromeDriverManager(
            cache_manager=DriverCacheManager(
                root_dir=(project_path / "data").as_posix()
            )
        )
        driver_path = chrome_driver_manager.install()
        print(f"driver_path: {driver_path}")

        # Selenium's Service takes the driver binary as ``executable_path``;
        # an unknown keyword such as ``driver_path`` is silently ignored and
        # the downloaded driver would never be used.
        driver = webdriver.Chrome(
            service=Service(executable_path=driver_path),
        )
        try:
            driver.get(url)
            # Scroll to the bottom of the page before reading its source.
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            html_doc = driver.page_source
        finally:
            # Always shut the browser down, even if loading the page fails,
            # so no Chrome/chromedriver process is left behind.
            driver.quit()
        return html_doc

    def get_md_text(self) -> str:
        """Return the Markdown rendering of the fetched HTML document."""
        options = {
            "strip": ["script"],
            "autolinks": False,
        }

        html_doc = self.html_doc
        # Un-escape angle brackets so escaped markup is handled like real tags
        # by the clean-up steps below.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # Order matters: elements are dropped first, then attributes, then the
        # remaining wrapper tags are unwrapped, and finally blank lines collapse.
        cleanup_steps = (
            self.remove_comment,
            self.remove_img,
            self.remove_no_script,
            self.remove_script,
            self.remove_style,
            self.remove_class_property,
            self.remove_id_property,
            self.remove_onclick_property,
            self.remove_style_property,
            self.replace_a,
            self.replace_br,
            self.replace_div,
            self.remove_multiple_newlines,
        )
        for step in cleanup_steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        return self.remove_multiple_newlines(md_text)

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temp directory and zip it into ``output_dir``.

        Args:
            output_dir: directory in which the ``<uuid>.zip`` archive is created.

        Returns:
            Path of the created zip file.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")

            # zip (zip_directory is not defined in this class; presumably
            # inherited from BaseToMarkdown)
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the scratch directory, even when conversion or
            # zipping fails, so failed runs do not pile up in the temp dir.
            shutil.rmtree(temp_dir)
        return output_zip_file
def main():
    """Convert the HTML file given by ``--filename`` and print the zip path.

    The archive is written to the current working directory.
    """
    cli_args = get_args()
    converter = HtmlToMarkdown(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
def main2():
    """Convert the Baidu home page to Markdown and print the zip path.

    The archive is written to the current working directory.
    """
    # The parsed arguments are not used (the URL is hard-coded), but the
    # command line is still parsed, so argv is validated as before.
    get_args()

    converter = UrlToMarkdown("https://www.baidu.com/")
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
if __name__ == "__main__":
    # Script entry point: currently runs the URL demo (main2).
    # Swap to main() to convert the local HTML file given by --filename.
    # main()
    main2()