document_loaders / toolbox /to_markdown /
HoneyTian's picture
first commit
history blame
8.29 kB
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import re
import shutil
import tempfile
import uuid

from bs4 import BeautifulSoup
from markdownify import markdownify as md
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager, DriverCacheManager

from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
def get_args():
    """Parse the command-line arguments of this script.

    Returns:
        argparse.Namespace: carries ``filename``, the path of the HTML file
        to convert (read by ``main`` as ``args.filename``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        # Alternative sample input:
        # default=(project_path / "data/files/html/nxlink.html").as_posix(),
        default=(project_path / "data/files/html/nxcloud.html").as_posix(),
        type=str,
    )
    args = parser.parse_args()
    return args
class HtmlPreprocess(object):
    """Regex-based helpers that strip or unwrap pieces of an HTML string.

    Every helper is a ``staticmethod`` taking the HTML text and returning the
    transformed text, so it can be called either on the class or through an
    instance of a subclass (``self.remove_comment(html_doc)``).

    All patterns are applied with ``re.DOTALL`` so a construct that spans
    several lines is still matched.
    """

    @staticmethod
    def remove_comment(html_doc: str) -> str:
        """Delete ``<!-- ... -->`` comments."""
        return re.sub(r"<!--.*?-->", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_img(html_doc: str) -> str:
        """Delete ``<img ...>`` tags."""
        return re.sub(r"<img.*?>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_multiple_newlines(html_doc: str) -> str:
        """Collapse runs of blank (whitespace-only) lines into one newline."""
        return re.sub(r"(\n\s*\n)+", "\n", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_no_script(html_doc: str) -> str:
        """Delete ``<noscript>...</noscript>`` elements with their content."""
        return re.sub(r"<noscript>.*?</noscript>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_script(html_doc: str) -> str:
        """Delete ``<script ...>...</script>`` elements with their content."""
        return re.sub(r"<script.*?</script>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_style(html_doc: str) -> str:
        """Delete ``<style ...>...</style>`` elements with their content."""
        return re.sub(r"<style.*?</style>", "", html_doc, flags=re.DOTALL)

    # The attribute strippers below match the value with ``[^"]*`` rather than
    # ``.+?``: the latter needs at least one character, so on an empty value
    # (``class=""``) it ran on to the quote of the *next* attribute and
    # swallowed it.

    @staticmethod
    def remove_class_property(html_doc: str) -> str:
        """Delete double-quoted ``class="..."`` attributes."""
        return re.sub(r' class="[^"]*"', "", html_doc)

    @staticmethod
    def remove_id_property(html_doc: str) -> str:
        """Delete double-quoted ``id="..."`` attributes."""
        return re.sub(r' id="[^"]*"', "", html_doc)

    @staticmethod
    def remove_onclick_property(html_doc: str) -> str:
        """Delete double-quoted ``onclick="..."`` attributes."""
        return re.sub(r' onclick="[^"]*"', "", html_doc)

    @staticmethod
    def remove_style_property(html_doc: str) -> str:
        """Delete ``style`` attributes, double- or single-quoted."""
        html_doc = re.sub(r' style="[^"]*"', "", html_doc)
        html_doc = re.sub(r" style='[^']*'", "", html_doc)
        return html_doc

    @staticmethod
    def replace_a(html_doc: str) -> str:
        """Replace each ``<a ...>text</a>`` with its inner content."""
        return re.sub(r"<a\b[^>]*>(.*?)</a>", r"\1", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_br(html_doc: str) -> str:
        """Replace ``<br>``, ``<br/>`` and ``<br />`` with a newline."""
        return re.sub(r"(<br>|<br/>|<br />)", "\n", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_div(html_doc: str) -> str:
        """Replace each ``<div ...>content</div>`` with its inner content.

        The match is lazy up to the first ``</div>``, so with nested divs only
        the outermost opening tag and the innermost closing tag of each match
        are removed in one pass.
        """
        return re.sub(r"<div\b[^>]*>(.*?)</div>", r"\1", html_doc, flags=re.DOTALL)
class HtmlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Convert a local HTML file to Markdown and package the result as a zip."""

    def __init__(self, filename: str):
        """Read ``filename`` and keep a prettified copy of its HTML.

        Args:
            filename: path of a UTF-8 encoded HTML file.
        """
        # NOTE(review): ``self.filename`` is read below but never assigned in
        # this class, so it is presumably set by ``BaseToMarkdown.__init__`` --
        # confirm that the base constructor accepts ``filename``.
        super().__init__(filename)
        with open(self.filename, "r", encoding="utf-8") as f:
            html_doc = f.read()
        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_md_text(self) -> str:
        """Return the Markdown rendering of the loaded HTML.

        The HTML is first simplified with the ``HtmlPreprocess`` helpers, then
        converted with ``markdownify`` and finally stripped of blank lines.
        """
        options = {
            "strip": ["script"],
            "autolinks": False,
        }
        html_doc = self.html_doc
        # Un-escape angle brackets before the regex passes run, so escaped
        # markup is treated like any other tag by the steps below.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # Order matters: whole elements go first, then attributes, then
        # wrapper tags are unwrapped, then whitespace is normalised.
        preprocess_steps = (
            self.remove_comment,
            self.remove_img,
            self.remove_no_script,
            self.remove_script,
            self.remove_style,
            self.remove_class_property,
            self.remove_id_property,
            self.remove_onclick_property,
            self.remove_style_property,
            self.replace_a,
            self.replace_br,
            self.replace_div,
            self.remove_multiple_newlines,
        )
        for step in preprocess_steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        md_text = self.remove_multiple_newlines(md_text)
        return md_text

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temp directory and zip it into ``output_dir``.

        Args:
            output_dir: directory that receives ``<uuid>.zip``.

        Returns:
            str: path of the created zip file.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # zip
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # The staging directory is only needed until the archive is
            # written; remove it so repeated calls do not pile up temp data.
            shutil.rmtree(temp_dir, ignore_errors=True)
        return output_zip_file
class UrlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Fetch a web page with Chrome, convert it to Markdown and zip the result."""

    def __init__(self, url: str):
        """Download ``url`` and keep a prettified copy of its HTML.

        Args:
            url: address of the page to convert.
        """
        self.url = url
        html_doc = self.get_url_content(url)
        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_url_content(self, url: str):
        """Return the page source of ``url`` as rendered by Chrome.

        A chromedriver binary is installed (or reused) under the project's
        ``data`` directory, the page is opened and scrolled to the bottom
        before the source is read.
        """
        # NOTE(review): the driver-manager and Chrome constructor arguments
        # were truncated in the source; the wiring below follows the imported
        # names (DriverCacheManager, Service) -- confirm against the installed
        # webdriver_manager / selenium versions, and add ChromeOptions
        # (e.g. headless) if the deployment needs them.
        chrome_driver_manager = ChromeDriverManager(
            cache_manager=DriverCacheManager(
                root_dir=(project_path / "data").as_posix()
            )
        )
        driver_path = chrome_driver_manager.install()
        print(f"driver_path: {driver_path}")

        driver = webdriver.Chrome(
            service=Service(driver_path),
        )
        try:
            driver.get(url)
            # Scroll to the bottom of the page before reading the DOM.
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            html_doc = driver.page_source
        finally:
            # Always shut the browser down, even if loading the page failed.
            driver.quit()
        return html_doc

    def get_md_text(self) -> str:
        """Return the Markdown rendering of the downloaded HTML.

        The HTML is first simplified with the ``HtmlPreprocess`` helpers, then
        converted with ``markdownify`` and finally stripped of blank lines.
        """
        options = {
            "strip": ["script"],
            "autolinks": False,
        }
        html_doc = self.html_doc
        # Un-escape angle brackets before the regex passes run, so escaped
        # markup is treated like any other tag by the steps below.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # Order matters: whole elements go first, then attributes, then
        # wrapper tags are unwrapped, then whitespace is normalised.
        preprocess_steps = (
            self.remove_comment,
            self.remove_img,
            self.remove_no_script,
            self.remove_script,
            self.remove_style,
            self.remove_class_property,
            self.remove_id_property,
            self.remove_onclick_property,
            self.remove_style_property,
            self.replace_a,
            self.replace_br,
            self.replace_div,
            self.remove_multiple_newlines,
        )
        for step in preprocess_steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        md_text = self.remove_multiple_newlines(md_text)
        return md_text

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temp directory and zip it into ``output_dir``.

        Args:
            output_dir: directory that receives ``<uuid>.zip``.

        Returns:
            str: path of the created zip file.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # zip
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # The staging directory is only needed until the archive is
            # written; remove it so repeated calls do not pile up temp data.
            shutil.rmtree(temp_dir, ignore_errors=True)
        return output_zip_file
def main():
    """Convert the HTML file named on the command line into a zip in the CWD."""
    cli_args = get_args()
    converter = HtmlToMarkdown(cli_args.filename)
    # The returned archive path is not used by this entry point.
    converter.save_to_zip(output_dir=".")
def main2():
    """Convert a web page into a zip in the CWD.

    NOTE(review): the URL literal is an empty string in this copy of the
    file -- confirm the intended address.
    """
    # Arguments are still parsed (so ``--help`` and bad flags behave as
    # before) even though no parsed value is used here.
    get_args()
    converter = UrlToMarkdown("")
    # The returned archive path is not used by this entry point.
    converter.save_to_zip(output_dir=".")
if __name__ == "__main__":
# main()