#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert a local HTML file or a live web page to Markdown, packaged as a zip.

The HTML is first normalised with BeautifulSoup, then cleaned with a series of
regex passes (see ``HtmlPreprocess``), and finally handed to ``markdownify``.
"""
import argparse
import os
from pathlib import Path
import re
import shutil
import tempfile
import uuid

from markdownify import markdownify as md
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager, DriverCacheManager
from bs4 import BeautifulSoup

from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown


# Options forwarded to markdownify by every converter in this module.
_MARKDOWNIFY_OPTIONS = {
    "strip": ["script"],
    "autolinks": False,
}


def get_args() -> argparse.Namespace:
    """Parse command-line arguments.

    Returns:
        Namespace with a single ``filename`` attribute: the path of the HTML
        file to convert (defaults to a sample file under ``project_path``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        # Alternative sample: data/files/html/nxlink.html
        default=(project_path / "data/files/html/nxcloud.html").as_posix(),
        type=str,
    )
    return parser.parse_args()


class HtmlPreprocess(object):
    """Regex-based clean-up passes applied to an HTML string.

    Every pass takes the document as a ``str`` and returns a new ``str``.

    NOTE(review): the tag patterns below (comment, img, noscript, script,
    style, a, br, div) were reconstructed from the method names because the
    previous pattern text was empty or truncated, which made those passes
    no-ops. Confirm them against the intended behaviour.
    """

    @staticmethod
    def remove_comment(html_doc: str) -> str:
        """Remove HTML comments (``<!-- ... -->``), including multi-line ones."""
        return re.sub(r"<!--.*?-->", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_img(html_doc: str) -> str:
        """Remove ``<img ...>`` tags."""
        return re.sub(r"<img\b[^>]*>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_multiple_newlines(html_doc: str) -> str:
        """Collapse runs of blank (whitespace-only) lines into one newline."""
        return re.sub(r"(\n\s*\n)+", "\n", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_no_script(html_doc: str) -> str:
        """Remove ``<noscript>`` elements together with their content."""
        pattern = r"<noscript\b[^>]*>.*?</noscript>"
        return re.sub(pattern, "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_script(html_doc: str) -> str:
        """Remove ``<script>`` elements together with their content."""
        pattern = r"<script\b[^>]*>.*?</script>"
        return re.sub(pattern, "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_style(html_doc: str) -> str:
        """Remove ``<style>`` elements together with their content."""
        pattern = r"<style\b[^>]*>.*?</style>"
        return re.sub(pattern, "", html_doc, flags=re.DOTALL)

    # The attribute passes use ``[^"]*`` rather than ``.+?``: the latter needs
    # at least one character, so an empty value such as ``class=""`` would
    # match across the closing quote and swallow the following markup.

    @staticmethod
    def remove_class_property(html_doc: str) -> str:
        """Remove double-quoted ``class="..."`` attributes."""
        return re.sub(r' class="[^"]*"', "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_id_property(html_doc: str) -> str:
        """Remove double-quoted ``id="..."`` attributes."""
        return re.sub(r' id="[^"]*"', "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_onclick_property(html_doc: str) -> str:
        """Remove double-quoted ``onclick="..."`` attributes."""
        return re.sub(r' onclick="[^"]*"', "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_style_property(html_doc: str) -> str:
        """Remove ``style`` attributes, double- or single-quoted."""
        html_doc = re.sub(r' style="[^"]*"', "", html_doc, flags=re.DOTALL)
        html_doc = re.sub(r" style='[^']*'", "", html_doc, flags=re.DOTALL)
        return html_doc

    @staticmethod
    def replace_a(html_doc: str) -> str:
        """Replace each ``<a ...>text</a>`` with just its inner content."""
        # ``\b`` keeps the pattern from matching <abbr>, <article>, <aside>...
        pattern = r"<a\b[^>]*>(.*?)</a>"
        return re.sub(pattern, r"\1", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_br(html_doc: str) -> str:
        """Replace ``<br>``, ``<br/>`` and ``<br />`` with a newline."""
        return re.sub(r"<br\s*/?>", "\n", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_div(html_doc: str) -> str:
        """Replace each ``<div ...>content</div>`` with just its content.

        The match is non-greedy and applied in a single pass, so with nested
        divs an opening tag is paired with the first closing tag that follows.
        """
        pattern = r"<div\b[^>]*>(.*?)</div>"
        return re.sub(pattern, r"\1", html_doc, flags=re.DOTALL)

    @classmethod
    def _preprocess(cls, html_doc: str) -> str:
        """Run every clean-up pass, in a fixed order, before markdownify."""
        # NOTE(review): reconstructed as entity decoding; the previous text
        # replaced "<" and ">" with themselves (a no-op). Confirm this is
        # intended, since decoded text such as "<b>" is then treated as markup
        # by the passes below.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        passes = (
            cls.remove_comment,
            cls.remove_img,
            cls.remove_no_script,
            cls.remove_script,
            cls.remove_style,
            cls.remove_class_property,
            cls.remove_id_property,
            cls.remove_onclick_property,
            cls.remove_style_property,
            cls.replace_a,
            cls.replace_br,
            cls.replace_div,
            cls.remove_multiple_newlines,
        )
        for clean in passes:
            html_doc = clean(html_doc)
        return html_doc


def _save_md_to_zip(converter, output_dir: str) -> str:
    """Write ``converter.get_md_text()`` to a ``.md`` file and zip it.

    A uniquely named directory is created under the system temp directory,
    the Markdown file is written into it, and the directory is zipped with
    ``converter.zip_directory`` into ``output_dir``.

    Args:
        converter: Object providing ``get_md_text()`` and
            ``zip_directory(directory, zip_path)``.
        output_dir: Directory in which the zip file is created.

    Returns:
        Path of the zip file that was written.
    """
    basename = str(uuid.uuid4())
    temp_dir = Path(tempfile.gettempdir()) / basename
    temp_dir.mkdir(parents=True, exist_ok=False)
    try:
        md_file = temp_dir / f"{basename}.md"
        md_text = converter.get_md_text()
        with open(md_file.as_posix(), "w", encoding="utf-8") as f:
            f.write(md_text)

        output_zip_file = os.path.join(output_dir, f"{basename}.zip")
        converter.zip_directory(temp_dir, output_zip_file)
    finally:
        # Remove the scratch directory even if conversion or zipping failed.
        shutil.rmtree(temp_dir)
    return output_zip_file


@BaseToMarkdown.register("html_markdownify")
class HtmlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Convert a local HTML file to Markdown (registered as "html_markdownify")."""

    def __init__(self, filename: str):
        """Read ``filename`` as UTF-8 and store its prettified HTML."""
        super().__init__(filename)

        with open(self.filename, "r", encoding="utf-8") as f:
            html_doc = f.read()

        # prettify() re-serialises the parsed tree, giving the regex passes a
        # consistently formatted document to work on.
        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_md_text(self) -> str:
        """Return the document converted to Markdown text."""
        html_doc = self._preprocess(self.html_doc)
        md_text = md(html_doc, **_MARKDOWNIFY_OPTIONS)
        return self.remove_multiple_newlines(md_text)

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a zip file in ``output_dir``; return its path."""
        return _save_md_to_zip(self, output_dir)


class UrlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Convert a web page, fetched with headful Chrome via Selenium, to Markdown."""

    def __init__(self, url: str):
        """Fetch ``url`` in a browser and store its prettified HTML."""
        super().__init__(url)
        self.url = url

        html_doc = self.get_url_content(url)
        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_url_content(self, url: str) -> str:
        """Load ``url`` in Chrome and return the rendered page source.

        The chromedriver binary is installed (and cached under the project's
        ``data`` directory) by webdriver_manager.
        """
        chrome_driver_manager = ChromeDriverManager(
            cache_manager=DriverCacheManager(
                root_dir=(project_path / "data").as_posix()
            )
        )
        driver_path = chrome_driver_manager.install()
        print(f"driver_path: {driver_path}")

        # Selenium's Service takes the binary location as ``executable_path``;
        # any other keyword is not used as the driver path.
        driver = webdriver.Chrome(
            service=Service(executable_path=driver_path),
        )
        try:
            driver.get(url)
            # Scroll to the bottom before reading the page source.
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            return driver.page_source
        finally:
            # Always shut the browser down, even if loading the page failed.
            driver.quit()

    def get_md_text(self) -> str:
        """Return the page converted to Markdown text."""
        html_doc = self._preprocess(self.html_doc)
        md_text = md(html_doc, **_MARKDOWNIFY_OPTIONS)
        return self.remove_multiple_newlines(md_text)

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a zip file in ``output_dir``; return its path."""
        return _save_md_to_zip(self, output_dir)


def main():
    """Convert the HTML file given by ``--filename`` and print the zip path."""
    args = get_args()

    h2m = HtmlToMarkdown(args.filename)
    output_zip_file = h2m.save_to_zip(output_dir=".")
    print(output_zip_file)


def main2():
    """Convert a hard-coded demo URL and print the zip path."""
    # Arguments are still parsed (so --help and unknown-argument errors work),
    # but the URL below is hard-coded and the parsed values are not used.
    get_args()

    h2m = UrlToMarkdown("https://www.baidu.com/")
    output_zip_file = h2m.save_to_zip(output_dir=".")
    print(output_zip_file)


if __name__ == "__main__":
    # main()
    main2()