Spaces:
Sleeping
Sleeping
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import argparse | |
import os | |
from pathlib import Path | |
import re | |
import shutil | |
import tempfile | |
import uuid | |
from markdownify import markdownify as md | |
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service | |
from webdriver_manager.chrome import ChromeDriverManager, DriverCacheManager | |
from bs4 import BeautifulSoup | |
from project_settings import project_path | |
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown | |
def get_args():
    """Parse the command-line options of this script.

    The only option is ``--filename``, the path of the HTML file to convert.
    It defaults to the ``nxcloud.html`` sample under ``project_path``
    (a sibling sample, ``data/files/html/nxlink.html``, was used previously).

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    arg_parser = argparse.ArgumentParser()
    default_html = project_path / "data/files/html/nxcloud.html"
    arg_parser.add_argument(
        "--filename",
        default=default_html.as_posix(),
        type=str,
    )
    return arg_parser.parse_args()
class HtmlPreprocess(object):
    """Regex-based clean-up helpers for raw HTML text.

    Every helper takes the HTML document as a string and returns a new string;
    none of them needs instance state, so they are declared as static methods.
    That lets them be called both on the class and on an instance
    (``self.remove_comment(html_doc)``) with the document as the only argument.
    """

    @staticmethod
    def remove_comment(html_doc: str) -> str:
        """Remove every ``<!-- ... -->`` comment, including multi-line ones."""
        return re.sub(r"<!--.*?-->", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_img(html_doc: str) -> str:
        """Remove every ``<img ...>`` tag."""
        return re.sub(r"<img.*?>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_multiple_newlines(html_doc: str) -> str:
        """Collapse runs of blank (whitespace-only) lines into one newline."""
        return re.sub(r"(\n\s*\n)+", "\n", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_no_script(html_doc: str) -> str:
        """Remove ``<noscript>...</noscript>`` elements with their content."""
        return re.sub(r"<noscript>.*?</noscript>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_script(html_doc: str) -> str:
        """Remove ``<script ...>...</script>`` elements with their content."""
        return re.sub(r"<script.*?</script>", "", html_doc, flags=re.DOTALL)

    @staticmethod
    def remove_style(html_doc: str) -> str:
        """Remove ``<style ...>...</style>`` elements with their content."""
        return re.sub(r"<style.*?</style>", "", html_doc, flags=re.DOTALL)

    # The attribute patterns below use ``[^"]*`` rather than ``.+?``: with
    # ``.+?`` an empty value such as ``class=""`` forces the match to run on
    # to the next double quote, swallowing part of the following attribute.

    @staticmethod
    def remove_class_property(html_doc: str) -> str:
        """Remove double-quoted ``class="..."`` attributes."""
        return re.sub(r' class="[^"]*"', "", html_doc)

    @staticmethod
    def remove_id_property(html_doc: str) -> str:
        """Remove double-quoted ``id="..."`` attributes."""
        return re.sub(r' id="[^"]*"', "", html_doc)

    @staticmethod
    def remove_onclick_property(html_doc: str) -> str:
        """Remove double-quoted ``onclick="..."`` attributes."""
        return re.sub(r' onclick="[^"]*"', "", html_doc)

    @staticmethod
    def remove_style_property(html_doc: str) -> str:
        """Remove ``style`` attributes, double- or single-quoted."""
        html_doc = re.sub(r' style="[^"]*"', "", html_doc)
        html_doc = re.sub(r" style='[^']*'", "", html_doc)
        return html_doc

    @staticmethod
    def replace_a(html_doc: str) -> str:
        """Replace each ``<a ...>text</a>`` element with its inner content."""
        return re.sub(r"<a\b[^>]*>(.*?)</a>", r"\1", html_doc, flags=re.DOTALL)

    @staticmethod
    def replace_br(html_doc: str) -> str:
        """Replace ``<br>``, ``<br/>`` and ``<br />`` with a newline."""
        return re.sub(r"(<br>|<br/>|<br />)", "\n", html_doc)

    @staticmethod
    def replace_div(html_doc: str) -> str:
        """Unwrap ``<div>`` elements, keeping only their content.

        Opening and closing tags are stripped independently. Pairing each
        opening tag with the first following ``</div>`` (a non-greedy
        ``<div>(.*?)</div>`` match) mis-pairs nested elements and leaves
        stray ``<div>`` tags behind after a single pass.
        """
        return re.sub(r"</?div\b[^>]*>", "", html_doc)
class HtmlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Convert a local HTML file to Markdown text or a zipped ``.md`` file."""

    def __init__(self, filename: str):
        """Read ``filename`` (UTF-8) and keep a prettified copy of its HTML.

        Args:
            filename: path of the HTML file; passed to the base initializer,
                after which it is read back from ``self.filename``.
        """
        super().__init__(filename)

        with open(self.filename, "r", encoding="utf-8") as f:
            html_doc = f.read()

        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_md_text(self) -> str:
        """Clean ``self.html_doc`` and return it converted to Markdown."""
        options = {
            "strip": ["script"],
            "autolinks": False,
        }

        html_doc = self.html_doc
        # NOTE(review): these two replaces were no-ops (`"<"` -> `"<"`),
        # presumably `&lt;` / `&gt;` entities decoded somewhere on the way;
        # restored as entity decoding - confirm against the upstream file.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # The clean-up helpers take only the document (no `self`), so they are
        # looked up on the class; this works whether or not they are declared
        # as static methods, and still honours subclass overrides.
        cls = type(self)
        cleanup_steps = (
            cls.remove_comment,
            cls.remove_img,
            cls.remove_no_script,
            cls.remove_script,
            cls.remove_style,
            cls.remove_class_property,
            cls.remove_id_property,
            cls.remove_onclick_property,
            cls.remove_style_property,
            cls.replace_a,
            cls.replace_br,
            cls.replace_div,
            cls.remove_multiple_newlines,
        )
        for step in cleanup_steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        return cls.remove_multiple_newlines(md_text)

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temp directory and zip it into ``output_dir``.

        A fresh UUID names the temp directory, the ``.md`` file and the zip.

        Args:
            output_dir: directory in which ``<uuid>.zip`` is created.

        Returns:
            The path of the created zip file.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")

            # zip_directory is inherited (presumably from BaseToMarkdown; it is
            # not defined in this file).
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the scratch directory, even when conversion or
            # zipping fails, so failed runs do not pile up temp folders.
            shutil.rmtree(temp_dir)
        return output_zip_file
class UrlToMarkdown(BaseToMarkdown, HtmlPreprocess):
    """Fetch a web page with Chrome and convert it to Markdown or a zip."""

    def __init__(self, url: str):
        """Download ``url`` and keep a prettified copy of its HTML.

        Args:
            url: address of the page; also passed to the base initializer.
        """
        super().__init__(url)
        self.url = url

        html_doc = self.get_url_content(url)
        soup = BeautifulSoup(html_doc, "html.parser")
        self.html_doc = soup.prettify()

    def get_url_content(self, url: str):
        """Return the page source of ``url`` as rendered by Chrome.

        The ChromeDriver binary is installed through webdriver_manager, with
        its cache rooted at ``project_path / "data"``.

        Args:
            url: address of the page to load.

        Returns:
            The HTML source reported by the driver after scrolling to the
            bottom of the page.
        """
        chrome_driver_manager = ChromeDriverManager(
            cache_manager=DriverCacheManager(
                root_dir=(project_path / "data").as_posix()
            )
        )
        driver_path = chrome_driver_manager.install()
        print(f"driver_path: {driver_path}")

        # Selenium's Service takes the binary location as `executable_path`;
        # `driver_path=` is not a recognised parameter, so the driver that was
        # just installed would not be the one handed to Selenium.
        driver = webdriver.Chrome(
            service=Service(executable_path=driver_path),
        )
        try:
            driver.get(url)
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
            html_doc = driver.page_source
        finally:
            # Shut the browser down even if navigation or scripting raised.
            driver.quit()
        return html_doc

    def get_md_text(self) -> str:
        """Clean ``self.html_doc`` and return it converted to Markdown."""
        options = {
            "strip": ["script"],
            "autolinks": False,
        }

        html_doc = self.html_doc
        # NOTE(review): these two replaces were no-ops (`"<"` -> `"<"`),
        # presumably `&lt;` / `&gt;` entities decoded somewhere on the way;
        # restored as entity decoding - confirm against the upstream file.
        html_doc = html_doc.replace("&lt;", "<")
        html_doc = html_doc.replace("&gt;", ">")

        # The clean-up helpers take only the document (no `self`), so they are
        # looked up on the class; this works whether or not they are declared
        # as static methods, and still honours subclass overrides.
        cls = type(self)
        cleanup_steps = (
            cls.remove_comment,
            cls.remove_img,
            cls.remove_no_script,
            cls.remove_script,
            cls.remove_style,
            cls.remove_class_property,
            cls.remove_id_property,
            cls.remove_onclick_property,
            cls.remove_style_property,
            cls.replace_a,
            cls.replace_br,
            cls.replace_div,
            cls.remove_multiple_newlines,
        )
        for step in cleanup_steps:
            html_doc = step(html_doc)

        md_text = md(html_doc, **options)
        return cls.remove_multiple_newlines(md_text)

    def save_to_zip(self, output_dir: str):
        """Write the Markdown to a temp directory and zip it into ``output_dir``.

        A fresh UUID names the temp directory, the ``.md`` file and the zip.

        Args:
            output_dir: directory in which ``<uuid>.zip`` is created.

        Returns:
            The path of the created zip file.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text()
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")

            # zip_directory is inherited (presumably from BaseToMarkdown; it is
            # not defined in this file).
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the scratch directory, even when conversion or
            # zipping fails, so failed runs do not pile up temp folders.
            shutil.rmtree(temp_dir)
        return output_zip_file
def main():
    """Convert the HTML file given by ``--filename`` and print the zip path.

    The zip archive is written to the current working directory.
    """
    cli_args = get_args()
    converter = HtmlToMarkdown(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
def main2():
    """Convert the Baidu home page to Markdown and print the zip path.

    The zip archive is written to the current working directory.
    """
    # Command-line arguments are still parsed, but the result is not used:
    # the URL below is fixed.
    get_args()

    converter = UrlToMarkdown("https://www.baidu.com/")
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
if __name__ == "__main__":
    # Script entry point: runs the URL demo. Call main() here instead to
    # convert the local HTML file selected with --filename.
    main2()