#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert a .docx file to Markdown and package the result as a zip archive."""
import argparse
import os
from pathlib import Path
import shutil
import tempfile
import uuid

from docx2md.docxfile import DocxFile, DocxFileError
from docx2md.docxmedia import DocxMedia
from docx2md.converter import Converter

from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
from toolbox.os.command import Command


def get_args():
    """Parse command-line arguments.

    Returns:
        The parsed ``argparse.Namespace``; ``filename`` is the path of the
        document to convert and defaults to a sample file under
        ``project_path``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--filename",
        # Alternative sample inputs:
        # default=(project_path / "data/files/doc/公司开票信息:深圳牛信.docx").as_posix(),
        default=(project_path / "data/files/doc/坏账处理流程 v1.0.docx").as_posix(),
        # default=(project_path / "data/files/doc/账单管理流程 v1.0.doc").as_posix(),
        type=str
    )
    args = parser.parse_args()
    return args


class DocxToMarkdown(BaseToMarkdown):
    """Convert a .docx file to Markdown in-process using the docx2md library."""

    def __init__(self, filename: str):
        """Open ``filename`` as a ``DocxFile`` and wrap its media in ``DocxMedia``."""
        super().__init__(filename)
        self.docx = DocxFile(self.filename)
        self.media = DocxMedia(self.docx)

    def get_md_text(self, use_md_table: bool = True) -> str:
        """Return the document converted to Markdown text.

        Args:
            use_md_table: Forwarded unchanged to ``Converter``; presumably
                selects Markdown-style tables (confirm against docx2md).
        """
        xml_text = self.docx.document()
        converter = Converter(
            xml_text,
            self.media,
            use_md_table
        )
        md_text = converter.convert()
        return md_text

    def save_to_zip(self, output_dir: str):
        """Write the Markdown and media to a temp directory and zip it.

        A uniquely named working directory is created under the system temp
        directory, filled with the media files and ``<uuid>.md``, zipped to
        ``<output_dir>/<uuid>.zip`` and then removed.

        Args:
            output_dir: Directory in which the zip archive is created.

        Returns:
            The path of the zip archive that was written.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        # Created outside the ``try`` so that a failing mkdir never triggers
        # removal of a directory this call does not own.
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            self.media.save(temp_dir)

            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text(use_md_table=True)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")

            # zip_directory is not defined in this file; presumably inherited
            # from BaseToMarkdown.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the working directory, even when conversion or
            # zipping fails, so failed runs do not leak temp directories.
            shutil.rmtree(temp_dir)
        return output_zip_file


@BaseToMarkdown.register("docx2md")
class Docx2md(BaseToMarkdown):
    """Convert a .docx file to Markdown by invoking the ``docx2md`` CLI."""

    def __init__(self, filename: str):
        """Store ``filename`` via the base-class initializer."""
        super().__init__(filename)

    def command(self, filename: str, output_file: str):
        """Run ``python -m docx2md`` on ``filename`` and return the command string.

        Args:
            filename: Path of the source document.
            output_file: Path of the Markdown file to produce.

        Returns:
            The command line that was passed to ``Command.popen``.
        """
        # NOTE(review): the paths are interpolated into a command string with
        # plain double quotes; a path containing '"' would break the command.
        # Confirm how Command.popen executes the string before passing
        # untrusted file names.
        cmd = f'python -m docx2md -m "{filename}" "{output_file}"'
        Command.popen(cmd)
        return cmd

    def save_to_zip(self, output_dir: str):
        """Convert the document in a temp directory and zip the result.

        A uniquely named working directory is created under the system temp
        directory, the CLI is asked to write ``<uuid>.md`` into it, the
        directory is zipped to ``<output_dir>/<uuid>.zip`` and then removed.

        Args:
            output_dir: Directory in which the zip archive is created.

        Returns:
            The path of the zip archive that was written.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        # Created outside the ``try`` so that a failing mkdir never triggers
        # removal of a directory this call does not own.
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            # NOTE(review): assumes Command.popen blocks until the conversion
            # has finished; otherwise the directory may be zipped too early.
            self.command(self.filename, md_file.as_posix())

            # zip_directory is not defined in this file; presumably inherited
            # from BaseToMarkdown.
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the working directory, even when conversion or
            # zipping fails, so failed runs do not leak temp directories.
            shutil.rmtree(temp_dir)
        return output_zip_file


def main():
    """Convert the file named on the command line and print the zip path."""
    args = get_args()

    d2m = Docx2md(args.filename)

    output_zip_file = d2m.save_to_zip(output_dir=".")
    print(output_zip_file)


if __name__ == "__main__":
    main()