Spaces:
Sleeping
Sleeping
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import argparse | |
import os | |
from pathlib import Path | |
import shutil | |
import tempfile | |
import uuid | |
from docx2md.docxfile import DocxFile, DocxFileError | |
from docx2md.docxmedia import DocxMedia | |
from docx2md.converter import Converter | |
from project_settings import project_path | |
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown | |
from toolbox.os.command import Command | |
def get_args():
    """Parse the command-line options of this script.

    The only option is ``--filename``: the path of the Word document to
    convert. When omitted it falls back to a sample document located under
    ``project_path``.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``.
    """
    # Other sample documents that have been used as the default:
    #   data/files/doc/公司开票信息:深圳牛信.docx
    #   data/files/doc/账单管理流程 v1.0.doc
    default_document = project_path / "data/files/doc/坏账处理流程 v1.0.docx"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--filename",
        type=str,
        default=default_document.as_posix(),
    )
    return arg_parser.parse_args()
class DocxToMarkdown(BaseToMarkdown):
    """Convert a .docx file to Markdown in-process with the docx2md library.

    ``self.filename`` and ``self.zip_directory`` are used below but are not
    defined in this class; they are expected to come from ``BaseToMarkdown``.
    """

    def __init__(self, filename: str):
        """Open ``filename`` with docx2md and prepare its media accessor.

        Args:
            filename: Path of the .docx file to convert.
        """
        super().__init__(filename)
        self.docx = DocxFile(self.filename)
        self.media = DocxMedia(self.docx)

    def get_md_text(self, use_md_table: bool = True) -> str:
        """Return the document converted to Markdown text.

        Args:
            use_md_table: Forwarded unchanged to docx2md's ``Converter`` as
                its third argument.

        Returns:
            The result of ``Converter.convert()``.
        """
        xml_text = self.docx.document()
        converter = Converter(xml_text, self.media, use_md_table)
        return converter.convert()

    def save_to_zip(self, output_dir: str) -> str:
        """Write the Markdown and media files into a zip under ``output_dir``.

        A uniquely named working directory is created inside the system temp
        directory, filled with the media files and a ``<uuid>.md`` file,
        zipped to ``<output_dir>/<uuid>.zip`` and then removed.

        Args:
            output_dir: Directory in which the zip archive is created.

        Returns:
            The path of the created zip archive.

        Raises:
            FileExistsError: If the working directory already exists.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        # Created outside the ``try`` so that the cleanup below only ever
        # removes a directory this call has created itself.
        temp_dir.mkdir(parents=True, exist_ok=False)

        try:
            self.media.save(temp_dir)

            md_file = temp_dir / f"{basename}.md"
            md_file.write_text(
                self.get_md_text(use_md_table=True),
                encoding="utf-8",
            )

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Remove the working directory even when a step above failed, so
            # failed conversions do not pile up in the temp directory.
            shutil.rmtree(temp_dir)
        return output_zip_file
class Docx2md(BaseToMarkdown):
    """Convert a .docx file by running ``python -m docx2md`` as a command.

    ``self.filename`` and ``self.zip_directory`` are used below but are not
    defined in this class; they are expected to come from ``BaseToMarkdown``.
    """

    def __init__(self, filename: str):
        """Store ``filename`` through the base-class initializer.

        Args:
            filename: Path of the .docx file to convert.
        """
        super().__init__(filename)

    def command(self, filename: str, output_file: str) -> str:
        """Run the docx2md command line for ``filename``.

        Args:
            filename: Path of the input .docx file.
            output_file: Path of the Markdown file to produce.

        Returns:
            The command line string that was passed to ``Command.popen``.
        """
        # NOTE(review): the paths are only wrapped in double quotes, so a
        # path that itself contains a double quote would break the command
        # line. The string is kept as-is because the contract of
        # ``Command.popen`` is not visible here.
        cmd = f'python -m docx2md -m "{filename}" "{output_file}"'
        Command.popen(cmd)
        return cmd

    def save_to_zip(self, output_dir: str) -> str:
        """Convert the document and pack the result into a zip archive.

        A uniquely named working directory is created inside the system temp
        directory, the docx2md command writes ``<uuid>.md`` into it, the
        directory is zipped to ``<output_dir>/<uuid>.zip`` and then removed.

        Args:
            output_dir: Directory in which the zip archive is created.

        Returns:
            The path of the created zip archive.

        Raises:
            FileExistsError: If the working directory already exists.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        # Created outside the ``try`` so that the cleanup below only ever
        # removes a directory this call has created itself.
        temp_dir.mkdir(parents=True, exist_ok=False)

        try:
            md_file = temp_dir / f"{basename}.md"
            # NOTE(review): assumes ``Command.popen`` has finished writing
            # the file by the time it returns - confirm against the helper.
            self.command(self.filename, md_file.as_posix())

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Remove the working directory even when a step above failed, so
            # failed conversions do not pile up in the temp directory.
            shutil.rmtree(temp_dir)
        return output_zip_file
def main():
    """Convert the file given on the command line and print the zip path.

    The archive is written to the current working directory.
    """
    cli_args = get_args()
    zip_path = Docx2md(cli_args.filename).save_to_zip(output_dir=".")
    print(zip_path)


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()