document_loaders / toolbox /to_markdown /word_to_markdown.py
HoneyTian's picture
first commit
e94100d
raw
history blame
3 kB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import shutil
import tempfile
import uuid
from docx2md.docxfile import DocxFile, DocxFileError
from docx2md.docxmedia import DocxMedia
from docx2md.converter import Converter
from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
from toolbox.os.command import Command
def get_args():
    """Parse the command-line options of this script.

    Returns:
        The parsed ``argparse.Namespace``; its only attribute is
        ``filename``, the path of the Word document to convert.
    """
    # Default input: a sample document shipped under the project's data
    # directory. Other samples from the same directory (another .docx and a
    # legacy .doc file) were also tried here during development.
    sample_document = (project_path / "data/files/doc/坏账处理流程 v1.0.docx").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--filename", default=sample_document, type=str)
    return arg_parser.parse_args()
class DocxToMarkdown(BaseToMarkdown):
    """Convert a .docx file to Markdown in-process using the docx2md library.

    The document is opened once in the constructor. ``get_md_text`` renders
    the Markdown text, and ``save_to_zip`` packs that text plus whatever
    ``DocxMedia.save`` writes into a zip archive.
    """

    def __init__(self, filename: str):
        """Open the document and prepare access to its media.

        Args:
            filename: Path of the .docx file to convert.
        """
        super().__init__(filename)
        # self.filename is expected to be set by BaseToMarkdown.__init__.
        self.docx = DocxFile(self.filename)
        self.media = DocxMedia(self.docx)

    def get_md_text(self, use_md_table: bool = True) -> str:
        """Render the document body as Markdown text.

        Args:
            use_md_table: Passed through to docx2md's ``Converter`` as its
                third argument; presumably selects Markdown table notation
                instead of HTML tables -- confirm against docx2md.

        Returns:
            The value returned by ``Converter.convert()``.
        """
        xml_text = self.docx.document()
        converter = Converter(
            xml_text,
            self.media,
            use_md_table
        )
        md_text = converter.convert()
        return md_text

    def save_to_zip(self, output_dir: str) -> str:
        """Write the Markdown and media to a zip archive in ``output_dir``.

        A scratch directory named after a fresh UUID is created under the
        system temp directory, filled with the media files and a
        ``<uuid>.md`` file, zipped, and then removed.

        Args:
            output_dir: Directory in which the ``<uuid>.zip`` file is created.

        Returns:
            Path of the zip file, ``os.path.join(output_dir, "<uuid>.zip")``.

        Raises:
            FileExistsError: If the scratch directory already exists.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Everything after the mkdir can fail (media export, conversion,
        # file write, zipping); the finally clause guarantees the scratch
        # directory is not left behind in the system temp dir.
        try:
            self.media.save(temp_dir)

            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text(use_md_table=True)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")

            # zip_directory is not defined in this class; it is expected to
            # come from BaseToMarkdown.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
@BaseToMarkdown.register("docx2md")
class Docx2md(BaseToMarkdown):
    """Convert a .docx file to Markdown by running ``python -m docx2md``.

    The class is registered with ``BaseToMarkdown`` under the name
    ``"docx2md"``.
    """

    def __init__(self, filename: str):
        """Store the input path via the base class.

        Args:
            filename: Path of the .docx file to convert.
        """
        super().__init__(filename)

    def command(self, filename: str, output_file: str):
        """Run the docx2md command-line module on ``filename``.

        Args:
            filename: Path of the input .docx file.
            output_file: Path of the Markdown file the command should write.

        Returns:
            The command string that was handed to ``Command.popen``.
        """
        # NOTE(review): both paths are interpolated into a shell command
        # string inside double quotes; a path containing a double quote (or
        # other shell metacharacters) would break or alter the command.
        # Confirm callers only pass trusted paths, or switch to an argument
        # list if Command supports one.
        # The "-m" after "docx2md" is an option of the docx2md CLI
        # (presumably its Markdown-table switch -- confirm).
        cmd = f'python -m docx2md -m "{filename}" "{output_file}"'
        Command.popen(cmd)
        return cmd

    def save_to_zip(self, output_dir: str) -> str:
        """Convert the document and pack the result into a zip archive.

        A scratch directory named after a fresh UUID is created under the
        system temp directory, the docx2md command writes ``<uuid>.md`` into
        it, the directory is zipped, and then it is removed.

        Args:
            output_dir: Directory in which the ``<uuid>.zip`` file is created.

        Returns:
            Path of the zip file, ``os.path.join(output_dir, "<uuid>.zip")``.

        Raises:
            FileExistsError: If the scratch directory already exists.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # Clean up the scratch directory even when the external command or
        # the zipping step raises, so failures do not leak temp directories.
        try:
            md_file = temp_dir / f"{basename}.md"
            # NOTE(review): the directory is zipped right after this call,
            # so Command.popen is assumed to block until the command has
            # finished -- confirm.
            self.command(self.filename, md_file.as_posix())

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # zip_directory is not defined in this class; it is expected to
            # come from BaseToMarkdown.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)
        return output_zip_file
def main():
    """Convert the file named on the command line and print the zip path.

    The archive is written to the current working directory (``"."``).
    """
    cli_args = get_args()
    converter = Docx2md(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)


# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()