File size: 2,996 Bytes
e94100d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import shutil
import tempfile
import uuid

from docx2md.docxfile import DocxFile, DocxFileError
from docx2md.docxmedia import DocxMedia
from docx2md.converter import Converter

from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
from toolbox.os.command import Command


def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: carries a single ``filename`` attribute. When
        ``--filename`` is not given, it defaults to a sample document located
        under ``project_path``.
    """
    # Sample input used when no --filename is supplied; pass --filename to
    # convert any other document.
    default_docx = project_path / "data/files/doc/坏账处理流程 v1.0.docx"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--filename", type=str, default=default_docx.as_posix())
    return arg_parser.parse_args()


class DocxToMarkdown(BaseToMarkdown):
    """Convert a .docx file to Markdown in-process with the docx2md classes.

    The document is opened once in ``__init__`` (``DocxFile`` / ``DocxMedia``)
    and reused by every conversion call.
    """

    def __init__(self, filename: str):
        """Open ``filename`` and prepare its media handler.

        Args:
            filename: path of the .docx file to convert.
        """
        super().__init__(filename)
        # NOTE(review): relies on BaseToMarkdown.__init__ storing the path as
        # ``self.filename`` -- confirm against the base class.
        self.docx = DocxFile(self.filename)
        self.media = DocxMedia(self.docx)

    def get_md_text(self, use_md_table: bool = True) -> str:
        """Return the Markdown text produced by docx2md's ``Converter``.

        Args:
            use_md_table: forwarded unchanged to ``Converter`` (presumably
                selects Markdown table syntax over HTML tables -- confirm
                against the docx2md documentation).

        Returns:
            The result of ``Converter.convert()``.
        """
        xml_text = self.docx.document()

        converter = Converter(
            xml_text,
            self.media,
            use_md_table
        )
        return converter.convert()

    def save_to_zip(self, output_dir: str) -> str:
        """Write the Markdown and media to a scratch directory and zip it.

        A UUID is used as the scratch directory name, the Markdown file stem
        and the zip file stem.

        Args:
            output_dir: directory in which ``<uuid>.zip`` is created.

        Returns:
            Path of the zip file, ``os.path.join(output_dir, "<uuid>.zip")``.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        # exist_ok=False: a name collision must fail loudly rather than mix
        # the contents of two conversions.
        temp_dir.mkdir(parents=True, exist_ok=False)

        try:
            self.media.save(temp_dir)

            md_file = temp_dir / f"{basename}.md"
            md_text = self.get_md_text(use_md_table=True)
            md_file.write_text(md_text, encoding="utf-8")

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the scratch directory, including when saving media,
            # converting or zipping raises; otherwise it is left behind in the
            # system temp directory.
            shutil.rmtree(temp_dir)
        return output_zip_file


@BaseToMarkdown.register("docx2md")
class Docx2md(BaseToMarkdown):
    """Convert a .docx file to Markdown by invoking the ``docx2md`` module CLI.

    Registered with ``BaseToMarkdown`` under the name ``"docx2md"``.
    """

    def __init__(self, filename: str):
        """Store the path of the .docx file to convert.

        Args:
            filename: path of the .docx file.
        """
        super().__init__(filename)

    def command(self, filename: str, output_file: str) -> str:
        """Run ``python -m docx2md`` on ``filename`` and return the command line.

        Args:
            filename: input .docx path.
            output_file: path of the Markdown file to produce.

        Returns:
            The command string that was handed to ``Command.popen``.
        """
        # NOTE(review): the paths are interpolated into a shell-style string
        # inside double quotes; a path containing a double quote would break
        # the command. Also, ``python`` is resolved from PATH and may not be
        # the interpreter running this script. Left unchanged because the
        # contract of Command.popen is not visible here.
        # ``-m`` presumably enables Markdown table output -- confirm against
        # the docx2md CLI help.
        cmd = f'python -m docx2md -m "{filename}" "{output_file}"'
        Command.popen(cmd)
        return cmd

    def save_to_zip(self, output_dir: str) -> str:
        """Convert into a scratch directory and zip that directory.

        A UUID is used as the scratch directory name, the Markdown file stem
        and the zip file stem.

        Args:
            output_dir: directory in which ``<uuid>.zip`` is created.

        Returns:
            Path of the zip file, ``os.path.join(output_dir, "<uuid>.zip")``.
        """
        basename = str(uuid.uuid4())

        temp_dir = Path(tempfile.gettempdir()) / basename
        # exist_ok=False: a name collision must fail loudly rather than mix
        # the contents of two conversions.
        temp_dir.mkdir(parents=True, exist_ok=False)

        try:
            md_file = temp_dir / f"{basename}.md"

            # NOTE(review): assumes Command.popen blocks until docx2md has
            # finished writing; otherwise the archive could be incomplete.
            self.command(self.filename, md_file.as_posix())

            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always remove the scratch directory, including when the command
            # or the zip step raises; otherwise it is left behind in the
            # system temp directory.
            shutil.rmtree(temp_dir)
        return output_zip_file


def main():
    """Convert the file named on the command line and print the zip path.

    The archive is written to the current working directory.
    """
    cli_args = get_args()
    zip_path = Docx2md(cli_args.filename).save_to_zip(output_dir=".")
    print(zip_path)


# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()