document_loaders/toolbox/to_markdown/pptx_to_markdown.py
HoneyTian's picture
first commit
e94100d
raw
history blame
1.85 kB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import shutil
import tempfile
import uuid
from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
from toolbox.os.command import Command
def get_args():
    """
    Parse the command-line options of this script.

    The only option is ``--filename``, the .pptx file to convert; it defaults
    to a sample presentation located under ``project_path``.

    :return: the ``argparse.Namespace`` produced by ``parse_args()``.
    """
    # Sample deck used when --filename is not supplied.
    # Alternative sample: data/files/ppt/钉钉基础使用手册.pptx
    default_pptx = (project_path / "data/files/ppt/AI能力分享.pptx").as_posix()

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--filename", default=default_pptx, type=str)
    return arg_parser.parse_args()
@BaseToMarkdown.register("pptx2md")
class Pptx2md(BaseToMarkdown):
    """
    Convert a .pptx file to Markdown by running the ``pptx2md`` module as a command.

    https://github.com/ptsefton/pptx_to_md
    """

    def __init__(self, filename: str):
        """
        :param filename: path of the .pptx file to convert; forwarded to the base class.
        """
        super().__init__(filename)

    def command(self, filename: str, output_file: str, image_dir: str = "media") -> str:
        """
        Build the ``python -m pptx2md`` command line, pass it to ``Command.popen`` and return it.

        :param filename: path of the source .pptx file.
        :param output_file: value passed to the ``-o`` option (the Markdown file to write).
        :param image_dir: value passed to the ``-i`` option.
        :return: the command string that was handed to ``Command.popen``.
        """
        # NOTE(review): the paths are only wrapped in double quotes, so a path that itself
        # contains a double quote would break the command. Presumably Command.popen runs the
        # string through a shell - confirm, and avoid passing untrusted file names here.
        cmd = f'python -m pptx2md -o "{output_file}" -i "{image_dir}" "{filename}"'
        Command.popen(cmd)
        return cmd

    def save_to_zip(self, output_dir: str) -> str:
        """
        Convert ``self.filename`` inside a temporary directory and zip the result.

        A uniquely named directory is created under the system temp directory, the
        converter is asked to write ``<uuid>.md`` and a ``media`` directory into it,
        and the directory is then archived to ``<output_dir>/<uuid>.zip``.

        :param output_dir: directory in which the zip file is created.
        :return: path of the created zip file.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        # exist_ok=False: a name collision should fail loudly instead of reusing a directory.
        temp_dir.mkdir(parents=True, exist_ok=False)

        try:
            md_file = temp_dir / f"{basename}.md"
            self.command(
                self.filename,
                output_file=md_file.as_posix(),
                image_dir=(temp_dir / "media").as_posix()
            )

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            # zip_directory is not defined in this class; presumably inherited from BaseToMarkdown.
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Remove the scratch directory even when conversion or zipping raises,
            # so failed runs do not leave directories behind in the temp folder.
            shutil.rmtree(temp_dir)
        return output_zip_file
def main():
    """
    Script entry point: convert the file given by ``--filename`` and print the
    path of the zip archive written to the current directory.
    """
    cli_args = get_args()

    converter = Pptx2md(cli_args.filename)
    zip_path = converter.save_to_zip(output_dir=".")
    print(zip_path)
# Run the conversion only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()