document_loaders / toolbox /to_markdown /pdf_to_markdown.py
HoneyTian's picture
first commit
e94100d
raw
history blame
4.16 kB
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import argparse
import os
from pathlib import Path
import re
import shutil
import tempfile
import uuid
import aspose.words as aw
import pymupdf4llm
from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown
def get_args():
    """
    Parse the command-line options of this script.

    The only option is ``--filename``, the path of the PDF to convert. When it
    is omitted, a sample paper located under ``project_path`` is used.

    :return: argparse.Namespace carrying the parsed ``filename``.
    """
    # Other sample inputs used during development live under
    # data/files/pdf and data/unstructured_eval/pdf.
    default_pdf = project_path / "data/files/pdf/2024.naacl-long.35.pdf"

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--filename",
        type=str,
        default=default_pdf.as_posix(),
    )
    return arg_parser.parse_args()
@BaseToMarkdown.register("pymupdf4llm")
class PyMuPdf2Llm(BaseToMarkdown):
    """
    PDF to Markdown converter backed by pymupdf4llm.

    Images are not supported; only the Markdown text is written.
    https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/
    """
    def __init__(self, filename: str, image_folder: str = "media"):
        """
        :param filename: str, path of the PDF file to convert.
        :param image_folder: str, accepted so the constructor has the same
            signature as AsposeWordsPdf2Md; it is not used by this converter.
        """
        super().__init__(filename)

    def save_to_zip(self, output_dir: str):
        """
        Convert the PDF to Markdown and pack the result into a zip archive.

        The Markdown is first written to a uniquely named scratch directory
        under the system temp folder; that directory is removed afterwards,
        whether or not the conversion succeeded.

        :param output_dir: str, directory in which the archive is created.
        :return: str, path of the archive (``<output_dir>/<uuid>.zip``).
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # The scratch directory must not outlive this call, even when the
        # conversion or the zipping step raises.
        try:
            md_file = temp_dir / f"{basename}.md"

            # pdf to md
            md_text = pymupdf4llm.to_markdown(self.filename)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)

        return output_zip_file
@BaseToMarkdown.register("aspose_words")
class AsposeWordsPdf2Md(BaseToMarkdown):
    """
    PDF to Markdown converter backed by Aspose.Words.

    https://pypi.org/project/aspose-words/
    https://products.aspose.com/words/python-net/
    https://products.aspose.com/words/python-net/merge/pdf-to-markdown/
    """
    def __init__(self, filename: str, image_folder: str = "media"):
        """
        :param filename: str, path of the PDF file to convert.
        :param image_folder: str, name of the sub-directory (inside the
            archive) that receives the exported images.
        """
        super().__init__(filename)
        self.doc = aw.Document(self.filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str):
        """
        Convert the PDF to Markdown plus images and pack them into a zip archive.

        The document is saved into a uniquely named scratch directory under
        the system temp folder, image files found next to the Markdown file
        are moved into ``image_folder``, and the image links in the Markdown
        are rewritten accordingly. The scratch directory is removed
        afterwards, whether or not the conversion succeeded.

        :param output_dir: str, directory in which the archive is created.
        :return: str, path of the archive (``<output_dir>/<uuid>.zip``).
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # The scratch directory must not outlive this call, even when the
        # conversion or the zipping step raises.
        try:
            md_file = temp_dir / f"{basename}.md"
            media_dir = temp_dir / self.image_folder
            media_dir.mkdir(parents=True, exist_ok=False)

            # pdf to md
            self.doc.save(md_file.as_posix())

            # images
            # Match extensions case-insensitively: every image link is
            # rewritten below, so a file such as "x.PNG" left behind would
            # end up as a broken link.
            image_suffixes = {".jpeg", ".jpg", ".png", ".gif", ".bmp", ".tiff"}
            # Snapshot the listing first because files are moved while looping.
            for exported in list(temp_dir.iterdir()):
                if exported.is_file() and exported.suffix.lower() in image_suffixes:
                    shutil.move(
                        src=exported.as_posix(),
                        dst=media_dir.as_posix(),
                    )

            # md image convert
            with open(md_file.as_posix(), "r", encoding="utf-8") as f:
                md_text = f.read()
            md_text = self.convert_image_to_media_dir(md_text, image_folder=self.image_folder)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)

        return output_zip_file

    def convert_image_to_media_dir(self,
                                   markdown_text: str,
                                   image_folder: str = "media",
                                   ):
        """
        Prefix the target of every Markdown image link with ``image_folder``.

        The alt text of a rewritten link is dropped (``![](...)``). Links that
        point to a remote resource (contain ``://``) or embed their data
        (``data:`` URI) are returned unchanged, since no local file was moved
        for them.

        :param markdown_text: str, Markdown source to rewrite.
        :param image_folder: str, folder name to prepend to local image paths.
        :return: str, the Markdown with rewritten image links.
        """
        # Markdown links always use forward slashes; os.path.join would emit
        # backslashes on Windows.
        import posixpath

        pattern1 = r'\!\[(?:.*?)\]\((.+?)\)'

        def replace(match):
            relative_path = match.group(1)
            if "://" in relative_path or relative_path.startswith("data:"):
                return match.group(0)
            relative_path = posixpath.join(image_folder, relative_path)
            result = f"![]({relative_path})"
            return result

        markdown_text = re.sub(pattern1, replace, markdown_text)
        return markdown_text
def main():
    """
    Script entry point: convert the PDF given by ``--filename`` and print the
    path of the zip archive written to the current directory.
    """
    cli_args = get_args()

    # Use AsposeWordsPdf2Md(cli_args.filename) instead to try the Aspose backend.
    converter = PyMuPdf2Llm(cli_args.filename)

    archive_path = converter.save_to_zip(output_dir=".")
    print(archive_path)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()