#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""Convert a PDF file into a zip archive containing a Markdown rendering.

Two converter backends are defined here, both subclasses of
``BaseToMarkdown``:

* ``PyMuPdf2Llm`` (decorated with ``register("pymupdf4llm")``) - text only.
* ``AsposeWordsPdf2Md`` (decorated with ``register("aspose_words")``) - text
  plus extracted images collected into a media sub-folder.
"""
import argparse
import os
from pathlib import Path
import posixpath
import re
import shutil
import tempfile
import uuid

import aspose.words as aw
import pymupdf4llm

from project_settings import project_path
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown


# Image file patterns looked for next to the Markdown file written by Aspose.
_IMAGE_GLOB_PATTERNS = ("*.jpeg", "*.jpg", "*.png", "*.gif", "*.bmp", "*.tiff")

# Markdown image link ``![alt](target)``; group 1 captures the target.
_MD_IMAGE_LINK = re.compile(r'\!\[(?:.*?)\]\((.+?)\)')

# Targets starting with ``scheme:`` (http:, https:, data:, ...) are not local
# files and must not be prefixed with the media folder.
_URL_SCHEME = re.compile(r"[A-Za-z][A-Za-z0-9+.\-]*:")


def get_args() -> argparse.Namespace:
    """Parse the command-line arguments.

    Returns:
        Namespace with a single ``filename`` attribute: the PDF to convert.
    """
    parser = argparse.ArgumentParser()
    # Other sample inputs that can be passed via --filename:
    #   data/files/pdf/临时救助工作应知应会知识.pdf
    #   data/unstructured_eval/pdf/麦肯锡2023年AI现状_生成式AI的爆发之年.pdf
    parser.add_argument(
        "--filename",
        default=(project_path / "data/files/pdf/2024.naacl-long.35.pdf").as_posix(),
        type=str,
    )
    return parser.parse_args()


@BaseToMarkdown.register("pymupdf4llm")
class PyMuPdf2Llm(BaseToMarkdown):
    """PDF to Markdown converter backed by ``pymupdf4llm``.

    Images are not supported.

    https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """Create the converter.

        Args:
            filename: Path of the PDF file to convert.
            image_folder: Accepted for signature parity with the other
                converters and stored, but unused because this backend does
                not export images.
        """
        super().__init__(filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str) -> str:
        """Convert the PDF to Markdown and pack the result into a zip file.

        Args:
            output_dir: Directory in which the zip archive is created.

        Returns:
            Path of the created ``<uuid>.zip`` archive.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            # pdf to md
            md_file = temp_dir / f"{basename}.md"
            md_text = pymupdf4llm.to_markdown(self.filename)
            md_file.write_text(md_text, encoding="utf-8")

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always drop the scratch directory, even when conversion or
            # zipping fails, so failed runs do not pile up in the temp folder.
            shutil.rmtree(temp_dir, ignore_errors=True)
        return output_zip_file


@BaseToMarkdown.register("aspose_words")
class AsposeWordsPdf2Md(BaseToMarkdown):
    """PDF to Markdown converter backed by Aspose.Words.

    https://pypi.org/project/aspose-words/
    https://products.aspose.com/words/python-net/
    https://products.aspose.com/words/python-net/merge/pdf-to-markdown/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """Load the document.

        Args:
            filename: Path of the PDF file to convert.
            image_folder: Name of the sub-folder (inside the archive) that
                receives the extracted images.
        """
        super().__init__(filename)
        self.doc = aw.Document(self.filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str) -> str:
        """Convert the PDF to Markdown plus images and pack them into a zip.

        The document is saved as ``<uuid>.md`` in a scratch directory, image
        files found next to it are moved into ``image_folder``, and the image
        links in the Markdown are rewritten to point into that folder.

        Args:
            output_dir: Directory in which the zip archive is created.

        Returns:
            Path of the created ``<uuid>.zip`` archive.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)
        try:
            md_file = temp_dir / f"{basename}.md"
            media_dir = temp_dir / self.image_folder
            media_dir.mkdir(parents=True, exist_ok=False)

            # pdf to md
            # NOTE(review): Aspose is expected to pick the Markdown format
            # from the ".md" extension and to write images beside the file.
            self.doc.save(md_file.as_posix())

            # images
            for pattern in _IMAGE_GLOB_PATTERNS:
                # Materialize the matches first: files are moved out of the
                # directory that is being listed.
                for image_file in list(temp_dir.glob(pattern)):
                    shutil.move(
                        src=image_file.as_posix(),
                        dst=media_dir.as_posix(),
                    )

            # md image convert
            md_text = md_file.read_text(encoding="utf-8")
            md_text = self.convert_image_to_media_dir(
                md_text, image_folder=self.image_folder
            )
            md_file.write_text(md_text, encoding="utf-8")

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            # Always drop the scratch directory, even when conversion or
            # zipping fails, so failed runs do not pile up in the temp folder.
            shutil.rmtree(temp_dir, ignore_errors=True)
        return output_zip_file

    def convert_image_to_media_dir(self,
                                   markdown_text: str,
                                   image_folder: str = "media",
                                   ) -> str:
        """Rewrite Markdown image links so they point into ``image_folder``.

        Every ``![alt](target)`` becomes ``![](<image_folder>/<target>)``; the
        alt text is discarded. Targets that start with a URL scheme
        (``http:``, ``data:``, ...) are not local files and keep their
        original target.

        Args:
            markdown_text: Markdown source to rewrite.
            image_folder: Folder name to prepend to each local image target.

        Returns:
            The rewritten Markdown text.
        """
        def replace(match):
            target = match.group(1)
            if _URL_SCHEME.match(target) is None:
                # Markdown links use forward slashes on every platform, so
                # join with posixpath rather than the OS-specific os.path.
                target = posixpath.join(image_folder, target)
            return f"![]({target})"

        return _MD_IMAGE_LINK.sub(replace, markdown_text)


def main():
    """Convert the file given on the command line and print the zip path."""
    args = get_args()

    # Swap in AsposeWordsPdf2Md(args.filename) to try the Aspose backend.
    p2m = PyMuPdf2Llm(args.filename)

    output_zip_file = p2m.save_to_zip(output_dir=".")
    print(output_zip_file)


if __name__ == "__main__":
    main()