Spaces:
Sleeping
Sleeping
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
import argparse | |
import os | |
from pathlib import Path | |
import re | |
import shutil | |
import tempfile | |
import uuid | |
import aspose.words as aw | |
import pymupdf4llm | |
from project_settings import project_path | |
from toolbox.to_markdown.base_to_markdown import BaseToMarkdown | |
def get_args():
    """Parse the command-line options of this script.

    Returns:
        argparse.Namespace: carries ``filename``, the path of the PDF to
        convert. When ``--filename`` is omitted, a sample PDF located under
        ``project_path`` is used.
    """
    # Other sample inputs that can be swapped in as the default:
    #   project_path / "data/files/pdf/临时救助工作应知应会知识.pdf"
    #   project_path / "data/unstructured_eval/pdf/麦肯锡2023年AI现状_生成式AI的爆发之年.pdf"
    sample_pdf = project_path / "data/files/pdf/2024.naacl-long.35.pdf"

    cli = argparse.ArgumentParser()
    cli.add_argument("--filename", type=str, default=sample_pdf.as_posix())
    return cli.parse_args()
class PyMuPdf2Llm(BaseToMarkdown):
    """Convert a PDF to Markdown with pymupdf4llm and package it as a zip.

    Images are not supported by this converter.

    https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/
    """

    def __init__(self, filename: str, image_folder: str = "media"):
        """Remember the input file.

        Args:
            filename: path of the PDF to convert; forwarded to the base
                class (presumably stored there as ``self.filename`` -
                confirm against ``BaseToMarkdown``).
            image_folder: kept for signature parity with
                ``AsposeWordsPdf2Md``. It is stored but not used, because
                this converter emits no images.
        """
        super().__init__(filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str):
        """Convert the PDF to Markdown and write it into a zip archive.

        The Markdown file is produced in a uniquely named scratch directory
        under the system temp dir; that directory is zipped with the
        inherited ``zip_directory`` helper and then removed.

        Args:
            output_dir: directory in which the ``<uuid>.zip`` is created.

        Returns:
            str: path of the created zip file.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # try/finally guarantees the scratch directory is removed even when
        # the conversion or the zipping step raises.
        try:
            # pdf to md
            md_text = pymupdf4llm.to_markdown(self.filename)
            md_file = temp_dir / f"{basename}.md"
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)

        return output_zip_file
class AsposeWordsPdf2Md(BaseToMarkdown):
    """Convert a PDF to Markdown with Aspose.Words and package it as a zip.

    The Markdown file and the images found next to it are placed in a
    scratch directory, the images are collected into ``image_folder``, the
    image links are rewritten accordingly, and the directory is zipped.

    https://pypi.org/project/aspose-words/
    https://products.aspose.com/words/python-net/
    https://products.aspose.com/words/python-net/merge/pdf-to-markdown/
    """

    # Image files looked for next to the saved Markdown file.
    _IMAGE_GLOBS = ("*.jpeg", "*.jpg", "*.png", "*.gif", "*.bmp", "*.tiff")

    # Markdown image syntax ``![alt](target)``; group 1 is the link target.
    _IMAGE_LINK_RE = re.compile(r'\!\[(?:.*?)\]\((.+?)\)')

    def __init__(self, filename: str, image_folder: str = "media"):
        """Load the document.

        Args:
            filename: path of the PDF to convert; forwarded to the base
                class and then read back as ``self.filename``.
            image_folder: name of the sub-directory (inside the archive)
                that receives the extracted images.
        """
        super().__init__(filename)
        self.doc = aw.Document(self.filename)
        self.image_folder = image_folder

    def save_to_zip(self, output_dir: str):
        """Convert the document to Markdown plus images and zip the result.

        Args:
            output_dir: directory in which the ``<uuid>.zip`` is created.

        Returns:
            str: path of the created zip file.
        """
        basename = str(uuid.uuid4())
        temp_dir = Path(tempfile.gettempdir()) / basename
        temp_dir.mkdir(parents=True, exist_ok=False)

        # try/finally guarantees the scratch directory is removed even when
        # saving, rewriting or zipping raises.
        try:
            md_file = temp_dir / f"{basename}.md"
            media_dir = temp_dir / self.image_folder
            media_dir.mkdir(parents=True, exist_ok=False)

            # pdf to md
            self.doc.save(md_file.as_posix())

            # images: move every image file left beside the Markdown file
            # into the media directory.
            for pattern in self._IMAGE_GLOBS:
                # Snapshot the matches first so that moving files does not
                # interfere with the directory listing being iterated.
                for image_file in list(temp_dir.glob(pattern)):
                    shutil.move(
                        src=image_file.as_posix(),
                        dst=media_dir.as_posix(),
                    )

            # md image convert: point the links at the media directory.
            with open(md_file.as_posix(), "r", encoding="utf-8") as f:
                md_text = f.read()
            md_text = self.convert_image_to_media_dir(md_text, image_folder=self.image_folder)
            with open(md_file.as_posix(), "w", encoding="utf-8") as f:
                f.write(md_text)

            # zip
            output_zip_file = os.path.join(output_dir, f"{basename}.zip")
            self.zip_directory(temp_dir, output_zip_file)
        finally:
            shutil.rmtree(temp_dir)

        return output_zip_file

    def convert_image_to_media_dir(self,
                                   markdown_text: str,
                                   image_folder: str = "media",
                                   ):
        """Prefix the target of every Markdown image link with ``image_folder``.

        The alt text of each image is dropped: every match is re-emitted as
        ``![](<image_folder>/<target>)``.

        Args:
            markdown_text: Markdown source to rewrite.
            image_folder: folder name to prepend to each image target.

        Returns:
            str: the Markdown text with rewritten image links.
        """
        # Markdown link targets are URL-like and must use forward slashes.
        # os.path.join would insert backslashes on Windows, so the POSIX
        # flavour is used explicitly (identical result on POSIX systems).
        import posixpath

        def replace(match):
            relative_path = posixpath.join(image_folder, match.group(1))
            return f"![]({relative_path})"

        return self._IMAGE_LINK_RE.sub(replace, markdown_text)
def main():
    """Convert the PDF named on the command line and print the zip path.

    The archive is written to the current working directory.
    """
    cli_args = get_args()

    # Use AsposeWordsPdf2Md(cli_args.filename) here to try the Aspose backend.
    converter = PyMuPdf2Llm(cli_args.filename)

    print(converter.save_to_zip(output_dir="."))
# Run the demo only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()