# Source: duplicate_llm/modules/pdfExtractor.py
# (uploaded by Kurian07 in commit 60fc5e8, "Upload 15 files"; 585 bytes)
import os
import pymupdf4llm
class PdfConverter:
    """Convert a PDF to Markdown text and optionally save it to disk.

    The conversion is delegated to ``pymupdf4llm.to_markdown``; the most
    recent result is kept on the instance as ``md_text``.
    """

    def __init__(self, pdf_file):
        """Remember which PDF to convert; no conversion happens here.

        Args:
            pdf_file: Handed unchanged to ``pymupdf4llm.to_markdown``
                (presumably a filesystem path to a PDF -- confirm the
                accepted types against that function's documentation).
        """
        self.pdf_file = pdf_file
        # Populated by convert_to_markdown(); stays None until then.
        self.md_text = None

    def convert_to_markdown(self):
        """Convert ``self.pdf_file`` to Markdown, store it, and return it.

        Returns:
            Whatever ``pymupdf4llm.to_markdown`` returns for the PDF; the
            same value is also stored in ``self.md_text``.
        """
        self.md_text = pymupdf4llm.to_markdown(self.pdf_file)
        return self.md_text

    def save_markdown(self, output_file):
        """Write the converted Markdown to ``output_file`` as UTF-8.

        If ``convert_to_markdown`` has not been called yet, the conversion
        is run first so that this method can be used on its own.

        Args:
            output_file: Destination passed to the built-in ``open``; an
                existing file is overwritten.
        """
        # Convert *before* opening the destination so a failed conversion
        # does not leave behind an empty/truncated output file.
        if self.md_text is None:
            self.convert_to_markdown()
        # Explicit UTF-8: text extracted from PDFs routinely contains
        # non-ASCII characters that the platform default encoding (e.g.
        # cp1252 on Windows) cannot represent.
        with open(output_file, "w", encoding="utf-8") as file:
            file.write(self.md_text)
# Example usage
# pdf_file = os.path.join(os.getcwd(), "pdfs", "test.pdf")
# converter = PdfConverter(pdf_file)
# text = converter.convert_to_markdown()
# print(text)