"""Helpers for pulling plain text out of a PDF and splitting it into chunks."""

from typing import List

import pymupdf  # PyMuPDF (historically imported as `fitz`)
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: A file-like object whose ``read()`` returns the raw PDF
            content (e.g. an uploaded file or a file opened in ``"rb"`` mode).
            The stream is consumed by this call.

    Returns:
        The text of all pages joined together in page order, with no extra
        separator inserted between pages.
    """
    # Open from an in-memory stream rather than a path so that uploaded
    # file objects work; the context manager closes the document afterwards.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        # join() avoids repeated string re-allocation from `+=` in a loop.
        return "".join(page.get_text() for page in doc)


# Separators tried in order, from coarse (Markdown structure) to fine
# (single characters). The first five entries are regular expressions, so the
# splitter must be created with ``is_separator_regex=True``.
MARKDOWN_SEPARATORS = [
    "\n#{1,6} ",        # headings: 1-6 '#' followed by a space
    "```\n",            # end of a fenced code block
    "\n\\*\\*\\*+\n",   # horizontal rule made of asterisks
    "\n---+\n",         # horizontal rule made of dashes
    "\n___+\n",         # horizontal rule made of underscores
    "\n\n",             # paragraph break
    "\n",               # line break
    " ",                # word boundary
    "",                 # last resort: split between characters
]


def chunk_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> List[str]:
    """Split ``text`` into overlapping chunks along Markdown-aware boundaries.

    Args:
        text: The text to split.
        chunk_size: Maximum size of each chunk, measured by the splitter's
            length function (characters by default).
        chunk_overlap: Amount of content shared between consecutive chunks.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # NOTE(review): add_start_index appears to matter only for the
        # document-producing methods; split_text() returns plain strings.
        add_start_index=True,
        strip_whitespace=True,
        separators=MARKDOWN_SEPARATORS,
        # Without this flag the splitter escapes every separator and matches
        # it literally, so the regex-based heading/rule separators never fire.
        is_separator_regex=True,
    )
    return text_splitter.split_text(text)