"""Parse PDF files with LlamaParse and split the resulting markdown into paragraph chunks."""

import re

import pandas as pd
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this is where the LlamaParse API key comes from — confirm.
load_dotenv()

# Default minimum number of characters a paragraph must have to be kept.
MIN_PARAGRAPH_LENGTH = 50

# A paragraph break is a blank line. A blank line may still contain spaces,
# tabs or a carriage return (CRLF input), so allow any whitespace between the
# two newlines instead of requiring them to be directly adjacent.
_PARAGRAPH_BREAK = re.compile(r"\n\s*\n")


def extract_paragraphs(markdown_text, min_length=None):
    """
    Extract paragraphs from a markdown text.

    The text is split on blank lines (including whitespace-only lines and
    CRLF line endings). Each piece is stripped of surrounding whitespace and
    kept only if it is at least ``min_length`` characters long and does not
    start with ``#`` (markdown headings are dropped).

    Args:
        markdown_text: The markdown document as a single string.
        min_length: Minimum paragraph length in characters. When ``None``
            (the default) the module-level ``MIN_PARAGRAPH_LENGTH`` is used,
            looked up at call time.

    Returns:
        A list of paragraph strings, in document order.
    """
    if min_length is None:
        min_length = MIN_PARAGRAPH_LENGTH

    # Strip first so the length check and the heading check both see the
    # paragraph without surrounding whitespace.
    stripped = (piece.strip() for piece in _PARAGRAPH_BREAK.split(markdown_text))
    paragraphs = [
        p for p in stripped if len(p) >= min_length and not p.startswith("#")
    ]

    print(f"created {len(paragraphs)} paragraphs\n", paragraphs)
    return paragraphs


def extract_endpoint(file_paths):
    """
    Extract PDFs using LlamaParse.

    Args:
        file_paths: Paths of the files to load; passed to
            ``SimpleDirectoryReader`` as ``input_files``.

    Returns:
        A two-element list ``[extracted_data, df]`` where ``extracted_data``
        is a list of ``{"paper": <file name>, "chunks": <paragraphs>}`` dicts,
        one per document returned by the reader, and ``df`` is a pandas
        DataFrame built from that same list.
    """
    # set up parser ("markdown" and "text" are available result types)
    parser = LlamaParse(result_type="markdown")

    # use SimpleDirectoryReader to parse our files, routing .pdf to LlamaParse
    file_extractor = {".pdf": parser}
    documents = SimpleDirectoryReader(
        input_files=file_paths, file_extractor=file_extractor
    ).load_data()

    extracted_data = []
    for doc in documents:
        # Preview of the parsed text, then the paragraph chunks for this document.
        print(doc.text[:500])
        paragraphs = extract_paragraphs(doc.text)
        extracted_data.append(
            {
                "paper": doc.metadata["file_name"],
                "chunks": paragraphs,
            }
        )

    df = pd.DataFrame(extracted_data)
    return [extracted_data, df]