|
import re |
|
|
|
import pandas as pd |
|
from dotenv import load_dotenv |
|
from llama_index.core import SimpleDirectoryReader |
|
from llama_parse import LlamaParse |
|
|
|
# Runs at import time. NOTE(review): presumably this loads a local .env file
# so the LlamaParse client can find its credentials in the environment — confirm.
load_dotenv() |
|
# Minimum length, in characters after stripping, that a paragraph must have
# to be kept by extract_paragraphs(); shorter paragraphs are dropped.
MIN_PARAGRAPH_LENGTH = 50 |
|
|
|
|
|
def extract_paragraphs(markdown_text, min_length=None):
    """
    Extract body paragraphs from a markdown text.

    The text is split on blank lines. Each piece is stripped of surrounding
    whitespace and kept only if it is at least ``min_length`` characters long
    and does not start with ``#`` (i.e. is not a markdown heading).

    Args:
        markdown_text: Markdown source as a string. ``None`` or an empty
            string yields an empty list.
        min_length: Minimum number of characters a stripped paragraph needs
            in order to be kept. When ``None`` (the default), the
            module-level ``MIN_PARAGRAPH_LENGTH`` is used.

    Returns:
        A list of stripped paragraph strings, in document order.
    """
    if min_length is None:
        min_length = MIN_PARAGRAPH_LENGTH

    # A "blank" separator line may still contain spaces, tabs or a carriage
    # return (CRLF input), so allow any whitespace between the two newlines
    # instead of requiring them to be directly adjacent.
    pieces = re.split(r"\n\s*\n", markdown_text or "")

    stripped = (piece.strip() for piece in pieces)
    paragraphs = [
        p
        for p in stripped
        if p and len(p) >= min_length and not p.startswith("#")
    ]

    # Diagnostic output: paragraph count followed by the paragraphs themselves.
    print(f"created {len(paragraphs)} paragraphs\n", paragraphs)

    return paragraphs
|
|
|
|
|
def extract_endpoint(file_paths):
    """
    Load the given files, parsing PDFs with LlamaParse, and chunk their text.

    A LlamaParse instance configured for markdown output is registered as the
    extractor for ``.pdf`` files. Every document returned by the reader is
    split into paragraphs with ``extract_paragraphs``.

    Args:
        file_paths: Paths passed to ``SimpleDirectoryReader`` as
            ``input_files``.

    Returns:
        A two-element list ``[records, frame]``: ``records`` is a list of
        ``{"paper": <file name>, "chunks": <paragraph list>}`` dicts, one per
        loaded document, and ``frame`` is a pandas DataFrame built from those
        same records.
    """
    pdf_parser = LlamaParse(result_type="markdown")
    reader = SimpleDirectoryReader(
        input_files=file_paths,
        file_extractor={".pdf": pdf_parser},
    )

    records = []
    for document in reader.load_data():
        # Show the first 500 characters of the parsed text as a preview.
        print(document.text[:500])
        chunks = extract_paragraphs(document.text)
        records.append(
            {
                "paper": document.metadata["file_name"],
                "chunks": chunks,
            }
        )

    frame = pd.DataFrame(records)
    return [records, frame]
|
|