jojortz's picture
add initial visualize app
3caa485
import re
import pandas as pd
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader
from llama_parse import LlamaParse
# Load variables from a local .env file into the process environment at import time.
# NOTE(review): presumably this supplies the API key LlamaParse needs — confirm
# which variable is expected before deploying.
load_dotenv()
# Paragraphs shorter than this many characters are dropped by extract_paragraphs.
MIN_PARAGRAPH_LENGTH = 50


def extract_paragraphs(markdown_text):
    """
    Extract paragraphs from a markdown text.

    The text is split on blank lines, each piece is stripped of surrounding
    whitespace, and a piece is kept only when it is at least
    MIN_PARAGRAPH_LENGTH characters long and does not start with "#"
    (i.e. it is not a markdown heading).

    Args:
        markdown_text: The markdown source as a string. ``None`` or an empty
            string yields an empty list.

    Returns:
        A list of paragraph strings, in document order.
    """
    # A "blank" separator line may still contain spaces, tabs or a carriage
    # return (CRLF input), so split on newline + optional whitespace + newline
    # rather than on strictly consecutive "\n" characters.
    pieces = re.split(r"\n\s*\n", markdown_text or "")
    stripped = (piece.strip() for piece in pieces)
    paragraphs = [
        text
        for text in stripped
        if text
        and len(text) >= MIN_PARAGRAPH_LENGTH
        and not text.startswith("#")
    ]
    # Debug trace of what was extracted (kept on stdout as before).
    print(f"created {len(paragraphs)} paragraphs\n", paragraphs)
    return paragraphs
def extract_endpoint(file_paths):
    """
    Parse the given files with LlamaParse and chunk each result into paragraphs.

    Args:
        file_paths: Paths handed to SimpleDirectoryReader as ``input_files``;
            files ending in ".pdf" are routed through LlamaParse.

    Returns:
        A two-element list ``[records, frame]`` where ``records`` is a list of
        dicts with the keys "paper" (the document's ``file_name`` metadata)
        and "chunks" (the paragraphs from extract_paragraphs), and ``frame``
        is a pandas DataFrame built from those same records.
    """
    # Ask LlamaParse for markdown output ("markdown" and "text" are available).
    pdf_parser = LlamaParse(result_type="markdown")

    # SimpleDirectoryReader loads the files, delegating .pdf to the parser.
    reader = SimpleDirectoryReader(
        input_files=file_paths,
        file_extractor={".pdf": pdf_parser},
    )

    extracted_data = []
    for document in reader.load_data():
        # Debug preview of the first 500 characters of the parsed text.
        print(document.text[:500])
        chunks = extract_paragraphs(document.text)
        extracted_data.append(
            {
                "paper": document.metadata["file_name"],
                "chunks": chunks,
            }
        )

    return [extracted_data, pd.DataFrame(extracted_data)]