# Spaces: Running
import os
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import UnstructuredFileLoader
def extract_text_with_langchain_pdf(pdf_file):
    """Extract the paragraphs of a PDF into a DataFrame, one row per paragraph.

    The file is read with LangChain's ``UnstructuredFileLoader`` in ``"paged"``
    mode, which groups the extracted elements by page and keeps the page number
    in each document's metadata. The loader's default ``"single"`` mode merges
    the whole file into one document without page metadata, so every row would
    otherwise be labelled ``"Unknown"``.

    Args:
        pdf_file: Filesystem path of the PDF to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Document`` (file name taken
        from ``pdf_file``), ``Page`` (the loader's ``page_number`` metadata, or
        ``"Unknown"`` when it is absent) and ``Paragraph`` (the paragraph text
        with surrounding whitespace removed). The three columns are present
        even when no text was extracted.
    """
    columns = ["Document", "Page", "Paragraph"]

    loader = UnstructuredFileLoader(pdf_file, mode="paged")
    documents = loader.load()

    # basename handles the platform's own separator, not only "/".
    doc_name = os.path.basename(pdf_file)

    extracted_data = []
    for doc in documents:
        page_num = doc.metadata.get("page_number", "Unknown")
        # Paragraphs are taken to be runs of text separated by a blank line.
        for paragraph in doc.page_content.split("\n\n"):
            text = paragraph.strip()
            if not text:
                continue  # skip empty paragraphs
            extracted_data.append({
                "Document": doc_name,
                "Page": page_num,
                "Paragraph": text,
            })

    # Passing the columns explicitly keeps the CSV header stable when the
    # PDF yields no paragraphs at all.
    return pd.DataFrame(extracted_data, columns=columns)
def save_df_to_csv(df, output_filename="extracted_content.csv"):
    """Write a DataFrame to a CSV file and return the path that was written.

    Args:
        df: The ``pandas.DataFrame`` to serialise.
        output_filename: Destination path of the CSV file. Defaults to
            ``"extracted_content.csv"`` in the current working directory.

    Returns:
        ``output_filename`` unchanged, so the caller can hand it straight to a
        download component.
    """
    # index=False: the row index is not written as a column.
    df.to_csv(output_filename, index=False)
    return output_filename
# UI layout: title, PDF upload, trigger button and CSV download slot, each in
# its own row.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="filepath")
    with gr.Row():
        extract_button = gr.Button("Extract and Download CSV")
    with gr.Row():
        download_button = gr.File(label="Download Extracted CSV")

    def on_extract(pdf_file):
        """Extract the uploaded PDF's paragraphs and return a CSV path to download.

        Args:
            pdf_file: Path of the uploaded file as delivered by the upload
                component (``type="filepath"``), or ``None`` when nothing has
                been uploaded.

        Returns:
            Path of the CSV file written for this request.

        Raises:
            gr.Error: If the button is clicked before a file is uploaded.
        """
        if not pdf_file:
            # Surface a readable message instead of failing inside the loader.
            raise gr.Error("Please upload a PDF file first.")

        df = extract_text_with_langchain_pdf(pdf_file)

        # Each request writes into its own directory so that concurrent users
        # cannot overwrite each other's CSV; the file name itself is kept so the
        # download is still called "extracted_content.csv".
        # NOTE(review): these directories are not removed afterwards.
        out_dir = tempfile.mkdtemp(prefix="pdf_extract_")
        return save_df_to_csv(df, os.path.join(out_dir, "extracted_content.csv"))

    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])

# Launch the Gradio app
demo.queue().launch()