PDF-text-extra / app.py
sadickam's picture
Update app.py
9f42776 verified
raw
history blame
2.13 kB
import os
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import UnstructuredFileLoader
def extract_text_with_langchain_pdf(pdf_file):
    """Extract the paragraphs of a PDF, page by page, into a DataFrame.

    The file is loaded with LangChain's ``UnstructuredFileLoader`` in
    ``"paged"`` mode. The default ``"single"`` mode merges the whole file into
    one document without a ``page_number``, which would leave the "Page"
    column permanently set to "Unknown".

    Args:
        pdf_file: Filesystem path of the PDF to read. ``None`` or an empty
            path yields an empty result instead of an error.

    Returns:
        A ``pandas.DataFrame`` with the columns "Document", "Page" and
        "Paragraph", holding one row per non-empty paragraph. The columns are
        present even when nothing was extracted, so a CSV written from the
        frame always has a header row.
    """
    columns = ["Document", "Page", "Paragraph"]

    # Nothing uploaded: return an empty, well-formed frame rather than failing
    # inside the loader.
    if not pdf_file:
        return pd.DataFrame(columns=columns)

    loader = UnstructuredFileLoader(pdf_file, mode="paged")
    documents = loader.load()

    # basename handles the platform's path separator, unlike split("/").
    doc_name = os.path.basename(pdf_file)

    extracted_data = []
    for doc in documents:
        # Fall back to "Unknown" when the loader did not record a page number.
        page_num = doc.metadata.get("page_number", "Unknown")
        for paragraph in doc.page_content.split("\n\n"):
            text = paragraph.strip()
            if text:  # Skip empty paragraphs
                extracted_data.append({
                    "Document": doc_name,
                    "Page": page_num,
                    "Paragraph": text,
                })

    return pd.DataFrame(extracted_data, columns=columns)
def save_df_to_csv(df: pd.DataFrame, output_filename: str = "extracted_content.csv") -> str:
    """Write ``df`` to ``output_filename`` as CSV, without the index column.

    Args:
        df: The DataFrame to serialise.
        output_filename: Destination path of the CSV file.

    Returns:
        The path that was passed in, so the caller can hand it straight on.
    """
    df.to_csv(path_or_buf=output_filename, index=False)
    return output_filename
# UI layout: upload a PDF, click the button, download the resulting CSV.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="filepath")
    with gr.Row():
        extract_button = gr.Button("Extract and Download CSV")
    with gr.Row():
        download_button = gr.File(label="Download Extracted CSV")

    def on_extract(pdf_file):
        """Extract the uploaded PDF's text and return the path of a CSV to download.

        Args:
            pdf_file: Path of the uploaded PDF as supplied by the ``gr.File``
                input, or ``None`` when nothing has been uploaded.

        Returns:
            Path of the CSV file written for this request.

        Raises:
            gr.Error: If no PDF has been uploaded.
        """
        if not pdf_file:
            # Show a clear message in the UI instead of a loader traceback.
            raise gr.Error("Please upload a PDF file before extracting.")

        df = extract_text_with_langchain_pdf(pdf_file)

        # Each request writes into its own temp directory, so concurrent users
        # cannot overwrite each other's CSV. The download name is unchanged.
        out_dir = tempfile.mkdtemp(prefix="pdf_extract_")
        csv_path = save_df_to_csv(df, os.path.join(out_dir, "extracted_content.csv"))
        return csv_path

    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])

# Launch the Gradio app
demo.queue().launch()