Spaces:
Running
Running
File size: 2,129 Bytes
b2e0c78 c0ce244 9f42776 b2e0c78 9f42776 ff0f9fc 9f42776 f2d2148 9f42776 ff0f9fc 9f42776 f2d2148 b2e0c78 c4707d0 9f42776 3428389 ff0f9fc 9f42776 3428389 9f42776 ff0f9fc de88355 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 |
import os

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import UnstructuredFileLoader
def extract_text_with_langchain_pdf(pdf_file):
    """Extract the non-empty paragraphs of a file into a DataFrame.

    The file is loaded with LangChain's ``UnstructuredFileLoader``. The text
    of every returned document is split on blank lines, and each paragraph
    that still contains text after stripping becomes one row.

    Args:
        pdf_file: Filesystem path of the file to load.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Document`` (base name of
        ``pdf_file``), ``Page`` (the document's ``page_number`` metadata, or
        ``"Unknown"`` when that key is absent) and ``Paragraph`` (the
        whitespace-stripped paragraph text). The three columns are present
        even when no paragraph was extracted, so a CSV written from the
        result always has a header row.
    """
    documents = UnstructuredFileLoader(pdf_file).load()

    # basename() honours the platform's path separators, whereas splitting on
    # "/" would keep the whole path as the name on Windows.
    doc_name = os.path.basename(pdf_file)

    rows = []
    for doc in documents:
        page_num = doc.metadata.get("page_number", "Unknown")
        for paragraph in doc.page_content.split("\n\n"):
            text = paragraph.strip()
            if not text:
                # Whitespace-only chunks carry no content; skip them.
                continue
            rows.append({
                "Document": doc_name,
                "Page": page_num,
                "Paragraph": text,
            })

    # Naming the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(rows, columns=["Document", "Page", "Paragraph"])
def save_df_to_csv(df, output_filename="extracted_content.csv"):
    """Write ``df`` to ``output_filename`` as CSV without the index column.

    Args:
        df: The DataFrame to serialise.
        output_filename: Destination path of the CSV file.

    Returns:
        ``output_filename``, so the caller can hand the path straight on.
    """
    df.to_csv(path_or_buf=output_filename, index=False)
    return output_filename
with gr.Blocks() as demo:
    # Layout, top to bottom: title, upload widget, trigger button, download widget.
    with gr.Row():
        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="filepath")
    with gr.Row():
        extract_button = gr.Button("Extract and Download CSV")
    with gr.Row():
        download_button = gr.File(label="Download Extracted CSV")

    def on_extract(pdf_file):
        """Extract the uploaded file's paragraphs and return the CSV path.

        Args:
            pdf_file: Path of the uploaded file as supplied by the upload
                widget, or ``None`` when nothing has been uploaded.

        Returns:
            Path of the CSV file written by ``save_df_to_csv``.

        Raises:
            gr.Error: If the button is clicked before a file was uploaded.
        """
        if not pdf_file:
            # Without this guard the extractor fails on None with an opaque
            # AttributeError; gr.Error gives the user a readable message.
            raise gr.Error("Please upload a PDF file before extracting.")
        df = extract_text_with_langchain_pdf(pdf_file)
        return save_df_to_csv(df)

    # Wire the button: uploaded path in, CSV path out to the download widget.
    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])

# Launch the Gradio app
demo.queue().launch()