File size: 2,129 Bytes
b2e0c78
c0ce244
9f42776
b2e0c78
9f42776
 
 
 
ff0f9fc
9f42776
f2d2148
 
9f42776
 
 
 
 
ff0f9fc
9f42776
 
 
 
 
 
 
 
 
 
 
 
 
 
f2d2148
 
 
b2e0c78
c4707d0
9f42776
 
 
 
 
3428389
ff0f9fc
9f42776
3428389
9f42776
 
 
 
 
 
 
 
 
ff0f9fc
 
de88355
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import os
import tempfile

import gradio as gr
import pandas as pd
from langchain_community.document_loaders import UnstructuredFileLoader

def extract_text_with_langchain_pdf(pdf_file):
    """Extract the paragraphs of a PDF, page by page, into a DataFrame.

    The file is parsed with LangChain's ``UnstructuredFileLoader``. Each
    returned document is split on blank lines and every non-empty paragraph
    becomes one row.

    Args:
        pdf_file: Filesystem path (``str`` or path-like) of the PDF to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``Document`` (file name),
        ``Page`` (page number, or ``"Unknown"`` when the loader did not
        report one) and ``Paragraph`` (stripped paragraph text). The columns
        are present even when no text could be extracted.
    """
    # "paged" mode groups the parsed elements into one Document per page and
    # keeps the "page_number" metadata read below; the loader's default mode
    # merges the whole file into a single Document without page numbers.
    loader = UnstructuredFileLoader(pdf_file, mode="paged")
    documents = loader.load()

    # basename() copes with the platform's path separator and with path-like
    # objects, unlike splitting the string on "/".
    doc_name = os.path.basename(pdf_file)

    extracted_data = []
    for doc in documents:
        page_num = doc.metadata.get("page_number", "Unknown")

        # Paragraphs are separated by blank lines in the loader's output.
        for paragraph in doc.page_content.split("\n\n"):
            text = paragraph.strip()
            if text:  # Skip empty paragraphs
                extracted_data.append({
                    "Document": doc_name,
                    "Page": page_num,
                    "Paragraph": text,
                })

    # Fixing the column list keeps the CSV header stable for empty results.
    return pd.DataFrame(extracted_data, columns=["Document", "Page", "Paragraph"])

def save_df_to_csv(df, output_filename="extracted_content.csv"):
    """Write *df* to ``output_filename`` as CSV and return that path.

    The DataFrame index is left out of the file, so only the frame's own
    columns are written.
    """
    df.to_csv(path_or_buf=output_filename, index=False)
    return output_filename

# UI layout: title, PDF upload, trigger button and CSV download, one per row.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")

    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="filepath")

    with gr.Row():
        extract_button = gr.Button("Extract and Download CSV")

    with gr.Row():
        download_button = gr.File(label="Download Extracted CSV")

    def on_extract(pdf_path):
        """Extract the uploaded PDF's text and return a downloadable CSV path.

        Args:
            pdf_path: Path of the uploaded PDF as supplied by the ``gr.File``
                input, or ``None`` when nothing has been uploaded.

        Returns:
            Path of the CSV file written for this request.

        Raises:
            gr.Error: If no file was uploaded before the button was clicked.
        """
        # Without an upload the input is empty; report that to the user
        # instead of letting the loader fail on a missing path.
        if not pdf_path:
            raise gr.Error("Please upload a PDF file before extracting.")

        df = extract_text_with_langchain_pdf(pdf_path)

        # Write into a fresh directory per request so simultaneous users do
        # not overwrite each other's CSV, while keeping the download name.
        output_dir = tempfile.mkdtemp(prefix="pdf_extract_")
        csv_path = os.path.join(output_dir, "extracted_content.csv")
        return save_df_to_csv(df, csv_path)

    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])

# Turn on request queuing, then start serving the Gradio app
queued_demo = demo.queue()
queued_demo.launch()