Spaces:

sadickam
/

PDF-text-extra

Running

App Files Community

sadickam committed on Oct 15, 2024

Commit

3428389

verified ·

1 Parent(s): 694498e

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -36

app.py CHANGED Viewed

@@ -1,58 +1,123 @@
 import gradio as gr
 import pandas as pd
-from langchain_community.document_loaders import UnstructuredFileLoader
-def extract_text_with_langchain_pdf(pdf_file):
-    """Extract text from a PDF page by page using LangChain's UnstructuredFileLoader."""
-    loader = UnstructuredFileLoader(pdf_file)  # Use the file path directly
-    documents = loader.load()
-    # Initialize an empty list to collect all extracted paragraphs
     extracted_data = []
-    # Extract content for each page, split into paragraphs, and collect metadata
-    doc_name = pdf_file.split("/")[-1]  # Get the document name
-    for doc in documents:
-        page_num = doc.metadata.get("page_number", "Unknown")  # Get the page number if available
-        paragraphs = doc.page_content.split("\n\n")  # Split content by paragraphs
-        for paragraph in paragraphs:
-            if paragraph.strip():  # Skip empty paragraphs
-                extracted_data.append({
-                    "Document": doc_name,
-                    "Page": page_num,
-                    "Paragraph": paragraph.strip()
-                })
     # Convert the extracted data to a DataFrame
     df = pd.DataFrame(extracted_data)
-    return df
-def save_df_to_csv(df, output_filename="extracted_content.csv"):
-    """Save the DataFrame to a CSV file."""
-    df.to_csv(output_filename, index=False)
     return output_filename
 with gr.Blocks() as demo:
-    with gr.Row():
-        gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
-    with gr.Row():
-        pdf_file = gr.File(label="Upload PDF", type="filepath")
-    with gr.Row():
-        extract_button = gr.Button("Extract and Download CSV")
     with gr.Row():
-        download_button = gr.File(label="Download Extracted CSV")
-    def on_extract(pdf_file):
-        """Callback function to extract text, store in a DataFrame, and return a downloadable CSV."""
-        df = extract_text_with_langchain_pdf(pdf_file)
-        csv_path = save_df_to_csv(df)
-        return csv_path
-    extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 # Launch the Gradio
 demo.queue().launch()

 import gradio as gr
 import pandas as pd
+import os
+from langchain_community.document_loaders import UnstructuredPDFLoader
+from PyPDF2 import PdfReader
+import concurrent.futures
+def extract_and_save(pdf_file, progress=gr.Progress()):
+    """
+    Extract the text of an uploaded PDF into a CSV file, one row per paragraph.
+    Each page is loaded separately, its text is split into paragraphs on
+    blank lines, and every non-empty paragraph is recorded together with the
+    document name and its 1-based page number. Rows are written in page order.
+    Args:
+        pdf_file: Path of the uploaded PDF (a string or os.PathLike), or a
+            file-like wrapper exposing the path as ``.name``. ``None`` means
+            nothing was uploaded.
+        progress: Gradio progress tracker; updated after each page finishes.
+    Returns:
+        The path of the CSV file that was written ("extracted_content.csv").
+    Raises:
+        gr.Error: If no file was uploaded, the PDF cannot be read or has no
+            pages, no text could be extracted, or the CSV cannot be saved.
+            The result feeds a file output, so failures are raised instead
+            of being returned as message strings that would be mistaken
+            for file paths.
+    """
+    if pdf_file is None:
+        raise gr.Error("No file uploaded.")
+    # The upload may arrive as a plain path or as a wrapper object whose
+    # `.name` holds the path; accept both.
+    if isinstance(pdf_file, (str, os.PathLike)):
+        pdf_file_path = os.fspath(pdf_file)
+    else:
+        pdf_file_path = pdf_file.name
+    doc_name = os.path.basename(pdf_file_path)
+    # Initialize PDF reader to get the number of pages
+    try:
+        num_pages = len(PdfReader(pdf_file_path).pages)
+    except Exception as e:
+        raise gr.Error(f"Error reading PDF: {e}") from e
+    # Checked outside the try so this error is not re-wrapped as a read error
+    if num_pages == 0:
+        raise gr.Error("The uploaded PDF has no pages.")
     extracted_data = []
+    def process_page(page_num):
+        """
+        Extract paragraphs from a single page (page_num is 1-based).
+        Returns a list of dictionaries with Document, Page, and Paragraph.
+        Best effort: any failure is printed and yields an empty list.
+        """
+        try:
+            # NOTE(review): `page_numbers` is passed through to the loader as an
+            # extra keyword; confirm the installed loader actually restricts
+            # parsing to that page, otherwise every task parses the whole PDF.
+            loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num - 1])  # Zero-based indexing
+            documents = loader.load()
+            if not documents:
+                print(f"No content found on Page {page_num}.")
+                return []
+            page_data = []
+            for doc in documents:
+                # page_content is already a string: split it directly on double
+                # newlines (joining it with '\n' would separate every character)
+                for para in doc.page_content.split("\n\n"):
+                    para = para.strip()
+                    if para:  # Skip empty paragraphs
+                        page_data.append({
+                            "Document": doc_name,
+                            "Page": page_num,
+                            "Paragraph": para
+                        })
+            return page_data
+        except Exception as e:
+            print(f"Error processing Page {page_num}: {e}")
+            return []
+    # Use ThreadPoolExecutor for parallel processing
+    max_workers = min(3, num_pages)  # At most 3 threads to prevent resource exhaustion
+    results_by_page = {}
+    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+        # Submit all page processing tasks
+        future_to_page = {
+            executor.submit(process_page, page_num): page_num
+            for page_num in range(1, num_pages + 1)
+        }
+        for completed, future in enumerate(concurrent.futures.as_completed(future_to_page), start=1):
+            page_num = future_to_page[future]
+            try:
+                results_by_page[page_num] = future.result()
+            except Exception as e:
+                print(f"Error processing Page {page_num}: {e}")
+            # Update progress with the fraction of pages completed so far
+            progress(completed / num_pages, desc=f"Processed page {page_num}/{num_pages}")
+    # Futures finish in arbitrary order; assemble the rows in page order
+    for page_num in sorted(results_by_page):
+        extracted_data.extend(results_by_page[page_num])
+    if not extracted_data:
+        raise gr.Error("No text extracted from the PDF.")
     # Convert the extracted data to a DataFrame
     df = pd.DataFrame(extracted_data)
+    # Save the DataFrame to a CSV file
+    output_filename = "extracted_content.csv"
+    try:
+        df.to_csv(output_filename, index=False)
+    except Exception as e:
+        raise gr.Error(f"Error saving CSV: {e}") from e
     return output_filename
+# Gradio Interface
+# Builds the page (instructions, upload widget, trigger button, download
+# slot), wires the button to extract_and_save, and launches the app.
 with gr.Blocks() as demo:
+    # Title and usage instructions rendered at the top of the page
+    gr.Markdown("""
+    # 📄 PDF Text Extractor with Metadata and CSV Export
+    Upload a PDF document to extract its text content. The app processes the PDF **page by page**, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
+    ## How It Works
+    1. **Upload PDF**: Select and upload your PDF file.
+    2. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
+    3. **Progress Updates**: Watch the progress bar as each page is processed.
+    4. **Download**: Once complete, download the CSV file containing the extracted data.
+    """)
+    # One row each for the PDF upload, the trigger button and the CSV download
     with gr.Row():
+        pdf_input = gr.File(label="📁 Upload PDF", type="filepath")
+    with gr.Row():
+        extract_button = gr.Button("🟢 Extract and Download CSV")
+    with gr.Row():
+        download_csv = gr.File(label="📥 Download Extracted CSV")
+    # Link the button to the extraction function with progress enabled:
+    # on click, extract_and_save receives the uploaded file and its return
+    # value is sent to the download component
+    extract_button.click(
+        fn=extract_and_save,
+        inputs=pdf_input,
+        outputs=download_csv,
+        show_progress=True  # Enables the progress bar
+    )
 # Launch the Gradio
 demo.queue().launch()