Spaces:

sadickam
/

PDF-text-extra

Running

App Files Files Community

sadickam committed on Oct 15, 2024

Commit

f2d2148

verified ·

1 Parent(s): c4707d0

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -47

app.py CHANGED Viewed

@@ -4,7 +4,67 @@ import os
 from langchain_community.document_loaders import UnstructuredPDFLoader
 from PyPDF2 import PdfReader
-def extract_and_save(pdf_file, extraction_option, start_page, end_page):
     """
     Main function to extract text based on user options and save to CSV.
@@ -13,12 +73,13 @@ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
     - extraction_option (str): 'All Pages' or 'Page Range'.
     - start_page (int): Starting page number (if applicable).
     - end_page (int): Ending page number (if applicable).
     Returns:
-    - str: Path to the saved CSV file or error message.
     """
     if pdf_file is None:
-        return "No file uploaded."
     pdf_file_path = pdf_file.name
@@ -27,9 +88,9 @@ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
         reader = PdfReader(pdf_file_path)
         total_pages = len(reader.pages)
         if total_pages == 0:
-            return "The uploaded PDF has no pages."
     except Exception as e:
-        return f"Error reading PDF: {e}"
     # Determine extraction parameters
     if extraction_option == "All Pages":
@@ -37,63 +98,34 @@ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
     else:
         # Validate start and end pages
         if start_page is None or end_page is None:
-            return "Please specify both start and end pages."
         if start_page < 1 or end_page > total_pages:
-            return f"Page range must be between 1 and {total_pages}."
         if start_page > end_page:
-            return "Start page cannot be greater than end page."
         pages_to_extract = list(range(int(start_page), int(end_page) + 1))
-    doc_name = os.path.basename(pdf_file_path)
     extracted_data = []
     try:
-        with gr.Progress() as progress:
-            for idx, page_num in enumerate(pages_to_extract, start=1):
-                try:
-                    progress(1, description=f"Processing Page {page_num}/{len(pages_to_extract)}")
-                    loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1])  # Zero-based indexing
-                    documents = loader.load()
-                    if not documents:
-                        print(f"No content found on Page {page_num}.")
-                        continue
-                    # Concatenate all text from the page to preserve column integrity
-                    pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
-                    # Split content into paragraphs based on double newlines
-                    paragraphs = pdf_pages_content.split("\n\n")
-                    for para in paragraphs:
-                        if para.strip():  # Skip empty paragraphs
-                            extracted_data.append({
-                                "Document": doc_name,
-                                "Page": page_num,
-                                "Paragraph": para.strip()
-                            })
-                except Exception as e:
-                    print(f"Error processing Page {page_num}: {e}")
-                    extracted_data.append({
-                        "Document": doc_name,
-                        "Page": page_num,
-                        "Paragraph": f"Error extracting this page: {e}"
-                    })
     except Exception as e:
-        return f"An error occurred during extraction: {e}"
     if not extracted_data:
-        return "No text extracted from the specified pages."
     # Save to CSV
     try:
         csv_filename = "extracted_content.csv"
-        df = pd.DataFrame(extracted_data)
-        df.to_csv(csv_filename, index=False)
     except Exception as e:
-        return f"Error saving CSV: {e}"
-    return csv_filename
 # Gradio Interface
 with gr.Blocks() as demo:
@@ -103,12 +135,17 @@ with gr.Blocks() as demo:
     Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
     ## How It Works
     1. **Upload PDF**: Select and upload your PDF file.
     2. **Choose Extraction Option**:
        - **All Pages**: Extract text from every page in the PDF.
        - **Page Range**: Specify the start and end pages to extract text from.
     3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
     4. **Progress Updates**: Watch the progress bar as each page is processed.
     5. **Download**: Once complete, download the CSV file containing the extracted data.
     """)
@@ -144,11 +181,12 @@ with gr.Blocks() as demo:
     with gr.Row():
         download_csv = gr.File(label="📥 Download Extracted CSV")
     extract_button.click(
         fn=extract_and_save,
         inputs=[pdf_input, extraction_option, start_page, end_page],
-        outputs=[download_csv],
         show_progress=True
     )

 from langchain_community.document_loaders import UnstructuredPDFLoader
 from PyPDF2 import PdfReader
+def extract_text_by_page(pdf_file_path, page_num):
+    """
+    Pull the paragraphs out of one page of a PDF.
+
+    Parameters:
+    - pdf_file_path (str): Location of the PDF file on disk.
+    - page_num (int): Number of the page to read (1-based).
+
+    Returns:
+    - list of dict: One record per non-blank paragraph, each holding the
+      keys "Document", "Page" and "Paragraph". The list is empty when the
+      loader returns nothing. If loading or splitting raises, a record
+      whose "Paragraph" carries the error text is appended instead.
+    """
+    source_name = os.path.basename(pdf_file_path)
+    rows = []
+    try:
+        # The loader is handed a zero-based index, hence page_num-1
+        page_docs = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1]).load()
+        if not page_docs:
+            print(f"No content found on Page {page_num}.")
+            return rows
+        # Merge every fragment the loader returned, then cut the text at blank lines
+        page_text = '\n'.join(doc.page_content for doc in page_docs)
+        for chunk in page_text.split("\n\n"):
+            cleaned = chunk.strip()
+            if not cleaned:
+                continue  # whitespace-only chunk, nothing to record
+            rows.append({
+                "Document": source_name,
+                "Page": page_num,
+                "Paragraph": cleaned
+            })
+    except Exception as e:
+        # Best effort: report the failure as a row so the page still shows up in the output
+        print(f"Error processing Page {page_num}: {e}")
+        rows.append({
+            "Document": source_name,
+            "Page": page_num,
+            "Paragraph": f"Error extracting this page: {e}"
+        })
+    return rows
+def save_to_csv(data, output_filename="extracted_content.csv"):
+    """
+    Write the extracted records to disk as a CSV file.
+
+    Parameters:
+    - data (list of dict): Records to store.
+    - output_filename (str): Destination path of the CSV file.
+
+    Returns:
+    - str: The same output_filename that was written to.
+    """
+    # index=False keeps the DataFrame's row index out of the file
+    pd.DataFrame(data).to_csv(output_filename, index=False)
+    return output_filename
+def extract_and_save(pdf_file, extraction_option, start_page, end_page, progress):
     """
     Main function to extract text based on user options and save to CSV.
     - extraction_option (str): 'All Pages' or 'Page Range'.
     - start_page (int): Starting page number (if applicable).
     - end_page (int): Ending page number (if applicable).
+    - progress (gr.Progress): Gradio progress object.
     Returns:
+    - tuple: (csv_path, message)
     """
     if pdf_file is None:
+        return None, "❌ No file uploaded."
     pdf_file_path = pdf_file.name
         reader = PdfReader(pdf_file_path)
         total_pages = len(reader.pages)
         if total_pages == 0:
+            return None, "❌ The uploaded PDF has no pages."
     except Exception as e:
+        return None, f"❌ Error reading PDF: {e}"
     # Determine extraction parameters
     if extraction_option == "All Pages":
     else:
         # Validate start and end pages
         if start_page is None or end_page is None:
+            return None, "❌ Please specify both start and end pages."
         if start_page < 1 or end_page > total_pages:
+            return None, f"❌ Page range must be between 1 and {total_pages}."
         if start_page > end_page:
+            return None, "❌ Start page cannot be greater than end page."
         pages_to_extract = list(range(int(start_page), int(end_page) + 1))
     extracted_data = []
     try:
+        for idx, page_num in enumerate(pages_to_extract, start=1):
+            progress(1, description=f"🔍 Processing Page {page_num}/{len(pages_to_extract)}")
+            page_data = extract_text_by_page(pdf_file_path, page_num)
+            extracted_data.extend(page_data)
     except Exception as e:
+        return None, f"❌ An error occurred during extraction: {e}"
     if not extracted_data:
+        return None, "❌ No text extracted from the specified pages."
     # Save to CSV
     try:
         csv_filename = "extracted_content.csv"
+        csv_path = save_to_csv(extracted_data, csv_filename)
     except Exception as e:
+        return None, f"❌ Error saving CSV: {e}"
+    return csv_path, "✅ Extraction successful! Download your CSV file below."
 # Gradio Interface
 with gr.Blocks() as demo:
     Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
     ## How It Works
     1. **Upload PDF**: Select and upload your PDF file.
     2. **Choose Extraction Option**:
        - **All Pages**: Extract text from every page in the PDF.
        - **Page Range**: Specify the start and end pages to extract text from.
     3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
     4. **Progress Updates**: Watch the progress bar as each page is processed.
     5. **Download**: Once complete, download the CSV file containing the extracted data.
     """)
     with gr.Row():
         download_csv = gr.File(label="📥 Download Extracted CSV")
+        message = gr.Textbox(label="Message", interactive=False, lines=2)
     extract_button.click(
         fn=extract_and_save,
         inputs=[pdf_input, extraction_option, start_page, end_page],
+        outputs=[download_csv, message],
         show_progress=True
     )