Spaces:

sadickam
/

PDF-text-extra

Running

App Files Files Community

sadickam committed on Oct 15, 2024

Commit

b2e0c78

verified ·

1 Parent(s): 01f3b85

Create app.py

Browse files

Files changed (1) hide show

app.py +128 -0

app.py ADDED Viewed

	@@ -0,0 +1,128 @@

+# Ensure Poppler is installed
+from install_poppler import install_poppler
+install_poppler()  # Run the Poppler installation function
+import layoutparser as lp
+from pdf2image import convert_from_path
+import pytesseract
+import pandas as pd
+import torch
+import gradio as gr
+import logging
+import time
+import os
+import spaces
# Send log records to a file, each stamped with time and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename='pdf_extraction.log',
)
# Run the layout detector on CUDA when PyTorch reports it, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Detectron2 layout model built from the PubLayNet Faster R-CNN config.
# extra_config sets MODEL.ROI_HEADS.SCORE_THRESH_TEST to 0.8, and label_map
# translates the model's integer class ids into the names used below.
model = lp.Detectron2LayoutModel(
    'lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config',
    device=device,
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
)
def pdf_to_images(pdf_path, start_page=0, end_page=None, dpi=300):
    """Render a range of PDF pages to images.

    Args:
        pdf_path: Path of the PDF, forwarded to ``convert_from_path``.
        start_page: Zero-based index of the first page to render; it is
            shifted by one because ``first_page`` is one-based.
        end_page: One-based number of the last page to render, forwarded
            unchanged as ``last_page`` (``None`` is passed through as-is).
        dpi: Rendering resolution. Defaults to 300, the value that used to
            be hard-coded, so existing callers behave exactly as before.

    Returns:
        Whatever ``convert_from_path`` returns for the requested range.
    """
    return convert_from_path(
        pdf_path,
        dpi=dpi,
        first_page=start_page + 1,
        last_page=end_page,
    )
def extract_layout_elements(image):
    """Run the layout model on an image and split its detections by kind.

    Returns a ``(text_blocks, table_blocks)`` pair of ``lp.Layout`` objects:
    the first holds blocks typed "Text" or "Title", the second holds blocks
    typed "Table". Any other detected type is dropped.
    """
    detections = model.detect(image)
    textual = []
    tables = []
    for candidate in detections:
        if candidate.type in ["Text", "Title"]:
            textual.append(candidate)
        if candidate.type == "Table":
            tables.append(candidate)
    return lp.Layout(textual), lp.Layout(tables)
def extract_text_from_block(image, block):
    """OCR the region of ``image`` covered by ``block`` and return the stripped text."""
    cropped = image.crop(block.coordinates)
    return pytesseract.image_to_string(cropped).strip()
def process_pdf_in_batches(pdf_file, batch_size, wait_time):
    """Process the PDF in batches and return a DataFrame of extracted content.

    Args:
        pdf_file: The uploaded PDF, either an object exposing its path as
            ``.name`` or the path string itself.
        batch_size: Number of pages rendered and analysed per batch. It is
            coerced to ``int`` and must be at least 1.
        wait_time: Seconds to pause between consecutive batches.

    Returns:
        A DataFrame with columns ``Document``, ``Page``, ``Content_Type`` and
        ``Content``. Text blocks yield one "Title" or "Paragraph" row each;
        tables yield one "TableRow" row per non-blank OCR line.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # Work with the path: the PDF tooling needs a path, not a file object.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Imported here because only this function needs the page-count helper.
    from pdf2image import pdfinfo_from_path

    # Read the real page count from the PDF metadata. Rendering pages 1-2 and
    # counting the result capped the count at two and skipped later pages.
    num_pages = int(pdfinfo_from_path(pdf_path)["Pages"])

    data = []
    for batch_start in range(0, num_pages, batch_size):
        batch_end = min(batch_start + batch_size, num_pages)
        logging.info(f"Processing pages {batch_start + 1} to {batch_end}...")
        try:
            images = pdf_to_images(pdf_path, start_page=batch_start, end_page=batch_end)
            for page_num, image in enumerate(images, start=batch_start + 1):
                text_blocks, table_blocks = extract_layout_elements(image)
                for block in text_blocks:
                    text_content = extract_text_from_block(image, block)
                    content_type = "Title" if block.type == "Title" else "Paragraph"
                    data.append([pdf_path, page_num, content_type, text_content])
                for table in table_blocks:
                    table_image = image.crop(table.coordinates)
                    # --psm 6 makes Tesseract treat the crop as one uniform text block.
                    table_lines = pytesseract.image_to_string(
                        table_image, config='--psm 6'
                    ).splitlines()
                    data.extend(
                        [pdf_path, page_num, "TableRow", row]
                        for row in table_lines
                        if row.strip()
                    )
        except Exception as e:
            # Best effort: a failing batch is logged (with traceback) and the
            # remaining batches are still attempted.
            logging.exception(f"Error processing pages {batch_start + 1} to {batch_end}: {str(e)}")
        else:
            logging.info(f"Completed batch {batch_start + 1} to {batch_end}")
        # Pause only between batches; there is nothing to wait for after the last one.
        if batch_end < num_pages:
            time.sleep(wait_time)
    return pd.DataFrame(data, columns=["Document", "Page", "Content_Type", "Content"])
def extract_and_save_pdf_content(pdf_file, batch_size, wait_time):
    """Extract content from the uploaded PDF and save it as a CSV.

    Args:
        pdf_file: The uploaded PDF, either an object exposing its path as
            ``.name`` or the path string itself.
        batch_size: Pages per batch, forwarded to ``process_pdf_in_batches``.
        wait_time: Pause between batches in seconds, forwarded likewise.

    Returns:
        Path of the written CSV: the PDF path with its extension replaced by
        ``_extracted.csv``.

    Raises:
        ValueError: If no file was supplied (``pdf_file`` is ``None``).
    """
    # Fail early with a clear message instead of an AttributeError later on.
    if pdf_file is None:
        raise ValueError("No PDF file was provided.")

    # Accept both a file-like upload and a plain path string.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    df = process_pdf_in_batches(pdf_file, batch_size, wait_time)
    output_path = f"{os.path.splitext(pdf_path)[0]}_extracted.csv"
    df.to_csv(output_path, index=False)
    logging.info(f"Data saved to {output_path}")
    return output_path
def gradio_interface(pdf_file, batch_size, wait_time):
    """Entry point used by the UI: run the extraction and return the CSV path."""
    return extract_and_save_pdf_content(pdf_file, batch_size, wait_time)
# Gradio Blocks UI: one upload field, two numeric settings, a trigger button
# and a download slot for the resulting CSV.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# ML-powered PDF Extractor")
    with gr.Row():
        gr.Markdown("Upload a PDF to extract text, titles, and tables into a structured CSV. Adjust batch size and wait time for optimal performance.")
    with gr.Row():
        # NOTE(review): newer Gradio releases may only accept "filepath" or
        # "binary" for `type` - confirm against the Gradio version this app pins.
        pdf_file = gr.File(type="file", label="Upload PDF")
    with gr.Row():
        batch_size = gr.Number(value=5, precision=0, label="Batch Size")
        wait_time = gr.Number(value=5, precision=1, label="Wait Time (seconds)")
    with gr.Row():
        extract_button = gr.Button("Extract PDF Content")
    with gr.Row():
        output_csv = gr.File(label="Download Extracted CSV")

    @spaces.GPU
    def on_extract(pdf_file, batch_size, wait_time):
        """Button callback: run the extraction and return the CSV path for download."""
        return gradio_interface(pdf_file, batch_size, wait_time)

    extract_button.click(
        fn=on_extract,
        inputs=[pdf_file, batch_size, wait_time],
        outputs=output_csv,
    )
# Enable request queuing, then start serving the app.
demo.queue().launch()