sadickam committed on
Commit
b2e0c78
·
verified ·
1 Parent(s): 01f3b85

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +128 -0
app.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ensure Poppler is installed
2
+ from install_poppler import install_poppler
3
+
4
+ install_poppler() # Run the Poppler installation function
5
+
6
+ import layoutparser as lp
7
+ from pdf2image import convert_from_path
8
+ import pytesseract
9
+ import pandas as pd
10
+ import torch
11
+ import gradio as gr
12
+ import logging
13
+ import time
14
+ import os
15
+ import spaces
16
+
17
# Route log records of level INFO and above to a local file, each line
# prefixed with a timestamp and the record's level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename="pdf_extraction.log",
)
23
+
24
# Pick the compute device once at import time: CUDA when torch reports it
# as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Layout detector built from the PubLayNet Faster R-CNN config. Detections
# scoring below 0.8 are discarded via the Detectron2 config override, and
# the integer class ids are mapped to readable labels.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8],
    label_map={
        0: "Text",
        1: "Title",
        2: "List",
        3: "Table",
        4: "Figure",
    },
    device=device,
)
32
+
33
def pdf_to_images(pdf_path, start_page=0, end_page=None):
    """Render a range of PDF pages to images at 300 dpi.

    Args:
        pdf_path: Location of the PDF, forwarded to ``convert_from_path``.
        start_page: Offset of the first page to render; it is passed on as
            ``first_page=start_page + 1``.
        end_page: Forwarded unchanged as ``last_page``; ``None`` leaves the
            upper bound to ``convert_from_path``.

    Returns:
        Whatever ``convert_from_path`` returns for the requested range.
    """
    first = start_page + 1
    rendered = convert_from_path(
        pdf_path,
        dpi=300,
        first_page=first,
        last_page=end_page,
    )
    return rendered
36
+
37
def extract_layout_elements(image):
    """Run layout detection on a page image and split the result by kind.

    Args:
        image: Page image handed to ``model.detect``.

    Returns:
        A ``(text_blocks, table_blocks)`` pair of ``lp.Layout`` objects.
        The first holds detections typed "Text" or "Title", the second
        those typed "Table"; every other detection is dropped.
    """
    textual = []
    tabular = []
    for element in model.detect(image):
        if element.type in ("Text", "Title"):
            textual.append(element)
        if element.type == "Table":
            tabular.append(element)
    return lp.Layout(textual), lp.Layout(tabular)
43
+
44
def extract_text_from_block(image, block):
    """OCR the region of *image* covered by *block*.

    Args:
        image: Page image; must provide ``crop``.
        block: Layout element whose ``coordinates`` give the crop box.

    Returns:
        The recognised text with leading and trailing whitespace removed.
    """
    cropped = image.crop(block.coordinates)
    return pytesseract.image_to_string(cropped).strip()
49
+
50
def process_pdf_in_batches(pdf_file, batch_size, wait_time):
    """Process the PDF in batches and return a DataFrame.

    Each batch of pages is rendered to images, run through layout
    detection, and OCR'd. Text and title blocks become one row each;
    table regions are OCR'd as a whole and contribute one row per
    non-blank line.

    Args:
        pdf_file: Path to the PDF (``str`` or ``os.PathLike``), or an
            uploaded-file object that exposes the path as ``.name``.
        batch_size: Number of pages per batch. Values below 1 are treated
            as 1 so the page loop always advances.
        wait_time: Seconds to pause between consecutive batches.

    Returns:
        A DataFrame with columns ``Document``, ``Page``, ``Content_Type``
        and ``Content``. Batches that fail are logged and skipped, so the
        frame may be missing pages (or be empty) rather than raising.
    """
    # Local import: the file header only pulls in convert_from_path.
    from pdf2image import pdfinfo_from_path

    # pdf2image needs a filesystem path, not the upload wrapper object.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    batch_size = max(1, int(batch_size))

    # Ask Poppler for the real page count. The previous approach rendered
    # pages 1-2 and took len(), which capped every document at two pages.
    num_pages = pdfinfo_from_path(pdf_path)["Pages"]
    data = []

    for batch_start in range(0, num_pages, batch_size):
        batch_end = min(batch_start + batch_size, num_pages)
        logging.info(f"Processing pages {batch_start + 1} to {batch_end}...")

        try:
            images = pdf_to_images(pdf_path, start_page=batch_start, end_page=batch_end)

            for page_num, image in enumerate(images, start=batch_start + 1):
                text_blocks, table_blocks = extract_layout_elements(image)

                for block in text_blocks:
                    text_content = extract_text_from_block(image, block)
                    content_type = "Title" if block.type == "Title" else "Paragraph"
                    data.append([pdf_path, page_num, content_type, text_content])

                for table in table_blocks:
                    table_image = image.crop(table.coordinates)
                    # --psm 6 is passed through to Tesseract as-is.
                    table_text = pytesseract.image_to_string(table_image, config='--psm 6')
                    data.extend(
                        [pdf_path, page_num, "TableRow", row]
                        for row in table_text.splitlines()
                        if row.strip()
                    )

        except Exception as e:
            # Best effort: one bad batch must not abort the whole document.
            # logging.exception keeps the traceback alongside the message.
            logging.exception(f"Error processing pages {batch_start + 1} to {batch_end}: {str(e)}")

        logging.info(f"Completed batch {batch_start + 1} to {batch_end}")

        # Pause only between batches; sleeping after the last one just
        # delays the result.
        if batch_end < num_pages:
            time.sleep(wait_time)

    df = pd.DataFrame(data, columns=["Document", "Page", "Content_Type", "Content"])
    return df
85
+
86
def extract_and_save_pdf_content(pdf_file, batch_size, wait_time):
    """Extract the PDF's content and write it next to the input as CSV.

    Args:
        pdf_file: Uploaded-file object; its ``.name`` is used to derive the
            output location.
        batch_size: Forwarded to ``process_pdf_in_batches``.
        wait_time: Forwarded to ``process_pdf_in_batches``.

    Returns:
        Path of the written CSV: the input name with its extension replaced
        by ``_extracted.csv``.
    """
    extracted = process_pdf_in_batches(pdf_file, batch_size, wait_time)

    stem, _extension = os.path.splitext(pdf_file.name)
    output_path = f"{stem}_extracted.csv"

    extracted.to_csv(output_path, index=False)
    logging.info(f"Data saved to {output_path}")
    return output_path
93
+
94
def gradio_interface(pdf_file, batch_size, wait_time):
    """Run the extraction for the UI and hand back the CSV path.

    Thin pass-through to ``extract_and_save_pdf_content``; the arguments
    are forwarded unchanged and its return value is returned as-is.
    """
    return extract_and_save_pdf_content(pdf_file, batch_size, wait_time)
98
+
99
# Gradio Blocks Interface
with gr.Blocks() as demo:
    # Header: title, then a one-line usage hint.
    with gr.Row():
        gr.Markdown("# ML-powered PDF Extractor")
    with gr.Row():
        gr.Markdown("Upload a PDF to extract text, titles, and tables into a structured CSV. Adjust batch size and wait time for optimal performance.")

    # Input: the PDF to process.
    with gr.Row():
        pdf_file = gr.File(label="Upload PDF", type="file")

    # Tuning knobs forwarded to the batch processor.
    with gr.Row():
        batch_size = gr.Number(label="Batch Size", value=5, precision=0)
        wait_time = gr.Number(label="Wait Time (seconds)", value=5, precision=1)

    with gr.Row():
        extract_button = gr.Button("Extract PDF Content")

    # Output: the generated CSV, offered as a download.
    with gr.Row():
        output_csv = gr.File(label="Download Extracted CSV")

    @spaces.GPU
    def on_extract(pdf_file, batch_size, wait_time):
        """Button callback: run the extraction and return the CSV path."""
        return gradio_interface(pdf_file, batch_size, wait_time)

    extract_button.click(
        fn=on_extract,
        inputs=[pdf_file, batch_size, wait_time],
        outputs=output_csv,
    )

# Launch the app
demo.queue().launch()