sadickam committed on
Commit
f2d2148
·
verified ·
1 Parent(s): c4707d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -47
app.py CHANGED
@@ -4,7 +4,67 @@ import os
4
  from langchain_community.document_loaders import UnstructuredPDFLoader
5
  from PyPDF2 import PdfReader
6
 
7
- def extract_and_save(pdf_file, extraction_option, start_page, end_page):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  """
9
  Main function to extract text based on user options and save to CSV.
10
 
@@ -13,12 +73,13 @@ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
13
  - extraction_option (str): 'All Pages' or 'Page Range'.
14
  - start_page (int): Starting page number (if applicable).
15
  - end_page (int): Ending page number (if applicable).
 
16
 
17
  Returns:
18
- - str: Path to the saved CSV file or error message.
19
  """
20
  if pdf_file is None:
21
- return "No file uploaded."
22
 
23
  pdf_file_path = pdf_file.name
24
 
@@ -27,9 +88,9 @@ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
27
  reader = PdfReader(pdf_file_path)
28
  total_pages = len(reader.pages)
29
  if total_pages == 0:
30
- return "The uploaded PDF has no pages."
31
  except Exception as e:
32
- return f"Error reading PDF: {e}"
33
 
34
  # Determine extraction parameters
35
  if extraction_option == "All Pages":
@@ -37,63 +98,34 @@ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
37
  else:
38
  # Validate start and end pages
39
  if start_page is None or end_page is None:
40
- return "Please specify both start and end pages."
41
  if start_page < 1 or end_page > total_pages:
42
- return f"Page range must be between 1 and {total_pages}."
43
  if start_page > end_page:
44
- return "Start page cannot be greater than end page."
45
  pages_to_extract = list(range(int(start_page), int(end_page) + 1))
46
 
47
- doc_name = os.path.basename(pdf_file_path)
48
  extracted_data = []
49
 
50
  try:
51
- with gr.Progress() as progress:
52
- for idx, page_num in enumerate(pages_to_extract, start=1):
53
- try:
54
- progress(1, description=f"Processing Page {page_num}/{len(pages_to_extract)}")
55
- loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1]) # Zero-based indexing
56
- documents = loader.load()
57
-
58
- if not documents:
59
- print(f"No content found on Page {page_num}.")
60
- continue
61
-
62
- # Concatenate all text from the page to preserve column integrity
63
- pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
64
- # Split content into paragraphs based on double newlines
65
- paragraphs = pdf_pages_content.split("\n\n")
66
-
67
- for para in paragraphs:
68
- if para.strip(): # Skip empty paragraphs
69
- extracted_data.append({
70
- "Document": doc_name,
71
- "Page": page_num,
72
- "Paragraph": para.strip()
73
- })
74
- except Exception as e:
75
- print(f"Error processing Page {page_num}: {e}")
76
- extracted_data.append({
77
- "Document": doc_name,
78
- "Page": page_num,
79
- "Paragraph": f"Error extracting this page: {e}"
80
- })
81
-
82
  except Exception as e:
83
- return f"An error occurred during extraction: {e}"
84
 
85
  if not extracted_data:
86
- return "No text extracted from the specified pages."
87
 
88
  # Save to CSV
89
  try:
90
  csv_filename = "extracted_content.csv"
91
- df = pd.DataFrame(extracted_data)
92
- df.to_csv(csv_filename, index=False)
93
  except Exception as e:
94
- return f"Error saving CSV: {e}"
95
 
96
- return csv_filename
97
 
98
  # Gradio Interface
99
  with gr.Blocks() as demo:
@@ -103,12 +135,17 @@ with gr.Blocks() as demo:
103
  Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
104
 
105
  ## How It Works
 
106
  1. **Upload PDF**: Select and upload your PDF file.
 
107
  2. **Choose Extraction Option**:
108
  - **All Pages**: Extract text from every page in the PDF.
109
  - **Page Range**: Specify the start and end pages to extract text from.
 
110
  3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
 
111
  4. **Progress Updates**: Watch the progress bar as each page is processed.
 
112
  5. **Download**: Once complete, download the CSV file containing the extracted data.
113
  """)
114
 
@@ -144,11 +181,12 @@ with gr.Blocks() as demo:
144
 
145
  with gr.Row():
146
  download_csv = gr.File(label="πŸ“₯ Download Extracted CSV")
 
147
 
148
  extract_button.click(
149
  fn=extract_and_save,
150
  inputs=[pdf_input, extraction_option, start_page, end_page],
151
- outputs=[download_csv],
152
  show_progress=True
153
  )
154
 
 
4
  from langchain_community.document_loaders import UnstructuredPDFLoader
5
  from PyPDF2 import PdfReader
6
 
7
def extract_text_by_page(pdf_file_path, page_num):
    """
    Pull the paragraphs of one PDF page into a list of row dictionaries.

    Parameters:
    - pdf_file_path (str): Path to the PDF file on disk.
    - page_num (int): Number of the page to extract (1-based).

    Returns:
    - list of dict: One row per non-empty paragraph, each with the keys
      "Document", "Page" and "Paragraph". The list is empty when the loader
      returns no documents. If anything raises while loading or splitting,
      the error is printed and an error row is appended instead.
    """
    source_name = os.path.basename(pdf_file_path)
    rows = []

    def make_row(text):
        # Every row carries the same document name and page number.
        return {"Document": source_name, "Page": page_num, "Paragraph": text}

    try:
        # The loader is handed page_num - 1, i.e. a zero-based page index.
        docs = UnstructuredPDFLoader(
            pdf_file_path, page_numbers=[page_num - 1]
        ).load()

        if docs:
            # Join every returned document into one text so that content
            # split across columns is treated as a single page body.
            page_text = '\n'.join(d.page_content for d in docs)

            # A blank line (double newline) marks a paragraph boundary.
            for chunk in page_text.split("\n\n"):
                cleaned = chunk.strip()
                if cleaned:  # whitespace-only chunks are dropped
                    rows.append(make_row(cleaned))
        else:
            print(f"No content found on Page {page_num}.")

    except Exception as err:
        # Best effort: record the failure as a row rather than aborting
        # the caller's page loop.
        print(f"Error processing Page {page_num}: {err}")
        rows.append(make_row(f"Error extracting this page: {err}"))

    return rows
51
+
52
def save_to_csv(data, output_filename="extracted_content.csv"):
    """
    Write the extracted rows to a CSV file and return the file's path.

    Parameters:
    - data (list of dict): Rows to write; they are turned into a pandas
      DataFrame before being written.
    - output_filename (str): Destination path of the CSV file.

    Returns:
    - str: The same path that was passed in as output_filename.
    """
    # index=False keeps the DataFrame's row index out of the written file.
    pd.DataFrame(data).to_csv(output_filename, index=False)
    return output_filename
66
+
67
+ def extract_and_save(pdf_file, extraction_option, start_page, end_page, progress):
68
  """
69
  Main function to extract text based on user options and save to CSV.
70
 
 
73
  - extraction_option (str): 'All Pages' or 'Page Range'.
74
  - start_page (int): Starting page number (if applicable).
75
  - end_page (int): Ending page number (if applicable).
76
+ - progress (gr.Progress): Gradio progress object.
77
 
78
  Returns:
79
+ - tuple: (csv_path, message)
80
  """
81
  if pdf_file is None:
82
+ return None, "❌ No file uploaded."
83
 
84
  pdf_file_path = pdf_file.name
85
 
 
88
  reader = PdfReader(pdf_file_path)
89
  total_pages = len(reader.pages)
90
  if total_pages == 0:
91
+ return None, "❌ The uploaded PDF has no pages."
92
  except Exception as e:
93
+ return None, f"❌ Error reading PDF: {e}"
94
 
95
  # Determine extraction parameters
96
  if extraction_option == "All Pages":
 
98
  else:
99
  # Validate start and end pages
100
  if start_page is None or end_page is None:
101
+ return None, "❌ Please specify both start and end pages."
102
  if start_page < 1 or end_page > total_pages:
103
+ return None, f"❌ Page range must be between 1 and {total_pages}."
104
  if start_page > end_page:
105
+ return None, "❌ Start page cannot be greater than end page."
106
  pages_to_extract = list(range(int(start_page), int(end_page) + 1))
107
 
 
108
  extracted_data = []
109
 
110
  try:
111
+ for idx, page_num in enumerate(pages_to_extract, start=1):
112
+ progress(1, description=f"πŸ” Processing Page {page_num}/{len(pages_to_extract)}")
113
+ page_data = extract_text_by_page(pdf_file_path, page_num)
114
+ extracted_data.extend(page_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  except Exception as e:
116
+ return None, f"❌ An error occurred during extraction: {e}"
117
 
118
  if not extracted_data:
119
+ return None, "❌ No text extracted from the specified pages."
120
 
121
  # Save to CSV
122
  try:
123
  csv_filename = "extracted_content.csv"
124
+ csv_path = save_to_csv(extracted_data, csv_filename)
 
125
  except Exception as e:
126
+ return None, f"❌ Error saving CSV: {e}"
127
 
128
+ return csv_path, "βœ… Extraction successful! Download your CSV file below."
129
 
130
  # Gradio Interface
131
  with gr.Blocks() as demo:
 
135
  Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
136
 
137
  ## How It Works
138
+
139
  1. **Upload PDF**: Select and upload your PDF file.
140
+
141
  2. **Choose Extraction Option**:
142
  - **All Pages**: Extract text from every page in the PDF.
143
  - **Page Range**: Specify the start and end pages to extract text from.
144
+
145
  3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
146
+
147
  4. **Progress Updates**: Watch the progress bar as each page is processed.
148
+
149
  5. **Download**: Once complete, download the CSV file containing the extracted data.
150
  """)
151
 
 
181
 
182
  with gr.Row():
183
  download_csv = gr.File(label="πŸ“₯ Download Extracted CSV")
184
+ message = gr.Textbox(label="Message", interactive=False, lines=2)
185
 
186
  extract_button.click(
187
  fn=extract_and_save,
188
  inputs=[pdf_input, extraction_option, start_page, end_page],
189
+ outputs=[download_csv, message],
190
  show_progress=True
191
  )
192