sadickam committed on
Commit
c4707d0
·
verified ·
1 Parent(s): af972d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -72
app.py CHANGED
@@ -3,120 +3,153 @@ import pandas as pd
3
  import os
4
  from langchain_community.document_loaders import UnstructuredPDFLoader
5
  from PyPDF2 import PdfReader
6
- import concurrent.futures
7
 
8
- def extract_and_save(pdf_file, progress=gr.Progress()):
9
  """
10
- Extract text from each page of the PDF, split into paragraphs,
11
- track page numbers and document name, append to DataFrame,
12
- and save as a CSV file with progress updates.
 
 
 
 
 
 
 
13
  """
14
  if pdf_file is None:
15
  return "No file uploaded."
16
 
17
  pdf_file_path = pdf_file.name
18
- doc_name = os.path.basename(pdf_file_path)
19
-
20
- # Initialize PDF reader to get the number of pages
21
  try:
22
  reader = PdfReader(pdf_file_path)
23
- num_pages = len(reader.pages)
24
- if num_pages == 0:
25
  return "The uploaded PDF has no pages."
26
  except Exception as e:
27
  return f"Error reading PDF: {e}"
28
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  extracted_data = []
30
 
31
- def process_page(page_num):
32
- """
33
- Extract paragraphs from a single page.
34
- Returns a list of dictionaries with Document, Page, and Paragraph.
35
- """
36
- try:
37
- loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1]) # Zero-based indexing
38
- documents = loader.load()
39
- if not documents:
40
- print(f"No content found on Page {page_num}.")
41
- return []
42
- page_data = []
43
- for doc in documents:
44
- # Split content into paragraphs based on double newlines
45
- page_text = '\n'.join(doc.page_content)
46
- paragraphs = page_text.split("\n\n")
47
- for para in paragraphs:
48
- if para.strip(): # Skip empty paragraphs
49
- page_data.append({
50
- "Document": doc_name,
51
- "Page": page_num,
52
- "Paragraph": para.strip()
53
- })
54
- return page_data
55
- except Exception as e:
56
- print(f"Error processing Page {page_num}: {e}")
57
- return []
58
-
59
- # Use ThreadPoolExecutor for parallel processing
60
- max_workers = min(3, 6) # Limit the number of threads to prevent resource exhaustion
61
- with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
62
- # Submit all page processing tasks
63
- future_to_page = {executor.submit(process_page, page_num): page_num for page_num in range(1, num_pages + 1)}
64
- completed = 0
65
- for future in concurrent.futures.as_completed(future_to_page):
66
- page_num = future_to_page[future]
67
- try:
68
- page_data = future.result()
69
- extracted_data.extend(page_data)
70
- except Exception as e:
71
- print(f"Error processing Page {page_num}: {e}")
72
- completed += 1
73
- # Update progress: 1 step per completed page
74
- progress(1, description=f"Processed page {page_num}/{num_pages}")
75
 
76
- if not extracted_data:
77
- return "No text extracted from the PDF."
 
 
 
 
 
 
78
 
79
- # Convert the extracted data to a DataFrame
80
- df = pd.DataFrame(extracted_data)
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
- # Save the DataFrame to a CSV file
83
- output_filename = "extracted_content.csv"
 
 
 
 
 
84
  try:
85
- df.to_csv(output_filename, index=False)
 
 
86
  except Exception as e:
87
  return f"Error saving CSV: {e}"
88
 
89
- return output_filename
90
 
91
  # Gradio Interface
92
  with gr.Blocks() as demo:
93
  gr.Markdown("""
94
- # 📄 PDF Text Extractor with Metadata and CSV Export
95
 
96
- Upload a PDF document to extract its text content. The app processes the PDF **page by page**, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
97
 
98
  ## How It Works
99
  1. **Upload PDF**: Select and upload your PDF file.
100
- 2. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
101
- 3. **Progress Updates**: Watch the progress bar as each page is processed.
102
- 4. **Download**: Once complete, download the CSV file containing the extracted data.
 
 
 
103
  """)
104
 
105
  with gr.Row():
106
  pdf_input = gr.File(label="πŸ“ Upload PDF", type="filepath")
107
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  with gr.Row():
109
  extract_button = gr.Button("🟢 Extract and Download CSV")
110
 
111
  with gr.Row():
112
  download_csv = gr.File(label="📥 Download Extracted CSV")
113
 
114
- # Link the button to the extraction function with progress enabled
115
  extract_button.click(
116
  fn=extract_and_save,
117
- inputs=pdf_input,
118
- outputs=download_csv,
119
- show_progress=True # Enables the progress bar
120
  )
121
 
122
  # Launch the Gradio
 
3
  import os
4
  from langchain_community.document_loaders import UnstructuredPDFLoader
5
  from PyPDF2 import PdfReader
 
6
 
7
+ def extract_and_save(pdf_file, extraction_option, start_page, end_page):
8
  """
9
+ Main function to extract text based on user options and save to CSV.
10
+
11
+ Parameters:
12
+ - pdf_file (File): Uploaded PDF file.
13
+ - extraction_option (str): 'All Pages' or 'Page Range'.
14
+ - start_page (int): Starting page number (if applicable).
15
+ - end_page (int): Ending page number (if applicable).
16
+
17
+ Returns:
18
+ - str: Path to the saved CSV file or error message.
19
  """
20
  if pdf_file is None:
21
  return "No file uploaded."
22
 
23
  pdf_file_path = pdf_file.name
24
+
25
+ # Initialize PDF reader to get total pages
 
26
  try:
27
  reader = PdfReader(pdf_file_path)
28
+ total_pages = len(reader.pages)
29
+ if total_pages == 0:
30
  return "The uploaded PDF has no pages."
31
  except Exception as e:
32
  return f"Error reading PDF: {e}"
33
+
34
+ # Determine extraction parameters
35
+ if extraction_option == "All Pages":
36
+ pages_to_extract = list(range(1, total_pages + 1))
37
+ else:
38
+ # Validate start and end pages
39
+ if start_page is None or end_page is None:
40
+ return "Please specify both start and end pages."
41
+ if start_page < 1 or end_page > total_pages:
42
+ return f"Page range must be between 1 and {total_pages}."
43
+ if start_page > end_page:
44
+ return "Start page cannot be greater than end page."
45
+ pages_to_extract = list(range(int(start_page), int(end_page) + 1))
46
+
47
+ doc_name = os.path.basename(pdf_file_path)
48
  extracted_data = []
49
 
50
+ try:
51
+ with gr.Progress() as progress:
52
+ for idx, page_num in enumerate(pages_to_extract, start=1):
53
+ try:
54
+ progress(1, description=f"Processing Page {page_num}/{len(pages_to_extract)}")
55
+ loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1]) # Zero-based indexing
56
+ documents = loader.load()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
+ if not documents:
59
+ print(f"No content found on Page {page_num}.")
60
+ continue
61
+
62
+ # Concatenate all text from the page to preserve column integrity
63
+ pdf_pages_content = '\n'.join(doc.page_content for doc in documents)
64
+ # Split content into paragraphs based on double newlines
65
+ paragraphs = pdf_pages_content.split("\n\n")
66
 
67
+ for para in paragraphs:
68
+ if para.strip(): # Skip empty paragraphs
69
+ extracted_data.append({
70
+ "Document": doc_name,
71
+ "Page": page_num,
72
+ "Paragraph": para.strip()
73
+ })
74
+ except Exception as e:
75
+ print(f"Error processing Page {page_num}: {e}")
76
+ extracted_data.append({
77
+ "Document": doc_name,
78
+ "Page": page_num,
79
+ "Paragraph": f"Error extracting this page: {e}"
80
+ })
81
 
82
+ except Exception as e:
83
+ return f"An error occurred during extraction: {e}"
84
+
85
+ if not extracted_data:
86
+ return "No text extracted from the specified pages."
87
+
88
+ # Save to CSV
89
  try:
90
+ csv_filename = "extracted_content.csv"
91
+ df = pd.DataFrame(extracted_data)
92
+ df.to_csv(csv_filename, index=False)
93
  except Exception as e:
94
  return f"Error saving CSV: {e}"
95
 
96
+ return csv_filename
97
 
98
  # Gradio Interface
99
  with gr.Blocks() as demo:
100
  gr.Markdown("""
101
+ # 📄 PDF Text Extractor with Page Range Selection and CSV Export
102
 
103
+ Upload a PDF document to extract its text content. Choose to extract text from **all pages** or a **specific range of pages**. The app processes the PDF **page by page**, concatenates column texts to maintain paragraph integrity, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.
104
 
105
  ## How It Works
106
  1. **Upload PDF**: Select and upload your PDF file.
107
+ 2. **Choose Extraction Option**:
108
+ - **All Pages**: Extract text from every page in the PDF.
109
+ - **Page Range**: Specify the start and end pages to extract text from.
110
+ 3. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
111
+ 4. **Progress Updates**: Watch the progress bar as each page is processed.
112
+ 5. **Download**: Once complete, download the CSV file containing the extracted data.
113
  """)
114
 
115
  with gr.Row():
116
  pdf_input = gr.File(label="πŸ“ Upload PDF", type="filepath")
117
 
118
+ with gr.Row():
119
+ extraction_option = gr.Radio(
120
+ choices=["All Pages", "Page Range"],
121
+ value="All Pages",
122
+ label="Extraction Option"
123
+ )
124
+
125
+ with gr.Row():
126
+ start_page = gr.Number(label="πŸ“ Start Page", value=1, precision=0, visible=False)
127
+ end_page = gr.Number(label="πŸ“ End Page", value=1, precision=0, visible=False)
128
+
129
+ # Show or hide start/end page inputs based on extraction option
130
+ def toggle_page_range(option):
131
+ if option == "Page Range":
132
+ return gr.update(visible=True), gr.update(visible=True)
133
+ else:
134
+ return gr.update(visible=False), gr.update(visible=False)
135
+
136
+ extraction_option.change(
137
+ fn=toggle_page_range,
138
+ inputs=[extraction_option],
139
+ outputs=[start_page, end_page]
140
+ )
141
+
142
  with gr.Row():
143
  extract_button = gr.Button("🟢 Extract and Download CSV")
144
 
145
  with gr.Row():
146
  download_csv = gr.File(label="📥 Download Extracted CSV")
147
 
 
148
  extract_button.click(
149
  fn=extract_and_save,
150
+ inputs=[pdf_input, extraction_option, start_page, end_page],
151
+ outputs=[download_csv],
152
+ show_progress=True
153
  )
154
 
155
  # Launch the Gradio