sadickam committed on
Commit
3428389
·
verified ·
1 Parent(s): 694498e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +101 -36
app.py CHANGED
@@ -1,58 +1,123 @@
1
  import gradio as gr
2
  import pandas as pd
3
- from langchain_community.document_loaders import UnstructuredFileLoader
 
 
 
4
 
5
- def extract_text_with_langchain_pdf(pdf_file):
6
- """Extract text from a PDF page by page using LangChain's UnstructuredFileLoader."""
7
- loader = UnstructuredFileLoader(pdf_file) # Use the file path directly
8
- documents = loader.load()
 
 
 
 
9
 
10
- # Initialize an empty list to collect all extracted paragraphs
 
 
 
 
 
 
 
 
 
 
 
11
  extracted_data = []
12
 
13
- # Extract content for each page, split into paragraphs, and collect metadata
14
- doc_name = pdf_file.split("/")[-1] # Get the document name
15
- for doc in documents:
16
- page_num = doc.metadata.get("page_number", "Unknown") # Get the page number if available
17
- paragraphs = doc.page_content.split("\n\n") # Split content by paragraphs
18
-
19
- for paragraph in paragraphs:
20
- if paragraph.strip(): # Skip empty paragraphs
21
- extracted_data.append({
22
- "Document": doc_name,
23
- "Page": page_num,
24
- "Paragraph": paragraph.strip()
25
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  # Convert the extracted data to a DataFrame
28
  df = pd.DataFrame(extracted_data)
29
- return df
 
 
 
 
 
 
30
 
31
- def save_df_to_csv(df, output_filename="extracted_content.csv"):
32
- """Save the DataFrame to a CSV file."""
33
- df.to_csv(output_filename, index=False)
34
  return output_filename
35
 
 
36
  with gr.Blocks() as demo:
37
- with gr.Row():
38
- gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
39
 
40
- with gr.Row():
41
- pdf_file = gr.File(label="Upload PDF", type="filepath")
42
 
43
- with gr.Row():
44
- extract_button = gr.Button("Extract and Download CSV")
 
 
 
 
45
 
46
  with gr.Row():
47
- download_button = gr.File(label="Download Extracted CSV")
48
 
49
- def on_extract(pdf_file):
50
- """Callback function to extract text, store in a DataFrame, and return a downloadable CSV."""
51
- df = extract_text_with_langchain_pdf(pdf_file)
52
- csv_path = save_df_to_csv(df)
53
- return csv_path
54
 
55
- extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 
56
 
 
 
 
 
 
 
 
 
57
  # Launch the Gradio
58
  demo.queue().launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import os
4
+ from langchain_community.document_loaders import UnstructuredPDFLoader
5
+ from PyPDF2 import PdfReader
6
+ import concurrent.futures
7
 
8
def _split_paragraphs(text):
    """Return the stripped, non-empty paragraphs of *text*, split on blank lines."""
    return [part.strip() for part in text.split("\n\n") if part.strip()]


def extract_and_save(pdf_file, progress=gr.Progress()):
    """
    Extract text from each page of a PDF and save it as a CSV file.

    Each page is loaded with UnstructuredPDFLoader, split into paragraphs on
    blank lines, and recorded as one row per paragraph with the columns
    "Document", "Page" and "Paragraph". Pages are processed on a small thread
    pool and the progress tracker is advanced as each page finishes. Rows are
    written in ascending page order.

    Args:
        pdf_file: Path of the uploaded PDF as a string (what
            ``gr.File(type="filepath")`` passes), or an object exposing the
            path as ``.name``.
        progress: Gradio progress tracker used to report per-page progress.

    Returns:
        The path of the CSV file that was written.

    Raises:
        gr.Error: If no file was uploaded, the PDF cannot be read, it has no
            pages, no text could be extracted, or the CSV cannot be saved.
            Raising (rather than returning the message) keeps a message
            string from being handed to the file output as if it were a path.
    """
    if pdf_file is None:
        raise gr.Error("No file uploaded.")

    # A "filepath" upload arrives as a plain string; older file wrappers carry
    # the path in `.name`. Accept both.
    pdf_file_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    doc_name = os.path.basename(pdf_file_path)

    # Use PyPDF2 only to learn how many pages there are.
    try:
        num_pages = len(PdfReader(pdf_file_path).pages)
    except Exception as e:
        raise gr.Error(f"Error reading PDF: {e}") from e
    # Checked outside the try so this error is not re-wrapped as a read error.
    if num_pages == 0:
        raise gr.Error("The uploaded PDF has no pages.")

    def process_page(page_num):
        """
        Extract paragraphs from a single (1-based) page.

        Returns a list of dictionaries with Document, Page, and Paragraph.
        Failures are logged and yield an empty list so that one bad page does
        not abort the whole document.
        """
        try:
            # NOTE(review): confirm that this loader actually honours
            # `page_numbers`; if the keyword is ignored, every task parses the
            # whole document and labels all of it with `page_num`.
            loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num - 1])  # Zero-based indexing
            documents = loader.load()
        except Exception as e:
            print(f"Error processing Page {page_num}: {e}")
            return []
        if not documents:
            print(f"No content found on Page {page_num}.")
            return []
        # `page_content` is already a string; split it directly.
        return [
            {"Document": doc_name, "Page": page_num, "Paragraph": paragraph}
            for doc in documents
            for paragraph in _split_paragraphs(doc.page_content)
        ]

    # Use a small thread pool; never start more workers than there are pages.
    rows_by_page = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, num_pages)) as executor:
        future_to_page = {
            executor.submit(process_page, page_num): page_num
            for page_num in range(1, num_pages + 1)
        }
        finished = concurrent.futures.as_completed(future_to_page)
        for completed, future in enumerate(finished, start=1):
            page_num = future_to_page[future]
            rows_by_page[page_num] = future.result()
            # Report the fraction of pages done so far.
            progress(completed / num_pages, desc=f"Processed page {page_num}/{num_pages}")

    # Futures complete in arbitrary order; emit rows in page order.
    extracted_data = [
        row
        for page_num in sorted(rows_by_page)
        for row in rows_by_page[page_num]
    ]
    if not extracted_data:
        raise gr.Error("No text extracted from the PDF.")

    # Convert the extracted data to a DataFrame
    df = pd.DataFrame(extracted_data)

    # Save the DataFrame to a CSV file
    output_filename = "extracted_content.csv"
    try:
        df.to_csv(output_filename, index=False)
    except OSError as e:
        raise gr.Error(f"Error saving CSV: {e}") from e

    return output_filename
90
 
91
# Gradio interface: upload a PDF, run the extraction, download the CSV.
with gr.Blocks() as demo:
    # Introductory text shown at the top of the page.
    gr.Markdown("""
# 📄 PDF Text Extractor with Metadata and CSV Export

Upload a PDF document to extract its text content. The app processes the PDF **page by page**, splits the text into **paragraphs**, tracks **page numbers** and the **document name**, and compiles the results into a **CSV file** for download.

## How It Works
1. **Upload PDF**: Select and upload your PDF file.
2. **Extract**: Click the "Extract and Download CSV" button to begin extraction.
3. **Progress Updates**: Watch the progress bar as each page is processed.
4. **Download**: Once complete, download the CSV file containing the extracted data.
""")

    with gr.Row():
        # type="filepath" hands the handler the uploaded file's path as a string.
        pdf_input = gr.File(label="📁 Upload PDF", type="filepath")

    with gr.Row():
        extract_button = gr.Button("🟢 Extract and Download CSV")

    with gr.Row():
        download_csv = gr.File(label="📥 Download Extracted CSV")

    # Link the button to the extraction function with progress enabled
    extract_button.click(
        fn=extract_and_save,
        inputs=pdf_input,
        outputs=download_csv,
        show_progress="full",  # Show the full progress display while extracting
    )

# Launch the Gradio app
demo.queue().launch()