sadickam committed on
Commit
9f42776
·
verified ·
1 Parent(s): ff0f9fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -65
app.py CHANGED
@@ -1,83 +1,58 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import time
4
- import os
5
- from langchain_community.document_loaders import UnstructuredPDFLoader
6
- from PyPDF2 import PdfReader
7
 
8
- def extract_text_by_page(pdf_file_path):
9
- """Extract text from each page of the PDF and return as a list of dictionaries."""
10
- # Initialize PDF reader
11
- reader = PdfReader(pdf_file_path)
12
- num_pages = len(reader.pages)
13
- doc_name = os.path.basename(pdf_file_path)
14
 
 
15
  extracted_data = []
16
 
17
- for page_num in range(1, num_pages + 1):
18
- print(f"Processing Page {page_num}...")
19
- # Initialize the loader for the specific page
20
- loader = UnstructuredPDFLoader(pdf_file_path, page_numbers=[page_num-1]) # Zero-based indexing
21
- documents = loader.load()
22
-
23
- if not documents:
24
- print(f"No content found on Page {page_num}.")
25
- continue
26
-
27
- for doc in documents:
28
- paragraphs = doc.page_content.split("\n\n") # Split text into paragraphs
29
- for para in paragraphs:
30
- if para.strip(): # Skip empty paragraphs
31
- extracted_data.append({
32
- "Document": doc_name,
33
- "Page": page_num,
34
- "Paragraph": para.strip()
35
- })
36
 
37
- time.sleep(1) # Optional: Introduce a small delay between pages
38
-
39
- return extracted_data
40
-
41
- def save_to_csv(data, output_filename="extracted_content.csv"):
42
- """Save extracted data to a CSV file."""
43
- df = pd.DataFrame(data)
 
 
 
 
 
 
 
44
  df.to_csv(output_filename, index=False)
45
  return output_filename
46
 
47
- def extract_and_save(pdf_file):
48
- """Main function to extract text and save to CSV."""
49
- if pdf_file is None:
50
- return "No file uploaded."
51
-
52
- # Extract text by page
53
- extracted_data = extract_text_by_page(pdf_file.name)
54
-
55
- if not extracted_data:
56
- return "No text extracted from the PDF."
57
-
58
- # Save to CSV
59
- csv_path = save_to_csv(extracted_data)
60
-
61
- return csv_path
62
-
63
- # Gradio Interface
64
  with gr.Blocks() as demo:
65
- gr.Markdown("# PDF Text Extractor with Page Tracking and CSV Export")
66
-
67
  with gr.Row():
68
- pdf_input = gr.File(label="Upload PDF", type="filepath")
69
-
 
 
 
70
  with gr.Row():
71
  extract_button = gr.Button("Extract and Download CSV")
72
-
73
  with gr.Row():
74
- download_csv = gr.File(label="Download Extracted CSV")
75
-
76
- extract_button.click(
77
- fn=extract_and_save,
78
- inputs=pdf_input,
79
- outputs=download_csv
80
- )
 
 
81
 
82
  # Launch the Gradio app
83
  demo.queue().launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
+ from langchain_community.document_loaders import UnstructuredFileLoader
 
 
 
4
 
5
+ def extract_text_with_langchain_pdf(pdf_file):
6
+ """Extract text from a PDF page by page using LangChain's UnstructuredFileLoader."""
7
+ loader = UnstructuredFileLoader(pdf_file) # Use the file path directly
8
+ documents = loader.load()
 
 
9
 
10
+ # Initialize an empty list to collect all extracted paragraphs
11
  extracted_data = []
12
 
13
+ # Extract content for each page, split into paragraphs, and collect metadata
14
+ doc_name = pdf_file.split("/")[-1] # Get the document name
15
+ for doc in documents:
16
+ page_num = doc.metadata.get("page_number", "Unknown") # Get the page number if available
17
+ paragraphs = doc.page_content.split("\n\n") # Split content by paragraphs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ for paragraph in paragraphs:
20
+ if paragraph.strip(): # Skip empty paragraphs
21
+ extracted_data.append({
22
+ "Document": doc_name,
23
+ "Page": page_num,
24
+ "Paragraph": paragraph.strip()
25
+ })
26
+
27
+ # Convert the extracted data to a DataFrame
28
+ df = pd.DataFrame(extracted_data)
29
+ return df
30
+
31
+ def save_df_to_csv(df, output_filename="extracted_content.csv"):
32
+ """Save the DataFrame to a CSV file."""
33
  df.to_csv(output_filename, index=False)
34
  return output_filename
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  with gr.Blocks() as demo:
 
 
37
  with gr.Row():
38
+ gr.Markdown("# PDF Text Extractor with Metadata and CSV Export")
39
+
40
+ with gr.Row():
41
+ pdf_file = gr.File(label="Upload PDF", type="filepath")
42
+
43
  with gr.Row():
44
  extract_button = gr.Button("Extract and Download CSV")
45
+
46
  with gr.Row():
47
+ download_button = gr.File(label="Download Extracted CSV")
48
+
49
+ def on_extract(pdf_file):
50
+ """Callback function to extract text, store in a DataFrame, and return a downloadable CSV."""
51
+ df = extract_text_with_langchain_pdf(pdf_file)
52
+ csv_path = save_df_to_csv(df)
53
+ return csv_path
54
+
55
+ extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
56
 
57
  # Launch the Gradio app
58
  demo.queue().launch()