sadickam committed on
Commit
de88355
·
verified ·
1 Parent(s): d2db20e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -57
app.py CHANGED
@@ -1,78 +1,45 @@
1
  import gradio as gr
2
- import pandas as pd
3
- import time
4
- from langchain_community.document_loaders import UnstructuredFileLoader # Updated import
5
 
6
  def extract_text_with_langchain_pdf(pdf_file):
7
  """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
8
- loader = UnstructuredFileLoader(pdf_file) # Pass the filepath directly
9
  documents = loader.load()
10
 
11
- # Collect text per page and return as a list of tuples (page_num, paragraph)
12
- extracted_data = []
13
  for doc in documents:
14
- page_num = doc.metadata.get("page_number", "Unknown") # Extract page number if available
15
- paragraphs = doc.page_content.split("\n\n") # Split text by paragraphs
16
- for para in paragraphs:
17
- if para.strip(): # Skip empty paragraphs
18
- extracted_data.append((page_num, para.strip()))
19
-
20
- return extracted_data
21
-
22
- def process_pdf_with_batches(pdf_file, batch_size, wait_time):
23
- """Extract text, split into batches, and store in a DataFrame."""
24
- extracted_data = extract_text_with_langchain_pdf(pdf_file)
25
- doc_name = pdf_file.split("/")[-1]
26
-
27
- # Create a DataFrame from the extracted data
28
- df = pd.DataFrame(extracted_data, columns=["Page", "Paragraph"])
29
- df["Document"] = doc_name # Add document name as a column
30
-
31
- # Split the DataFrame into batches for display
32
- batches = [df[i:i + batch_size] for i in range(0, len(df), batch_size)]
33
 
34
- output = []
35
- for idx, batch in enumerate(batches):
36
- output.append(f"Batch {idx + 1}:\n{batch.to_string(index=False)}")
37
- time.sleep(wait_time) # Wait between batches
38
 
39
- return df, "\n\n".join(output)
40
-
41
- def save_csv(df):
42
- """Save the extracted DataFrame to a CSV file."""
43
- output_path = "extracted_content.csv"
44
- df.to_csv(output_path, index=False)
45
- return output_path
46
 
47
  with gr.Blocks() as demo:
48
  with gr.Row():
49
- gr.Markdown("# Enhanced PDF Text Extractor with LangChain")
50
 
51
  with gr.Row():
52
- pdf_file = gr.File(label="Upload PDF", type="filepath") # Updated type to 'filepath'
53
 
54
  with gr.Row():
55
- batch_size = gr.Slider(label="Batch Size (rows)", value=10, minimum=1, maximum=50, step=1)
56
- wait_time = gr.Slider(label="Wait Time (seconds)", value=2, minimum=0, maximum=10, step=0.5)
57
 
58
  with gr.Row():
59
- extract_button = gr.Button("Extract and Save CSV")
60
 
61
- with gr.Row():
62
- output_text = gr.Textbox(label="Extracted Text", lines=20, interactive=False)
63
- download_button = gr.File(label="Download Extracted CSV")
64
-
65
- def on_extract(pdf_file, batch_size, wait_time):
66
- """Callback function to extract text, display batches, and save CSV."""
67
- df, batch_output = process_pdf_with_batches(pdf_file, int(batch_size), wait_time)
68
- csv_path = save_csv(df)
69
- return batch_output, csv_path
70
 
71
- extract_button.click(
72
- on_extract,
73
- inputs=[pdf_file, batch_size, wait_time],
74
- outputs=[output_text, download_button]
75
- )
76
 
77
- # Launch the Gradio app
78
- demo.queue().launch()
 
1
  import gradio as gr
2
+ from langchain_community.document_loaders import UnstructuredFileLoader
 
 
3
 
4
  def extract_text_with_langchain_pdf(pdf_file):
5
  """Extract text from a PDF using LangChain's UnstructuredFileLoader."""
6
+ loader = UnstructuredFileLoader(pdf_file) # Use the file path directly
7
  documents = loader.load()
8
 
9
+ # Concatenate the content from all pages with page numbers
10
+ pdf_content = ""
11
  for doc in documents:
12
+ page_num = doc.metadata.get("page_number", "Unknown") # Get the page number if available
13
+ pdf_content += f"\n\n--- Page {page_num} ---\n{doc.page_content.strip()}\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
+ return pdf_content
 
 
 
16
 
17
+ def save_text_to_file(text, output_filename="extracted_content.txt"):
18
+ """Save extracted text to a .txt file."""
19
+ with open(output_filename, "w", encoding="utf-8") as f:
20
+ f.write(text)
21
+ return output_filename
 
 
22
 
23
  with gr.Blocks() as demo:
24
  with gr.Row():
25
+ gr.Markdown("# PDF Text Extractor with Page Numbers")
26
 
27
  with gr.Row():
28
+ pdf_file = gr.File(label="Upload PDF", type="filepath")
29
 
30
  with gr.Row():
31
+ extract_button = gr.Button("Extract and Download Text")
 
32
 
33
  with gr.Row():
34
+ download_button = gr.File(label="Download Extracted Text")
35
 
36
+ def on_extract(pdf_file):
37
+ """Callback function to extract text with page numbers and return a downloadable .txt file."""
38
+ extracted_text = extract_text_with_langchain_pdf(pdf_file)
39
+ txt_path = save_text_to_file(extracted_text)
40
+ return txt_path
 
 
 
 
41
 
42
+ extract_button.click(on_extract, inputs=[pdf_file], outputs=[download_button])
 
 
 
 
43
 
44
+ # Launch the Gradio
45
+ demo.queue().launch()