sadickam committed on
Commit
e56c8a4
·
verified ·
1 Parent(s): 9847598

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -9
app.py CHANGED
@@ -3,14 +3,14 @@ import pandas as pd
3
  import io
4
  import tempfile
5
  import os
6
- from langchain_community.document_loaders import UnstructuredPDFLoader
7
 
8
  # Create a temporary directory for storing download files
9
  temp_dir = tempfile.TemporaryDirectory()
10
 
11
- def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=None):
12
  """
13
- Extract text from a PDF page by page using LangChain's UnstructuredPDFLoader.
14
 
15
  Args:
16
  pdf_file_path (str): The file path to the uploaded PDF.
@@ -21,9 +21,9 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
21
  tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
22
  """
23
  try:
24
- # Initialize the loader with split_pages=True to ensure each page is a separate document
25
- loader = UnstructuredPDFLoader(pdf_file_path, split_pages=True)
26
- documents = loader.load()
27
 
28
  total_pages = len(documents)
29
  doc_name = os.path.basename(pdf_file_path) # Extract document name
@@ -54,9 +54,9 @@ def extract_text_with_langchain_pdf(pdf_file_path, start_page=None, end_page=Non
54
 
55
  extracted_data = []
56
 
57
- for idx, doc in enumerate(selected_docs, start=1):
58
  # Assign the actual page number
59
- page_num = start_page + idx - 1
60
 
61
  # Split content into paragraphs
62
  paragraphs = doc.page_content.split("\n\n") # Split into paragraphs
@@ -137,7 +137,7 @@ def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
137
  selected_end = end_page
138
 
139
  # Extract text and create DataFrame
140
- df, full_text = extract_text_with_langchain_pdf(
141
  pdf_file_path,
142
  start_page=selected_start,
143
  end_page=selected_end
 
3
  import io
4
  import tempfile
5
  import os
6
+ from langchain.document_loaders import PyPDFLoader # Updated import
7
 
8
  # Create a temporary directory for storing download files
9
  temp_dir = tempfile.TemporaryDirectory()
10
 
11
+ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
12
  """
13
+ Extract text from a PDF page by page using LangChain's PyPDFLoader.
14
 
15
  Args:
16
  pdf_file_path (str): The file path to the uploaded PDF.
 
21
  tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
22
  """
23
  try:
24
+ # Initialize the loader
25
+ loader = PyPDFLoader(pdf_file_path)
26
+ documents = loader.load_and_split() # Each document corresponds to a single page
27
 
28
  total_pages = len(documents)
29
  doc_name = os.path.basename(pdf_file_path) # Extract document name
 
54
 
55
  extracted_data = []
56
 
57
+ for idx, doc in enumerate(selected_docs, start=start_page):
58
  # Assign the actual page number
59
+ page_num = idx
60
 
61
  # Split content into paragraphs
62
  paragraphs = doc.page_content.split("\n\n") # Split into paragraphs
 
137
  selected_end = end_page
138
 
139
  # Extract text and create DataFrame
140
+ df, full_text = extract_text_with_py_pdf_loader(
141
  pdf_file_path,
142
  start_page=selected_start,
143
  end_page=selected_end