sadickam committed on
Commit
ba8b960
·
verified ·
1 Parent(s): 49bcd81

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -63
app.py CHANGED
@@ -4,6 +4,11 @@ import io
4
  import tempfile
5
  import os
6
  from langchain_community.document_loaders import PyPDFLoader
 
 
 
 
 
7
 
8
  # Create a temporary directory for storing download files
9
  temp_dir = tempfile.TemporaryDirectory()
@@ -11,14 +16,16 @@ temp_dir = tempfile.TemporaryDirectory()
11
  def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
12
  """
13
  Extract text from a PDF page by page using LangChain's PyPDFLoader.
14
-
15
  Args:
16
  pdf_file_path (str): The file path to the uploaded PDF.
17
  start_page (int, optional): The starting page number for extraction (1-based index).
18
  end_page (int, optional): The ending page number for extraction (1-based index).
19
-
20
  Returns:
21
- tuple: DataFrame containing the extracted text with metadata, and the full concatenated text.
 
 
22
  """
23
  try:
24
  # Initialize the loader
@@ -49,29 +56,37 @@ def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=Non
49
  start_page = 1
50
  end_page = total_pages
51
 
52
- # Concatenate selected page contents into a single string
53
- pdf_pages_content = '\n'.join(doc.page_content for doc in selected_docs)
54
-
55
- extracted_data = []
56
 
57
  for idx, doc in enumerate(selected_docs, start=start_page):
58
- # Assign the actual page number
59
  page_num = idx
60
-
61
- # Split content into paragraphs
62
- paragraphs = doc.page_content.split("\n\n") # Split into paragraphs
63
-
64
- for paragraph in paragraphs:
65
- clean_para = paragraph.strip()
66
- if clean_para:
67
- extracted_data.append({
 
 
 
 
 
 
 
68
  "Document": doc_name,
69
  "Page": page_num,
70
- "Paragraph": clean_para
71
  })
72
 
73
- df = pd.DataFrame(extracted_data)
74
- return df, pdf_pages_content
 
 
 
75
 
76
  except Exception as e:
77
  raise RuntimeError(f"Error during PDF extraction: {e}")
@@ -95,34 +110,21 @@ def df_to_csv_bytes(df):
95
  except Exception as e:
96
  raise RuntimeError(f"Error during CSV conversion: {e}")
97
 
98
- def text_to_txt_bytes(text):
99
- """
100
- Convert text to TXT in bytes.
101
-
102
- Args:
103
- text (str): The text to convert.
104
-
105
- Returns:
106
- bytes: TXT data in bytes.
107
- """
108
- try:
109
- txt_data = text.encode('utf-8')
110
- return txt_data
111
- except Exception as e:
112
- raise RuntimeError(f"Error during TXT conversion: {e}")
113
-
114
  def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
115
  """
116
- Callback function to extract text from PDF and return CSV and TXT data.
117
-
118
  Args:
119
  pdf_file_path (str): The file path to the uploaded PDF.
120
  extraction_mode (str): "All Pages" or "Range of Pages".
121
  start_page (float): Starting page number for extraction.
122
  end_page (float): Ending page number for extraction.
123
-
124
  Returns:
125
- tuple: Paths to CSV and TXT files, Status message.
 
 
 
126
  """
127
  if not pdf_file_path:
128
  return None, None, "No file uploaded."
@@ -136,37 +138,36 @@ def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
136
  selected_start = start_page
137
  selected_end = end_page
138
 
139
- # Extract text and create DataFrame
140
- df, full_text = extract_text_with_py_pdf_loader(
141
  pdf_file_path,
142
  start_page=selected_start,
143
  end_page=selected_end
144
  )
145
 
146
- # Convert DataFrame to CSV bytes
147
- csv_bytes = df_to_csv_bytes(df)
148
- csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_extracted.csv"
149
 
150
- # Convert full text to TXT bytes
151
- txt_bytes = text_to_txt_bytes(full_text)
152
- txt_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_full_text.txt"
153
 
154
  # Define full paths within the temporary directory
155
- csv_tmp_path = os.path.join(temp_dir.name, csv_filename)
156
- txt_tmp_path = os.path.join(temp_dir.name, txt_filename)
157
 
158
- # Write CSV bytes to temporary file
159
- with open(csv_tmp_path, 'wb') as csv_tmp:
160
- csv_tmp.write(csv_bytes)
161
 
162
- # Write TXT bytes to temporary file
163
- with open(txt_tmp_path, 'wb') as txt_tmp:
164
- txt_tmp.write(txt_bytes)
165
 
166
- # Return the paths to the temporary files and a success message
167
  return (
168
- csv_tmp_path,
169
- txt_tmp_path,
170
  "Extraction successful!"
171
  )
172
  except Exception as e:
@@ -221,12 +222,12 @@ with gr.Blocks() as demo:
221
  extract_button = gr.Button("Extract and Download")
222
 
223
  with gr.Row():
224
- csv_download = gr.File(
225
- label="Download Extracted CSV",
226
  interactive=False
227
  )
228
- txt_download = gr.File(
229
- label="Download Full Text",
230
  interactive=False
231
  )
232
 
@@ -240,7 +241,7 @@ with gr.Blocks() as demo:
240
  extract_button.click(
241
  fn=on_extract,
242
  inputs=[pdf_input, extraction_mode, start_page, end_page],
243
- outputs=[csv_download, txt_download, status_output]
244
  )
245
 
246
  gr.Markdown("""
 
4
  import tempfile
5
  import os
6
  from langchain_community.document_loaders import PyPDFLoader
7
+ import nltk
8
+ from nltk.tokenize import sent_tokenize
9
+
10
+ # Download NLTK's punkt tokenizer if not already downloaded
11
+ nltk.download('punkt')
12
 
13
  # Create a temporary directory for storing download files
14
  temp_dir = tempfile.TemporaryDirectory()
 
16
  def extract_text_with_py_pdf_loader(pdf_file_path, start_page=None, end_page=None):
17
  """
18
  Extract text from a PDF page by page using LangChain's PyPDFLoader.
19
+
20
  Args:
21
  pdf_file_path (str): The file path to the uploaded PDF.
22
  start_page (int, optional): The starting page number for extraction (1-based index).
23
  end_page (int, optional): The ending page number for extraction (1-based index).
24
+
25
  Returns:
26
+ tuple:
27
+ - page_df (pd.DataFrame): DataFrame containing Document, Page, and Text.
28
+ - sentence_df (pd.DataFrame): DataFrame containing Document, Page, and Sentence.
29
  """
30
  try:
31
  # Initialize the loader
 
56
  start_page = 1
57
  end_page = total_pages
58
 
59
+ # Initialize lists to store data
60
+ page_data = []
61
+ sentence_data = []
 
62
 
63
  for idx, doc in enumerate(selected_docs, start=start_page):
 
64
  page_num = idx
65
+ text = doc.page_content.strip()
66
+
67
+ # Append page-wise data
68
+ page_data.append({
69
+ "Document": doc_name,
70
+ "Page": page_num,
71
+ "Text": text
72
+ })
73
+
74
+ # Sentence tokenization
75
+ sentences = sent_tokenize(text)
76
+ for sentence in sentences:
77
+ sentence = sentence.strip()
78
+ if sentence:
79
+ sentence_data.append({
80
  "Document": doc_name,
81
  "Page": page_num,
82
+ "Sentence": sentence
83
  })
84
 
85
+ # Create DataFrames
86
+ page_df = pd.DataFrame(page_data)
87
+ sentence_df = pd.DataFrame(sentence_data)
88
+
89
+ return page_df, sentence_df
90
 
91
  except Exception as e:
92
  raise RuntimeError(f"Error during PDF extraction: {e}")
 
110
  except Exception as e:
111
  raise RuntimeError(f"Error during CSV conversion: {e}")
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def on_extract(pdf_file_path, extraction_mode, start_page, end_page):
114
  """
115
+ Callback function to extract text from PDF and return CSV data.
116
+
117
  Args:
118
  pdf_file_path (str): The file path to the uploaded PDF.
119
  extraction_mode (str): "All Pages" or "Range of Pages".
120
  start_page (float): Starting page number for extraction.
121
  end_page (float): Ending page number for extraction.
122
+
123
  Returns:
124
+ tuple:
125
+ - page_csv_path (str): Path to the page-wise CSV file.
126
+ - sentence_csv_path (str): Path to the sentence-wise CSV file.
127
+ - status_message (str): Status of the extraction process.
128
  """
129
  if not pdf_file_path:
130
  return None, None, "No file uploaded."
 
138
  selected_start = start_page
139
  selected_end = end_page
140
 
141
+ # Extract text and create DataFrames
142
+ page_df, sentence_df = extract_text_with_py_pdf_loader(
143
  pdf_file_path,
144
  start_page=selected_start,
145
  end_page=selected_end
146
  )
147
 
148
+ # Convert DataFrames to CSV bytes
149
+ page_csv_bytes = df_to_csv_bytes(page_df)
150
+ sentence_csv_bytes = df_to_csv_bytes(sentence_df)
151
 
152
+ # Define CSV filenames
153
+ page_csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_pages.csv"
154
+ sentence_csv_filename = f"{os.path.splitext(os.path.basename(pdf_file_path))[0]}_sentences.csv"
155
 
156
  # Define full paths within the temporary directory
157
+ page_csv_path = os.path.join(temp_dir.name, page_csv_filename)
158
+ sentence_csv_path = os.path.join(temp_dir.name, sentence_csv_filename)
159
 
160
+ # Write CSV bytes to temporary files
161
+ with open(page_csv_path, 'wb') as page_csv_file:
162
+ page_csv_file.write(page_csv_bytes)
163
 
164
+ with open(sentence_csv_path, 'wb') as sentence_csv_file:
165
+ sentence_csv_file.write(sentence_csv_bytes)
 
166
 
167
+ # Return the paths to the temporary CSV files and a success message
168
  return (
169
+ page_csv_path,
170
+ sentence_csv_path,
171
  "Extraction successful!"
172
  )
173
  except Exception as e:
 
222
  extract_button = gr.Button("Extract and Download")
223
 
224
  with gr.Row():
225
+ page_csv_download = gr.File(
226
+ label="Download Page-wise CSV",
227
  interactive=False
228
  )
229
+ sentence_csv_download = gr.File(
230
+ label="Download Sentence-wise CSV",
231
  interactive=False
232
  )
233
 
 
241
  extract_button.click(
242
  fn=on_extract,
243
  inputs=[pdf_input, extraction_mode, start_page, end_page],
244
+ outputs=[page_csv_download, sentence_csv_download, status_output]
245
  )
246
 
247
  gr.Markdown("""