ak3ra committed on
Commit
5674d87
·
1 Parent(s): de083ff

Refactor process_pdf_uploads function and handle PDF uploads in create_gr_interface

Browse files

- Refactor the process_pdf_uploads function in app.py to improve code readability and error handling.
- Add a handle_pdf_upload function in create_gr_interface, triggered by a new "Process PDFs" button, to handle PDF uploads and processing.
- Add error logging for exceptions in the PDFProcessor class in pdf_processor.py.

Files changed (2) hide show
  1. app.py +52 -29
  2. utils/pdf_processor.py +37 -29
app.py CHANGED
@@ -231,12 +231,10 @@ def download_as_csv(markdown_content):
231
  # PDF Support
232
 
233
 
234
- def process_pdf_uploads(
235
- files: List[str], collection_name: str
236
- ) -> Tuple[str, gr.update]:
237
  """Process uploaded PDF files and add them to the system."""
238
- if not files:
239
- return "Please upload PDF files", gr.update()
240
 
241
  try:
242
  processor = PDFProcessor()
@@ -244,34 +242,35 @@ def process_pdf_uploads(
244
  # Save uploaded files temporarily
245
  file_paths = []
246
  for file in files:
247
- temp_path = os.path.join(processor.upload_dir, file.name)
248
- with open(temp_path, "wb") as f:
249
- f.write(file.read())
 
 
 
250
  file_paths.append(temp_path)
251
 
252
  # Process PDFs
253
  output_path = processor.process_pdfs(file_paths, collection_name)
254
 
255
  # Add to study files and ChromaDB
256
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
257
- collection_id = f"PDF-{timestamp}-{collection_name}"
258
  append_to_study_files("study_files.json", collection_id, output_path)
259
  add_study_files_to_chromadb("study_files.json", "study_files_collection")
260
 
261
- # Cleanup temporary files
262
  for path in file_paths:
263
- try:
264
- os.remove(path)
265
- except Exception as e:
266
- logger.warning(f"Failed to remove temporary file {path}: {e}")
 
267
 
268
- return (
269
- f"Successfully processed {len(files)} PDF files into collection: {collection_id}",
270
- gr.update(value=output_path),
271
- )
272
 
273
  except Exception as e:
274
- return f"Error processing PDF files: {str(e)}", gr.update()
 
275
 
276
 
277
  def chat_response(
@@ -387,8 +386,10 @@ def create_gr_interface() -> gr.Blocks:
387
  label="Collection Name",
388
  placeholder="Name this PDF collection...",
389
  )
 
 
390
  pdf_status = gr.Markdown()
391
-
392
  # Event handlers for Study Analysis tab
393
  process_zotero_btn.click(
394
  process_zotero_library_items,
@@ -411,6 +412,22 @@ def create_gr_interface() -> gr.Blocks:
411
  ).then(fn=cleanup_temp_files, inputs=None, outputs=None)
412
 
413
  # Event handlers for PDF Chat tab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
  def add_message(history, message):
415
  """Add user message to chat history."""
416
  if not message.strip():
@@ -418,22 +435,28 @@ def create_gr_interface() -> gr.Blocks:
418
  history = history + [(message, None)]
419
  return history, ""
420
 
421
- def generate_chat_response(history, collection):
422
  """Generate response for the last message in history."""
 
 
423
  if len(history) == 0:
424
  return history
425
 
426
  last_message = history[-1][0]
427
- response = chat_function(last_message, collection, "Default")
 
 
 
 
 
428
 
429
- # Update the last message pair with the response
430
- history[-1] = (last_message, response)
431
  return history
432
 
433
- pdf_files.upload(
434
- process_pdf_uploads,
 
435
  inputs=[pdf_files, collection_name],
436
- outputs=[pdf_status],
437
  )
438
 
439
  # Fixed chat event handling
@@ -443,7 +466,7 @@ def create_gr_interface() -> gr.Blocks:
443
  outputs=[chat_history, query_input],
444
  ).success(
445
  generate_chat_response,
446
- inputs=[chat_history, collection_name],
447
  outputs=[chat_history],
448
  )
449
 
 
231
  # PDF Support
232
 
233
 
234
+ def process_pdf_uploads(files: List[gr.File], collection_name: str) -> str:
 
 
235
  """Process uploaded PDF files and add them to the system."""
236
+ if not files or not collection_name:
237
+ return "Please upload PDF files and provide a collection name"
238
 
239
  try:
240
  processor = PDFProcessor()
 
242
  # Save uploaded files temporarily
243
  file_paths = []
244
  for file in files:
245
+ # Get the actual file path from the Gradio File object
246
+ if hasattr(file, "name"): # If it's already a path
247
+ temp_path = file.name
248
+ else: # If it needs to be saved
249
+ temp_path = os.path.join(processor.upload_dir, file.orig_name)
250
+ file.save(temp_path)
251
  file_paths.append(temp_path)
252
 
253
  # Process PDFs
254
  output_path = processor.process_pdfs(file_paths, collection_name)
255
 
256
  # Add to study files and ChromaDB
257
+ collection_id = f"pdf_{slugify(collection_name)}"
 
258
  append_to_study_files("study_files.json", collection_id, output_path)
259
  add_study_files_to_chromadb("study_files.json", "study_files_collection")
260
 
261
+ # Cleanup temporary files if they were created by us
262
  for path in file_paths:
263
+ if path.startswith(processor.upload_dir):
264
+ try:
265
+ os.remove(path)
266
+ except Exception as e:
267
+ logger.warning(f"Failed to remove temporary file {path}: {e}")
268
 
269
+ return f"Successfully processed PDFs into collection: {collection_id}"
 
 
 
270
 
271
  except Exception as e:
272
+ logger.error(f"Error in process_pdf_uploads: {str(e)}")
273
+ return f"Error processing PDF files: {str(e)}"
274
 
275
 
276
  def chat_response(
 
386
  label="Collection Name",
387
  placeholder="Name this PDF collection...",
388
  )
389
+ with gr.Row():
390
+ upload_btn = gr.Button("Process PDFs", variant="primary")
391
  pdf_status = gr.Markdown()
392
+ current_collection = gr.State(value=None)
393
  # Event handlers for Study Analysis tab
394
  process_zotero_btn.click(
395
  process_zotero_library_items,
 
412
  ).then(fn=cleanup_temp_files, inputs=None, outputs=None)
413
 
414
  # Event handlers for PDF Chat tab
415
+
416
+ def handle_pdf_upload(files, name):
417
+ """Handle PDF upload and processing."""
418
+ if not name:
419
+ return "Please provide a collection name", None
420
+ if not files:
421
+ return "Please select PDF files", None
422
+
423
+ try:
424
+ result = process_pdf_uploads(files, name)
425
+ collection_id = f"pdf_{slugify(name)}"
426
+ return result, collection_id
427
+ except Exception as e:
428
+ logger.error(f"Error in handle_pdf_upload: {str(e)}")
429
+ return f"Error: {str(e)}", None
430
+
431
  def add_message(history, message):
432
  """Add user message to chat history."""
433
  if not message.strip():
 
435
  history = history + [(message, None)]
436
  return history, ""
437
 
438
+ def generate_chat_response(history, collection_id):
439
  """Generate response for the last message in history."""
440
+ if not collection_id:
441
+ raise gr.Error("Please upload PDFs first")
442
  if len(history) == 0:
443
  return history
444
 
445
  last_message = history[-1][0]
446
+ try:
447
+ response = chat_function(last_message, collection_id, "Default")
448
+ history[-1] = (last_message, response)
449
+ except Exception as e:
450
+ logger.error(f"Error in generate_chat_response: {str(e)}")
451
+ history[-1] = (last_message, f"Error: {str(e)}")
452
 
 
 
453
  return history
454
 
455
+ # Update PDF event handlers
456
+ upload_btn.click( # Change from pdf_files.upload to upload_btn.click
457
+ handle_pdf_upload,
458
  inputs=[pdf_files, collection_name],
459
+ outputs=[pdf_status, current_collection],
460
  )
461
 
462
  # Fixed chat event handling
 
466
  outputs=[chat_history, query_input],
467
  ).success(
468
  generate_chat_response,
469
+ inputs=[chat_history, current_collection],
470
  outputs=[chat_history],
471
  )
472
 
utils/pdf_processor.py CHANGED
@@ -24,35 +24,43 @@ class PDFProcessor:
24
 
25
  def extract_text_from_pdf(self, file_path: str) -> Dict:
26
  """Extract text and metadata from a PDF file."""
27
- doc = fitz.open(file_path)
28
-
29
- # Extract text from all pages with page tracking
30
- text = ""
31
- pages = {}
32
- for page_num in range(len(doc)):
33
- page_text = doc[page_num].get_text()
34
- pages[page_num] = page_text
35
- text += page_text + "\n"
36
-
37
- # Extract metadata
38
- metadata = doc.metadata
39
- if not metadata.get("title"):
40
- metadata["title"] = os.path.basename(file_path)
41
-
42
- # Create structured document
43
- document = {
44
- "title": metadata.get("title", ""),
45
- "authors": metadata.get("author", "").split(";"),
46
- "date": metadata.get("creationDate", ""),
47
- "abstract": text[:500] + "..." if len(text) > 500 else text,
48
- "full_text": text,
49
- "source_file": file_path,
50
- "pages": pages,
51
- "page_count": len(doc),
52
- }
53
-
54
- doc.close()
55
- return document
 
 
 
 
 
 
 
 
56
 
57
  def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
58
  """Process multiple PDF files and store their content."""
 
24
 
25
  def extract_text_from_pdf(self, file_path: str) -> Dict:
26
  """Extract text and metadata from a PDF file."""
27
+ try:
28
+ doc = fitz.open(file_path)
29
+
30
+ # Extract text from all pages with page tracking
31
+ text = ""
32
+ pages = {}
33
+ for page_num in range(len(doc)):
34
+ page_text = doc[page_num].get_text()
35
+ pages[page_num] = page_text
36
+ text += page_text + "\n"
37
+
38
+ # Extract metadata
39
+ metadata = doc.metadata
40
+ if not metadata.get("title"):
41
+ metadata["title"] = os.path.basename(file_path)
42
+
43
+ # Create structured document
44
+ document = {
45
+ "title": metadata.get("title", ""),
46
+ "authors": (
47
+ metadata.get("author", "").split(";")
48
+ if metadata.get("author")
49
+ else []
50
+ ),
51
+ "date": metadata.get("creationDate", ""),
52
+ "abstract": text[:500] + "..." if len(text) > 500 else text,
53
+ "full_text": text,
54
+ "source_file": file_path,
55
+ "pages": pages,
56
+ "page_count": len(doc),
57
+ }
58
+
59
+ doc.close()
60
+ return document
61
+ except Exception as e:
62
+ logger.error(f"Error processing PDF {file_path}: {str(e)}")
63
+ raise
64
 
65
  def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
66
  """Process multiple PDF files and store their content."""