Spaces:
Running
Running
Refactor process_pdf_uploads function and handle PDF uploads in create_gr_interface
Browse files
- Refactor the process_pdf_uploads function in app.py to improve code readability and error handling.
- Add a handle_pdf_upload handler in create_gr_interface that processes the uploaded PDFs when the new "Process PDFs" button is clicked (replacing the pdf_files.upload trigger).
- Add error logging for exceptions raised in PDFProcessor.extract_text_from_pdf in utils/pdf_processor.py.
- app.py +52 -29
- utils/pdf_processor.py +37 -29
app.py
CHANGED
@@ -231,12 +231,10 @@ def download_as_csv(markdown_content):
|
|
231 |
# PDF Support
|
232 |
|
233 |
|
234 |
-
def process_pdf_uploads(
|
235 |
-
files: List[str], collection_name: str
|
236 |
-
) -> Tuple[str, gr.update]:
|
237 |
"""Process uploaded PDF files and add them to the system."""
|
238 |
-
if not files:
|
239 |
-
return "Please upload PDF files"
|
240 |
|
241 |
try:
|
242 |
processor = PDFProcessor()
|
@@ -244,34 +242,35 @@ def process_pdf_uploads(
|
|
244 |
# Save uploaded files temporarily
|
245 |
file_paths = []
|
246 |
for file in files:
|
247 |
-
|
248 |
-
|
249 |
-
|
|
|
|
|
|
|
250 |
file_paths.append(temp_path)
|
251 |
|
252 |
# Process PDFs
|
253 |
output_path = processor.process_pdfs(file_paths, collection_name)
|
254 |
|
255 |
# Add to study files and ChromaDB
|
256 |
-
|
257 |
-
collection_id = f"PDF-{timestamp}-{collection_name}"
|
258 |
append_to_study_files("study_files.json", collection_id, output_path)
|
259 |
add_study_files_to_chromadb("study_files.json", "study_files_collection")
|
260 |
|
261 |
-
# Cleanup temporary files
|
262 |
for path in file_paths:
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
|
|
267 |
|
268 |
-
return
|
269 |
-
f"Successfully processed {len(files)} PDF files into collection: {collection_id}",
|
270 |
-
gr.update(value=output_path),
|
271 |
-
)
|
272 |
|
273 |
except Exception as e:
|
274 |
-
|
|
|
275 |
|
276 |
|
277 |
def chat_response(
|
@@ -387,8 +386,10 @@ def create_gr_interface() -> gr.Blocks:
|
|
387 |
label="Collection Name",
|
388 |
placeholder="Name this PDF collection...",
|
389 |
)
|
|
|
|
|
390 |
pdf_status = gr.Markdown()
|
391 |
-
|
392 |
# Event handlers for Study Analysis tab
|
393 |
process_zotero_btn.click(
|
394 |
process_zotero_library_items,
|
@@ -411,6 +412,22 @@ def create_gr_interface() -> gr.Blocks:
|
|
411 |
).then(fn=cleanup_temp_files, inputs=None, outputs=None)
|
412 |
|
413 |
# Event handlers for PDF Chat tab
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
414 |
def add_message(history, message):
|
415 |
"""Add user message to chat history."""
|
416 |
if not message.strip():
|
@@ -418,22 +435,28 @@ def create_gr_interface() -> gr.Blocks:
|
|
418 |
history = history + [(message, None)]
|
419 |
return history, ""
|
420 |
|
421 |
-
def generate_chat_response(history,
|
422 |
"""Generate response for the last message in history."""
|
|
|
|
|
423 |
if len(history) == 0:
|
424 |
return history
|
425 |
|
426 |
last_message = history[-1][0]
|
427 |
-
|
|
|
|
|
|
|
|
|
|
|
428 |
|
429 |
-
# Update the last message pair with the response
|
430 |
-
history[-1] = (last_message, response)
|
431 |
return history
|
432 |
|
433 |
-
|
434 |
-
|
|
|
435 |
inputs=[pdf_files, collection_name],
|
436 |
-
outputs=[pdf_status],
|
437 |
)
|
438 |
|
439 |
# Fixed chat event handling
|
@@ -443,7 +466,7 @@ def create_gr_interface() -> gr.Blocks:
|
|
443 |
outputs=[chat_history, query_input],
|
444 |
).success(
|
445 |
generate_chat_response,
|
446 |
-
inputs=[chat_history,
|
447 |
outputs=[chat_history],
|
448 |
)
|
449 |
|
|
|
231 |
# PDF Support
|
232 |
|
233 |
|
234 |
+
def process_pdf_uploads(files: List[gr.File], collection_name: str) -> str:
|
|
|
|
|
235 |
"""Process uploaded PDF files and add them to the system."""
|
236 |
+
if not files or not collection_name:
|
237 |
+
return "Please upload PDF files and provide a collection name"
|
238 |
|
239 |
try:
|
240 |
processor = PDFProcessor()
|
|
|
242 |
# Save uploaded files temporarily
|
243 |
file_paths = []
|
244 |
for file in files:
|
245 |
+
# Get the actual file path from the Gradio File object
|
246 |
+
if hasattr(file, "name"): # If it's already a path
|
247 |
+
temp_path = file.name
|
248 |
+
else: # If it needs to be saved
|
249 |
+
temp_path = os.path.join(processor.upload_dir, file.orig_name)
|
250 |
+
file.save(temp_path)
|
251 |
file_paths.append(temp_path)
|
252 |
|
253 |
# Process PDFs
|
254 |
output_path = processor.process_pdfs(file_paths, collection_name)
|
255 |
|
256 |
# Add to study files and ChromaDB
|
257 |
+
collection_id = f"pdf_{slugify(collection_name)}"
|
|
|
258 |
append_to_study_files("study_files.json", collection_id, output_path)
|
259 |
add_study_files_to_chromadb("study_files.json", "study_files_collection")
|
260 |
|
261 |
+
# Cleanup temporary files if they were created by us
|
262 |
for path in file_paths:
|
263 |
+
if path.startswith(processor.upload_dir):
|
264 |
+
try:
|
265 |
+
os.remove(path)
|
266 |
+
except Exception as e:
|
267 |
+
logger.warning(f"Failed to remove temporary file {path}: {e}")
|
268 |
|
269 |
+
return f"Successfully processed PDFs into collection: {collection_id}"
|
|
|
|
|
|
|
270 |
|
271 |
except Exception as e:
|
272 |
+
logger.error(f"Error in process_pdf_uploads: {str(e)}")
|
273 |
+
return f"Error processing PDF files: {str(e)}"
|
274 |
|
275 |
|
276 |
def chat_response(
|
|
|
386 |
label="Collection Name",
|
387 |
placeholder="Name this PDF collection...",
|
388 |
)
|
389 |
+
with gr.Row():
|
390 |
+
upload_btn = gr.Button("Process PDFs", variant="primary")
|
391 |
pdf_status = gr.Markdown()
|
392 |
+
current_collection = gr.State(value=None)
|
393 |
# Event handlers for Study Analysis tab
|
394 |
process_zotero_btn.click(
|
395 |
process_zotero_library_items,
|
|
|
412 |
).then(fn=cleanup_temp_files, inputs=None, outputs=None)
|
413 |
|
414 |
# Event handlers for PDF Chat tab
|
415 |
+
|
416 |
+
def handle_pdf_upload(files, name):
    """Validate the upload form, process the PDFs and report the outcome.

    Args:
        files: Uploaded PDF files, forwarded unchanged to
            ``process_pdf_uploads``.
        name: Collection name entered by the user.

    Returns:
        A ``(status_message, collection_id)`` pair. ``collection_id`` is
        ``None`` whenever validation or processing fails, so that no
        unusable collection is handed on to the chat handlers.
    """
    # A whitespace-only name would slugify to an empty collection id.
    if not name or not name.strip():
        return "Please provide a collection name", None
    if not files:
        return "Please select PDF files", None

    try:
        result = process_pdf_uploads(files, name)
        # process_pdf_uploads reports failures through its returned message
        # instead of raising, so only a success message may yield an id.
        if not result.startswith("Successfully processed"):
            return result, None
        return result, f"pdf_{slugify(name)}"
    except Exception as e:
        logger.error(f"Error in handle_pdf_upload: {str(e)}")
        return f"Error: {str(e)}", None
|
430 |
+
|
431 |
def add_message(history, message):
|
432 |
"""Add user message to chat history."""
|
433 |
if not message.strip():
|
|
|
435 |
history = history + [(message, None)]
|
436 |
return history, ""
|
437 |
|
438 |
+
def generate_chat_response(history, collection_id):
    """Answer the newest user message and store the reply in the history.

    Args:
        history: Chat history as a list of ``(user_message, reply)`` pairs;
            the last pair is overwritten with the generated reply.
        collection_id: Identifier of the PDF collection to query.

    Returns:
        The same ``history`` list, with its last entry updated when it is
        non-empty.

    Raises:
        gr.Error: If no collection has been selected yet.
    """
    if not collection_id:
        raise gr.Error("Please upload PDFs first")

    if len(history) > 0:
        question = history[-1][0]
        try:
            answer = chat_function(question, collection_id, "Default")
            history[-1] = (question, answer)
        except Exception as err:
            # Surface the failure in the chat window instead of aborting.
            logger.error(f"Error in generate_chat_response: {str(err)}")
            history[-1] = (question, f"Error: {str(err)}")

    return history
|
454 |
|
455 |
+
# Update PDF event handlers
|
456 |
+
upload_btn.click( # Change from pdf_files.upload to upload_btn.click
|
457 |
+
handle_pdf_upload,
|
458 |
inputs=[pdf_files, collection_name],
|
459 |
+
outputs=[pdf_status, current_collection],
|
460 |
)
|
461 |
|
462 |
# Fixed chat event handling
|
|
|
466 |
outputs=[chat_history, query_input],
|
467 |
).success(
|
468 |
generate_chat_response,
|
469 |
+
inputs=[chat_history, current_collection],
|
470 |
outputs=[chat_history],
|
471 |
)
|
472 |
|
utils/pdf_processor.py
CHANGED
@@ -24,35 +24,43 @@ class PDFProcessor:
|
|
24 |
|
25 |
def extract_text_from_pdf(self, file_path: str) -> Dict:
|
26 |
"""Extract text and metadata from a PDF file."""
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
metadata
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
|
58 |
"""Process multiple PDF files and store their content."""
|
|
|
24 |
|
25 |
def extract_text_from_pdf(self, file_path: str) -> Dict:
    """Extract text and metadata from a PDF file.

    Args:
        file_path: Path of the PDF to open with PyMuPDF (``fitz``).

    Returns:
        A dict with the keys ``title``, ``authors``, ``date``, ``abstract``,
        ``full_text``, ``source_file``, ``pages`` (0-based page number ->
        page text) and ``page_count``. ``title`` falls back to the file's
        base name and ``abstract`` is the first 500 characters of the text.

    Raises:
        Exception: Any error raised while opening or reading the file is
            logged and re-raised unchanged.
    """
    try:
        # The context manager closes the document even if extraction fails
        # part-way through, so the file handle is never leaked.
        with fitz.open(file_path) as doc:
            pages = {
                page_num: page.get_text() for page_num, page in enumerate(doc)
            }
            page_count = len(doc)
            # Copy so the document's own metadata dict is not mutated.
            metadata = dict(doc.metadata or {})

        # Each page is followed by a newline, in page order.
        text = "".join(f"{page_text}\n" for page_text in pages.values())

        title = metadata.get("title") or os.path.basename(file_path)
        # Authors are ';'-separated; drop padding and empty entries.
        raw_authors = metadata.get("author") or ""
        authors = [
            author.strip() for author in raw_authors.split(";") if author.strip()
        ]

        return {
            "title": title,
            "authors": authors,
            "date": metadata.get("creationDate", ""),
            "abstract": text[:500] + "..." if len(text) > 500 else text,
            "full_text": text,
            "source_file": file_path,
            "pages": pages,
            "page_count": page_count,
        }
    except Exception as e:
        logger.error(f"Error processing PDF {file_path}: {str(e)}")
        raise
|
64 |
|
65 |
def process_pdfs(self, file_paths: List[str], collection_name: str) -> str:
|
66 |
"""Process multiple PDF files and store their content."""
|