TahaRasouli committed on
Commit
7349704
·
verified ·
1 Parent(s): 58857c9

Update unified_document_processor.py

Browse files
Files changed (1) hide show
  1. unified_document_processor.py +61 -61
unified_document_processor.py CHANGED
@@ -446,64 +446,64 @@ class UnifiedDocumentProcessor:
446
  'error': str(e)
447
  }
448
 
449
- def get_available_files(self) -> Dict[str, List[str]]:
450
- """Get list of all files in the database"""
451
- try:
452
- all_entries = self.collection.get(
453
- include=['metadatas']
454
- )
455
-
456
- files = {
457
- 'pdf': set(),
458
- 'xml': set()
459
- }
460
-
461
- for metadata in all_entries['metadatas']:
462
- file_type = metadata['content_type']
463
- file_name = metadata['source_file']
464
- files[file_type].add(file_name)
465
-
466
- return {
467
- 'pdf': sorted(list(files['pdf'])),
468
- 'xml': sorted(list(files['xml']))
469
- }
470
- except Exception as e:
471
- print(f"Error getting available files: {str(e)}")
472
- return {'pdf': [], 'xml': []}
473
-
474
- def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5) -> str:
475
- """Ask a question using only the selected files"""
476
- try:
477
- filter_dict = {
478
- 'source_file': {'$in': selected_files}
479
- }
480
-
481
- results = self.collection.query(
482
- query_texts=[question],
483
- n_results=n_results,
484
- where=filter_dict,
485
- include=["documents", "metadatas"]
486
- )
487
-
488
- if not results['documents'][0]:
489
- return "No relevant content found in the selected files."
490
-
491
- context = "\n\n".join(results['documents'][0])
492
-
493
- prompt = f"""Based on the following content from the selected files, please answer this question: {question}
494
-
495
- Content:
496
- {context}
497
-
498
- Please provide a direct answer based only on the information provided above."""
499
-
500
- response = self.groq_client.chat.completions.create(
501
- messages=[{"role": "user", "content": prompt}],
502
- model="llama3-8b-8192",
503
- temperature=0.2
504
- )
505
-
506
- return response.choices[0].message.content
507
-
508
- except Exception as e:
509
- return f"Error processing your question: {str(e)}"
 
446
  'error': str(e)
447
  }
448
 
449
+ def get_available_files(self) -> Dict[str, List[str]]:
450
+ """Get list of all files in the database"""
451
+ try:
452
+ all_entries = self.collection.get(
453
+ include=['metadatas']
454
+ )
455
+
456
+ files = {
457
+ 'pdf': set(),
458
+ 'xml': set()
459
+ }
460
+
461
+ for metadata in all_entries['metadatas']:
462
+ file_type = metadata['content_type']
463
+ file_name = metadata['source_file']
464
+ files[file_type].add(file_name)
465
+
466
+ return {
467
+ 'pdf': sorted(list(files['pdf'])),
468
+ 'xml': sorted(list(files['xml']))
469
+ }
470
+ except Exception as e:
471
+ print(f"Error getting available files: {str(e)}")
472
+ return {'pdf': [], 'xml': []}
473
+
474
+ def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5) -> str:
475
+ """Ask a question using only the selected files"""
476
+ try:
477
+ filter_dict = {
478
+ 'source_file': {'$in': selected_files}
479
+ }
480
+
481
+ results = self.collection.query(
482
+ query_texts=[question],
483
+ n_results=n_results,
484
+ where=filter_dict,
485
+ include=["documents", "metadatas"]
486
+ )
487
+
488
+ if not results['documents'][0]:
489
+ return "No relevant content found in the selected files."
490
+
491
+ context = "\n\n".join(results['documents'][0])
492
+
493
+ prompt = f"""Based on the following content from the selected files, please answer this question: {question}
494
+
495
+ Content:
496
+ {context}
497
+
498
+ Please provide a direct answer based only on the information provided above."""
499
+
500
+ response = self.groq_client.chat.completions.create(
501
+ messages=[{"role": "user", "content": prompt}],
502
+ model="llama3-8b-8192",
503
+ temperature=0.2
504
+ )
505
+
506
+ return response.choices[0].message.content
507
+
508
+ except Exception as e:
509
+ return f"Error processing your question: {str(e)}"