Spaces:
Sleeping
Sleeping
Update unified_document_processor.py
Browse files- unified_document_processor.py +61 -61
unified_document_processor.py
CHANGED
@@ -446,64 +446,64 @@ class UnifiedDocumentProcessor:
|
|
446 |
'error': str(e)
|
447 |
}
|
448 |
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
|
|
446 |
'error': str(e)
|
447 |
}
|
448 |
|
449 |
+
def get_available_files(self) -> Dict[str, List[str]]:
    """Return the distinct source file names in the collection, grouped by type.

    Fetches the metadata of every entry in ``self.collection`` and collects
    each entry's ``source_file`` under its ``content_type``.

    Entries that cannot be classified (no metadata, a missing
    ``content_type`` / ``source_file`` key, or a content type other than
    ``'pdf'`` / ``'xml'``) are skipped, so a single odd entry does not
    blank out the whole listing.

    Returns:
        A dict with exactly the keys ``'pdf'`` and ``'xml'``, each mapping
        to a sorted list of unique file names. If reading the collection
        fails, the error is printed and both lists are empty.
    """
    try:
        all_entries = self.collection.get(
            include=['metadatas']
        )

        files = {
            'pdf': set(),
            'xml': set()
        }

        # ``metadatas`` (or individual items in it) may be None when
        # nothing was stored, so normalise before iterating.
        for metadata in all_entries['metadatas'] or []:
            if not metadata:
                continue
            file_type = metadata.get('content_type')
            file_name = metadata.get('source_file')
            # Only the two known buckets are reported; anything else is
            # ignored rather than raising KeyError.
            if file_type in files and file_name:
                files[file_type].add(file_name)

        return {kind: sorted(names) for kind, names in files.items()}
    except Exception as e:
        # Best-effort listing: callers always get the same dict shape.
        print(f"Error getting available files: {str(e)}")
        return {'pdf': [], 'xml': []}
|
473 |
+
|
474 |
+
def ask_question_selective(self, question: str, selected_files: List[str], n_results: int = 5,
                           *, model: str = "llama3-8b-8192") -> str:
    """Answer a question using only content from the selected files.

    Queries ``self.collection`` for the passages most similar to
    ``question``, restricted to entries whose ``source_file`` metadata is
    one of ``selected_files``, then sends those passages and the question
    to ``self.groq_client`` as a single user message.

    Args:
        question: The question to answer.
        selected_files: File names to restrict the search to.
        n_results: Maximum number of passages to retrieve.
        model: Name of the chat model to request. Defaults to the value
            that was previously hard-coded.

    Returns:
        The model's answer text, or a human-readable message when no files
        were selected, nothing relevant was found, or an error occurred.
        This method does not raise; failures are reported in the returned
        string.
    """
    # An empty ``$in`` list is not a usable filter; answer directly instead
    # of surfacing a low-level query error to the user.
    if not selected_files:
        return "No files selected. Please select at least one file."

    try:
        filter_dict = {
            'source_file': {'$in': list(selected_files)}
        }

        results = self.collection.query(
            query_texts=[question],
            n_results=n_results,
            where=filter_dict,
            include=["documents", "metadatas"]
        )

        # One inner list per query text is expected; guard against the
        # outer list being missing or empty as well as the inner one.
        documents = results['documents'] or []
        if not documents or not documents[0]:
            return "No relevant content found in the selected files."

        context = "\n\n".join(documents[0])

        prompt = (
            f"Based on the following content from the selected files, "
            f"please answer this question: {question}\n"
            "\n"
            "Content:\n"
            f"{context}\n"
            "\n"
            "Please provide a direct answer based only on the information provided above."
        )

        response = self.groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            temperature=0.2
        )

        return response.choices[0].message.content

    except Exception as e:
        # User-facing boundary: report the failure as text.
        return f"Error processing your question: {str(e)}"
|