Spaces:
Sleeping
Sleeping
Update unified_document_processor.py
Browse files- unified_document_processor.py +45 -39
unified_document_processor.py
CHANGED
@@ -373,51 +373,57 @@ class UnifiedDocumentProcessor:
|
|
373 |
return f"Error processing your question: {str(e)}"
|
374 |
|
375 |
def get_detailed_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
results = self.collection.query(
|
383 |
-
query_texts=[question],
|
384 |
-
n_results=n_results,
|
385 |
-
where=filter_dict,
|
386 |
-
include=["documents", "metadatas", "distances"]
|
387 |
-
)
|
388 |
-
|
389 |
-
if not results['documents'][0]:
|
390 |
-
return {
|
391 |
-
'success': False,
|
392 |
-
'error': "No relevant content found"
|
393 |
-
}
|
394 |
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
'source_info': {
|
402 |
-
'file': meta['source_file'],
|
403 |
-
'type': meta['content_type'],
|
404 |
-
'path': meta.get('xml_path', 'N/A'), # Only for XML files
|
405 |
-
'context': json.loads(meta['context']) if meta.get('context') else {}
|
406 |
-
}
|
407 |
-
}
|
408 |
-
detailed_results.append(result_info)
|
409 |
|
|
|
410 |
return {
|
411 |
-
'success':
|
412 |
-
'
|
413 |
-
'query': question
|
414 |
}
|
415 |
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
420 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
421 |
|
422 |
def get_hierarchical_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
|
423 |
"""Get hierarchical context for XML files including parent-child relationships"""
|
|
|
373 |
return f"Error processing your question: {str(e)}"
|
374 |
|
375 |
def get_detailed_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
|
376 |
+
"""Get detailed context with enhanced scoring"""
|
377 |
+
try:
|
378 |
+
filter_dict = {
|
379 |
+
'source_file': {'$in': selected_files}
|
380 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
381 |
|
382 |
+
results = self.collection.query(
|
383 |
+
query_texts=[question],
|
384 |
+
n_results=n_results,
|
385 |
+
where=filter_dict,
|
386 |
+
include=["documents", "metadatas", "distances"]
|
387 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
388 |
|
389 |
+
if not results['documents'][0]:
|
390 |
return {
|
391 |
+
'success': False,
|
392 |
+
'error': "No relevant content found"
|
|
|
393 |
}
|
394 |
|
395 |
+
detailed_results = []
|
396 |
+
for doc, meta, distance in zip(results['documents'][0], results['metadatas'][0], results['distances'][0]):
|
397 |
+
# Calculate detailed score
|
398 |
+
detailed_score = self.calculate_detailed_score(distance, meta, doc, question)
|
399 |
+
|
400 |
+
result_info = {
|
401 |
+
'content': doc,
|
402 |
+
'metadata': meta,
|
403 |
+
'score_details': detailed_score,
|
404 |
+
'source_info': {
|
405 |
+
'file': meta['source_file'],
|
406 |
+
'type': meta['content_type'],
|
407 |
+
'path': meta.get('xml_path', 'N/A'),
|
408 |
+
'context': json.loads(meta['context']) if meta.get('context') else {}
|
409 |
+
}
|
410 |
}
|
411 |
+
detailed_results.append(result_info)
|
412 |
+
|
413 |
+
# Sort results by total score
|
414 |
+
detailed_results.sort(key=lambda x: x['score_details']['total_score'], reverse=True)
|
415 |
+
|
416 |
+
return {
|
417 |
+
'success': True,
|
418 |
+
'results': detailed_results,
|
419 |
+
'query': question
|
420 |
+
}
|
421 |
+
|
422 |
+
except Exception as e:
|
423 |
+
return {
|
424 |
+
'success': False,
|
425 |
+
'error': str(e)
|
426 |
+
}
|
427 |
|
428 |
def get_hierarchical_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
|
429 |
"""Get hierarchical context for XML files including parent-child relationships"""
|