Spaces:
Sleeping
Sleeping
Update unified_document_processor.py
Browse files- unified_document_processor.py +167 -1
unified_document_processor.py
CHANGED
@@ -377,4 +377,170 @@ class UnifiedDocumentProcessor:
|
|
377 |
return response.choices[0].message.content
|
378 |
|
379 |
except Exception as e:
|
380 |
-
return f"Error processing your question: {str(e)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
377 |
return response.choices[0].message.content
|
378 |
|
379 |
except Exception as e:
|
380 |
+
return f"Error processing your question: {str(e)}"
|
381 |
+
def get_detailed_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
    """Return the stored chunks most relevant to *question*, with source metadata.

    Args:
        question: Text used as the similarity query.
        selected_files: Values of the ``source_file`` metadata field to search in.
        n_results: Maximum number of chunks to return.

    Returns:
        On success ``{'success': True, 'results': [...], 'query': question}``,
        where every result carries ``content``, ``metadata``,
        ``relevance_score`` and ``source_info`` (``file``, ``type``, ``path``,
        ``context``). On failure ``{'success': False, 'error': <message>}``;
        exceptions are reported through that dictionary, never raised.
    """
    if not selected_files:
        # An empty ``$in`` list cannot match any chunk, so skip the round trip.
        return {'success': False, 'error': "No files selected"}

    try:
        results = self.collection.query(
            query_texts=[question],
            n_results=n_results,
            where={'source_file': {'$in': selected_files}},
            include=["documents", "metadatas", "distances"],
        )

        # A single query text was sent, so only the first result row is used.
        documents = results['documents'][0]
        if not documents:
            return {'success': False, 'error': "No relevant content found"}

        detailed_results = []
        for doc, meta, distance in zip(documents, results['metadatas'][0], results['distances'][0]):
            raw_context = meta.get('context')
            try:
                context = json.loads(raw_context) if raw_context else {}
            except (TypeError, ValueError):
                # One chunk with an unparsable context must not discard
                # every other valid result of the query.
                context = {}

            detailed_results.append({
                'content': doc,
                'metadata': meta,
                # Lower distance gives a higher score; the value goes
                # negative for distances above 1.
                'relevance_score': 1 - distance,
                'source_info': {
                    'file': meta['source_file'],
                    'type': meta['content_type'],
                    'path': meta.get('xml_path', 'N/A'),  # Only set for XML chunks
                    'context': context,
                },
            })

        return {
            'success': True,
            'results': detailed_results,
            'query': question,
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e),
        }
|
427 |
+
|
428 |
+
def get_hierarchical_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
    """Return relevant chunks, adding parent and child elements for XML hits.

    The chunks come from ``get_detailed_context``. For each result whose
    ``content_type`` is ``'xml'`` and that has an ``xml_path``, the chunk
    stored at the parent path is attached as ``parent_info`` and up to five
    chunks stored one level below are attached as ``children_info``. Other
    results are passed through unchanged.

    Args:
        question: Text used as the similarity query.
        selected_files: Values of the ``source_file`` metadata field to search in.
        n_results: Maximum number of primary chunks to return.

    Returns:
        On success ``{'success': True, 'results': [...], 'query': question}``.
        On failure ``{'success': False, 'error': <message>}``; exceptions are
        reported through that dictionary, never raised.
    """
    max_children = 5

    try:
        initial_results = self.get_detailed_context(question, selected_files, n_results)
        if not initial_results['success']:
            return initial_results

        entries_by_file = {}

        def entries_for(source_file):
            """Return cached ``(xml_path, document, metadata)`` triples for a file."""
            if source_file not in entries_by_file:
                # Exact metadata lookup: ``get`` filters without a similarity
                # search. Only one condition is sent because a multi-key
                # ``where`` needs ``$and`` and metadata filters have no prefix
                # operator, so path matching is done in Python below.
                # NOTE(review): this loads every chunk of the file once per
                # call; confirm that is acceptable for very large documents.
                stored = self.collection.get(
                    where={'source_file': source_file},
                    include=["documents", "metadatas"],
                )
                entries_by_file[source_file] = [
                    (meta['xml_path'], doc, meta)
                    for doc, meta in zip(stored['documents'], stored['metadatas'])
                    if meta and meta.get('xml_path')
                ]
            return entries_by_file[source_file]

        hierarchical_results = []
        for result in initial_results['results']:
            metadata = result['metadata']
            path = metadata.get('xml_path')

            # Without a real path there is no hierarchy to walk.
            if metadata['content_type'] == 'xml' and path:
                entries = entries_for(metadata['source_file'])

                # Parent element: everything before the last '/' of the path.
                parent_path = path.rpartition('/')[0]
                if parent_path:
                    for entry_path, doc, meta in entries:
                        if entry_path == parent_path:
                            result['parent_info'] = {'content': doc, 'metadata': meta}
                            break

                # Immediate children: exactly one more path segment.
                child_prefix = path + '/'
                children = []
                for entry_path, doc, meta in entries:
                    if not entry_path.startswith(child_prefix):
                        continue
                    remainder = entry_path[len(child_prefix):]
                    if remainder and '/' not in remainder:
                        children.append({'content': doc, 'metadata': meta})
                        if len(children) == max_children:
                            break
                if children:
                    result['children_info'] = children

            hierarchical_results.append(result)

        return {
            'success': True,
            'results': hierarchical_results,
            'query': question,
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e),
        }
|
490 |
+
|
491 |
+
def get_summary_and_details(self, question: str, selected_files: List[str]) -> Dict:
    """Answer *question* with an LLM summary plus the supporting chunks.

    The chunks come from ``get_hierarchical_context``. They are rendered
    into a text prompt (XML hits include their path, parent and related
    elements) and sent to the Groq chat completion endpoint.

    Args:
        question: The user's question.
        selected_files: Values of the ``source_file`` metadata field to search in.

    Returns:
        On success ``{'success': True, 'summary': <model reply>,
        'details': [...], 'query': question}``. On failure
        ``{'success': False, 'error': <message>}``; exceptions are reported
        through that dictionary, never raised.
    """
    try:
        detailed_results = self.get_hierarchical_context(question, selected_files)
        if not detailed_results['success']:
            return detailed_results

        relevant_content = []
        for result in detailed_results['results']:
            if result['metadata']['content_type'] == 'xml':
                content_info = [
                    f"XML Path: {result['source_info']['path']}",
                    f"Content: {result['content']}",
                ]
                if 'parent_info' in result:
                    content_info.append(f"Parent: {result['parent_info']['content']}")
                if 'children_info' in result:
                    children_content = [child['content'] for child in result['children_info']]
                    content_info.append(f"Related Elements: {', '.join(children_content)}")
            else:
                content_info = [f"Content: {result['content']}"]

            relevant_content.append('\n'.join(content_info))

        # Joined outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        content_block = '\n\n'.join(relevant_content)

        summary_prompt = (
            "Based on the following content, please provide:\n"
            "1. A concise answer to the question\n"
            "2. Key supporting points\n"
            "3. Related context if relevant\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Content:\n"
            f"{content_block}\n"
        )

        response = self.groq_client.chat.completions.create(
            messages=[{"role": "user", "content": summary_prompt}],
            model="llama3-8b-8192",
            temperature=0.2,
        )

        return {
            'success': True,
            'summary': response.choices[0].message.content,
            'details': detailed_results['results'],
            'query': question,
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e),
        }
|