TahaRasouli committed on
Commit
b658c92
·
verified ·
1 Parent(s): b8885ab

Update unified_document_processor.py

Browse files
Files changed (1) hide show
  1. unified_document_processor.py +167 -1
unified_document_processor.py CHANGED
@@ -377,4 +377,170 @@ class UnifiedDocumentProcessor:
377
  return response.choices[0].message.content
378
 
379
  except Exception as e:
380
- return f"Error processing your question: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
  return response.choices[0].message.content
378
 
379
  except Exception as e:
380
+ return f"Error processing your question: {str(e)}"
381
def get_detailed_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
    """Return the best-matching chunks for a question together with their source metadata.

    Args:
        question: Query text handed to ``self.collection.query``.
        selected_files: ``source_file`` metadata values the search is restricted to.
        n_results: Maximum number of matches requested from the collection.

    Returns:
        On success ``{'success': True, 'results': [...], 'query': question}``, where
        every result carries ``content``, ``metadata``, ``relevance_score`` and a
        ``source_info`` dict (``file``, ``type``, ``path``, ``context``).
        On failure ``{'success': False, 'error': <message>}``. Exceptions are
        reported through that dict and never propagate to the caller.
    """
    # An empty `$in` list is not a usable filter (Chroma's filter validation
    # rejects it), so answer directly with a readable message.
    if not selected_files:
        return {
            'success': False,
            'error': "No files selected"
        }

    try:
        results = self.collection.query(
            query_texts=[question],
            n_results=n_results,
            where={'source_file': {'$in': selected_files}},
            include=["documents", "metadatas", "distances"]
        )

        documents = results['documents'][0]
        if not documents:
            return {
                'success': False,
                'error': "No relevant content found"
            }

        detailed_results = []
        for doc, meta, distance in zip(documents, results['metadatas'][0], results['distances'][0]):
            meta = meta or {}

            # A single chunk with a malformed `context` payload must not
            # discard every other match, so fall back to an empty context.
            raw_context = meta.get('context')
            try:
                context = json.loads(raw_context) if raw_context else {}
            except (TypeError, ValueError):
                context = {}

            detailed_results.append({
                'content': doc,
                'metadata': meta,
                'relevance_score': 1 - distance,  # Convert distance to similarity score
                'source_info': {
                    'file': meta.get('source_file', 'N/A'),
                    'type': meta.get('content_type', 'N/A'),
                    'path': meta.get('xml_path', 'N/A'),  # Only set for XML chunks
                    'context': context
                }
            })

        return {
            'success': True,
            'results': detailed_results,
            'query': question
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e)
        }
427
+
428
def get_hierarchical_context(self, question: str, selected_files: List[str], n_results: int = 5) -> Dict:
    """Return the matches for a question, adding parent/child elements to XML hits.

    The matches come from ``self.get_detailed_context``. Each XML match that has
    an ``xml_path`` in its metadata is extended in place with:

    * ``parent_info``: the chunk of the same file whose ``xml_path`` is the
      match's path minus its last segment (only when such a chunk exists);
    * ``children_info``: up to five chunks of the same file whose ``xml_path``
      is exactly one segment below the match's path (only when any exist).

    Args:
        question: Query text.
        selected_files: ``source_file`` values the search is restricted to.
        n_results: Maximum number of primary matches.

    Returns:
        ``{'success': True, 'results': [...], 'query': question}`` on success;
        the failure dict of ``get_detailed_context`` when the primary search
        fails; ``{'success': False, 'error': <message>}`` on any exception.
    """
    max_children = 5  # same cap the child lookup previously requested

    try:
        # Get initial results
        initial_results = self.get_detailed_context(question, selected_files, n_results)

        if not initial_results['success']:
            return initial_results

        # source_file -> [(document, metadata), ...] for chunks that carry an
        # xml_path. A metadata `where` filter takes a single top-level condition
        # (several conditions need `$and`) and offers no string-prefix operator,
        # so each file's chunks are fetched once with an exact-match `get` and
        # the hierarchy is resolved here instead of via embedding queries.
        entries_by_file = {}

        hierarchical_results = []
        for result in initial_results['results']:
            metadata = result['metadata']
            source_file = metadata.get('source_file')
            # Read the raw metadata value: the 'N/A' placeholder stored in
            # source_info contains a slash and would be parsed as a real path.
            xml_path = metadata.get('xml_path')

            if metadata.get('content_type') == 'xml' and source_file and xml_path:
                if source_file not in entries_by_file:
                    fetched = self.collection.get(
                        where={'source_file': source_file},
                        include=["documents", "metadatas"]
                    )
                    entries_by_file[source_file] = [
                        (doc, meta)
                        for doc, meta in zip(fetched.get('documents') or [], fetched.get('metadatas') or [])
                        if meta and meta.get('xml_path')
                    ]
                entries = entries_by_file[source_file]

                # Parent element: the path without its last segment.
                parent_path = xml_path.rpartition('/')[0]
                if parent_path:
                    for doc, meta in entries:
                        if meta['xml_path'] == parent_path:
                            result['parent_info'] = {
                                'content': doc,
                                'metadata': meta
                            }
                            break

                # Immediate children: exactly one extra, non-empty segment.
                child_prefix = xml_path + '/'
                prefix_len = len(child_prefix)
                children = [
                    {'content': doc, 'metadata': meta}
                    for doc, meta in entries
                    if meta['xml_path'].startswith(child_prefix)
                    and len(meta['xml_path']) > prefix_len
                    and '/' not in meta['xml_path'][prefix_len:]
                ]
                if children:
                    result['children_info'] = children[:max_children]

            hierarchical_results.append(result)

        return {
            'success': True,
            'results': hierarchical_results,
            'query': question
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e)
        }
490
+
491
def get_summary_and_details(self, question: str, selected_files: List[str]) -> Dict:
    """Answer a question with an LLM summary plus the supporting matches.

    The matches come from ``self.get_hierarchical_context``. They are rendered
    into a prompt (XML matches include their path and, when present, parent
    and child element content) and sent to ``self.groq_client``.

    Args:
        question: The user's question.
        selected_files: ``source_file`` values the search is restricted to.

    Returns:
        ``{'success': True, 'summary': <model reply>, 'details': [...],
        'query': question}`` on success; the failure dict of
        ``get_hierarchical_context`` when the search fails;
        ``{'success': False, 'error': <message>}`` on any exception.
    """
    try:
        # Get hierarchical context first
        detailed_results = self.get_hierarchical_context(question, selected_files)

        if not detailed_results['success']:
            return detailed_results

        # Render every match as one text section of the prompt.
        relevant_content = []
        for result in detailed_results['results']:
            if result['metadata'].get('content_type') == 'xml':
                content_info = [
                    f"XML Path: {result['source_info']['path']}",
                    f"Content: {result['content']}"
                ]
                if 'parent_info' in result:
                    content_info.append(f"Parent: {result['parent_info']['content']}")
                if 'children_info' in result:
                    children_content = [child['content'] for child in result['children_info']]
                    content_info.append(f"Related Elements: {', '.join(children_content)}")
            else:
                content_info = [f"Content: {result['content']}"]

            relevant_content.append('\n'.join(content_info))

        # Joined outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError before Python 3.12. Building the prompt
        # from explicit lines also keeps source indentation out of the text
        # sent to the model.
        content_block = '\n\n'.join(relevant_content)
        summary_prompt = (
            "Based on the following content, please provide:\n"
            "1. A concise answer to the question\n"
            "2. Key supporting points\n"
            "3. Related context if relevant\n"
            "\n"
            f"Question: {question}\n"
            "\n"
            "Content:\n"
            f"{content_block}\n"
        )

        response = self.groq_client.chat.completions.create(
            messages=[{"role": "user", "content": summary_prompt}],
            model="llama3-8b-8192",
            temperature=0.2
        )

        return {
            'success': True,
            'summary': response.choices[0].message.content,
            'details': detailed_results['results'],
            'query': question
        }

    except Exception as e:
        return {
            'success': False,
            'error': str(e)
        }