TahaRasouli committed
Commit c94951d · verified · 1 Parent(s): 8fa9209

Update unified_document_processor.py

Files changed (1)
  1. unified_document_processor.py +60 -74
unified_document_processor.py CHANGED

@@ -126,118 +126,104 @@ class UnifiedDocumentProcessor:
         return chunks
 
     def process_xml_file(self, xml_file_path: str) -> Dict:
-        """Process XML file with direct embedding and hierarchy preservation"""
+        """Process XML file with optimized batching and reduced database operations"""
         try:
             tree = ET.parse(xml_file_path)
             root = tree.getroot()
 
-            # Process XML into semantic chunks with context
+            # Process XML into chunks efficiently
             chunks = []
-            current_path = []
+            paths = []
 
-            def process_element(element, context=None):
-                if context is None:
-                    context = {}
-
+            def process_element(element, current_path=""):
                 # Create element description
-                current_path.append(element.tag)
                 element_info = []
 
-                # Add tag information
+                # Add basic information
                 element_info.append(f"Element: {element.tag}")
-                element_info.append(f"Path: {'/' + '/'.join(current_path)}")
 
-                # Process namespace if present
+                # Process namespace only if present
                 if '}' in element.tag:
                     namespace = element.tag.split('}')[0].strip('{')
                     element_info.append(f"Namespace: {namespace}")
 
-                # Process attributes with improved structure
-                if element.attrib:
-                    special_attrs = {
-                        'NodeId': 'Identifier',
-                        'BrowseName': 'Name',
-                        'DisplayName': 'Display Name',
-                        'Description': 'Description',
-                        'DataType': 'Data Type',
-                        'ModellingRule': 'Modeling Rule'
-                    }
-
-                    for key, value in element.attrib.items():
-                        if key in special_attrs:
-                            element_info.append(f"{special_attrs[key]}: {value}")
-                        else:
-                            element_info.append(f"Attribute - {key}: {value}")
+                # Process important attributes only
+                important_attrs = ['NodeId', 'BrowseName', 'DisplayName', 'Description', 'DataType']
+                attrs = {k: v for k, v in element.attrib.items() if k in important_attrs}
+                if attrs:
+                    for key, value in attrs.items():
+                        element_info.append(f"{key}: {value}")
 
-                # Process text content
+                # Process text content if meaningful
                 if element.text and element.text.strip():
                     element_info.append(f"Content: {element.text.strip()}")
 
                 # Create chunk text
                 chunk_text = " | ".join(element_info)
+                new_path = f"{current_path}/{element.tag}" if current_path else element.tag
 
-                # Store chunk with metadata
-                chunks.append({
-                    'text': chunk_text,
-                    'path': '/' + '/'.join(current_path),
-                    'context': context.copy(),
-                    'element_type': element.tag,
-                    'attributes': element.attrib
-                })
+                chunks.append(chunk_text)
+                paths.append(new_path)
 
                 # Process children
-                child_context = context.copy()
-                if element.attrib:
-                    child_context[element.tag] = element.attrib
-
                 for child in element:
-                    process_element(child, child_context)
-
-                current_path.pop()
+                    process_element(child, new_path)
 
             # Start processing from root
             process_element(root)
             print(f"Generated {len(chunks)} XML chunks")
 
+            # Batch process into database
+            batch_size = 100  # Increased batch size
             results = []
-            for i, chunk in enumerate(chunks):
-                try:
-                    metadata = {
-                        'source_file': os.path.basename(xml_file_path),
-                        'content_type': 'xml',
-                        'chunk_id': i,
-                        'total_chunks': len(chunks),
-                        'xml_path': chunk['path'],
-                        'element_type': chunk['element_type'],
-                        'context': json.dumps(chunk['context']),
-                        'timestamp': str(datetime.datetime.now())
-                    }
-
-                    # Store directly in vector database
-                    doc_id = self.store_in_vector_db(chunk['text'], metadata)
-
-                    results.append({
-                        'chunk': i,
-                        'success': True,
-                        'doc_id': doc_id,
-                        'text': chunk['text']
-                    })
-
-                except Exception as e:
-                    print(f"Error processing chunk {i}: {str(e)}")
-                    results.append({
-                        'chunk': i,
-                        'success': False,
-                        'error': str(e)
-                    })
+
+            for i in range(0, len(chunks), batch_size):
+                batch_end = min(i + batch_size, len(chunks))
+                batch_chunks = chunks[i:batch_end]
+                batch_paths = paths[i:batch_end]
+
+                # Prepare batch metadata
+                batch_metadata = [{
+                    'source_file': os.path.basename(xml_file_path),
+                    'content_type': 'xml',
+                    'chunk_id': idx,
+                    'total_chunks': len(chunks),
+                    'xml_path': path,
+                    'timestamp': str(datetime.datetime.now())
+                } for idx, path in enumerate(batch_paths, start=i)]
+
+                # Generate batch IDs
+                batch_ids = [
+                    f"{os.path.basename(xml_file_path)}_xml_{idx}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
+                    for idx in range(i, batch_end)
+                ]
+
+                # Store batch in vector database
+                self.collection.add(
+                    documents=batch_chunks,
+                    metadatas=batch_metadata,
+                    ids=batch_ids
+                )
+
+                # Track results
+                results.extend([{
+                    'chunk': idx,
+                    'success': True,
+                    'doc_id': doc_id,
+                    'text': text
+                } for idx, (doc_id, text) in enumerate(zip(batch_ids, batch_chunks), start=i)])
+
+                # Print progress
+                print(f"Processed chunks {i} to {batch_end} of {len(chunks)}")
 
             return {
                 'success': True,
                 'total_chunks': len(chunks),
                 'results': results
             }
-
+
         except Exception as e:
+            print(f"Error processing XML: {str(e)}")
             return {
                 'success': False,
                 'error': str(e)
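
For reference, the rewritten process_element flattens each XML element into a single " | "-joined description string. Below is a minimal sketch of that chunk format applied to a toy element (the UAVariable sample is illustrative, not taken from the commit):

import xml.etree.ElementTree as ET

# Illustrative OPC UA-style element; attribute names match the commit's important_attrs list
elem = ET.fromstring(
    '<UAVariable NodeId="ns=1;i=2001" BrowseName="1:Speed" DataType="Double">42.0</UAVariable>'
)

important_attrs = ['NodeId', 'BrowseName', 'DisplayName', 'Description', 'DataType']
element_info = [f"Element: {elem.tag}"]
element_info += [f"{k}: {v}" for k, v in elem.attrib.items() if k in important_attrs]
if elem.text and elem.text.strip():
    element_info.append(f"Content: {elem.text.strip()}")

print(" | ".join(element_info))
# Element: UAVariable | NodeId: ns=1;i=2001 | BrowseName: 1:Speed | DataType: Double | Content: 42.0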
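
The substantive change is on the storage side: instead of one store_in_vector_db call per chunk, chunks are now written in batches of 100 through self.collection.add. The add(documents=..., metadatas=..., ids=...) signature matches ChromaDB's collection API, so here is a standalone sketch of the same batching pattern; the client setup, collection name, and sample data are assumptions, since the commit never shows how self.collection is created:

import chromadb

# Assumed setup: an in-memory ChromaDB client and collection (not shown in the commit)
client = chromadb.Client()
collection = client.get_or_create_collection("xml_chunks")

chunks = [f"Element: Node{i}" for i in range(250)]  # stand-in chunk texts
batch_size = 100

for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    # One add() per batch, instead of one call per chunk, amortizes the
    # per-call embedding and storage overhead this commit is targeting
    collection.add(
        documents=batch,
        metadatas=[{"chunk_id": i + j} for j in range(len(batch))],
        ids=[f"chunk_{i + j}" for j in range(len(batch))],
    )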
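
Finally, a possible end-to-end call, assuming an already-constructed UnifiedDocumentProcessor instance named processor (its constructor is outside this diff); the NodeSet-style input file is illustrative:

import os
import tempfile

sample = """<UANodeSet xmlns="http://opcfoundation.org/UA/2011/03/UANodeSet.xsd">
  <UAObject NodeId="ns=1;i=1001" BrowseName="1:Pump" DisplayName="Pump"/>
</UANodeSet>"""

with tempfile.NamedTemporaryFile("w", suffix=".xml", delete=False) as f:
    f.write(sample)
    path = f.name

result = processor.process_xml_file(path)  # processor: hypothetical UnifiedDocumentProcessor
print(result["success"], result["total_chunks"])
os.remove(path)

Because the root element here declares a default namespace, ElementTree reports each tag in Clark notation (e.g. {http://opcfoundation.org/UA/2011/03/UANodeSet.xsd}UAObject), so each chunk also gets a separate Namespace entry via the '}' check in the method above.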