TahaRasouli committed
Commit c94951d · verified · 1 Parent(s): 8fa9209

Update unified_document_processor.py

Files changed (1)
  1. unified_document_processor.py +60 -74
unified_document_processor.py CHANGED

@@ -126,118 +126,104 @@ class UnifiedDocumentProcessor:
         return chunks
 
     def process_xml_file(self, xml_file_path: str) -> Dict:
-        """Process XML file with direct embedding and hierarchy preservation"""
+        """Process XML file with optimized batching and reduced database operations"""
         try:
             tree = ET.parse(xml_file_path)
             root = tree.getroot()
 
-            # Process XML into semantic chunks with context
+            # Process XML into chunks efficiently
             chunks = []
-            current_path = []
+            paths = []
 
-            def process_element(element, context=None):
-                if context is None:
-                    context = {}
-
+            def process_element(element, current_path=""):
                 # Create element description
-                current_path.append(element.tag)
                 element_info = []
 
-                # Add tag information
+                # Add basic information
                 element_info.append(f"Element: {element.tag}")
-                element_info.append(f"Path: {'/' + '/'.join(current_path)}")
 
-                # Process namespace if present
+                # Process namespace only if present
                 if '}' in element.tag:
                     namespace = element.tag.split('}')[0].strip('{')
                     element_info.append(f"Namespace: {namespace}")
 
-                # Process attributes with improved structure
-                if element.attrib:
-                    special_attrs = {
-                        'NodeId': 'Identifier',
-                        'BrowseName': 'Name',
-                        'DisplayName': 'Display Name',
-                        'Description': 'Description',
-                        'DataType': 'Data Type',
-                        'ModellingRule': 'Modeling Rule'
-                    }
-
-                    for key, value in element.attrib.items():
-                        if key in special_attrs:
-                            element_info.append(f"{special_attrs[key]}: {value}")
-                        else:
-                            element_info.append(f"Attribute - {key}: {value}")
+                # Process important attributes only
+                important_attrs = ['NodeId', 'BrowseName', 'DisplayName', 'Description', 'DataType']
+                attrs = {k: v for k, v in element.attrib.items() if k in important_attrs}
+                if attrs:
+                    for key, value in attrs.items():
+                        element_info.append(f"{key}: {value}")
 
-                # Process text content
+                # Process text content if meaningful
                 if element.text and element.text.strip():
                     element_info.append(f"Content: {element.text.strip()}")
 
                 # Create chunk text
                 chunk_text = " | ".join(element_info)
+                new_path = f"{current_path}/{element.tag}" if current_path else element.tag
 
-                # Store chunk with metadata
-                chunks.append({
-                    'text': chunk_text,
-                    'path': '/' + '/'.join(current_path),
-                    'context': context.copy(),
-                    'element_type': element.tag,
-                    'attributes': element.attrib
-                })
+                chunks.append(chunk_text)
+                paths.append(new_path)
 
                 # Process children
-                child_context = context.copy()
-                if element.attrib:
-                    child_context[element.tag] = element.attrib
-
                 for child in element:
-                    process_element(child, child_context)
-
-                current_path.pop()
+                    process_element(child, new_path)
 
             # Start processing from root
             process_element(root)
             print(f"Generated {len(chunks)} XML chunks")
 
+            # Batch process into database
+            batch_size = 100  # Increased batch size
             results = []
-            for i, chunk in enumerate(chunks):
-                try:
-                    metadata = {
-                        'source_file': os.path.basename(xml_file_path),
-                        'content_type': 'xml',
-                        'chunk_id': i,
-                        'total_chunks': len(chunks),
-                        'xml_path': chunk['path'],
-                        'element_type': chunk['element_type'],
-                        'context': json.dumps(chunk['context']),
-                        'timestamp': str(datetime.datetime.now())
-                    }
-
-                    # Store directly in vector database
-                    doc_id = self.store_in_vector_db(chunk['text'], metadata)
-
-                    results.append({
-                        'chunk': i,
-                        'success': True,
-                        'doc_id': doc_id,
-                        'text': chunk['text']
-                    })
-
-                except Exception as e:
-                    print(f"Error processing chunk {i}: {str(e)}")
-                    results.append({
-                        'chunk': i,
-                        'success': False,
-                        'error': str(e)
-                    })
+
+            for i in range(0, len(chunks), batch_size):
+                batch_end = min(i + batch_size, len(chunks))
+                batch_chunks = chunks[i:batch_end]
+                batch_paths = paths[i:batch_end]
+
+                # Prepare batch metadata
+                batch_metadata = [{
+                    'source_file': os.path.basename(xml_file_path),
+                    'content_type': 'xml',
+                    'chunk_id': idx,
+                    'total_chunks': len(chunks),
+                    'xml_path': path,
+                    'timestamp': str(datetime.datetime.now())
+                } for idx, path in enumerate(batch_paths, start=i)]
+
+                # Generate batch IDs
+                batch_ids = [
+                    f"{os.path.basename(xml_file_path)}_xml_{idx}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
+                    for idx in range(i, batch_end)
+                ]
+
+                # Store batch in vector database
+                self.collection.add(
+                    documents=batch_chunks,
+                    metadatas=batch_metadata,
+                    ids=batch_ids
+                )
+
+                # Track results
+                results.extend([{
+                    'chunk': idx,
+                    'success': True,
+                    'doc_id': doc_id,
+                    'text': text
+                } for idx, (doc_id, text) in enumerate(zip(batch_ids, batch_chunks), start=i)])
+
+                # Print progress
+                print(f"Processed chunks {i} to {batch_end} of {len(chunks)}")
 
             return {
                 'success': True,
                 'total_chunks': len(chunks),
                 'results': results
             }
-
+
         except Exception as e:
+            print(f"Error processing XML: {str(e)}")
             return {
                 'success': False,
                 'error': str(e)
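
For reference, the rewritten process_element flattens each XML element into a single " | "-joined description string. Below is a minimal sketch of that chunk format applied to a toy element (the UAVariable sample is illustrative, not taken from the commit):

import xml.etree.ElementTree as ET

# Illustrative OPC UA-style element; attribute names match the commit's important_attrs list
elem = ET.fromstring(
    '<UAVariable NodeId="ns=1;i=2001" BrowseName="1:Speed" DataType="Double">42.0</UAVariable>'
)

important_attrs = ['NodeId', 'BrowseName', 'DisplayName', 'Description', 'DataType']
element_info = [f"Element: {elem.tag}"]
element_info += [f"{k}: {v}" for k, v in elem.attrib.items() if k in important_attrs]
if elem.text and elem.text.strip():
    element_info.append(f"Content: {elem.text.strip()}")

print(" | ".join(element_info))
# Element: UAVariable | NodeId: ns=1;i=2001 | BrowseName: 1:Speed | DataType: Double | Content: 42.0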
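
The substantive change is on the storage side: instead of one store_in_vector_db call per chunk, chunks are now written in batches of 100 through self.collection.add. The add(documents=..., metadatas=..., ids=...) signature matches ChromaDB's collection API, so here is a standalone sketch of the same batching pattern; the client setup, collection name, and sample data are assumptions, since the commit never shows how self.collection is created:

import chromadb

# Assumed setup: an in-memory ChromaDB client and collection (not shown in the commit)
client = chromadb.Client()
collection = client.get_or_create_collection("xml_chunks")

chunks = [f"Element: Node{i}" for i in range(250)]  # stand-in chunk texts
batch_size = 100

for i in range(0, len(chunks), batch_size):
    batch = chunks[i:i + batch_size]
    # One add() per batch, instead of one call per chunk, amortizes the
    # per-call embedding and storage overhead this commit is targeting
    collection.add(
        documents=batch,
        metadatas=[{"chunk_id": i + j} for j in range(len(batch))],
        ids=[f"chunk_{i + j}" for j in range(len(batch))],
    )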
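
Finally, a possible end-to-end call, assuming an already-constructed UnifiedDocumentProcessor instance named processor (its constructor is outside this diff); the NodeSet-style input file is illustrative:

import os
import tempfile

sample = """<UANodeSet xmlns="http://opcfoundation.org/UA/2011/03/UANodeSet.xsd">
  <UAObject NodeId="ns=1;i=1001" BrowseName="1:Pump" DisplayName="Pump"/>
</UANodeSet>"""

with tempfile.NamedTemporaryFile("w", suffix=".xml", delete=False) as f:
    f.write(sample)
    path = f.name

result = processor.process_xml_file(path)  # processor: hypothetical UnifiedDocumentProcessor
print(result["success"], result["total_chunks"])
os.remove(path)

Because the root element here declares a default namespace, ElementTree reports each tag in Clark notation (e.g. {http://opcfoundation.org/UA/2011/03/UANodeSet.xsd}UAObject), so each chunk also gets a separate Namespace entry via the '}' check in the method above.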