Update unified_document_processor.py
unified_document_processor.py  CHANGED  (+60, -74)
@@ -126,118 +126,104 @@ class UnifiedDocumentProcessor:
Before:

         return chunks

     def process_xml_file(self, xml_file_path: str) -> Dict:
-        """Process XML file with ..."""
         try:
             tree = ET.parse(xml_file_path)
             root = tree.getroot()

-            # Process XML into ...
             chunks = []
-            ...

-            def process_element(element, current_path=[], context=None):
-                if context is None:
-                    context = {}
-
                 # Create element description
-                current_path.append(element.tag)
                 element_info = []

-                # Add ...
                 element_info.append(f"Element: {element.tag}")
-                element_info.append(f"Path: {'/' + '/'.join(current_path)}")

-                # Process namespace if present
                 if '}' in element.tag:
                     namespace = element.tag.split('}')[0].strip('{')
                     element_info.append(f"Namespace: {namespace}")

-                # Process attributes
-                special_attrs = {
-                    ...
-                    'Description': 'Description',
-                    'DataType': 'Data Type',
-                    'ModellingRule': 'Modeling Rule'
-                }
-
-                for key, value in element.attrib.items():
-                    if key in special_attrs:
-                        element_info.append(f"{special_attrs[key]}: {value}")
-                    else:
-                        element_info.append(f"Attribute - {key}: {value}")

-                # Process text content
                 if element.text and element.text.strip():
                     element_info.append(f"Content: {element.text.strip()}")

                 # Create chunk text
                 chunk_text = " | ".join(element_info)

-                chunks.append({
-                    'text': chunk_text,
-                    'path': '/' + '/'.join(current_path),
-                    'context': context.copy(),
-                    'element_type': element.tag,
-                    'attributes': element.attrib
-                })

                 # Process children
-                child_context = context.copy()
-                if element.attrib:
-                    child_context[element.tag] = element.attrib
-
                 for child in element:
-                    process_element(child, current_path, child_context)
-
-                current_path.pop()

             # Start processing from root
             process_element(root)
             print(f"Generated {len(chunks)} XML chunks")

             results = []
-            ...

             return {
                 'success': True,
                 'total_chunks': len(chunks),
                 'results': results
             }
-
         except Exception as e:
             return {
                 'success': False,
                 'error': str(e)
After:

         return chunks

     def process_xml_file(self, xml_file_path: str) -> Dict:
+        """Process XML file with optimized batching and reduced database operations"""
         try:
             tree = ET.parse(xml_file_path)
             root = tree.getroot()

+            # Process XML into chunks efficiently
             chunks = []
+            paths = []

+            def process_element(element, current_path=""):
                 # Create element description
                 element_info = []

+                # Add basic information
                 element_info.append(f"Element: {element.tag}")

+                # Process namespace only if present
                 if '}' in element.tag:
                     namespace = element.tag.split('}')[0].strip('{')
                     element_info.append(f"Namespace: {namespace}")

+                # Process important attributes only
+                important_attrs = ['NodeId', 'BrowseName', 'DisplayName', 'Description', 'DataType']
+                attrs = {k: v for k, v in element.attrib.items() if k in important_attrs}
+                if attrs:
+                    for key, value in attrs.items():
+                        element_info.append(f"{key}: {value}")

+                # Process text content if meaningful
                 if element.text and element.text.strip():
                     element_info.append(f"Content: {element.text.strip()}")

                 # Create chunk text
                 chunk_text = " | ".join(element_info)
+                new_path = f"{current_path}/{element.tag}" if current_path else element.tag

+                chunks.append(chunk_text)
+                paths.append(new_path)

                 # Process children
                 for child in element:
+                    process_element(child, new_path)

             # Start processing from root
             process_element(root)
             print(f"Generated {len(chunks)} XML chunks")

+            # Batch process into database
+            batch_size = 100  # Increased batch size
             results = []
+
+            for i in range(0, len(chunks), batch_size):
+                batch_end = min(i + batch_size, len(chunks))
+                batch_chunks = chunks[i:batch_end]
+                batch_paths = paths[i:batch_end]
+
+                # Prepare batch metadata
+                batch_metadata = [{
+                    'source_file': os.path.basename(xml_file_path),
+                    'content_type': 'xml',
+                    'chunk_id': idx,
+                    'total_chunks': len(chunks),
+                    'xml_path': path,
+                    'timestamp': str(datetime.datetime.now())
+                } for idx, path in enumerate(batch_paths, start=i)]
+
+                # Generate batch IDs
+                batch_ids = [
+                    f"{os.path.basename(xml_file_path)}_xml_{idx}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
+                    for idx in range(i, batch_end)
+                ]
+
+                # Store batch in vector database
+                self.collection.add(
+                    documents=batch_chunks,
+                    metadatas=batch_metadata,
+                    ids=batch_ids
+                )
+
+                # Track results
+                results.extend([{
+                    'chunk': idx,
+                    'success': True,
+                    'doc_id': doc_id,
+                    'text': text
+                } for idx, (doc_id, text) in enumerate(zip(batch_ids, batch_chunks), start=i)])
+
+                # Print progress
+                print(f"Processed chunks {i} to {batch_end} of {len(chunks)}")

             return {
                 'success': True,
                 'total_chunks': len(chunks),
                 'results': results
             }
+
         except Exception as e:
+            print(f"Error processing XML: {str(e)}")
             return {
                 'success': False,
                 'error': str(e)