Update app.py
app.py
CHANGED
@@ -1,11 +1,220 @@
 import streamlit as st
 import os
 import tempfile
-from typing import List
+from typing import List, Dict, Optional, Tuple
 from unified_document_processor import UnifiedDocumentProcessor, CustomEmbeddingFunction
 import chromadb
 from chromadb.config import Settings
 from groq import Groq
+import json
+import xml.etree.ElementTree as ET
+
+class EnhancedXMLProcessor:
+    def __init__(self):
+        self.processed_nodes = set()
+        self.reference_map = {}
+        self.node_info = {}
+
+    def build_reference_map(self, root) -> None:
+        for element in root.findall('.//*'):
+            node_id = element.get('NodeId')
+            if node_id:
+                self.node_info[node_id] = {
+                    'tag': element.tag,
+                    'browse_name': element.get('BrowseName', ''),
+                    'display_name': self._get_display_name(element),
+                    'description': self._get_description(element),
+                    'data_type': element.get('DataType', ''),
+                    'references': []
+                }
+
+                refs = element.find('References')
+                if refs is not None:
+                    for ref in refs.findall('Reference'):
+                        ref_type = ref.get('ReferenceType')
+                        is_forward = ref.get('IsForward', 'true') == 'true'
+                        target = ref.text
+
+                        if ref_type in ['HasComponent', 'HasProperty', 'HasTypeDefinition']:
+                            self.reference_map.setdefault(node_id, []).append({
+                                'type': ref_type,
+                                'target': target,
+                                'is_forward': is_forward
+                            })
+                            self.node_info[node_id]['references'].append({
+                                'type': ref_type,
+                                'target': target,
+                                'is_forward': is_forward
+                            })
+
+    def _get_display_name(self, element) -> str:
+        display_name = element.find('DisplayName')
+        if display_name is not None:
+            return display_name.text
+        return ''
+
+    def _get_description(self, element) -> str:
+        desc = element.find('Description')
+        if desc is not None:
+            return desc.text
+        return ''
+
+    def _get_value(self, element) -> Optional[str]:
+        value_elem = element.find('.//Value')
+        if value_elem is not None:
+            for child in value_elem:
+                if child.text:
+                    return child.text
+        return None
+
+    def generate_natural_language(self, node_id: str, depth: int = 0, visited: set = None) -> List[str]:
+        if visited is None:
+            visited = set()
+
+        if node_id in visited:
+            return []
+
+        visited.add(node_id)
+        descriptions = []
+
+        node = self.node_info.get(node_id)
+        if not node:
+            return []
+
+        base_desc = self._build_base_description(node, depth)
+        if base_desc:
+            descriptions.append(base_desc)
+
+        if node_id in self.reference_map:
+            child_descriptions = self._process_forward_references(node_id, depth + 1, visited)
+            descriptions.extend(child_descriptions)
+
+        return descriptions
+
+    def _build_base_description(self, node: Dict, depth: int) -> str:
+        indentation = " " * depth
+        desc_parts = []
+
+        if node['browse_name']:
+            browse_name = node['browse_name'].split(':')[-1]
+            desc_parts.append(f"a {browse_name}")
+
+        if node['display_name']:
+            desc_parts.append(f"(displayed as '{node['display_name']}')")
+
+        if node['data_type']:
+            desc_parts.append(f"of type {node['data_type']}")
+
+        if node['description']:
+            desc_parts.append(f"which {node['description']}")
+
+        if desc_parts:
+            return f"{indentation}Contains {' '.join(desc_parts)}"
+        return ""
+
+    def _process_forward_references(self, node_id: str, depth: int, visited: set) -> List[str]:
+        descriptions = []
+
+        for ref in self.reference_map.get(node_id, []):
+            if ref['is_forward'] and ref['type'] in ['HasComponent', 'HasProperty']:
+                target_descriptions = self.generate_natural_language(ref['target'], depth, visited)
+                descriptions.extend(target_descriptions)
+
+        return descriptions
+
+    def generate_complete_description(self, root) -> str:
+        self.build_reference_map(root)
+
+        root_descriptions = []
+        for node_id in self.node_info:
+            is_root = True
+            for ref_list in self.reference_map.values():
+                for ref in ref_list:
+                    if not ref['is_forward'] and ref['type'] == 'HasComponent' and ref['target'] == node_id:
+                        is_root = False
+                        break
+                if not is_root:
+                    break
+
+            if is_root:
+                descriptions = self.generate_natural_language(node_id)
+                root_descriptions.extend(descriptions)
+
+        return "\n".join(root_descriptions)
+
+    def flatten_xml_to_text(self, element, depth=0) -> str:
+        try:
+            return self.generate_complete_description(element)
+        except Exception as e:
+            print(f"Error in enhanced XML processing: {str(e)}")
+            return self._original_flatten_xml_to_text(element, depth)
+
+    def _original_flatten_xml_to_text(self, element, depth=0) -> str:
+        text_parts = []
+
+        element_info = f"Element: {element.tag}"
+        if element.attrib:
+            element_info += f", Attributes: {json.dumps(element.attrib)}"
+        if element.text and element.text.strip():
+            element_info += f", Text: {element.text.strip()}"
+        text_parts.append(element_info)
+
+        for child in element:
+            child_text = self._original_flatten_xml_to_text(child, depth + 1)
+            text_parts.append(child_text)
+
+        return "\n".join(text_parts)
+
+class PersistentUnifiedDocumentProcessor(UnifiedDocumentProcessor):
+    def __init__(self, api_key, collection_name="unified_content", persist_dir=None):
+        self.groq_client = Groq(api_key=api_key)
+        self.max_elements_per_chunk = 50
+        self.pdf_chunk_size = 500
+        self.pdf_overlap = 50
+        self._initialize_nltk()
+        self.xml_processor = EnhancedXMLProcessor()
+
+        self.chroma_client = chromadb.PersistentClient(
+            path=persist_dir,
+            settings=Settings(
+                allow_reset=True,
+                is_persistent=True
+            )
+        )
+
+        try:
+            self.collection = self.chroma_client.get_collection(
+                name=collection_name,
+                embedding_function=CustomEmbeddingFunction()
+            )
+        except:
+            self.collection = self.chroma_client.create_collection(
+                name=collection_name,
+                embedding_function=CustomEmbeddingFunction()
+            )
+
+    def flatten_xml_to_text(self, element, depth=0) -> str:
+        try:
+            return self.xml_processor.generate_complete_description(element)
+        except Exception as e:
+            st.error(f"Error in enhanced XML processing: {str(e)}")
+            return self._original_flatten_xml_to_text(element, depth)
+
+    def _original_flatten_xml_to_text(self, element, depth=0) -> str:
+        text_parts = []
+
+        element_info = f"Element: {element.tag}"
+        if element.attrib:
+            element_info += f", Attributes: {json.dumps(element.attrib)}"
+        if element.text and element.text.strip():
+            element_info += f", Text: {element.text.strip()}"
+        text_parts.append(element_info)
+
+        for child in element:
+            child_text = self._original_flatten_xml_to_text(child, depth + 1)
+            text_parts.append(child_text)
+
+        return "\n".join(text_parts)

 def initialize_session_state():
     """Initialize all session state variables"""
@@ -14,96 +223,63 @@ def initialize_session_state():
     os.makedirs(st.session_state.CHROMADB_DIR, exist_ok=True)

     if 'processed_files' not in st.session_state:
-        st.session_state.processed_files =
+        st.session_state.processed_files = set()

     if 'processor' not in st.session_state:
-
-        st.session_state.processor = None  # Will be initialized in StreamlitDocProcessor
-    except Exception as e:
-        st.error(f"Error initializing processor: {str(e)}")
+        st.session_state.processor = None

 class StreamlitDocProcessor:
     def __init__(self):
+        self.ensure_processor_initialized()
+
+    def ensure_processor_initialized(self):
         if st.session_state.processor is None:
             try:
-                groq_api_key =
-
+                groq_api_key = os.getenv('GROQ_API_KEY')
+                if not groq_api_key:
+                    st.error("GROQ API key not found. Please set the GROQ_API_KEY environment variable.")
+                    return False
+
                 st.session_state.processor = self.initialize_processor(groq_api_key)
-
-
+                if st.session_state.processor:
+                    st.session_state.processed_files = self.get_processed_files()
+                    return True
             except Exception as e:
                 st.error(f"Error initializing processor: {str(e)}")
-                return
+                return False
+        return True

     def initialize_processor(self, groq_api_key):
-        """Initialize the processor with persistent ChromaDB"""
-        class PersistentUnifiedDocumentProcessor(UnifiedDocumentProcessor):
-            def __init__(self, api_key, collection_name="unified_content", persist_dir=None):
-                self.groq_client = Groq(api_key=api_key)
-                self.max_elements_per_chunk = 50
-                self.pdf_chunk_size = 500
-                self.pdf_overlap = 50
-                self._initialize_nltk()
-
-                # Initialize persistent ChromaDB
-                self.chroma_client = chromadb.PersistentClient(
-                    path=persist_dir,
-                    settings=Settings(
-                        allow_reset=True,
-                        is_persistent=True
-                    )
-                )
-
-                # Get or create collection
-                try:
-                    self.collection = self.chroma_client.get_collection(
-                        name=collection_name,
-                        embedding_function=CustomEmbeddingFunction()
-                    )
-                except:
-                    self.collection = self.chroma_client.create_collection(
-                        name=collection_name,
-                        embedding_function=CustomEmbeddingFunction()
-                    )
-
         return PersistentUnifiedDocumentProcessor(
             groq_api_key,
             persist_dir=st.session_state.CHROMADB_DIR
         )

-    def get_processed_files(self) ->
-        """Get list of processed files from ChromaDB"""
+    def get_processed_files(self) -> set:
         try:
             if st.session_state.processor:
-
-
+                available_files = st.session_state.processor.get_available_files()
+                return set(available_files['pdf'] + available_files['xml'])
+            return set()
         except Exception as e:
             st.error(f"Error getting processed files: {str(e)}")
-            return
+            return set()

     def run(self):
-        st.title("
-
-        # Create sidebar for navigation
-        st.sidebar.title("Navigation")
-        page = st.sidebar.selectbox(
-            "Choose a page",
-            ["Upload & Process", "Query"]
-        )
+        st.title("AAS Assistant")

-
-
-
-
-
-        - Process them for semantic search
-        - Query the documents with different levels of detail
-        """)
+        if self.ensure_processor_initialized():
+            page = st.sidebar.selectbox(
+                "Choose a page",
+                ["Upload & Process", "Query"]
+            )

-
-
+            if page == "Upload & Process":
+                self.upload_and_process_page()
+            else:
+                self.qa_page()
         else:
-
+            st.error("Please set the GROQ_API_KEY environment variable and restart the application.")

     def upload_and_process_page(self):
         st.header("Upload and Process Documents")
@@ -113,187 +289,86 @@ class StreamlitDocProcessor:
             type=['pdf', 'xml'],
             accept_multiple_files=True
         )
-
+
         if uploaded_files:
             for uploaded_file in uploaded_files:
-                # Create progress containers
                 progress_bar = st.progress(0)
-
+                status_text = st.empty()

-
-                if uploaded_file.name not in st.session_state.processed_files.get(file_ext, []):
+                if uploaded_file.name not in st.session_state.processed_files:
                     try:
                         with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
                             tmp_file.write(uploaded_file.getbuffer())
                             temp_path = tmp_file.name
-
-
+
+                        status_text.text(f'Processing {uploaded_file.name}...')
+                        progress_bar.progress(25)

-
-
-                            status_container.text('Parsing XML...')
-                            progress_bar.progress(10)
-
-                            result = st.session_state.processor.process_file(temp_path)
-
-                            if result['success']:
-                                total_chunks = result['total_chunks']
-                                for i, chunk_result in enumerate(result['results']):
-                                    # Update progress for each batch
-                                    progress = min(95, int(10 + (85 * (i / total_chunks))))
-                                    progress_bar.progress(progress)
-                                    status_container.text(f'Processing chunk {i+1}/{total_chunks}...')
-                        else:
-                            # Regular PDF processing
-                            result = st.session_state.processor.process_file(temp_path)
+                        result = st.session_state.processor.process_file(temp_path)
+                        progress_bar.progress(75)

                         if result['success']:
-
-                                st.session_state.processed_files[file_ext] = []
-                            st.session_state.processed_files[file_ext].append(uploaded_file.name)
+                            st.session_state.processed_files.add(uploaded_file.name)
                             progress_bar.progress(100)
-
+                            status_text.success(f"Successfully processed {uploaded_file.name}")
                         else:
                             progress_bar.progress(100)
-
-
+                            status_text.error(f"Failed to process {uploaded_file.name}: {result['error']}")
+
                     except Exception as e:
-
+                        status_text.error(f"Error processing {uploaded_file.name}: {str(e)}")
                     finally:
                         try:
                             os.unlink(temp_path)
                         except:
                             pass
                 else:
-
+                    status_text.info(f"{uploaded_file.name} has already been processed")
                     progress_bar.progress(100)

+        if st.session_state.processed_files:
+            st.subheader("Processed Files")
+            for file in sorted(st.session_state.processed_files):
+                st.text(f"✓ {file}")

     def qa_page(self):
-        st.header("Query
-
+        st.header("Query our database")
+
         try:
-            # Refresh available files
             st.session_state.processed_files = self.get_processed_files()

-            if not
+            if not st.session_state.processed_files:
                 st.warning("No processed files available. Please upload and process some files first.")
                 return
-
-            # Create combined list of files with icons
-            all_files = []
-            for file in st.session_state.processed_files.get('xml', []):
-                all_files.append(f"📱 {file}")
-            for file in st.session_state.processed_files.get('pdf', []):
-                all_files.append(f"📄 {file}")
-
-            if not all_files:
-                st.warning("No processed files available. Please upload and process some files first.")
-                return
-
-            # File selection
+
             selected_files = st.multiselect(
                 "Select files to search through",
-                sorted(
-                default=
+                sorted(list(st.session_state.processed_files)),
+                default=list(st.session_state.processed_files)
             )

-            # Remove icons from selected files
-            selected_files = [f.split(' ', 1)[1] for f in selected_files]
-
             if not selected_files:
                 st.warning("Please select at least one file to search through.")
                 return
-
-            # Question input
+
             question = st.text_input("Enter your question:")

-            if question:
-
-
-
-
-
-
-
-
-
-
-                st.write("Answer:", answer)
-            except Exception as e:
-                st.error(f"Error getting answer: {str(e)}")
-
-            with col2:
-                if st.button("Detailed Answer"):
-                    try:
-                        with st.spinner("Getting detailed answer..."):
-                            result = st.session_state.processor.get_detailed_context(
-                                question,
-                                selected_files
-                            )
-                        if result['success']:
-                            st.write("### Relevant Information")
-                            for item in result['results']:
-                                with st.expander(f"Source: {item['metadata']['source_file']} ({item['metadata']['content_type'].upper()})"):
-                                    # Use similarity_score instead of relevance_score
-                                    st.write(f"Similarity Score: {item['similarity_score']}%")
-                                    if item['metadata']['content_type'] == 'xml':
-                                        st.write(f"XML Path: {item['source_info']['path']}")
-                                    st.write("Content:", item['content'])
-                        else:
-                            st.error(result['error'])
-                    except Exception as e:
-                        st.error(f"Error getting detailed answer: {str(e)}")
-
-            with col3:
-                if st.button("Complete Analysis"):
-                    try:
-                        with st.spinner("Performing complete analysis..."):
-                            result = st.session_state.processor.get_summary_and_details(
-                                question,
-                                selected_files
-                            )
-                        if result['success']:
-                            st.write("### Summary")
-                            st.write(result['summary'])
-
-                            st.write("### Detailed Information")
-                            for item in result['details']:
-                                with st.expander(f"Source: {item['metadata']['source_file']} ({item['metadata']['content_type'].upper()})"):
-                                    # Use similarity_score instead of relevance_score
-                                    st.write(f"Similarity Score: {item.get('similarity_score', 'N/A')}%")
-                                    if item['metadata']['content_type'] == 'xml':
-                                        st.write(f"XML Path: {item['source_info']['path']}")
-                                    if 'parent_info' in item:
-                                        st.write("Parent Element:", item['parent_info']['content'])
-                                    if 'children_info' in item:
-                                        st.write("Related Elements:")
-                                        for child in item['children_info']:
-                                            st.write(f"- {child['content']}")
-                                    st.write("Content:", item['content'])
-                        else:
-                            st.error(result['error'])
-                    except Exception as e:
-                        st.error(f"Error getting complete analysis: {str(e)}")
-
+            if st.button("Ask Question") and question:
+                try:
+                    with st.spinner("Searching for answer..."):
+                        answer = st.session_state.processor.ask_question_selective(
+                            question,
+                            selected_files
+                        )
+                    st.write("Answer:", answer)
+                except Exception as e:
+                    st.error(f"Error getting answer: {str(e)}")
+
         except Exception as e:
             st.error(f"Error in Q&A interface: {str(e)}")

-
-
 def main():
-    # Set page config
-    st.set_page_config(
-        page_title="Document Assistant",
-        page_icon="📚",
-        layout="wide",
-        initial_sidebar_state="expanded"
-    )
-
-    # Initialize session state
     initialize_session_state()
-
-    # Create and run app
     app = StreamlitDocProcessor()
     app.run()
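The centerpiece of the commit is the new EnhancedXMLProcessor, which walks an OPC UA NodeSet-style document (NodeId, BrowseName, DisplayName, References), builds a reference map, and renders the node graph as indented natural-language lines that embed better than raw flattened XML. A minimal sketch of the class in isolation, assuming the code above is importable as app.py; the two-node fragment is hypothetical and uses only the attributes and child elements the class actually reads:

import xml.etree.ElementTree as ET

from app import EnhancedXMLProcessor  # assumption: the file above is saved as app.py

# Hypothetical NodeSet-style fragment with forward references only.
SAMPLE = """
<UANodeSet>
  <UAObject NodeId="ns=1;i=1001" BrowseName="1:Motor">
    <DisplayName>Motor</DisplayName>
    <Description>drives the main conveyor</Description>
    <References>
      <Reference ReferenceType="HasComponent">ns=1;i=1002</Reference>
    </References>
  </UAObject>
  <UAVariable NodeId="ns=1;i=1002" BrowseName="1:Speed" DataType="Double">
    <DisplayName>Speed</DisplayName>
  </UAVariable>
</UANodeSet>
"""

root = ET.fromstring(SAMPLE)
processor = EnhancedXMLProcessor()
# Nodes that no inverse HasComponent reference targets count as roots; each
# root is rendered as a "Contains a ..." line, then forward HasComponent and
# HasProperty references are followed with one extra space of indent per level.
print(processor.generate_complete_description(root))
# Contains a Motor (displayed as 'Motor') which drives the main conveyor
#  Contains a Speed (displayed as 'Speed') of type Double
# Contains a Speed (displayed as 'Speed') of type Double

Speed appears twice because this fragment carries no inverse references, so both nodes pass the root test; real NodeSet exports include the inverse HasComponent entries that the root check keys on.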
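One design note on the ChromaDB wiring that the old and new __init__ share: the get_collection/bare-except/create_collection sequence works, but the bare except also swallows unrelated failures. chromadb exposes get_or_create_collection, which covers the same case in one call; a minimal sketch, assuming the same CustomEmbeddingFunction and a local ./chromadb directory standing in for st.session_state.CHROMADB_DIR:

import chromadb
from chromadb.config import Settings
from unified_document_processor import CustomEmbeddingFunction

# Same persistent client the commit configures.
client = chromadb.PersistentClient(
    path="./chromadb",
    settings=Settings(allow_reset=True, is_persistent=True),
)

# Fetches the collection if it already exists on disk, creates it otherwise,
# replacing the try/get/except/create pattern used in the diff above.
collection = client.get_or_create_collection(
    name="unified_content",
    embedding_function=CustomEmbeddingFunction(),
)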