File size: 14,897 Bytes
d59e7dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8885ab
d59e7dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b8885ab
d59e7dc
 
 
b8885ab
 
d59e7dc
 
b8885ab
d59e7dc
 
b8885ab
d59e7dc
 
412b7cf
d59e7dc
 
 
 
 
412b7cf
 
 
 
 
 
 
 
 
d59e7dc
 
 
 
 
 
 
 
412b7cf
 
 
 
 
 
 
 
 
 
d59e7dc
 
 
 
412b7cf
 
d59e7dc
 
 
 
412b7cf
 
 
 
 
 
d59e7dc
b8885ab
 
 
d59e7dc
 
 
 
 
 
 
412b7cf
d59e7dc
 
 
 
 
 
b8885ab
 
 
d59e7dc
412b7cf
d59e7dc
 
412b7cf
 
d59e7dc
 
412b7cf
 
d59e7dc
 
 
 
 
 
 
412b7cf
d59e7dc
 
 
b8885ab
d59e7dc
412b7cf
 
 
 
 
 
 
 
 
 
 
 
 
 
a942df2
d59e7dc
b8885ab
412b7cf
d59e7dc
 
 
 
b8885ab
d59e7dc
 
412b7cf
b8885ab
 
 
 
 
 
412b7cf
b8885ab
 
 
412b7cf
 
 
 
 
 
 
 
 
 
b8885ab
d59e7dc
 
b8885ab
412b7cf
 
d59e7dc
412b7cf
b8885ab
 
412b7cf
d59e7dc
 
 
 
412b7cf
 
 
 
 
d59e7dc
412b7cf
 
 
 
 
 
 
 
 
 
 
 
 
 
a942df2
412b7cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a942df2
412b7cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d59e7dc
 
 
 
 
412b7cf
 
 
 
 
 
 
 
d59e7dc
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
import streamlit as st
import os
import tempfile
from typing import List
from unified_document_processor import UnifiedDocumentProcessor, CustomEmbeddingFunction
import chromadb
from chromadb.config import Settings
from groq import Groq

def initialize_session_state():
    """Seed ``st.session_state`` with the keys the app relies on.

    Keys that already exist are left untouched:

    * ``CHROMADB_DIR`` -- ``<cwd>/chromadb_data``; the directory is created
      when the key is first set.
    * ``processed_files`` -- ``{'pdf': [], 'xml': []}``.
    * ``processor`` -- ``None`` placeholder; the real processor is built by
      ``StreamlitDocProcessor.__init__``.
    """
    state = st.session_state

    if 'CHROMADB_DIR' not in state:
        state.CHROMADB_DIR = os.path.join(os.getcwd(), 'chromadb_data')
        os.makedirs(state.CHROMADB_DIR, exist_ok=True)

    if 'processed_files' not in state:
        state.processed_files = dict(pdf=[], xml=[])

    if 'processor' not in state:
        # Plain placeholder assignment: there is nothing here that can fail,
        # so no error handling is needed around it.
        state.processor = None

class StreamlitDocProcessor:
    """Streamlit front end for uploading, indexing and querying PDF/XML files.

    The document processor is stored in ``st.session_state.processor`` and is
    only built when that slot is still ``None``.
    """

    def __init__(self):
        """Build the shared processor on first use and load the known files.

        Any failure (missing ``GROQ_API_KEY`` secret, ChromaDB error, ...) is
        reported with ``st.error`` and leaves ``st.session_state.processor``
        set to ``None``.
        """
        if st.session_state.processor is not None:
            return
        try:
            groq_api_key = st.secrets["GROQ_API_KEY"]
            # Initialize processor with persistent ChromaDB
            st.session_state.processor = self.initialize_processor(groq_api_key)
            # Update processed files after initializing processor
            st.session_state.processed_files = self.get_processed_files()
        except Exception as e:
            st.error(f"Error initializing processor: {e}")

    def initialize_processor(self, groq_api_key):
        """Create a processor whose ChromaDB collection is persisted on disk.

        Args:
            groq_api_key: API key handed to the ``Groq`` client.

        Returns:
            A ``UnifiedDocumentProcessor`` subclass instance that stores its
            data under ``st.session_state.CHROMADB_DIR``.
        """
        class PersistentUnifiedDocumentProcessor(UnifiedDocumentProcessor):
            """Processor variant backed by ``chromadb.PersistentClient``."""

            def __init__(self, api_key, collection_name="unified_content", persist_dir=None):
                # NOTE(review): the parent __init__ is not called here;
                # presumably it would build its own (non-persistent) client.
                # The attributes below must therefore stay in sync with
                # whatever the parent class expects -- confirm against it.
                self.groq_client = Groq(api_key=api_key)
                self.max_elements_per_chunk = 50
                self.pdf_chunk_size = 500
                self.pdf_overlap = 50
                self._initialize_nltk()

                # Initialize persistent ChromaDB
                self.chroma_client = chromadb.PersistentClient(
                    path=persist_dir,
                    settings=Settings(
                        allow_reset=True,
                        is_persistent=True
                    )
                )

                # Open the collection, creating it on first run. This replaces
                # a bare ``except:`` fallback that masked unrelated errors.
                self.collection = self.chroma_client.get_or_create_collection(
                    name=collection_name,
                    embedding_function=CustomEmbeddingFunction()
                )

        return PersistentUnifiedDocumentProcessor(
            groq_api_key,
            persist_dir=st.session_state.CHROMADB_DIR
        )

    def get_processed_files(self) -> dict:
        """Return the files known to the processor, grouped by type.

        Returns:
            The result of ``processor.get_available_files()``, or
            ``{'pdf': [], 'xml': []}`` when no processor exists or the lookup
            raises (the error is shown with ``st.error``).
        """
        try:
            processor = st.session_state.processor
            if processor:
                return processor.get_available_files()
        except Exception as e:
            st.error(f"Error getting processed files: {e}")
        return dict(pdf=[], xml=[])

    def run(self):
        """Render the title and sidebar, then dispatch to the selected page."""
        st.title("Document Assistant")

        # Create sidebar for navigation
        st.sidebar.title("Navigation")
        page = st.sidebar.selectbox(
            "Choose a page",
            ["Upload & Process", "Query"]
        )

        # Add sidebar information
        with st.sidebar.expander("About"):
            st.write("""
            This application allows you to:
            - Upload PDF and XML documents
            - Process them for semantic search
            - Query the documents with different levels of detail
            """)

        if page == "Upload & Process":
            self.upload_and_process_page()
        else:
            self.qa_page()

    @staticmethod
    def _extension_key(filename):
        """Return the lower-cased extension of *filename* without the dot."""
        return os.path.splitext(filename)[1][1:].lower()

    @staticmethod
    def _strip_icon(label):
        """Drop the leading icon (everything up to the first space)."""
        return label.split(' ', 1)[1]

    def _process_upload(self, uploaded_file):
        """Process one uploaded file and report progress for it."""
        # Create progress bar and status container
        bar_col, status_col = st.columns([3, 1])
        with bar_col:
            progress_bar = st.progress(0)
        with status_col:
            status_text = st.empty()

        # Lower-case key so 'X.PDF' lands in the same bucket as 'x.pdf'.
        file_ext = self._extension_key(uploaded_file.name)
        processed = st.session_state.processed_files
        if uploaded_file.name in processed.get(file_ext, []):
            status_text.info("Already processed")
            progress_bar.progress(100)
            return

        temp_path = None
        try:
            # Create a temporary file
            suffix = os.path.splitext(uploaded_file.name)[1]
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                # Remember the path before writing so a failed write is
                # still cleaned up in ``finally``.
                temp_path = tmp_file.name
                tmp_file.write(uploaded_file.getbuffer())

            # Process the file
            status_text.info('Processing...')
            progress_bar.progress(25)

            # NOTE(review): the processor only sees the temp path; if it
            # derives the stored source name from that path, the names
            # returned by get_available_files() will not match the uploaded
            # file names -- verify against UnifiedDocumentProcessor.
            result = st.session_state.processor.process_file(temp_path)
            progress_bar.progress(75)

            if result['success']:
                processed.setdefault(file_ext, []).append(uploaded_file.name)
                progress_bar.progress(100)
                status_text.success("✓ Success")
            else:
                progress_bar.progress(100)
                status_text.error("✗ Failed")
                st.error(f"Failed to process {uploaded_file.name}: {result['error']}")

        except Exception as e:
            status_text.error("✗ Error")
            st.error(f"Error processing {uploaded_file.name}: {e}")
        finally:
            # Clean up temporary file (best effort)
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass

    def upload_and_process_page(self):
        """Render the upload page: instructions, uploader and file listing."""
        st.header("Upload and Process Documents")

        # Add instructions
        with st.expander("Instructions", expanded=True):
            st.write("""
            1. Click 'Browse files' to select documents
            2. You can select multiple files at once
            3. Supported formats: PDF and XML
            4. Wait for processing to complete
            5. Processed files will be listed below
            """)

        # File uploader
        uploaded_files = st.file_uploader(
            "Upload PDF or XML files",
            type=['pdf', 'xml'],
            accept_multiple_files=True,
            help="Select one or more PDF or XML files to upload"
        )

        if uploaded_files:
            if st.session_state.processor is None:
                # Initialization failed earlier; avoid one confusing
                # AttributeError per uploaded file.
                st.error("Document processor is not available; uploaded files cannot be processed.")
            else:
                for uploaded_file in uploaded_files:
                    self._process_upload(uploaded_file)

        # Display processed files
        processed = st.session_state.processed_files
        if any(processed.values()):
            st.subheader("Processed Files")

            xml_col, pdf_col = st.columns(2)
            listings = (
                (xml_col, 'xml', "📱 XML Files:"),
                (pdf_col, 'pdf', "📄 PDF Files:"),
            )
            for column, key, heading in listings:
                with column:
                    names = processed.get(key)
                    if names:
                        st.write(heading)
                        for name in sorted(names):
                            st.text(f"  • {name}")

    @staticmethod
    def _render_result_item(item, show_hierarchy=False):
        """Render one search hit inside an expander.

        Args:
            item: Result dict with ``metadata``, ``relevance_score`` and
                ``content`` keys; XML hits also carry ``source_info``.
            show_hierarchy: When true, also show ``parent_info`` and
                ``children_info`` of XML hits if present.
        """
        metadata = item['metadata']
        content_type = metadata['content_type']
        with st.expander(f"Source: {metadata['source_file']} ({content_type.upper()})"):
            st.write(f"Relevance Score: {item['relevance_score']:.2f}")
            if content_type == 'xml':
                st.write(f"XML Path: {item['source_info']['path']}")
                if show_hierarchy:
                    if 'parent_info' in item:
                        st.write("Parent Element:", item['parent_info']['content'])
                    if 'children_info' in item:
                        st.write("Related Elements:")
                        for child in item['children_info']:
                            st.write(f"- {child['content']}")
            st.write("Content:", item['content'])

    def _show_quick_answer(self, question, selected_files):
        """Run the concise query and print the answer."""
        try:
            with st.spinner("Getting quick answer..."):
                answer = st.session_state.processor.ask_question_selective(
                    question,
                    selected_files
                )
                st.write("Answer:", answer)
        except Exception as e:
            st.error(f"Error getting answer: {e}")

    def _show_detailed_answer(self, question, selected_files):
        """Run the detailed query and list each hit with its relevance."""
        try:
            with st.spinner("Getting detailed answer..."):
                result = st.session_state.processor.get_detailed_context(
                    question,
                    selected_files
                )
                if result['success']:
                    st.write("### Relevant Information")
                    for item in result['results']:
                        self._render_result_item(item)
                else:
                    st.error(result['error'])
        except Exception as e:
            st.error(f"Error getting detailed answer: {e}")

    def _show_complete_analysis(self, question, selected_files):
        """Run the full analysis: summary plus hits with XML hierarchy."""
        try:
            with st.spinner("Performing complete analysis..."):
                result = st.session_state.processor.get_summary_and_details(
                    question,
                    selected_files
                )
                if result['success']:
                    st.write("### Summary")
                    st.write(result['summary'])

                    st.write("### Detailed Information")
                    for item in result['details']:
                        self._render_result_item(item, show_hierarchy=True)
                else:
                    st.error(result['error'])
        except Exception as e:
            st.error(f"Error getting complete analysis: {e}")

    def qa_page(self):
        """Render the query page: file selection, question box and actions."""
        st.header("Query Documents")

        try:
            # Refresh available files
            st.session_state.processed_files = self.get_processed_files()
            processed = st.session_state.processed_files

            if not any(processed.values()):
                st.warning("No processed files available. Please upload and process some files first.")
                return

            # Create combined list of files with icons
            all_files = [f"📱 {name}" for name in processed.get('xml', [])]
            all_files += [f"📄 {name}" for name in processed.get('pdf', [])]

            # Still needed: the dict may only hold keys other than xml/pdf.
            if not all_files:
                st.warning("No processed files available. Please upload and process some files first.")
                return

            # Add query instructions
            with st.expander("Query Instructions", expanded=True):
                st.write("""
                Choose your query type:
                - **Quick Answer**: Basic response with essential information
                - **Detailed Answer**: Shows sources and relevance with expandable details
                - **Complete Analysis**: Provides summary and full breakdown with XML hierarchies
                """)

            # File selection
            selected_labels = st.multiselect(
                "Select files to search through",
                sorted(all_files),
                default=all_files,
                help="Choose which files to include in your search"
            )

            # Remove icons from selected files
            selected_files = [self._strip_icon(label) for label in selected_labels]

            if not selected_files:
                st.warning("Please select at least one file to search through.")
                return

            # Question input
            question = st.text_input(
                "Enter your question:",
                help="Type your question here and choose a query type below"
            )

            if question:
                quick_col, detail_col, full_col = st.columns(3)

                with quick_col:
                    if st.button("Quick Answer", help="Get a concise answer quickly"):
                        self._show_quick_answer(question, selected_files)

                with detail_col:
                    if st.button("Detailed Answer", help="Get answer with sources and relevance scores"):
                        self._show_detailed_answer(question, selected_files)

                with full_col:
                    if st.button("Complete Analysis", help="Get comprehensive analysis with XML hierarchy"):
                        self._show_complete_analysis(question, selected_files)

        except Exception as e:
            st.error(f"Error in Q&A interface: {e}")

def main():
    """Configure the page, prepare session state and run the application."""
    # Page config is set first, before any other Streamlit call.
    st.set_page_config(
        page_title="Document Assistant",
        # Repaired from a mis-decoded byte sequence to the intended emoji.
        page_icon="📚",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Session keys must exist before StreamlitDocProcessor reads them.
    initialize_session_state()

    # Create and run app
    StreamlitDocProcessor().run()


if __name__ == "__main__":
    main()