# crawlgpt/src/crawlgpt/ui/chat_app.py
# Description: Streamlit app for the chat interface of the CrawlGPT system with user authentication
import streamlit as st
import asyncio
import time
from datetime import datetime
import json
from src.crawlgpt.core.LLMBasedCrawler import Model
from src.crawlgpt.core.database import save_chat_message, get_chat_history, delete_user_chat_history, restore_chat_history
from src.crawlgpt.utils.monitoring import MetricsCollector, Metrics
from src.crawlgpt.utils.progress import ProgressTracker
from src.crawlgpt.utils.data_manager import DataManager
from src.crawlgpt.utils.content_validator import ContentValidator
from src.crawlgpt.ui.login import show_login

# Check authentication before any other processing
if 'user' not in st.session_state:
    show_login()
    st.stop()  # Stop execution if not logged in

# Home Page Setup 
st.set_page_config(
    page_title="CrawlGPT πŸš€πŸ€–", 
    page_icon="πŸ€–", 
    layout="centered",
)

# Streamlit app title and description
st.title("CrawlGPT πŸš€πŸ€–")
st.write(
    "This app extracts content from a URL, stores it in a vector database, and generates responses to user queries. "
    "It also summarizes extracted content for efficient retrieval."
)

# Initialize components in session state
# (chat messages are intentionally not initialized here so the saved history below is loaded on first run)
if "model" not in st.session_state:
    st.session_state.model = Model()
    st.session_state.data_manager = DataManager()
    st.session_state.content_validator = ContentValidator()
    st.session_state.url_processed = False

if "use_summary" not in st.session_state:
    st.session_state.use_summary = True

if "metrics" not in st.session_state:
    st.session_state.metrics = MetricsCollector()

# Load chat history from database
if "messages" not in st.session_state:
    st.session_state.messages = []
    # Load user's chat history from database
    history = get_chat_history(st.session_state.user.id)
    st.session_state.messages = [{
        "role": msg.role,
        "content": msg.message,
        "context": msg.context,
        "timestamp": msg.timestamp
    } for msg in history]

model = st.session_state.model

def load_chat_history():
    """Loads chat history and model state from database"""
    try:
        # Clear existing model state
        model.clear()
        
        # Load messages
        st.session_state.messages = restore_chat_history(st.session_state.user.id)
        
        # Rebuild model context from chat history
        context_parts = [
            msg['context'] for msg in st.session_state.messages 
            if msg.get('context')
        ]
        model.context = "\n".join(context_parts)
        
        # Rebuild vector database from context
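        # Note: this re-chunks and re-summarizes the stored context, so restoring a long conversation may take a while.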
        if model.context:
            chunks = model.chunk_text(model.context)
            summaries = [model.summarizer.generate_summary(chunk) for chunk in chunks]
            model.database.add_data(chunks, summaries)
            st.session_state.url_processed = True
            
        # No explicit rerun needed: the restored messages are rendered later in this same script run.
        
    except Exception as e:
        st.error(f"Restoration failed: {str(e)}")

# Sidebar implementation
with st.sidebar:
    st.subheader(f"πŸ‘€ User: {st.session_state.user.username}")
    
    st.subheader("πŸ“Š System Metrics")
    metrics = st.session_state.metrics.metrics.to_dict()
    st.metric("Total Requests", metrics["total_requests"])
    st.metric("Success Rate", f"{(metrics['successful_requests']/max(metrics['total_requests'], 1))*100:.1f}%")
    st.metric("Avg Response Time", f"{metrics['average_response_time']:.2f}s")
    
    # RAG Settings
    st.subheader("πŸ”§ RAG Settings")
    st.session_state.use_summary = st.checkbox("Use Summarized RAG", value=st.session_state.use_summary, help="Disable summarization when working with coding documentation.")
    st.subheader("πŸ€– Normal LLM Settings")
    temperature = st.slider("Temperature", 0.0, 1.0, 0.7, help="Controls the randomness of the generated text. Lower values are more deterministic.")
    max_tokens = st.slider("Max Tokens", 500, 10000, 5000, help="Maximum number of tokens to generate in the response.")
    model_id = st.radio("Model ID", ['llama-3.1-8b-instant', 'llama-3.3-70b-versatile', 'mixtral-8x7b-32768'], help="Choose the model to use for generating responses.")
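    # The selected model ID is passed straight through to model.generate_response below; only these options are exposed in the UI.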
    
    # Export/Import Data
    st.subheader("πŸ’Ύ Data Management")
    if st.button("Export Current State"):
        try:
            export_data = {
                "metrics": metrics,
                "vector_database": model.database.to_dict(),
                "messages": st.session_state.messages
            }
            export_json = json.dumps(export_data)
            st.session_state.export_json = export_json
            st.success("Data exported successfully!")
        except Exception as e:
            st.error(f"Export failed: {e}")

    if "export_json" in st.session_state:
        st.download_button(
            label="Download Backup",
            data=st.session_state.export_json,
            file_name=f"crawlgpt_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json"
        )

    uploaded_file = st.file_uploader("Import Previous State", type=['json'])
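    # The uploaded backup is expected to match the structure produced by "Export Current State" above: metrics, vector_database, and messages.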
    if uploaded_file is not None:
        try:
            imported_data = json.loads(uploaded_file.read())
            
            # Validate imported data structure
            required_keys = ["metrics", "vector_database", "messages"]
            if not all(key in imported_data for key in required_keys):
                raise ValueError("Invalid backup file structure")
                
            # Import data with proper state management
            model.import_state(imported_data)
            
            # Restore chat history and context
            if "messages" in imported_data:
                st.session_state.messages = imported_data["messages"]
                
            # Set URL processed state if there's context
            if model.context:
                st.session_state.url_processed = True
            else:
                st.session_state.url_processed = False
                
            # Update metrics
            if "metrics" in imported_data:
                st.session_state.metrics = MetricsCollector()
                st.session_state.metrics.metrics = Metrics.from_dict(imported_data["metrics"])
                
            st.success("Data imported successfully! You can continue chatting.")
            
        except Exception as e:
            st.error(f"Import failed: {e}")
            st.session_state.url_processed = False
            
    if st.button("♻️ Restore Full Chat State"):
        with st.spinner("Rebuilding AI context..."):
            load_chat_history()
        st.success("Full conversation state restored!")

# URL Processing Section
url_col1, url_col2 = st.columns([3, 1])
with url_col1:
    url = st.text_input("Enter URL:", help="Provide the URL to extract content from.")
with url_col2:
    process_url = st.button("Process URL")

if process_url:
    if not url or not url.strip():
        st.warning("Please enter a valid URL.")
    else:
        progress_bar = st.progress(0)
        status_text = st.empty()
        
        try:
            if not st.session_state.content_validator.is_valid_url(url):
                st.error("Invalid URL format")
            else:
                async def extract_content():
                    start_time = time.time()
                    progress = ProgressTracker(total_steps=4, operation_name="content_extraction")
                    
                    try:
                        status_text.text("Validating URL...")
                        progress_bar.progress(25)
                        
                        status_text.text("Crawling content...")
                        progress_bar.progress(50)
                        success, msg = await model.extract_content_from_url(url)
                        
                        if success:
                            status_text.text("Processing content...")
                            progress_bar.progress(75)
                            
                            status_text.text("Storing in database...")
                            progress_bar.progress(100)
                            
                            st.session_state.metrics.record_request(
                                success=True,
                                response_time=time.time() - start_time,
                                tokens_used=len(model.context.split())
                            )
                            
                            st.session_state.url_processed = True
                            st.session_state.messages.append({
                                "role": "system",
                                "content": f"Content from {url} processed",
                                "context": model.context  # Store full context
                            })
                        else:
                            raise Exception(msg)
                            
                    except Exception as e:
                        st.session_state.metrics.record_request(
                            success=False,
                            response_time=time.time() - start_time,
                            tokens_used=0
                        )
                        raise e
                    finally:
                        status_text.empty()
                        progress_bar.empty()

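                # Drive the async crawler to completion; the Streamlit script blocks here until extraction finishes.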
                asyncio.run(extract_content())
                
        except Exception as e:
            st.error(f"Error processing URL: {e}")

# Chat Interface
st.subheader("πŸ’­ Chat Interface")

# Display chat messages
chat_container = st.container()
with chat_container:
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.write(message["content"])

# Chat input
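# (disabled until content has been processed from a URL or restored/imported from a backup)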
if chat_input := st.chat_input("Ask about the content...", disabled=not st.session_state.url_processed):
    # Display user message
    with st.chat_message("user"):
        st.write(chat_input)
    
    # Add user message to history and database
    st.session_state.messages.append({"role": "user", "content": chat_input})
    save_chat_message(
        st.session_state.user.id,
        chat_input,
        "user",
        model.context  # Store full context
    )
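    # The full crawled context is stored with every message so the vector database can be rebuilt when the chat state is restored.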
    try:
        start_time = time.time()
        
        # Show typing indicator
        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                response = model.generate_response(
                    chat_input,
                    temperature,
                    max_tokens,
                    model_id,
                    use_summary=st.session_state.use_summary
                )
                st.write(response)
        
        # Add assistant response to history and database
        st.session_state.messages.append({"role": "assistant", "content": response})
        save_chat_message(
            st.session_state.user.id,
            response,  # Save the assistant's response text
            "assistant",
            model.context
        )
        # Record metrics
        st.session_state.metrics.record_request(
            success=True,
            response_time=time.time() - start_time,
            tokens_used=len(response.split())
        )

    except Exception as e:
        st.session_state.metrics.record_request(
            success=False,
            response_time=time.time() - start_time,
            tokens_used=0
        )
        st.error(f"Error generating response: {e}")

# Debug and Clear Options
col1, col2 = st.columns(2)
with col1:
    if st.button("Clear Chat History"):
        try:
            delete_user_chat_history(st.session_state.user.id)
            st.session_state.messages = []
            st.session_state.url_processed = False
            st.success("Chat history cleared!")
            st.rerun()
        except Exception as e:
            st.error(f"Error clearing history: {e}")

with col2:
    # The confirmation checkbox is rendered outside the button handler so its state survives the rerun triggered by the click.
    confirm_clear = st.checkbox("Confirm Clear")
    if st.button("Clear All Data"):
        if not confirm_clear:
            st.warning("Tick 'Confirm Clear' before clearing all data.")
        else:
            try:
                model.clear()
                st.session_state.messages = []
                delete_user_chat_history(st.session_state.user.id)
                st.session_state.url_processed = False
                st.session_state.metrics = MetricsCollector()
                st.success("All data cleared successfully.")
            except Exception as e:
                st.error(f"Error clearing data: {e}")

# Debug Information
if st.checkbox("Show Debug Info"):
    st.subheader("πŸ” Debug Information")
    col1, col2 = st.columns(2)
    
    with col1:
        st.write("Cache Information:")
        st.write(model.cache)
    
    with col2:
        st.write("Current Metrics:")
        st.write(metrics)
    
    st.write("Current Context Preview:")
    st.write(model.context[:500] if model.context else "No context available")