# This file is only used for Streamlit UI development and testing.
# It is not part of the main application.
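#
# A quick way to launch it during development (the path is illustrative;
# adjust to wherever this file lives in the repo):
#
#     streamlit run src/crawlgpt/ui/streamlit_app.py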

import asyncio
import json
import time
from datetime import datetime

import streamlit as st

from src.crawlgpt.core.LLMBasedCrawler import Model
from src.crawlgpt.utils.content_validator import ContentValidator
from src.crawlgpt.utils.data_manager import DataManager
from src.crawlgpt.utils.monitoring import MetricsCollector
from src.crawlgpt.utils.progress import ProgressTracker

# Streamlit app title and description
st.title("CrawlGPT πŸš€πŸ€–")
st.write(
    "This app extracts content from a URL, stores it in a vector database, and generates responses to user queries. "
    "It also summarizes extracted content for efficient retrieval."
)

# Initialize components in session state
if "model" not in st.session_state:
    st.session_state.model = Model()
    st.session_state.data_manager = DataManager()
    st.session_state.content_validator = ContentValidator()

if "use_summary" not in st.session_state:
    st.session_state.use_summary = True

if "metrics" not in st.session_state:
    st.session_state.metrics = MetricsCollector()
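
# Streamlit re-runs this whole script on every widget interaction, so
# long-lived objects (the model, validators, the metrics collector) are
# kept in st.session_state instead of being rebuilt on each run.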

model = st.session_state.model

# Sidebar for metrics and monitoring
with st.sidebar:
    st.subheader("πŸ“Š System Metrics")
    metrics = st.session_state.metrics.metrics.to_dict()
    st.metric("Total Requests", metrics["total_requests"])
    st.metric("Success Rate", f"{(metrics['successful_requests']/max(metrics['total_requests'], 1))*100:.1f}%")
    st.metric("Avg Response Time", f"{metrics['average_response_time']:.2f}s")
    
    # Export/Import Data
    st.subheader("πŸ’Ύ Data Management")
    if st.button("Export Current State"):
        try:
            export_data = {
                "metrics": metrics,
                "vector_database": model.database.to_dict()
            }
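            # (Assumes model.database.to_dict() returns JSON-serializable
            # values; anything like numpy arrays would need converting
            # before json.dumps.)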
            export_json = json.dumps(export_data)
            st.session_state.export_json = export_json
            st.success("Data exported successfully!")
        except Exception as e:
            st.error(f"Export failed: {e}")

    if "export_json" in st.session_state:
        st.download_button(
            label="Download Backup",
            data=st.session_state.export_json,
            file_name=f"crawlgpt_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json"
        )

    uploaded_file = st.file_uploader("Import Previous State", type=['json'])
    if uploaded_file is not None:
        try:
            # Read the uploaded file content
            uploaded_file_content = uploaded_file.read()
            # Parse the JSON content
            imported_data = json.loads(uploaded_file_content)
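            # A light sanity check (assumption: backups always carry the two
            # top-level keys written by the export above):
            if not {"metrics", "vector_database"} <= imported_data.keys():
                raise ValueError("Backup file is missing expected keys")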
            # Import the state
            model.import_state(imported_data)
            st.success("Data imported successfully!")
        except Exception as e:
            st.error(f"Import failed: {e}")

# URL input and content extraction
url = st.text_input("Enter URL:", help="Provide the URL to extract content from.")

if st.button("Extract and Store Content"):
    if not url.strip():
        st.warning("Please enter a valid URL.")
    else:
        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()
        
        try:
            # Validate URL
            if not st.session_state.content_validator.is_valid_url(url):
                st.error("Invalid URL format")
            else:
                async def extract_content():
                    start_time = time.time()
                    
                    # Initialize progress tracker
                    progress = ProgressTracker(
                        total_steps=4,
                        operation_name="content_extraction"
                    )
                    
                    try:
                        # Update progress for each step (the URL itself was
                        # already validated above)
                        status_text.text("Preparing crawler...")
                        progress_bar.progress(25)
                        
                        status_text.text("Crawling content...")
                        progress_bar.progress(50)
                        await model.extract_content_from_url(url)
                        
                        status_text.text("Processing content...")
                        progress_bar.progress(75)
                        
                        status_text.text("Storing in database...")
                        progress_bar.progress(100)
                        
                        # Record metrics
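                        # (tokens_used here is a whitespace word count, a
                        # rough proxy for real token usage rather than a
                        # tokenizer count)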
                        st.session_state.metrics.record_request(
                            success=True,
                            response_time=time.time() - start_time,
                            tokens_used=len(model.context.split())
                        )
                        
                        st.success("Content extracted and stored successfully.")
                        st.write("Extracted Content Preview:")
                        st.write(model.context[:500])
                        
                    except Exception as e:
                        st.session_state.metrics.record_request(
                            success=False,
                            response_time=time.time() - start_time,
                            tokens_used=0
                        )
                        raise
                    finally:
                        status_text.empty()
                        progress_bar.empty()

                # Run the asynchronous function
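                # (asyncio.run creates and closes its own event loop; the
                # Streamlit script thread doesn't run a loop of its own,
                # so this call is safe here)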
                asyncio.run(extract_content())
                
        except Exception as e:
            st.error(f"Error extracting content: {e}")

# Query section with RAG type selection
rag_type = st.radio(
    "Select RAG Type:",
    ("Normal RAG", "Summarized RAG")
)

query = st.text_input("Ask a question:", help="Enter a query to retrieve context and generate a response.")
temperature = st.slider("Temperature", 0.0, 1.0, 0.7)
max_tokens = st.slider("Max Tokens", 50, 1000, 200)
model_id = st.text_input("Model ID", "llama-3.1-8b-instant")

if st.button("Get Response"):
    if not query.strip():
        st.warning("Please enter a query.")
    else:
        try:
            start_time = time.time()
            
            # The radio selection just toggles use_summary; a single call
            # covers both RAG types
            use_summary = rag_type == "Summarized RAG"
            response = model.generate_response(
                query,
                temperature,
                max_tokens,
                model_id,
                use_summary=use_summary
            )
            
            # Record metrics
            st.session_state.metrics.record_request(
                success=True,
                response_time=time.time() - start_time,
                tokens_used=len(response.split())
            )
            
            st.write("Generated Response:")
            st.write(response)
            
        except Exception as e:
            st.session_state.metrics.record_request(
                success=False,
                response_time=time.time() - start_time,
                tokens_used=0
            )
            st.error(f"Error generating response: {e}")

# Enhanced debug section
if st.checkbox("Show Debug Info"):
    st.subheader("πŸ” Debug Information")
    
    # System Status
    st.write("System Status:")
    col1, col2 = st.columns(2)
    
    with col1:
        st.write("Cache Information:")
        st.write(model.cache)
    
    with col2:
        st.write("Current Metrics:")
        st.write(metrics)
    
    # Content Preview
    st.write("Current Context Preview:")
    st.write(model.context[:500])

# Clear functionality with confirmation
# (The confirmation checkbox must be rendered *before* the button: a widget
# created inside a button's if-block vanishes on the next rerun, so the
# original button-then-checkbox order could never actually confirm.)
confirm_clear = st.checkbox("Confirm Clear")
if st.button("Clear All Data"):
    if not confirm_clear:
        st.warning("Tick 'Confirm Clear' first to clear all data.")
    else:
        try:
            model.clear()
            st.session_state.metrics = MetricsCollector()  # Reset metrics
            st.success("All data cleared successfully.")
        except Exception as e:
            st.error(f"Error clearing data: {e}")