# This file is only used for Streamlit UI development and testing.
# It is not part of the main application.
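#
# To run it locally (standard Streamlit invocation; the path is illustrative):
#   streamlit run <path-to-this-file>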
import streamlit as st
import asyncio
import time
from datetime import datetime
from src.crawlgpt.core.LLMBasedCrawler import Model
from src.crawlgpt.utils.monitoring import MetricsCollector, Metrics
from src.crawlgpt.utils.progress import ProgressTracker
from src.crawlgpt.utils.data_manager import DataManager
from src.crawlgpt.utils.content_validator import ContentValidator
import json
# Streamlit app title and description
st.title("CrawlGPT ππ€")
st.write(
"This app extracts content from a URL, stores it in a vector database, and generates responses to user queries. "
"It also summarizes extracted content for efficient retrieval."
)
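# The UI below has four parts: sidebar metrics and backup/restore, URL
# extraction, RAG-based querying, and a debug/clear section.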
# Initialize components in session state
if "model" not in st.session_state:
st.session_state.model = Model()
st.session_state.data_manager = DataManager()
st.session_state.content_validator = ContentValidator()
if "use_summary" not in st.session_state:
st.session_state.use_summary = True
if "metrics" not in st.session_state:
st.session_state.metrics = MetricsCollector()
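# Streamlit reruns this script from top to bottom on every interaction;
# st.session_state keeps these objects alive across reruns so the model and
# metrics are built only once. Note that use_summary is initialized here but
# never read in this file; the RAG-type radio below controls summarization.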
model = st.session_state.model
# Sidebar for metrics and monitoring
with st.sidebar:
    st.subheader("📊 System Metrics")
    metrics = st.session_state.metrics.metrics.to_dict()
    st.metric("Total Requests", metrics["total_requests"])
    st.metric("Success Rate", f"{(metrics['successful_requests'] / max(metrics['total_requests'], 1)) * 100:.1f}%")
    st.metric("Avg Response Time", f"{metrics['average_response_time']:.2f}s")

    # Export/Import Data
    st.subheader("💾 Data Management")
    if st.button("Export Current State"):
        try:
            export_data = {
                "metrics": metrics,
                "vector_database": model.database.to_dict()
            }
            export_json = json.dumps(export_data)
            st.session_state.export_json = export_json
            st.success("Data exported successfully!")
        except Exception as e:
            st.error(f"Export failed: {e}")

    if "export_json" in st.session_state:
        st.download_button(
            label="Download Backup",
            data=st.session_state.export_json,
            file_name=f"crawlgpt_backup_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json",
            mime="application/json"
        )

    uploaded_file = st.file_uploader("Import Previous State", type=['json'])
    if uploaded_file is not None:
        try:
            # Read the uploaded file content
            uploaded_file_content = uploaded_file.read()
            # Parse the JSON content
            imported_data = json.loads(uploaded_file_content)
            # Import the state
            model.import_state(imported_data)
            st.success("Data imported successfully!")
        except Exception as e:
            st.error(f"Import failed: {e}")
# URL input and content extraction
url = st.text_input("Enter URL:", help="Provide the URL to extract content from.")
if st.button("Extract and Store Content"):
    if not url.strip():
        st.warning("Please enter a valid URL.")
    else:
        # Create a progress bar
        progress_bar = st.progress(0)
        status_text = st.empty()

        try:
            # Validate URL
            if not st.session_state.content_validator.is_valid_url(url):
                st.error("Invalid URL format")
            else:
                async def extract_content():
                    start_time = time.time()
                    # Initialize progress tracker
                    progress = ProgressTracker(
                        total_steps=4,
                        operation_name="content_extraction"
                    )
                    try:
                        # Update progress for each step
                        status_text.text("Validating URL...")
                        progress_bar.progress(25)

                        status_text.text("Crawling content...")
                        progress_bar.progress(50)
                        await model.extract_content_from_url(url)

                        status_text.text("Processing content...")
                        progress_bar.progress(75)

                        status_text.text("Storing in database...")
                        progress_bar.progress(100)

                        # Record metrics
                        st.session_state.metrics.record_request(
                            success=True,
                            response_time=time.time() - start_time,
                            tokens_used=len(model.context.split())
                        )

                        st.success("Content extracted and stored successfully.")
                        st.write("Extracted Content Preview:")
                        st.write(model.context[:500])
                    except Exception as e:
                        st.session_state.metrics.record_request(
                            success=False,
                            response_time=time.time() - start_time,
                            tokens_used=0
                        )
                        raise e
                    finally:
                        status_text.empty()
                        progress_bar.empty()

                # Run the asynchronous function
                asyncio.run(extract_content())
        except Exception as e:
            st.error(f"Error extracting content: {e}")
# Query section with RAG type selection
rag_type = st.radio(
"Select RAG Type:",
("Normal RAG", "Summarized RAG")
)
query = st.text_input("Ask a question:", help="Enter a query to retrieve context and generate a response.")
temperature = st.slider("Temperature", 0.0, 1.0, 0.7)
max_tokens = st.slider("Max Tokens", 50, 1000, 200)
model_id = st.text_input("Model ID", "llama-3.1-8b-instant")
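# Generation parameters: temperature trades determinism for variety and
# max_tokens caps the response length. The default model_id
# "llama-3.1-8b-instant" follows Groq's model naming, but the actual backend
# is whatever LLMBasedCrawler is configured with (an assumption, not
# something this file shows).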
if st.button("Get Response"):
if not query.strip():
st.warning("Please enter a query.")
else:
try:
start_time = time.time()
if rag_type == "Normal RAG":
response = model.generate_response(
query,
temperature,
max_tokens,
model_id,
use_summary=False
)
else:
response = model.generate_response(
query,
temperature,
max_tokens,
model_id,
use_summary=True
)
# Record metrics
st.session_state.metrics.record_request(
success=True,
response_time=time.time() - start_time,
tokens_used=len(response.split())
)
st.write("Generated Response:")
st.write(response)
except Exception as e:
st.session_state.metrics.record_request(
success=False,
response_time=time.time() - start_time,
tokens_used=0
)
st.error(f"Error generating response: {e}")
# Enhanced debug section
if st.checkbox("Show Debug Info"):
st.subheader("π Debug Information")
# System Status
st.write("System Status:")
col1, col2 = st.columns(2)
with col1:
st.write("Cache Information:")
st.write(model.cache)
with col2:
st.write("Current Metrics:")
st.write(metrics)
# Content Preview
st.write("Current Context Preview:")
st.write(model.context[:500])
# Clear functionality with confirmation
if st.button("Clear All Data"):
if st.checkbox("Confirm Clear"):
try:
model.clear()
st.session_state.metrics = MetricsCollector() # Reset metrics
st.success("All data cleared successfully.")
except Exception as e:
st.error(f"Error clearing data: {e}") |