poemsforaphrodite committed on
Commit
1c9b44f
1 Parent(s): 3eb6d62

Create app.py

Files changed (1)
  1. app.py +868 -0
app.py ADDED
@@ -0,0 +1,868 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import plotly.express as px
4
+ import numpy as np
5
+ from datetime import datetime, timedelta
6
+ import json
7
+ from pymongo import MongoClient
8
+ from dotenv import load_dotenv
9
+ import os
10
+ import bcrypt
11
+ from openai import OpenAI
12
+ from streamlit_plotly_events import plotly_events
13
+ from pinecone import Pinecone, ServerlessSpec
14
+ import threading # {{ edit_25: Import threading for background processing }}
15
+ import tiktoken
16
+ from tiktoken.core import Encoding
17
+
18
+ # Set page configuration to wide mode
19
+ st.set_page_config(layout="wide")
20
+
21
+ # Load environment variables
22
+ load_dotenv()
23
+
24
+ # MongoDB connection
25
+ mongodb_uri = os.getenv('MONGODB_URI')
26
+ mongo_client = MongoClient(mongodb_uri) # {{ edit_11: Rename MongoDB client to 'mongo_client' }}
27
+ db = mongo_client['llm_evaluation_system']
28
+ users_collection = db['users']
29
+ results_collection = db['evaluation_results']
30
+
31
+ # Initialize OpenAI client
32
+ openai_client = OpenAI() # {{ edit_12: Rename OpenAI client to 'openai_client' }}
33
+
34
+ # Initialize Pinecone
35
+ pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY')) # {{ edit_13: Initialize Pinecone client using Pinecone class }}
36
+
37
+ # Initialize the tokenizer
38
+ tokenizer: Encoding = tiktoken.get_encoding("cl100k_base") # This is suitable for GPT-4 and recent models
39
+
40
+ # Authentication functions
41
+ def hash_password(password):
42
+ return bcrypt.hashpw(password.encode('utf-8'), bcrypt.gensalt())
43
+
44
+ def verify_password(password, hashed_password):
45
+ return bcrypt.checkpw(password.encode('utf-8'), hashed_password)
46
+
47
+ def authenticate(username, password):
48
+ user = users_collection.find_one({"username": username})
49
+ if user and verify_password(password, user['password']):
50
+ return True
51
+ return False
52
+
53
+ def signup(username, password):
54
+ if users_collection.find_one({"username": username}):
55
+ return False
56
+ hashed_password = hash_password(password)
57
+ # {{ edit_1: Initialize models list for the new user }}
58
+ users_collection.insert_one({
59
+ "username": username,
60
+ "password": hashed_password,
61
+ "models": [] # List to store user's models
62
+ })
63
+ return True
64
+ def upload_model(file):  # placeholder; superseded by the full upload_model defined later
65
+ return "Model uploaded successfully!"
66
+
67
+ # Function to perform evaluation (placeholder)
68
+ def evaluate_model(model_identifier, metrics, username, prompt=None, context_dataset=None):  # prompt/context_dataset are only needed for named (RAG) models
69
+ # {{ edit_4: Differentiate between Custom and Named models }}
70
+ user = users_collection.find_one({"username": username})
71
+ models = user.get("models", [])
72
+ selected_model = next((m for m in models if (m.get('model_name') == model_identifier) or (m['model_id'] == model_identifier)), None)
73
+
74
+ if selected_model:
75
+ if selected_model.get("model_type") == "named":
76
+ # For Named Models, use RAG-based evaluation
77
+ return evaluate_named_model(model_identifier, prompt, context_dataset)
78
+ else:
79
+ # For Custom Models, proceed with existing evaluation logic
80
+ results = {metric: round(np.random.rand() * 100, 2) for metric in metrics}
81
+ return results
82
+ else:
83
+ st.error("Selected model not found.")
84
+ return None
85
+
86
+ # Function to generate response using GPT-4o-mini
87
+ def generate_response(prompt, context):
88
+ try:
89
+ response = openai_client.chat.completions.create(
90
+ model="gpt-4o-mini",
91
+ messages=[
92
+ {"role": "system", "content": "You are a helpful assistant."},
93
+ {"role": "user", "content": f"Context: {context}\n\nPrompt: {prompt}"}
94
+ ]
95
+ )
96
+ return response.choices[0].message.content
97
+ except Exception as e:
98
+ st.error(f"Error generating response: {str(e)}")
99
+ return None
100
+
101
+ # Function to clear the results database
102
+ def clear_results_database():
103
+ try:
104
+ results_collection.delete_many({})
105
+ return True
106
+ except Exception as e:
107
+ st.error(f"Error clearing results database: {str(e)}")
108
+ return False
109
+
110
+ # Function to generate embeddings using the specified model
111
+ def generate_embedding(text):
112
+ try:
113
+ embedding_response = openai_client.embeddings.create(
114
+ model="text-embedding-3-large", # {{ edit_3: Use the specified embedding model }}
115
+ input=text,
116
+ encoding_format="float"
117
+ )
118
+ embedding = embedding_response.data[0].embedding
119
+ return embedding
120
+ except Exception as e:
121
+ st.error(f"Error generating embedding: {str(e)}")
122
+ return None
123
+
124
+ # Function to handle Named Model Evaluation using RAG
125
+ def evaluate_named_model(model_name, prompt, context_dataset):
126
+ # {{ edit_4: Implement evaluation using RAG and Pinecone with the specified embedding model }}
127
+ try:
128
+ # Initialize Pinecone index
129
+ index = pinecone_client.Index(os.getenv('PINECONE_INDEX_NAME'))
130
+
131
+ # Generate embedding for the prompt
132
+ prompt_embedding = generate_embedding(prompt)
133
+ if not prompt_embedding:
134
+ st.error("Failed to generate embedding for the prompt.")
135
+ return None
136
+
137
+ # Retrieve relevant context using RAG by querying Pinecone with the embedding
138
+ query_response = index.query(
139
+ top_k=5,
140
+ namespace=model_name,
141
+ include_metadata=True,
142
+ vector=prompt_embedding # {{ edit_5: Use embedding vector for querying }}
143
+ )
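+ # Assumes each match carries its source text in metadata['text'], as written by index_context_data below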
144
+
145
+ # Aggregate retrieved context
146
+ retrieved_context = " ".join([item['metadata']['text'] for item in query_response['matches']])
147
+
148
+ # Generate response using the retrieved context
149
+ response = generate_response(prompt, retrieved_context)
150
+
151
+ # Evaluate the response
152
+ evaluation = teacher_evaluate(prompt, retrieved_context, response)
153
+
154
+ # Save the results
155
+ save_results(st.session_state.user, {"model_id": model_name, "model_name": model_name, "model_type": "named"}, prompt, retrieved_context, response, evaluation)
156
+
157
+ return evaluation
158
+
159
+ except Exception as e:
160
+ st.error(f"Error in evaluating named model: {str(e)}")
161
+ return None
162
+
163
+ # Example: When indexing data to Pinecone, generate embeddings using the specified model
164
+ def index_context_data(model_name, texts):
165
+ try:
166
+ index = pinecone_client.Index(os.getenv('PINECONE_INDEX_NAME'))
167
+ for text in texts:
168
+ embedding = generate_embedding(text)
169
+ if embedding:
170
+ index.upsert([
171
+ {
172
+ "id": f"{model_name}_{hash(text)}",
173
+ "values": embedding,
174
+ "metadata": {"text": text}
175
+ }
176
+ ])
177
+ except Exception as e:
178
+ st.error(f"Error indexing data to Pinecone: {str(e)}")
179
+ def upload_model(file, username, model_type):
180
+ # {{ edit_5: Modify upload_model to handle model_type }}
181
+ model_id = f"{username}_model_{int(datetime.now().timestamp())}"
182
+ if model_type == "custom":
183
+ # Save the model file as needed
184
+ os.makedirs("models", exist_ok=True)  # ensure the models directory exists
+ model_path = os.path.join("models", f"{model_id}.bin")
185
+ with open(model_path, "wb") as f:
186
+ f.write(file.getbuffer())
187
+
188
+ # Update user's models list
189
+ users_collection.update_one(
190
+ {"username": username},
191
+ {"$push": {"models": {
192
+ "model_id": model_id,
193
+ "file_path": model_path,
194
+ "uploaded_at": datetime.now(),
195
+ "model_type": "custom"
196
+ }}}
197
+ )
198
+ return f"Custom Model {model_id} uploaded successfully!"
199
+ elif model_type == "named":
200
+ # For Named Models, assume the model is managed externally (e.g., via Pinecone)
201
+ users_collection.update_one(
202
+ {"username": username},
203
+ {"$push": {"models": {
204
+ "model_id": model_id,
205
+ "model_name": None,
206
+ "file_path": None,
207
+ "model_link": None,
208
+ "uploaded_at": datetime.now(),
209
+ "model_type": "named"
210
+ }}}
211
+ )
212
+ return f"Named Model {model_id} registered successfully!"
213
+ else:
214
+ return "Invalid model type specified."
215
+
216
+ # Function to save results to MongoDB
217
+ def save_results(username, model, prompt, context, response, evaluation): # {{ edit_29: Add 'username' parameter }}
218
+ result = {
219
+ "username": username, # Use the passed 'username' parameter
220
+ "model_id": model['model_id'], # {{ edit_19: Associate results with 'model_id' }}
221
+ "model_name": model.get('model_name'),
222
+ "model_type": model.get('model_type', 'custom'), # {{ edit_20: Include 'model_type' in results }}
223
+ "prompt": prompt,
224
+ "context": context,
225
+ "response": response,
226
+ "evaluation": evaluation,
227
+ "timestamp": datetime.now()
228
+ }
229
+ results_collection.insert_one(result)
230
+
231
+ # Function for teacher model evaluation
232
+ def teacher_evaluate(prompt, context, response):
233
+ try:
234
+ evaluation_prompt = f"""
235
+ Evaluate the following response based on the given prompt and context.
236
+ Rate each factor on a scale of 0 to 1, where 1 is the best (or least problematic for negative factors like Hallucination and Bias).
237
+ Please provide scores with two decimal places, and avoid extreme scores of exactly 0 or 1 unless absolutely necessary.
238
+
239
+ Prompt: {prompt}
240
+ Context: {context}
241
+ Response: {response}
242
+
243
+ Factors to evaluate:
244
+ 1. Accuracy: How factually correct is the response?
245
+ 2. Hallucination: To what extent does the response contain made-up information? (Higher score means less hallucination)
246
+ 3. Groundedness: How well is the response grounded in the given context and prompt?
247
+ 4. Relevance: How relevant is the response to the prompt?
248
+ 5. Recall: How much of the relevant information from the context is included in the response?
249
+ 6. Precision: How precise and focused is the response in addressing the prompt?
250
+ 7. Consistency: How consistent is the response with the given information and within itself?
251
+ 8. Bias Detection: To what extent is the response free from bias? (Higher score means less bias)
252
+
253
+ Provide the evaluation as a JSON object. Each factor should be a key mapping to an object containing 'score' and 'explanation'.
254
+ Do not include any additional text, explanations, or markdown formatting.
255
+ """
256
+
257
+ evaluation_response = openai_client.chat.completions.create(
258
+ model="gpt-4o-mini", # Corrected model name
259
+ messages=[
260
+ {"role": "system", "content": "You are an expert evaluator of language model responses."},
261
+ {"role": "user", "content": evaluation_prompt}
262
+ ]
263
+ )
264
+
265
+ content = evaluation_response.choices[0].message.content.strip()
266
+
267
+ # Ensure the response starts and ends with curly braces
268
+ if not (content.startswith("{") and content.endswith("}")):
269
+ st.error("Teacher evaluation did not return a valid JSON object.")
270
+ st.error(f"Response content: {content}")
271
+ return None
272
+
273
+ try:
274
+ evaluation = json.loads(content)
275
+ return evaluation
276
+ except json.JSONDecodeError as e:
277
+ st.error(f"Error decoding evaluation response: {str(e)}")
278
+ st.error(f"Response content: {content}")
279
+ return None
280
+
281
+ except Exception as e:
282
+ st.error(f"Error in teacher evaluation: {str(e)}")
283
+ return None
284
+
285
+ # Function to generate dummy data for demonstration
286
+ def generate_dummy_data():
287
+ dates = pd.date_range(end=datetime.now(), periods=30).tolist()
288
+ metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'Consistency', 'Bias']
289
+ data = {
290
+ 'Date': dates * len(metrics),
291
+ 'Metric': [metric for metric in metrics for _ in range(len(dates))],
292
+ 'Value': np.random.rand(len(dates) * len(metrics)) * 100
293
+ }
294
+ return pd.DataFrame(data)
295
+
296
+ # Function to count tokens
297
+ def count_tokens(text: str) -> int:
298
+ return len(tokenizer.encode(text))
299
+
300
+ # Sidebar Navigation
301
+ st.sidebar.title("LLM Evaluation System")
302
+
303
+ # Session state
304
+ if 'user' not in st.session_state:
305
+ st.session_state.user = None
306
+
307
+ # Authentication
308
+ if not st.session_state.user:
309
+ auth_option = st.sidebar.radio("Choose an option", ["Login", "Signup"])
310
+
311
+ username = st.sidebar.text_input("Username")
312
+ password = st.sidebar.text_input("Password", type="password")
313
+
314
+ if auth_option == "Login":
315
+ if st.sidebar.button("Login"):
316
+ if authenticate(username, password):
317
+ st.session_state.user = username
318
+ st.rerun()
319
+ else:
320
+ st.sidebar.error("Invalid username or password")
321
+ else:
322
+ if st.sidebar.button("Signup"):
323
+ if signup(username, password):
324
+ st.sidebar.success("Signup successful. Please login.")
325
+ else:
326
+ st.sidebar.error("Username already exists")
327
+ else:
328
+ st.sidebar.success(f"Welcome, {st.session_state.user}!")
329
+ if st.sidebar.button("Logout"):
330
+ st.session_state.user = None
331
+ st.rerun()
332
+
333
+ # Add Clear Results Database button
334
+ if st.sidebar.button("Clear Results Database"):
335
+ if clear_results_database(): # {{ edit_fix: Calling the newly defined clear_results_database function }}
336
+ st.sidebar.success("Results database cleared successfully!")
337
+ else:
338
+ st.sidebar.error("Failed to clear results database.")
339
+
340
+ # App content
341
+ if st.session_state.user:
342
+ app_mode = st.sidebar.selectbox("Choose the section", ["Dashboard", "Model Upload", "Evaluation", "Prompt Testing", "Manage Models", "History"]) # {{ edit_add: Added "History" to the sidebar navigation }}
343
+
344
+ if app_mode == "Dashboard":
345
+ st.title("Dashboard")
346
+ st.write("### Real-time Metrics and Performance Insights")
347
+
348
+ # Fetch the user from the database
349
+ user = users_collection.find_one({"username": st.session_state.user})
350
+ if user is None:
351
+ st.error("User not found in the database.")
352
+ st.stop()
353
+ user_models = user.get("models", [])
354
+
355
+ if user_models:
356
+ model_options = [model.get('model_name') or model['model_id'] for model in user_models]
357
+ selected_model = st.selectbox("Select Model to View Metrics", ["All Models"] + model_options)
358
+ else:
359
+ st.error("You have no uploaded models.")
360
+ selected_model = "All Models"
361
+
362
+ try:
363
+ query = {"username": st.session_state.user}
364
+ if selected_model != "All Models":
365
+ query["model_name"] = selected_model
366
+ if not selected_model:
367
+ query = {"username": st.session_state.user, "model_id": selected_model}
368
+ results = list(results_collection.find(query))
369
+ if results:
370
+ df = pd.DataFrame(results)
371
+
372
+ # Count tokens for prompt, context, and response
373
+ df['prompt_tokens'] = df['prompt'].apply(count_tokens)
374
+ df['context_tokens'] = df['context'].apply(count_tokens)
375
+ df['response_tokens'] = df['response'].apply(count_tokens)
376
+
377
+ # Calculate total tokens for each row
378
+ df['total_tokens'] = df['prompt_tokens'] + df['context_tokens'] + df['response_tokens']
379
+
380
+ metrics = ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]
381
+ for metric in metrics:
382
+ df[metric] = df['evaluation'].apply(lambda x: x.get(metric, {}).get('score', 0) if x else 0) * 100
383
+
384
+ df['timestamp'] = pd.to_datetime(df['timestamp'])
385
+ df['query_number'] = range(1, len(df) + 1) # Add query numbers
386
+
387
+ @st.cache_data
388
+ def create_metrics_graph(df, metrics):
389
+ fig = px.line(
390
+ df,
391
+ x='query_number', # Use query numbers on x-axis
392
+ y=metrics,
393
+ title='Metrics Over Queries',
394
+ labels={metric: f"{metric} (%)" for metric in metrics},
395
+ markers=True,
396
+ template='plotly_dark',
397
+ )
398
+ color_discrete_sequence = px.colors.qualitative.Dark24
399
+ for i, metric in enumerate(metrics):
400
+ fig.data[i].line.color = color_discrete_sequence[i % len(color_discrete_sequence)]
401
+ fig.data[i].marker.color = color_discrete_sequence[i % len(color_discrete_sequence)]
402
+ fig.update_layout(
403
+ xaxis_title="Query Number",
404
+ yaxis_title="Metric Score (%)",
405
+ legend_title="Metrics",
406
+ hovermode="x unified",
407
+ margin=dict(l=50, r=50, t=100, b=50),
408
+ height=700 # Increase the height of the graph
409
+ )
410
+ return fig
411
+
412
+ fig = create_metrics_graph(df, metrics)
413
+
414
+ st.plotly_chart(fig, use_container_width=True)
415
+
416
+ # Latest Metrics
417
+ st.subheader("Latest Metrics")
418
+ latest_result = df.iloc[-1] # Get the last row (most recent query)
419
+ latest_metrics = {metric: latest_result[metric] for metric in metrics}
420
+
421
+ cols = st.columns(4)
422
+ for i, (metric, value) in enumerate(latest_metrics.items()):
423
+ with cols[i % 4]:
424
+ color = 'green' if value >= 75 else 'orange' if value >= 50 else 'red'
425
+ st.metric(label=metric, value=f"{value:.2f}%", delta=None)
426
+ st.progress(value / 100)
427
+
428
+ # Detailed Data View
429
+ st.subheader("Detailed Data View")
430
+
431
+ # Calculate aggregate metrics
432
+ total_spans = len(df)
433
+ total_tokens = df['total_tokens'].sum()
434
+
435
+ # Display aggregate metrics
436
+ col1, col2 = st.columns(2)
437
+ with col1:
438
+ st.metric("Total Spans", f"{total_spans:,}")
439
+ with col2:
440
+ st.metric("Total Tokens", f"{total_tokens:,.2f}M" if total_tokens >= 1e6 else f"{total_tokens:,}")
441
+
442
+ # Prepare the data for display
443
+ display_data = []
444
+ for _, row in df.iterrows():
445
+ display_row = {
446
+ "Prompt": row['prompt'][:50] + "...", # Truncate long prompts
447
+ "Context": row['context'][:50] + "...", # Truncate long contexts
448
+ "Response": row['response'][:50] + "...", # Truncate long responses
449
+ }
450
+ # Add metrics to the display row
451
+ for metric in metrics:
452
+ display_row[metric] = row[metric] # Store as float, not string
453
+
454
+ display_data.append(display_row)
455
+
456
+ # Convert to DataFrame for easy display
457
+ display_df = pd.DataFrame(display_data)
458
+
459
+ # Function to color cells based on score
460
+ def color_cells(val):
461
+ if isinstance(val, float):
462
+ if val >= 80:
463
+ color = 'green'
464
+ elif val >= 60:
465
+ color = '#90EE90' # Light green
466
+ else:
467
+ color = 'red'
468
+ return f'background-color: {color}; color: black'
469
+ return ''
470
+
471
+ # Apply the styling only to metric columns
472
+ styled_df = display_df.style.applymap(color_cells, subset=metrics)
473
+
474
+ # Format metric columns as percentages
475
+ for metric in metrics:
476
+ styled_df = styled_df.format({metric: "{:.2f}%"})
477
+
478
+ # Display the table with custom styling
479
+ st.dataframe(
480
+ styled_df.set_properties(**{
481
+ 'color': 'white',
482
+ 'border': '1px solid #ddd'
483
+ }).set_table_styles([
484
+ {'selector': 'th', 'props': [('background-color', '#4CAF50'), ('color', 'white')]},
485
+ {'selector': 'td', 'props': [('text-align', 'left')]},
486
+ # Keep background white for non-metric columns
487
+ {'selector': 'td:nth-child(-n+3)', 'props': [('background-color', 'white !important')]}
488
+ ]),
489
+ use_container_width=True,
490
+ height=400 # Set a fixed height with scrolling
491
+ )
492
+
493
+ # Placeholders for future sections
494
+ st.subheader("Worst Performing Slice Analysis")
495
+ st.info("This section will show analysis of the worst-performing data slices.")
496
+
497
+ st.subheader("UMAP Visualization")
498
+ st.info("This section will contain UMAP visualizations for dimensionality reduction insights.")
499
+ else:
500
+ st.info("No evaluation results available for the selected model.")
501
+ except Exception as e:
502
+ st.error(f"Error fetching data from database: {e}")
503
+ st.error("Detailed error information:")
504
+ st.error(str(e))
505
+ import traceback
506
+ st.error(traceback.format_exc())
507
+
508
+ elif app_mode == "Model Upload":
509
+ st.title("Upload Your Model")
510
+ model_type = st.radio("Select Model Type", ["Custom", "Named"]) # {{ edit_6: Select model type }}
511
+ uploaded_file = st.file_uploader("Choose a model file", type=[".pt", ".h5", ".bin"]) if model_type == "custom" else None
512
+
513
+ if st.button("Upload Model"):
514
+ if model_type == "custom" and uploaded_file is not None:
515
+ result = upload_model(uploaded_file, st.session_state.user, model_type="custom")
516
+ st.success(result)
517
+ elif model_type == "named":
518
+ result = upload_model(None, st.session_state.user, model_type="named")
519
+ st.success(result)
520
+ else:
521
+ st.error("Please upload a valid model file for Custom models.")
522
+
523
+ elif app_mode == "Evaluation":
524
+ st.title("Evaluate Your Model")
525
+ st.write("### Select Model and Evaluation Metrics")
526
+
527
+ # Fetch the user from the database
528
+ user = users_collection.find_one({"username": st.session_state.user})
529
+ if user is None:
530
+ st.error("User not found in the database.")
531
+ st.stop()
532
+ user_models = user.get("models", [])
533
+
534
+ if not user_models:
535
+ st.error("You have no uploaded models. Please upload a model first.")
536
+ else:
537
+ # {{ edit_1: Display model_name instead of model_id }}
538
+ model_identifier = st.selectbox(
539
+ "Choose a Model to Evaluate",
540
+ [model.get('model_name') or model['model_id'] for model in user_models]
541
+ )
542
+
543
+ # {{ edit_2: Remove metrics selection and set fixed metrics }}
544
+ fixed_metrics = ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]
545
+ st.write("### Evaluation Metrics")
546
+ st.write(", ".join(fixed_metrics))
547
+
548
+ # Modify the evaluation function call to use fixed_metrics
549
+ if st.button("Start Evaluation"):
550
+ with st.spinner("Evaluation in progress..."):
551
+ # {{ edit_3: Use fixed_metrics instead of user-selected metrics }}
552
+ results = evaluate_model(model_identifier, fixed_metrics, st.session_state.user)
553
+ # Fetch the current model document
554
+ current_model = next((m for m in user_models if (m.get('model_name') == model_identifier) or (m['model_id'] == model_identifier)), None)
555
+ if current_model:
556
+ save_results(st.session_state.user, current_model, "", "", "", results)  # no prompt/context/response in this placeholder evaluation flow
557
+ st.success("Evaluation Completed!")
558
+ st.json(results)
559
+ else:
560
+ st.error("Selected model not found.")
561
+
562
+ elif app_mode == "Prompt Testing":
563
+ st.title("Prompt Testing")
564
+
+ # Defaults so the "Run Test" checks below don't hit unbound names
+ model_name = None
+ data = None
+ user = users_collection.find_one({"username": st.session_state.user})
+ user_models = user.get("models", []) if user else []
565
+ # {{ edit_6: Use model_name instead of model_id }}
566
+ model_selection_option = st.radio("Select Model Option:", ["Choose Existing Model", "Add New Model"])
567
+
568
+ if model_selection_option == "Choose Existing Model":
569
+ user = users_collection.find_one({"username": st.session_state.user})
570
+ user_models = user.get("models", [])
571
+
572
+ if not user_models:
573
+ st.error("You have no uploaded models. Please upload a model first.")
574
+ else:
575
+ # Display model_name instead of model_id
576
+ model_name = st.selectbox("Select a Model for Testing", [model['model_name'] if model['model_name'] else model['model_id'] for model in user_models])
577
+ else:
578
+ # Option to enter model name or upload a link
579
+ new_model_option = st.radio("Add Model By:", ["Enter Model Name", "Upload Model Link"])
580
+
581
+ if new_model_option == "Enter Model Name":
582
+ model_name_input = st.text_input("Enter New Model Name:")
583
+ if st.button("Save Model Name"):
584
+ if model_name_input:
585
+ # {{ edit_3: Save the new model name to user's models }}
586
+ model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
587
+ users_collection.update_one(
588
+ {"username": st.session_state.user},
589
+ {"$push": {"models": {
590
+ "model_id": model_id,
591
+ "model_name": model_name_input,
592
+ "file_path": None,
593
+ "model_link": None,
594
+ "uploaded_at": datetime.now()
595
+ }}}
596
+ )
597
+ st.success(f"Model '{model_name_input}' saved successfully as {model_id}!")
598
+ model_name = model_name_input # Use model_name instead of model_id
599
+ else:
600
+ st.error("Please enter a valid model name.")
601
+ else:
602
+ model_link = st.text_input("Enter Model Link:")
603
+ if st.button("Save Model Link"):
604
+ if model_link:
605
+ # {{ edit_4: Save the model link to user's models }}
606
+ model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
607
+ users_collection.update_one(
608
+ {"username": st.session_state.user},
609
+ {"$push": {"models": {
610
+ "model_id": model_id,
611
+ "model_name": None,
612
+ "file_path": None,
613
+ "model_link": model_link,
614
+ "uploaded_at": datetime.now()
615
+ }}}
616
+ )
617
+ st.success(f"Model link saved successfully as {model_id}!")
618
+ model_name = model_id # Use model_id if model_name is not available
619
+ else:
620
+ st.error("Please enter a valid model link.")
621
+
622
+ # Two ways to provide prompts
623
+ prompt_input_method = st.radio("Choose prompt input method:", ["Single JSON", "Batch Upload"])
624
+
625
+ if prompt_input_method == "Single JSON":
626
+ json_input = st.text_area("Enter your JSON input:")
627
+ if json_input:
628
+ try:
629
+ data = json.loads(json_input)
630
+ st.success("JSON parsed successfully!")
631
+
632
+ # Display JSON in a table format
633
+ st.subheader("Input Data")
634
+ df = pd.json_normalize(data)
635
+ st.table(df.style.set_properties(**{
636
+ 'background-color': '#f0f8ff',
637
+ 'color': '#333',
638
+ 'border': '1px solid #ddd'
639
+ }).set_table_styles([
640
+ {'selector': 'th', 'props': [('background-color', '#4CAF50'), ('color', 'white')]},
641
+ {'selector': 'td', 'props': [('text-align', 'left')]}
642
+ ]))
643
+ except json.JSONDecodeError:
644
+ st.error("Invalid JSON. Please check your input.")
645
+ else:
646
+ uploaded_file = st.file_uploader("Upload a JSON file with prompts, contexts, and responses", type="json")
647
+ if uploaded_file is not None:
648
+ try:
649
+ data = json.load(uploaded_file)
650
+ st.success("JSON file loaded successfully!")
651
+
652
+ # Display JSON in a table format
653
+ st.subheader("Input Data")
654
+ df = pd.json_normalize(data)
655
+ st.table(df.style.set_properties(**{
656
+ 'background-color': '#f0f8ff',
657
+ 'color': '#333',
658
+ 'border': '1px solid #ddd'
659
+ }).set_table_styles([
660
+ {'selector': 'th', 'props': [('background-color', '#4CAF50'), ('color', 'white')]},
661
+ {'selector': 'td', 'props': [('text-align', 'left')]}
662
+ ]))
663
+ except json.JSONDecodeError:
664
+ st.error("Invalid JSON file. Please check your file contents.")
665
+
666
+ # Function to handle background evaluation
667
+ def run_evaluations(data, selected_model, username): # {{ edit_30: Add 'username' parameter }}
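+ # Runs on a background thread (started under "Run Test" below); Streamlit calls made from helpers here may not render outside the main script thread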
668
+ if isinstance(data, list):
669
+ for item in data:
670
+ if 'response' not in item:
671
+ item['response'] = generate_response(item['prompt'], item['context'])
672
+ evaluation = teacher_evaluate(item['prompt'], item['context'], item['response'])
673
+ save_results(username, selected_model, item['prompt'], item['context'], item['response'], evaluation) # {{ edit_31: Pass 'username' to save_results }}
674
+ # Optionally, update completed prompts in session_state or another mechanism
675
+ else:
676
+ if 'response' not in data:
677
+ data['response'] = generate_response(data['prompt'], data['context'])
678
+ evaluation = teacher_evaluate(data['prompt'], data['context'], data['response'])
679
+ save_results(username, selected_model, data['prompt'], data['context'], data['response'], evaluation) # {{ edit_32: Pass 'username' to save_results }}
680
+ # Optionally, update completed prompts in session_state or another mechanism
681
+
682
+ # In the Prompt Testing section
683
+ if st.button("Run Test"):
684
+ if not model_name:
685
+ st.error("Please select or add a valid Model.")
686
+ elif not data:
687
+ st.error("Please provide valid JSON input.")
688
+ else:
689
+ # {{ edit_28: Define 'selected_model' based on 'model_name' }}
690
+ selected_model = next(
691
+ (m for m in user_models if (m.get('model_name') == model_name) or (m['model_id'] == model_name)),
692
+ None
693
+ )
694
+ if selected_model:
695
+ with st.spinner("Starting evaluations in the background..."):
696
+ evaluation_thread = threading.Thread(
697
+ target=run_evaluations,
698
+ args=(data, selected_model, st.session_state.user) # {{ edit_33: Pass 'username' to the thread }}
699
+ )
700
+ evaluation_thread.start()
701
+ st.success("Evaluations are running in the background. You can navigate away or close the site.")
702
+ # {{ edit_34: Optionally, track running evaluations in session_state }}
703
+ else:
704
+ st.error("Selected model not found.")
705
+
706
+ elif app_mode == "Manage Models":
707
+ st.title("Manage Your Models")
708
+ # Fetch the user from the database
709
+ user = users_collection.find_one({"username": st.session_state.user})
710
+ if user is None:
711
+ st.error("User not found in the database.")
712
+ st.stop()
713
+ user_models = user.get("models", [])
714
+
715
+ # {{ edit_1: Add option to add a new model }}
716
+ st.subheader("Add a New Model")
717
+ add_model_option = st.radio("Add Model By:", ["Enter Model Name", "Upload Model Link"])
718
+
719
+ if add_model_option == "Enter Model Name":
720
+ new_model_name = st.text_input("Enter New Model Name:")
721
+ if st.button("Add Model Name"):
722
+ if new_model_name:
723
+ model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
724
+ users_collection.update_one(
725
+ {"username": st.session_state.user},
726
+ {"$push": {"models": {
727
+ "model_id": model_id,
728
+ "model_name": new_model_name,
729
+ "file_path": None,
730
+ "model_link": None,
731
+ "uploaded_at": datetime.now()
732
+ }}}
733
+ )
734
+ st.success(f"Model '{new_model_name}' added successfully as {model_id}!")
735
+ else:
736
+ st.error("Please enter a valid model name.")
737
+ else:
738
+ new_model_link = st.text_input("Enter Model Link:")
739
+ if st.button("Add Model Link"):
740
+ if new_model_link:
741
+ model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
742
+ users_collection.update_one(
743
+ {"username": st.session_state.user},
744
+ {"$push": {"models": {
745
+ "model_id": model_id,
746
+ "model_name": None,
747
+ "file_path": None,
748
+ "model_link": new_model_link,
749
+ "uploaded_at": datetime.now()
750
+ }}}
751
+ )
752
+ st.success(f"Model link added successfully as {model_id}!")
753
+ else:
754
+ st.error("Please enter a valid model link.")
755
+
756
+ st.markdown("---")
757
+
758
+ if user_models:
759
+ st.subheader("Your Models")
760
+ for model in user_models:
761
+ st.markdown(f"**Model ID:** {model['model_id']}")
762
+ st.write(f"**Model Type:** {model.get('model_type', 'custom').capitalize()}") # {{ edit_14: Handle missing 'model_type' with default 'custom' }}
763
+ if model.get("model_name"):
764
+ st.write(f"**Model Name:** {model['model_name']}")
765
+ if model.get("model_link"):
766
+ st.write(f"**Model Link:** [Link]({model['model_link']})")
767
+ if model.get("file_path"):
768
+ st.write(f"**File Path:** {model['file_path']}")
769
+ st.write(f"**Uploaded at:** {model['uploaded_at']}")
770
+
771
+ # Add delete option
772
+ if st.button(f"Delete {model['model_id']}"):
773
+ # Delete the model file if exists and it's a Custom model
774
+ if model['file_path'] and os.path.exists(model['file_path']):
775
+ os.remove(model['file_path'])
776
+ # Remove model from user's models list
777
+ users_collection.update_one(
778
+ {"username": st.session_state.user},
779
+ {"$pull": {"models": {"model_id": model['model_id']}}}
780
+ )
781
+ st.success(f"Model {model['model_id']} deleted successfully!")
782
+ else:
783
+ st.info("You have no uploaded models.")
784
+
785
+ elif app_mode == "History": # {{ edit_add: Enhanced History UI }}
786
+ st.title("History")
787
+ st.write("### Your Evaluation History")
788
+
789
+ try:
790
+ # Fetch all evaluation results for the current user from MongoDB
791
+ user_results = list(results_collection.find({"username": st.session_state.user}).sort("timestamp", -1))
792
+
793
+ if user_results:
794
+ # Convert results to a pandas DataFrame
795
+ df = pd.DataFrame(user_results)
796
+
797
+ # Normalize the evaluation JSON into separate columns
798
+ eval_df = df['evaluation'].apply(pd.Series)
799
+ for metric in ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]:
800
+ if metric in eval_df.columns:
801
+ df[metric + " Score"] = eval_df[metric].apply(lambda x: x.get('score', 0) * 100 if isinstance(x, dict) else 0)
802
+ df[metric + " Explanation"] = eval_df[metric].apply(lambda x: x.get('explanation', '') if isinstance(x, dict) else '')
803
+ else:
804
+ df[metric + " Score"] = 0
805
+ df[metric + " Explanation"] = ""
806
+
807
+ # Select relevant columns to display
808
+ display_df = df[[
809
+ "timestamp", "model_name", "prompt", "context", "response",
810
+ "Accuracy Score", "Hallucination Score", "Groundedness Score",
811
+ "Relevance Score", "Recall Score", "Precision Score",
812
+ "Consistency Score", "Bias Detection Score"
813
+ ]]
814
+
815
+ # Rename columns for better readability
816
+ display_df = display_df.rename(columns={
817
+ "timestamp": "Timestamp",
818
+ "model_name": "Model Name",
819
+ "prompt": "Prompt",
820
+ "context": "Context",
821
+ "response": "Response",
822
+ "Accuracy Score": "Accuracy (%)",
823
+ "Hallucination Score": "Hallucination (%)",
824
+ "Groundedness Score": "Groundedness (%)",
825
+ "Relevance Score": "Relevance (%)",
826
+ "Recall Score": "Recall (%)",
827
+ "Precision Score": "Precision (%)",
828
+ "Consistency Score": "Consistency (%)",
829
+ "Bias Detection Score": "Bias Detection (%)"
830
+ })
831
+
832
+ # Convert timestamp to a readable format
833
+ display_df['Timestamp'] = pd.to_datetime(display_df['Timestamp']).dt.strftime('%Y-%m-%d %H:%M:%S')
834
+
835
+ st.subheader("Evaluation Results")
836
+
837
+ # Display the DataFrame with enhanced styling
838
+ st.dataframe(
839
+ display_df.style.set_properties(**{
840
+ 'background-color': '#f0f8ff',
841
+ 'color': '#333',
842
+ 'border': '1px solid #ddd'
843
+ }).set_table_styles([
844
+ {'selector': 'th', 'props': [('background-color', '#f5f5f5'), ('text-align', 'center')]},
845
+ {'selector': 'td', 'props': [('text-align', 'center'), ('vertical-align', 'top')]}
846
+ ]).format({
847
+ "Accuracy (%)": "{:.2f}",
848
+ "Hallucination (%)": "{:.2f}",
849
+ "Groundedness (%)": "{:.2f}",
850
+ "Relevance (%)": "{:.2f}",
851
+ "Recall (%)": "{:.2f}",
852
+ "Precision (%)": "{:.2f}",
853
+ "Consistency (%)": "{:.2f}",
854
+ "Bias Detection (%)": "{:.2f}"
855
+ }), use_container_width=True
856
+ )
857
+
858
+ else:
859
+ st.info("You have no evaluation history yet.")
860
+
861
+ except Exception as e:
862
+ st.error(f"Error fetching history data: {e}")
863
+
864
+ # Add a footer
865
+ st.sidebar.markdown("---")
866
+ st.sidebar.info("LLM Evaluation System - v0.2")
867
+
868