poemsforaphrodite committed
Commit b26ef8e
1 Parent(s): 343cdcf

Update app.py

Files changed (1)
  1. app.py +614 -190
app.py CHANGED
@@ -14,6 +14,23 @@ from pinecone import Pinecone, ServerlessSpec
14
  import threading # {{ edit_25: Import threading for background processing }}
15
  import tiktoken
16
  from tiktoken.core import Encoding
17
 
18
  # Set page configuration to wide mode
19
  st.set_page_config(layout="wide")
@@ -28,8 +45,11 @@ db = mongo_client['llm_evaluation_system']
28
  users_collection = db['users']
29
  results_collection = db['evaluation_results']
30
 
31
- # Initialize OpenAI client
32
- openai_client = OpenAI() # {{ edit_12: Rename OpenAI client to 'openai_client' }}
33
 
34
  # Initialize Pinecone
35
  pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY')) # {{ edit_13: Initialize Pinecone client using Pinecone class }}
@@ -97,11 +117,30 @@ def generate_response(prompt, context):
97
  except Exception as e:
98
  st.error(f"Error generating response: {str(e)}")
99
  return None
100
 
101
  # Function to clear the results database
102
- def clear_results_database():
103
  try:
104
- results_collection.delete_many({})
105
  return True
106
  except Exception as e:
107
  st.error(f"Error clearing results database: {str(e)}")
@@ -228,6 +267,62 @@ def save_results(username, model, prompt, context, response, evaluation): # {{
228
  }
229
  results_collection.insert_one(result)
230
 
231
  # Function for teacher model evaluation
232
  def teacher_evaluate(prompt, context, response):
233
  try:
@@ -236,8 +331,8 @@ def teacher_evaluate(prompt, context, response):
236
  Rate each factor on a scale of 0 to 1, where 1 is the best (or least problematic for negative factors like Hallucination and Bias).
237
  Please provide scores with two decimal places, and avoid extreme scores of exactly 0 or 1 unless absolutely necessary.
238
 
239
- Prompt: {prompt}
240
  Context: {context}
 
241
  Response: {response}
242
 
243
  Factors to evaluate:
@@ -255,7 +350,7 @@ def teacher_evaluate(prompt, context, response):
255
  """
256
 
257
  evaluation_response = openai_client.chat.completions.create(
258
- model="gpt-4o-mini", # Corrected model name
259
  messages=[
260
  {"role": "system", "content": "You are an expert evaluator of language model responses."},
261
  {"role": "user", "content": evaluation_prompt}
@@ -328,14 +423,9 @@ else:
328
  st.sidebar.success(f"Welcome, {st.session_state.user}!")
329
  if st.sidebar.button("Logout"):
330
  st.session_state.user = None
331
- st.experimental_rerun()
 
332
 
333
- # Add Clear Results Database button
334
- if st.sidebar.button("Clear Results Database"):
335
- if clear_results_database(): # {{ edit_fix: Calling the newly defined clear_results_database function }}
336
- st.sidebar.success("Results database cleared successfully!")
337
- else:
338
- st.sidebar.error("Failed to clear results database.")
339
 
340
  # App content
341
  if st.session_state.user:
@@ -355,9 +445,23 @@ if st.session_state.user:
355
  if user_models:
356
  model_options = [model['model_name'] if model['model_name'] else model['model_id'] for model in user_models]
357
  selected_model = st.selectbox("Select Model to View Metrics", ["All Models"] + model_options)
358
  else:
359
  st.error("You have no uploaded models.")
360
  selected_model = "All Models"
 
361
 
362
  try:
363
  query = {"username": st.session_state.user}
@@ -369,21 +473,81 @@ if st.session_state.user:
369
  if results:
370
  df = pd.DataFrame(results)
371
 
372
- # Count tokens for prompt, context, and response
373
- df['prompt_tokens'] = df['prompt'].apply(count_tokens)
374
- df['context_tokens'] = df['context'].apply(count_tokens)
375
- df['response_tokens'] = df['response'].apply(count_tokens)
376
 
377
  # Calculate total tokens for each row
378
  df['total_tokens'] = df['prompt_tokens'] + df['context_tokens'] + df['response_tokens']
379
 
 
380
  metrics = ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]
381
  for metric in metrics:
382
- df[metric] = df['evaluation'].apply(lambda x: x.get(metric, {}).get('score', 0) if x else 0) * 100
383
 
384
  df['timestamp'] = pd.to_datetime(df['timestamp'])
385
  df['query_number'] = range(1, len(df) + 1) # Add query numbers
386
 
387
  @st.cache_data
388
  def create_metrics_graph(df, metrics):
389
  fig = px.line(
@@ -415,8 +579,7 @@ if st.session_state.user:
415
 
416
  # Latest Metrics
417
  st.subheader("Latest Metrics")
418
- latest_result = df.iloc[-1] # Get the last row (most recent query)
419
- latest_metrics = {metric: latest_result[metric] for metric in metrics}
420
 
421
  cols = st.columns(4)
422
  for i, (metric, value) in enumerate(latest_metrics.items()):
@@ -425,6 +588,9 @@ if st.session_state.user:
425
  st.metric(label=metric, value=f"{value:.2f}%", delta=None)
426
  st.progress(value / 100)
427
 
 
 
 
428
  # Detailed Data View
429
  st.subheader("Detailed Data View")
430
 
@@ -442,14 +608,15 @@ if st.session_state.user:
442
  # Prepare the data for display
443
  display_data = []
444
  for _, row in df.iterrows():
 
445
  display_row = {
446
- "Prompt": row['prompt'][:50] + "...", # Truncate long prompts
447
- "Context": row['context'][:50] + "...", # Truncate long contexts
448
- "Response": row['response'][:50] + "...", # Truncate long responses
449
  }
450
  # Add metrics to the display row
451
  for metric in metrics:
452
- display_row[metric] = row[metric] # Store as float, not string
453
 
454
  display_data.append(display_row)
455
 
@@ -490,20 +657,309 @@ if st.session_state.user:
490
  height=400 # Set a fixed height with scrolling
491
  )
492
 
493
- # Placeholders for future sections
494
  st.subheader("Worst Performing Slice Analysis")
495
- st.info("This section will show analysis of the worst-performing data slices.")
496
-
497
- st.subheader("UMAP Visualization")
498
- st.info("This section will contain UMAP visualizations for dimensionality reduction insights.")
499
  else:
500
  st.info("No evaluation results available for the selected model.")
501
  except Exception as e:
502
- st.error(f"Error fetching data from database: {e}")
503
  st.error("Detailed error information:")
504
- st.error(str(e))
505
- import traceback
506
  st.error(traceback.format_exc())
 
507
 
508
  elif app_mode == "Model Upload":
509
  st.title("Upload Your Model")
@@ -562,7 +1018,6 @@ if st.session_state.user:
562
  elif app_mode == "Prompt Testing":
563
  st.title("Prompt Testing")
564
 
565
- # {{ edit_6: Use model_name instead of model_id }}
566
  model_selection_option = st.radio("Select Model Option:", ["Choose Existing Model", "Add New Model"])
567
 
568
  if model_selection_option == "Choose Existing Model":
@@ -572,136 +1027,94 @@ if st.session_state.user:
572
  if not user_models:
573
  st.error("You have no uploaded models. Please upload a model first.")
574
  else:
575
- # Display model_name instead of model_id
576
- model_name = st.selectbox("Select a Model for Testing", [model['model_name'] if model['model_name'] else model['model_id'] for model in user_models])
577
  else:
578
- # Option to enter model name or upload a link
579
- new_model_option = st.radio("Add Model By:", ["Enter Model Name", "Upload Model Link"])
580
-
581
- if new_model_option == "Enter Model Name":
582
- model_name_input = st.text_input("Enter New Model Name:")
583
- if st.button("Save Model Name"):
584
- if model_name_input:
585
- # {{ edit_3: Save the new model name to user's models }}
586
- model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
587
- users_collection.update_one(
588
- {"username": st.session_state.user},
589
- {"$push": {"models": {
590
- "model_id": model_id,
591
- "model_name": model_name_input,
592
- "file_path": None,
593
- "model_link": None,
594
- "uploaded_at": datetime.now()
595
- }}}
596
- )
597
- st.success(f"Model '{model_name_input}' saved successfully as {model_id}!")
598
- model_name = model_name_input # Use model_name instead of model_id
599
- else:
600
- st.error("Please enter a valid model name.")
601
- else:
602
- model_link = st.text_input("Enter Model Link:")
603
- if st.button("Save Model Link"):
604
- if model_link:
605
- # {{ edit_4: Save the model link to user's models }}
606
- model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
607
- users_collection.update_one(
608
- {"username": st.session_state.user},
609
- {"$push": {"models": {
610
- "model_id": model_id,
611
- "model_name": None,
612
- "file_path": None,
613
- "model_link": model_link,
614
- "uploaded_at": datetime.now()
615
- }}}
616
- )
617
- st.success(f"Model link saved successfully as {model_id}!")
618
- model_name = model_id # Use model_id if model_name is not available
619
- else:
620
- st.error("Please enter a valid model link.")
621
-
622
- # Two ways to provide prompts
623
- prompt_input_method = st.radio("Choose prompt input method:", ["Single JSON", "Batch Upload"])
624
 
625
- if prompt_input_method == "Single JSON":
626
- json_input = st.text_area("Enter your JSON input:")
627
- if json_input:
628
  try:
629
- data = json.loads(json_input)
630
- st.success("JSON parsed successfully!")
631
 
632
- # Display JSON in a table format
633
- st.subheader("Input Data")
634
- df = pd.json_normalize(data)
635
- st.table(df.style.set_properties(**{
636
- 'background-color': '#f0f8ff',
637
- 'color': '#333',
638
- 'border': '1px solid #ddd'
639
- }).set_table_styles([
640
- {'selector': 'th', 'props': [('background-color', '#4CAF50'), ('color', 'white')]},
641
- {'selector': 'td', 'props': [('text-align', 'left')]}
642
- ]))
643
- except json.JSONDecodeError:
644
- st.error("Invalid JSON. Please check your input.")
645
- else:
646
- uploaded_file = st.file_uploader("Upload a JSON file with prompts, contexts, and responses", type="json")
647
- if uploaded_file is not None:
648
- try:
649
- data = json.load(uploaded_file)
650
- st.success("JSON file loaded successfully!")
651
 
652
- # Display JSON in a table format
653
- st.subheader("Input Data")
654
- df = pd.json_normalize(data)
655
- st.table(df.style.set_properties(**{
656
- 'background-color': '#f0f8ff',
657
- 'color': '#333',
658
- 'border': '1px solid #ddd'
659
- }).set_table_styles([
660
- {'selector': 'th', 'props': [('background-color', '#4CAF50'), ('color', 'white')]},
661
- {'selector': 'td', 'props': [('text-align', 'left')]}
662
- ]))
663
  except json.JSONDecodeError:
664
- st.error("Invalid JSON file. Please check your file contents.")
665
-
666
- # Function to handle background evaluation
667
- def run_evaluations(data, selected_model, username): # {{ edit_30: Add 'username' parameter }}
668
- if isinstance(data, list):
669
- for item in data:
670
- if 'response' not in item:
671
- item['response'] = generate_response(item['prompt'], item['context'])
672
- evaluation = teacher_evaluate(item['prompt'], item['context'], item['response'])
673
- save_results(username, selected_model, item['prompt'], item['context'], item['response'], evaluation) # {{ edit_31: Pass 'username' to save_results }}
674
- # Optionally, update completed prompts in session_state or another mechanism
675
  else:
676
- if 'response' not in data:
677
- data['response'] = generate_response(data['prompt'], data['context'])
678
- evaluation = teacher_evaluate(data['prompt'], data['context'], data['response'])
679
- save_results(username, selected_model, data['prompt'], data['context'], data['response'], evaluation) # {{ edit_32: Pass 'username' to save_results }}
680
- # Optionally, update completed prompts in session_state or another mechanism
 
681
 
682
- # In the Prompt Testing section
683
  if st.button("Run Test"):
684
  if not model_name:
685
  st.error("Please select or add a valid Model.")
686
- elif not data:
687
- st.error("Please provide valid JSON input.")
 
 
688
  else:
689
- # {{ edit_28: Define 'selected_model' based on 'model_name' }}
690
- selected_model = next(
691
- (m for m in user_models if (m['model_name'] == model_name) or (m['model_id'] == model_name)),
692
- None
693
- )
694
- if selected_model:
695
- with st.spinner("Starting evaluations in the background..."):
696
- evaluation_thread = threading.Thread(
697
- target=run_evaluations,
698
- args=(data, selected_model, st.session_state.user) # {{ edit_33: Pass 'username' to the thread }}
699
- )
700
- evaluation_thread.start()
701
- st.success("Evaluations are running in the background. You can navigate away or close the site.")
702
- # {{ edit_34: Optionally, track running evaluations in session_state }}
703
- else:
704
- st.error("Selected model not found.")
705
 
706
  elif app_mode == "Manage Models":
707
  st.title("Manage Your Models")
@@ -712,46 +1125,58 @@ if st.session_state.user:
712
  st.stop()
713
  user_models = user.get("models", [])
714
 
715
- # {{ edit_1: Add option to add a new model }}
716
  st.subheader("Add a New Model")
717
- add_model_option = st.radio("Add Model By:", ["Enter Model Name", "Upload Model Link"])
718
 
719
- if add_model_option == "Enter Model Name":
720
  new_model_name = st.text_input("Enter New Model Name:")
721
- if st.button("Add Model Name"):
722
- if new_model_name:
723
- model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
724
- users_collection.update_one(
725
- {"username": st.session_state.user},
726
- {"$push": {"models": {
727
- "model_id": model_id,
728
- "model_name": new_model_name,
729
- "file_path": None,
730
- "model_link": None,
731
- "uploaded_at": datetime.now()
732
- }}}
733
- )
734
- st.success(f"Model '{new_model_name}' added successfully as {model_id}!")
735
- else:
736
- st.error("Please enter a valid model name.")
737
- else:
738
- new_model_link = st.text_input("Enter Model Link:")
739
- if st.button("Add Model Link"):
740
- if new_model_link:
741
  model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
742
  users_collection.update_one(
743
  {"username": st.session_state.user},
744
- {"$push": {"models": {
745
- "model_id": model_id,
746
- "model_name": None,
747
- "file_path": None,
748
- "model_link": new_model_link,
749
- "uploaded_at": datetime.now()
750
- }}}
751
  )
752
- st.success(f"Model link added successfully as {model_id}!")
753
  else:
754
- st.error("Please enter a valid model link.")
755
 
756
  st.markdown("---")
757
 
@@ -759,11 +1184,9 @@ if st.session_state.user:
759
  st.subheader("Your Models")
760
  for model in user_models:
761
  st.markdown(f"**Model ID:** {model['model_id']}")
762
- st.write(f"**Model Type:** {model.get('model_type', 'custom').capitalize()}") # {{ edit_14: Handle missing 'model_type' with default 'custom' }}
763
  if model.get("model_name"):
764
  st.write(f"**Model Name:** {model['model_name']}")
765
- if model.get("model_link"):
766
- st.write(f"**Model Link:** [Link]({model['model_link']})")
767
  if model.get("file_path"):
768
  st.write(f"**File Path:** {model['file_path']}")
769
  st.write(f"**Uploaded at:** {model['uploaded_at']}")
@@ -794,6 +1217,9 @@ if st.session_state.user:
794
  # Convert results to a pandas DataFrame
795
  df = pd.DataFrame(user_results)
796
 
797
  # Normalize the evaluation JSON into separate columns
798
  eval_df = df['evaluation'].apply(pd.Series)
799
  for metric in ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]:
@@ -842,7 +1268,7 @@ if st.session_state.user:
842
  'border': '1px solid #ddd'
843
  }).set_table_styles([
844
  {'selector': 'th', 'props': [('background-color', '#f5f5f5'), ('text-align', 'center')]},
845
- {'selector': 'td', 'props': [('text-align', 'center'), ('vertical-align', 'top')]}
846
  ]).format({
847
  "Accuracy (%)": "{:.2f}",
848
  "Hallucination (%)": "{:.2f}",
@@ -863,6 +1289,4 @@ if st.session_state.user:
863
 
864
  # Add a footer
865
  st.sidebar.markdown("---")
866
- st.sidebar.info("LLM Evaluation System - v0.2")
867
-
868
- # Function to handle model upload (placeholder)
 
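Note: count_tokens, which the dashboard code in this diff wraps with safe_count_tokens, is defined elsewhere in app.py and does not appear in the hunks shown here. A minimal sketch of such a helper built on the tiktoken Encoding imported at the top of the file — the encoding name is an assumption, not taken from this commit:

import tiktoken

# Hypothetical sketch of the token-counting helper referenced by the dashboard code.
# "cl100k_base" is an assumed encoding; substitute whatever app.py actually selects.
_encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    # Return the token count for a string; empty or None input counts as zero.
    if not text:
        return 0
    return len(_encoding.encode(text))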
14
  import threading # {{ edit_25: Import threading for background processing }}
15
  import tiktoken
16
  from tiktoken.core import Encoding
17
+ from runner import run_model
18
+ from bson.objectid import ObjectId
19
+ import traceback # Add this import at the top of your file
20
+ import umap
21
+ import plotly.graph_objs as go
22
+ from sklearn.preprocessing import StandardScaler
23
+ from sklearn.cluster import KMeans
24
+ import plotly.colors as plc
25
+
26
+ # Add this helper function at the beginning of your file
27
+ def extract_prompt_text(prompt):
28
+ if isinstance(prompt, dict):
29
+ return prompt.get('prompt', '')
30
+ elif isinstance(prompt, str):
31
+ return prompt
32
+ else:
33
+ return str(prompt)
34
 
35
  # Set page configuration to wide mode
36
  st.set_page_config(layout="wide")
 
45
  users_collection = db['users']
46
  results_collection = db['evaluation_results']
47
 
48
+ # Remove or comment out this line if it exists
49
+ # openai_client = OpenAI()
50
+
51
+ # Instead, use the openai_client from runner.py
52
+ from runner import openai_client
53
 
54
  # Initialize Pinecone
55
  pinecone_client = Pinecone(api_key=os.getenv('PINECONE_API_KEY')) # {{ edit_13: Initialize Pinecone client using Pinecone class }}
 
117
  except Exception as e:
118
  st.error(f"Error generating response: {str(e)}")
119
  return None
120
+
121
+ # Add this function to update the context for a model
122
+ def update_model_context(username, model_id, context):
123
+ users_collection.update_one(
124
+ {"username": username, "models.model_id": model_id},
125
+ {"$set": {"models.$.context": context}}
126
+ )
127
+
128
 
129
  # Function to clear the results database
130
+ def clear_results_database(username, model_identifier=None):
131
  try:
132
+ if model_identifier:
133
+ # Clear results for the specific model
134
+ results_collection.delete_many({
135
+ "username": username,
136
+ "$or": [
137
+ {"model_name": model_identifier},
138
+ {"model_id": model_identifier}
139
+ ]
140
+ })
141
+ else:
142
+ # Clear all results for the user
143
+ results_collection.delete_many({"username": username})
144
  return True
145
  except Exception as e:
146
  st.error(f"Error clearing results database: {str(e)}")
 
267
  }
268
  results_collection.insert_one(result)
269
 
270
+ # Modify the run_custom_evaluations function
271
+ def run_custom_evaluations(data, selected_model, username):
272
+ try:
273
+ model_name = selected_model['model_name']
274
+ model_id = selected_model['model_id']
275
+ model_type = selected_model.get('model_type', 'Unknown').lower()
276
+
277
+ if model_type == 'simple':
278
+ # For simple models, data is already in the correct format
279
+ test_cases = data
280
+ else:
281
+ # For other models, data is split into context_dataset and questions
282
+ context_dataset, questions = data
283
+ test_cases = [
284
+ {
285
+ "prompt": extract_prompt_text(question),
286
+ "context": context_dataset,
287
+ "response": "" # This will be filled by the model
288
+ }
289
+ for question in questions
290
+ ]
291
+
292
+ for test_case in test_cases:
293
+ prompt_text = test_case["prompt"]
294
+ context = test_case["context"]
295
+
296
+ # Get the student model's response using runner.py
297
+ try:
298
+ answer = run_model(model_name, prompt_text)
299
+ if answer is None or answer == "":
300
+ st.warning(f"No response received from the model for prompt: {prompt_text}")
301
+ answer = "No response received from the model."
302
+ except Exception as model_error:
303
+ st.error(f"Error running model for prompt: {prompt_text}")
304
+ st.error(f"Error details: {str(model_error)}")
305
+ answer = f"Error: {str(model_error)}"
306
+
307
+ # Get the teacher's evaluation
308
+ try:
309
+ evaluation = teacher_evaluate(prompt_text, context, answer)
310
+ if evaluation is None:
311
+ st.warning(f"No evaluation received for prompt: {prompt_text}")
312
+ evaluation = {"Error": "No evaluation received"}
313
+ except Exception as eval_error:
314
+ st.error(f"Error in teacher evaluation for prompt: {prompt_text}")
315
+ st.error(f"Error details: {str(eval_error)}")
316
+ evaluation = {"Error": str(eval_error)}
317
+
318
+ # Save the results
319
+ save_results(username, selected_model, prompt_text, context, answer, evaluation)
320
+
321
+ st.success("Evaluation completed successfully!")
322
+ except Exception as e:
323
+ st.error(f"Error in custom evaluation: {str(e)}")
324
+ st.error(f"Detailed error: {traceback.format_exc()}")
325
+
326
  # Function for teacher model evaluation
327
  def teacher_evaluate(prompt, context, response):
328
  try:
 
331
  Rate each factor on a scale of 0 to 1, where 1 is the best (or least problematic for negative factors like Hallucination and Bias).
332
  Please provide scores with two decimal places, and avoid extreme scores of exactly 0 or 1 unless absolutely necessary.
333
 
 
334
  Context: {context}
335
+ Prompt: {prompt}
336
  Response: {response}
337
 
338
  Factors to evaluate:
 
350
  """
351
 
352
  evaluation_response = openai_client.chat.completions.create(
353
+ model="gpt-4o-mini",
354
  messages=[
355
  {"role": "system", "content": "You are an expert evaluator of language model responses."},
356
  {"role": "user", "content": evaluation_prompt}
 
423
  st.sidebar.success(f"Welcome, {st.session_state.user}!")
424
  if st.sidebar.button("Logout"):
425
  st.session_state.user = None
426
+ st.rerun()
427
+
428
 
429
 
430
  # App content
431
  if st.session_state.user:
 
445
  if user_models:
446
  model_options = [model['model_name'] if model['model_name'] else model['model_id'] for model in user_models]
447
  selected_model = st.selectbox("Select Model to View Metrics", ["All Models"] + model_options)
448
+ st.session_state['selected_model'] = selected_model # Store the selected model in session state
449
+
450
+ # Add delete dataset button
451
+ if selected_model != "All Models":
452
+ if st.button("Delete Dataset"):
453
+ if st.session_state['selected_model']:
454
+ if clear_results_database(st.session_state.user, st.session_state['selected_model']):
455
+ st.success(f"All evaluation results for {st.session_state['selected_model']} have been deleted.")
456
+ st.rerun() # Rerun the app to refresh the dashboard
457
+ else:
458
+ st.error("Failed to delete the dataset. Please try again.")
459
+ else:
460
+ st.error("No model selected. Please select a model to delete its dataset.")
461
  else:
462
  st.error("You have no uploaded models.")
463
  selected_model = "All Models"
464
+ st.session_state['selected_model'] = selected_model
465
 
466
  try:
467
  query = {"username": st.session_state.user}
 
473
  if results:
474
  df = pd.DataFrame(results)
475
 
476
+ # Check if required columns exist
477
+ required_columns = ['prompt', 'context', 'response', 'evaluation']
478
+ missing_columns = [col for col in required_columns if col not in df.columns]
479
+ if missing_columns:
480
+ st.error(f"Error: Missing columns in the data: {', '.join(missing_columns)}")
481
+ st.error("Please check the database schema and ensure all required fields are present.")
482
+ st.stop()
483
+
484
+ # Extract prompt text if needed
485
+ df['prompt'] = df['prompt'].apply(extract_prompt_text)
486
+
487
+ # Safely count tokens for prompt, context, and response
488
+ def safe_count_tokens(text):
489
+ if isinstance(text, str):
490
+ return count_tokens(text)
491
+ else:
492
+ return 0 # or some default value
493
+
494
+ df['prompt_tokens'] = df['prompt'].apply(safe_count_tokens)
495
+ df['context_tokens'] = df['context'].apply(safe_count_tokens)
496
+ df['response_tokens'] = df['response'].apply(safe_count_tokens)
497
 
498
  # Calculate total tokens for each row
499
  df['total_tokens'] = df['prompt_tokens'] + df['context_tokens'] + df['response_tokens']
500
 
501
+ # Safely extract evaluation metrics
502
  metrics = ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]
503
  for metric in metrics:
504
+ df[metric] = df['evaluation'].apply(lambda x: x.get(metric, {}).get('score', 0) if isinstance(x, dict) else 0) * 100
505
 
506
  df['timestamp'] = pd.to_datetime(df['timestamp'])
507
  df['query_number'] = range(1, len(df) + 1) # Add query numbers
508
 
509
+ # Set the threshold for notifications
510
+ notification_threshold = st.slider("Set Performance Threshold for Notifications (%)", min_value=0, max_value=100, value=50)
511
+
512
+ # Define the metrics to check
513
+ metrics_to_check = metrics # Or allow the user to select specific metrics
514
+
515
+ # Check for evaluations where any of the metrics are below the threshold
516
+ low_performance_mask = df[metrics_to_check].lt(notification_threshold).any(axis=1)
517
+ low_performing_evaluations = df[low_performance_mask]
518
+
519
+ # Display Notifications
520
+ if not low_performing_evaluations.empty:
521
+ st.warning(f"⚠️ You have {len(low_performing_evaluations)} evaluations with metrics below {notification_threshold}%.")
522
+ with st.expander("View Low-Performing Evaluations"):
523
+ # Display the low-performing evaluations in a table
524
+ display_columns = ['timestamp', 'model_name', 'prompt', 'response'] + metrics_to_check
525
+ low_perf_display_df = low_performing_evaluations[display_columns].copy()
526
+ low_perf_display_df['timestamp'] = low_perf_display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
527
+
528
+ # Apply styling to highlight low scores
529
+ def highlight_low_scores(val):
530
+ if isinstance(val, float):
531
+ if val < notification_threshold:
532
+ return 'background-color: red; color: white'
533
+ return ''
534
+
535
+ styled_low_perf_df = low_perf_display_df.style.applymap(highlight_low_scores, subset=metrics_to_check)
536
+ styled_low_perf_df = styled_low_perf_df.format({metric: "{:.2f}%" for metric in metrics_to_check})
537
+
538
+ st.dataframe(
539
+ styled_low_perf_df.set_properties(**{
540
+ 'text-align': 'left',
541
+ 'border': '1px solid #ddd'
542
+ }).set_table_styles([
543
+ {'selector': 'th', 'props': [('background-color', '#333'), ('color', 'white')]},
544
+ {'selector': 'td', 'props': [('vertical-align', 'top')]}
545
+ ]),
546
+ use_container_width=True
547
+ )
548
+ else:
549
+ st.success("🎉 All your evaluations have metrics above the threshold!")
550
+
551
  @st.cache_data
552
  def create_metrics_graph(df, metrics):
553
  fig = px.line(
 
579
 
580
  # Latest Metrics
581
  st.subheader("Latest Metrics")
582
+ latest_metrics = df[metrics].mean() # Calculate the average of all metrics
 
583
 
584
  cols = st.columns(4)
585
  for i, (metric, value) in enumerate(latest_metrics.items()):
 
588
  st.metric(label=metric, value=f"{value:.2f}%", delta=None)
589
  st.progress(value / 100)
590
 
591
+ # Add an explanation for the metrics
592
+ st.info("These metrics represent the average scores across all evaluations.")
593
+
594
  # Detailed Data View
595
  st.subheader("Detailed Data View")
596
 
 
608
  # Prepare the data for display
609
  display_data = []
610
  for _, row in df.iterrows():
611
+ prompt_text = extract_prompt_text(row.get('prompt', ''))
612
  display_row = {
613
+ "Prompt": prompt_text[:50] + "..." if prompt_text else "N/A",
614
+ "Context": str(row.get('context', ''))[:50] + "..." if row.get('context') else "N/A",
615
+ "Response": str(row.get('response', ''))[:50] + "..." if row.get('response') else "N/A",
616
  }
617
  # Add metrics to the display row
618
  for metric in metrics:
619
+ display_row[metric] = row.get(metric, 0) # Use get() with a default value
620
 
621
  display_data.append(display_row)
622
 
 
657
  height=400 # Set a fixed height with scrolling
658
  )
659
 
660
+ # UMAP Visualization with Clustering
661
+ st.subheader("UMAP Visualization with Clustering")
662
+
663
+ if len(df) > 2:
664
+ # Allow user to select metrics to include
665
+ metrics = ['Accuracy', 'Hallucination', 'Groundedness', 'Relevance', 'Recall', 'Precision', 'Consistency', 'Bias Detection']
666
+ selected_metrics = st.multiselect("Select Metrics to Include in UMAP", metrics, default=metrics)
667
+
668
+ if len(selected_metrics) < 2:
669
+ st.warning("Please select at least two metrics for UMAP.")
670
+ else:
671
+ # Allow user to select number of dimensions
672
+ n_components = st.radio("Select UMAP Dimensions", [2, 3], index=1)
673
+
674
+ # Allow user to adjust UMAP parameters
675
+ n_neighbors = st.slider("n_neighbors", min_value=2, max_value=50, value=15)
676
+ min_dist = st.slider("min_dist", min_value=0.0, max_value=1.0, value=0.1, step=0.01)
677
+
678
+ # Prepare data for UMAP
679
+ X = df[selected_metrics].values
680
+
681
+ # Normalize the data
682
+ scaler = StandardScaler()
683
+ X_scaled = scaler.fit_transform(X)
684
+
685
+ # Perform UMAP dimensionality reduction
686
+ reducer = umap.UMAP(n_neighbors=n_neighbors, min_dist=min_dist, n_components=n_components, random_state=42)
687
+ embedding = reducer.fit_transform(X_scaled)
688
+
689
+ # Allow user to select the number of clusters
690
+ num_clusters = st.slider("Select Number of Clusters", min_value=2, max_value=10, value=3)
691
+
692
+ # Perform KMeans clustering on the UMAP embeddings
693
+ kmeans = KMeans(n_clusters=num_clusters, random_state=42)
694
+ cluster_labels = kmeans.fit_predict(embedding)
695
+
696
+ # Create a DataFrame with the UMAP results and cluster labels
697
+ umap_columns = [f'UMAP{i+1}' for i in range(n_components)]
698
+ umap_data = {col: embedding[:, idx] for idx, col in enumerate(umap_columns)}
699
+ umap_data['Cluster'] = cluster_labels
700
+ umap_data['Model'] = df['model_name']
701
+ umap_data['Prompt'] = df['prompt']
702
+ umap_data['Response'] = df['response']
703
+ umap_data['Timestamp'] = df['timestamp']
704
+ umap_df = pd.DataFrame(umap_data)
705
+
706
+ # Include selected metrics in umap_df for hover info
707
+ for metric in selected_metrics:
708
+ umap_df[metric] = df[metric]
709
+
710
+ # Prepare customdata for hovertemplate
711
+ customdata_columns = ['Model', 'Prompt', 'Cluster'] + selected_metrics
712
+ umap_df['customdata'] = umap_df[customdata_columns].values.tolist()
713
+
714
+ # Build hovertemplate
715
+ hovertemplate = '<b>Model:</b> %{customdata[0]}<br>' + \
716
+ '<b>Prompt:</b> %{customdata[1]}<br>' + \
717
+ '<b>Cluster:</b> %{customdata[2]}<br>'
718
+ for idx, metric in enumerate(selected_metrics):
719
+ hovertemplate += f'<b>{metric}:</b> %{{customdata[{idx+3}]:.2f}}<br>'
720
+ hovertemplate += '<extra></extra>' # Hide trace info
721
+
722
+ # Define color palette for clusters
723
+ cluster_colors = plc.qualitative.Plotly
724
+ num_colors = len(cluster_colors)
725
+ if num_clusters > num_colors:
726
+ cluster_colors = plc.sample_colorscale('Rainbow', [n/(num_clusters-1) for n in range(num_clusters)])
727
+ else:
728
+ cluster_colors = cluster_colors[:num_clusters]
729
+
730
+ # Map cluster labels to colors
731
+ cluster_color_map = {label: color for label, color in zip(range(num_clusters), cluster_colors)}
732
+ umap_df['Color'] = umap_df['Cluster'].map(cluster_color_map)
733
+
734
+ # Create the UMAP plot
735
+ if n_components == 3:
736
+ # 3D plot
737
+ fig = go.Figure()
738
+
739
+ for cluster_label in sorted(umap_df['Cluster'].unique()):
740
+ cluster_data = umap_df[umap_df['Cluster'] == cluster_label]
741
+ fig.add_trace(go.Scatter3d(
742
+ x=cluster_data['UMAP1'],
743
+ y=cluster_data['UMAP2'],
744
+ z=cluster_data['UMAP3'],
745
+ mode='markers',
746
+ name=f'Cluster {cluster_label}',
747
+ marker=dict(
748
+ size=5,
749
+ color=cluster_data['Color'], # Color according to cluster
750
+ opacity=0.8,
751
+ line=dict(width=0.5, color='white')
752
+ ),
753
+ customdata=cluster_data['customdata'],
754
+ hovertemplate=hovertemplate
755
+ ))
756
+
757
+ fig.update_layout(
758
+ title='3D UMAP Visualization with Clustering',
759
+ scene=dict(
760
+ xaxis_title='UMAP Dimension 1',
761
+ yaxis_title='UMAP Dimension 2',
762
+ zaxis_title='UMAP Dimension 3'
763
+ ),
764
+ hovermode='closest',
765
+ template='plotly_dark',
766
+ height=800,
767
+ legend_title='Clusters'
768
+ )
769
+ st.plotly_chart(fig, use_container_width=True)
770
+ else:
771
+ # 2D plot
772
+ fig = go.Figure()
773
+
774
+ for cluster_label in sorted(umap_df['Cluster'].unique()):
775
+ cluster_data = umap_df[umap_df['Cluster'] == cluster_label]
776
+ fig.add_trace(go.Scatter(
777
+ x=cluster_data['UMAP1'],
778
+ y=cluster_data['UMAP2'],
779
+ mode='markers',
780
+ name=f'Cluster {cluster_label}',
781
+ marker=dict(
782
+ size=8,
783
+ color=cluster_data['Color'], # Color according to cluster
784
+ opacity=0.8,
785
+ line=dict(width=0.5, color='white')
786
+ ),
787
+ customdata=cluster_data['customdata'],
788
+ hovertemplate=hovertemplate
789
+ ))
790
+
791
+ fig.update_layout(
792
+ title='2D UMAP Visualization with Clustering',
793
+ xaxis_title='UMAP Dimension 1',
794
+ yaxis_title='UMAP Dimension 2',
795
+ hovermode='closest',
796
+ template='plotly_dark',
797
+ height=800,
798
+ legend_title='Clusters'
799
+ )
800
+ st.plotly_chart(fig, use_container_width=True)
801
+
802
+ # Selectable Data Points
803
+ st.subheader("Cluster Analysis")
804
+
805
+ # Show cluster counts
806
+ cluster_counts = umap_df['Cluster'].value_counts().sort_index().reset_index()
807
+ cluster_counts.columns = ['Cluster', 'Number of Points']
808
+ st.write("### Cluster Summary")
809
+ st.dataframe(cluster_counts)
810
+
811
+ # Allow user to select clusters to view details
812
+ selected_clusters = st.multiselect("Select Clusters to View Details", options=sorted(umap_df['Cluster'].unique()), default=sorted(umap_df['Cluster'].unique()))
813
+
814
+ if selected_clusters:
815
+ selected_data = umap_df[umap_df['Cluster'].isin(selected_clusters)]
816
+ st.write("### Details of Selected Clusters")
817
+ st.dataframe(selected_data[['Model', 'Prompt', 'Response', 'Cluster'] + selected_metrics])
818
+ else:
819
+ st.info("Select clusters to view their details.")
820
+
821
+ st.info("""
822
+ **UMAP Visualization with Clustering**
823
+
824
+ This visualization includes clustering of the evaluation data points in the UMAP space.
825
+
826
+ **Features:**
827
+
828
+ - **Clustering Algorithm**: KMeans clustering is applied on the UMAP embeddings.
829
+ - **Cluster Selection**: Choose the number of clusters to identify patterns in the data.
830
+ - **Color Coding**: Each cluster is represented by a distinct color in the plot.
831
+ - **Interactive Exploration**: Hover over points to see detailed information, including the cluster label.
832
+ - **Cluster Analysis**: View summary statistics and details of selected clusters.
833
+
834
+ **Instructions:**
835
+
836
+ - **Select Metrics**: Choose which evaluation metrics to include in the UMAP calculation.
837
+ - **Adjust UMAP Parameters**: Fine-tune `n_neighbors` and `min_dist` for clustering granularity.
838
+ - **Choose Number of Clusters**: Use the slider to set how many clusters to identify.
839
+ - **Interact with the Plot**: Hover and click on clusters to explore data points.
840
+
841
+ **Interpreting Clusters:**
842
+
843
+ - **Cluster Composition**: Clusters group evaluations with similar metric profiles.
844
+ - **Model Performance**: Analyze clusters to identify strengths and weaknesses of models.
845
+ - **Data Patterns**: Use clustering to uncover hidden patterns in your evaluation data.
846
+
847
+ **Tips:**
848
+
849
+ - Experiment with different numbers of clusters to find meaningful groupings.
850
+ - Adjust UMAP parameters to see how the clustering changes with different embeddings.
851
+ - Use the cluster details to investigate specific evaluations and prompts.
852
+
853
+ Enjoy exploring your evaluation data with clustering!
854
+ """)
855
+ else:
856
+ st.info("Not enough data for UMAP visualization. Please run more evaluations.")
857
+
858
+ # Worst Performing Slice Analysis
859
  st.subheader("Worst Performing Slice Analysis")
860
+
861
+ # Allow the user to select metrics to analyze
862
+ metrics = ['Accuracy', 'Hallucination', 'Groundedness', 'Relevance', 'Recall', 'Precision', 'Consistency', 'Bias Detection']
863
+ selected_metrics = st.multiselect("Select Metrics to Analyze", metrics, default=metrics)
864
+
865
+ if selected_metrics:
866
+ # Set a threshold for "poor performance"
867
+ threshold = st.slider("Performance Threshold (%)", min_value=0, max_value=100, value=50)
868
+
869
+ # Filter data where any of the selected metrics are below the threshold
870
+ mask = df[selected_metrics].lt(threshold).any(axis=1)
871
+ worst_performing_df = df[mask]
872
+
873
+ if not worst_performing_df.empty:
874
+ st.write(f"Found {len(worst_performing_df)} evaluations below the threshold of {threshold}% in the selected metrics.")
875
+
876
+ # Display the worst-performing prompts and their metrics
877
+ st.write("### Worst Performing Evaluations")
878
+ display_columns = ['prompt', 'response'] + selected_metrics + ['timestamp']
879
+ worst_performing_display_df = worst_performing_df[display_columns].copy()
880
+ worst_performing_display_df['timestamp'] = worst_performing_display_df['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')
881
+
882
+ # Apply styling to highlight low scores
883
+ def highlight_low_scores(val):
884
+ if isinstance(val, float):
885
+ if val < threshold:
886
+ return 'background-color: red; color: white'
887
+ return ''
888
+
889
+ styled_worst_df = worst_performing_display_df.style.applymap(highlight_low_scores, subset=selected_metrics)
890
+ styled_worst_df = styled_worst_df.format({metric: "{:.2f}%" for metric in selected_metrics})
891
+
892
+ st.dataframe(
893
+ styled_worst_df.set_properties(**{
894
+ 'text-align': 'left',
895
+ 'border': '1px solid #ddd'
896
+ }).set_table_styles([
897
+ {'selector': 'th', 'props': [('background-color', '#333'), ('color', 'white')]},
898
+ {'selector': 'td', 'props': [('vertical-align', 'top')]}
899
+ ]),
900
+ use_container_width=True
901
+ )
902
+
903
+ # Analyze the worst-performing slices based on prompt characteristics
904
+ st.write("### Analysis by Prompt Length")
905
+
906
+ # Add a column for prompt length
907
+ worst_performing_df['Prompt Length'] = worst_performing_df['prompt'].apply(lambda x: len(x.split()))
908
+
909
+ # Define bins for prompt length ranges
910
+ bins = [0, 5, 10, 20, 50, 100, 1000]
911
+ labels = ['0-5', '6-10', '11-20', '21-50', '51-100', '100+']
912
+ worst_performing_df['Prompt Length Range'] = pd.cut(worst_performing_df['Prompt Length'], bins=bins, labels=labels, right=False)
913
+
914
+ # Group by 'Prompt Length Range' and calculate average metrics
915
+ group_metrics = worst_performing_df.groupby('Prompt Length Range')[selected_metrics].mean().reset_index()
916
+
917
+ # Display the average metrics per prompt length range
918
+ st.write("#### Average Metrics per Prompt Length Range")
919
+ group_metrics = group_metrics.sort_values('Prompt Length Range')
920
+ st.dataframe(group_metrics.style.format({metric: "{:.2f}%" for metric in selected_metrics}))
921
+
922
+ # Visualization of average metrics per prompt length range
923
+ st.write("#### Visualization of Metrics by Prompt Length Range")
924
+ melted_group_metrics = group_metrics.melt(id_vars='Prompt Length Range', value_vars=selected_metrics, var_name='Metric', value_name='Average Score')
925
+ fig = px.bar(
926
+ melted_group_metrics,
927
+ x='Prompt Length Range',
928
+ y='Average Score',
929
+ color='Metric',
930
+ barmode='group',
931
+ title='Average Metric Scores by Prompt Length Range',
932
+ labels={'Average Score': 'Average Score (%)'},
933
+ height=600
934
+ )
935
+ st.plotly_chart(fig, use_container_width=True)
936
+
937
+ # Further analysis: show counts of worst-performing evaluations per model
938
+ st.write("### Worst Performing Evaluations per Model")
939
+ model_counts = worst_performing_df['model_name'].value_counts().reset_index()
940
+ model_counts.columns = ['Model Name', 'Count of Worst Evaluations']
941
+ st.dataframe(model_counts)
942
+
943
+ # Allow user to download the worst-performing data
944
+ csv = worst_performing_df.to_csv(index=False)
945
+ st.download_button(
946
+ label="Download Worst Performing Data as CSV",
947
+ data=csv,
948
+ file_name='worst_performing_data.csv',
949
+ mime='text/csv',
950
+ )
951
+ else:
952
+ st.info("No evaluations found below the specified threshold.")
953
+ else:
954
+ st.warning("Please select at least one metric to analyze.")
955
+
956
  else:
957
  st.info("No evaluation results available for the selected model.")
958
  except Exception as e:
959
+ st.error(f"Error processing data from database: {str(e)}")
960
  st.error("Detailed error information:")
 
 
961
  st.error(traceback.format_exc())
962
+ st.stop()
963
 
964
  elif app_mode == "Model Upload":
965
  st.title("Upload Your Model")
 
1018
  elif app_mode == "Prompt Testing":
1019
  st.title("Prompt Testing")
1020
 
 
1021
  model_selection_option = st.radio("Select Model Option:", ["Choose Existing Model", "Add New Model"])
1022
 
1023
  if model_selection_option == "Choose Existing Model":
 
1027
  if not user_models:
1028
  st.error("You have no uploaded models. Please upload a model first.")
1029
  else:
1030
+ model_options = [
1031
+ f"{model['model_name']} ({model.get('model_type', 'Unknown').capitalize()})"
1032
+ for model in user_models
1033
+ ]
1034
+ selected_model = st.selectbox("Select a Model for Testing", model_options)
1035
+
1036
+ model_name = selected_model.split(" (")[0]
1037
+ model_type = selected_model.split(" (")[1].rstrip(")")
1038
  else:
1039
+ # Code for adding a new model (unchanged)
1040
+ ...
1041
+
1042
+ st.subheader("Input for Model Testing")
1043
 
1044
+ # For simple models, we'll use a single JSON file
1045
+ if model_type.lower() == "simple":
1046
+ st.write("For simple models, please upload a single JSON file containing prompts, contexts, and responses.")
1047
+ json_file = st.file_uploader("Upload Test Data JSON", type=["json"])
1048
+
1049
+ if json_file is not None:
1050
  try:
1051
+ test_data = json.load(json_file)
1052
+ st.success("Test data JSON file uploaded successfully!")
1053
 
1054
+ # Display a preview of the test data
1055
+ st.write("Preview of test data:")
1056
+ st.json(test_data[:3] if len(test_data) > 3 else test_data)
1057
 
1058
  except json.JSONDecodeError:
1059
+ st.error("Invalid JSON format. Please check your file.")
1060
+ else:
1061
+ test_data = None
1062
+ else:
1063
+ # For other model types, keep the existing separate inputs for context and questions
1064
+ context_input_method = st.radio("Choose context input method:", ["Text Input", "File Upload"])
1065
+ if context_input_method == "Text Input":
1066
1067
  else:
1068
+ context_file = st.file_uploader("Upload Context Dataset", type=["txt"])
1069
+ if context_file is not None:
1070
+ context_dataset = context_file.getvalue().decode("utf-8")
1071
+ st.success("Context file uploaded successfully!")
1072
+ else:
1073
+ context_dataset = None
1074
 
1075
+ questions_input_method = st.radio("Choose questions input method:", ["Text Input", "File Upload"])
1076
+ if questions_input_method == "Text Input":
1077
+ questions_json = st.text_area("Enter Questions (JSON format):", height=200)
1078
+ else:
1079
+ questions_file = st.file_uploader("Upload Questions JSON", type=["json"])
1080
+ if questions_file is not None:
1081
+ questions_json = questions_file.getvalue().decode("utf-8")
1082
+ st.success("Questions file uploaded successfully!")
1083
+ else:
1084
+ questions_json = None
1085
+
1086
  if st.button("Run Test"):
1087
  if not model_name:
1088
  st.error("Please select or add a valid Model.")
1089
+ elif model_type.lower() == "simple" and test_data is None:
1090
+ st.error("Please upload a valid test data JSON file.")
1091
+ elif model_type.lower() != "simple" and (not context_dataset or not questions_json):
1092
+ st.error("Please provide both context dataset and questions JSON.")
1093
  else:
1094
+ try:
1095
+ selected_model = next(
1096
+ (m for m in user_models if m['model_name'] == model_name),
1097
+ None
1098
+ )
1099
+ if selected_model:
1100
+ with st.spinner("Starting evaluations..."):
1101
+ if model_type.lower() == "simple":
1102
+ evaluation_thread = threading.Thread(
1103
+ target=run_custom_evaluations,
1104
+ args=(test_data, selected_model, st.session_state.user)
1105
+ )
1106
+ else:
1107
+ questions = json.loads(questions_json)
1108
+ evaluation_thread = threading.Thread(
1109
+ target=run_custom_evaluations,
1110
+ args=((context_dataset, questions), selected_model, st.session_state.user)
1111
+ )
1112
+ evaluation_thread.start()
1113
+ st.success("Evaluations are running in the background. You can navigate away or close the site.")
1114
+ else:
1115
+ st.error("Selected model not found.")
1116
+ except json.JSONDecodeError:
1117
+ st.error("Invalid JSON format. Please check your input.")
1118
 
1119
  elif app_mode == "Manage Models":
1120
  st.title("Manage Your Models")
 
1125
  st.stop()
1126
  user_models = user.get("models", [])
1127
 
1128
+ # Update existing models to ensure they have a model_type
1129
+ for model in user_models:
1130
+ if 'model_type' not in model:
1131
+ model['model_type'] = 'simple' # Default to 'simple' for existing models
1132
+ users_collection.update_one(
1133
+ {"username": st.session_state.user},
1134
+ {"$set": {"models": user_models}}
1135
+ )
1136
+
1137
  st.subheader("Add a New Model")
1138
+ model_type = st.radio("Select Model Type:", ["Simple Model", "Custom Model"])
1139
 
1140
+ if model_type == "Simple Model":
1141
  new_model_name = st.text_input("Enter New Model Name:")
1142
+ if st.button("Add Simple Model"):
1143
+ if new_model_name:
1144
  model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
1145
+ model_data = {
1146
+ "model_id": model_id,
1147
+ "model_name": new_model_name if model_type == "Simple Model" else selected_custom_model,
1148
+ "model_type": "simple" if model_type == "Simple Model" else "custom",
1149
+ "file_path": None,
1150
+ "model_link": None,
1151
+ "uploaded_at": datetime.now(),
1152
+ "context": None # We'll update this when running evaluations
1153
+ }
1154
  users_collection.update_one(
1155
  {"username": st.session_state.user},
1156
+ {"$push": {"models": model_data}}
1157
  )
1158
+ st.success(f"Model '{model_data['model_name']}' added successfully as {model_id}!")
1159
  else:
1160
+ st.error("Please enter a valid model name or select a custom model.")
1161
+
1162
+ else: # Custom Model
1163
+ custom_model_options = ["gpt-4o", "gpt-4o-mini"]
1164
+ selected_custom_model = st.selectbox("Select Custom Model:", custom_model_options)
1165
+
1166
+ if st.button("Add Custom Model"):
1167
+ model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
1168
+ users_collection.update_one(
1169
+ {"username": st.session_state.user},
1170
+ {"$push": {"models": {
1171
+ "model_id": model_id,
1172
+ "model_name": selected_custom_model,
1173
+ "model_type": "custom",
1174
+ "file_path": None,
1175
+ "model_link": None,
1176
+ "uploaded_at": datetime.now()
1177
+ }}}
1178
+ )
1179
+ st.success(f"Custom Model '{selected_custom_model}' added successfully as {model_id}!")
1180
 
1181
  st.markdown("---")
1182
 
 
1184
  st.subheader("Your Models")
1185
  for model in user_models:
1186
  st.markdown(f"**Model ID:** {model['model_id']}")
1187
+ st.write(f"**Model Type:** {model.get('model_type', 'simple').capitalize()}")
1188
  if model.get("model_name"):
1189
  st.write(f"**Model Name:** {model['model_name']}")
 
 
1190
  if model.get("file_path"):
1191
  st.write(f"**File Path:** {model['file_path']}")
1192
  st.write(f"**Uploaded at:** {model['uploaded_at']}")
 
1217
  # Convert results to a pandas DataFrame
1218
  df = pd.DataFrame(user_results)
1219
 
1220
+ # Extract prompt text using the helper function
1221
+ df['prompt'] = df['prompt'].apply(extract_prompt_text)
1222
+
1223
  # Normalize the evaluation JSON into separate columns
1224
  eval_df = df['evaluation'].apply(pd.Series)
1225
  for metric in ["Accuracy", "Hallucination", "Groundedness", "Relevance", "Recall", "Precision", "Consistency", "Bias Detection"]:
 
1268
  'border': '1px solid #ddd'
1269
  }).set_table_styles([
1270
  {'selector': 'th', 'props': [('background-color', '#f5f5f5'), ('text-align', 'center')]},
1271
+ {'selector': 'td', 'props': [('text-align', 'left'), ('vertical-align', 'top')]}
1272
  ]).format({
1273
  "Accuracy (%)": "{:.2f}",
1274
  "Hallucination (%)": "{:.2f}",
 
1289
 
1290
  # Add a footer
1291
  st.sidebar.markdown("---")
1292
+ st.sidebar.info("LLM Evaluation System - v0.2")
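
This commit starts importing run_model and openai_client from a runner module that is not included in the diff. A minimal sketch of the interface app.py appears to rely on, assuming runner.py wraps the OpenAI chat-completions client — the function body and environment variable below are assumptions, not the repository's actual runner.py:

# runner.py — hypothetical sketch of the interface app.py uses in this commit
import os
from openai import OpenAI

# app.py does `from runner import openai_client` and reuses this client for teacher evaluations.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def run_model(model_name: str, prompt: str) -> str:
    # app.py calls run_model(model_name, prompt_text) once per test case and
    # expects a plain text answer (or an empty string) back.
    completion = openai_client.chat.completions.create(
        model=model_name,
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content or ""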