Spaces:

poemsforaphrodite
/

llm-eval

Running

App Files Files Community

poemsforaphrodite committed 20 days ago

Commit

05e591f

•

1 Parent(s): 80476ab

Update app.py

Browse files

Files changed (1) hide show

app.py +521 -122

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from pinecone import Pinecone, ServerlessSpec
 import threading  # {{ edit_25: Import threading for background processing }}
 import tiktoken
 from tiktoken.core import Encoding
-from runner import run_model
 from bson.objectid import ObjectId
 import traceback  # Add this import at the top of your file
 import umap
@@ -22,6 +22,39 @@ import plotly.graph_objs as go
 from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import KMeans
 import plotly.colors as plc
 # Add this helper function at the beginning of your file
 def extract_prompt_text(prompt):
@@ -81,8 +114,6 @@ def signup(username, password):
         "models": []  # List to store user's models
     })
     return True
-def upload_model(file):
-    return "Model uploaded successfully!"
 # Function to perform evaluation (placeholder)
 def evaluate_model(model_identifier, metrics, username):
@@ -151,10 +182,9 @@ def generate_embedding(text):
     try:
         embedding_response = openai_client.embeddings.create(
             model="text-embedding-3-large",  # {{ edit_3: Use the specified embedding model }}
-            input=text,
-            encoding_format="float"
         )
-        embedding = embedding_response["data"][0]["embedding"]
         return embedding
     except Exception as e:
         st.error(f"Error generating embedding: {str(e)}")
@@ -215,6 +245,7 @@ def index_context_data(model_name, texts):
                 ])
     except Exception as e:
         st.error(f"Error indexing data to Pinecone: {str(e)}")
 def upload_model(file, username, model_type):
     # {{ edit_5: Modify upload_model to handle model_type }}
     model_id = f"{username}_model_{int(datetime.now().timestamp())}"
@@ -251,7 +282,56 @@ def upload_model(file, username, model_type):
         return f"Named Model {model_id} registered successfully!"
     else:
         return "Invalid model type specified."
 # Function to save results to MongoDB
 def save_results(username, model, prompt, context, response, evaluation):  # {{ edit_29: Add 'username' parameter }}
     result = {
@@ -267,6 +347,87 @@ def save_results(username, model, prompt, context, response, evaluation):  # {{
     }
     results_collection.insert_one(result)
 # Modify the run_custom_evaluations function
 def run_custom_evaluations(data, selected_model, username):
     try:
@@ -278,12 +439,16 @@ def run_custom_evaluations(data, selected_model, username):
             # For simple models, data is already in the correct format
             test_cases = data
         else:
-            # For other models, data is split into context_dataset and questions
             context_dataset, questions = data
             test_cases = [
                 {
                     "prompt": extract_prompt_text(question),
-                    "context": context_dataset,
                     "response": ""  # This will be filled by the model
                 }
                 for question in questions
@@ -291,11 +456,18 @@ def run_custom_evaluations(data, selected_model, username):
         for test_case in test_cases:
             prompt_text = test_case["prompt"]
             context = test_case["context"]
             # Get the student model's response using runner.py
             try:
-                answer = run_model(model_name, prompt_text)
                 if answer is None or answer == "":
                     st.warning(f"No response received from the model for prompt: {prompt_text}")
                     answer = "No response received from the model."
@@ -421,17 +593,43 @@ if not st.session_state.user:
                 st.sidebar.error("Username already exists")
 else:
     st.sidebar.success(f"Welcome, {st.session_state.user}!")
     if st.sidebar.button("Logout"):
         st.session_state.user = None
         st.rerun()
 # App content
 if st.session_state.user:
-    app_mode = st.sidebar.selectbox("Choose the section", ["Dashboard", "Model Upload", "Evaluation", "Prompt Testing", "Manage Models", "History"])  # {{ edit_add: Added "History" to the sidebar navigation }}
-    if app_mode == "Dashboard":
         st.title("Dashboard")
         st.write("### Real-time Metrics and Performance Insights")
@@ -844,7 +1042,7 @@ if st.session_state.user:
                         - **Model Performance**: Analyze clusters to identify strengths and weaknesses of models.
                         - **Data Patterns**: Use clustering to uncover hidden patterns in your evaluation data.
-                        **Tips:**
                         - Experiment with different numbers of clusters to find meaningful groupings.
                         - Adjust UMAP parameters to see how the clustering changes with different embeddings.
@@ -961,7 +1159,7 @@ if st.session_state.user:
             st.error(traceback.format_exc())
             st.stop()
-    elif app_mode == "Model Upload":
         st.title("Upload Your Model")
         model_type = st.radio("Select Model Type", ["Custom", "Named"])  # {{ edit_6: Select model type }}
         uploaded_file = st.file_uploader("Choose a model file", type=[".pt", ".h5", ".bin"]) if model_type == "custom" else None
@@ -976,7 +1174,7 @@ if st.session_state.user:
             else:
                 st.error("Please upload a valid model file for Custom models.")
-    elif app_mode == "Evaluation":
         st.title("Evaluate Your Model")
         st.write("### Select Model and Evaluation Metrics")
@@ -1015,108 +1213,290 @@ if st.session_state.user:
                     else:
                         st.error("Selected model not found.")
-    elif app_mode == "Prompt Testing":
         st.title("Prompt Testing")
-        model_selection_option = st.radio("Select Model Option:", ["Choose Existing Model", "Add New Model"])
-        if model_selection_option == "Choose Existing Model":
-            user = users_collection.find_one({"username": st.session_state.user})
-            user_models = user.get("models", [])
-            if not user_models:
-                st.error("You have no uploaded models. Please upload a model first.")
-            else:
-                model_options = [
-                    f"{model['model_name']} ({model.get('model_type', 'Unknown').capitalize()})"
-                    for model in user_models
-                ]
-                selected_model = st.selectbox("Select a Model for Testing", model_options)
-                model_name = selected_model.split(" (")[0]
-                model_type = selected_model.split(" (")[1].rstrip(")")
         else:
-            # Code for adding a new model (unchanged)
-            ...
-        st.subheader("Input for Model Testing")
-        # For simple models, we'll use a single JSON file
-        if model_type.lower() == "simple":
-            st.write("For simple models, please upload a single JSON file containing prompts, contexts, and responses.")
-            json_file = st.file_uploader("Upload Test Data JSON", type=["json"])
-            if json_file is not None:
-                try:
-                    test_data = json.load(json_file)
-                    st.success("Test data JSON file uploaded successfully!")
-                    # Display a preview of the test data
-                    st.write("Preview of test data:")
-                    st.json(test_data[:3] if len(test_data) > 3 else test_data)
-                except json.JSONDecodeError:
-                    st.error("Invalid JSON format. Please check your file.")
-            else:
-                test_data = None
-        else:
-            # For other model types, keep the existing separate inputs for context and questions
-            context_input_method = st.radio("Choose context input method:", ["Text Input", "File Upload"])
-            if context_input_method == "Text Input":
-                context_dataset = st.text_area("Enter Context Dataset (txt):", height=200)
-            else:
-                context_file = st.file_uploader("Upload Context Dataset", type=["txt"])
-                if context_file is not None:
-                    context_dataset = context_file.getvalue().decode("utf-8")
-                    st.success("Context file uploaded successfully!")
-                else:
-                    context_dataset = None
-            questions_input_method = st.radio("Choose questions input method:", ["Text Input", "File Upload"])
-            if questions_input_method == "Text Input":
-                questions_json = st.text_area("Enter Questions (JSON format):", height=200)
-            else:
-                questions_file = st.file_uploader("Upload Questions JSON", type=["json"])
-                if questions_file is not None:
-                    questions_json = questions_file.getvalue().decode("utf-8")
-                    st.success("Questions file uploaded successfully!")
-                else:
-                    questions_json = None
-        if st.button("Run Test"):
-            if not model_name:
-                st.error("Please select or add a valid Model.")
-            elif model_type.lower() == "simple" and test_data is None:
-                st.error("Please upload a valid test data JSON file.")
-            elif model_type.lower() != "simple" and (not context_dataset or not questions_json):
-                st.error("Please provide both context dataset and questions JSON.")
-            else:
-                try:
-                    selected_model = next(
-                        (m for m in user_models if m['model_name'] == model_name),
-                        None
-                    )
-                    if selected_model:
-                        with st.spinner("Starting evaluations..."):
-                            if model_type.lower() == "simple":
-                                evaluation_thread = threading.Thread(
-                                    target=run_custom_evaluations,
-                                    args=(test_data, selected_model, st.session_state.user)
-                                )
                             else:
-                                questions = json.loads(questions_json)
-                                evaluation_thread = threading.Thread(
-                                    target=run_custom_evaluations,
-                                    args=((context_dataset, questions), selected_model, st.session_state.user)
-                                )
-                            evaluation_thread.start()
-                            st.success("Evaluations are running in the background. You can navigate away or close the site.")
                     else:
-                        st.error("Selected model not found.")
-                except json.JSONDecodeError:
-                    st.error("Invalid JSON format. Please check your input.")
-    elif app_mode == "Manage Models":
         st.title("Manage Your Models")
         # Fetch the user from the database
         user = users_collection.find_one({"username": st.session_state.user})
@@ -1135,17 +1515,17 @@ if st.session_state.user:
         )
         st.subheader("Add a New Model")
-        model_type = st.radio("Select Model Type:", ["Simple Model", "Custom Model"])
         if model_type == "Simple Model":
             new_model_name = st.text_input("Enter New Model Name:")
-            if st.button("Add Simple Model") or st.button("Add Custom Model"):
-                if new_model_name or selected_custom_model:
                     model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
                     model_data = {
                         "model_id": model_id,
-                        "model_name": new_model_name if model_type == "Simple Model" else selected_custom_model,
-                        "model_type": "simple" if model_type == "Simple Model" else "custom",
                         "file_path": None,
                         "model_link": None,
                         "uploaded_at": datetime.now(),
@@ -1155,11 +1535,11 @@ if st.session_state.user:
                         {"username": st.session_state.user},
                         {"$push": {"models": model_data}}
                     )
-                    st.success(f"Model '{model_data['model_name']}' added successfully as {model_id}!")
                 else:
-                    st.error("Please enter a valid model name or select a custom model.")
-        else:  # Custom Model
             custom_model_options = ["gpt-4o", "gpt-4o-mini"]
             selected_custom_model = st.selectbox("Select Custom Model:", custom_model_options)
@@ -1177,6 +1557,28 @@ if st.session_state.user:
                     }}}
                 )
                 st.success(f"Custom Model '{selected_custom_model}' added successfully as {model_id}!")
         st.markdown("---")
@@ -1202,10 +1604,11 @@ if st.session_state.user:
                         {"$pull": {"models": {"model_id": model['model_id']}}}
                     )
                     st.success(f"Model {model['model_id']} deleted successfully!")
         else:
             st.info("You have no uploaded models.")
-    elif app_mode == "History":  # {{ edit_add: Enhanced History UI }}
         st.title("History")
         st.write("### Your Evaluation History")
@@ -1285,8 +1688,4 @@ if st.session_state.user:
                 st.info("You have no evaluation history yet.")
         except Exception as e:
-            st.error(f"Error fetching history data: {e}")
-# Add a footer
-st.sidebar.markdown("---")
-st.sidebar.info("LLM Evaluation System - v0.2")

 import threading  # {{ edit_25: Import threading for background processing }}
 import tiktoken
 from tiktoken.core import Encoding
+from runner import run_model, summarize_image  # {{ edit_add: Import necessary functions }}
 from bson.objectid import ObjectId
 import traceback  # Add this import at the top of your file
 import umap
 from sklearn.preprocessing import StandardScaler
 from sklearn.cluster import KMeans
 import plotly.colors as plc
+import uuid
+import time  # Add this import at the top of your file
+from streamlit_webrtc import webrtc_streamer, WebRtcMode, RTCConfiguration, AudioProcessorBase
+import av
+import io
+from typing import List
+import requests
+import traceback
+# Add these imports at the beginning of your file
+from pydub import AudioSegment
+# Add this import at the top of your file
+import tempfile
+# Add this helper function for audio recording
+def process_audio(frame):
+    """Re-encode an incoming audio frame as an int16 mono ``av.AudioFrame``.
+
+    Args:
+        frame: Source audio frame; it must provide ``to_ndarray()`` and
+            ``sample_rate``.
+
+    Returns:
+        A new ``av.AudioFrame`` built from the int16-cast samples with a
+        "mono" layout, carrying the source frame's sample rate.
+    """
+    # NOTE(review): ``np`` is assumed to be numpy imported elsewhere in the file.
+    samples = frame.to_ndarray().astype(np.int16)
+    converted = av.AudioFrame.from_ndarray(samples, layout="mono")
+    # from_ndarray() takes no rate argument, so the rebuilt frame would not
+    # otherwise carry the rate of the audio it was made from.
+    converted.sample_rate = frame.sample_rate
+    return converted
+# Add this helper function to convert WebRTC audio to a file
+def webrtc_audio_to_file(audio_frames):
+    """Concatenate audio frames and return them encoded as WAV bytes.
+
+    Each frame is wrapped in a single-channel ``AudioSegment`` that uses the
+    frame's own sample width and sample rate. The segments are appended in
+    iteration order onto an empty segment and the result is exported to an
+    in-memory WAV file.
+
+    Args:
+        audio_frames: Iterable of frames exposing ``to_ndarray()``,
+            ``format.bytes`` and ``sample_rate``.
+
+    Returns:
+        The exported WAV file content as ``bytes``.
+    """
+    def _as_segment(frame):
+        # Raw sample bytes plus the frame's own width/rate; always one channel.
+        return AudioSegment(
+            data=frame.to_ndarray().tobytes(),
+            sample_width=frame.format.bytes,
+            frame_rate=frame.sample_rate,
+            channels=1
+        )
+    # sum() folds with ``+`` starting from the empty segment, i.e. the same
+    # left-to-right appends as an explicit accumulation loop.
+    combined = sum(map(_as_segment, audio_frames), AudioSegment.empty())
+    wav_buffer = io.BytesIO()
+    combined.export(wav_buffer, format="wav")
+    return wav_buffer.getvalue()
 # Add this helper function at the beginning of your file
 def extract_prompt_text(prompt):
         "models": []  # List to store user's models
     })
     return True
 # Function to perform evaluation (placeholder)
 def evaluate_model(model_identifier, metrics, username):
     try:
         embedding_response = openai_client.embeddings.create(
             model="text-embedding-3-large",  # {{ edit_3: Use the specified embedding model }}
+            input=text
         )
+        embedding = embedding_response.data[0].embedding
         return embedding
     except Exception as e:
         st.error(f"Error generating embedding: {str(e)}")
                 ])
     except Exception as e:
         st.error(f"Error indexing data to Pinecone: {str(e)}")
 def upload_model(file, username, model_type):
     # {{ edit_5: Modify upload_model to handle model_type }}
     model_id = f"{username}_model_{int(datetime.now().timestamp())}"
         return f"Named Model {model_id} registered successfully!"
     else:
         return "Invalid model type specified."
+    # {{ edit_30: Display uploaded models in the UI after uploading }}
+    st.write("### Uploaded Models")
+    user = users_collection.find_one({"username": username})
+    user_models = user.get("models", [])
+    for model in user_models:
+        st.write(f"- **{model['model_name']}** (ID: {model['model_id']})")
+def run_huggingface_evaluations(data, selected_model, username, timeout=60):
+    """Run each test case against a hosted Hugging Face model and save the graded results.
+
+    For every test case a "Context/Prompt" payload is POSTed to the model's
+    endpoint; a 200 response is graded with ``teacher_evaluate`` and stored
+    with ``save_results``. Progress and errors are reported through Streamlit.
+
+    Args:
+        data: Iterable of dicts; the optional "prompt" and "context" keys are read.
+        selected_model: Model record; "model_link" is used as the endpoint URL
+            and "model_api_token" as the bearer token.
+        username: Passed through to ``save_results``.
+        timeout: Seconds to wait for each API call before giving up.
+
+    Returns:
+        None. Nothing is raised to the caller; failures are shown via ``st.error``.
+    """
+    try:
+        api_endpoint = selected_model.get('model_link')
+        api_token = selected_model.get('model_api_token')
+        if not api_endpoint or not api_token:
+            st.error("API endpoint or token is missing for the selected Hugging Face model.")
+            return
+        headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json"
+        }
+        failed_calls = 0
+        for test_case in data:
+            prompt = test_case.get("prompt", "")
+            context = test_case.get("context", "")
+            # Prepare the payload for the Hugging Face API
+            payload = {
+                "inputs": f"Context: {context}\n\nPrompt: {prompt}"
+            }
+            # requests waits indefinitely unless a timeout is given, which
+            # would hang the whole evaluation on one unresponsive endpoint.
+            response = requests.post(api_endpoint, headers=headers, json=payload, timeout=timeout)
+            if response.status_code != 200:
+                failed_calls += 1
+                st.error(f"Error calling Hugging Face API: {response.status_code} - {response.text}")
+                continue
+            model_output = response.json()[0]['generated_text']
+            # Get the teacher's evaluation
+            evaluation = teacher_evaluate(prompt, context, model_output)
+            # Save the results
+            save_results(username, selected_model, prompt, context, model_output, evaluation)
+        # Only claim success when every call actually succeeded.
+        if failed_calls:
+            st.warning(f"Hugging Face model evaluation finished with {failed_calls} failed API call(s).")
+        else:
+            st.success("Hugging Face model evaluation completed successfully!")
+    except Exception as e:
+        st.error(f"Error in Hugging Face evaluation: {str(e)}")
+        st.error(f"Detailed error: {traceback.format_exc()}")
 # Function to save results to MongoDB
 def save_results(username, model, prompt, context, response, evaluation):  # {{ edit_29: Add 'username' parameter }}
     result = {
     }
     results_collection.insert_one(result)
+# Function to chunk text
+def chunk_text(text, max_tokens=500):
+    """Split *text* into consecutive pieces of at most *max_tokens* tokens.
+
+    The text is encoded with the module-level ``tokenizer``, the token list
+    is cut into runs of ``max_tokens`` (the last run may be shorter), and each
+    run is decoded back to a string.
+
+    Args:
+        text: Text to split.
+        max_tokens: Maximum number of tokens per chunk; must be at least 1.
+
+    Returns:
+        List of decoded chunk strings, empty when *text* encodes to no tokens.
+
+    Raises:
+        ValueError: If ``max_tokens`` is smaller than 1.
+    """
+    # A non-positive limit cannot yield meaningful chunks; previously it
+    # silently produced an empty first chunk followed by one-token chunks.
+    if max_tokens < 1:
+        raise ValueError("max_tokens must be at least 1")
+    tokens = tokenizer.encode(text)
+    return [
+        tokenizer.decode(tokens[start:start + max_tokens])
+        for start in range(0, len(tokens), max_tokens)
+    ]
+# Function to upload context to Pinecone
+def upload_context_to_pinecone(context, username, model_name):
+    """Chunk *context*, embed every chunk, and upsert the vectors into Pinecone.
+
+    The index name is read from the ``PINECONE_INDEX_NAME`` environment
+    variable. Each chunk is stored under a fresh UUID with its text as
+    metadata, in the namespace ``"<username>_<model_name>"``.
+
+    Args:
+        context: Full context text to index.
+        username: First part of the namespace.
+        model_name: Second part of the namespace.
+    """
+    pieces = chunk_text(context)
+    pinecone_index = pinecone_client.Index(os.getenv('PINECONE_INDEX_NAME'))
+    # One namespace per user/model pair keeps their contexts separated.
+    target_namespace = f"{username}_{model_name}"
+    for piece in pieces:
+        vector = generate_embedding(piece)
+        if not vector:
+            # No embedding came back for this chunk, so there is nothing to store.
+            continue
+        record = {
+            "id": str(uuid.uuid4()),
+            "values": vector,
+            "metadata": {"text": piece}
+        }
+        pinecone_index.upsert([record], namespace=target_namespace)
+# Function to retrieve relevant context from Pinecone
+def retrieve_context_from_pinecone(prompt, username, model_name):
+    """Return stored context text related to *prompt* from Pinecone.
+
+    The prompt is embedded and used to query the index named by the
+    ``PINECONE_INDEX_NAME`` environment variable, restricted to the
+    ``"<username>_<model_name>"`` namespace. The ``text`` metadata of the top
+    five matches is joined with single spaces.
+
+    Args:
+        prompt: Query text.
+        username: First part of the namespace.
+        model_name: Second part of the namespace.
+
+    Returns:
+        The joined context string, or ``""`` when no embedding was produced.
+    """
+    pinecone_index = pinecone_client.Index(os.getenv('PINECONE_INDEX_NAME'))
+    query_vector = generate_embedding(prompt)
+    # Must match the namespace format used when the context was uploaded.
+    target_namespace = f"{username}_{model_name}"
+    if not query_vector:
+        return ""
+    response = pinecone_index.query(
+        vector=query_vector,
+        top_k=5,
+        namespace=target_namespace,
+        include_metadata=True
+    )
+    matched_texts = [match.metadata['text'] for match in response.matches]
+    return " ".join(matched_texts)
+def transcribe_audio(audio_file):
+    """Transcribe an uploaded audio file with OpenAI's ``whisper-1`` model.
+
+    The upload is copied to a temporary ``.wav`` file, that file is sent to
+    the transcription endpoint with ``response_format="text"``, and the
+    temporary file is removed afterwards whether or not transcription worked.
+
+    Args:
+        audio_file: File-like object whose ``read()`` returns the audio bytes.
+
+    Returns:
+        The transcription result, or ``None`` if any step failed (the error
+        is shown via ``st.error``).
+    """
+    temp_audio_path = None
+    try:
+        # Save the uploaded file to a temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
+            # Record the path before writing so cleanup still happens if the write fails.
+            temp_audio_path = temp_audio.name
+            temp_audio.write(audio_file.read())
+        # Transcribe the audio using OpenAI's Whisper model
+        with open(temp_audio_path, "rb") as wav_handle:
+            transcript = openai_client.audio.transcriptions.create(
+                model="whisper-1",
+                file=wav_handle,
+                response_format="text"
+            )
+        return transcript
+    except Exception as e:
+        st.error(f"Error transcribing audio: {str(e)}")
+        return None
+    finally:
+        # Always remove the temporary file; it was created with delete=False
+        # and would otherwise be left behind whenever transcription raises.
+        if temp_audio_path is not None:
+            try:
+                os.unlink(temp_audio_path)
+            except OSError:
+                # Best-effort cleanup: a failed delete must not discard the transcript.
+                pass
 # Modify the run_custom_evaluations function
 def run_custom_evaluations(data, selected_model, username):
     try:
             # For simple models, data is already in the correct format
             test_cases = data
         else:
+            # For custom models, data is split into context_dataset and questions
             context_dataset, questions = data
+            # Upload context to Pinecone with user and model-specific namespace
+            upload_context_to_pinecone(context_dataset, username, model_name)
             test_cases = [
                 {
                     "prompt": extract_prompt_text(question),
+                    "context": "",  # This will be filled with retrieved context
                     "response": ""  # This will be filled by the model
                 }
                 for question in questions
         for test_case in test_cases:
             prompt_text = test_case["prompt"]
+            # For custom models, retrieve context from Pinecone using the user and model-specific namespace
+            if model_type != 'simple':
+                retrieved_context = retrieve_context_from_pinecone(prompt_text, username, model_name)
+                test_case["context"] = retrieved_context
             context = test_case["context"]
             # Get the student model's response using runner.py
             try:
+                # Pass both prompt and context to run_model
+                answer = run_model(model_name, prompt_text, context)
                 if answer is None or answer == "":
                     st.warning(f"No response received from the model for prompt: {prompt_text}")
                     answer = "No response received from the model."
                 st.sidebar.error("Username already exists")
 else:
     st.sidebar.success(f"Welcome, {st.session_state.user}!")
+    # Separate links for each section
+    if st.sidebar.button("Dashboard"):
+        st.session_state.app_mode = "Dashboard"
+        st.rerun()
+    if st.sidebar.button("Model Upload"):
+        st.session_state.app_mode = "Model Upload"
+        st.rerun()
+    if st.sidebar.button("Evaluation"):
+        st.session_state.app_mode = "Evaluation"
+        st.rerun()
+    if st.sidebar.button("Prompt Testing"):
+        st.session_state.app_mode = "Prompt Testing"
+        st.rerun()
+    if st.sidebar.button("Manage Models"):
+        st.session_state.app_mode = "Manage Models"
+        st.rerun()
+    if st.sidebar.button("History"):
+        st.session_state.app_mode = "History"
+        st.rerun()
     if st.sidebar.button("Logout"):
         st.session_state.user = None
+        st.session_state.app_mode = None
         st.rerun()
 # App content
 if st.session_state.user:
+    if 'app_mode' not in st.session_state:
+        st.session_state.app_mode = "Dashboard"
+    if st.session_state.app_mode == "Dashboard":
         st.title("Dashboard")
         st.write("### Real-time Metrics and Performance Insights")
                         - **Model Performance**: Analyze clusters to identify strengths and weaknesses of models.
                         - **Data Patterns**: Use clustering to uncover hidden patterns in your evaluation data.
+                        **Tips:**
                         - Experiment with different numbers of clusters to find meaningful groupings.
                         - Adjust UMAP parameters to see how the clustering changes with different embeddings.
             st.error(traceback.format_exc())
             st.stop()
+    elif st.session_state.app_mode == "Model Upload":
         st.title("Upload Your Model")
         model_type = st.radio("Select Model Type", ["Custom", "Named"])  # {{ edit_6: Select model type }}
         uploaded_file = st.file_uploader("Choose a model file", type=[".pt", ".h5", ".bin"]) if model_type == "custom" else None
             else:
                 st.error("Please upload a valid model file for Custom models.")
+    elif st.session_state.app_mode == "Evaluation":
         st.title("Evaluate Your Model")
         st.write("### Select Model and Evaluation Metrics")
                     else:
                         st.error("Selected model not found.")
+    elif st.session_state.app_mode == "Prompt Testing":
         st.title("Prompt Testing")
+        user = users_collection.find_one({"username": st.session_state.user})
+        user_models = user.get("models", [])
+        if not user_models:
+            st.error("You have no uploaded models. Please upload a model first.")
         else:
+            model_options = [
+                f"{model['model_name']} ({model.get('model_type', 'Unknown').capitalize()})"
+                for model in user_models
+            ]
+            selected_model = st.selectbox("Select a Model for Testing", model_options)
+            model_name = selected_model.split(" (")[0]
+            model_type = selected_model.split(" (")[1].rstrip(")")
+            st.subheader("Input for Model Testing")
+            if model_type.lower() == "simple":
+                input_type = st.radio("Select Input Type:", ["Text", "Audio", "Image"])
+            elif model_type.lower() == "custom":
+                input_type = "Text"
+            elif model_type.lower() == "huggingface":
+                input_type = "Text"
+            if input_type == "Text":
+                if model_type.lower() == "simple":
+                    st.write("For simple models, please upload a single JSON file containing prompts, contexts, and responses.")
+                    json_file = st.file_uploader("Upload Test Data JSON", type=["json"])
+                    if json_file is not None:
+                        try:
+                            test_data = json.load(json_file)
+                            st.success("Test data JSON file uploaded successfully!")
+                            # Display a preview of the test data
+                            st.write("Preview of test data:")
+                            st.json(test_data[:3] if len(test_data) > 3 else test_data)
+                        except json.JSONDecodeError:
+                            st.error("Invalid JSON format. Please check your file.")
+                    else:
+                        test_data = None
+                elif model_type.lower() == "custom":
+                    # For other model types, keep the existing separate inputs for context and questions
+                    context_file = st.file_uploader("Upload Context Dataset", type=["txt"])
+                    if context_file is not None:
+                        context_dataset = context_file.getvalue().decode("utf-8")
+                        st.success("Context file uploaded successfully!")
+                        # Upload context to Pinecone with user and model-specific namespace
+                        upload_context_to_pinecone(context_dataset, st.session_state.user, model_name)
+                    else:
+                        context_dataset = None
+                    questions_file = st.file_uploader("Upload Questions JSON", type=["json"])
+                    if questions_file is not None:
+                        questions_json = questions_file.getvalue().decode("utf-8")
+                        st.success("Questions file uploaded successfully!")
+                    else:
+                        questions_json = None
+                elif model_type.lower() == "huggingface":
+                    st.write("For Hugging Face models, please enter your prompt:")
+                    context_file = st.file_uploader("Upload Context Dataset", type=["txt"])
+                    if context_file is not None:
+                        context_dataset = context_file.getvalue().decode("utf-8")
+                        st.success("Context file uploaded successfully!")
+                    else:
+                        context_dataset = None
+                    questions_file = st.file_uploader("Upload Questions JSON", type=["json"])
+                    if questions_file is not None:
+                        questions_json = questions_file.getvalue().decode("utf-8")
+                        st.success("Questions file uploaded successfully!")
+                    else:
+                        questions_json = None
+            elif input_type == "Audio":
+                st.write("Please upload audio files for Prompts, Contexts, and Responses.")
+                prompt_audio = st.file_uploader("Upload Prompt Audio", type=["mp3", "wav"])
+                context_audio = st.file_uploader("Upload Context Audio", type=["mp3", "wav"])
+                response_audio = st.file_uploader("Upload Response Audio", type=["mp3", "wav"])
+                if prompt_audio:
+                    st.audio(prompt_audio, format='audio/wav')
+                    st.write(f"**Uploaded Prompt Audio:** {prompt_audio.name}")
+                if context_audio:
+                    st.audio(context_audio, format='audio/wav')
+                    st.write(f"**Uploaded Context Audio:** {context_audio.name}")
+                if response_audio:
+                    st.audio(response_audio, format='audio/wav')
+                    st.write(f"**Uploaded Response Audio:** {response_audio.name}")
+            elif input_type == "Image":
+                st.write("Please upload image files for Prompt, Context, and Response.")
+                prompt_image = st.file_uploader("Upload Prompt Image", type=["png", "jpg", "jpeg"])
+                context_image = st.file_uploader("Upload Context Image", type=["png", "jpg", "jpeg"])
+                response_image = st.file_uploader("Upload Response Image", type=["png", "jpg", "jpeg"])
+                if prompt_image:
+                    st.image(prompt_image, caption='Uploaded Prompt Image.', use_column_width=True)
+                    st.write(f"**Uploaded Prompt Image:** {prompt_image.name}")
+                if context_image:
+                    st.image(context_image, caption='Uploaded Context Image.', use_column_width=True)
+                    st.write(f"**Uploaded Context Image:** {context_image.name}")
+                if response_image:
+                    st.image(response_image, caption='Uploaded Response Image.', use_column_width=True)
+                    st.write(f"**Uploaded Response Image:** {response_image.name}")
+            # {{ edit_final: Handle Run Test for Image input with three images }}
+            if st.button("Run Test"):
+                if not model_name:
+                    st.error("Please select a valid Model.")
+                elif input_type == "Text":
+                    if model_type.lower() == "simple" and test_data is None:
+                        st.error("Please upload a valid test data JSON file.")
+                    elif model_type.lower() != "simple" and (not context_dataset or not questions_json):
+                        st.error("Please provide both context dataset and questions JSON.")
+                    else:
+                        try:
+                            selected_model_data = next(
+                                (m for m in user_models if m['model_name'] == model_name),
+                                None
+                            )
+                            if selected_model_data:
+                                with st.spinner("Starting evaluations..."):
+                                    if model_type.lower() == "simple":
+                                        run_custom_evaluations(test_data, selected_model_data, st.session_state.user)
+                                        st.success("Simple model evaluations are running in the background. You can navigate away or close the site.")
+                                    elif model_type.lower() == "custom":
+                                        questions = json.loads(questions_json)
+                                        run_custom_evaluations((context_dataset, questions), selected_model_data, st.session_state.user)
+                                        st.success("Custom model evaluations are running in the background. You can navigate away or close the site.")
+                                    elif model_type.lower() == "huggingface":
+                                        if not context_dataset or not questions_json:
+                                            st.error("Please provide both context dataset and questions JSON.")
+                                        else:
+                                            try:
+                                                questions = json.loads(questions_json)
+                                                test_data = [
+                                                    {
+                                                        "prompt": extract_prompt_text(question),
+                                                        "context": context_dataset
+                                                    }
+                                                    for question in questions
+                                                ]
+                                                run_huggingface_evaluations(test_data, selected_model_data, st.session_state.user)
+                                                st.success("Hugging Face model evaluations are running in the background. You can navigate away or close the site.")
+                                            except Exception as e:
+                                                st.error(f"An error occurred: {str(e)}")
+                                                st.error(f"Detailed error: {traceback.format_exc()}")
                             else:
+                                st.error("Selected model not found.")
+                        except Exception as e:
+                            st.error(f"An error occurred: {str(e)}")
+                            st.error(f"Detailed error: {traceback.format_exc()}")
+                        st.success("Evaluations are running in the background. You can navigate away or close the site.")
+                elif input_type == "Audio":
+                    if model_type.lower() == "simple" and test_data is None:
+                        st.error("Please upload a valid test data JSON file.")
+                    elif model_type.lower() != "simple" and (not context_dataset or not questions_json):
+                        st.error("Please provide both context dataset and questions JSON.")
                     else:
+                        try:
+                            selected_model = next(
+                                (m for m in user_models if m['model_name'] == model_name),
+                                None
+                            )
+                            if selected_model:
+                                with st.spinner("Processing audio files..."):
+                                    prompt_text = transcribe_audio(prompt_audio)
+                                    context_text = transcribe_audio(context_audio)
+                                    response_text = transcribe_audio(response_audio)
+                                test_data = [
+                                    {
+                                        "prompt": prompt_text,
+                                        "context": context_text,
+                                        "response": response_text
+                                    }
+                                ]
+                                with st.spinner("Starting evaluations..."):
+                                    evaluation_thread = threading.Thread(
+                                        target=run_custom_evaluations,
+                                        args=(test_data, selected_model, st.session_state.user)
+                                    )
+                                    evaluation_thread.start()
+                                    st.success("Evaluations are running in the background. You can navigate away or close the site.")
+                            else:
+                                st.error("Selected model not found.")
+                        except Exception as e:
+                            st.error(f"An error occurred: {e}")
+                elif input_type == "Image":
+                    if not (prompt_image and context_image and response_image):
+                        st.error("Please upload all three image files: Prompt, Context, and Response.")
+                    else:
+                        try:
+                            selected_model = next(
+                                (m for m in user_models if m['model_name'] == model_name),
+                                None
+                            )
+                            if selected_model:
+                                with st.spinner("Processing images and starting evaluations..."):
+                                    # Convert images to binary
+                                    prompt_bytes = prompt_image.read()
+                                    context_bytes = context_image.read()
+                                    response_bytes = response_image.read()
+                                    # Use runner.py to summarize the images
+                                    prompt_summary = summarize_image(prompt_bytes)
+                                    context_summary = summarize_image(context_bytes)
+                                    response_summary = summarize_image(response_bytes)
+                                    if prompt_summary and context_summary and response_summary:
+                                        # Prepare test data with summaries
+                                        test_data = [
+                                            {
+                                                "prompt": prompt_summary,
+                                                "context": context_summary,
+                                                "response": response_summary
+                                            }
+                                        ]
+                                        # Start the evaluation in a separate thread
+                                        evaluation_thread = threading.Thread(
+                                            target=run_custom_evaluations,
+                                            args=(test_data, selected_model, st.session_state.user)
+                                        )
+                                        evaluation_thread.start()
+                                        st.success("Images processed and evaluations are running in the background. You can navigate away or close the site.")
+                                    else:
+                                        st.error("Failed to generate summaries for the uploaded images.")
+                            else:
+                                st.error("Selected model not found.")
+                        except Exception as e:
+                            st.error(f"An error occurred: {e}")
+                elif input_type == "Image":
+                    if not (prompt_image and context_image and response_image):
+                        st.error("Please upload all three image files: Prompt, Context, and Response.")
+                    else:
+                        try:
+                            selected_model = next(
+                                (m for m in user_models if m['model_name'] == model_name),
+                                None
+                            )
+                            if selected_model:
+                                with st.spinner("Processing images and starting evaluations..."):
+                                    # Convert images to binary
+                                    prompt_bytes = prompt_image.read()
+                                    context_bytes = context_image.read()
+                                    response_bytes = response_image.read()
+                                    # Use runner.py to summarize the images
+                                    prompt_summary = summarize_image(prompt_bytes)
+                                    context_summary = summarize_image(context_bytes)
+                                    response_summary = summarize_image(response_bytes)
+                                    if prompt_summary and context_summary and response_summary:
+                                        # Prepare test data with summaries
+                                        test_data = [
+                                            {
+                                                "prompt": prompt_summary,
+                                                "context": context_summary,
+                                                "response": response_summary
+                                            }
+                                        ]
+                                        # Start the evaluation in a separate thread
+                                        evaluation_thread = threading.Thread(
+                                            target=run_custom_evaluations,
+                                            args=(test_data, selected_model, st.session_state.user)
+                                        )
+                                        evaluation_thread.start()
+                                        st.success("Images processed and evaluations are running in the background. You can navigate away or close the site.")
+                                    else:
+                                        st.error("Failed to generate summaries for the uploaded images.")
+                            else:
+                                st.error("Selected model not found.")
+                        except Exception as e:
+                            st.error(f"An error occurred: {e}")
+    elif st.session_state.app_mode == "Manage Models":
         st.title("Manage Your Models")
         # Fetch the user from the database
         user = users_collection.find_one({"username": st.session_state.user})
         )
         st.subheader("Add a New Model")
+        model_type = st.radio("Select Model Type:", ["Simple Model", "Custom Model","huggingface"])
         if model_type == "Simple Model":
             new_model_name = st.text_input("Enter New Model Name:")
+            if st.button("Add Simple Model"):
+                if new_model_name:
                     model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
                     model_data = {
                         "model_id": model_id,
+                        "model_name": new_model_name,
+                        "model_type": "simple",
                         "file_path": None,
                         "model_link": None,
                         "uploaded_at": datetime.now(),
                         {"username": st.session_state.user},
                         {"$push": {"models": model_data}}
                     )
+                    st.success(f"Model '{new_model_name}' added successfully as {model_id}!")
                 else:
+                    st.error("Please enter a valid model name.")
+        elif model_type == "Custom Model":  # Custom Model
             custom_model_options = ["gpt-4o", "gpt-4o-mini"]
             selected_custom_model = st.selectbox("Select Custom Model:", custom_model_options)
                     }}}
                 )
                 st.success(f"Custom Model '{selected_custom_model}' added successfully as {model_id}!")
+        else:
+            model_name = st.text_input("Enter Hugging Face Model Name:")
+            api_endpoint = st.text_input("Enter Hugging Face API Endpoint:")
+            api_token = st.text_input("Enter Hugging Face API Token:", type="password")
+            if st.button("Add Hugging Face Model"):
+                if api_endpoint and api_token:
+                    model_id = f"{st.session_state.user}_model_{int(datetime.now().timestamp())}"
+                    model_data = {
+                        "model_id": model_id,
+                        "model_name": model_name,
+                        "model_type": "huggingface",
+                        "file_path": None,
+                        "model_link": api_endpoint,
+                        "model_api_token": api_token,
+                        "uploaded_at": datetime.now()
+                    }
+                    users_collection.update_one(
+                        {"username": st.session_state.user},
+                        {"$push": {"models": model_data}}
+                    )
+                st.success(f"Hugging Face Model '{model_name}' added successfully as {model_id}!")
         st.markdown("---")
                         {"$pull": {"models": {"model_id": model['model_id']}}}
                     )
                     st.success(f"Model {model['model_id']} deleted successfully!")
+                    st.rerun()
         else:
             st.info("You have no uploaded models.")
+    elif st.session_state.app_mode == "History":
         st.title("History")
         st.write("### Your Evaluation History")
                 st.info("You have no evaluation history yet.")
         except Exception as e:
+            st.error(f"Error fetching history data: {e}")