"""Streamlit dashboard: synthetic multi-modal dataset generation & evaluation explorer.

NOTE(review): this file was recovered from a whitespace-mangled paste; the
formatting below is a reconstruction. Runtime strings are preserved verbatim.
"""
import streamlit as st
import pandas as pd
import json
import base64
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import gdown

# ----------------- APP CONFIG -----------------
st.set_page_config(page_title="Synthetic Data Generation for Multi-modal LLMs", layout="wide")

# ----------------- SIDEBAR NAVIGATION -----------------
st.sidebar.title("🔗 Navigation")
page = st.sidebar.radio("Go to:", ["🏠 Home", "📊 Dataset Explorer"])

# ----------------- HOME PAGE -----------------
if page == "🏠 Home":
    st.title("🧠 Synthetic Data Generation for Multi-modal LLMs")
    st.markdown("""
## Welcome to the Synthetic Data Generation Dashboard!

This project focuses on generating high-quality **multi-modal datasets** using **Gemini AI**
and evaluating chatbot responses with **Gemini AI** based on 3H (Honesty, Helpfulness and
Harmlessness) parameters.

### 📌 **Project Objectives**
- Generate synthetic **human-bot conversations** based on **text and images**.
- Ensure **ethical AI** by preventing biased, toxic, or identifiable personal information.
- **Evaluate** chatbot responses using **multiple LLM models** to assess quality.

### 🔍 **Methodology**
1. **Synthetic Data Generation**:
   - Uses **Gemini AI** to generate human-bot conversations.
   - Includes **multi-turn dialogues** with references to images.
2. **Dataset Evaluation**:
   - Uses **Gemini** to provide **7 evaluation scores** per conversation:
     - **Relevance, Coherence, Factual Accuracy, Bias, Fluency, Image Alignment, Creativity**.
3. **Dataset Explorer & Visualization**:
   - Interactive filtering and visualization of scores.
   - Image thumbnail previews for conversations.

### 🚀 **Key Features**
- 📊 **Dataset Filtering & Score Visualization**
- 🖼️ **Image Previews & Mapping**
- 📥 **Download Filtered Dataset**
""")
    st.info("🔄 Use the sidebar to navigate to the **Dataset Explorer**!")

# ----------------- DATASET EXPLORER PAGE -----------------
elif page == "📊 Dataset Explorer":
    st.title("📊 Dataset Explorer")

    # Sidebar: Dataset Selection
    st.sidebar.header("📂 Select Dataset Category")
    dataset_category = st.sidebar.selectbox(
        "Choose Dataset Type",
        ["Anime", "Celeb", "Meme", "Clustered", "Combined"],
    )

    # Google Drive file IDs per category. (Local ../Final_Datasets/*.json paths
    # were replaced by Drive downloads; dead local-path code removed.)
    dataset_paths = {
        "Anime": "18EA2dgaMPxuJ1VGeYYgfp9TXXyjmLuIK",
        "Celeb": "1zhmP7QrD_ZZN8Mm5ekHZMPyVmwN877D_",
        "Meme": "1SzE0BKiOo7xV7R7D1Vr30pnoKTcyoXqu",
        "Clustered": "1Dz25PN-54OYPD0ZZ9fb9apGC40Z0bK6-",
        "Combined": "196X5cOhQu-KRyyUHxAGyNynTu38oR-Jh",
    }
    evaluation_paths = {
        "Anime": "1mwxYkfKN6ACy-zr-xPlFDhe2YCqmC9oU",
        "Celeb": "1Srcb3wWA1khv2ZQMSt8oRMqSjTmiLlqz",
        "Meme": "1HZtLo8iJo2rz32eJ8lVBYiZ6zo3H6C4W",
        "Clustered": "154nbfikh9VuPnER-XNxoo3ureVNKF-0o",
        "Combined": "1bVFfXtQBCfku3R3JZpAPM76nEpimF9AD",
    }
sep="_") @st.cache_data def download_from_gdrive(file_id): """Downloads a file from Google Drive and returns its local path.""" url = f"https://drive.google.com/uc?id={file_id}" output = f"temp_{file_id}.json" # Unique temp filename gdown.download(url, output, quiet=False) return output def load_conversation_data(file_id, chunk_size=500): """ Lazily loads large conversation datasets in chunks to prevent memory overflow. Returns only the first chunk. """ # Read from local # with open(json_file, "r") as file: # data = json.load(file) # Load JSON normally # df = pd.json_normalize(data, sep="_") # Convert JSON to DataFrame # return df.iloc[:chunk_size] # Load only the first `chunk_size` rows # Read from google drive json_file = download_from_gdrive(file_id) with open(json_file, "r") as file: data = json.load(file) # Load JSON normally df = pd.json_normalize(data, sep="_") # Convert JSON to DataFrame return df.iloc[:chunk_size] # Load only the first `chunk_size` rows @st.cache_data def load_evaluation_data(file_id): #Read from local # with open(json_file, "r") as file: # data = json.load(file) # for entry in data: # for key, value in entry["evaluation_scores"].items(): # entry["evaluation_scores"][key] = value["score"] # Keep only scores # read from google drive """Loads evaluation data and keeps only the scores.""" json_file = download_from_gdrive(file_id) with open(json_file, "r") as file: data = json.load(file) for entry in data: for key, value in entry["evaluation_scores"].items(): entry["evaluation_scores"][key] = value["score"] # Keep only scores return pd.json_normalize(data, sep="_") @st.cache_data def convert_df_to_json(df): return df.to_json(orient="records", indent=4) # Function to decode base64 image def decode_base64_image(encoded_string): """Decodes a base64 image and returns an HTML image tag.""" return f'' # Load selected dataset conversation_data = load_conversation_data(dataset_paths[dataset_category]) evaluation_data = 
load_evaluation_data(evaluation_paths[dataset_category]) # Merge evaluation scores into conversation data merged_data = conversation_data.merge(evaluation_data, on="conversation_id", how="left") # ----------------- FILTERING OPTIONS ----------------- st.sidebar.header("🔍 Filter Options") if "images" in merged_data.columns: image_counts = merged_data['images'].apply(len).unique() selected_image_count = st.sidebar.multiselect("Select Number of Images", image_counts, default=image_counts) score_columns = [col for col in evaluation_data.columns if "_score" in col] selected_score = None if score_columns: selected_score = st.sidebar.selectbox("Filter by Score Metric", score_columns) min_score, max_score = st.sidebar.slider("Select Score Range", 0, 10, (5, 10)) else: st.sidebar.error("⚠️ No evaluation score columns found!") search_text = st.sidebar.text_input("Search in Conversation") # Apply Filters filtered_conversations = merged_data.copy() if "images" in merged_data.columns and selected_image_count: filtered_conversations = filtered_conversations[filtered_conversations['images'].apply(len).isin(selected_image_count)] if selected_score and selected_score in merged_data.columns: filtered_conversations = filtered_conversations[filtered_conversations[selected_score].between(min_score, max_score)] if search_text and "conversation" in merged_data.columns: filtered_conversations = filtered_conversations[filtered_conversations["conversation"].str.contains(search_text, case=False, na=False)] # ----------------- DISPLAY FILTERED DATA ----------------- # st.subheader("📊 Filtered Conversations") # if not filtered_conversations.empty: # json_data = convert_df_to_json(filtered_conversations) # st.download_button("📥 Download Filtered Data (JSON)", data=json_data, file_name="filtered_dataset.json", mime="application/json") # st.dataframe(filtered_conversations) # else: # st.warning("⚠️ No data matches your filters.") # # ----------------- VISUALIZATIONS ----------------- # if not 
filtered_conversations.empty: # avg_scores = filtered_conversations[score_columns].mean().reset_index() # avg_scores.columns = ["Metric", "Average Score"] # avg_scores["Metric"] = avg_scores["Metric"].str.replace("evaluation_scores_", "").str.replace("_score", "").str.replace("_", " ").str.title() # st.subheader("📊 Average Scores by Metric (Filtered Data)") # fig = px.bar(avg_scores, x="Metric", y="Average Score", color="Metric", text="Average Score") # st.plotly_chart(fig) # st.subheader("🔥 Heatmap of Evaluation Scores") # plt.figure(figsize=(10, 5)) # sns.heatmap(filtered_conversations[score_columns].corr(), annot=True, cmap="coolwarm", fmt=".2f") # st.pyplot(plt) # ✅ Define evaluation score columns score_columns = [ "evaluation_scores_Relevance", "evaluation_scores_Coherence", "evaluation_scores_Factual Accuracy", "evaluation_scores_Bias & Toxicity", "evaluation_scores_Fluency", "evaluation_scores_Image Alignment", "evaluation_scores_Creativity" ] # ✅ Ensure filtered_data is not empty before calculations if not filtered_conversations.empty: # ✅ Compute average scores avg_scores = filtered_conversations[score_columns].mean().reset_index() avg_scores.columns = ["Metric", "Average Score"] # Rename columns # ✅ Rename metrics for better readability clean_labels = { "evaluation_scores_Relevance": "Relevance", "evaluation_scores_Coherence": "Coherence", "evaluation_scores_Factual Accuracy": "Factual Accuracy", "evaluation_scores_Bias & Toxicity": "Bias & Toxicity", "evaluation_scores_Fluency": "Fluency", "evaluation_scores_Image Alignment": "Image Alignment", "evaluation_scores_Creativity": "Creativity" } avg_scores["Metric"] = avg_scores["Metric"].replace(clean_labels) # ✅ Re-plot bar chart with updated labels st.subheader("📊 Average Scores by Metric (Filtered Data)") fig = px.bar(avg_scores, x="Metric", y="Average Score", color="Metric", text="Average Score") fig.update_layout(xaxis_title="Evaluation Metric", yaxis_title="Average Score") st.plotly_chart(fig) else: 
st.warning("⚠️ No data available after filtering. Adjust filters to see results.") # Show Filtered Dataset with Image Thumbnails and Image-to-Tag Mapping st.subheader("📊 Filtered Conversations") if not filtered_conversations.empty: json_data = convert_df_to_json(filtered_conversations) st.download_button( label="📥 Download Filtered Data (JSON)", data=json_data, file_name="filtered_dataset.json", mime="application/json", ) for index, row in filtered_conversations.iterrows(): st.markdown(f"### **Conversation ID: {row['conversation_id']}**") # Image-to-Tag Mapping st.markdown("**📷 Image-to-Tag Mapping:**") image_mappings = {} for idx, img_data in enumerate(row["images"]): img_name = img_data["name"] img_tag = f"" image_mappings[img_tag] = img_name st.json(image_mappings) # Display mapping # Show Images as Thumbnails st.markdown("**🖼️ Images Used:**") image_html = "" for img in row["images"]: image_html += decode_base64_image(img["base64"]) + " " st.markdown(image_html, unsafe_allow_html=True) # Render images inline # Show Conversation st.markdown(f"**💬 Conversation:** {row['conversation']}") # Show Scores st.markdown("**📊 Evaluation Scores:**") scores = {key: row[key] for key in score_columns if key in row} st.json(scores) st.divider() # Add a separator between conversations else: st.warning("⚠️ No data matches your filters.")