import pandas as pd
import streamlit as st
import numpy as np
import plotly.express as px
from yaml import safe_load
from pathlib import Path
from huggingface_hub import snapshot_download
from profanity_check import predict

import user_evaluation_variables

# Most recently loaded evaluation table for the comparison tab.
databaseDF = None

EVAL_DATABASE_DIR = Path("data")
EVAL_DATABASE_DIR.mkdir(parents=True, exist_ok=True)

GEN_EVAL_DATABASE_PATH = 'user_data/data/general_eval_database.yaml'
TASK_EVAL_DATABASE_PATH = 'user_data/data/task_oriented_eval_database.yaml'

# Columns displayed for general and task-oriented evaluations, respectively.
GEN_EVAL_COLUMNS = ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples",
                    "Inference Steps", "Objects", "Actions", "Occupations", "Dist. Bias",
                    "Hallucination", "Gen. Miss Rate", "Run Time", "Date", "Time"]
TASK_EVAL_COLUMNS = ["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples",
                     "Inference Steps", "Target", "Dist. Bias", "Hallucination",
                     "Gen. Miss Rate", "Run Time", "Date", "Time"]


def get_evaluation_id(evalType, debugging):
    # Generate the next sequential evaluation ID ('G_xxxx' for general, 'T_xxxx' for
    # task-oriented), zero-padded to match the width of the existing IDs.
    if evalType == 'general':
        DFPath = GEN_EVAL_DATABASE_PATH
    else:
        DFPath = TASK_EVAL_DATABASE_PATH
    df = add_user_evalID_columns_to_df(None, DFPath, False)
    evalColumn = [int(x.split('_')[1]) for x in list(df['Eval. ID'])]
    newEvalID = max(evalColumn) + 1
    padding = len(list(df['Eval. ID'])[0].split('_')[1])
    if evalType == 'general':
        newEvalID = 'G_' + str(newEvalID).zfill(padding)
    else:
        newEvalID = 'T_' + str(newEvalID).zfill(padding)
    if debugging:
        st.write(df['Eval. ID'])
        st.write(evalColumn)
        st.write("current last EVAL ID:", df['Eval. ID'].iloc[-1])
        st.write("NEW EVAL ID:", newEvalID)
    return newEvalID


def check_profanity(df):
    # Mask NSFW usernames and (for task-oriented evaluations) target prompts before display.
    cleanedDF = df
    for i, row in cleanedDF.iterrows():
        if predict([row['User']])[0] != 0:
            cleanedDF.at[i, 'User'] = '**NSFW**'
        if 'Target' in df:
            if predict([row['Target']])[0] != 0:
                cleanedDF.at[i, 'Target'] = '**NSFW**'
    return cleanedDF


def dataframe_with_selections(df):
    df_with_selections = check_profanity(df.copy())
    df_with_selections.insert(0, "Select", True)
    # Get dataframe row selections from the user with st.data_editor.
    edited_df = st.data_editor(
        df_with_selections,
        hide_index=True,
        column_config={"Select": st.column_config.CheckboxColumn(required=True)},
        disabled=df.columns,
    )
    # Filter the dataframe using the temporary column, then drop the column.
    selected_rows = edited_df[edited_df.Select]
    return selected_rows.drop('Select', axis=1)


def add_user_evalID_columns_to_df(df, evalDataPath, personalFLAG):
    # Load every user's evaluations from the YAML database into a single dataframe,
    # adding 'User' and 'Eval. ID' columns. If personalFLAG is set, keep only the rows
    # belonging to the currently logged-in user.
    with open(evalDataPath, 'r') as f:
        yamlData = safe_load(f)
    for user in yamlData['evaluations']['username']:
        if df is None:
            df = pd.DataFrame(yamlData['evaluations']['username'][user]).T
            df.insert(0, "Eval. ID", list(yamlData['evaluations']['username'][user].keys()), True)
            df.insert(0, "User", [user for i in range(len(yamlData['evaluations']['username'][user]))], True)
        else:
            df = pd.concat([df, pd.DataFrame(yamlData['evaluations']['username'][user]).T], ignore_index=True)
            # Rows appended for this user have no 'User'/'Eval. ID' values yet; fill them in.
            evalIDIterator = 0
            for index, row in df.iterrows():
                if pd.isna(row['User']):
                    df.loc[index, 'User'] = user
                if pd.isna(row['Eval. ID']):
                    df.loc[index, 'Eval. ID'] = list(yamlData['evaluations']['username'][user].keys())[evalIDIterator]
                    evalIDIterator += 1
    if personalFLAG:
        df.drop(df[df['User'] != user_evaluation_variables.USERNAME].index, inplace=True)
        if len(df) == 0:
            st.warning("It looks like you haven't conducted any evaluations! "
                       "Run some evaluations and refresh this page. "
                       "If the problem persists, please contact support.", icon="⚠️")
    return df


def initialise_page(tab):
    global databaseDF
    with tab:
        c1, c2 = st.columns(2)
        with c1:
            st.subheader("\U0001F30E General Bias")
            with st.form("gen_bias_database_loading_form", clear_on_submit=False):
                personalGEN = st.form_submit_button("Personal Evaluations")
                communityGEN = st.form_submit_button("TBYB Community Evaluations")
                if personalGEN:
                    databaseDF = add_user_evalID_columns_to_df(None, GEN_EVAL_DATABASE_PATH, True)[GEN_EVAL_COLUMNS]
                if communityGEN:
                    databaseDF = add_user_evalID_columns_to_df(None, GEN_EVAL_DATABASE_PATH, False)[GEN_EVAL_COLUMNS]
        with c2:
            st.subheader("\U0001F3AF Task-Oriented Bias")
            with st.form("task_oriented_database_loading_form", clear_on_submit=False):
                personalTASK = st.form_submit_button("Personal Evaluations")
                communityTASK = st.form_submit_button("TBYB Community Evaluations")
                if personalTASK:
                    databaseDF = add_user_evalID_columns_to_df(None, TASK_EVAL_DATABASE_PATH, True)[TASK_EVAL_COLUMNS]
                if communityTASK:
                    databaseDF = add_user_evalID_columns_to_df(None, TASK_EVAL_DATABASE_PATH, False)[TASK_EVAL_COLUMNS]
        if databaseDF is not None:
            selection = dataframe_with_selections(databaseDF)
            normalised = st.toggle('Normalize Data (better for direct comparisons)')
            submitCOMPARE = st.button("Compare Selected Models")
            if submitCOMPARE:
                plot_comparison_graphs(tab, selection, normalised)


def normalise_data(rawValues, metric):
    # Min-max normalise the raw metric values onto a common 0-1 scale.
    # 'HJ' (Jaccard Hallucination) and 'MG' (Generative Miss Rate) are scaled directly;
    # other metrics (Distribution Bias) are inverted (1 - scaled value).
    rawValues = list(map(float, rawValues))
    minVal, maxVal = min(rawValues), max(rawValues)
    normalisedValues = []
    for x in rawValues:
        if (maxVal - minVal) == 0:
            normX = 1
        elif metric in ['HJ', 'MG']:
            normX = (x - minVal) / (maxVal - minVal)
        else:
            normX = 1 - ((x - minVal) / (maxVal - minVal))
        normalisedValues.append(normX)
    return normalisedValues


def plot_comparison_graphs(tab, data, normalise):
    BDColor = ['#59DC23'] * len(data['Dist. Bias'].tolist())
    HJColor = ['#2359DC'] * len(data['Hallucination'].tolist())
    MGColor = ['#DC2359'] * len(data['Gen. Miss Rate'].tolist())
    if normalise:
        data['Dist. Bias'] = normalise_data(data['Dist. Bias'], 'BD')
        data['Hallucination'] = normalise_data(data['Hallucination'], 'HJ')
        data['Gen. Miss Rate'] = normalise_data(data['Gen. Miss Rate'], 'MG')
    with tab:
        st.write("Selected evaluations for comparison:")
        st.write(data)
        BDFig = px.bar(x=data['Eval. ID'], y=data['Dist. Bias'],
                       color_discrete_sequence=BDColor).update_layout(
            xaxis_title='Evaluation ID', yaxis_title='Distribution Bias',
            title='Distribution Bias Comparison')
        st.plotly_chart(BDFig, theme="streamlit", use_container_width=True)
        HJFig = px.bar(x=data['Eval. ID'], y=data['Hallucination'],
                       color_discrete_sequence=HJColor).update_layout(
            xaxis_title='Evaluation ID', yaxis_title='Jaccard Hallucination',
            title='Jaccard Hallucination Comparison')
        st.plotly_chart(HJFig, theme="streamlit", use_container_width=True)
        MGFig = px.bar(x=data['Eval. ID'], y=data['Gen. Miss Rate'],
                       color_discrete_sequence=MGColor).update_layout(
            xaxis_title='Evaluation ID', yaxis_title='Generative Miss Rate',
            title='Generative Miss Rate Comparison')
        st.plotly_chart(MGFig, theme="streamlit", use_container_width=True)
        # The combined 3D view is only meaningful once all three metrics share the same scale.
        if normalise:
            Full3DFig = px.scatter_3d(data, x='Dist. Bias', y='Hallucination', z='Gen. Miss Rate',
                                      width=800, height=800, color='Eval. ID',
                                      title='3D Text-to-Image Model Bias Comparison')
            st.plotly_chart(Full3DFig, theme="streamlit", use_container_width=True)
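

# Minimal usage sketch: running this file directly with Streamlit exercises the comparison
# page on its own. The placeholder username and single-tab layout are assumptions for
# illustration; in the full app a login flow sets `user_evaluation_variables.USERNAME` and
# the tab is created by the main script. It also assumes the evaluation YAML databases
# referenced above exist locally.
if __name__ == "__main__":
    user_evaluation_variables.USERNAME = "demo_user"  # assumed placeholder; normally set at login
    comparisonTab = st.tabs(["\U0001F4CA Model Comparisons"])[0]
    initialise_page(comparisonTab)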