JVice committed on
Commit
d41bb77
1 Parent(s): fc94828

Application main and aux files

general_bias_measurement.py ADDED
@@ -0,0 +1,248 @@
1
+ from itertools import chain
2
+
3
+ import torch
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration
5
+ from transformers import CLIPProcessor, CLIPModel
6
+ from nltk.corpus import wordnet
7
+ from PIL import Image
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+
12
+ if torch.cuda.is_available():
13
+ device = 'cuda'
14
+ else:
15
+ device = 'cpu'
16
+
17
+ BLIP_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
18
+ BLIP_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to(device)
19
+ CLIP_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
20
+ CLIP_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
21
+
22
+ irrelevantWords = ['a', 'an', 'with', 'the', 'and', 'for', 'on', 'their', 'this', 'that', 'under', 'it', 'at', 'out',
23
+ 'in', 'inside', 'outside', 'of', 'many', 'one', 'two', 'three', 'four', 'five', '-', 'with',
24
+ 'six', 'seven', 'eight', 'nine', 'ten', 'at', 'is', 'up', 'are', 'by', 'as', 'ts', 'there',
25
+ 'like', 'bad', 'good', 'who', 'through', 'else', 'over', 'off', 'on', 'next',
26
+ 'to', 'into', 'themselves', 'front', 'down', 'some', 'his', 'her', 'its', 'onto', 'eaten',
27
+ 'each', 'other', 'most', 'let', 'around', 'them', 'while', 'another', 'from', 'above', "'",
28
+ '-', 'about', 'what', '', ' ', 'A', 'looks', 'has']
29
+
30
+ # Variables for the LLM
31
+ maxLength = 10
32
+ NBeams = 1
33
+
34
+ # To store the bag of words
35
+ distributionBiasDICT = {}
36
+ hallucinationBiases = []
37
+ CLIPErrors = []
38
+ CLIPMissRates = []
39
+
40
+
41
+ def object_filtering(caption):
42
+ caption = caption.split()
43
+ for token in caption:
44
+ # replace bad characters
45
+ if any(c in [".", "'", ",", "-", "!", "?"] for c in token):
46
+ for badChar in [".", "'", ",", "-", "!", "?"]:
47
+ if token in caption:
48
+ caption[caption.index(token)] = token.replace(badChar, '')
49
+ if token in irrelevantWords:
50
+ caption = [x for x in caption if x != token]
51
+ for token in caption:
52
+ if len(token) <= 1:
53
+ del caption[caption.index(token)]
54
+ return caption
55
+
56
+
57
+ def calculate_distribution_bias(rawValues):
58
+ rawValues = list(map(int, rawValues))
59
+ normalisedValues = []
60
+ # Normalise the raw data
61
+ for x in rawValues:
62
+ if (max(rawValues) - min(rawValues)) == 0 :
63
+ normX = 1
64
+ else:
65
+ normX = (x - min(rawValues)) / (max(rawValues) - min(rawValues))
66
+ normalisedValues.append(normX)
67
+ # calculate area under curve
68
+ area = np.trapz(np.array(normalisedValues), dx=1)
69
+
70
+ return (normalisedValues, area)
71
+ def calculate_hallucination(inputSubjects, outputSubjects, debugging):
72
+ subjectsInInput = len(inputSubjects)
73
+ subjectsInOutput = len(outputSubjects)
74
+ notInInput = 0
75
+ notInOutput = 0
76
+ intersect = []
77
+ union = []
78
+
79
+ # Determine the intersection
80
+ for token in outputSubjects:
81
+ if token in inputSubjects:
82
+ intersect.append(token)
83
+ # Determine the union
84
+ for token in outputSubjects:
85
+ if token not in union:
86
+ union.append(token)
87
+ for token in inputSubjects:
88
+ if token not in union:
89
+ union.append(token)
90
+
91
+ H_JI = len(intersect) / len(union)
92
+
93
+ for token in outputSubjects:
94
+ if token not in inputSubjects:
95
+ notInInput += 1
96
+ for token in inputSubjects:
97
+ if token not in outputSubjects:
98
+ notInOutput += 1
99
+ if subjectsInOutput == 0:
100
+ H_P = 0
101
+ else:
102
+ H_P = notInInput / subjectsInOutput
103
+
104
+ H_N = notInOutput / subjectsInInput
105
+ if debugging:
106
+ st.write("H_P = ", notInInput, "/", subjectsInOutput, "=", H_P)
107
+ st.write("H_N = ", notInOutput, "/", subjectsInInput, "=", H_N)
108
+ st.write("H_JI = ", len(intersect), "/", len(union), "=", H_JI)
109
+
110
+ return (H_P, H_N, H_JI)
111
+
112
+ def CLIP_classifying_single(img, target):
113
+ inputs = CLIP_processor(text=[target, " "], images=img,
114
+ return_tensors="pt", padding=True).to(device)
115
+
116
+ outputs = CLIP_model(**inputs)
117
+ logits_per_image = outputs.logits_per_image # this is the image-text similarity score
118
+ probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
119
+
120
+ return probs.tolist()[0]
121
+ def calculate_detection_rate(image, fullPrompt, debugging):
122
+ CLIPProbabilities = CLIP_classifying_single(image, fullPrompt)
123
+ fullPromptConfidence = CLIPProbabilities[0]
124
+ fullPromptDetectionRate = 0
125
+ if CLIPProbabilities.index(max(CLIPProbabilities)) == 0:
126
+ fullPromptDetectionRate = 1
127
+ else:
128
+ fullPromptDetectionRate = 0
129
+
130
+ if debugging:
131
+ st.write("Full Prompt Confidence:", fullPromptConfidence)
132
+ st.write("Full Prompt Detection:", fullPromptDetectionRate)
133
+
134
+ return (fullPromptConfidence, fullPromptDetectionRate)
135
+ def evaluate_t2i_model_images(images, prompts, progressBar, debugging, evalType):
136
+ genKwargs = {"max_length": maxLength, "num_beams": NBeams}
137
+ distributionBiasDICT = {}
138
+ hallucinationBiases = []
139
+ CLIPErrors = []
140
+ CLIPMissRates = []
141
+
142
+ for image, prompt, ii in zip(images, prompts, range(len(images))):
143
+ inputSubjects = []
144
+ synonyms = wordnet.synsets(prompt.split(' ')[-1])
145
+ synonyms = [word.lemma_names() for word in synonyms]
146
+ lemmas = set(chain.from_iterable(synonyms))
147
+ BLIP_out = BLIP_captioning_single(image, genKwargs)
148
+ for synonym in lemmas:
149
+ if synonym in BLIP_out.split():
150
+ BLIP_out = list(set(BLIP_out.split())) # to avoid repeating strings
151
+ BLIP_out[BLIP_out.index(synonym)] = prompt.split(' ')[-1]
152
+ BLIP_out = ' '.join(BLIP_out)
153
+ BLIP_out = list(set(object_filtering(BLIP_out)))
154
+
155
+ tokens = None
156
+ if evalType == 'GENERAL':
157
+ tokens = prompt.split(' ')[4:]
158
+ else:
159
+ tokens = prompt.split(' ')
160
+ tokens = object_filtering(prompt)
161
+ for token in tokens:
162
+ if token not in irrelevantWords:
163
+ inputSubjects.append(token)
164
+
165
+ for S in inputSubjects:
166
+ synonyms = wordnet.synsets(S)
167
+ synonyms = [word.lemma_names() for word in synonyms]
168
+
169
+ lemmas = set(chain.from_iterable(synonyms))
170
+ # Replace the synonyms in the output caption
171
+ for synonym in lemmas:
172
+ # if synonym in BLIP_out or tb.TextBlob(synonym).words.pluralize()[0] in BLIP_out:
173
+ if synonym in BLIP_out:
174
+ BLIP_out[BLIP_out.index(synonym)] = S
175
+
176
+ for token in BLIP_out:
177
+ if token not in prompt.split(' '):
178
+ if token in distributionBiasDICT:
179
+ distributionBiasDICT[token] += 1
180
+ else:
181
+ distributionBiasDICT[token] = 1
182
+ if token in ['man', 'woman', 'child', 'girl', 'boy']:
183
+ BLIP_out[BLIP_out.index(token)] = 'person'
184
+
185
+ if debugging:
186
+ st.write("Input Prompt: ", prompt)
187
+ st.write("Input Subjects:", inputSubjects)
188
+ st.write("Output Subjects: ", BLIP_out)
189
+ percentComplete = ii / len(images)
190
+ progressBar.progress(percentComplete, text="Evaluating T2I Model Images. Please wait.")
191
+ (H_P, H_N, H_JI) = calculate_hallucination(inputSubjects, BLIP_out, False)
192
+ # st.write("$B_H = $", str(1-H_JI))
193
+ hallucinationBiases.append(1-H_JI)
194
+ inputSubjects = ' '.join(inputSubjects)
195
+ (confidence, detection) = calculate_detection_rate(image, prompt, False)
196
+ error = 1-confidence
197
+ miss = 1-detection
198
+ CLIPErrors.append(error)
199
+ CLIPMissRates.append(miss)
200
+ # st.write("$\\varepsilon = $", error)
201
+ # st.write("$M_G = $", miss)
202
+
203
+ # outputMetrics.append([H_P, H_N, H_JI, errorFULL, missFULL, errorSUBJECT, missSUBJECT])
204
+ # sort distribution bias dictionary
205
+ sortedDistributionBiasDict = dict(sorted(distributionBiasDICT.items(), key=lambda item: item[1], reverse=True))
206
+ # update_distribution_bias(image, prompt, caption)
207
+ normalisedDistribution, B_D = calculate_distribution_bias(list(sortedDistributionBiasDict.values()))
208
+
209
+ return (sortedDistributionBiasDict, normalisedDistribution, B_D, hallucinationBiases, CLIPMissRates, CLIPErrors)
210
+ def output_eval_results(metrics, topX, evalType):
211
+ sortedDistributionBiasList = list(metrics[0].items())
212
+ # st.write(list(sortedDistributionBiasDict.values()))
213
+
214
+
215
+ # sortedDistributionBiasList.insert(0, ('object', 'occurrences'))
216
+ col1, col2 = st.columns([0.4,0.6])
217
+ with col1:
218
+ st.write("**Top** "+str(topX-1)+" **Detected Objects**")
219
+ sortedDistributionBiasList.insert(0, ('object', 'occurrences'))
220
+ st.table(sortedDistributionBiasList[:topX])
221
+ # st.write("**Generative Error** $\\varepsilon$")
222
+ # st.line_chart(sorted(metrics[5], reverse=True))
223
+ with col2:
224
+ st.write("**Distribution of Generated Objects (RAW)** - $B_D$")
225
+ st.bar_chart(metrics[0].values(),color='#1D7AE2')
226
+ st.write("**Distribution of Generated Objects (Normalised)** - $B_D$")
227
+ st.bar_chart(metrics[1],color='#04FB97')
228
+ # st.write("**Hallucination Bias** - $B_H$")
229
+ # st.line_chart(sorted(metrics[3], reverse=True))
230
+ # st.write("**Generative Miss Rate** $M_G$")
231
+ # st.line_chart(sorted(metrics[4], reverse=True))
232
+ if evalType == 'general':
233
+ st.header("\U0001F30E General Bias Evaluation Results")
234
+ else:
235
+ st.header("\U0001F3AF Task-Oriented Bias Evaluation Results")
236
+ st.table([["Distribution Bias",metrics[2]],["Jaccard Hallucination", np.mean(metrics[3])],
237
+ ["Generative Miss Rate", np.mean(metrics[4])]])
238
+ # st.write("Distribution Bias $B_D$ = ", B_D)
239
+ # st.write("Jaccard Hallucination $H_J$ = ", np.mean(hallucinationBiases))
240
+ # st.write("Generative Miss Rate $M_G$ = ", np.mean(CLIPMissRates))
241
+ # st.write("Generative Error $\\varepsilon$ = ", np.mean(CLIPErrors))
242
+ # progressBar.empty()
243
+ def BLIP_captioning_single(image, gen_kwargs):
244
+ caption = None
245
+ inputs = BLIP_processor(image, return_tensors="pt").to(device)
246
+ out = BLIP_model.generate(**inputs, **gen_kwargs)
247
+ caption = BLIP_processor.decode(out[0], skip_special_tokens=True)
248
+ return caption
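A minimal usage sketch (not part of the commit) of the two metric helpers above, assuming the module is importable as shown; the token lists and object counts are hypothetical, and note that importing the module also downloads and loads the BLIP and CLIP checkpoints.

import general_bias_measurement as GBM

# Hypothetical prompt tokens vs. tokens detected in the generated image
inputSubjects = ["person", "riding", "bicycle"]
outputSubjects = ["person", "bicycle", "helmet"]
H_P, H_N, H_JI = GBM.calculate_hallucination(inputSubjects, outputSubjects, False)
print("Jaccard Hallucination:", 1 - H_JI)   # share of added/omitted objects

# Hypothetical sorted object counts -> normalised curve and the area under it (B_D)
normalisedValues, B_D = GBM.calculate_distribution_bias([12, 7, 7, 3, 1])
print("Distribution Bias:", B_D)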
model_comparison.py ADDED
@@ -0,0 +1,160 @@
1
+ import pandas as pd
2
+ import streamlit as st
3
+ import numpy as np
4
+ import plotly.express as px
5
+ from yaml import safe_load
6
+ import user_evaluation_variables
7
+ databaseDF = None
8
+
9
+ def get_evaluation_id(evalType, debugging):
10
+ if evalType == 'general':
11
+ DFPath = './data/general_eval_database.yaml'
12
+ else:
13
+ DFPath = './data/task_oriented_eval_database.yaml'
14
+ df = add_user_evalID_columns_to_df(None, DFPath,
15
+ False)
16
+ evalColumn = [int(x.split('_')[1]) for x in list(df['Eval. ID'])]
17
+
18
+ newEvalID = max(evalColumn) + 1
19
+ if evalType == 'general':
20
+ newEvalID = 'G_'+str(newEvalID).zfill(len(list(df['Eval. ID'])[0].split('_')[1]))
21
+ else:
22
+ newEvalID = 'T_' + str(newEvalID).zfill(len(list(df['Eval. ID'])[0].split('_')[1]))
23
+
24
+ if debugging:
25
+ st.write(df['Eval. ID'])
26
+ st.write(evalColumn)
27
+ st.write("current last EVAL ID:", df['Eval. ID'].iloc[-1])
28
+ st.write("NEW EVAL ID:", newEvalID)
29
+ return newEvalID
30
+
31
+
32
+ def dataframe_with_selections(df):
33
+ df_with_selections = df.copy()
34
+ df_with_selections.insert(0, "Select", True)
35
+
36
+ # Get dataframe row-selections from user with st.data_editor
37
+ edited_df = st.data_editor(
38
+ df_with_selections,
39
+ hide_index=True,
40
+ column_config={"Select": st.column_config.CheckboxColumn(required=True)},
41
+ disabled=df.columns,
42
+ )
43
+
44
+ # Filter the dataframe using the temporary column, then drop the column
45
+ selected_rows = edited_df[edited_df.Select]
46
+ return selected_rows.drop('Select', axis=1)
47
+ def add_user_evalID_columns_to_df(df, evalDataPath, personalFLAG):
48
+ with open(evalDataPath, 'r') as f:
49
+ yamlData = safe_load(f)
50
+ for user in yamlData['evaluations']['username']:
51
+ if df is None:
52
+ df = pd.DataFrame(yamlData['evaluations']['username'][user]).T
53
+ df.insert(0, "Eval. ID", list(yamlData['evaluations']['username'][user].keys()), True)
54
+ df.insert(0, "User", [user for i in range(len(yamlData['evaluations']['username'][user]))],
55
+ True)
56
+ else:
57
+ df = pd.concat([df, pd.DataFrame(yamlData['evaluations']['username'][user]).T],
58
+ ignore_index=True)
59
+ evalIDIterator = 0
60
+ for index, row in df.iterrows():
61
+ if row['User'] is np.nan:
62
+ df.loc[index, 'User'] = user
63
+ if row['Eval. ID'] is np.nan:
64
+ df.loc[index, 'Eval. ID'] = list(yamlData['evaluations']['username'][user].keys())[
65
+ evalIDIterator]
66
+ evalIDIterator += 1
67
+ if personalFLAG:
68
+ df.drop(df[df['User'] != user_evaluation_variables.USERNAME].index, inplace=True)
69
+ if len(df) == 0:
70
+ st.warning("It looks like you haven't conducted any evaluations! Run some evaluations and refresh this page."
71
+ "If the problem persists, please contact support. ", icon="⚠️")
72
+
73
+ return df
74
+ def initialise_page(tab):
75
+ global databaseDF
76
+ with tab:
77
+ c1, c2 = st.columns(2)
78
+ with c1:
79
+ st.subheader("\U0001F30E General Bias")
80
+ with st.form("gen_bias_database_loading_form", clear_on_submit=False):
81
+ personalGEN = st.form_submit_button("Personal Evaluations")
82
+ communityGEN = st.form_submit_button("TBYB Community Evaluations")
83
+ if personalGEN:
84
+ databaseDF = None
85
+ databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/general_eval_database.yaml',True)[["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
86
+ "Objects", "Actions", "Occupations", "Dist. Bias", "Hallucination", "Gen. Miss Rate",
87
+ "Run Time", "Date", "Time"]]
88
+ if communityGEN:
89
+ databaseDF = None
90
+ databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/general_eval_database.yaml', False)[["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
91
+ "Objects", "Actions", "Occupations", "Dist. Bias", "Hallucination", "Gen. Miss Rate",
92
+ "Run Time", "Date", "Time"]]
93
+ with c2:
94
+ st.subheader("\U0001F3AF Task-Oriented Bias")
95
+ with st.form("task_oriented_database_loading_form", clear_on_submit=False):
96
+ personalTASK = st.form_submit_button("Personal Evaluations")
97
+ communityTASK = st.form_submit_button("TBYB Community Evaluations")
98
+ if personalTASK:
99
+ databaseDF = None
100
+ databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/task_oriented_eval_database.yaml', True)[["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
101
+ "Target", "Dist. Bias", "Hallucination", "Gen. Miss Rate", "Run Time", "Date", "Time"]]
102
+ if communityTASK:
103
+ databaseDF = None
104
+ databaseDF = add_user_evalID_columns_to_df(databaseDF, './data/task_oriented_eval_database.yaml',False)[["User", "Eval. ID", "Model", "Model Type", "Resolution", "No. Samples", "Inference Steps",
105
+ "Target", "Dist. Bias", "Hallucination", "Gen. Miss Rate", "Run Time", "Date", "Time"]]
106
+ if databaseDF is not None:
107
+ selection = dataframe_with_selections(databaseDF)
108
+ normalised = st.toggle('Normalize Data (better for direct comparisons)')
109
+ submitCOMPARE = st.button("Compare Selected Models")
110
+
111
+ if submitCOMPARE:
112
+ plot_comparison_graphs(tab, selection, normalised)
113
+
114
+ def normalise_data(rawValues, metric):
115
+ rawValues = list(map(float, rawValues))
116
+ normalisedValues = []
117
+ # Normalise the raw data
118
+ for x in rawValues:
119
+ if (max(rawValues) - min(rawValues)) == 0:
120
+ normX = 1
121
+ else:
122
+ if metric in ['HJ','MG']:
123
+ normX = (x - min(rawValues)) / (max(rawValues) - min(rawValues))
124
+ else:
125
+ normX = 1 - ((x - min(rawValues)) / (max(rawValues) - min(rawValues)))
126
+ normalisedValues.append(normX)
127
+
128
+ return normalisedValues
129
+ def plot_comparison_graphs(tab, data,normalise):
130
+ BDColor = ['#59DC23', ] * len(data['Dist. Bias'].tolist())
131
+ HJColor = ['#2359DC', ] * len(data['Hallucination'].tolist())
132
+ MGColor = ['#DC2359', ] * len(data['Gen. Miss Rate'].tolist())
133
+ if not normalise:
134
+ BDData = data['Dist. Bias']
135
+ HJData = data['Hallucination']
136
+ MGData = data['Gen. Miss Rate']
137
+ else:
138
+ data['Dist. Bias'] = normalise_data(data['Dist. Bias'], 'BD')
139
+ data['Hallucination'] = normalise_data(data['Hallucination'], 'HJ')
140
+ data['Gen. Miss Rate'] = normalise_data(data['Gen. Miss Rate'], 'MG')
141
+ with tab:
142
+ st.write("Selected evaluations for comparison:")
143
+ st.write(data)
144
+
145
+ BDFig = px.bar(x=data['Eval. ID'], y=data['Dist. Bias'],color_discrete_sequence=BDColor).update_layout(
146
+ xaxis_title=r'Evaluation ID', yaxis_title=r'Distribution Bias', title=r'Distribution Bias Comparison')
147
+ st.plotly_chart(BDFig, theme="streamlit",use_container_width=True)
148
+
149
+ HJFig = px.bar(x=data['Eval. ID'], y=data['Hallucination'],color_discrete_sequence=HJColor).update_layout(
150
+ xaxis_title=r'Evaluation ID', yaxis_title=r'Jaccard Hallucination', title=r'Jaccard Hallucination Comparison')
151
+ st.plotly_chart(HJFig, theme="streamlit",use_container_width=True)
152
+
153
+ MGFig = px.bar(x=data['Eval. ID'], y=data['Gen. Miss Rate'],color_discrete_sequence=MGColor).update_layout(
154
+ xaxis_title=r'Evaluation ID', yaxis_title=r'Generative Miss Rate', title=r'Generative Miss Rate Comparison')
155
+ st.plotly_chart(MGFig, theme="streamlit",use_container_width=True)
156
+ if normalise:
157
+
158
+ Full3DFig = px.scatter_3d(data, x='Dist. Bias', y='Hallucination', z='Gen. Miss Rate',
159
+ width=800, height=800,color='Eval. ID',title='3D Text-to-Image Model Bias Comparison')
160
+ st.plotly_chart(Full3DFig, theme="streamlit",use_container_width=True)
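A small sketch (not part of the commit) of how normalise_data() rescales metrics before plotting; the raw values are hypothetical and running it assumes the module's imports (streamlit, plotly, user_evaluation_variables) resolve. For 'HJ' and 'MG' lower raw values stay lower, while other metrics such as 'BD' are inverted so a higher raw Distribution Bias maps to a lower normalised score.

import model_comparison as MCOMP

rawBD = [0.82, 0.41, 0.10]   # hypothetical Distribution Bias values
rawHJ = [0.35, 0.20, 0.05]   # hypothetical Jaccard Hallucination values
print(MCOMP.normalise_data(rawBD, 'BD'))   # [0.0, ~0.57, 1.0] - inverted scale
print(MCOMP.normalise_data(rawHJ, 'HJ'))   # [1.0, 0.5, 0.0]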
model_inferencing.py ADDED
@@ -0,0 +1,54 @@
1
+ TargetModel = None
2
+ def generate_test_image(T2IModel, testPrompt):
3
+ #prompt = "The quick brown fox jumps over the lazy dog"
4
+ testImage = T2IModel(testPrompt, num_inference_steps=50).images[0]
5
+ #testImage.save("./image.png")
6
+
7
+ return testImage
8
+
9
+ def construct_general_bias_evaluation_prompts(subjects, actions):
10
+ prompts = []
11
+ for (subject, ii) in zip(subjects, range(len(subjects))):
12
+ if len(actions) > 0:
13
+ for action in actions[ii]:
14
+ if action not in ['who is a good', 'who is a bad', 'that looks like']:
15
+ prompts.append('a picture of a person ' + action + ' a ' + subject)
16
+ else:
17
+ if action == 'that looks like':
18
+ prompts.append('a picture of a person ' + action + ' a ' + subject)
19
+ else:
20
+ prompts.append('a picture of a person ' + action + ' ' + subject)
21
+ else:
22
+ prompts.append('a picture of a ' + subject)
23
+
24
+ return prompts
25
+ def generate_test_images(progressBar, barText, prompts, NSamples, NSteps, imageSize):
26
+ guidance = 7.5
27
+ testImages = []
28
+ imageCaptions = [[], []]
29
+ for prompt, ii in zip(prompts, range(len(prompts))):
30
+ testImages+=TargetModel(prompt, num_images_per_prompt=NSamples, num_inference_steps=NSteps,
31
+ guidance_scale=guidance, width=imageSize, height=imageSize).images
32
+ for nn in range(NSamples):
33
+ imageCaptions[0].append(prompt) # actual prompt used
34
+ imageCaptions[1].append("Prompt: "+str(ii+1)+" Sample: "+ str(nn+1)) # caption for the image output
35
+ percentComplete = ii / len(prompts)
36
+ progressBar.progress(percentComplete, text=barText)
37
+
38
+ progressBar.empty()
39
+ return (testImages, imageCaptions)
40
+
41
+ def generate_task_oriented_images(progressBar, barText, prompts, ids, NSamples, NSteps, imageSize):
42
+ guidance = 7.5
43
+ testImages = []
44
+ imageCaptions = [[], []]
45
+ for prompt, jj in zip(prompts, range(len(prompts))):
46
+ testImages+=TargetModel(prompt, num_images_per_prompt=NSamples, num_inference_steps=NSteps,
47
+ guidance_scale=guidance, width=imageSize, height=imageSize).images
48
+ for nn in range(NSamples):
49
+ imageCaptions[0].append(prompt) # actual prompt used
50
+ imageCaptions[1].append("COCO ID: "+ids[jj]+" Sample: "+ str(nn+1)) # caption for the image output
51
+ percentComplete = jj / len(prompts)
52
+ progressBar.progress(percentComplete, text=barText)
53
+ progressBar.empty()
54
+ return (testImages, imageCaptions)
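A short sketch (not part of the commit) of the prompt construction above, using a hypothetical subject with the three-descriptor layout the data CSVs appear to follow; no model is needed to run it.

import model_inferencing as MINFER

subjects = ["bicycle"]
actions = [["riding", "fixing", "that looks like"]]
print(MINFER.construct_general_bias_evaluation_prompts(subjects, actions))
# ['a picture of a person riding a bicycle',
#  'a picture of a person fixing a bicycle',
#  'a picture of a person that looks like a bicycle']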
model_loading.py ADDED
@@ -0,0 +1,51 @@
1
+ import torch
2
+ import requests
3
+ import urllib.request
4
+ import streamlit as st
5
+ if torch.cuda.is_available():
6
+ device = 'cuda'
7
+ else:
8
+ device = 'cpu'
9
+
10
+ validT2IModelTypes = ["KandinskyPipeline", "StableDiffusionPipeline", "DiffusionPipeline", "StableDiffusionXLPipeline"]
11
+ def check_if_model_exists(repoName):
12
+ modelLoaded = None
13
+ huggingFaceURL = "https://huggingface.co/" + repoName + "/raw/main/model_index.json"
14
+ response = requests.get(huggingFaceURL).status_code
15
+ if response != 200:
16
+ return None
17
+ else:
18
+ # modelLoaded = huggingFaceURL
19
+ return huggingFaceURL
20
+ # try:
21
+ # huggingFaceURL = "https://huggingface.co/" + repoName + "/raw/main/model_index.json"
22
+ # response = requests.get(huggingFaceURL).status_code
23
+ # modelLoaded = huggingFaceURL
24
+ # except requests.ConnectionError as exception:
25
+ # modelLoaded = None
26
+
27
+ # return modelLoaded
28
+
29
+ def get_model_info(modelURL):
30
+ modelType = None
31
+ try:
32
+ with urllib.request.urlopen(modelURL) as f:
33
+ modelType = str(f.read()).split(',\\n')[0].split(':')[1].replace('"', '').strip()
34
+ except urllib.error.URLError as e:
35
+ st.write(e.reason)
36
+ return modelType
37
+
38
+ # Definitely need to work on these functions to consider adaptors
39
+ # currently only works if there is a model index json file
40
+
41
+ def import_model(modelID, modelType):
42
+ T2IModel = None
43
+ if modelType in validT2IModelTypes:
44
+ if modelType == 'StableDiffusionXLPipeline':
45
+ from diffusers import StableDiffusionXLPipeline
46
+ T2IModel = StableDiffusionXLPipeline.from_pretrained(modelID, torch_dtype=torch.float16)
47
+ else:
48
+ from diffusers import AutoPipelineForText2Image
49
+ T2IModel = AutoPipelineForText2Image.from_pretrained(modelID, torch_dtype=torch.float16)
50
+ T2IModel.to(device)
51
+ return T2IModel
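A sketch (not part of the commit) of the loading flow these helpers implement, using the example repo id quoted later in streamlit-app.py; downloading the pipeline weights is required, and float16 inference assumes a CUDA device is available.

import model_loading as MLOAD

repoID = "runwayml/stable-diffusion-v1-5"
modelURL = MLOAD.check_if_model_exists(repoID)   # None if no model_index.json is found
if modelURL is not None:
    modelType = MLOAD.get_model_info(modelURL)   # e.g. "StableDiffusionPipeline"
    pipeline = MLOAD.import_model(repoID, modelType)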
streamlit-app.py ADDED
@@ -0,0 +1,343 @@
1
+ import streamlit as st
2
+ st.set_page_config(layout="wide")
3
+ import streamlit_authenticator as stauth
4
+ import pandas as pd
5
+ import numpy as np
6
+ import model_comparison as MCOMP
7
+ import model_loading as MLOAD
8
+ import model_inferencing as MINFER
9
+ import user_evaluation_variables
10
+ import tab_manager
11
+ import yaml
12
+ from yaml.loader import SafeLoader
13
+ from PIL import Image
14
+ AUTHENTICATOR = None
15
+ TBYB_LOGO = Image.open('./assets/TBYB_logo_light.png')
16
+ USER_LOGGED_IN = False
17
+ USER_DATABASE_PATH = './data/user_database.yaml'
18
+ def create_new_user(authenticator, users):
19
+ try:
20
+ if authenticator.register_user('Register user', preauthorization=False):
21
+ st.success('User registered successfully')
22
+ except Exception as e:
23
+ st.error(e)
24
+ with open(USER_DATABASE_PATH, 'w') as file:
25
+ yaml.dump(users, file, default_flow_style=False)
26
+ def forgot_password(authenticator, users):
27
+ try:
28
+ username_of_forgotten_password, email_of_forgotten_password, new_random_password = authenticator.forgot_password(
29
+ 'Forgot password')
30
+ if username_of_forgotten_password:
31
+ st.success('New password to be sent securely')
32
+ # Random password should be transferred to user securely
33
+ except Exception as e:
34
+ st.error(e)
35
+ with open(USER_DATABASE_PATH, 'w') as file:
36
+ yaml.dump(users, file, default_flow_style=False)
37
+ def update_account_details(authenticator, users):
38
+ if st.session_state["authentication_status"]:
39
+ try:
40
+ if authenticator.update_user_details(st.session_state["username"], 'Update user details'):
41
+ st.success('Entries updated successfully')
42
+ except Exception as e:
43
+ st.error(e)
44
+ with open(USER_DATABASE_PATH, 'w') as file:
45
+ yaml.dump(users, file, default_flow_style=False)
46
+ def reset_password(authenticator, users):
47
+ if st.session_state["authentication_status"]:
48
+ try:
49
+ if authenticator.reset_password(st.session_state["username"], 'Reset password'):
50
+ st.success('Password modified successfully')
51
+ except Exception as e:
52
+ st.error(e)
53
+ with open(USER_DATABASE_PATH, 'w') as file:
54
+ yaml.dump(users, file, default_flow_style=False)
55
+ def user_login_create():
56
+ global AUTHENTICATOR
57
+ global TBYB_LOGO
58
+ global USER_LOGGED_IN
59
+ users = None
60
+ with open(USER_DATABASE_PATH) as file:
61
+ users = yaml.load(file, Loader=SafeLoader)
62
+ AUTHENTICATOR = stauth.Authenticate(
63
+ users['credentials'],
64
+ users['cookie']['name'],
65
+ users['cookie']['key'],
66
+ users['cookie']['expiry_days'],
67
+ users['preauthorized']
68
+ )
69
+ with st.sidebar:
70
+ st.image(TBYB_LOGO, width=70)
71
+ loginTab, registerTab, detailsTab = st.tabs(["Log in", "Register", "Account details"])
72
+
73
+ with loginTab:
74
+ name, authentication_status, username = AUTHENTICATOR.login('Login', 'main')
75
+ if authentication_status:
76
+ AUTHENTICATOR.logout('Logout', 'main')
77
+ st.write(f'Welcome *{name}*')
78
+ user_evaluation_variables.USERNAME = username
79
+ USER_LOGGED_IN = True
80
+ elif authentication_status == False:
81
+ st.error('Username/password is incorrect')
82
+ forgot_password(AUTHENTICATOR, users)
83
+ elif authentication_status == None:
84
+ st.warning('Please enter your username and password')
85
+ forgot_password(AUTHENTICATOR, users)
86
+ if not authentication_status:
87
+ with registerTab:
88
+ create_new_user(AUTHENTICATOR, users)
89
+ else:
90
+ with detailsTab:
91
+ st.write('**Username:** ', username)
92
+ st.write('**Name:** ', name)
93
+ st.write('**Email:** ', users['credentials']['usernames'][username]['email'])
94
+ # update_account_details(AUTHENTICATOR, users)
95
+ reset_password(AUTHENTICATOR, users)
96
+
97
+
98
+ return USER_LOGGED_IN
99
+ def setup_page_banner():
100
+ global USER_LOGGED_IN
101
+ # for tab in [tab1, tab2, tab3, tab4, tab5]:
102
+ c1,c2,c3,c4,c5,c6,c7,c8,c9 = st.columns(9)
103
+ with c5:
104
+ st.image(TBYB_LOGO, use_column_width=True)
105
+ for col in [c1,c2,c3,c4,c5,c6,c7,c8,c9]:
106
+ col = None
107
+ st.title('Try Before You Bias (TBYB)')
108
+ st.write('*A Quantitative T2I Bias Evaluation Tool*')
109
+ def setup_how_to():
110
+ expander = st.expander("How to Use")
111
+ expander.write("1. Login to your TBYB Account using the bar on the right\n"
112
+ "2. Navigate to the '\U0001F527 Setup' tab and input the ID of the HuggingFace \U0001F917 T2I model you want to evaluate\n")
113
+ expander.image(Image.open('./assets/HF_MODEL_ID_EXAMPLE.png'))
114
+ expander.write("3. Test your chosen model by generating an image using an input prompt e.g.: 'A corgi with some cool sunglasses'\n")
115
+ expander.image(Image.open('./assets/lykon_corgi.png'))
116
+ expander.write("4. Navigate to the '\U0001F30E General Eval.' or '\U0001F3AF Task-Oriented Eval.' tabs "
117
+ " to evaluate your model once it has been loaded\n"
118
+ "5. Once you have generated some evaluation images, head over to the '\U0001F4C1 Generated Images' tab to have a look at them\n"
119
+ "6. To check out your evaluations or all of the TBYB Community evaluations, head over to the '\U0001F4CA Model Comparison' tab\n"
120
+ "7. For more information about the evaluation process, see our paper at --PAPER HYPERLINK-- or navigate to the "
121
+ " '\U0001F4F0 Additional Information' tab for a TL;DR.\n"
122
+ "8. For any questions or to report any bugs/issues. Please contact [email protected].\n")
123
+
124
+ def setup_additional_information_tab(tab):
125
+ with tab:
126
+ st.header("1. Quantifying Bias in Text-to-Image (T2I) Generative Models")
127
+ st.markdown(
128
+ """
129
+ *Based on the article of the same name available here --PAPER HYPERLINK--
130
+
131
+ Authors: Jordan Vice, Naveed Akhtar, Richard Hartley and Ajmal Mian
132
+
133
+ This web-app was developed by **Jordan Vice** to accompany the article, serving as a practical
134
+ implementation of how T2I model biases can be quantitatively assessed and compared. Evaluation results from
135
+ all *base* models discussed in the paper have been incorporated into the TBYB community results and we hope
136
+ that others share their evaluations as we look to further the discussion on transparency and reliability
137
+ of T2I models.
138
+
139
+ """)
140
+
141
+ st.header('2. A (very) Brief Summary')
142
+ st.image(Image.open('./assets/TBYB_flowchart.png'))
143
+ st.markdown(
144
+ """
145
+ Bias in text-to-image models can propagate unfair social representations and could be exploited to
146
+ aggressively market ideas or push controversial or sinister agendas. Existing T2I model bias evaluation
147
+ methods have focused on social biases. So, we proposed a bias evaluation methodology that considered
148
+ general and task-oriented biases, spawning the Try Before You Bias (**TBYB**) application as a result.
149
+ """
150
+ )
151
+ st.markdown(
152
+ """
153
+ We proposed three novel metrics to quantify T2I model biases:
154
+ 1. Distribution Bias - $B_D$
155
+ 2. Jaccard Hallucination - $H_J$
156
+ 3. Generative Miss Rate - $M_G$
157
+
158
+ Open the appropriate drop-down menu to understand the logic and inspiration behind each metric.
159
+ """
160
+ )
161
+ c1,c2,c3 = st.columns(3)
162
+ with c1:
163
+ with st.expander("Distribution Bias - $B_D$"):
164
+ st.markdown(
165
+ """
166
+ Using the Area under the Curve (AuC) as an evaluation metric in machine learning is not novel. However,
167
+ in the context of T2I models, using AuC allows us to define the distribution of objects that have been
168
+ detected in generated output image scenes.
169
+
170
+ So, every time an object is detected in a scene, we update a dictionary (which is available for
171
+ download after running an evaluation). After evaluating a full set of images, you can use this
172
+ information to determine what objects appear more frequently than others.
173
+
174
+ After all images are evaluated, we sort the objects in descending order and normalize the data. We
175
+ then use the normalized values to calculate $B_D$, using the trapezoidal AuC rule i.e.:
176
+
177
+ $B_D = \\Sigma_{i=1}^{M-1}\\frac{n_i+n_{i+1}}{2}$
178
+
179
+ So, if a user conducts a task-oriented study on biases related to **dogs** using a model
180
+ that was heavily biased using pictures of animals in the wild, you might find that after running
181
+ evaluations, the most common objects detected were trees and grass - even if these objects weren't
182
+ specified in the prompt. This would result in a very low $B_D$ in comparison to a model that for
183
+ example was trained on images of dogs and animals in various different scenarios $\\rightarrow$
184
+ which would result in a *higher* $B_D$ in comparison.
185
+ """
186
+ )
187
+ with c2:
188
+ with st.expander("Jaccard Hallucination - $H_J$"):
189
+ st.markdown(
190
+ """
191
+ Hallucination is a very common phenomenon that is discussed in relation to generative AI, particularly
192
+ in relation to some of the most popular large language models. Depending on where you look, hallucinations
193
+ can be defined as being positive, negative, or just something to observe $\\rightarrow$ a sentiment
194
+ that we echo in our bias evaluations.
195
+
196
+ Now, how does hallucination tie into bias? In our work, we use hallucination to define how often a
197
+ T2I model will *add* objects that weren't specified OR, how often it will *omit* objects that were
198
+ specified. This indicates that there could be an innate shift in bias in the model, causing it to
199
+ add or omit certain objects.
200
+
201
+ Initially, we considered using two variables $H^+$ and $H^-$ to define these two dimensions of
202
+ hallucination. Then, we considered the Jaccard similarity coefficient, which
203
+ measures the similarity *and* diversity of two sets of objects/samples - defining this as
204
+ Jaccard Hallucination - $H_J$.
205
+
206
+ Simply put, we define the set of objects detected in the input prompt and then detect the objects in
207
+ the corresponding output image. Then, we determine the intersect over union. For a model, we
208
+ calculate the average $H_J$ across generated images using:
209
+
210
+ $H_J = \\frac{1}{N}\\Sigma_{i=0}^{N-1}\\left(1-\\frac{\\mathcal{X}_i\\cap\\mathcal{Y}_i}{\\mathcal{X}_i\\cup\\mathcal{Y}_i}\\right)$
211
+
212
+ """
213
+ )
214
+ with c3:
215
+ with st.expander("Generative Miss Rate - $M_G$"):
216
+ st.markdown(
217
+ """
218
+ Whenever fairness and trust are discussed in the context of machine learning and AI systems,
219
+ performance is always highlighted as a key metric - regardless of the downstream task. So, in terms
220
+ of evaluating bias, we thought that it would be important to see if there was a correlation
221
+ between bias and performance (as we predicted). And while the other metrics do evaluate biases
222
+ in terms of misalignment, they do not consider the relationship between bias and performance.
223
+
224
+ We use an additional CLIP model to assist in calculating Generative Miss Rate - $M_G$. Logically,
225
+ as a model becomes more biased, it will begin to diverge away from the intended target and so, the
226
+ miss rate of the generative model will increase as a result. This was a major consideration when
227
+ designing this metric.
228
+
229
+ We use the CLIP model as a binary classifier, differentiating between two classes:
230
+ - the prompt used to generate the image
231
+ - **NOT** the prompt
232
+
233
+ Through our experiments on intentionally-biased T2I models, we found that there was a clear
234
+ relationship between $M_G$ and the extent of bias. So, we can use this metric to quantify and infer
235
+ how badly model performances have been affected by their biases.
236
+ """
237
+ )
238
+ st.header('3. TBYB Constraints')
239
+ st.markdown(
240
+ """
241
+ While we have attempted to design a comprehensive, automated bias evaluation tool, we must acknowledge that
242
+ in its infancy, TBYB has some constraints:
243
+ - We have not checked the validity of *every* single T2I model and model type on HuggingFace so we cannot
244
+ promise that all T2I models will work - if you run into issues with a model that you think should work, feel
245
+ free to reach out!
246
+ - Currently, a model_index.json file is required to load models and use them with TBYB; we will look to
247
+ address other models in future work
248
+ - TBYB only works on T2I models hosted on HuggingFace, other model repositories are not currently supported
249
+ - Adaptor models are not currently supported, we will look to add evaluation functionalities of these
250
+ models in the future.
251
+ - Download, generation, inference and evaluation times are all hardware dependent.
252
+
253
+ Keep in mind that these constraints may be removed or added to at any time.
254
+ """)
255
+ st.header('4. Misuse, Malicious Use, and Out-of-Scope Use')
256
+ st.markdown(
257
+ """
258
+ Given this application is used for the assessment of T2I biases and relies on
259
+ pre-trained models available on HuggingFace, we are not responsible for any content generated
260
+ by public-facing models that have been used to generate images using this application.
261
+
262
+ TBYB is proposed as an auxiliary tool to assess model biases and thus, if a chosen model is found to output
263
+ insensitive, disturbing, distressing or offensive images that propagate harmful stereotypes or
264
+ representations of marginalised groups, please address your concerns to the model providers.
265
+
266
+
267
+ However, given the TBYB tool is designed for bias quantification and is driven by transparency, it would be
268
+ beneficial to the TBYB community to share evaluations of biased T2I models!
269
+
270
+ We have no association with HuggingFace \U0001F917; we only use their services as a model repository,
271
+ given their growth in popularity in the computer science community recently.
272
+
273
+
274
+ For further questions/queries or if you want to simply strike a conversation,
275
+ please reach out to Jordan Vice at: [email protected]""")
276
+
277
+ setup_page_banner()
278
+ setup_how_to()
279
+
280
+
281
+ if user_login_create():
282
+ tab1, tab2, tab3, tab4, tab5, tab6 = st.tabs(["\U0001F527 Setup", "\U0001F30E General Eval.", "\U0001F3AF Task-Oriented Eval.",
283
+ "\U0001F4CA Model Comparison", "\U0001F4C1 Generated Images", "\U0001F4F0 Additional Information"])
284
+ setup_additional_information_tab(tab6)
285
+
286
+ # PLASTER THE LOGO EVERYWHERE
287
+ tab2.subheader("General Bias Evaluation")
288
+ tab2.write("Waiting for \U0001F527 Setup to be complete...")
289
+ tab3.subheader("Task-Oriented Bias Evaluation")
290
+ tab3.write("Waiting for \U0001F527 Setup to be complete...")
291
+ tab4.write("Check out other model evaluation results from users across the **TBYB** Community! \U0001F30E ")
292
+ tab4.write("You can also just compare your own model evaluations by clicking the '*Personal Evaluation*' buttons")
293
+ MCOMP.initialise_page(tab4)
294
+ tab5.subheader("Generated Images from General and Task-Oriented Bias Evaluations")
295
+ tab5.write("Waiting for \U0001F527 Setup to be complete...")
296
+
297
+ with tab1:
298
+ with st.form("model_definition_form", clear_on_submit=True):
299
+ modelID = st.text_input('Input the HuggingFace \U0001F917 T2I model_id for the model you '
300
+ 'want to analyse e.g.: "runwayml/stable-diffusion-v1-5"')
301
+ submitted1 = st.form_submit_button("Submit")
302
+ if modelID:
303
+ with st.spinner('Checking if ' + modelID + ' is valid and downloading it (if required)'):
304
+ modelLoaded = MLOAD.check_if_model_exists(modelID)
305
+ if modelLoaded is not None:
306
+ # st.write("Located " + modelID + " model_index.json file")
307
+ st.write("Located " + modelID)
308
+
309
+ modelType = MLOAD.get_model_info(modelLoaded)
310
+ if modelType is not None:
311
+ st.write("Model is of Type: ", modelType)
312
+
313
+ if submitted1:
314
+ MINFER.TargetModel = MLOAD.import_model(modelID, modelType)
315
+ if MINFER.TargetModel is not None:
316
+ st.write("Text-to-image pipeline looks like this:")
317
+ st.write(MINFER.TargetModel)
318
+ user_evaluation_variables.MODEL = modelID
319
+ user_evaluation_variables.MODEL_TYPE = modelType
320
+ else:
321
+ st.error('The Model: ' + modelID + ' does not appear to exist or the model does not contain a model_index.json file.'
322
+ ' Please check that the HuggingFace repo ID is valid.'
323
+ ' For more help, please see the "How to Use" Tab above.', icon="🚨")
324
+ if modelID:
325
+ with st.form("example_image_gen_form", clear_on_submit=True):
326
+ testPrompt = st.text_input('Input a random test prompt to test out your '
327
+ 'chosen model and see if its generating images:')
328
+ submitted2 = st.form_submit_button("Submit")
329
+ if testPrompt and submitted2:
330
+ with st.spinner("Generating an image with the prompt:\n"+testPrompt+"(This may take some time)"):
331
+ testImage = MINFER.generate_test_image(MINFER.TargetModel, testPrompt)
332
+ st.image(testImage, caption='Model: ' + modelID + ' Prompt: ' + testPrompt)
333
+ st.write('''If you are happy with this model, navigate to the other tabs to evaluate bias!
334
+ Otherwise, feel free to load up a different model and run it again''')
335
+
336
+ if MINFER.TargetModel is not None:
337
+ tab_manager.completed_setup([tab2, tab3, tab4, tab5], modelID)
338
+ else:
339
+ MCOMP.databaseDF = None
340
+ user_evaluation_variables.reset_variables('general')
341
+ user_evaluation_variables.reset_variables('task-oriented')
342
+ st.write('')
343
+ st.warning('Log in or register your email to get started! ', icon="⚠️")
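For reference, a sketch (not part of the commit) of the shape ./data/user_database.yaml is assumed to take, inferred from the fields user_login_create() reads; the account entry, cookie values and hashed password below are placeholders.

import yaml

users = yaml.safe_load("""
credentials:
  usernames:
    example_user:
      email: user@example.com
      name: Example User
      password: "$2b$12$placeholder_bcrypt_hash"
cookie:
  name: tbyb_cookie
  key: some_signature_key
  expiry_days: 30
preauthorized:
  emails:
    - admin@example.com
""")
print(users['cookie']['name'], list(users['credentials']['usernames']))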
tab_manager.py ADDED
@@ -0,0 +1,473 @@
1
+ import streamlit as st
2
+ import model_inferencing as MINFER
3
+ import general_bias_measurement as GBM
4
+ import model_comparison as MCOMP
5
+ import user_evaluation_variables
6
+ import pandas as pd
7
+ import numpy as np
8
+ import json
9
+ import csv
10
+ from itertools import cycle
11
+ import random
12
+ import time
13
+ import datetime
14
+ import zipfile
15
+ from io import BytesIO, StringIO
16
+ def completed_setup(tabs, modelID):
17
+ with tabs[0]:
18
+ st.write("\U0001F917 ", modelID, " has been loaded!")
19
+ st.write("Ready for General Bias Evaluation")
20
+ # general_bias_eval_setup(tabs[0])
21
+ with tabs[1]:
22
+ st.write("\U0001F917 ", modelID, " has been loaded!")
23
+ st.write("Ready for Task-Oriented Bias Evaluation")
24
+ with tabs[3]:
25
+ if not all([user_evaluation_variables.OBJECT_IMAGES_IN_UI, user_evaluation_variables.OCCUPATION_IMAGES_IN_UI, user_evaluation_variables.TASK_IMAGES_IN_UI]):
26
+ st.write("\U0001F917 ", modelID, " has been loaded!")
27
+ st.write("Waiting for Images to be generated.")
28
+ # if any([user_evaluation_variables.OBJECT_IMAGES_IN_UI, user_evaluation_variables.OCCUPATION_IMAGES_IN_UI,
29
+ # user_evaluation_variables.TASK_IMAGES_IN_UI]):
30
+ update_images_tab(tabs[3])
31
+ with tabs[0]:
32
+ general_bias_eval_setup(tabs[0], modelID, tabs[3])
33
+ with tabs[1]:
34
+ task_oriented_bias_eval_setup(tabs[1],modelID, tabs[3])
35
+ def general_bias_eval_setup(tab, modelID, imagesTab):
36
+
37
+ generalBiasSetupDF_EVAL = pd.DataFrame(
38
+ {
39
+ "GEN Eval. Variable": ["No. Images to Generate per prompt", "No. Inference Steps", "Image Size (N x N)"],
40
+ "GEN Values": ["10", "100", "512"],
41
+ }
42
+ )
43
+ generalBiasSetupDF_TYPE = pd.DataFrame(
44
+ {
45
+ "Image Types": ["Objects", "Person in Frame", "Occupations / Label"],
46
+ "Check": [True, True, True],
47
+ }
48
+ )
49
+ tableColumn1, tableColumn2 = st.columns(2)
50
+ with tab:
51
+ with tableColumn1:
52
+ GENValTable = st.data_editor(
53
+ generalBiasSetupDF_EVAL,
54
+ column_config={
55
+ "GEN Eval. Variable": st.column_config.Column(
56
+ "Variable",
57
+ help="General Bias Evaluation variable to control extent of evaluations",
58
+ width=None,
59
+ required=None,
60
+ disabled=True,
61
+ ),
62
+ "GEN Values": st.column_config.Column(
63
+ "Values",
64
+ help="Input values in this column",
65
+ width=None,
66
+ required=True,
67
+ disabled=False,
68
+ ),
69
+ },
70
+ hide_index=True,
71
+ num_rows="fixed",
72
+ )
73
+ with tableColumn2:
74
+ GENCheckTable = st.data_editor(
75
+ generalBiasSetupDF_TYPE,
76
+ column_config={
77
+ "Check": st.column_config.CheckboxColumn(
78
+ "Select",
79
+ help="Select the types of images you want to generate",
80
+ default=False,
81
+ )
82
+ },
83
+ disabled=["Image Types"],
84
+ hide_index=True,
85
+ num_rows="fixed",
86
+ )
87
+ if st.button('Evaluate!', key="EVAL_BUTTON_GEN"):
88
+ initiate_general_bias_evaluation(tab, modelID, [GENValTable, GENCheckTable], imagesTab)
89
+ st.rerun()
90
+
91
+ if user_evaluation_variables.RUN_TIME and user_evaluation_variables.CURRENT_EVAL_TYPE == 'general':
92
+ GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21, 'general')
93
+ st.write("\U0001F553 Time Taken: ", user_evaluation_variables.RUN_TIME)
94
+
95
+ saveEvalsButton = st.button("Save + Upload Evaluations", key='SAVE_EVAL_GEN')
96
+ saveDistButton = st.button("Download Object Distribution", key='SAVE_TOP_GEN')
97
+ if saveEvalsButton:
98
+ st.write("Saving and uploading evaluations")
99
+ user_evaluation_variables.update_evaluation_table('general',False)
100
+ user_evaluation_variables.reset_variables('general')
101
+ if saveDistButton:
102
+ download_word_distribution_csv(user_evaluation_variables.EVAL_METRICS,
103
+ user_evaluation_variables.EVAL_ID, 'general')
104
+
105
+
106
+ def task_oriented_bias_eval_setup(tab,modelID,imagesTab):
107
+ biasSetupDF_EVAL = pd.DataFrame(
108
+ {
109
+ "TO Eval. Variable": ["No. Images to Generate per prompt", "No. Inference Steps", "Image Size (N x N)"],
110
+ "TO Values": ["10", "100", "512"],
111
+ }
112
+ )
113
+ with tab:
114
+ TOValTable = st.data_editor(
115
+ biasSetupDF_EVAL,
116
+ column_config={
117
+ "TO Eval. Variable": st.column_config.Column(
118
+ "Variable",
119
+ help="General Bias Evaluation variable to control extent of evaluations",
120
+ width=None,
121
+ required=None,
122
+ disabled=True,
123
+ ),
124
+ "TO Values": st.column_config.Column(
125
+ "Values",
126
+ help="Input values in this column",
127
+ width=None,
128
+ required=True,
129
+ disabled=False,
130
+ ),
131
+ },
132
+ hide_index=True,
133
+ num_rows="fixed",
134
+ )
135
+ target = st.text_input('What is the single-token target of your task-oriented evaluation study '
136
+ 'e.g.: "burger", "coffee", "men", "women"')
137
+
138
+ if st.button('Evaluate!', key="EVAL_BUTTON_TO"):
139
+ if len(target) > 0:
140
+ initiate_task_oriented_bias_evaluation(tab, modelID, TOValTable, target, imagesTab)
141
+ st.rerun()
142
+ else:
143
+ st.error('Please input a target for your task-oriented analysis', icon="🚨")
144
+ # update_images_tab(imagesTab)
145
+ if user_evaluation_variables.RUN_TIME and user_evaluation_variables.CURRENT_EVAL_TYPE == 'task-oriented':
146
+ GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21, 'task-oriented')
147
+ st.write("\U0001F553 Time Taken: ", user_evaluation_variables.RUN_TIME)
148
+ saveEvalsButton = st.button("Save + Upload Evaluations", key='SAVE_EVAL_TASK')
149
+ saveDistButton = st.button("Download Object Distribution", key='SAVE_TOP_TASK')
150
+ if saveEvalsButton:
151
+ st.write("Saving and uploading evaluations")
152
+ user_evaluation_variables.update_evaluation_table('task-oriented',False)
153
+ user_evaluation_variables.reset_variables('task-oriented')
154
+ if saveDistButton:
155
+ download_word_distribution_csv(user_evaluation_variables.EVAL_METRICS,
156
+ user_evaluation_variables.EVAL_ID, user_evaluation_variables.TASK_TARGET)
157
+ # update_images_tab(imagesTab)
158
+
159
+ def download_word_distribution_csv(data, evalID, evalType):
160
+ filePath = './'+evalID+'_'+evalType+'_word_distribution.csv'
161
+
162
+ listOfObjects = list(data[0].items())
163
+ with open(filePath, 'w', newline='') as fp:
164
+ csvwriter = csv.writer(fp)
165
+ csvwriter.writerows([["Evaluation ID", evalID],
166
+ ["Distribution Bias", data[2]],
167
+ ["Jaccard hallucination", np.mean(data[3])],
168
+ ["Generative Miss Rate", np.mean(data[4])]])
169
+ csvwriter.writerow(['Position', 'Object', 'No. Occurrences', 'Normalized'])
170
+ for obj, val, norm, ii in zip(listOfObjects, data[0].values(), data[1], range(len(listOfObjects))):
171
+ csvwriter.writerow([ii, obj[0], val, norm])
172
+ st.success('Successfully downloaded word distribution data!', icon="✅")
173
+
174
+ def initiate_general_bias_evaluation(tab, modelID, specs, imagesTab):
175
+ startTime = time.time()
176
+ objectData = None
177
+ occupationData = None
178
+ objects = []
179
+ actions = []
180
+ occupations = []
181
+ occupationDescriptors = []
182
+ objectPrompts = None
183
+ occupationPrompts = None
184
+
185
+ objectImages = []
186
+ objectCaptions = []
187
+ occupationImages = []
188
+ occupationCaptions = []
189
+ evaluationImages = []
190
+ evaluationCaptions = []
191
+ with tab:
192
+ st.write("Initiating General Bias Evaluation Experiments with the following setup:")
193
+ st.write(" ***Model*** = ", modelID)
194
+ infoColumn1, infoColumn2 = st.columns(2)
195
+ with infoColumn1:
196
+ st.write(" ***No. Images per prompt*** = ", specs[0]["GEN Values"][0])
197
+ st.write(" ***No. Steps*** = ", specs[0]["GEN Values"][1])
198
+ st.write(" ***Image Size*** = ", specs[0]["GEN Values"][2], "$\\times$", specs[0]["GEN Values"][2])
199
+ with infoColumn2:
200
+ st.write(" ***Objects*** = ", specs[1]["Check"][0])
201
+ st.write(" ***Objects and Actions*** = ", specs[1]["Check"][1])
202
+ st.write(" ***Occupations*** = ", specs[1]["Check"][2])
203
+ st.markdown("___")
204
+ if specs[1]["Check"][0]:
205
+ objectData = read_csv_to_list("./data/list_of_objects.csv")
206
+ if specs[1]["Check"][2]:
207
+ occupationData = read_csv_to_list("./data/list_of_occupations.csv")
208
+ if objectData == None and occupationData == None:
209
+ st.error('Make sure that at least one of the "Objects" or "Occupations" rows are checked', icon="🚨")
210
+ else:
211
+ if specs[1]["Check"][0]:
212
+ for row in objectData[1:]:
213
+ objects.append(row[0])
214
+ if specs[1]["Check"][1]:
215
+ for row in objectData[1:]:
216
+ actions.append(row[1:])
217
+ if specs[1]["Check"][2]:
218
+ for row in occupationData[1:]:
219
+ occupations.append(row[0])
220
+ occupationDescriptors.append(row[1:])
221
+ with infoColumn1:
222
+ st.write("***No. Objects*** = ", len(objects))
223
+ st.write("***No. Actions*** = ", len(actions)*3)
224
+ with infoColumn2:
225
+ st.write("***No. Occupations*** = ", len(occupations))
226
+ st.write("***No. Occupation Descriptors*** = ", len(occupationDescriptors)*3)
227
+ if len(objects) > 0:
228
+ objectPrompts = MINFER.construct_general_bias_evaluation_prompts(objects, actions)
229
+ if len(occupations) > 0:
230
+ occupationPrompts = MINFER.construct_general_bias_evaluation_prompts(occupations, occupationDescriptors)
231
+ if objectPrompts is not None:
232
+ OBJECTprogressBar = st.progress(0, text="Generating Object-related images. Please wait.")
233
+ objectImages, objectCaptions = MINFER.generate_test_images(OBJECTprogressBar, "Generating Object-related images. Please wait.",
234
+ objectPrompts, int(specs[0]["GEN Values"][0]),
235
+ int(specs[0]["GEN Values"][1]), int(specs[0]["GEN Values"][2]))
236
+ evaluationImages+=objectImages
237
+ evaluationCaptions+=objectCaptions[0]
238
+ TXTObjectPrompts = ""
239
+
240
+ if occupationPrompts is not None:
241
+ OCCprogressBar = st.progress(0, text="Generating Occupation-related images. Please wait.")
242
+ occupationImages, occupationCaptions = MINFER.generate_test_images(OCCprogressBar, "Generating Occupation-related images. Please wait.",
243
+ occupationPrompts, int(specs[0]["GEN Values"][0]),
244
+ int(specs[0]["GEN Values"][1]), int(specs[0]["GEN Values"][2]))
245
+ evaluationImages += occupationImages
246
+ evaluationCaptions += occupationCaptions[0]
247
+
248
+ if len(evaluationImages) > 0:
249
+ EVALprogressBar = st.progress(0, text="Evaluating "+modelID+" Model Images. Please wait.")
250
+ user_evaluation_variables.EVAL_METRICS = GBM.evaluate_t2i_model_images(evaluationImages, evaluationCaptions, EVALprogressBar, False, "GENERAL")
251
+ # GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21)
252
+ elapsedTime = time.time() - startTime
253
+ # st.write("\U0001F553 Time Taken: ", str(datetime.timedelta(seconds=elapsedTime)).split(".")[0])
254
+
255
+ user_evaluation_variables.NO_SAMPLES = len(evaluationImages)
256
+ user_evaluation_variables.RESOLUTION = specs[0]["GEN Values"][2] + "x" + specs[0]["GEN Values"][2]
257
+ user_evaluation_variables.INFERENCE_STEPS = int(specs[0]["GEN Values"][1])
258
+ user_evaluation_variables.GEN_OBJECTS = bool(specs[1]["Check"][0])
259
+ user_evaluation_variables.GEN_ACTIONS = bool(specs[1]["Check"][1])
260
+ user_evaluation_variables.GEN_OCCUPATIONS = bool(specs[1]["Check"][2])
261
+ user_evaluation_variables.DIST_BIAS = float(f"{user_evaluation_variables.EVAL_METRICS[2]:.4f}")
262
+ user_evaluation_variables.HALLUCINATION = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[3]):.4f}")
263
+ user_evaluation_variables.MISS_RATE = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[4]):.4f}")
264
+ user_evaluation_variables.EVAL_ID = MCOMP.get_evaluation_id('general', True)
265
+ user_evaluation_variables.DATE = datetime.datetime.utcnow().strftime('%d-%m-%Y')
266
+ user_evaluation_variables.TIME = datetime.datetime.utcnow().strftime('%H:%M:%S')
267
+ user_evaluation_variables.RUN_TIME = str(datetime.timedelta(seconds=elapsedTime)).split(".")[0]
268
+
269
+ user_evaluation_variables.OBJECT_IMAGES =objectImages
270
+ user_evaluation_variables.OBJECT_CAPTIONS = objectCaptions
271
+ user_evaluation_variables.OCCUPATION_IMAGES = occupationImages
272
+ user_evaluation_variables.OCCUPATION_CAPTIONS = occupationCaptions
273
+ user_evaluation_variables.CURRENT_EVAL_TYPE = 'general'
274
+
275
+
276
+ def initiate_task_oriented_bias_evaluation(tab, modelID, specs, target, imagesTab):
277
+ startTime = time.time()
278
+ TASKImages = []
279
+ TASKCaptions = []
280
+ with tab:
281
+ st.write("Initiating Task-Oriented Bias Evaluation Experiments with the following setup:")
282
+ st.write(" ***Model*** = ", modelID)
283
+ infoColumn1, infoColumn2 = st.columns(2)
284
+ st.write(" ***No. Images per prompt*** = ", specs["TO Values"][0])
285
+ st.write(" ***No. Steps*** = ", specs["TO Values"][1])
286
+ st.write(" ***Image Size*** = ", specs["TO Values"][2], "$\\times$", specs["TO Values"][2])
287
+ st.write(" ***Target*** = ", target.lower())
288
+ st.markdown("___")
289
+
290
+ captionsToExtract = 50
291
+ if (captionsToExtract * int(specs['TO Values'][0])) < 30:
292
+ st.error('There should be at least 30 images generated. You are attempting to generate:\t'
293
+ + str(captionsToExtract * int(specs['TO Values'][0]))+'.\nPlease readjust your No. Images per prompt',
294
+ icon="🚨")
295
+ else:
296
+ COCOLoadingBar = st.progress(0, text="Scanning through COCO Dataset for relevant prompts. Please wait")
297
+ prompts, cocoIDs = get_COCO_captions('./data/COCO_captions.json', target.lower(), COCOLoadingBar, captionsToExtract)
298
+ if len(prompts) == 0:
299
+ st.error('Woops! Could not find **ANY** relevant COCO prompts for the target: '+target.lower()+
300
+ '\nPlease input a different target', icon="🚨")
301
+ elif len(prompts) > 0 and len(prompts) < captionsToExtract:
302
+ st.warning('WARNING: Only found '+str(len(prompts))+ ' relevant COCO prompts for the target: '+target.lower()+
303
+ '\nWill work with these. Nothing to worry about!', icon="⚠️")
304
+ else:
305
+ st.success('Successfully found '+str(captionsToExtract)+' relevant COCO prompts', icon="✅")
306
+ if len(prompts) > 0:
307
+ COCOUIOutput = []
308
+ for id, pr in zip(cocoIDs, prompts):
309
+ COCOUIOutput.append([id, pr])
310
+ st.write('**Here are some of the randomised '+'"'+target.lower()+'"'+' captions extracted from the COCO dataset**')
311
+ COCOUIOutput.insert(0, ('ID', 'Caption'))
312
+ st.table(COCOUIOutput[:11])
313
+ TASKprogressBar = st.progress(0, text="Generating Task-oriented images. Please wait.")
314
+ TASKImages, TASKCaptions = MINFER.generate_task_oriented_images(TASKprogressBar,"Generating Task-oriented images. Please wait.",
315
+ prompts, cocoIDs, int(specs["TO Values"][0]),
316
+ int(specs["TO Values"][1]), int(specs["TO Values"][2]))
317
+
318
+ EVALprogressBar = st.progress(0, text="Evaluating " + modelID + " Model Images. Please wait.")
319
+ user_evaluation_variables.EVAL_METRICS = GBM.evaluate_t2i_model_images(TASKImages, TASKCaptions[0], EVALprogressBar, False, "TASK")
320
+
321
+
322
+ # GBM.output_eval_results(user_evaluation_variables.EVAL_METRICS, 21)
323
+ elapsedTime = time.time() - startTime
324
+ # st.write("\U0001F553 Time Taken: ", str(datetime.timedelta(seconds=elapsedTime)).split(".")[0])
325
+
326
+ user_evaluation_variables.NO_SAMPLES = len(TASKImages)
327
+ user_evaluation_variables.RESOLUTION = specs["TO Values"][2]+"x"+specs["TO Values"][2]
328
+ user_evaluation_variables.INFERENCE_STEPS = int(specs["TO Values"][1])
329
+ user_evaluation_variables.DIST_BIAS = float(f"{user_evaluation_variables.EVAL_METRICS[2]:.4f}")
330
+ user_evaluation_variables.HALLUCINATION = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[3]):.4f}")
331
+ user_evaluation_variables.MISS_RATE = float(f"{np.mean(user_evaluation_variables.EVAL_METRICS[4]):.4f}")
332
+ user_evaluation_variables.TASK_TARGET = target.lower()
333
+ user_evaluation_variables.EVAL_ID = MCOMP.get_evaluation_id('task-oriented', True)
334
+ user_evaluation_variables.DATE = datetime.datetime.utcnow().strftime('%d-%m-%Y')
335
+ user_evaluation_variables.TIME = datetime.datetime.utcnow().strftime('%H:%M:%S')
336
+ user_evaluation_variables.RUN_TIME = str(datetime.timedelta(seconds=elapsedTime)).split(".")[0]
337
+
338
+ user_evaluation_variables.TASK_IMAGES = TASKImages
339
+ user_evaluation_variables.TASK_CAPTIONS = TASKCaptions
340
+ user_evaluation_variables.TASK_COCOIDs = cocoIDs
341
+
342
+ user_evaluation_variables.CURRENT_EVAL_TYPE = 'task-oriented'
343
+
344
+
345
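+ # Bundles the generated images into a zip archive together with a CSV listing the prompts
+ # (numbered for object/occupation runs, keyed by COCO ID for task-oriented runs).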
+ def download_and_zip_images(zipImagePath, images, captions, imageType):
+ csvFileName = None
+ if imageType == 'object':
+ csvFileName = 'object_prompts.csv'
+ elif imageType == 'occupation':
+ csvFileName = 'occupation_prompts.csv'
+ else:
+ csvFileName = 'task-oriented_prompts.csv'
+ with st.spinner("Zipping images..."):
+ with zipfile.ZipFile(zipImagePath, 'w') as img_zip:
+ for idx, image in enumerate(images):
+ imgName = captions[1][idx]
+ imageFile = BytesIO()
+ image.save(imageFile, 'JPEG')
+ img_zip.writestr(imgName, imageFile.getvalue())
+
+ # Saving prompt data as an accompanying CSV file
+ string_buffer = StringIO()
+ csvwriter = csv.writer(string_buffer)
+
+ if imageType in ['object', 'occupation']:
+ csvwriter.writerow(['No.', 'Prompt'])
+ for prompt, ii in zip(captions[0], range(len(captions[0]))):
+ csvwriter.writerow([ii + 1, prompt])
+ else:
+ csvwriter.writerow(['COCO ID', 'Prompt'])
+ for prompt, id in zip(captions[0], user_evaluation_variables.TASK_COCOIDs):
+ csvwriter.writerow([id, prompt])
+
+ img_zip.writestr(csvFileName, string_buffer.getvalue())
+ st.success('Successfully zipped and downloaded images!', icon="✅")
+
+
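+ # Refreshes the images tab: one expander per image group (object, occupation, task-oriented),
+ # each showing the prompt list, a three-column image grid and a save button.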
+ def update_images_tab(imagesTab):
+ with imagesTab:
+ if len(user_evaluation_variables.OBJECT_IMAGES) > 0:
+ with st.expander('Object-related Images'):
+ user_evaluation_variables.OBJECT_IMAGES_IN_UI = True
+ TXTObjectPrompts = ""
+ for prompt, ii in zip(user_evaluation_variables.OBJECT_CAPTIONS[0], range(len(user_evaluation_variables.OBJECT_CAPTIONS[0]))):
+ TXTObjectPrompts += str(1 + ii) + '. ' + prompt + '\n'
+ st.write("**Object-related General Bias Evaluation Images**")
+ st.write("Number of Generated Images = ", len(user_evaluation_variables.OBJECT_IMAGES))
+ st.write("Corresponding Number of *unique* Captions = ", len(user_evaluation_variables.OBJECT_CAPTIONS[0]))
+ st.text_area("***List of Object Prompts***",
+ TXTObjectPrompts,
+ height=400,
+ disabled=False,
+ key='TEXT_AREA_OBJECT')
+ cols = cycle(st.columns(3))
+ for idx, image in enumerate(user_evaluation_variables.OBJECT_IMAGES):
+ next(cols).image(image, width=225, caption=user_evaluation_variables.OBJECT_CAPTIONS[1][idx])
+
+ saveObjectImages = st.button("Save Object-related Images")
+ if saveObjectImages:
+ zipPath = 'TBYB_' + user_evaluation_variables.USERNAME + '_' + user_evaluation_variables.EVAL_ID + '_object_related_images.zip'
+ download_and_zip_images(zipPath, user_evaluation_variables.OBJECT_IMAGES,
+ user_evaluation_variables.OBJECT_CAPTIONS, 'object')
+
+ if len(user_evaluation_variables.OCCUPATION_IMAGES) > 0:
+ user_evaluation_variables.OCCUPATION_IMAGES_IN_UI = True
+ with st.expander('Occupation-related Images'):
+ TXTOccupationPrompts = ""
+ for prompt, ii in zip(user_evaluation_variables.OCCUPATION_CAPTIONS[0], range(len(user_evaluation_variables.OCCUPATION_CAPTIONS[0]))):
+ TXTOccupationPrompts += str(1 + ii) + '. ' + prompt + '\n'
+ st.write("**Occupation-related General Bias Evaluation Images**")
+ st.write("Number of Generated Images = ", len(user_evaluation_variables.OCCUPATION_IMAGES))
+ st.write("Corresponding Number of *unique* Captions = ", len(user_evaluation_variables.OCCUPATION_CAPTIONS[0]))
+ st.text_area("***List of Occupation Prompts***",
+ TXTOccupationPrompts,
+ height=400,
+ disabled=False,
+ key='TEXT_AREA_OCCU')
+ cols = cycle(st.columns(3))
+ for idx, image in enumerate(user_evaluation_variables.OCCUPATION_IMAGES):
+ next(cols).image(image, width=225, caption=user_evaluation_variables.OCCUPATION_CAPTIONS[1][idx])
+
+ saveOccupationImages = st.button("Save Occupation-related Images")
+ if saveOccupationImages:
+ zipPath = 'TBYB_' + user_evaluation_variables.USERNAME + '_' + user_evaluation_variables.EVAL_ID + '_occupation_related_images.zip'
+ download_and_zip_images(zipPath, user_evaluation_variables.OCCUPATION_IMAGES,
+ user_evaluation_variables.OCCUPATION_CAPTIONS, 'occupation')
+
+ if len(user_evaluation_variables.TASK_IMAGES) > 0:
+ with st.expander(user_evaluation_variables.TASK_TARGET + '-related Images'):
+ user_evaluation_variables.TASK_IMAGES_IN_UI = True
+ TXTTaskPrompts = ""
+ for prompt, id in zip(user_evaluation_variables.TASK_CAPTIONS[0], user_evaluation_variables.TASK_COCOIDs):
+ TXTTaskPrompts += "ID_" + str(id) + '. ' + prompt + '\n'
+
+ st.write("**Task-oriented Bias Evaluation Images. Target** = ", user_evaluation_variables.TASK_TARGET)
+ st.write("Number of Generated Images = ", len(user_evaluation_variables.TASK_IMAGES))
+ st.write("Corresponding Number of *unique* Captions = ", len(user_evaluation_variables.TASK_CAPTIONS[0]))
+ st.text_area("***List of Task-Oriented Prompts***",
+ TXTTaskPrompts,
+ height=400,
+ disabled=False,
+ key='TEXT_AREA_TASK')
+ cols = cycle(st.columns(3))
+ for idx, image in enumerate(user_evaluation_variables.TASK_IMAGES):
+ next(cols).image(image, width=225, caption=user_evaluation_variables.TASK_CAPTIONS[1][idx])
+
+ saveTaskImages = st.button("Save Task-oriented Images")
+ if saveTaskImages:
+ zipPath = 'TBYB_' + user_evaluation_variables.USERNAME + '_' + user_evaluation_variables.EVAL_ID + '_' + user_evaluation_variables.TASK_TARGET + '-oriented_images.zip'
+ download_and_zip_images(zipPath, user_evaluation_variables.TASK_IMAGES,
+ user_evaluation_variables.TASK_CAPTIONS, 'task-oriented')
+
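+ # Scans the COCO caption annotations (shuffled with a fixed seed for reproducibility) and
+ # returns up to NPrompts captions containing the target word, together with their COCO IDs.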
+ def get_COCO_captions(filePath, target, progressBar, NPrompts=50):
+ with open(filePath, 'r') as f:
+ captionData = json.load(f)
+ COCOCaptions = []
+ COCOIDs = []
+ random.seed(42)
+ random.shuffle(captionData['annotations'])
+ for anno in captionData['annotations']:
+ if target in anno.get('caption').lower().split(' '):
+ if len(COCOCaptions) < NPrompts:
+ COCOCaptions.append(anno.get('caption').lower())
+ COCOIDs.append(str(anno.get('id')))
+ percentComplete = len(COCOCaptions) / NPrompts
+ progressBar.progress(percentComplete, text="Scanning through COCO Dataset for relevant prompts. Please wait")
+ return (COCOCaptions, COCOIDs)
+
+ def read_csv_to_list(filePath):
+ data = []
+ with open(filePath, 'r', newline='') as csvfile:
+ csvReader = csv.reader(csvfile)
+ for row in csvReader:
+ data.append(row)
+ return data
+
user_evaluation_variables.py ADDED
@@ -0,0 +1,189 @@
+ import yaml
+ from yaml import safe_load
+ import streamlit as st
+
+ USERNAME = None
+ EVAL_ID = None
+ MODEL = None
+ MODEL_TYPE = None
+ NO_SAMPLES = None
+ RESOLUTION = None
+ INFERENCE_STEPS = None
+ GEN_OBJECTS = None
+ GEN_ACTIONS = None
+ GEN_OCCUPATIONS = None
+ TASK_TARGET = None
+ DIST_BIAS = None
+ HALLUCINATION = None
+ MISS_RATE = None
+ DATE = None
+ TIME = None
+ RUN_TIME = None
+
+ EVAL_METRICS = None
+ OBJECT_IMAGES = []
+ OCCUPATION_IMAGES = []
+ TASK_IMAGES = []
+ OBJECT_CAPTIONS = None
+ OCCUPATION_CAPTIONS = None
+ TASK_CAPTIONS = None
+ TASK_COCOIDs = None
+
+ OBJECT_IMAGES_IN_UI = False
+ OCCUPATION_IMAGES_IN_UI = False
+ TASK_IMAGES_IN_UI = False
+ CURRENT_EVAL_TYPE = None
+
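+ # Appends the metrics of the current run to the per-user YAML evaluation database
+ # (general or task-oriented, depending on evalType), keyed by EVAL_ID.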
+ def update_evaluation_table(evalType, debugging):
+ global USERNAME
+ global EVAL_ID
+ global MODEL
+ global MODEL_TYPE
+ global NO_SAMPLES
+ global RESOLUTION
+ global INFERENCE_STEPS
+ global GEN_OBJECTS
+ global GEN_ACTIONS
+ global GEN_OCCUPATIONS
+ global TASK_TARGET
+ global DIST_BIAS
+ global HALLUCINATION
+ global MISS_RATE
+ global DATE
+ global TIME
+ global RUN_TIME
+ global CURRENT_EVAL_TYPE
+
+ if debugging:
+ st.write("Username: ", USERNAME)
+ st.write("EVAL_ID: ", EVAL_ID)
+ st.write("MODEL: ", MODEL)
+ st.write("MODEL_TYPE: ", MODEL_TYPE)
+ st.write("NO_SAMPLES: ", NO_SAMPLES)
+ st.write("RESOLUTION: ", RESOLUTION)
+ st.write("INFERENCE_STEPS: ", INFERENCE_STEPS)
+ st.write("GEN_OBJECTS: ", GEN_OBJECTS)
+ st.write("GEN_ACTIONS: ", GEN_ACTIONS)
+ st.write("GEN_OCCUPATIONS: ", GEN_OCCUPATIONS)
+ st.write("TASK_TARGET: ", TASK_TARGET)
+ st.write("DIST_BIAS: ", DIST_BIAS)
+ st.write("HALLUCINATION: ", HALLUCINATION)
+ st.write("MISS_RATE: ", MISS_RATE)
+ st.write("DATE: ", DATE)
+ st.write("TIME: ", TIME)
+ st.write("RUN_TIME: ", RUN_TIME)
+
+ newEvaluationData = None
+ if evalType == 'general':
+ evalDataPath = './data/general_eval_database.yaml'
+ newEvaluationData = {
+ "Model": MODEL,
+ "Model Type": MODEL_TYPE,
+ "No. Samples": NO_SAMPLES,
+ "Resolution": RESOLUTION,
+ "Inference Steps": INFERENCE_STEPS,
+ "Objects": GEN_OBJECTS,
+ "Actions": GEN_ACTIONS,
+ "Occupations": GEN_OCCUPATIONS,
+ "Dist. Bias": DIST_BIAS,
+ "Hallucination": HALLUCINATION,
+ "Gen. Miss Rate": MISS_RATE,
+ "Date": DATE,
+ "Time": TIME,
+ "Run Time": RUN_TIME
+ }
+ else:
+ evalDataPath = './data/task_oriented_eval_database.yaml'
+ newEvaluationData = {
+ "Model": MODEL,
+ "Model Type": MODEL_TYPE,
+ "No. Samples": NO_SAMPLES,
+ "Resolution": RESOLUTION,
+ "Inference Steps": INFERENCE_STEPS,
+ "Target": TASK_TARGET,
+ "Dist. Bias": DIST_BIAS,
+ "Hallucination": HALLUCINATION,
+ "Gen. Miss Rate": MISS_RATE,
+ "Date": DATE,
+ "Time": TIME,
+ "Run Time": RUN_TIME
+ }
+ with open(evalDataPath, 'r') as f:
+ yamlData = safe_load(f)
+
+ # st.write("OLD DATABASE ", yamlData['evaluations']['username'][USERNAME])
+ if USERNAME not in yamlData['evaluations']['username']:
+ if evalType == 'general':
+ st.success('Congrats on your first General Bias evaluation!', icon='\U0001F388')
+ else:
+ st.success('Congrats on your first Task-Oriented Bias evaluation!', icon='\U0001F388')
+ yamlData['evaluations']['username'][USERNAME] = {}
+
+ yamlData['evaluations']['username'][USERNAME][EVAL_ID] = newEvaluationData
+
+ st.write("NEW DATABASE ", yamlData['evaluations']['username'][USERNAME])
+ with open(evalDataPath, 'w') as yaml_file:
+ yaml_file.write(yaml.dump(yamlData, default_flow_style=False))
+
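+ # Clears the per-run state after an evaluation is saved; the image/caption buffers that are
+ # reset depend on whether the last run was 'general' or task-oriented.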
+ def reset_variables(evalType):
+ global USERNAME
+ global EVAL_ID
+ global MODEL
+ global MODEL_TYPE
+ global NO_SAMPLES
+ global RESOLUTION
+ global INFERENCE_STEPS
+ global GEN_OBJECTS
+ global GEN_ACTIONS
+ global GEN_OCCUPATIONS
+ global TASK_TARGET
+ global DIST_BIAS
+ global HALLUCINATION
+ global MISS_RATE
+ global DATE
+ global TIME
+ global RUN_TIME
+ global EVAL_METRICS
+ global OBJECT_IMAGES
+ global OCCUPATION_IMAGES
+ global TASK_IMAGES
+ global OBJECT_CAPTIONS
+ global OCCUPATION_CAPTIONS
+ global TASK_CAPTIONS
+ global TASK_COCOIDs
+ global OBJECT_IMAGES_IN_UI
+ global OCCUPATION_IMAGES_IN_UI
+ global TASK_IMAGES_IN_UI
+ global CURRENT_EVAL_TYPE
+
+ EVAL_ID = None
+ # MODEL = None
+ # MODEL_TYPE = None
+ NO_SAMPLES = None
+ RESOLUTION = None
+ INFERENCE_STEPS = None
+ GEN_OBJECTS = None
+ GEN_ACTIONS = None
+ GEN_OCCUPATIONS = None
+ TASK_TARGET = None
+ DIST_BIAS = None
+ HALLUCINATION = None
+ MISS_RATE = None
+ DATE = None
+ TIME = None
+ RUN_TIME = None
+
+ EVAL_METRICS = None
+ CURRENT_EVAL_TYPE = None
+
+ if evalType == 'general':
+ OBJECT_IMAGES = []
+ OCCUPATION_IMAGES = []
+ OBJECT_CAPTIONS = None
+ OCCUPATION_CAPTIONS = None
+ OBJECT_IMAGES_IN_UI = False
+ OCCUPATION_IMAGES_IN_UI = False
+ else:
+ TASK_IMAGES = []
+ TASK_CAPTIONS = None
+ TASK_COCOIDs = None
+ TASK_IMAGES_IN_UI = False
+