Dongfu Jiang committed on
Commit
b34b884
•
1 Parent(s): 8a099d6
Files changed (1) hide show
  1. app.py +81 -47
app.py CHANGED
@@ -44,16 +44,24 @@ available_models = [] # to be filled in later
44
  import random
45
  random.seed(42)
46
  np.random.seed(42)
47
- def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score):
48
 
 
 
 
 
 
 
49
  def filter_examples(item):
50
- if task_category and item['category'] not in task_category:
 
 
51
  return False
52
- if task_difficulty and item['difficulty'] not in task_difficulty:
53
  return False
54
- if task_quality and item['quality'] not in task_quality:
55
  return False
56
- if feedback_score and item['feedback']['processed']['score'] not in feedback_score:
57
  return False
58
  return True
59
 
@@ -89,6 +97,8 @@ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_sc
89
  "difficulty": example['difficulty'],
90
  "quality": example['quality'],
91
  "intent": example['intent'],
 
 
92
  "plan_history": plan_history,
93
  "ground_history": ground_history,
94
  # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
@@ -96,6 +106,7 @@ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_sc
96
  "pred": example['model'], # model that generates the original response
97
  "answer": example['revision']['model'], # model that generates the revised response
98
  "correctness": example['feedback']['model'], # model that generates the feedback for the original response
 
99
  "image": "file/data_dir/test_images/000000341196.jpg"
100
  }
101
  return result_dict
@@ -119,8 +130,8 @@ def diff_texts(text1, text2):
119
  last_token_category = category
120
  return merged_tokens
121
 
122
- def display_chat_history(task_category, task_difficulty, task_quality, feedback_score):
123
- eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score)
124
  print("---" * 10)
125
  for key, value in eval_item.items():
126
  print(f"{key}: {value}")
@@ -135,6 +146,8 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
135
  difficulty = eval_item["difficulty"]
136
  quality = eval_item["quality"]
137
  intent = eval_item["intent"]
 
 
138
 
139
  if eval_item["image"]:
140
  image_path = eval_item["image"]
@@ -148,7 +161,7 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
148
  chats_ground += [item_user, item_asst]
149
  chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
150
  chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
151
- task_metadata = f"- 🆔: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}"
152
 
153
  diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
154
 
@@ -161,9 +174,9 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
161
  print(f"Revised Response: {chats_ground}")
162
  if image_path != "":
163
  image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
164
- return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text
165
  else:
166
- return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text
167
 
168
 
169
 
@@ -194,7 +207,7 @@ def slider_change_full(length_penalty, show_winrate):
194
 
195
  seafoam = Seafoam()
196
  def build_demo(TYPES):
197
- global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores
198
  with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
199
  gr.Markdown(HEADER_MD, elem_classes="markdown-text")
200
 
@@ -208,26 +221,27 @@ def build_demo(TYPES):
208
  with gr.Column():
209
 
210
  with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
211
- task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
212
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
213
- # clear the selected_models
214
- clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty])
215
  with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
216
- task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
217
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
218
- # clear the selected_models
219
- clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality])
220
  with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
221
- feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
222
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
223
- # clear the selected_models
224
- clear_button.click(lambda: {feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[feedback_score])
225
-
 
 
226
  with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
227
- task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
228
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
229
- # clear the selected_models
230
- clear_button.click(lambda: {task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_category])
 
231
 
232
  with gr.Row(visible=False):
233
  with gr.Column(scale=1.5):
@@ -246,31 +260,11 @@ def build_demo(TYPES):
246
  task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny")
247
  task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
248
 
249
- with gr.Row():
250
- with gr.Column(scale=1.1):
251
- # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
252
- gr.Markdown("## 📢 Model Original Response", elem_classes="accordion-label")
253
- Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
254
- Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
255
- with gr.Column(scale=1):
256
- # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
257
- gr.Markdown("## 📢 Model Revised Response", elem_classes="accordion-label")
258
- Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
259
- Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
260
-
261
- with gr.Row():
262
- with gr.Column():
263
- with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
264
- highlighted_diff = gr.HighlightedText(label="Highlighted differences",
265
- combine_adjacent=False,
266
- show_legend=True,
267
- color_map={"+": "green", "-": "red"})
268
-
269
  with gr.Row():
270
  with gr.Column():
271
  # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
272
  with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
273
- prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
274
  prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
275
 
276
  with gr.Column():
@@ -283,11 +277,50 @@ def build_demo(TYPES):
283
  with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
284
  correctness = gr.HTML("", elem_id="markdown-text-tiny")
285
  correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  # Display chat history when button is clicked
288
  btn_show_history.click(fn=display_chat_history,
289
- inputs=[task_category, task_difficulty, task_quality, feedback_score],
290
- outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image, highlighted_diff])
 
 
 
 
 
291
 
292
  with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
293
  gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
@@ -328,6 +361,7 @@ if __name__ == "__main__":
328
  avaliable_quality = sorted(list(set(dataset['quality'])))
329
  available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
330
  available_categories = sorted(list(set(dataset['category'])))
 
331
 
332
 
333
  TYPES = ["markdown", "number"]
 
44
  import random
45
  random.seed(42)
46
  np.random.seed(42)
47
+ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score, revision_better):
48
 
49
+ print(f"task_category: {task_category}")
50
+ print(f"task_difficulty: {task_difficulty}")
51
+ print(f"task_quality: {task_quality}")
52
+ print(f"feedback_score: {feedback_score}")
53
+ print(f"revision_better: {revision_better}")
54
+
55
  def filter_examples(item):
56
+ if not task_category or item['category'] not in task_category:
57
+ return False
58
+ if not task_difficulty or item['difficulty'] not in task_difficulty:
59
  return False
60
+ if not task_quality or item['quality'] not in task_quality:
61
  return False
62
+ if not feedback_score or item['feedback']['processed']['score'] not in feedback_score:
63
  return False
64
+ if not revision_better or item['pair_feedback']['revision_better'] not in revision_better:
65
  return False
66
  return True
67
 
 
97
  "difficulty": example['difficulty'],
98
  "quality": example['quality'],
99
  "intent": example['intent'],
100
+ "ori_feedback": example['feedback']['processed'],
101
+ "revision_better": example['pair_feedback']['revision_better'],
102
  "plan_history": plan_history,
103
  "ground_history": ground_history,
104
  # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
 
106
  "pred": example['model'], # model that generates the original response
107
  "answer": example['revision']['model'], # model that generates the revised response
108
  "correctness": example['feedback']['model'], # model that generates the feedback for the original response
109
+ "pair_feedback_model": example['pair_feedback']['model'], # model that generates the feedback for the revised response
110
  "image": "file/data_dir/test_images/000000341196.jpg"
111
  }
112
  return result_dict
 
130
  last_token_category = category
131
  return merged_tokens
132
 
133
+ def display_chat_history(task_category, task_difficulty, task_quality, feedback_score, revision_better):
134
+ eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score, revision_better)
135
  print("---" * 10)
136
  for key, value in eval_item.items():
137
  print(f"{key}: {value}")
 
146
  difficulty = eval_item["difficulty"]
147
  quality = eval_item["quality"]
148
  intent = eval_item["intent"]
149
+ feedback = eval_item["ori_feedback"]
150
+ pair_feedback_model = eval_item["pair_feedback_model"]
151
 
152
  if eval_item["image"]:
153
  image_path = eval_item["image"]
 
161
  chats_ground += [item_user, item_asst]
162
  chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
163
  chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
164
+ task_metadata = f"- 🆔: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent} \n- **Revision Better**: {eval_item['revision_better']}"
165
 
166
  diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
167
 
 
174
  print(f"Revised Response: {chats_ground}")
175
  if image_path != "":
176
  image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
177
+ return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, pair_feedback_model, image, diff_text, feedback['intent'], feedback['checklist'], feedback['strengths'], feedback['weaknesses'], feedback['score']
178
  else:
179
+ return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, pair_feedback_model, f'<div style="text-align: center;"> </div>', diff_text, feedback['intent'], feedback['checklist'], feedback['strengths'], feedback['weaknesses'], feedback['score']
180
 
181
 
182
 
 
207
 
208
  seafoam = Seafoam()
209
  def build_demo(TYPES):
210
+ global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores, available_revision_better
211
  with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
212
  gr.Markdown(HEADER_MD, elem_classes="markdown-text")
213
 
 
221
  with gr.Column():
222
 
223
  with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
224
+ selected_task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
225
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
226
+ clear_button.click(lambda: {selected_task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_difficulty])
 
227
  with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
228
+ selected_task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
229
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
230
+ clear_button.click(lambda: {selected_task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_quality])
 
231
  with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
232
+ selected_feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
233
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
234
+ clear_button.click(lambda: {selected_feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_feedback_score])
235
+ with gr.Accordion("Choose revision better", open=False, elem_classes="accordion-label"):
236
+ selected_revision_better = gr.CheckboxGroup(available_revision_better, info="", value=available_revision_better, show_label=False, elem_id="select-revision-better")
237
+ clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
238
+ clear_button.click(lambda: {selected_revision_better: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_revision_better])
239
  with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
240
+ selected_task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
241
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
242
+ clear_button.click(lambda: {selected_task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_category])
243
+
244
+
245
 
246
  with gr.Row(visible=False):
247
  with gr.Column(scale=1.5):
 
260
  task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny")
261
  task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  with gr.Row():
264
  with gr.Column():
265
  # with gr.Accordion("๐Ÿ™‹ Prediction", open=True, elem_classes="accordion-label"):
266
  with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
267
+ prediction = gr.HTML("", elem_classes="markdown-text-tiny")
268
  prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
269
 
270
  with gr.Column():
 
277
  with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
278
  correctness = gr.HTML("", elem_id="markdown-text-tiny")
279
  correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
280
+
281
+ with gr.Column(visible=True):
282
+ with gr.Accordion("Feedback Model (2nd stage)", open=True, elem_classes="accordion-label"):
283
+ pair_feedback_model = gr.HTML("", elem_id="markdown-text-tiny")
284
+ pair_feedback_model.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
285
+
286
+
287
+
288
+ with gr.Row():
289
+ with gr.Column(scale=1.1):
290
+ # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
291
+ gr.Markdown("## 📢 Policy Model Response (Original)", elem_classes="accordion-label")
292
+ Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=3000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
293
+ Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
294
+ with gr.Column(scale=1):
295
+ # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
296
+ gr.Markdown("## 📢 Revision Model Response (Revised)", elem_classes="accordion-label")
297
+ Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=3000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
298
+ Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
299
+
300
+ with gr.Row():
301
+ with gr.Column():
302
+ with gr.Accordion("📊 Feedback of the original response", open=True, elem_classes="accordion-label"):
303
+ intent = gr.Textbox("", lines=1, max_lines=30, label="Intent", elem_classes="markdown-text-tiny")
304
+ checklist = gr.Textbox("", lines=1, max_lines=30, label="Checklist", elem_classes="markdown-text-tiny")
305
+ strengths = gr.Textbox("", lines=1, max_lines=30, label="Strengths", elem_classes="markdown-text-tiny")
306
+ weaknesses = gr.Textbox("", lines=1, max_lines=30, label="Weaknesses", elem_classes="markdown-text-tiny")
307
+ feedback_score = gr.Textbox("", lines=1, max_lines=1, label="Feedback Score", elem_classes="markdown-text-tiny")
308
+ with gr.Column():
309
+ with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
310
+ highlighted_diff = gr.HighlightedText(label="Original (-) vs Revised (+)",
311
+ combine_adjacent=False,
312
+ show_legend=True,
313
+ color_map={"-": "red", "+": "green"})
314
 
315
  # Display chat history when button is clicked
316
  btn_show_history.click(fn=display_chat_history,
317
+ inputs=[selected_task_category, selected_task_difficulty, selected_task_quality, selected_feedback_score, selected_revision_better],
318
+ outputs=[
319
+ task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata,
320
+ prediction, gold_answer, correctness, pair_feedback_model,
321
+ image, highlighted_diff,
322
+ intent, checklist, strengths, weaknesses, feedback_score
323
+ ])
324
 
325
  with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
326
  gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
 
361
  avaliable_quality = sorted(list(set(dataset['quality'])))
362
  available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
363
  available_categories = sorted(list(set(dataset['category'])))
364
+ available_revision_better = sorted(list(set([item['pair_feedback']['revision_better'] for item in dataset])))
365
 
366
 
367
  TYPES = ["markdown", "number"]