Spaces:
Sleeping
Sleeping
Dongfu Jiang
committed on
Commit
•
b34b884
1
Parent(s):
8a099d6
update
Browse files
app.py
CHANGED
@@ -44,16 +44,24 @@ available_models = [] # to be filled in later
|
|
44 |
import random
|
45 |
random.seed(42)
|
46 |
np.random.seed(42)
|
47 |
-
def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score):
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
def filter_examples(item):
|
50 |
-
if task_category
|
|
|
|
|
51 |
return False
|
52 |
-
if
|
53 |
return False
|
54 |
-
if
|
55 |
return False
|
56 |
-
if
|
57 |
return False
|
58 |
return True
|
59 |
|
@@ -89,6 +97,8 @@ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_sc
|
|
89 |
"difficulty": example['difficulty'],
|
90 |
"quality": example['quality'],
|
91 |
"intent": example['intent'],
|
|
|
|
|
92 |
"plan_history": plan_history,
|
93 |
"ground_history": ground_history,
|
94 |
# "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
|
@@ -96,6 +106,7 @@ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_sc
|
|
96 |
"pred": example['model'], # model that generates the original response
|
97 |
"answer": example['revision']['model'], # model that generates the revised response
|
98 |
"correctness": example['feedback']['model'], # model that generates the feedback for the original response
|
|
|
99 |
"image": "file/data_dir/test_images/000000341196.jpg"
|
100 |
}
|
101 |
return result_dict
|
@@ -119,8 +130,8 @@ def diff_texts(text1, text2):
|
|
119 |
last_token_category = category
|
120 |
return merged_tokens
|
121 |
|
122 |
-
def display_chat_history(task_category, task_difficulty, task_quality, feedback_score):
|
123 |
-
eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score)
|
124 |
print("---" * 10)
|
125 |
for key, value in eval_item.items():
|
126 |
print(f"{key}: {value}")
|
@@ -135,6 +146,8 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
|
|
135 |
difficulty = eval_item["difficulty"]
|
136 |
quality = eval_item["quality"]
|
137 |
intent = eval_item["intent"]
|
|
|
|
|
138 |
|
139 |
if eval_item["image"]:
|
140 |
image_path = eval_item["image"]
|
@@ -148,7 +161,7 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
|
|
148 |
chats_ground += [item_user, item_asst]
|
149 |
chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
|
150 |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
|
151 |
-
task_metadata = f"- ๐: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}"
|
152 |
|
153 |
diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
|
154 |
|
@@ -161,9 +174,9 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
|
|
161 |
print(f"Revised Response: {chats_ground}")
|
162 |
if image_path != "":
|
163 |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
|
164 |
-
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text
|
165 |
else:
|
166 |
-
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text
|
167 |
|
168 |
|
169 |
|
@@ -194,7 +207,7 @@ def slider_change_full(length_penalty, show_winrate):
|
|
194 |
|
195 |
seafoam = Seafoam()
|
196 |
def build_demo(TYPES):
|
197 |
-
global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores
|
198 |
with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
|
199 |
gr.Markdown(HEADER_MD, elem_classes="markdown-text")
|
200 |
|
@@ -208,26 +221,27 @@ def build_demo(TYPES):
|
|
208 |
with gr.Column():
|
209 |
|
210 |
with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
|
211 |
-
|
212 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
213 |
-
|
214 |
-
clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty])
|
215 |
with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
|
216 |
-
|
217 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
218 |
-
|
219 |
-
clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality])
|
220 |
with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
|
221 |
-
|
222 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
223 |
-
|
224 |
-
|
225 |
-
|
|
|
|
|
226 |
with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
|
227 |
-
|
228 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
229 |
-
|
230 |
-
|
|
|
231 |
|
232 |
with gr.Row(visible=False):
|
233 |
with gr.Column(scale=1.5):
|
@@ -246,31 +260,11 @@ def build_demo(TYPES):
|
|
246 |
task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny")
|
247 |
task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
248 |
|
249 |
-
with gr.Row():
|
250 |
-
with gr.Column(scale=1.1):
|
251 |
-
# gr.Markdown("## ๐ข Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
|
252 |
-
gr.Markdown("## ๐ข Model Original Response", elem_classes="accordion-label")
|
253 |
-
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
254 |
-
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
255 |
-
with gr.Column(scale=1):
|
256 |
-
# gr.Markdown("## ๐ข Ground Module Process History", elem_classes="accordion-label")
|
257 |
-
gr.Markdown("## ๐ข Model Revised Response", elem_classes="accordion-label")
|
258 |
-
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
259 |
-
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
260 |
-
|
261 |
-
with gr.Row():
|
262 |
-
with gr.Column():
|
263 |
-
with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
|
264 |
-
highlighted_diff = gr.HighlightedText(label="Highlighted differences",
|
265 |
-
combine_adjacent=False,
|
266 |
-
show_legend=True,
|
267 |
-
color_map={"+": "green", "-": "red"})
|
268 |
-
|
269 |
with gr.Row():
|
270 |
with gr.Column():
|
271 |
# with gr.Accordion("๐ Prediction", open=True, elem_classes="accordion-label"):
|
272 |
with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
|
273 |
-
prediction = gr.
|
274 |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
275 |
|
276 |
with gr.Column():
|
@@ -283,11 +277,50 @@ def build_demo(TYPES):
|
|
283 |
with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
|
284 |
correctness = gr.HTML("", elem_id="markdown-text-tiny")
|
285 |
correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
286 |
|
287 |
# Display chat history when button is clicked
|
288 |
btn_show_history.click(fn=display_chat_history,
|
289 |
-
inputs=[
|
290 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
291 |
|
292 |
with gr.TabItem("๐ฎ About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
|
293 |
gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
|
@@ -328,6 +361,7 @@ if __name__ == "__main__":
|
|
328 |
avaliable_quality = sorted(list(set(dataset['quality'])))
|
329 |
available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
|
330 |
available_categories = sorted(list(set(dataset['category'])))
|
|
|
331 |
|
332 |
|
333 |
TYPES = ["markdown", "number"]
|
|
|
44 |
import random
|
45 |
random.seed(42)
|
46 |
np.random.seed(42)
|
47 |
+
def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score, revision_better):
|
48 |
|
49 |
+
print(f"task_category: {task_category}")
|
50 |
+
print(f"task_difficulty: {task_difficulty}")
|
51 |
+
print(f"task_quality: {task_quality}")
|
52 |
+
print(f"feedback_score: {feedback_score}")
|
53 |
+
print(f"revision_better: {revision_better}")
|
54 |
+
|
55 |
def filter_examples(item):
|
56 |
+
if not task_category or item['category'] not in task_category:
|
57 |
+
return False
|
58 |
+
if not task_difficulty or item['difficulty'] not in task_difficulty:
|
59 |
return False
|
60 |
+
if not task_quality or item['quality'] not in task_quality:
|
61 |
return False
|
62 |
+
if not feedback_score or item['feedback']['processed']['score'] not in feedback_score:
|
63 |
return False
|
64 |
+
if not revision_better or item['pair_feedback']['revision_better'] not in revision_better:
|
65 |
return False
|
66 |
return True
|
67 |
|
|
|
97 |
"difficulty": example['difficulty'],
|
98 |
"quality": example['quality'],
|
99 |
"intent": example['intent'],
|
100 |
+
"ori_feedback": example['feedback']['processed'],
|
101 |
+
"revision_better": example['pair_feedback']['revision_better'],
|
102 |
"plan_history": plan_history,
|
103 |
"ground_history": ground_history,
|
104 |
# "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
|
|
|
106 |
"pred": example['model'], # model that generates the original response
|
107 |
"answer": example['revision']['model'], # model that generates the revised response
|
108 |
"correctness": example['feedback']['model'], # model that generates the feedback for the original response
|
109 |
+
"pair_feedback_model": example['pair_feedback']['model'], # model that generates the feedback for the revised response
|
110 |
"image": "file/data_dir/test_images/000000341196.jpg"
|
111 |
}
|
112 |
return result_dict
|
|
|
130 |
last_token_category = category
|
131 |
return merged_tokens
|
132 |
|
133 |
+
def display_chat_history(task_category, task_difficulty, task_quality, feedback_score, revision_better):
|
134 |
+
eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score, revision_better)
|
135 |
print("---" * 10)
|
136 |
for key, value in eval_item.items():
|
137 |
print(f"{key}: {value}")
|
|
|
146 |
difficulty = eval_item["difficulty"]
|
147 |
quality = eval_item["quality"]
|
148 |
intent = eval_item["intent"]
|
149 |
+
feedback = eval_item["ori_feedback"]
|
150 |
+
pair_feedback_model = eval_item["pair_feedback_model"]
|
151 |
|
152 |
if eval_item["image"]:
|
153 |
image_path = eval_item["image"]
|
|
|
161 |
chats_ground += [item_user, item_asst]
|
162 |
chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
|
163 |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
|
164 |
+
task_metadata = f"- ๐: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent} \n- **Revision Better**: {eval_item['revision_better']}"
|
165 |
|
166 |
diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
|
167 |
|
|
|
174 |
print(f"Revised Response: {chats_ground}")
|
175 |
if image_path != "":
|
176 |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
|
177 |
+
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, pair_feedback_model, image, diff_text, feedback['intent'], feedback['checklist'], feedback['strengths'], feedback['weaknesses'], feedback['score']
|
178 |
else:
|
179 |
+
return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, pair_feedback_model, f'<div style="text-align: center;"> </div>', diff_text, feedback['intent'], feedback['checklist'], feedback['strengths'], feedback['weaknesses'], feedback['score']
|
180 |
|
181 |
|
182 |
|
|
|
207 |
|
208 |
seafoam = Seafoam()
|
209 |
def build_demo(TYPES):
|
210 |
+
global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores, available_revision_better
|
211 |
with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
|
212 |
gr.Markdown(HEADER_MD, elem_classes="markdown-text")
|
213 |
|
|
|
221 |
with gr.Column():
|
222 |
|
223 |
with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
|
224 |
+
selected_task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
|
225 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
226 |
+
clear_button.click(lambda: {selected_task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_difficulty])
|
|
|
227 |
with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
|
228 |
+
selected_task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
|
229 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
230 |
+
clear_button.click(lambda: {selected_task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_quality])
|
|
|
231 |
with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
|
232 |
+
selected_feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
|
233 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
234 |
+
clear_button.click(lambda: {selected_feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_feedback_score])
|
235 |
+
with gr.Accordion("Choose revision better", open=False, elem_classes="accordion-label"):
|
236 |
+
selected_revision_better = gr.CheckboxGroup(available_revision_better, info="", value=available_revision_better, show_label=False, elem_id="select-revision-better")
|
237 |
+
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
238 |
+
clear_button.click(lambda: {selected_revision_better: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_revision_better])
|
239 |
with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
|
240 |
+
selected_task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
|
241 |
clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
|
242 |
+
clear_button.click(lambda: {selected_task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_category])
|
243 |
+
|
244 |
+
|
245 |
|
246 |
with gr.Row(visible=False):
|
247 |
with gr.Column(scale=1.5):
|
|
|
260 |
task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny")
|
261 |
task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
263 |
with gr.Row():
|
264 |
with gr.Column():
|
265 |
# with gr.Accordion("๐ Prediction", open=True, elem_classes="accordion-label"):
|
266 |
with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
|
267 |
+
prediction = gr.HTML("", elem_classes="markdown-text-tiny")
|
268 |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
269 |
|
270 |
with gr.Column():
|
|
|
277 |
with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
|
278 |
correctness = gr.HTML("", elem_id="markdown-text-tiny")
|
279 |
correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
280 |
+
|
281 |
+
with gr.Column(visible=True):
|
282 |
+
with gr.Accordion("Feedback Model (2nd stage)", open=True, elem_classes="accordion-label"):
|
283 |
+
pair_feedback_model = gr.HTML("", elem_id="markdown-text-tiny")
|
284 |
+
pair_feedback_model.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
285 |
+
|
286 |
+
|
287 |
+
|
288 |
+
with gr.Row():
|
289 |
+
with gr.Column(scale=1.1):
|
290 |
+
# gr.Markdown("## ๐ข Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
|
291 |
+
gr.Markdown("## ๐ข Policy Model Response (Original)", elem_classes="accordion-label")
|
292 |
+
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=3000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
293 |
+
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
294 |
+
with gr.Column(scale=1):
|
295 |
+
# gr.Markdown("## ๐ข Ground Module Process History", elem_classes="accordion-label")
|
296 |
+
gr.Markdown("## ๐ข Revision Model Response (Revised)", elem_classes="accordion-label")
|
297 |
+
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=3000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
298 |
+
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
299 |
+
|
300 |
+
with gr.Row():
|
301 |
+
with gr.Column():
|
302 |
+
with gr.Accordion("๐ Feedback of the original response", open=True, elem_classes="accordion-label"):
|
303 |
+
intent = gr.Textbox("", lines=1, max_lines=30, label="Intent", elem_classes="markdown-text-tiny")
|
304 |
+
checklist = gr.Textbox("", lines=1, max_lines=30, label="Checklist", elem_classes="markdown-text-tiny")
|
305 |
+
strengths = gr.Textbox("", lines=1, max_lines=30, label="Strengths", elem_classes="markdown-text-tiny")
|
306 |
+
weaknesses = gr.Textbox("", lines=1, max_lines=30, label="Weaknesses", elem_classes="markdown-text-tiny")
|
307 |
+
feedback_score = gr.Textbox("", lines=1, max_lines=1, label="Feedback Score", elem_classes="markdown-text-tiny")
|
308 |
+
with gr.Column():
|
309 |
+
with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
|
310 |
+
highlighted_diff = gr.HighlightedText(label="Original (-) vs Revised (+)",
|
311 |
+
combine_adjacent=False,
|
312 |
+
show_legend=True,
|
313 |
+
color_map={"-": "red", "+": "green"})
|
314 |
|
315 |
# Display chat history when button is clicked
|
316 |
btn_show_history.click(fn=display_chat_history,
|
317 |
+
inputs=[selected_task_category, selected_task_difficulty, selected_task_quality, selected_feedback_score, selected_revision_better],
|
318 |
+
outputs=[
|
319 |
+
task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata,
|
320 |
+
prediction, gold_answer, correctness, pair_feedback_model,
|
321 |
+
image, highlighted_diff,
|
322 |
+
intent, checklist, strengths, weaknesses, feedback_score
|
323 |
+
])
|
324 |
|
325 |
with gr.TabItem("๐ฎ About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
|
326 |
gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
|
|
|
361 |
avaliable_quality = sorted(list(set(dataset['quality'])))
|
362 |
available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
|
363 |
available_categories = sorted(list(set(dataset['category'])))
|
364 |
+
available_revision_better = sorted(list(set([item['pair_feedback']['revision_better'] for item in dataset])))
|
365 |
|
366 |
|
367 |
TYPES = ["markdown", "number"]
|