Dongfu Jiang committed on
Commit
b34b884
•
1 Parent(s): 8a099d6
Files changed (1) hide show
  1. app.py +81 -47
app.py CHANGED
@@ -44,16 +44,24 @@ available_models = [] # to be filled in later
44
  import random
45
  random.seed(42)
46
  np.random.seed(42)
47
- def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score):
48
 
 
 
 
 
 
 
49
  def filter_examples(item):
50
- if task_category and item['category'] not in task_category:
 
 
51
  return False
52
- if task_difficulty and item['difficulty'] not in task_difficulty:
53
  return False
54
- if task_quality and item['quality'] not in task_quality:
55
  return False
56
- if feedback_score and item['feedback']['processed']['score'] not in feedback_score:
57
  return False
58
  return True
59
 
@@ -89,6 +97,8 @@ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_sc
89
  "difficulty": example['difficulty'],
90
  "quality": example['quality'],
91
  "intent": example['intent'],
 
 
92
  "plan_history": plan_history,
93
  "ground_history": ground_history,
94
  # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
@@ -96,6 +106,7 @@ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_sc
96
  "pred": example['model'], # model that generates the original response
97
  "answer": example['revision']['model'], # model that generates the revised response
98
  "correctness": example['feedback']['model'], # model that generates the feedback for the original response
 
99
  "image": "file/data_dir/test_images/000000341196.jpg"
100
  }
101
  return result_dict
@@ -119,8 +130,8 @@ def diff_texts(text1, text2):
119
  last_token_category = category
120
  return merged_tokens
121
 
122
- def display_chat_history(task_category, task_difficulty, task_quality, feedback_score):
123
- eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score)
124
  print("---" * 10)
125
  for key, value in eval_item.items():
126
  print(f"{key}: {value}")
@@ -135,6 +146,8 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
135
  difficulty = eval_item["difficulty"]
136
  quality = eval_item["quality"]
137
  intent = eval_item["intent"]
 
 
138
 
139
  if eval_item["image"]:
140
  image_path = eval_item["image"]
@@ -148,7 +161,7 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
148
  chats_ground += [item_user, item_asst]
149
  chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
150
  chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
151
- task_metadata = f"- 🆔: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent}"
152
 
153
  diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
154
 
@@ -161,9 +174,9 @@ def display_chat_history(task_category, task_difficulty, task_quality, feedback_
161
  print(f"Revised Response: {chats_ground}")
162
  if image_path != "":
163
  image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
164
- return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image, diff_text
165
  else:
166
- return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, f'<div style="text-align: center;"> </div>', diff_text
167
 
168
 
169
 
@@ -194,7 +207,7 @@ def slider_change_full(length_penalty, show_winrate):
194
 
195
  seafoam = Seafoam()
196
  def build_demo(TYPES):
197
- global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores
198
  with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
199
  gr.Markdown(HEADER_MD, elem_classes="markdown-text")
200
 
@@ -208,26 +221,27 @@ def build_demo(TYPES):
208
  with gr.Column():
209
 
210
  with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
211
- task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
212
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
213
- # clear the selected_models
214
- clear_button.click(lambda: {task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_difficulty])
215
  with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
216
- task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
217
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
218
- # clear the selected_models
219
- clear_button.click(lambda: {task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_quality])
220
  with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
221
- feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
222
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
223
- # clear the selected_models
224
- clear_button.click(lambda: {feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[feedback_score])
225
-
 
 
226
  with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
227
- task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
228
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
229
- # clear the selected_models
230
- clear_button.click(lambda: {task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[task_category])
 
231
 
232
  with gr.Row(visible=False):
233
  with gr.Column(scale=1.5):
@@ -246,31 +260,11 @@ def build_demo(TYPES):
246
  task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny")
247
  task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
248
 
249
- with gr.Row():
250
- with gr.Column(scale=1.1):
251
- # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
252
- gr.Markdown("## 📢 Model Original Response", elem_classes="accordion-label")
253
- Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
254
- Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
255
- with gr.Column(scale=1):
256
- # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
257
- gr.Markdown("## 📢 Model Revised Response", elem_classes="accordion-label")
258
- Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
259
- Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
260
-
261
- with gr.Row():
262
- with gr.Column():
263
- with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
264
- highlighted_diff = gr.HighlightedText(label="Highlighted differences",
265
- combine_adjacent=False,
266
- show_legend=True,
267
- color_map={"+": "green", "-": "red"})
268
-
269
  with gr.Row():
270
  with gr.Column():
271
  # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
272
  with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
273
- prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
274
  prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
275
 
276
  with gr.Column():
@@ -283,11 +277,50 @@ def build_demo(TYPES):
283
  with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
284
  correctness = gr.HTML("", elem_id="markdown-text-tiny")
285
  correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
 
287
  # Display chat history when button is clicked
288
  btn_show_history.click(fn=display_chat_history,
289
- inputs=[task_category, task_difficulty, task_quality, feedback_score],
290
- outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image, highlighted_diff])
 
 
 
 
 
291
 
292
  with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
293
  gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
@@ -328,6 +361,7 @@ if __name__ == "__main__":
328
  avaliable_quality = sorted(list(set(dataset['quality'])))
329
  available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
330
  available_categories = sorted(list(set(dataset['category'])))
 
331
 
332
 
333
  TYPES = ["markdown", "number"]
 
44
  import random
45
  random.seed(42)
46
  np.random.seed(42)
47
+ def sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score, revision_better):
48
 
49
+ print(f"task_category: {task_category}")
50
+ print(f"task_difficulty: {task_difficulty}")
51
+ print(f"task_quality: {task_quality}")
52
+ print(f"feedback_score: {feedback_score}")
53
+ print(f"revision_better: {revision_better}")
54
+
55
  def filter_examples(item):
56
+ if not task_category or item['category'] not in task_category:
57
+ return False
58
+ if not task_difficulty or item['difficulty'] not in task_difficulty:
59
  return False
60
+ if not task_quality or item['quality'] not in task_quality:
61
  return False
62
+ if not feedback_score or item['feedback']['processed']['score'] not in feedback_score:
63
  return False
64
+ if not revision_better or item['pair_feedback']['revision_better'] not in revision_better:
65
  return False
66
  return True
67
 
 
97
  "difficulty": example['difficulty'],
98
  "quality": example['quality'],
99
  "intent": example['intent'],
100
+ "ori_feedback": example['feedback']['processed'],
101
+ "revision_better": example['pair_feedback']['revision_better'],
102
  "plan_history": plan_history,
103
  "ground_history": ground_history,
104
  # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
 
106
  "pred": example['model'], # model that generates the original response
107
  "answer": example['revision']['model'], # model that generates the revised response
108
  "correctness": example['feedback']['model'], # model that generates the feedback for the original response
109
+ "pair_feedback_model": example['pair_feedback']['model'], # model that generates the feedback for the revised response
110
  "image": "file/data_dir/test_images/000000341196.jpg"
111
  }
112
  return result_dict
 
130
  last_token_category = category
131
  return merged_tokens
132
 
133
+ def display_chat_history(task_category, task_difficulty, task_quality, feedback_score, revision_better):
134
+ eval_item = sample_an_feedback(task_category, task_difficulty, task_quality, feedback_score, revision_better)
135
  print("---" * 10)
136
  for key, value in eval_item.items():
137
  print(f"{key}: {value}")
 
146
  difficulty = eval_item["difficulty"]
147
  quality = eval_item["quality"]
148
  intent = eval_item["intent"]
149
+ feedback = eval_item["ori_feedback"]
150
+ pair_feedback_model = eval_item["pair_feedback_model"]
151
 
152
  if eval_item["image"]:
153
  image_path = eval_item["image"]
 
161
  chats_ground += [item_user, item_asst]
162
  chats_plan = [(chats_plan[i], chats_plan[i+1]) for i in range(0, len(chats_plan), 2)]
163
  chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
164
+ task_metadata = f"- 🆔: `{session_id}` \n- **Category**: {category} \n- **Difficulty**: {difficulty} \n- **Quality**: {quality} \n- **Intent**: {intent} \n- **Revision Better**: {eval_item['revision_better']}"
165
 
166
  diff_text = diff_texts(chats_plan[-1][1], chats_ground[-1][1])
167
 
 
174
  print(f"Revised Response: {chats_ground}")
175
  if image_path != "":
176
  image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
177
+ return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, pair_feedback_model, image, diff_text, feedback['intent'], feedback['checklist'], feedback['strengths'], feedback['weaknesses'], feedback['score']
178
  else:
179
+ return category, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, pair_feedback_model, f'<div style="text-align: center;"> </div>', diff_text, feedback['intent'], feedback['checklist'], feedback['strengths'], feedback['weaknesses'], feedback['score']
180
 
181
 
182
 
 
207
 
208
  seafoam = Seafoam()
209
  def build_demo(TYPES):
210
+ global available_categories, avaliable_difficulty, avaliable_quality, available_feedback_scores, available_revision_better
211
  with gr.Blocks(theme=gr.themes.Soft(), css=css, js=js_light) as demo:
212
  gr.Markdown(HEADER_MD, elem_classes="markdown-text")
213
 
 
221
  with gr.Column():
222
 
223
  with gr.Accordion("Choose task difficulty", open=False, elem_classes="accordion-label"):
224
+ selected_task_difficulty = gr.CheckboxGroup(avaliable_difficulty, info="", value=avaliable_difficulty, show_label=False, elem_id="select-difficulty")
225
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
226
+ clear_button.click(lambda: {selected_task_difficulty: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_difficulty])
 
227
  with gr.Accordion("Choose task quality", open=False, elem_classes="accordion-label"):
228
+ selected_task_quality = gr.CheckboxGroup(avaliable_quality, info="", value=avaliable_quality, show_label=False, elem_id="select-quality")
229
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
230
+ clear_button.click(lambda: {selected_task_quality: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_quality])
 
231
  with gr.Accordion("Choose feedback score", open=False, elem_classes="accordion-label"):
232
+ selected_feedback_score = gr.CheckboxGroup(available_feedback_scores, info="", value=available_feedback_scores, show_label=False, elem_id="select-feedback")
233
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
234
+ clear_button.click(lambda: {selected_feedback_score: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_feedback_score])
235
+ with gr.Accordion("Choose revision better", open=False, elem_classes="accordion-label"):
236
+ selected_revision_better = gr.CheckboxGroup(available_revision_better, info="", value=available_revision_better, show_label=False, elem_id="select-revision-better")
237
+ clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
238
+ clear_button.click(lambda: {selected_revision_better: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_revision_better])
239
  with gr.Accordion("Choose task category", open=False, elem_classes="accordion-label"):
240
+ selected_task_category = gr.CheckboxGroup(available_categories, info="", value=available_categories, show_label=False, elem_id="select-category")
241
  clear_button = gr.Button("Clear", elem_classes="btn_boderline_gray", scale=1)
242
+ clear_button.click(lambda: {selected_task_category: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_task_category])
243
+
244
+
245
 
246
  with gr.Row(visible=False):
247
  with gr.Column(scale=1.5):
 
260
  task_metadata = gr.Markdown("", elem_classes="markdown-text-tiny")
261
  task_metadata.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
262
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  with gr.Row():
264
  with gr.Column():
265
  # with gr.Accordion("๐Ÿ™‹ Prediction", open=True, elem_classes="accordion-label"):
266
  with gr.Accordion("Policy Model", open=True, elem_classes="accordion-label"):
267
+ prediction = gr.HTML("", elem_classes="markdown-text-tiny")
268
  prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
269
 
270
  with gr.Column():
 
277
  with gr.Accordion("Feedback Model", open=True, elem_classes="accordion-label"):
278
  correctness = gr.HTML("", elem_id="markdown-text-tiny")
279
  correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
280
+
281
+ with gr.Column(visible=True):
282
+ with gr.Accordion("Feedback Model (2nd stage)", open=True, elem_classes="accordion-label"):
283
+ pair_feedback_model = gr.HTML("", elem_id="markdown-text-tiny")
284
+ pair_feedback_model.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
285
+
286
+
287
+
288
+ with gr.Row():
289
+ with gr.Column(scale=1.1):
290
+ # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
291
+ gr.Markdown("## 📢 Policy Model Response (Original)", elem_classes="accordion-label")
292
+ Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=3000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
293
+ Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
294
+ with gr.Column(scale=1):
295
+ # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
296
+ gr.Markdown("## 📢 Revision Model Response (Revised)", elem_classes="accordion-label")
297
+ Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=3000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
298
+ Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
299
+
300
+ with gr.Row():
301
+ with gr.Column():
302
+ with gr.Accordion("📊 Feedback of the original response", open=True, elem_classes="accordion-label"):
303
+ intent = gr.Textbox("", lines=1, max_lines=30, label="Intent", elem_classes="markdown-text-tiny")
304
+ checklist = gr.Textbox("", lines=1, max_lines=30, label="Checklist", elem_classes="markdown-text-tiny")
305
+ strengths = gr.Textbox("", lines=1, max_lines=30, label="Strengths", elem_classes="markdown-text-tiny")
306
+ weaknesses = gr.Textbox("", lines=1, max_lines=30, label="Weaknesses", elem_classes="markdown-text-tiny")
307
+ feedback_score = gr.Textbox("", lines=1, max_lines=1, label="Feedback Score", elem_classes="markdown-text-tiny")
308
+ with gr.Column():
309
+ with gr.Accordion("Highlighted differences", open=True, elem_classes="accordion-label"):
310
+ highlighted_diff = gr.HighlightedText(label="Original (-) vs Revised (+)",
311
+ combine_adjacent=False,
312
+ show_legend=True,
313
+ color_map={"-": "red", "+": "green"})
314
 
315
  # Display chat history when button is clicked
316
  btn_show_history.click(fn=display_chat_history,
317
+ inputs=[selected_task_category, selected_task_difficulty, selected_task_quality, selected_feedback_score, selected_revision_better],
318
+ outputs=[
319
+ task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata,
320
+ prediction, gold_answer, correctness, pair_feedback_model,
321
+ image, highlighted_diff,
322
+ intent, checklist, strengths, weaknesses, feedback_score
323
+ ])
324
 
325
  with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
326
  gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
 
361
  avaliable_quality = sorted(list(set(dataset['quality'])))
362
  available_feedback_scores = sorted(list(set([item['feedback']['processed']['score'] for item in dataset])))
363
  available_categories = sorted(list(set(dataset['category'])))
364
+ available_revision_better = sorted(list(set([item['pair_feedback']['revision_better'] for item in dataset])))
365
 
366
 
367
  TYPES = ["markdown", "number"]