Dongfu Jiang committed
Commit 1d6e701
1 Parent(s): fa390d6
Files changed (1)
  1. app.py +71 -5
app.py CHANGED
@@ -16,6 +16,7 @@ from datetime import datetime, timezone
 from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, add_winrates_tasks
 # from gradio.themes.utils import colors, fonts, sizes
 from themes import Seafoam
+import datasets
 from huggingface_hub import HfApi
 # from datasets import Dataset, load_dataset, concatenate_datasets
 import os, uuid
@@ -37,9 +38,63 @@ eval_results = load_eval_results()
 
 available_models = [] # to be filled in later
 
+dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='train')
+
+import random
+random.seed(42)
+np.random.seed(42)
+def sample_an_feedback():
+    feedback = dataset[np.random.randint(0, len(dataset))]
+
+    two_model_responses = random.sample(feedback['responses'], 2)
+    model_response_1 = two_model_responses[0]
+    model_response_2 = two_model_responses[1]
+
+
+    plan_history = {
+        "user": [
+            feedback['query'],
+            "Please give the feedback (query GPT-4o-mini)"
+        ],
+        "assistant": [
+            model_response_1['response'],
+            model_response_2['feedback']['raw']
+        ]
+    }
+
+    ground_history = {
+        "user": [
+            feedback['query'],
+            "Please give the feedback (query GPT-4o-mini)"
+        ],
+        "assistant": [
+            model_response_2['response'],
+            model_response_2['feedback']['raw']
+        ]
+    }
+
+    result_dict = {
+        "session_id": feedback['id'],
+        "task": feedback['source'],
+        "task_type": feedback['source'],
+        "plan_history": plan_history,
+        "ground_history": ground_history,
+        "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
+        "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
+        "correctness": "Correct",
+        "image": "file/data_dir/test_images/000000341196.jpg"
+    }
+    return result_dict
 
 def display_chat_history(model_selections):
-    eval_item = sample_an_eval_result(eval_results, model_selections)
+    # eval_item = sample_an_eval_result(eval_results, model_selections)
+    eval_item = sample_an_feedback()
+    print("---" * 10)
+    for key, value in eval_item.items():
+        print(f"{key}: {value}")
+    print("---" * 10)
+
+    # eval_item = sample_an_feedback()
     session_id = eval_item["session_id"]
     task = eval_item["task"]
     task_type = eval_item["task_type"]
@@ -61,6 +116,13 @@ def display_chat_history(model_selections):
     chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
     task_metadata = f"- 🆔: `{session_id}` \n- **Task category**: {task_type}"
 
+    print(f"Task: {task}")
+    print(f"Plan History: {chats_plan}")
+    print(f"Ground History: {chats_ground}")
+    print(f"Task Metadata: {task_metadata}")
+    print(f"Prediction: {prediction}")
+    print(f"Gold Answer: {gold_answer}")
+    print(f"Correctness: {correctness}")
     if image_path != "":
         image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
     return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image
@@ -133,22 +195,26 @@ def build_demo(TYPES):
 
         with gr.Row():
             with gr.Column(scale=1.1):
-                gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
+                # gr.Markdown("## 📢 Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
+                gr.Markdown("## 📢 Model A's response and feedback", elem_classes="accordion-label")
                 Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                 Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
             with gr.Column(scale=1):
-                gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
+                # gr.Markdown("## 📢 Ground Module Process History", elem_classes="accordion-label")
+                gr.Markdown("## 📢 Model B's response and feedback", elem_classes="accordion-label")
                 Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
                 Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
         with gr.Row():
            with gr.Column():
-                with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
+                # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
+                with gr.Accordion("Feedback Score (A)", open=True, elem_classes="accordion-label"):
                     prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
                     prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
            with gr.Column():
-                with gr.Accordion("🔑 Ground-Truth Answer", open=True, elem_classes="accordion-label"):
+                # with gr.Accordion("🔑 Ground-Truth Answer", open=True, elem_classes="accordion-label"):
+                with gr.Accordion("Feedback Score (B)", open=True, elem_classes="accordion-label"):
                     gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
                     gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
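
For reviewers who want to exercise the new sampling path outside the Space, here is a minimal standalone sketch. It is not part of the commit; it assumes the `DongfuJiang/WildFeedback` dataset with the `feedbacks` config is reachable and that each row carries the fields the diff relies on (`query`, `responses`, and per-response `feedback` with `raw`/`processed` entries). The seeding mirrors what the commit adds.

```python
# Standalone sanity check (sketch only, not part of app.py).
# Assumes the dataset name/config and field names used in the commit.
import random

import numpy as np
import datasets

random.seed(42)
np.random.seed(42)

# Same dataset/config/split as the new module-level load in app.py.
dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split="train")

# Pick one row and two of its model responses, as sample_an_feedback() does.
row = dataset[int(np.random.randint(0, len(dataset)))]
response_a, response_b = random.sample(row["responses"], 2)

print("query:", row["query"][:200])
for name, resp in (("A", response_a), ("B", response_b)):
    processed = resp["feedback"]["processed"]
    score = processed["score"] if processed else "A"  # same fallback the commit uses
    print(f"model {name} feedback score: {score}")
```

Note that the commit seeds `random` and `np.random` once at import time, so successive calls to `sample_an_feedback()` within one process still draw different rows; restarting the Space replays the same sequence.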