Spaces:
Sleeping
Sleeping
Dongfu Jiang
committed on
Commit
•
1d6e701
1
Parent(s):
fa390d6
update
Browse files
app.py
CHANGED
@@ -16,6 +16,7 @@ from datetime import datetime, timezone
|
|
16 |
from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, add_winrates_tasks
|
17 |
# from gradio.themes.utils import colors, fonts, sizes
|
18 |
from themes import Seafoam
|
|
|
19 |
from huggingface_hub import HfApi
|
20 |
# from datasets import Dataset, load_dataset, concatenate_datasets
|
21 |
import os, uuid
|
@@ -37,9 +38,63 @@ eval_results = load_eval_results()
|
|
37 |
|
38 |
available_models = [] # to be filled in later
|
39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
def display_chat_history(model_selections):
|
42 |
-
eval_item = sample_an_eval_result(eval_results, model_selections)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
session_id = eval_item["session_id"]
|
44 |
task = eval_item["task"]
|
45 |
task_type = eval_item["task_type"]
|
@@ -61,6 +116,13 @@ def display_chat_history(model_selections):
|
|
61 |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
|
62 |
task_metadata = f"- π: `{session_id}` \n- **Task category**: {task_type}"
|
63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
if image_path != "":
|
65 |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
|
66 |
return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image
|
@@ -133,22 +195,26 @@ def build_demo(TYPES):
|
|
133 |
|
134 |
with gr.Row():
|
135 |
with gr.Column(scale=1.1):
|
136 |
-
gr.Markdown("## π’ Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
|
|
|
137 |
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
138 |
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
139 |
with gr.Column(scale=1):
|
140 |
-
gr.Markdown("## π’ Ground Module Process History", elem_classes="accordion-label")
|
|
|
141 |
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
142 |
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
143 |
|
144 |
with gr.Row():
|
145 |
with gr.Column():
|
146 |
-
with gr.Accordion("π Prediction", open=True, elem_classes="accordion-label"):
|
|
|
147 |
prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
|
148 |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
149 |
|
150 |
with gr.Column():
|
151 |
-
with gr.Accordion("π Ground-Truth Answer", open=True, elem_classes="accordion-label"):
|
|
|
152 |
gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
|
153 |
gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
154 |
|
|
|
16 |
from data_utils import load_eval_results, sample_an_eval_result, apply_length_penalty, post_processing, add_winrates, add_winrates_tasks
|
17 |
# from gradio.themes.utils import colors, fonts, sizes
|
18 |
from themes import Seafoam
|
19 |
+
import datasets
|
20 |
from huggingface_hub import HfApi
|
21 |
# from datasets import Dataset, load_dataset, concatenate_datasets
|
22 |
import os, uuid
|
|
|
38 |
|
39 |
available_models = [] # to be filled in later
|
40 |
|
41 |
+
dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='train')
|
42 |
+
|
43 |
+
import random
|
44 |
+
random.seed(42)
|
45 |
+
np.random.seed(42)
|
46 |
+
def _feedback_history(query, model_response):
    """Build the two-turn chat history for one model response.

    Turn 1 is the user query answered by the model's response; turn 2 is the
    fixed feedback prompt answered by the raw feedback recorded for that
    same response.
    """
    return {
        "user": [
            query,
            "Please give the feedback (query GPT-4o-mini)",
        ],
        "assistant": [
            model_response['response'],
            model_response['feedback']['raw'],
        ],
    }


def _feedback_score(model_response):
    """Return the processed feedback score as a string, or "A" if absent."""
    processed = model_response['feedback']['processed']
    # A falsy `processed` entry (None / empty) falls back to the literal "A".
    return str(processed['score']) if processed else "A"


def sample_an_feedback(feedback_dataset=None):
    """Sample one feedback record and two of its model responses.

    A record is picked uniformly at random (via ``np.random``) and two
    distinct entries of its ``'responses'`` list are drawn (via ``random``).
    The first drawn response fills ``plan_history`` / ``pred`` and the
    second fills ``ground_history`` / ``answer``; each history carries the
    feedback that belongs to its own response.

    Args:
        feedback_dataset: Optional indexable, sized collection of records to
            sample from. Defaults to the module-level ``dataset``. Each
            record is read through the keys ``'id'``, ``'source'``,
            ``'query'`` and ``'responses'``; each response through
            ``'response'`` and ``'feedback'`` (``'raw'``, ``'processed'``).

    Returns:
        dict with keys ``session_id``, ``task``, ``task_type``,
        ``plan_history``, ``ground_history``, ``pred``, ``answer``,
        ``correctness`` and ``image``. ``correctness`` and ``image`` are
        fixed values, not derived from the record.

    Raises:
        ValueError: If the collection is empty or the chosen record has
            fewer than two responses (raised by the underlying samplers).
    """
    source = dataset if feedback_dataset is None else feedback_dataset

    # Keep the original RNG call order (numpy first, then `random`) so the
    # seeded sequence of samples is unchanged.
    feedback = source[np.random.randint(0, len(source))]
    model_response_1, model_response_2 = random.sample(feedback['responses'], 2)

    return {
        "session_id": feedback['id'],
        "task": feedback['source'],
        "task_type": feedback['source'],
        # Each side shows its own response with its own feedback; previously
        # the first history displayed the second response's feedback.
        "plan_history": _feedback_history(feedback['query'], model_response_1),
        "ground_history": _feedback_history(feedback['query'], model_response_2),
        "pred": _feedback_score(model_response_1),
        "answer": _feedback_score(model_response_2),
        "correctness": "Correct",
        "image": "file/data_dir/test_images/000000341196.jpg",
    }
|
88 |
|
89 |
def display_chat_history(model_selections):
|
90 |
+
# eval_item = sample_an_eval_result(eval_results, model_selections)
|
91 |
+
eval_item = sample_an_feedback()
|
92 |
+
print("---" * 10)
|
93 |
+
for key, value in eval_item.items():
|
94 |
+
print(f"{key}: {value}")
|
95 |
+
print("---" * 10)
|
96 |
+
|
97 |
+
# eval_item = sample_an_feedback()
|
98 |
session_id = eval_item["session_id"]
|
99 |
task = eval_item["task"]
|
100 |
task_type = eval_item["task_type"]
|
|
|
116 |
chats_ground = [(chats_ground[i], chats_ground[i+1]) for i in range(0, len(chats_ground), 2)]
|
117 |
task_metadata = f"- π: `{session_id}` \n- **Task category**: {task_type}"
|
118 |
|
119 |
+
print(f"Task: {task}")
|
120 |
+
print(f"Plan History: {chats_plan}")
|
121 |
+
print(f"Ground History: {chats_ground}")
|
122 |
+
print(f"Task Metadata: {task_metadata}")
|
123 |
+
print(f"Prediction: {prediction}")
|
124 |
+
print(f"Gold Answer: {gold_answer}")
|
125 |
+
print(f"Correctness: {correctness}")
|
126 |
if image_path != "":
|
127 |
image = f'<div style="text-align: center;"> <img src="{image_path}" style="height: 250px;"> </div>'
|
128 |
return task, chats_plan, chats_ground, task_metadata, prediction, gold_answer, correctness, image
|
|
|
195 |
|
196 |
with gr.Row():
|
197 |
with gr.Column(scale=1.1):
|
198 |
+
# gr.Markdown("## π’ Plan Module Process History w/ <span style='background-color: #FDFDBA;'>Execution Module Results</span>", elem_classes="accordion-label")
|
199 |
+
gr.Markdown("## π’ Model A's response and feedback", elem_classes="accordion-label")
|
200 |
Chatbot_Common_Plan = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Plan History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
201 |
Chatbot_Common_Plan.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
202 |
with gr.Column(scale=1):
|
203 |
+
# gr.Markdown("## π’ Ground Module Process History", elem_classes="accordion-label")
|
204 |
+
gr.Markdown("## π’ Model B's response and feedback", elem_classes="accordion-label")
|
205 |
Chatbot_Common_Ground = gr.Chatbot(avatar_images=["human_icon.jpeg", "ai_icon.png"], height=1000, container=False, label="Common Ground History", likeable=False, show_share_button=False, show_label=True, elem_classes="chat-common", layout="bubble")
|
206 |
Chatbot_Common_Ground.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
207 |
|
208 |
with gr.Row():
|
209 |
with gr.Column():
|
210 |
+
# with gr.Accordion("π Prediction", open=True, elem_classes="accordion-label"):
|
211 |
+
with gr.Accordion("Feedback Score (A)", open=True, elem_classes="accordion-label"):
|
212 |
prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
|
213 |
prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
214 |
|
215 |
with gr.Column():
|
216 |
+
# with gr.Accordion("π Ground-Truth Answer", open=True, elem_classes="accordion-label"):
|
217 |
+
with gr.Accordion("Feedback Score (B)", open=True, elem_classes="accordion-label"):
|
218 |
gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
|
219 |
gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
|
220 |
|