Dongfu Jiang committed
Commit • 325d8fa
1 Parent(s): d77e70b
update

app.py CHANGED
@@ -21,6 +21,7 @@ from huggingface_hub import HfApi
 # from datasets import Dataset, load_dataset, concatenate_datasets
 import os, uuid
 from utils_display import model_info
+from tqdm import tqdm
 
 # get the last updated time from the elo_ranks.all.jsonl file
 LAST_UPDATED = None
@@ -43,14 +44,19 @@ dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='
 import random
 random.seed(42)
 np.random.seed(42)
-def sample_an_feedback():
-
-
-
-
-
+def sample_an_feedback(selected_models):
+    shuffled_dataset = dataset.shuffle(seed=42)
+    feedback = None
+    for example in tqdm(shuffled_dataset, total=len(shuffled_dataset), desc="Searching for valid examples"):
+        example_model_responses = example['responses']
+        valid_model_responses = [model_response for model_response in example_model_responses if model_response['model'] in selected_models]
+        if len(valid_model_responses) >= 2:
+            feedback = example
+            model_response_1, model_response_2 = random.sample(valid_model_responses, 2)
+            break
+    if not feedback:
+        return gr.Exit("No valid examples found. Please select other models.")
 
-
     plan_history = {
         "user": [
             feedback['query'],
@@ -79,16 +85,18 @@ def sample_an_feedback():
         "task_type": feedback['source'],
         "plan_history": plan_history,
         "ground_history": ground_history,
-        "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
-        "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
-        "
+        # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
+        # "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
+        "pred": str(model_response_1['model']),
+        "answer": str(model_response_2['model']),
+        "correctness": "GPT-4o-mini",
         "image": "file/data_dir/test_images/000000341196.jpg"
     }
     return result_dict
 
 def display_chat_history(model_selections):
     # eval_item = sample_an_eval_result(eval_results, model_selections)
-    eval_item = sample_an_feedback()
+    eval_item = sample_an_feedback(model_selections)
     print("---" * 10)
     for key, value in eval_item.items():
         print(f"{key}: {value}")
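For reference, a minimal standalone sketch of the selection step introduced in the hunks above, written outside the app's globals. The dataset name and the schema (each example carries a 'responses' list whose items have a 'model' key) come from the diff; the function name, split, and example model names are placeholders, and the no-match branch raises gr.Error here instead of returning gr.Exit(...) as the commit does.

# Standalone sketch of sample_an_feedback's selection step (illustrative only).
import random

import datasets
import gradio as gr


def sample_response_pair(dataset, selected_models, seed=42):
    """Return (example, response_a, response_b) for the first shuffled example
    that has at least two responses from the selected models."""
    rng = random.Random(seed)
    for example in dataset.shuffle(seed=seed):
        valid = [r for r in example["responses"] if r["model"] in selected_models]
        if len(valid) >= 2:
            response_a, response_b = rng.sample(valid, 2)
            return example, response_a, response_b
    # The commit returns gr.Exit(...) here; raising gr.Error is an assumed alternative.
    raise gr.Error("No valid examples found. Please select other models.")


if __name__ == "__main__":
    # Split and model names are placeholders, not taken from the commit.
    ds = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split="train")
    example, a, b = sample_response_pair(ds, selected_models={"model-a", "model-b"})
    print(a["model"], "vs", b["model"], "for query:", example["query"])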
@@ -176,7 +184,7 @@ def build_demo(TYPES):
     # clear the selected_models
     clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_models])
 
-    with gr.Row():
+    with gr.Row(visible=False):
         with gr.Column(scale=1.5):
             with gr.Accordion("Task Description", open=True, elem_classes="accordion-label"):
                 task = gr.Markdown("", elem_classes="markdown-text-tiny")
@@ -208,30 +216,30 @@ def build_demo(TYPES):
     with gr.Row():
         with gr.Column():
             # with gr.Accordion("Prediction", open=True, elem_classes="accordion-label"):
-            with gr.Accordion("
+            with gr.Accordion("Model A Name", open=True, elem_classes="accordion-label"):
                 prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
                 prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
         with gr.Column():
             # with gr.Accordion("Ground-Truth Answer", open=True, elem_classes="accordion-label"):
-            with gr.Accordion("
+            with gr.Accordion("Model B Name", open=True, elem_classes="accordion-label"):
                 gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
                 gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
-        with gr.Column():
-            with gr.Accordion("
+        with gr.Column(visible=True):
+            with gr.Accordion("Feedback Model Name", open=True, elem_classes="accordion-label"):
                 correctness = gr.HTML("", elem_id="markdown-text-tiny")
                 correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
     # Display chat history when button is clicked
     btn_show_history.click(fn=display_chat_history, inputs=[selected_models], outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image])
 
-    with gr.TabItem("🔮 About Us", elem_id="od-benchmark-tab-table", id=3):
+    with gr.TabItem("🔮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
         gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
         gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
 
     with gr.Row():
-        with gr.Accordion("Citation", open=False, elem_classes="accordion-label"):
+        with gr.Accordion("Citation", open=False, elem_classes="accordion-label", visible=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
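The layout edits above follow one pattern: each display component sits in an Accordion inside a Column, and whole containers are hidden with visible=False. A minimal self-contained Gradio sketch of that pattern, with placeholder titles and component names rather than the app's actual ones:

# Minimal sketch of the Column/Accordion layout pattern used in build_demo (illustrative).
import gradio as gr

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            with gr.Accordion("Model A Name", open=True):
                prediction = gr.Markdown("")
        with gr.Column():
            with gr.Accordion("Model B Name", open=True):
                gold_answer = gr.HTML("")
        with gr.Column():
            with gr.Accordion("Feedback Model Name", open=True):
                correctness = gr.HTML("")
    # Containers created with visible=False stay hidden until an event
    # handler returns an update such as gr.update(visible=True).
    with gr.Row(visible=False):
        hidden_note = gr.Markdown("Hidden by default.")

if __name__ == "__main__":
    demo.launch()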
@@ -262,7 +270,14 @@ if __name__ == "__main__":
 
 
     # available_models = sorted(list(set(list(original_df["model name "]))))
-    available_models = list(model_info.keys())
+    # available_models = list(model_info.keys())
+
+    available_models = set()
+    for example in dataset:
+        for model_response in example['responses']:
+            available_models.add(model_response['model'])
+    available_models = sorted(list(available_models))
+
     # remove the rows where the model name is not in the available_models
     original_df = original_df[original_df["model name "].isin(available_models)]
     ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
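The final hunk derives the selectable model list from the dataset itself rather than from model_info. A one-pass equivalent sketch, assuming the same 'responses'/'model' schema and a pandas dataframe with the "model name " column (trailing space included) used in app.py:

# One-pass equivalent of the available_models loop in the final hunk (illustrative).
import pandas as pd


def collect_available_models(dataset) -> list:
    # Every example carries a 'responses' list whose items have a 'model' key.
    return sorted({response["model"] for example in dataset for response in example["responses"]})


def keep_known_models(df: pd.DataFrame, available_models: list) -> pd.DataFrame:
    # The column name keeps the trailing space used in app.py ("model name ").
    return df[df["model name "].isin(available_models)]

The set comprehension yields the same sorted list as the explicit nested loops, and the dataframe filter mirrors the isin calls applied to original_df and ablation_df.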