Dongfu Jiang committed on
Commit 325d8fa • 1 Parent(s): d77e70b
Files changed (1)
  1. app.py +34 -19
app.py CHANGED
@@ -21,6 +21,7 @@ from huggingface_hub import HfApi
 # from datasets import Dataset, load_dataset, concatenate_datasets
 import os, uuid
 from utils_display import model_info
+from tqdm import tqdm
 
 # get the last updated time from the elo_ranks.all.jsonl file
 LAST_UPDATED = None
@@ -43,14 +44,19 @@ dataset = datasets.load_dataset("DongfuJiang/WildFeedback", "feedbacks", split='
 import random
 random.seed(42)
 np.random.seed(42)
-def sample_an_feedback():
-    feedback = dataset[np.random.randint(0, len(dataset))]
-
-    two_model_responses = random.sample(feedback['responses'], 2)
-    model_response_1 = two_model_responses[0]
-    model_response_2 = two_model_responses[1]
+def sample_an_feedback(selected_models):
+    shuffled_dataset = dataset.shuffle(seed=42)
+    feedback = None
+    for example in tqdm(shuffled_dataset, total=len(shuffled_dataset), desc="Searching for valid examples"):
+        example_model_responses = example['responses']
+        valid_model_responses = [model_response for model_response in example_model_responses if model_response['model'] in selected_models]
+        if len(valid_model_responses) >= 2:
+            feedback = example
+            model_response_1, model_response_2 = random.sample(valid_model_responses, 2)
+            break
+    if not feedback:
+        return gr.Exit("No valid examples found. Please select other models.")
 
-
     plan_history = {
         "user": [
             feedback['query'],
@@ -79,16 +85,18 @@ def sample_an_feedback():
         "task_type": feedback['source'],
         "plan_history": plan_history,
         "ground_history": ground_history,
-        "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
-        "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
-        "correctness": "Correct",
+        # "pred": str(model_response_1['feedback']['processed']['score']) if model_response_1['feedback']['processed'] else "A",
+        # "answer": str(model_response_2['feedback']['processed']['score']) if model_response_2['feedback']['processed'] else "A",
+        "pred": str(model_response_1['model']),
+        "answer": str(model_response_2['model']),
+        "correctness": "GPT-4o-mini",
         "image": "file/data_dir/test_images/000000341196.jpg"
     }
     return result_dict
 
 def display_chat_history(model_selections):
     # eval_item = sample_an_eval_result(eval_results, model_selections)
-    eval_item = sample_an_feedback()
+    eval_item = sample_an_feedback(model_selections)
     print("---" * 10)
     for key, value in eval_item.items():
         print(f"{key}: {value}")
@@ -176,7 +184,7 @@ def build_demo(TYPES):
         # clear the selected_models
         clear_button.click(lambda: {selected_models: {"value": [], "__type__": "update"}}, inputs=[], outputs=[selected_models])
 
-        with gr.Row():
+        with gr.Row(visible=False):
            with gr.Column(scale=1.5):
                with gr.Accordion("📝 Task Description", open=True, elem_classes="accordion-label"):
                    task = gr.Markdown("", elem_classes="markdown-text-tiny")
@@ -208,30 +216,30 @@
        with gr.Row():
            with gr.Column():
                # with gr.Accordion("🙋 Prediction", open=True, elem_classes="accordion-label"):
-                with gr.Accordion("Feedback Score (A)", open=True, elem_classes="accordion-label"):
+                with gr.Accordion("Model A Name", open=True, elem_classes="accordion-label"):
                    prediction = gr.Markdown("", elem_classes="markdown-text-tiny")
                    prediction.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
            with gr.Column():
                # with gr.Accordion("🔑 Ground-Truth Answer", open=True, elem_classes="accordion-label"):
-                with gr.Accordion("Feedback Score (B)", open=True, elem_classes="accordion-label"):
+                with gr.Accordion("Model B Name", open=True, elem_classes="accordion-label"):
                    gold_answer = gr.HTML("", elem_id="markdown-text-tiny")
                    gold_answer.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
-            with gr.Column():
-                with gr.Accordion("Correctness", open=True, elem_classes="accordion-label"):
+            with gr.Column(visible=True):
+                with gr.Accordion("Feedback Model Name", open=True, elem_classes="accordion-label"):
                    correctness = gr.HTML("", elem_id="markdown-text-tiny")
                    correctness.change(lambda x: x, inputs=[], outputs=[], scroll_to_output=False, js=js_code)
 
        # Display chat history when button is clicked
        btn_show_history.click(fn=display_chat_history, inputs=[selected_models], outputs=[task, Chatbot_Common_Plan, Chatbot_Common_Ground, task_metadata, prediction, gold_answer, correctness, image])
 
-    with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3):
+    with gr.TabItem("📮 About Us", elem_id="od-benchmark-tab-table", id=3, visible=False):
        gr.Markdown(ABOUT_MD, elem_classes="markdown-text")
        gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text-small")
 
    with gr.Row():
-        with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label"):
+        with gr.Accordion("📙 Citation", open=False, elem_classes="accordion-label", visible=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
@@ -262,7 +270,14 @@ if __name__ == "__main__":
 
 
    # available_models = sorted(list(set(list(original_df["model name "]))))
-    available_models = list(model_info.keys())
+    # available_models = list(model_info.keys())
+
+    available_models = set()
+    for example in dataset:
+        for model_response in example['responses']:
+            available_models.add(model_response['model'])
+    available_models = sorted(list(available_models))
+
    # remove the rows where the model name is not in the available_models
    original_df = original_df[original_df["model name "].isin(available_models)]
    ablation_df = ablation_df[ablation_df["model name "].isin(available_models)]
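The main behavioural change in this commit is that sample_an_feedback now restricts the sampled pair to responses from the user-selected models, instead of drawing any two responses at random. Below is a minimal standalone sketch of that filtering step, using a toy in-memory list in place of the DongfuJiang/WildFeedback "feedbacks" split; the 'responses', 'model', and 'query' field names follow the diff, while the toy data and the helper name pick_feedback_pair are illustrative only.

import random

random.seed(42)

# Toy stand-in for the WildFeedback "feedbacks" split: each example carries a
# query and a list of per-model responses, mirroring the fields used above.
toy_dataset = [
    {"query": "Explain recursion.",
     "responses": [{"model": "gpt-4o-mini", "text": "..."},
                   {"model": "llama-3-8b", "text": "..."},
                   {"model": "mistral-7b", "text": "..."}]},
    {"query": "Write a haiku about autumn.",
     "responses": [{"model": "gpt-4o-mini", "text": "..."}]},
]

def pick_feedback_pair(dataset, selected_models):
    """Return (example, response_a, response_b) from the first example with at
    least two responses by the selected models, or None if no example qualifies."""
    for example in dataset:
        valid = [r for r in example["responses"] if r["model"] in selected_models]
        if len(valid) >= 2:
            response_a, response_b = random.sample(valid, 2)
            return example, response_a, response_b
    return None  # the app reports this case to the user instead of sampling

if __name__ == "__main__":
    print(pick_feedback_pair(toy_dataset, ["gpt-4o-mini", "llama-3-8b"]))

The committed version additionally shuffles the dataset with a fixed seed before scanning and wraps the scan in tqdm so the search for a valid example reports progress; the sketch keeps only the filtering and sampling logic.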