AtsuMiyai committed
Commit e8e50f1
Parent(s): 03f13cc

update explanations on MM-UPD Bench

Files changed (2)
  1. app.py +11 -15
  2. constants.py +29 -3
app.py CHANGED
```diff
@@ -316,15 +316,6 @@ with block:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # table mmupd bench
         with gr.TabItem("🏅 MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
-            with gr.Row():
-                with gr.Accordion("Citation", open=False):
-                    citation_button = gr.Textbox(
-                        value=CITATION_BUTTON_TEXT,
-                        label=CITATION_BUTTON_LABEL,
-                        elem_id="citation-button",
-                        show_copy_button=True,
-                    )
-
             # selection for column part:
             checkbox_aad_group = gr.CheckboxGroup(
                 choices=TASK_AAD_INFO,
@@ -411,8 +402,6 @@ with block:
             question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
             checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
 
-
-        with gr.TabItem("🏅 MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Accordion("Citation", open=False):
                     citation_button = gr.Textbox(
@@ -422,6 +411,7 @@ with block:
                         show_copy_button=True,
                     )
 
+        with gr.TabItem("🏅 MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
             checkbox_iasd_group = gr.CheckboxGroup(
                 choices=TASK_IASD_INFO,
                 value=AVG_INFO,
@@ -505,8 +495,6 @@ with block:
             question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
             checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
 
-        # Table 3
-        with gr.TabItem("🏅 MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
             with gr.Row():
                 with gr.Accordion("Citation", open=False):
                     citation_button = gr.Textbox(
@@ -516,6 +504,9 @@ with block:
                         show_copy_button=True,
                     )
 
+        # Table 3
+        with gr.TabItem("🏅 MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
+            with gr.Row():
             # selection for column part:
             checkbox_ivqd_group = gr.CheckboxGroup(
                 choices=TASK_IVQD_INFO,
@@ -599,6 +590,13 @@ with block:
             question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
             checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
 
+            with gr.Accordion("Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
 
         # table 4
         with gr.TabItem("📝 About", elem_id="mmupd-benchmark-tab-table", id=4):
@@ -606,8 +604,6 @@ with block:
 
         # table 5
         with gr.TabItem("🚀 Submit here! ", elem_id="mmupd-benchmark-tab-table", id=5):
-            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
-
             with gr.Row():
                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
 
```
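In effect, the app.py hunks move each benchmark tab's Citation accordion from the top of the tab to the bottom, after the filter controls, and drop the duplicated leaderboard introduction from the Submit tab. Below is a minimal, self-contained sketch of the per-tab layout this converges on; it is a reduction for illustration, not the full app, and the `CITATION_BUTTON_*` values and checkbox choices are placeholders standing in for the real constants.

```python
# Minimal sketch of the post-commit tab layout (assumption: reduced from
# app.py; placeholder constants, no data tables or event wiring).
import gradio as gr

CITATION_BUTTON_LABEL = "Copy the BibTeX below to cite these results"  # placeholder
CITATION_BUTTON_TEXT = "@article{...}"  # placeholder

with gr.Blocks() as block:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
            # Filter controls come first ...
            checkbox_aad_group = gr.CheckboxGroup(
                choices=["Overall", "Attribute", "Object"],  # placeholder abilities
                value=["Overall"],
                label="Evaluation dimension",
            )
            # ... and the Citation accordion is now the last element in the tab.
            with gr.Row():
                with gr.Accordion("Citation", open=False):
                    citation_button = gr.Textbox(
                        value=CITATION_BUTTON_TEXT,
                        label=CITATION_BUTTON_LABEL,
                        elem_id="citation-button",
                        show_copy_button=True,
                    )

if __name__ == "__main__":
    block.launch()
```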
constants.py CHANGED
```diff
@@ -35,11 +35,37 @@ LEADERBORAD_INTRODUCTION = """
     <a href='https://arxiv.org/abs/2403.20331'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
 </div>
 
+## About MM-UPD Bench
+### What is MM-UPD Bench?
+MM-UPD Bench is a comprehensive benchmark for evaluating the trustworthiness of Vision Language Models (VLMs) in the context of Unsolvable Problem Detection (UPD).
+Our MM-UPD Bench encompasses three benchmarks: MM-AAD, MM-IASD, and MM-IVQD.
+- **MM-AAD:** Benchmark for Absent Answer Detection (AAD). MM-AAD Bench is a dataset where the correct answer
+option for each question is removed. MM-AAD tests the model's capability
+to recognize when the correct answer is absent from the provided choices.
+
+- **MM-IASD:** Benchmark for Incompatible Answer Set Detection (IASD). MM-IASD Bench is a dataset where the answer set
+is completely incompatible with the context specified by the question and the image.
+MM-IASD tests the model's capability to recognize when the answer set is incompatible with the context.
+- **MM-IVQD:** Benchmark for Incompatible Visual Question Detection (IVQD). MM-IVQD Bench is a dataset where the question is incompatible with the image.
+MM-IVQD evaluates the VLMs' capability to discern when a question and image are irrelevant or
+inappropriate.
+
+### Characteristics of MM-UPD Bench
+We design MM-UPD Bench to provide a comprehensive evaluation of VLMs across multiple scenarios.
 - **Multiple Scenario Evaluation:** We carefully design prompts and choices and examine three scenarios: (i) base (no instruction), (ii) option (add an additional option), (iii) instruction (add an instruction).
-- **Ability-wise Evaluation:** We carefully decompose each benchmark into more than 10 abilities to reveal each model's strengths and weaknesses.
-- **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliability for the community.
+- **Ability-Wise Evaluation:** We carefully decompose each benchmark into more than 10 abilities to reveal each model's strengths and weaknesses.
+- **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliability for the community.
 
-Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) to upload the generated `result_dual.json` file here. After clicking the `Submit Eval` button, click the `Refresh` button.
+
+## About Evaluation Metrics
+We evaluate the performance of VLMs on MM-UPD Bench using the following metrics:
+- **Dual accuracy:** The accuracy on standard-UPD pairs, where we count
+success only if the model is correct on both the standard and UPD questions.
+- **Standard accuracy:** The accuracy on standard questions.
+- **UPD (AAD/IASD/IVQD) accuracy:** The accuracy on AAD/IASD/IVQD questions.
+
+
+Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) to upload the generated JSON file here. After clicking the `Submit Eval` button, click the `Refresh` button.
 """
 
 
```
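The three evaluation scenarios named in the new constants.py text (base, option, instruction) differ only in how the multiple-choice prompt is assembled. The sketch below illustrates that assembly; the "None of the above" option text and the instruction wording are assumptions made for this example, not copied from the UPD codebase, so consult the linked repository for the actual templates.

```python
# Hypothetical prompt assembly for the three scenarios described above.
# The escape-option text and instruction wording are assumptions.
def build_prompt(question: str, options: list[str], scenario: str) -> str:
    letters = "ABCDEFGH"
    opts = list(options)
    if scenario == "option":
        # (ii) option: add an additional escape option to the choice list
        opts.append("None of the above")
    lines = [question] + [f"{letters[i]}. {opt}" for i, opt in enumerate(opts)]
    if scenario == "instruction":
        # (iii) instruction: append an instruction permitting withheld answers
        lines.append("If none of the options are correct, answer 'None of the above'.")
    return "\n".join(lines)  # (i) base: question and options only

print(build_prompt("What is shown in the image?", ["cat", "dog"], "option"))
```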
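The metric definitions added to constants.py are straightforward to operationalize. The sketch below illustrates only the arithmetic, over a hypothetical list of per-pair results; it is not the official UPD evaluator, and the field names are invented for the example.

```python
# Illustration of the three accuracies described above (hypothetical
# per-pair records; not the official UPD evaluation code).
def compute_accuracies(pairs):
    """pairs: list of dicts with booleans 'standard_correct' and 'upd_correct'."""
    n = len(pairs)
    standard_acc = sum(p["standard_correct"] for p in pairs) / n
    upd_acc = sum(p["upd_correct"] for p in pairs) / n
    # Dual accuracy counts a pair as a success only if the model answers
    # BOTH the standard question and its UPD counterpart correctly.
    dual_acc = sum(p["standard_correct"] and p["upd_correct"] for p in pairs) / n
    return {"standard_acc": standard_acc, "upd_acc": upd_acc, "dual_acc": dual_acc}

print(compute_accuracies([
    {"standard_correct": True,  "upd_correct": True},   # counts toward dual
    {"standard_correct": True,  "upd_correct": False},  # standard only
    {"standard_correct": False, "upd_correct": True},   # UPD only
    {"standard_correct": True,  "upd_correct": True},
]))
# -> {'standard_acc': 0.75, 'upd_acc': 0.75, 'dual_acc': 0.5}
```

Dual accuracy is the strictest of the three, which is why it is the headline number: a model cannot score well by always answering (high standard accuracy) or by always abstaining (high UPD accuracy) alone.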