AtsuMiyai committed • Commit e8e50f1
1 Parent(s): 03f13cc

update explanations on MM-UPD Bench

- app.py +11 -15
- constants.py +29 -3
app.py
CHANGED
@@ -316,15 +316,6 @@ with block:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         # table mmupd bench
         with gr.TabItem("π MM-AAD Benchmark", elem_id="mmaad-benchmark-tab-table", id=1):
-            with gr.Row():
-                with gr.Accordion("Citation", open=False):
-                    citation_button = gr.Textbox(
-                        value=CITATION_BUTTON_TEXT,
-                        label=CITATION_BUTTON_LABEL,
-                        elem_id="citation-button",
-                        show_copy_button=True,
-                    )
-
             # selection for column part:
             checkbox_aad_group = gr.CheckboxGroup(
                 choices=TASK_AAD_INFO,
@@ -411,8 +402,6 @@ with block:
             question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
             checkbox_aad_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_aad_group], outputs=data_component_aad)
 
-
-        with gr.TabItem("π MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
             with gr.Row():
                 with gr.Accordion("Citation", open=False):
                     citation_button = gr.Textbox(
@@ -422,6 +411,7 @@ with block:
                         show_copy_button=True,
                     )
 
+        with gr.TabItem("π MM-IASD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=2):
             checkbox_iasd_group = gr.CheckboxGroup(
                 choices=TASK_IASD_INFO,
                 value=AVG_INFO,
@@ -505,8 +495,6 @@ with block:
             question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
             checkbox_iasd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_iasd_group], outputs=data_component_iasd)
 
-        # Table 3
-        with gr.TabItem("π MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
             with gr.Row():
                 with gr.Accordion("Citation", open=False):
                     citation_button = gr.Textbox(
@@ -516,6 +504,9 @@ with block:
                         show_copy_button=True,
                    )
 
+        # Table 3
+        with gr.TabItem("π MM-IVQD Benchmark", elem_id="mmiasd-benchmark-tab-table", id=3):
+            with gr.Row():
             # selection for column part:
             checkbox_ivqd_group = gr.CheckboxGroup(
                 choices=TASK_IVQD_INFO,
@@ -599,6 +590,13 @@ with block:
             question_type.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
             checkbox_ivqd_group.change(fn=on_filter_model_size_method_change, inputs=[model_size, question_type, checkbox_ivqd_group], outputs=data_component_ivqd)
 
+            with gr.Accordion("Citation", open=False):
+                citation_button = gr.Textbox(
+                    value=CITATION_BUTTON_TEXT,
+                    label=CITATION_BUTTON_LABEL,
+                    elem_id="citation-button",
+                    show_copy_button=True,
+                )
 
         # table 4
         with gr.TabItem("π About", elem_id="mmupd-benchmark-tab-table", id=4):
@@ -606,8 +604,6 @@ with block:
 
         # table 5
         with gr.TabItem("π Submit here! ", elem_id="mmupd-benchmark-tab-table", id=5):
-            gr.Markdown(LEADERBORAD_INTRODUCTION, elem_classes="markdown-text")
-
             with gr.Row():
                 gr.Markdown(SUBMIT_INTRODUCTION, elem_classes="markdown-text")
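The layout change above moves each tab's Citation accordion, but the pattern across all three benchmark tabs is constant: a `gr.CheckboxGroup` (together with the `model_size` and `question_type` controls) drives a results table through `.change` events that all call `on_filter_model_size_method_change`. A minimal, self-contained sketch of that pattern, with invented column names and scores standing in for the Space's real `TASK_AAD_INFO` and leaderboard data:

```python
# Minimal sketch of the leaderboard-tab pattern in app.py: a CheckboxGroup
# selects which metric columns to show, and its .change event re-renders the
# table. Column names and rows are illustrative, not the Space's real data.
import gradio as gr
import pandas as pd

RESULTS = pd.DataFrame({
    "Model": ["model-a", "model-b"],
    "Dual Acc.": [45.2, 38.9],
    "Standard Acc.": [72.1, 70.4],
    "UPD Acc.": [51.3, 44.0],
})
METRIC_COLUMNS = ["Dual Acc.", "Standard Acc.", "UPD Acc."]

def filter_columns(selected_columns):
    # Always keep the model name; show only the metrics the user ticked.
    return RESULTS[["Model"] + [c for c in METRIC_COLUMNS if c in selected_columns]]

with gr.Blocks() as block:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Benchmark", id=1):
            checkbox_group = gr.CheckboxGroup(
                choices=METRIC_COLUMNS, value=METRIC_COLUMNS, label="Columns"
            )
            data_component = gr.Dataframe(value=RESULTS)
            # Same wiring as checkbox_aad_group.change(...) in the diff above.
            checkbox_group.change(
                fn=filter_columns, inputs=checkbox_group, outputs=data_component
            )

if __name__ == "__main__":
    block.launch()
```

Where a component is created inside the `Blocks` context determines where it renders, so moving the citation accordion below the `.change` calls only repositions it on the page; the event bindings are unaffected.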
constants.py
CHANGED
@@ -35,11 +35,37 @@ LEADERBORAD_INTRODUCTION = """
 <a href='https://arxiv.org/abs/2403.20331'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
 </div>
 
+## About MM-UPD Bench
+### What is MM-UPD Bench?
+MM-UPD Bench is a comprehensive benchmark for evaluating the trustworthiness of Vision Language Models (VLMs) in the context of Unsolvable Problem Detection (UPD).
+Our MM-UPD Bench encompasses three benchmarks: MM-AAD, MM-IASD, and MM-IVQD.
+- **MM-AAD:** Benchmark for Absent Answer Detection (AAD). MM-AAD Bench is a dataset where the correct answer
+option for each question is removed. MM-AAD tests the model's capability
+to recognize when the correct answer is absent from the provided choices.
+
+- **MM-IASD:** Benchmark for Incompatible Answer Set Detection (IASD). MM-IASD Bench is a dataset where the answer set
+is completely incompatible with the context specified by the question and the image.
+MM-IASD tests the model's capability to recognize when the answer set is incompatible with the context.
+- **MM-IVQD:** Benchmark for Incompatible Visual Question Detection (IVQD). MM-IVQD Bench is a dataset where the question is incompatible with the image.
+MM-IVQD evaluates the VLMs' capability to discern when a question and image are irrelevant or
+inappropriate.
+
+### Characteristics of MM-UPD Bench
+We design MM-UPD Bench to provide a comprehensive evaluation of VLMs across multiple scenarios.
 - **Multiple Scenario Evaluation:** We carefully design the prompt choices and examine three scenarios: (i) base (no instruction), (ii) option (an additional option is added), (iii) instruction (an instruction is added).
-- **Ability-
-- **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliablitity for the community.
+- **Ability-Wise Evaluation:** We carefully decompose each benchmark into more than 10 abilities to reveal each model's strengths and weaknesses.
+- **Valuable Insights:** MM-UPD Bench provides multi-perspective insights on trustworthiness and reliability for the community.
 
-
+
+## About Evaluation Metrics
+We evaluate the performance of VLMs on MM-UPD Bench using the following metrics:
+- **Dual accuracy:** The accuracy on standard-UPD pairs, where we count
+success only if the model is correct on both the standard and UPD questions.
+- **Standard accuracy:** The accuracy on standard questions.
+- **UPD (AAD/IASD/IVQD) accuracy:** The accuracy on AAD/IASD/IVQD questions.
+
+
+Please follow the instructions in [UPD](https://github.com/AtsuMiyai/UPD) to upload the generated JSON file here. After clicking the `Submit Eval` button, click the `Refresh` button.
 """
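Dual accuracy is the strictest of the three metrics: each standard question is paired with its UPD counterpart, and the pair scores only when both answers are correct, so dual accuracy can never exceed standard or UPD accuracy. A small worked example with illustrative field names:

```python
# Dual accuracy over standard-UPD pairs: a pair counts as a success only
# when the model answers BOTH the standard and the UPD question correctly.
def dual_accuracy(pairs: list[dict]) -> float:
    hits = sum(1 for p in pairs if p["standard_correct"] and p["upd_correct"])
    return hits / len(pairs)

pairs = [
    {"standard_correct": True,  "upd_correct": True},   # counts
    {"standard_correct": True,  "upd_correct": False},  # standard only
    {"standard_correct": False, "upd_correct": True},   # UPD only
    {"standard_correct": True,  "upd_correct": True},   # counts
]
# Standard accuracy is 3/4 and UPD accuracy is 3/4, but dual accuracy is 2/4.
print(dual_accuracy(pairs))  # 0.5
```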