Update EgoPlan-Bench Leaderboard.
Files changed:

- .gitignore +2 -0
- README.md +3 -3
- app.py +105 -0
- constants.py +31 -0
- file/result_egoplan_bench.csv +29 -0
- requirements.txt +70 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.idea
+__pycache__
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: EgoPlan-Bench Leaderboard
-emoji:
-colorFrom:
+emoji: ⚡
+colorFrom: yellow
 colorTo: gray
 sdk: gradio
-sdk_version:
+sdk_version: 3.27.0
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,105 @@
+"""
+Adapted from the SEED-Bench Leaderboard by AILab-CVC
+Source: https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard
+"""
+
+import gradio as gr
+import pandas as pd
+
+from constants import *
+
+global data_component, filter_component
+
+
+def upload_file(files):
+    # Currently unused; kept for a future result-submission workflow.
+    file_paths = [file.name for file in files]
+    return file_paths
+
+
+def get_baseline_df():
+    # Load the results CSV, keep only the model info plus the default split,
+    # and rank models by that split in descending order.
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by=DEFAULT_SPLIT, ascending=False)
+    present_columns = MODEL_INFO + [DEFAULT_SPLIT]
+    df = df[present_columns]
+    return df
+
+
+def get_all_df():
+    # Load the full results CSV (all splits), sorted by the default split.
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by=DEFAULT_SPLIT, ascending=False)
+    return df
+
+
+block = gr.Blocks()
+
+with block:
+    gr.Markdown(LEADERBORAD_INTRODUCTION)
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 EgoPlan Benchmark", elem_id="evalcrafter-benchmark-tab-table", id=0):
+            gr.Markdown(TABLE_INTRODUCTION)
+
+            dropdown_value = gr.inputs.Dropdown(
+                choices=SPLIT_INFO,
+                default=DEFAULT_SPLIT,
+            )
+
+            data_component = gr.components.Dataframe(
+                value=get_baseline_df,
+                headers=COLUMN_NAMES,
+                type="pandas",
+                datatype=DATA_TITILE_TYPE,
+                interactive=False,
+                visible=True,
+            )
+
+            def on_dropdown_value_change(selected_split):
+                # Rebuild the table for the selected split, dropping models
+                # that have no score on that split.
+                present_columns = MODEL_INFO + [selected_split]
+                updated_data = get_all_df()[present_columns].dropna()
+                updated_data = updated_data.sort_values(by=present_columns[-1], ascending=False)
+                updated_headers = present_columns
+                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                filter_component = gr.components.Dataframe(
+                    value=updated_data,
+                    headers=updated_headers,
+                    type="pandas",
+                    datatype=update_datatype,
+                    interactive=False,
+                    visible=True,
+                )
+                # Only the underlying value is returned; it refreshes data_component.
+                return filter_component.value
+
+            dropdown_value.change(fn=on_dropdown_value_change, inputs=dropdown_value, outputs=data_component)
+
+        with gr.TabItem("📝 About", elem_id="egoplan-benchmark-tab-table", id=2):
+            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+
+    with gr.Row():
+        data_run = gr.Button("Refresh")
+        data_run.click(get_baseline_df, outputs=data_component)
+
+    gr.Markdown(r"""
+    Please cite this paper if you find it useful ♥️:
+
+    ```bibtex
+    @article{chen2023egoplan,
+      title={EgoPlan-Bench: Benchmarking Multimodal Large Language Models for Human-Level Planning},
+      author={Chen, Yi and Ge, Yuying and Ge, Yixiao and Ding, Mingyu and Li, Bohao and Wang, Rui and Xu, Ruifeng and Shan, Ying and Liu, Xihui},
+      journal={arXiv preprint arXiv:2312.06722},
+      year={2023}
+    }
+    ```
+    """)
+
+block.launch(share=False)
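One portability note, not part of the commit: `gr.inputs.Dropdown(..., default=...)` is the Gradio 3.x input namespace, consistent with the `gradio==3.27.0` pin in requirements.txt. If the Space were ever migrated to Gradio 4.x, where `gr.inputs` no longer exists, the equivalent component would look roughly like this (a sketch assuming a 4.x runtime, hypothetical with respect to this commit):

```python
import gradio as gr

# Hypothetical Gradio 4.x equivalent of the dropdown above: `gr.inputs` was
# removed in Gradio 4, and the `default=` keyword became `value=`.
dropdown_value = gr.Dropdown(
    choices=["Validation Split", "Test Split"],  # SPLIT_INFO in constants.py
    value="Validation Split",                    # DEFAULT_SPLIT
)
```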
constants.py
ADDED
@@ -0,0 +1,31 @@
+SPLIT_INFO = ['Validation Split', 'Test Split']
+MODEL_INFO = ['Model', 'Large Language Model']
+DEFAULT_SPLIT = 'Validation Split'
+DATA_TITILE_TYPE = ["markdown", "markdown", "number", "number"]
+CSV_DIR = "file/result_egoplan_bench.csv"
+
+COLUMN_NAMES = MODEL_INFO + SPLIT_INFO
+
+TABLE_INTRODUCTION = "In the table below, we summarize the model performance on the validation and test splits."
+
+LEADERBORAD_INTRODUCTION = """# EgoPlan-Bench Leaderboard 🏆
+
+Welcome to the EgoPlan-Bench Leaderboard! This leaderboard ranks Multimodal Large Language Models (MLLMs) by their performance on EgoPlan-Bench, which evaluates their planning abilities in real-world, egocentric scenarios. EgoPlan-Bench features realistic tasks, diverse action plans, and intricate visual observations, providing a challenging assessment platform for MLLMs. Explore the leaderboard to track the progress of MLLMs towards human-level planning! 🛫
+
+Join our evaluation by sending an email 📧 ([email protected])! You may also read the [Code](https://github.com/ChenYi99/EgoPlan), [Paper](https://arxiv.org/pdf/2312.06722), and [Project page](https://chenyi99.github.io/ego_plan/) for more detailed information 🤗
+"""
+
+LEADERBORAD_INFO = """
+The pursuit of artificial general intelligence (AGI) has been accelerated by Multimodal Large Language Models (MLLMs), which exhibit superior reasoning, generalization capabilities, and proficiency in processing multimodal inputs. A crucial milestone in the evolution of AGI is the attainment of human-level planning, a fundamental ability for making informed decisions in complex environments and solving a wide range of real-world problems. Despite the impressive advancements in MLLMs, a question remains: **How far are current MLLMs from achieving human-level planning?**
+
+To shed light on this question, we introduce EgoPlan-Bench, a comprehensive benchmark for evaluating the planning abilities of MLLMs in real-world scenarios from an egocentric perspective, mirroring human perception. EgoPlan-Bench emphasizes the evaluation of planning capabilities, featuring realistic tasks, diverse action plans, and intricate visual observations. Our rigorous evaluation of a wide range of MLLMs reveals that EgoPlan-Bench poses significant challenges, highlighting substantial room for improvement before MLLMs achieve human-level task planning. To facilitate this advancement, we further present EgoPlan-IT, a specialized instruction-tuning dataset that effectively enhances model performance on EgoPlan-Bench. We have made all code, data, and a maintained benchmark leaderboard available to advance future research.
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@article{chen2023egoplan,
+  title={EgoPlan-Bench: Benchmarking Multimodal Large Language Models for Human-Level Planning},
+  author={Chen, Yi and Ge, Yuying and Ge, Yixiao and Ding, Mingyu and Li, Bohao and Wang, Rui and Xu, Ruifeng and Shan, Ying and Liu, Xihui},
+  journal={arXiv preprint arXiv:2312.06722},
+  year={2023}
+}"""
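Because app.py looks up each column's display type by position, via `DATA_TITILE_TYPE[COLUMN_NAMES.index(x)]`, the two lists must stay aligned one-to-one. A small sanity check one could run against this module (a sketch, not part of the commit):

```python
# Sketch: guard against COLUMN_NAMES and DATA_TITILE_TYPE drifting apart,
# since on_dropdown_value_change indexes one list by position in the other.
from constants import COLUMN_NAMES, DATA_TITILE_TYPE

assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE), (
    f"expected one datatype per column, got {len(COLUMN_NAMES)} columns "
    f"and {len(DATA_TITILE_TYPE)} datatypes"
)
```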
file/result_egoplan_bench.csv
ADDED
@@ -0,0 +1,29 @@
+Model,Large Language Model,Validation Split,Test Split
+BLIP-2,Flan-T5-XL,26.71,27.90
+InstructBLIP,Flan-T5-XL,28.09,25.19
+InstructBLIP Vicuna,Vicuna-7B,26.53,26.64
+LLaVA,LLaMA-7B,27.00,28.16
+MiniGPT-4,Vicuna-7B,28.11,30.93
+VPGTrans,LLaMA-7B,27.38,24.12
+MultiModal-GPT,Vicuna-7B,27.81,30.43
+Otter,LLaMA-7B,28.08,30.87
+OpenFlamingo,LLaMA-7B,27.67,30.18
+LLaMA-Adapter V2,LLaMA-7B,27.81,30.43
+GVT,Vicuna-7B,27.87,29.67
+mPLUG-Owl,LLaMA-7B,27.63,31.31
+mPLUG-Owl-2,LLaMA2-7B,27.84,30.37
+Kosmos-2,Decoder only 1.3B,26.97,""
+Qwen-VL-Chat,Qwen-7B,27.69,31.06
+LLaVA-1.5,Vicuna-7B,27.81,29.80
+VideoChat,Vicuna-7B,27.51,28.72
+Video-ChatGPT,LLaMA-7B,27.33,29.17
+Valley,LLaMA-13B,27.27,30.11
+Video-LLaMA,LLaMA2-Chat-7B,28.58,30.30
+SEED-LLaMA,LLaMA2-Chat-13B,29.93,""
+SEED-X,LLaMA2-Chat-13B,31.07,29.92
+DeepSeek-VL-Chat,DeepSeek-LLM-7B,27.57,26.01
+CogVLM,Vicuna-7B,27.48,31.06
+Yi-VL,Yi-6B,28.67,30.56
+Xcomposer,InternLM-7B,37.17,36.36
+Gemini-Pro-Vision,\-,30.46,32.39
+GPT-4V,\-,37.98,37.25
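For reference, the default ranking shown by the Space can be reproduced directly from this CSV with pandas, mirroring `get_baseline_df()` in app.py (a sketch, not part of the commit):

```python
# Sketch: reproduce the default leaderboard view from the raw CSV by sorting
# on the validation split, exactly as get_baseline_df() does.
import pandas as pd

df = pd.read_csv("file/result_egoplan_bench.csv")
ranked = df.sort_values(by="Validation Split", ascending=False)
print(ranked[["Model", "Large Language Model", "Validation Split"]].head(5))
```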
requirements.txt
ADDED
@@ -0,0 +1,70 @@
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+APScheduler==3.10.1
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+contourpy==1.0.7
+cycler==0.11.0
+datasets==2.12.0
+entrypoints==0.4
+fastapi==0.95.1
+ffmpy==0.3.0
+filelock==3.11.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+gradio==3.27.0
+gradio_client==0.1.3
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.13.4
+idna==3.4
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.4
+numpy==1.24.2
+orjson==3.8.10
+packaging==23.1
+pandas==2.0.0
+Pillow==9.5.0
+plotly==5.14.1
+pyarrow==11.0.0
+pydantic==1.10.7
+pydub==0.25.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.28.2
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.26.1
+toolz==0.12.0
+tqdm==4.65.0
+transformers==4.28.1
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+uc-micro-py==1.0.1
+urllib3==1.26.15
+uvicorn==0.21.1
+websockets==11.0.1
+yarl==1.8.2