ChenYi99 committed
Commit 6c61a43
1 Parent(s): a5615cf

Update EgoPlan-Bench Leaderboard.

Files changed (6)
  1. .gitignore +2 -0
  2. README.md +3 -3
  3. app.py +105 -0
  4. constants.py +31 -0
  5. file/result_egoplan_bench.csv +29 -0
  6. requirements.txt +70 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ .idea
+ __pycache__
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
  title: EgoPlan-Bench Leaderboard
- emoji: 🏃
+ emoji:
- colorFrom: red
+ colorFrom: yellow
  colorTo: gray
  sdk: gradio
- sdk_version: 4.33.0
+ sdk_version: 3.27.0
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,105 @@
+ """
+ Adapted from the SEED-Bench Leaderboard by AILab-CVC
+ Source: https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard
+ """
+
+ import gradio as gr
+ import pandas as pd
+ from constants import *
+
+ global data_component, filter_component
+
+
+ def upload_file(files):
+     file_paths = [file.name for file in files]
+     return file_paths
+
+
+ def get_baseline_df():
+     # Load the results CSV, sort by the default split (descending), and
+     # keep only the model-info columns plus the default split.
+     df = pd.read_csv(CSV_DIR)
+     df = df.sort_values(by=DEFAULT_SPLIT, ascending=False)
+     present_columns = MODEL_INFO + [DEFAULT_SPLIT]
+     df = df[present_columns]
+     return df
+
+
+ def get_all_df():
+     # Load the full results CSV, sorted by the default split (descending).
+     df = pd.read_csv(CSV_DIR)
+     df = df.sort_values(by=DEFAULT_SPLIT, ascending=False)
+     return df
+
+
+ block = gr.Blocks()
+
+ with block:
+     gr.Markdown(LEADERBOARD_INTRODUCTION)
+
+     with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 EgoPlan Benchmark", elem_id="egoplan-benchmark-tab-table", id=0):
+             gr.Markdown(TABLE_INTRODUCTION)
+
+             dropdown_value = gr.inputs.Dropdown(
+                 choices=SPLIT_INFO,
+                 default=DEFAULT_SPLIT,
+             )
+
+             data_component = gr.components.Dataframe(
+                 value=get_baseline_df,
+                 headers=COLUMN_NAMES,
+                 type="pandas",
+                 datatype=DATA_TITLE_TYPE,
+                 interactive=False,
+                 visible=True,
+             )
+
+             def on_dropdown_value_change(selected_split):
+                 # Rebuild the table for the selected split, dropping models
+                 # that have no score on that split.
+                 present_columns = MODEL_INFO + [selected_split]
+                 updated_data = get_all_df()[present_columns].dropna()
+                 updated_data = updated_data.sort_values(by=present_columns[-1], ascending=False)
+                 updated_headers = present_columns
+                 update_datatype = [DATA_TITLE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                 filter_component = gr.components.Dataframe(
+                     value=updated_data,
+                     headers=updated_headers,
+                     type="pandas",
+                     datatype=update_datatype,
+                     interactive=False,
+                     visible=True,
+                 )
+                 return filter_component.value
+
+             dropdown_value.change(fn=on_dropdown_value_change, inputs=dropdown_value, outputs=data_component)
+
+         # About tab
+         with gr.TabItem("📝 About", elem_id="egoplan-benchmark-tab-table", id=2):
+             gr.Markdown(LEADERBOARD_INFO, elem_classes="markdown-text")
+
+     with gr.Row():
+         data_run = gr.Button("Refresh")
+         data_run.click(get_baseline_df, outputs=data_component)
+
+     gr.Markdown(r"""
+     Please cite this paper if you find it useful ♥️:
+
+     ```bibtex
+     @article{chen2023egoplan,
+       title={EgoPlan-Bench: Benchmarking Multimodal Large Language Models for Human-Level Planning},
+       author={Chen, Yi and Ge, Yuying and Ge, Yixiao and Ding, Mingyu and Li, Bohao and Wang, Rui and Xu, Ruifeng and Shan, Ying and Liu, Xihui},
+       journal={arXiv preprint arXiv:2312.06722},
+       year={2023}
+     }
+     ```
+     """)
+
+ block.launch(share=False)
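For reference, a minimal, self-contained sketch (not part of the commit; the model names and scores below are made up) of the same dropdown-to-table pattern app.py implements, assuming gradio==3.27.0 as pinned in requirements.txt:

```python
# Illustrative sketch only -- not part of the commit. It mirrors the
# on_dropdown_value_change flow in app.py with a toy in-memory table.
import gradio as gr
import pandas as pd

toy_df = pd.DataFrame({
    "Model": ["Model-A", "Model-B", "Model-C"],  # hypothetical rows
    "Validation Split": [31.1, 28.4, 27.0],
    "Test Split": [29.9, None, 30.2],            # None = missing score
})

def show_split(selected_split):
    # Keep the model column plus the chosen split, drop rows with no score
    # on that split, and sort best-first -- the same steps app.py performs.
    cols = ["Model", selected_split]
    return toy_df[cols].dropna().sort_values(by=selected_split, ascending=False)

with gr.Blocks() as demo:
    split = gr.Dropdown(choices=["Validation Split", "Test Split"],
                        value="Validation Split")
    table = gr.Dataframe(value=show_split("Validation Split"), interactive=False)
    # Returning a DataFrame from the callback replaces the table contents.
    split.change(fn=show_split, inputs=split, outputs=table)

demo.launch()
```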
constants.py ADDED
@@ -0,0 +1,31 @@
+ SPLIT_INFO = ['Validation Split', 'Test Split']
+ MODEL_INFO = ['Model', 'Large Language Model']
+ DEFAULT_SPLIT = 'Validation Split'
+ DATA_TITLE_TYPE = ["markdown", "markdown", "number", "number"]
+ CSV_DIR = "file/result_egoplan_bench.csv"
+
+ COLUMN_NAMES = MODEL_INFO + SPLIT_INFO
+
+ TABLE_INTRODUCTION = "In the table below, we summarize the model performance on the validation and test splits."
+
+ LEADERBOARD_INTRODUCTION = """# EgoPlan-Bench Leaderboard 🏆
+
+ Welcome to the EgoPlan-Bench Leaderboard! This leaderboard ranks Multimodal Large Language Models (MLLMs) by their performance on EgoPlan-Bench, which evaluates their planning abilities in real-world, egocentric scenarios. EgoPlan-Bench features realistic tasks, diverse action plans, and intricate visual observations, providing a challenging assessment platform for MLLMs. Explore the leaderboard to track the progress of MLLMs toward human-level planning! 🛫
+
+ Join our evaluation by sending an email 📧 ([email protected])! You may also read the [Code](https://github.com/ChenYi99/EgoPlan), [Paper](https://arxiv.org/pdf/2312.06722), and [Project page](https://chenyi99.github.io/ego_plan/) for more detailed information 🤗
+ """
+
+ LEADERBOARD_INFO = """
+ The pursuit of artificial general intelligence (AGI) has been accelerated by Multimodal Large Language Models (MLLMs), which exhibit superior reasoning and generalization capabilities and proficiency in processing multimodal inputs. A crucial milestone in the evolution of AGI is the attainment of human-level planning, a fundamental ability for making informed decisions in complex environments and solving a wide range of real-world problems. Despite the impressive advancements in MLLMs, a question remains: **How far are current MLLMs from achieving human-level planning?**
+
+ To shed light on this question, we introduce EgoPlan-Bench, a comprehensive benchmark for evaluating the planning abilities of MLLMs in real-world scenarios from an egocentric perspective, mirroring human perception. EgoPlan-Bench emphasizes the evaluation of planning capabilities, featuring realistic tasks, diverse action plans, and intricate visual observations. Our rigorous evaluation of a wide range of MLLMs reveals that EgoPlan-Bench poses significant challenges, leaving substantial room for improvement before MLLMs reach human-level task planning. To facilitate this advancement, we further present EgoPlan-IT, a specialized instruction-tuning dataset that effectively enhances model performance on EgoPlan-Bench. We have made all code, data, and a maintained benchmark leaderboard available to advance future research.
+ """
+
+
+ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{chen2023egoplan,
+   title={EgoPlan-Bench: Benchmarking Multimodal Large Language Models for Human-Level Planning},
+   author={Chen, Yi and Ge, Yuying and Ge, Yixiao and Ding, Mingyu and Li, Bohao and Wang, Rui and Xu, Ruifeng and Shan, Ying and Liu, Xihui},
+   journal={arXiv preprint arXiv:2312.06722},
+   year={2023}
+ }"""
file/result_egoplan_bench.csv ADDED
@@ -0,0 +1,29 @@
+ Model,Large Language Model,Validation Split,Test Split
+ BLIP-2,Flan-T5-XL,26.71,27.90
+ InstructBLIP,Flan-T5-XL,28.09,25.19
+ InstructBLIP Vicuna,Vicuna-7B,26.53,26.64
+ LLaVA,LLaMA-7B,27.00,28.16
+ MiniGPT-4,Vicuna-7B,28.11,30.93
+ VPGTrans,LLaMA-7B,27.38,24.12
+ MultiModal-GPT,Vicuna-7B,27.81,30.43
+ Otter,LLaMA-7B,28.08,30.87
+ OpenFlamingo,LLaMA-7B,27.67,30.18
+ LLaMA-Adapter V2,LLaMA-7B,27.81,30.43
+ GVT,Vicuna-7B,27.87,29.67
+ mPLUG-Owl,LLaMA-7B,27.63,31.31
+ mPLUG-Owl-2,LLaMA2-7B,27.84,30.37
+ Kosmos-2,Decoder only 1.3B,26.97,""
+ Qwen-VL-Chat,Qwen-7B,27.69,31.06
+ LLaVA-1.5,Vicuna-7B,27.81,29.80
+ VideoChat,Vicuna-7B,27.51,28.72
+ Video-ChatGPT,LLaMA-7B,27.33,29.17
+ Valley,LLaMA-13B,27.27,30.11
+ Video-LLaMA,LLaMA2-Chat-7B,28.58,30.30
+ SEED-LLaMA,LLaMA2-Chat-13B,29.93,""
+ SEED-X,LLaMA2-Chat-13B,31.07,29.92
+ DeepSeek-VL-Chat,DeepSeek-LLM-7B,27.57,26.01
+ CogVLM,Vicuna-7B,27.48,31.06
+ Yi-VL,Yi-6B,28.67,30.56
+ Xcomposer,InternLM-7B,37.17,36.36
+ Gemini-Pro-Vision,\-,30.46,32.39
+ GPT-4V,\-,37.98,37.25
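Note the two empty Test Split cells (Kosmos-2 and SEED-LLaMA). When app.py loads this file, pandas parses them as NaN, and the split-change handler's .dropna() hides those rows from the Test Split view. A small sketch of that behavior, assuming it runs from the repository root:

```python
# Illustrative sketch, not part of the commit: how the empty Test Split
# cells behave when the leaderboard loads this CSV. pandas reads "" as NaN.
import pandas as pd

df = pd.read_csv("file/result_egoplan_bench.csv")
test_view = (
    df[["Model", "Large Language Model", "Test Split"]]
    .dropna()                                       # drops Kosmos-2, SEED-LLaMA
    .sort_values(by="Test Split", ascending=False)  # best-first, as in app.py
)
print(test_view.to_string(index=False))
```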
requirements.txt ADDED
@@ -0,0 +1,70 @@
+ aiofiles==23.1.0
+ aiohttp==3.8.4
+ aiosignal==1.3.1
+ altair==4.2.2
+ anyio==3.6.2
+ APScheduler==3.10.1
+ async-timeout==4.0.2
+ attrs==23.1.0
+ certifi==2022.12.7
+ charset-normalizer==3.1.0
+ click==8.1.3
+ contourpy==1.0.7
+ cycler==0.11.0
+ datasets==2.12.0
+ entrypoints==0.4
+ fastapi==0.95.1
+ ffmpy==0.3.0
+ filelock==3.11.0
+ fonttools==4.39.3
+ frozenlist==1.3.3
+ fsspec==2023.4.0
+ gradio==3.27.0
+ gradio_client==0.1.3
+ h11==0.14.0
+ httpcore==0.17.0
+ httpx==0.24.0
+ huggingface-hub==0.13.4
+ idna==3.4
+ Jinja2==3.1.2
+ jsonschema==4.17.3
+ kiwisolver==1.4.4
+ linkify-it-py==2.0.0
+ markdown-it-py==2.2.0
+ MarkupSafe==2.1.2
+ matplotlib==3.7.1
+ mdit-py-plugins==0.3.3
+ mdurl==0.1.2
+ multidict==6.0.4
+ numpy==1.24.2
+ orjson==3.8.10
+ packaging==23.1
+ pandas==2.0.0
+ Pillow==9.5.0
+ plotly==5.14.1
+ pyarrow==11.0.0
+ pydantic==1.10.7
+ pydub==0.25.1
+ pyparsing==3.0.9
+ pyrsistent==0.19.3
+ python-dateutil==2.8.2
+ python-multipart==0.0.6
+ pytz==2023.3
+ pytz-deprecation-shim==0.1.0.post0
+ PyYAML==6.0
+ requests==2.28.2
+ semantic-version==2.10.0
+ six==1.16.0
+ sniffio==1.3.0
+ starlette==0.26.1
+ toolz==0.12.0
+ tqdm==4.65.0
+ transformers==4.28.1
+ typing_extensions==4.5.0
+ tzdata==2023.3
+ tzlocal==4.3
+ uc-micro-py==1.0.1
+ urllib3==1.26.15
+ uvicorn==0.21.1
+ websockets==11.0.1
+ yarl==1.8.2