Update EgoPlan-Bench Leaderboard.
Files changed:

- .gitignore +2 -0
- README.md +3 -3
- app.py +105 -0
- constants.py +31 -0
- file/result_egoplan_bench.csv +29 -0
- requirements.txt +70 -0
.gitignore
ADDED
@@ -0,0 +1,2 @@
+.idea
+__pycache__
README.md
CHANGED
@@ -1,10 +1,10 @@
 ---
 title: EgoPlan-Bench Leaderboard
-emoji:
-colorFrom:
+emoji: ⚡
+colorFrom: yellow
 colorTo: gray
 sdk: gradio
-sdk_version:
+sdk_version: 3.27.0
 app_file: app.py
 pinned: false
 ---
app.py
ADDED
@@ -0,0 +1,105 @@
+"""
+Adapted from the SEED-Bench Leaderboard by AILab-CVC
+Source: https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard
+"""
+
+import gradio as gr
+import pandas as pd
+
+from constants import *
+
+global data_component, filter_component
+
+
+def upload_file(files):
+    # Currently unused; kept for a future result-submission workflow.
+    file_paths = [file.name for file in files]
+    return file_paths
+
+
+def get_baseline_df():
+    # Load the results CSV, keep only the model info plus the default split,
+    # and rank models by that split in descending order.
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by=DEFAULT_SPLIT, ascending=False)
+    present_columns = MODEL_INFO + [DEFAULT_SPLIT]
+    df = df[present_columns]
+    return df
+
+
+def get_all_df():
+    # Load the full results CSV (all splits), sorted by the default split.
+    df = pd.read_csv(CSV_DIR)
+    df = df.sort_values(by=DEFAULT_SPLIT, ascending=False)
+    return df
+
+
+block = gr.Blocks()
+
+with block:
+    gr.Markdown(LEADERBORAD_INTRODUCTION)
+
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        with gr.TabItem("🏅 EgoPlan Benchmark", elem_id="evalcrafter-benchmark-tab-table", id=0):
+            gr.Markdown(TABLE_INTRODUCTION)
+
+            dropdown_value = gr.inputs.Dropdown(
+                choices=SPLIT_INFO,
+                default=DEFAULT_SPLIT,
+            )
+
+            data_component = gr.components.Dataframe(
+                value=get_baseline_df,
+                headers=COLUMN_NAMES,
+                type="pandas",
+                datatype=DATA_TITILE_TYPE,
+                interactive=False,
+                visible=True,
+            )
+
+            def on_dropdown_value_change(selected_split):
+                # Rebuild the table for the selected split, dropping models
+                # that have no score on that split.
+                present_columns = MODEL_INFO + [selected_split]
+                updated_data = get_all_df()[present_columns].dropna()
+                updated_data = updated_data.sort_values(by=present_columns[-1], ascending=False)
+                updated_headers = present_columns
+                update_datatype = [DATA_TITILE_TYPE[COLUMN_NAMES.index(x)] for x in updated_headers]
+
+                filter_component = gr.components.Dataframe(
+                    value=updated_data,
+                    headers=updated_headers,
+                    type="pandas",
+                    datatype=update_datatype,
+                    interactive=False,
+                    visible=True,
+                )
+                # Only the underlying value is returned; it refreshes data_component.
+                return filter_component.value
+
+            dropdown_value.change(fn=on_dropdown_value_change, inputs=dropdown_value, outputs=data_component)
+
+        with gr.TabItem("📝 About", elem_id="egoplan-benchmark-tab-table", id=2):
+            gr.Markdown(LEADERBORAD_INFO, elem_classes="markdown-text")
+
+    with gr.Row():
+        data_run = gr.Button("Refresh")
+        data_run.click(get_baseline_df, outputs=data_component)
+
+    gr.Markdown(r"""
+    Please cite this paper if you find it useful ♥️:
+
+    ```bibtex
+    @article{chen2023egoplan,
+      title={EgoPlan-Bench: Benchmarking Multimodal Large Language Models for Human-Level Planning},
+      author={Chen, Yi and Ge, Yuying and Ge, Yixiao and Ding, Mingyu and Li, Bohao and Wang, Rui and Xu, Ruifeng and Shan, Ying and Liu, Xihui},
+      journal={arXiv preprint arXiv:2312.06722},
+      year={2023}
+    }
+    ```
+    """)
+
+block.launch(share=False)
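One portability note, not part of the commit: `gr.inputs.Dropdown(..., default=...)` is the Gradio 3.x input namespace, consistent with the `gradio==3.27.0` pin in requirements.txt. If the Space were ever migrated to Gradio 4.x, where `gr.inputs` no longer exists, the equivalent component would look roughly like this (a sketch assuming a 4.x runtime, hypothetical with respect to this commit):

```python
import gradio as gr

# Hypothetical Gradio 4.x equivalent of the dropdown above: `gr.inputs` was
# removed in Gradio 4, and the `default=` keyword became `value=`.
dropdown_value = gr.Dropdown(
    choices=["Validation Split", "Test Split"],  # SPLIT_INFO in constants.py
    value="Validation Split",                    # DEFAULT_SPLIT
)
```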
constants.py
ADDED
@@ -0,0 +1,31 @@
+SPLIT_INFO = ['Validation Split', 'Test Split']
+MODEL_INFO = ['Model', 'Large Language Model']
+DEFAULT_SPLIT = 'Validation Split'
+DATA_TITILE_TYPE = ["markdown", "markdown", "number", "number"]
+CSV_DIR = "file/result_egoplan_bench.csv"
+
+COLUMN_NAMES = MODEL_INFO + SPLIT_INFO
+
+TABLE_INTRODUCTION = "In the table below, we summarize the model performance on the validation and test splits."
+
+LEADERBORAD_INTRODUCTION = """# EgoPlan-Bench Leaderboard 🏆
+
+Welcome to the EgoPlan-Bench Leaderboard! This leaderboard ranks Multimodal Large Language Models (MLLMs) by their performance on EgoPlan-Bench, which evaluates their planning abilities in real-world, egocentric scenarios. EgoPlan-Bench features realistic tasks, diverse action plans, and intricate visual observations, providing a challenging assessment platform for MLLMs. Explore the leaderboard to track the progress of MLLMs towards human-level planning! 🛫
+
+Join our evaluation by sending an email 📧 ([email protected])! You may also read the [Code](https://github.com/ChenYi99/EgoPlan), [Paper](https://arxiv.org/pdf/2312.06722), and [Project page](https://chenyi99.github.io/ego_plan/) for more detailed information 🤗
+"""
+
+LEADERBORAD_INFO = """
+The pursuit of artificial general intelligence (AGI) has been accelerated by Multimodal Large Language Models (MLLMs), which exhibit superior reasoning, generalization capabilities, and proficiency in processing multimodal inputs. A crucial milestone in the evolution of AGI is the attainment of human-level planning, a fundamental ability for making informed decisions in complex environments and solving a wide range of real-world problems. Despite the impressive advancements in MLLMs, a question remains: **How far are current MLLMs from achieving human-level planning?**
+
+To shed light on this question, we introduce EgoPlan-Bench, a comprehensive benchmark for evaluating the planning abilities of MLLMs in real-world scenarios from an egocentric perspective, mirroring human perception. EgoPlan-Bench emphasizes the evaluation of planning capabilities, featuring realistic tasks, diverse action plans, and intricate visual observations. Our rigorous evaluation of a wide range of MLLMs reveals that EgoPlan-Bench poses significant challenges, highlighting substantial room for improvement before MLLMs achieve human-level task planning. To facilitate this advancement, we further present EgoPlan-IT, a specialized instruction-tuning dataset that effectively enhances model performance on EgoPlan-Bench. We have made all code, data, and a maintained benchmark leaderboard available to advance future research.
+"""
+
+CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+CITATION_BUTTON_TEXT = r"""@article{chen2023egoplan,
+  title={EgoPlan-Bench: Benchmarking Multimodal Large Language Models for Human-Level Planning},
+  author={Chen, Yi and Ge, Yuying and Ge, Yixiao and Ding, Mingyu and Li, Bohao and Wang, Rui and Xu, Ruifeng and Shan, Ying and Liu, Xihui},
+  journal={arXiv preprint arXiv:2312.06722},
+  year={2023}
+}"""
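Because app.py looks up each column's display type by position, via `DATA_TITILE_TYPE[COLUMN_NAMES.index(x)]`, the two lists must stay aligned one-to-one. A small sanity check one could run against this module (a sketch, not part of the commit):

```python
# Sketch: guard against COLUMN_NAMES and DATA_TITILE_TYPE drifting apart,
# since on_dropdown_value_change indexes one list by position in the other.
from constants import COLUMN_NAMES, DATA_TITILE_TYPE

assert len(COLUMN_NAMES) == len(DATA_TITILE_TYPE), (
    f"expected one datatype per column, got {len(COLUMN_NAMES)} columns "
    f"and {len(DATA_TITILE_TYPE)} datatypes"
)
```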
file/result_egoplan_bench.csv
ADDED
@@ -0,0 +1,29 @@
+Model,Large Language Model,Validation Split,Test Split
+BLIP-2,Flan-T5-XL,26.71,27.90
+InstructBLIP,Flan-T5-XL,28.09,25.19
+InstructBLIP Vicuna,Vicuna-7B,26.53,26.64
+LLaVA,LLaMA-7B,27.00,28.16
+MiniGPT-4,Vicuna-7B,28.11,30.93
+VPGTrans,LLaMA-7B,27.38,24.12
+MultiModal-GPT,Vicuna-7B,27.81,30.43
+Otter,LLaMA-7B,28.08,30.87
+OpenFlamingo,LLaMA-7B,27.67,30.18
+LLaMA-Adapter V2,LLaMA-7B,27.81,30.43
+GVT,Vicuna-7B,27.87,29.67
+mPLUG-Owl,LLaMA-7B,27.63,31.31
+mPLUG-Owl-2,LLaMA2-7B,27.84,30.37
+Kosmos-2,Decoder only 1.3B,26.97,""
+Qwen-VL-Chat,Qwen-7B,27.69,31.06
+LLaVA-1.5,Vicuna-7B,27.81,29.80
+VideoChat,Vicuna-7B,27.51,28.72
+Video-ChatGPT,LLaMA-7B,27.33,29.17
+Valley,LLaMA-13B,27.27,30.11
+Video-LLaMA,LLaMA2-Chat-7B,28.58,30.30
+SEED-LLaMA,LLaMA2-Chat-13B,29.93,""
+SEED-X,LLaMA2-Chat-13B,31.07,29.92
+DeepSeek-VL-Chat,DeepSeek-LLM-7B,27.57,26.01
+CogVLM,Vicuna-7B,27.48,31.06
+Yi-VL,Yi-6B,28.67,30.56
+Xcomposer,InternLM-7B,37.17,36.36
+Gemini-Pro-Vision,\-,30.46,32.39
+GPT-4V,\-,37.98,37.25
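For reference, the default ranking shown by the Space can be reproduced directly from this CSV with pandas, mirroring `get_baseline_df()` in app.py (a sketch, not part of the commit):

```python
# Sketch: reproduce the default leaderboard view from the raw CSV by sorting
# on the validation split, exactly as get_baseline_df() does.
import pandas as pd

df = pd.read_csv("file/result_egoplan_bench.csv")
ranked = df.sort_values(by="Validation Split", ascending=False)
print(ranked[["Model", "Large Language Model", "Validation Split"]].head(5))
```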
requirements.txt
ADDED
@@ -0,0 +1,70 @@
+aiofiles==23.1.0
+aiohttp==3.8.4
+aiosignal==1.3.1
+altair==4.2.2
+anyio==3.6.2
+APScheduler==3.10.1
+async-timeout==4.0.2
+attrs==23.1.0
+certifi==2022.12.7
+charset-normalizer==3.1.0
+click==8.1.3
+contourpy==1.0.7
+cycler==0.11.0
+datasets==2.12.0
+entrypoints==0.4
+fastapi==0.95.1
+ffmpy==0.3.0
+filelock==3.11.0
+fonttools==4.39.3
+frozenlist==1.3.3
+fsspec==2023.4.0
+gradio==3.27.0
+gradio_client==0.1.3
+h11==0.14.0
+httpcore==0.17.0
+httpx==0.24.0
+huggingface-hub==0.13.4
+idna==3.4
+Jinja2==3.1.2
+jsonschema==4.17.3
+kiwisolver==1.4.4
+linkify-it-py==2.0.0
+markdown-it-py==2.2.0
+MarkupSafe==2.1.2
+matplotlib==3.7.1
+mdit-py-plugins==0.3.3
+mdurl==0.1.2
+multidict==6.0.4
+numpy==1.24.2
+orjson==3.8.10
+packaging==23.1
+pandas==2.0.0
+Pillow==9.5.0
+plotly==5.14.1
+pyarrow==11.0.0
+pydantic==1.10.7
+pydub==0.25.1
+pyparsing==3.0.9
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+python-multipart==0.0.6
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+PyYAML==6.0
+requests==2.28.2
+semantic-version==2.10.0
+six==1.16.0
+sniffio==1.3.0
+starlette==0.26.1
+toolz==0.12.0
+tqdm==4.65.0
+transformers==4.28.1
+typing_extensions==4.5.0
+tzdata==2023.3
+tzlocal==4.3
+uc-micro-py==1.0.1
+urllib3==1.26.15
+uvicorn==0.21.1
+websockets==11.0.1
+yarl==1.8.2