zhwang4ai committed on
Commit
229c9d9
·
verified ·
1 Parent(s): 96b85d6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+
4
+ import gradio as gr
5
+ import pandas as pd
6
+
7
# Page heading; passed to gr.HTML, so it is raw HTML rather than Markdown.
TITLE = """<h1 align="center" id="space-title">LLM Leaderboard for Minecraft</h1>"""

# Introductory text; passed to gr.Markdown below the heading.
# A plain string is used because there is nothing to interpolate.
DESCRIPTION = """
Evaluation of VLM on Minecraft
"""

# NOTE(review): not referenced anywhere in the visible part of this file.
BENCHMARKS_TO_SKIP = []
14
+
15
+
16
def get_leaderboard_df(score_path, threshold: str = "20"):
    """Load the score JSON and flatten it into one row per model.

    The file must contain a JSON object mapping each model name to an object
    of metrics. Scalar metrics are copied unchanged. For a metric whose value
    is itself an object, only the entry stored under ``threshold`` is kept,
    and it is written under the metric's own name; the metric is left out of
    the row when it has no entry for ``threshold``.

    Args:
        score_path: Path of the JSON score file.
        threshold: Key to select from nested metric objects. Defaults to
            ``"20"``, the value that used to be hard-coded.

    Returns:
        A ``pandas.DataFrame`` with a ``Model`` column plus one column per
        metric. Metrics missing for a model show up as NaN.
    """
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(score_path, "r", encoding="utf-8") as f:
        scores = json.load(f)

    rows = []
    for model, metrics in scores.items():
        row = {"Model": model}
        for key, value in metrics.items():
            if not isinstance(value, dict):
                row[key] = value
            elif threshold in value:
                # Direct lookup instead of scanning every sub-key.
                row[key] = value[threshold]
        rows.append(row)

    if not rows:
        # Keep the "Model" column even for an empty score file so that
        # column-based consumers still find the index column.
        return pd.DataFrame(columns=["Model"])
    return pd.DataFrame(rows)
36
+
37
+
38
# Module-level table built once at import time; the callbacks and the UI
# widgets below read it. The path is relative to the working directory.
leaderboard_df = get_leaderboard_df(score_path="output/score.json")
39
+
40
+
41
def agg_df(df, agg: str = "max"):
    """Aggregate scores per model, add an average and scale to percentages.

    Args:
        df: Frame with a ``Model`` column and metric columns. A ``Date``
            column is dropped when present and is not required.
        agg: Aggregation name passed to ``GroupBy.agg`` (e.g. ``"min"``,
            ``"max"``, ``"mean"``).

    Returns:
        A new frame with one row per model, an ``Average`` column inserted
        right after ``Model`` (row-wise mean of the numeric columns), all
        numeric columns multiplied by 100, sorted by ``Average`` descending.
        The input frame is not modified.
    """
    # errors="ignore": the score file may have no "Date" metric, in which
    # case an unconditional drop would raise KeyError. drop() already
    # returns a new frame, so no explicit copy is needed.
    df = df.drop(columns="Date", errors="ignore").groupby("Model").agg(agg).reset_index()

    df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))

    # Convert all numeric values (including the new average) to percentages.
    numeric_cols = df.select_dtypes(include=["number"]).columns
    df[numeric_cols] = df[numeric_cols] * 100.0
    return df.sort_values(by=["Average"], ascending=False)
52
+
53
+
54
+ # Function to update the table based on search query
55
def filter_and_search(cols: list[str], search_query: str, agg: str):
    """Build the table to display for the current UI selections.

    Args:
        cols: Metric columns to show; an empty list keeps every column.
        search_query: Semicolon-separated search terms matched
            case-insensitively against the ``Model`` column. A row is kept
            when any term matches. Terms are treated as regular expressions;
            if the combined pattern is not valid regex, the terms are matched
            literally instead.
        agg: Aggregation name forwarded to ``agg_df``.

    Returns:
        The aggregated frame, filtered by the search terms and restricted to
        the selected columns. When ``cols`` is non-empty, ``Average`` is
        recomputed over the selected columns only.
    """
    import re  # local import: needed only to validate/escape the query

    df = agg_df(leaderboard_df, agg)

    # Discard empty terms: "foo;" would otherwise yield the pattern "foo|",
    # whose empty alternative matches every model.
    terms = [term.strip().lower() for term in (search_query or "").split(";")]
    terms = [term for term in terms if term]
    if terms:
        pattern = "|".join(terms)
        try:
            re.compile(pattern)
        except re.error:
            # Queries such as "c++" or "(" are not valid regex; match them
            # literally rather than raising inside the UI callback.
            pattern = "|".join(re.escape(term) for term in terms)
        df = df[df["Model"].str.lower().str.contains(pattern, regex=True)]
        # Drop any columns which are all NaN. Skipped when nothing matched,
        # because on an empty frame this would remove every column,
        # including "Model".
        if not df.empty:
            df = df.dropna(how="all", axis=1)

    if cols:
        index_cols = list(leaderboard_df.columns[:1])
        # A selected column may have been removed above as all-NaN; keep
        # only the ones still present to avoid a KeyError.
        kept = [c for c in cols if c in df.columns]
        df = df[index_cols + kept].copy()
        if kept:
            # Drop rows that have no value in any selected column.
            df = df.dropna(how="all", axis=0, subset=kept)
        # Recompute the average over the selected columns only.
        df.insert(loc=1, column="Average", value=df.mean(axis=1, numeric_only=True))
    return df
72
+
73
+
74
# Assemble the Gradio page: heading, description, search/aggregation controls,
# column picker and the leaderboard table, then wire the controls to
# filter_and_search and start the app.
with gr.Blocks() as demo:
    gr.HTML(TITLE)
    with gr.Column():
        gr.Markdown(DESCRIPTION, elem_classes="markdown-text")
        with gr.Row():
            search_bar = gr.Textbox(placeholder="Search for your model...", show_label=False)
            agg = gr.Radio(
                ["min", "max", "mean"],
                value="max",
                label="Aggregation",
                info="How to aggregate results for each model",
            )

        # A threshold selector ("20", "50", "100", "200") was sketched here
        # but is not wired up; the threshold is fixed at 20 (see the note
        # rendered under the table).

        # Every column after the first one ("Model").
        metric_columns = leaderboard_df.columns[1:]
        with gr.Row():
            cols_bar = gr.CheckboxGroup(
                choices=[name for name in metric_columns if name != "Average"],
                show_label=False,
                info="Select columns to display",
            )
        with gr.Group():
            leaderboard_table = gr.Dataframe(
                value=leaderboard_df,
                wrap=True,
                column_widths=[400, 110] + [(260 + len(name)) for name in metric_columns],
            )

        threshold_text = gr.HTML("Threshold corresponding to the values of gui and embodied: 20")

    # All three controls trigger the same table refresh.
    refresh_inputs = [cols_bar, search_bar, agg]
    refresh_outputs = [leaderboard_table]
    cols_bar.change(filter_and_search, inputs=refresh_inputs, outputs=refresh_outputs)
    agg.change(filter_and_search, inputs=refresh_inputs, outputs=refresh_outputs)
    search_bar.submit(filter_and_search, inputs=refresh_inputs, outputs=refresh_outputs)

demo.launch()