dsorokin committed on
Commit
cbb678d
·
1 Parent(s): b31aebf
Files changed (1)
  1. app.py +11 -145
app.py CHANGED
@@ -9,17 +9,11 @@ import numpy as np
 import pandas as pd
 
 
-# notebook_url = "https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing"
-notebook_url = "https://colab.research.google.com/drive/1KdwokPjirkTmpO_P1WByFNFiqxWQquwH#scrollTo=o_CpbkGEbhrK"
-
-
-basic_component_values = [None] * 6
-leader_component_values = [None] * 5
 
 
 def make_default_md():
     leaderboard_md = f"""
-# πŸ† BabilongLeaderboard
+# πŸ† Babilong Leaderboard
 | [GitHub](https://github.com/lm-sys/FastChat) | [Paper](https://arxiv.org/abs/2306.05685) | [Dataset](https://github.com/lm-sys/FastChat/blob/main/docs/dataset_release.md) |
 """
     return leaderboard_md
@@ -31,80 +25,6 @@ def make_arena_leaderboard_md():
     return leaderboard_md
 
 
-def make_full_leaderboard_md(elo_results):
-    leaderboard_md = f"""
-Three benchmarks are displayed: **Arena Elo**, **MT-Bench** and **MMLU**.
-- [Chatbot Arena](https://chat.lmsys.org/?arena) - a crowdsourced, randomized battle platform. We use 200K+ user votes to compute Elo ratings.
-- [MT-Bench](https://arxiv.org/abs/2306.05685): a set of challenging multi-turn questions. We use GPT-4 to grade the model responses.
-- [MMLU](https://arxiv.org/abs/2009.03300) (5-shot): a test to measure a model's multitask accuracy on 57 tasks.
-
-πŸ’» Code: The MT-bench scores (single-answer grading on a scale of 10) are computed by [fastchat.llm_judge](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge).
-The MMLU scores are mostly computed by [InstructEval](https://github.com/declare-lab/instruct-eval).
-Higher values are better for all benchmarks. Empty cells mean not available.
-"""
-    return leaderboard_md
-
-
-def make_leaderboard_md_live(elo_results):
-    leaderboard_md = f"""
-# Leaderboard
-Last updated: {elo_results["last_updated_datetime"]}
-{elo_results["leaderboard_table"]}
-"""
-    return leaderboard_md
-
-
-def update_elo_components(max_num_files, elo_results_file):
-    log_files = get_log_files(max_num_files)
-
-    # Leaderboard
-    if elo_results_file is None:  # Do live update
-        battles = clean_battle_data(log_files)
-        elo_results = report_elo_analysis_results(battles)
-
-        leader_component_values[0] = make_leaderboard_md_live(elo_results)
-        leader_component_values[1] = elo_results["win_fraction_heatmap"]
-        leader_component_values[2] = elo_results["battle_count_heatmap"]
-        leader_component_values[3] = elo_results["bootstrap_elo_rating"]
-        leader_component_values[4] = elo_results["average_win_rate_bar"]
-
-    # Basic stats
-    basic_stats = report_basic_stats(log_files)
-    md0 = f"Last updated: {basic_stats['last_updated_datetime']}"
-
-    md1 = "### Action Histogram\n"
-    md1 += basic_stats["action_hist_md"] + "\n"
-
-    md2 = "### Anony. Vote Histogram\n"
-    md2 += basic_stats["anony_vote_hist_md"] + "\n"
-
-    md3 = "### Model Call Histogram\n"
-    md3 += basic_stats["model_hist_md"] + "\n"
-
-    md4 = "### Model Call (Last 24 Hours)\n"
-    md4 += basic_stats["num_chats_last_24_hours"] + "\n"
-
-    basic_component_values[0] = md0
-    basic_component_values[1] = basic_stats["chat_dates_bar"]
-    basic_component_values[2] = md1
-    basic_component_values[3] = md2
-    basic_component_values[4] = md3
-    basic_component_values[5] = md4
-
-
-def update_worker(max_num_files, interval, elo_results_file):
-    while True:
-        tic = time.time()
-        update_elo_components(max_num_files, elo_results_file)
-        durtaion = time.time() - tic
-        print(f"update duration: {durtaion:.2f} s")
-        time.sleep(max(interval - durtaion, 0))
-
-
-def load_demo(url_params, request: gr.Request):
-    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")
-    return basic_component_values + leader_component_values
-
 
 def model_hyperlink(model_name, link):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
@@ -147,48 +67,6 @@ def load_leaderboard_table_csv(filename, add_hyperlink=True):
     return rows
 
 
-def build_basic_stats_tab():
-    empty = "Loading ..."
-    basic_component_values[:] = [empty, None, empty, empty, empty, empty]
-
-    md0 = gr.Markdown(empty)
-    gr.Markdown("#### Figure 1: Number of model calls and votes")
-    plot_1 = gr.Plot(show_label=False)
-    with gr.Row():
-        with gr.Column():
-            md1 = gr.Markdown(empty)
-        with gr.Column():
-            md2 = gr.Markdown(empty)
-    with gr.Row():
-        with gr.Column():
-            md3 = gr.Markdown(empty)
-        with gr.Column():
-            md4 = gr.Markdown(empty)
-    return [md0, plot_1, md1, md2, md3, md4]
-
-def get_full_table(arena_df, model_table_df):
-    values = []
-    for i in range(len(model_table_df)):
-        row = []
-        model_key = model_table_df.iloc[i]["key"]
-        model_name = model_table_df.iloc[i]["Model"]
-        # model display name
-        row.append(model_name)
-        if model_key in arena_df.index:
-            idx = arena_df.index.get_loc(model_key)
-            row.append(round(arena_df.iloc[idx]["rating"]))
-        else:
-            row.append(np.nan)
-        row.append(model_table_df.iloc[i]["MT-bench (score)"])
-        row.append(model_table_df.iloc[i]["MMLU"])
-        # Organization
-        row.append(model_table_df.iloc[i]["Organization"])
-        # license
-        row.append(model_table_df.iloc[i]["License"])
-
-        values.append(row)
-    values.sort(key=lambda x: -x[1] if not np.isnan(x[1]) else 1e9)
-    return values
 
 
 def build_leaderboard_tab():
@@ -204,27 +82,25 @@ def build_leaderboard_tab():
             headers=[
                 "Rank",
                 "πŸ€– Model",
-                "⭐ Arena Elo",
-                "πŸ“Š 95% CI",
-                "πŸ—³οΈ Votes",
-                "Organization",
-                "License",
-                "Knowledge Cutoff",
+                "qa 1",
+                "qa 2",
+                "qa 3",
+                "qa 4",
+                "qa 5",
             ],
             datatype=[
                 "str",
                 "markdown",
                 "number",
-                "str",
                 "number",
-                "str",
-                "str",
-                "str",
+                "number",
+                "number",
+                "number",
             ],
             # value=arena_table_vals,
             elem_id="arena_leaderboard_dataframe",
             height=700,
-            column_widths=[50, 200, 120, 100, 100, 150, 150, 100],
+            column_widths=[50, 200, 150, 150, 150, 150, 150],
             wrap=True,
         )
 
@@ -268,17 +144,7 @@ footer {
 }
 """
 
-acknowledgment_md = """
-### Acknowledgment
-<div class="image-container">
-    <p> We thank <a href="https://www.kaggle.com/" target="_blank">Kaggle</a>, <a href="https://mbzuai.ac.ae/" target="_blank">MBZUAI</a>, <a href="https://www.anyscale.com/" target="_blank">AnyScale</a>, <a href="https://www.a16z.com/" target="_blank">a16z</a>, and <a href="https://huggingface.co/" target="_blank">HuggingFace</a> for their generous <a href="https://lmsys.org/donations/" target="_blank">sponsorship</a>. </p>
-    <img src="https://upload.wikimedia.org/wikipedia/commons/thumb/7/7c/Kaggle_logo.png/400px-Kaggle_logo.png" alt="Kaggle">
-    <img src="https://mma.prnewswire.com/media/1227419/MBZUAI_Logo.jpg?p=facebookg" alt="MBZUAI">
-    <img src="https://docs.anyscale.com/site-assets/logo.png" alt="AnyScale">
-    <img src="https://a16z.com/wp-content/themes/a16z/assets/images/opegraph_images/corporate-Yoast-Twitter.jpg" alt="a16z">
-    <img src="https://huggingface.co/datasets/huggingface/brand-assets/resolve/main/hf-logo-with-title.png" alt="HuggingFace">
-</div>
-"""
+
 
 def build_demo():
     text_size = gr.themes.sizes.text_lg
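
Below is a minimal, self-contained sketch of the qa 1–qa 5 table that the revised `build_leaderboard_tab` constructs. The rows and model names here are hypothetical placeholders and the `gr.Blocks` scaffolding is assumed; the real app populates the table from its leaderboard CSV (see `load_leaderboard_table_csv`).

```python
# Minimal sketch (not the app's full code): recreates the qa 1–qa 5
# table layout this commit introduces, assuming Gradio 4.x.
import gradio as gr

# Hypothetical example rows: rank, model name, qa 1–qa 5 scores.
example_rows = [
    ["1", "model-a", 0.92, 0.88, 0.75, 0.66, 0.58],
    ["2", "model-b", 0.90, 0.85, 0.70, 0.61, 0.55],
]

with gr.Blocks() as demo:
    gr.Dataframe(
        headers=["Rank", "πŸ€– Model", "qa 1", "qa 2", "qa 3", "qa 4", "qa 5"],
        datatype=["str", "markdown", "number", "number", "number", "number", "number"],
        value=example_rows,
        elem_id="arena_leaderboard_dataframe",
        height=700,
        column_widths=[50, 200, 150, 150, 150, 150, 150],
        wrap=True,
    )

if __name__ == "__main__":
    demo.launch()
```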