Spaces:
Sleeping
Sleeping
vtrv.vls
committed on
Commit
·
1639c46
1
Parent(s):
f9d1508
Tabs test
Browse files- app.py +63 -4
- constants.py +315 -0
- test.md +1 -0
app.py
CHANGED
@@ -1,10 +1,69 @@
|
|
1 |
-
import gradio
|
2 |
-
|
3 |
import os
|
4 |
|
|
|
|
|
|
|
|
|
|
|
5 |
def gen(content):
    """Return the result of ``generate`` for ``content``.

    ``content`` is forwarded unchanged, with the fixed string
    ``'auth_token.json'`` as the second positional argument.
    """
    return generate(content, 'auth_token.json')
|
8 |
|
9 |
-
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio
|
2 |
+
import argparse
|
3 |
import os
|
4 |
|
5 |
+
from utils import generate
|
6 |
+
from constants import css, js_code, js_light
|
7 |
+
|
8 |
+
# Module-level placeholder; nothing in the visible code assigns or reads it.
# NOTE(review): presumably meant to hold the loaded MERA table — confirm.
MERA_table = None
|
9 |
+
|
10 |
def gen(content):
    """Return the result of ``generate`` for ``content``.

    ``content`` is forwarded unchanged, with the fixed string
    ``'auth_token.json'`` as the second positional argument.
    """
    return generate(content, 'auth_token.json')
|
13 |
|
14 |
+
def tab_arena():
    """Create the text-in/text-out arena interface around ``gen``.

    Meant to be called while a ``gradio.Blocks`` context is active (it is
    called inside a ``TabItem`` when the demo is assembled), so that the
    interface becomes part of that page.

    Returns:
        The constructed ``gradio.Interface``.
    """
    arena = gradio.Interface(fn=gen, inputs="text", outputs="text")
    # Deliberately no ``arena.launch()``: launching here would start a second,
    # separate app while the surrounding Blocks is still being built, instead
    # of embedding the interface in the tab.
    # NOTE(review): this relies on an Interface created inside an active
    # Blocks context attaching itself to that context — confirm against the
    # installed Gradio version.
    return arena
|
17 |
+
|
18 |
+
# Placeholder markdown, read once at import time.  The encoding is explicit so
# decoding does not depend on the platform's default locale.
# NOTE(review): the accompanying change adds ``test.md`` while this opens
# ``_test.md`` — confirm which file name is intended.
with open("_test.md", "r", encoding="utf-8") as f:
    TEST_MD = f.read()
|
20 |
+
|
21 |
+
|
22 |
+
def build_demo():
    """Assemble the four-tab Gradio page and return it without launching.

    Tabs 0, 1 and 3 display the ``TEST_MD`` markdown; tab 2 hosts whatever
    ``tab_arena`` builds.

    Returns:
        The ``gradio.Blocks`` instance.
    """
    tab_elem_id = "od-benchmark-tab-table"

    def _markdown_tab(label, tab_id, classes):
        # One tab whose only content is the TEST_MD markdown.
        with gradio.TabItem(label, elem_id=tab_elem_id, id=tab_id):
            gradio.Markdown(TEST_MD, elem_classes=classes)

    with gradio.Blocks(theme=gradio.themes.Soft(), css=css, js=js_light) as demo:
        with gradio.Tabs(elem_classes="tab-buttons"):
            _markdown_tab("πΌ MERA leaderboard", 0, "markdown-text-details")
            _markdown_tab("π SBS by categories and criteria", 1, "markdown-text-details")

            with gradio.TabItem("π₯ Model arena", elem_id=tab_elem_id, id=2):
                tab_arena()

            _markdown_tab("πͺ About MERA", 3, "markdown-text")

    return demo
|
56 |
+
|
57 |
+
if __name__ == "__main__":
    # Script entry point: parse CLI flags, build the page, start the server.
    parser = argparse.ArgumentParser()
    # ``--share`` has to be declared because ``args.share`` is read below;
    # reading an undeclared option from the Namespace raises AttributeError.
    parser.add_argument("--share", action="store_true")
    # parser.add_argument("--bench_table", help="Path to MERA table", default="data_dir/MERA_jun2024.jsonl")
    args = parser.parse_args()
    # data_load(args.result_file)
    # TYPES = ["number", "markdown", "number"]
    demo = build_demo()
    demo.launch(share=args.share, height=3000, width="110%")
|
66 |
+
|
67 |
+
# demo = gradio.Interface(fn=gen, inputs="text", outputs="text")
|
68 |
+
# demo.launch()
|
69 |
+
|
constants.py
ADDED
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pathlib import Path
|
2 |
+
from collections import OrderedDict
|
3 |
+
|
4 |
+
# DEFAULT_K = "β"
DEFAULT_K = "1500"

# Banner image URL and the HTML snippet that embeds it.
banner_url = "https://allenai.github.io/WildBench/gray_banner.png" # the same repo here.
BANNER = f'<div style="display: flex; justify-content: flex-start;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 800px;"> </div>'

# Centered page heading.  The <h1> element is closed with </h1> (not </b>) so
# the markup is well-formed.
TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> π¦ AI2 WildBench Leaderboard </h1> </body> </html>"

# HTML snippet embedding the pairwise win-fraction heatmap image.
WINRATE_HEATMAP = "<div><img src='https://github.com/WildEval/WildBench-Leaderboard/blob/main/gradio/pairwise_win_fractions.png?raw=true' style='width:100%;'></div>"
|
13 |
+
|
14 |
+
# BibTeX entry for WildBench.  The fourth author is "Faeze Brahman"; the name
# must not contain the benchmark label "Arena-Hard".
CITATION_TEXT = """@misc{wildbench2024,
title = {WildBench: Benchmarking Language Models with Challenging Tasks from Real Users in the Wild},
author = {Bill Yuchen Lin and Yuntian Deng and Khyathi Chandu and Faeze Brahman and Abhilasha Ravichander and Valentina Pyatkin and Ronan Le Bras and Yejin Choi},
year = 2024,
url = {https://huggingface.co/spaces/allenai/WildBench},
}
"""
|
21 |
+
|
22 |
+
# Header labels of the two aggregate reward columns.
REWARD_MIX_COLUMN = "π Reward-Mix (Avg)"
MACRO_COLUMN = "π Reward (Macro)"

# Raw field name -> column header.  Built from a list of pairs; the
# OrderedDict keeps them in the order written here.
column_names = OrderedDict([
    ("model_name", "Model"),
    ("WB_score", "π― WB Score"),
    ("WB_score.task_macro", "π― Score Macro"),
    ("Arena Elo (hard-en) - 2024-06-06", "LMSYS Elo"),
    ("Arena-Hard v0.1", "Arena-Hard"),
    ("AE2.0 LC", "AE2-LCWR"),
    ("AE2.0", "AE2-WR"),
    ("#chars", "Length"),
    ("Length", "Len"),
    ("task_macro_reward", "π Task-Macro"),
])
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
# Markdown remark explaining how a WB Reward value is assigned per comparison.
LEADERBOARD_REMARKS = (
    "**WB Reward**: for each comparison (A vs B), a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; when there is a **Tie**, the reward is **0**.\n"
)

# Longer markdown remark covering WB Reward, the baselines, WB Score and the
# evaluator; starts with an empty line.
LEADERBOARD_REMARKS_MAIN = (
    "\n"
    "**WB Reward**: for each pairwise comparison, a reward for A is **+/-1** if A is **much better/worse** than B, and **+/-0.5** if A is **slightly better/worse** than B; 0 for a **Tie**.\n"
    "The baseline models are GPT4-Turbo, Haiku, and Llama2-70B, and Mix is the average of the three.\n"
    "**WB Score** individually scores each model based on checklists.\n"
    "Evaluator is GPT-4-Turbo.\n"
)

# Markdown description of the length-margin tie rule, followed by the legend.
LENGTH_MARGIN_DESC_MD = (
    "To mitigate the length bias, we consider it a **Tie** when A is only **slightly** better than B but A is longer than B by more than K chars.\n"
    "\n"
    "π for closed LLMs; π¨ for newly added models;\n"
)
|
65 |
+
|
66 |
+
# Alias of the Reward-Mix column label.
# NOTE(review): presumably the column used for ranking — confirm with callers.
RANKING_COLUMN = REWARD_MIX_COLUMN

# Column headers in the order they are listed: model and aggregate columns
# first, then the per-baseline columns, then the external benchmarks and length.
ORDERED_COLUMN_NAMES = (
    ["Model", MACRO_COLUMN, "π― Score Macro", REWARD_MIX_COLUMN]
    + ["π π― GPT4T", "π π― Haiku", "π π― Llama"]
    + ["LMSYS Elo", "Arena-Hard", "AE2-LCWR", "Len"]
)
|
84 |
+
|
85 |
+
|
86 |
+
# The eleven fine-grained task type names.
all_task_types_raw = [
    'Information seeking', 'Coding & Debugging', 'Math', 'Data Analysis',
    'Planning', 'Reasoning', 'Creative Writing', 'Editing',
    'Role playing', 'Advice seeking', 'Brainstorming',
]

# The five grouped task type names.
all_task_types = [
    'Creative Tasks',
    'Planning & Reasoning',
    'Math & Data Analysis',
    'Information/Advice seeking',
    'Coding & Debugging',
]

# Fine-grained task type -> short tag.
TASK_NAME_MAPPING_RAW = dict([
    ('Information seeking', 'InfoSek'),
    ('Creative Writing', 'CrtWrt'),
    ('Coding & Debugging', 'Code'),
    ('Reasoning', 'Reason'),
    ('Editing', 'Edit'),
    ('Math', 'Math'),
    ('Planning', 'Plan'),
    ('Brainstorming', 'Brnstrm'),
    ('Role playing', 'RolPly'),
    ('Advice seeking', 'AdvSek'),
    ('Data Analysis', 'DataAna'),
])

# Grouped task type -> display label.
TASK_NAME_MAPPING = dict([
    ('Planning & Reasoning', 'π Reason & Plan'),
    ('Math & Data Analysis', 'π Math & Data'),
    ('Coding & Debugging', 'π» Code & Debug'),
    ('Creative Tasks', 'π Creative'),
    ('Information/Advice seeking', 'βΉοΈ Info Seek'),
])
|
125 |
+
|
126 |
+
# JavaScript source: reloads the page with ``__theme=light`` in the query
# string unless that parameter is already set to ``light``.
js_light = "\n".join([
    "",
    "function refresh() {",
    "const url = new URL(window.location);",
    "",
    "if (url.searchParams.get('__theme') !== 'light') {",
    "url.searchParams.set('__theme', 'light');",
    "window.location.href = url.href;",
    "}",
    "}",
    "",
])

# JavaScript source: resets ``scrollTop`` of every ``.bubble-wrap`` element,
# staggering the resets by 100 ms per element.
js_code = "\n".join([
    "",
    "function scroll_top() {",
    'console.log("Hello from Gradio!");',
    "const bubbles = document.querySelectorAll('.bubble-wrap');",
    "bubbles.forEach((bubble, index) => {",
    "setTimeout(() => {",
    "bubble.scrollTop = 0;",
    "}, index * 100); // Delay of 100ms between each iteration",
    "});",
    "}",
    "",
])

# Markdown legend spelling out the short task tags.
TASK_TYPE_STR = (
    "**Tasks**: Info seeking (**InfoSek**), Creative Writing (**CrtWrt**), "
    "Coding&Debugging (**Code**), Reasoning (**Reason**), Editing (**Edit**), "
    "**Math**, Planning (**Plan**), Brainstorming (**Brnstrm**), "
    "Role playing (**RolPly**), Advice seeking (**AdvSek**), "
    "Data Analysis (**DataAna**)"
)
|
151 |
+
|
152 |
+
# Custom stylesheet for the Gradio page.
# Only valid CSS properties are used: text colour is set with ``color``,
# italics with ``font-style`` and boldness with ``font-weight`` (there are no
# ``font-color`` / ``font-decoration`` properties, and ``italic`` is not a
# ``font-weight`` value).  ``.markdown-text-tiny`` is declared once, with the
# 12pt size that previously took effect as the later of two rules.
css = """



code {
font-size: large;
}
footer {visibility: hidden}
.top-left-LP{
margin-top: 6px;
margin-left: 5px;
}
.no_margin{
margin-top: 0px;
margin-left: 0px;
margin-right: 0px;
margin-bottom: 0px;
padding-top: 0px;
padding-left: 0px;
padding-right: 0px;
padding-bottom: 0px;
}
.markdown-text{font-size: 14pt}
.markdown-text-small{font-size: 13pt}
.markdown-text-tiny{font-size: 12pt}
.markdown-text-tiny-red{
font-size: 12pt;
color: red;
background-color: yellow;
font-weight: bold;
}
th {
text-align: center;
font-size: 17px; /* Adjust the font size as needed */
}
td {
font-size: 15px; /* Adjust the font size as needed */
text-align: center;
}

.sample_button{
border: 1px solid #000000;
border-radius: 5px;
padding: 5px;
font-size: 15pt;
font-weight: bold;
margin: 5px;
}

.chat-common{
height: auto;
max-height: 400px;
min-height: 100px;
}
.chat-specific{
height: auto;
max-height: 600px;
min-height: 200px;
}
#od-benchmark-tab-table-button{
font-size: 15pt;
font-weight: bold;
}

.btn_boderline{
border: 1px solid #000000;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: bold;
}

.btn_boderline_next{
border: 0.1px solid #000000;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: bold;
}

.btn_boderline_gray{
border: 0.5px solid gray;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-style: italic;
}
.btn_boderline_selected{
border: 2px solid purple;
background-color: #f2f2f2;
border-radius: 5px;
padding: 5px;
margin: 5px;
font-size: 15pt;
font-weight: bold;
}
.accordion-label button span{
font-size: 14pt;
font-weight: bold;
}

#show-task-categorized span{
font-size: 13pt;
font-weight: bold;
}

#show-open-source-models span{
font-size: 13pt;
font-weight: bold;
}

#select-models span{
font-size: 10pt;
}

#select-tasks span{
font-size: 10pt;
}


.markdown-text-details{
margin: 10px;
padding: 10px;
}


button.selected[role="tab"][aria-selected="true"] {
font-size: 18px; /* or any other size you prefer */
font-weight: bold;
}

#od-benchmark-tab-table-ablation-button {
font-size: larger; /* Adjust the font size as needed */
}


.plotly-plot{
height: auto;
max-height: 600px;
min-height: 600px;
}

#length-margin-radio{
font-size: 10pt;
padding: 0px;
margin: 0px;
}

#show-task-categorized{
font-size: 12pt;
font-weight: bold;
}

#show-open-source-models{
font-size: 12pt;
font-weight: bold;
}
"""
|
315 |
+
|
test.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
## TEST
|