panuthept committed on
Commit
c99ced4
·
1 Parent(s): 7fd4e12

test update code

Browse files
Files changed (4) hide show
  1. .gitignore +2 -0
  2. app.py +286 -191
  3. app_demo.py +204 -0
  4. src/about.py +1 -1
.gitignore CHANGED
@@ -11,3 +11,5 @@ eval-results/
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
 
 
 
11
  eval-queue-bk/
12
  eval-results-bk/
13
  logs/
14
+
15
+ .DS_Store
app.py CHANGED
@@ -1,204 +1,299 @@
1
  import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
  import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
 
7
- from src.about import (
8
- CITATION_BUTTON_LABEL,
9
- CITATION_BUTTON_TEXT,
10
- EVALUATION_QUEUE_TEXT,
11
- INTRODUCTION_TEXT,
12
- LLM_BENCHMARKS_TEXT,
13
- TITLE,
14
- )
15
- from src.display.css_html_js import custom_css
16
- from src.display.utils import (
17
- BENCHMARK_COLS,
18
- COLS,
19
- EVAL_COLS,
20
- EVAL_TYPES,
21
- AutoEvalColumn,
22
- ModelType,
23
- fields,
24
- WeightType,
25
- Precision
26
- )
27
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
- from src.submission.submit import add_new_eval
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- def restart_space():
33
- API.restart_space(repo_id=REPO_ID)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- ### Space initialisation
36
- try:
37
- print(EVAL_REQUESTS_PATH)
38
- snapshot_download(
39
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
- )
41
- except Exception:
42
- restart_space()
43
- try:
44
- print(EVAL_RESULTS_PATH)
45
- snapshot_download(
46
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
- )
48
- except Exception:
49
- restart_space()
50
-
51
-
52
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
-
54
- (
55
- finished_eval_queue_df,
56
- running_eval_queue_df,
57
- pending_eval_queue_df,
58
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
-
60
- def init_leaderboard(dataframe):
61
- if dataframe is None or dataframe.empty:
62
- raise ValueError("Leaderboard DataFrame is empty or None.")
63
- return Leaderboard(
64
- value=dataframe,
65
- datatype=[c.type for c in fields(AutoEvalColumn)],
66
- select_columns=SelectColumns(
67
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
- label="Select Columns to Display:",
70
- ),
71
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
- filter_columns=[
74
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
- ColumnFilter(
77
- AutoEvalColumn.params.name,
78
- type="slider",
79
- min=0.01,
80
- max=150,
81
- label="Select the number of parameters (B)",
82
- ),
83
- ColumnFilter(
84
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
- ),
86
- ],
87
- bool_checkboxgroup_label="Hide models",
88
- interactive=False,
89
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
 
91
 
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
96
 
97
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
- with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
- leaderboard = init_leaderboard(LEADERBOARD_DF)
100
-
101
- with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
-
104
- with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
- with gr.Column():
106
- with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
-
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
-
191
- with gr.Row():
192
- with gr.Accordion("πŸ“™ Citation", open=False):
193
- citation_button = gr.Textbox(
194
- value=CITATION_BUTTON_TEXT,
195
- label=CITATION_BUTTON_LABEL,
196
- lines=20,
197
- elem_id="citation-button",
198
- show_copy_button=True,
199
- )
200
-
201
- scheduler = BackgroundScheduler()
202
- scheduler.add_job(restart_space, "interval", seconds=1800)
203
- scheduler.start()
204
- demo.queue(default_concurrency_limit=40).launch()
 
1
  import gradio as gr
 
2
  import pandas as pd
3
+ from css_html_js import custom_css
 
4
 
5
+ TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ INTRODUCTION_TEXT = """
8
+ πŸ“ The πŸ‡²πŸ‡Ύ Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
9
+ ## Dataset
10
+ 📈 We evaluate models based on 3 datasets,
11
+ 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
12
+ - This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
13
+ 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
14
+ - This test is general test for malay grammar.
15
+ 3. General high school science questions, contains 323 questions, https://huggingface.co/datasets/mesolitica/mysoalan.com-qa
16
+ - This test is general test for science.
17
+ 4. Translated MMLU, https://huggingface.co/datasets/mesolitica/translated-MMLU
18
+ - This test is to test general knowledge, originally from MMLU.
19
+ ## Contributions
20
+ 1. Claude 1.3 and 2.0 Tatabahasa contributed by https://www.linkedin.com/in/fahim-surani
21
+ 2. Claude 3.0 contributed by https://github.com/theblackcat102, https://huggingface.co/theblackcat102
22
+ ## Tagging
23
+ 🟒 pretrained β­• instruction-tuned πŸ“¦ close sourced
24
+ """
25
 
26
+ close_source = [
27
+ {
28
+ 'T': 'πŸ“¦',
29
+ 'model': 'claude-3-opus-20240229',
30
+ 'BM-PT3 0-shot': 57.41,
31
+ 'BM-PT3 1-shot': 53.70,
32
+ 'BM-PT3 3-shots': 62.96,
33
+ 'Tatabahasa 0-shot': 77.08,
34
+ 'Tatabahasa 1-shot': 73.93,
35
+ 'Tatabahasa 3-shots': 75.64,
36
+ },
37
+ {
38
+ 'T': 'πŸ“¦',
39
+ 'model': 'claude-3-sonnet-20240229',
40
+ 'BM-PT3 0-shot': 48.15,
41
+ 'BM-PT3 1-shot': 50.00,
42
+ 'BM-PT3 3-shots': 37.04,
43
+ 'Tatabahasa 0-shot': 65.90,
44
+ 'Tatabahasa 1-shot': 38.40,
45
+ 'Tatabahasa 3-shots': 40.97,
46
+ },
47
+ {
48
+ 'T': 'πŸ“¦',
49
+ 'model': 'claude-3-haiku-20240307',
50
+ 'BM-PT3 0-shot': 48.15,
51
+ 'BM-PT3 1-shot': 50.00,
52
+ 'BM-PT3 3-shots': 50.00,
53
+ 'Tatabahasa 0-shot': 62.75,
54
+ 'Tatabahasa 1-shot': 49.86,
55
+ 'Tatabahasa 3-shots': 24.07,
56
+ },
57
+ {
58
+ 'T': 'πŸ“¦',
59
+ 'model': 'AWS Bedrock Claude 1.3',
60
+ 'Tatabahasa 0-shot': 60.650887573964496,
61
+ 'Tatabahasa 1-shot': 62.46418338108882,
62
+ 'Tatabahasa 3-shots': 67.34104046242774,
63
+ },
64
+ {
65
+ 'T': 'πŸ“¦',
66
+ 'model': 'AWS Bedrock Claude 2',
67
+ 'Tatabahasa 0-shot': 61.702127659574465,
68
+ 'Tatabahasa 1-shot': 60.17191977077364,
69
+ 'Tatabahasa 3-shots': 59.598853868194844,
70
+ },
71
+ {
72
+ 'T': 'πŸ“¦',
73
+ 'model': 'gpt-4-1106-preview',
74
+ 'BM-PT3 0-shot': 51.85185185185185,
75
+ 'BM-PT3 1-shot': 66.66666666666666,
76
+ 'BM-PT3 3-shots': 55.55555555555556,
77
+ 'Tatabahasa 0-shot': 75.64469914040114,
78
+ 'Tatabahasa 1-shot': 73.63896848137536,
79
+ 'Tatabahasa 3-shots': 75.64469914040114,
80
+ },
81
+ {
82
+ 'T': 'πŸ“¦',
83
+ 'model': 'gpt-3.5-turbo-0613',
84
+ 'BM-PT3 0-shot': 36.53846153846153,
85
+ 'BM-PT3 1-shot': 28.846153846153843,
86
+ 'BM-PT3 3-shots': 24.528301886792452,
87
+ 'Tatabahasa 0-shot': 59.530791788856305,
88
+ 'Tatabahasa 1-shot': 60.80691642651297,
89
+ 'Tatabahasa 3-shots': 63.03724928366762,
90
+ },
91
+ ]
92
 
93
+ open_source = [
94
+ {
95
+ 'T': '🟒',
96
+ 'model': '[meta-llama/llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
97
+ 'Tatabahasa 0-shot': 24.355300859598856,
98
+ 'Tatabahasa 1-shot': 28.08022922636103,
99
+ 'Tatabahasa 3-shots': 24.641833810888254,
100
+ },
101
+ {
102
+ 'T': '🟒',
103
+ 'model': '[mesolitica/tinyllama-1.1b-4096-fpf](https://huggingface.co/mesolitica/tinyllama-1.1b-4096-fpf)',
104
+ 'Tatabahasa 0-shot': 23.248407643312103,
105
+ 'Tatabahasa 1-shot': 27.22063037249284,
106
+ 'Tatabahasa 3-shots': 24.355300859598856,
107
+ },
108
+ {
109
+ 'T': '🟒',
110
+ 'model': '[mesolitica/malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
111
+ 'BM-PT3 0-shot': 20.37037037037037,
112
+ 'BM-PT3 1-shot': 20.37037037037037,
113
+ 'BM-PT3 3-shots': 29.629629629629626,
114
+ 'Tatabahasa 0-shot': 17.765042979942695,
115
+ 'Tatabahasa 1-shot': 24.068767908309454,
116
+ 'Tatabahasa 3-shots': 27.507163323782237,
117
+ },
118
+ {
119
+ 'T': 'β­•',
120
+ 'model': '[mesolitica/malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-v2)',
121
+ 'BM-PT3 0-shot': 33.33333333333333,
122
+ 'BM-PT3 1-shot': 37.03703703703704,
123
+ 'BM-PT3 3-shots': 35.18518518518518,
124
+ 'Tatabahasa 0-shot': 59.31232091690545,
125
+ 'Tatabahasa 1-shot': 53.86819484240688,
126
+ 'Tatabahasa 3-shots': 45.55873925501432,
127
+ },
128
+ {
129
+ 'T': '🟒',
130
+ 'model': '[mesolitica/malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
131
+ 'BM-PT3 0-shot': 33.33333333333333,
132
+ 'BM-PT3 1-shot': 20.37037037037037,
133
+ 'BM-PT3 3-shots': 31.48148148148148,
134
+ 'Tatabahasa 0-shot': 26.07449856733524,
135
+ 'Tatabahasa 1-shot': 25.214899713467048,
136
+ 'Tatabahasa 3-shots': 24.355300859598856,
137
+ },
138
+ {
139
+ 'T': 'β­•',
140
+ 'model': '[mistralai/malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
141
+ 'BM-PT3 0-shot': 28.57142857142857,
142
+ 'BM-PT3 1-shot': 12.244897959183673,
143
+ 'BM-PT3 3-shots': 17.307692307692307,
144
+ },
145
+ {
146
+ 'T': '🟒',
147
+ 'model': '[mistralai/mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
148
+ 'Tatabahasa 0-shot': 28.939828080229223,
149
+ 'Tatabahasa 1-shot': 34.38395415472779,
150
+ 'Tatabahasa 3-shots': 32.95128939828081,
151
+ },
152
+ {
153
+ 'T': '🟒',
154
+ 'model': '[mesolitica/malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
155
+ 'BM-PT3 0-shot': 20.37037037037037,
156
+ 'BM-PT3 1-shot': 22.22222222222222,
157
+ 'BM-PT3 3-shots': 33.33333333333333,
158
+ 'Tatabahasa 0-shot': 21.48997134670487,
159
+ 'Tatabahasa 1-shot': 28.939828080229223,
160
+ 'Tatabahasa 3-shots': 24.641833810888254,
161
+ },
162
+ {
163
+ 'T': '🟒',
164
+ 'model': '[mesolitica/malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
165
+ 'BM-PT3 0-shot': 16.666666666666664,
166
+ 'BM-PT3 1-shot': 16.666666666666664,
167
+ 'BM-PT3 3-shots': 25.925925925925924,
168
+ 'Tatabahasa 0-shot': 18.624641833810887,
169
+ 'Tatabahasa 1-shot': 24.355300859598856,
170
+ 'Tatabahasa 3-shots': 28.653295128939828,
171
+ },
172
+ {
173
+ 'T': 'β­•',
174
+ 'model': '[mesolitica/malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
175
+ 'BM-PT3 0-shot': 40.74074074074074,
176
+ 'BM-PT3 1-shot': 33.33333333333333,
177
+ 'BM-PT3 3-shots': 37.03703703703704,
178
+ 'Tatabahasa 0-shot': 65.32951289398281,
179
+ 'Tatabahasa 1-shot': 57.306590257879655,
180
+ 'Tatabahasa 3-shots': 56.446991404011456,
181
+ },
182
+ {
183
+ 'T': 'β­•',
184
+ 'model': '[mesolitica/malaysian-mistral-7b-32k-instructions-v4](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
185
+ 'BM-PT3 0-shot': 35.18518518518518,
186
+ 'BM-PT3 1-shot': 31.48148148148148,
187
+ 'BM-PT3 3-shots': 33.33333333333333,
188
+ 'Tatabahasa 0-shot': 66.4756446991404,
189
+ 'Tatabahasa 1-shot': 54.15472779369628,
190
+ 'Tatabahasa 3-shots': 49.8567335243553,
191
+ },
192
+ {
193
+ 'T': '🟒',
194
+ 'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
195
+ 'BM-PT3 0-shot': 20.37037037037037,
196
+ 'BM-PT3 1-shot': 25.925925925925924,
197
+ 'BM-PT3 3-shots': 31.48148148148148,
198
+ 'Tatabahasa 0-shot': 21.776504297994272,
199
+ 'Tatabahasa 1-shot': 21.776504297994272,
200
+ 'Tatabahasa 3-shots': 24.641833810888254,
201
+ },
202
+ {
203
+ 'T': '🟒',
204
+ 'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
205
+ 'BM-PT3 0-shot': 20.37037037037037,
206
+ 'BM-PT3 1-shot': 24.074074074074073,
207
+ 'BM-PT3 3-shots': 33.33333333333333,
208
+ 'Tatabahasa 0-shot': 25.787965616045845,
209
+ 'Tatabahasa 1-shot': 27.507163323782237,
210
+ 'Tatabahasa 3-shots': 26.07449856733524,
211
+ },
212
+ {
213
+ 'T': '🟒',
214
+ 'model': '[mesolitica/mallam-1.1B-4096](https://huggingface.co/mesolitica/mallam-1.1B-4096)',
215
+ 'Tatabahasa 0-shot': 25.757575757575758,
216
+ 'Tatabahasa 1-shot': 25.787965616045845,
217
+ 'Tatabahasa 3-shots': 28.08022922636103,
218
+ },
219
+ {
220
+ 'T': '🟒',
221
+ 'model': '[mesolitica/mallam-3B-4096](https://huggingface.co/mesolitica/mallam-3B-4096)',
222
+ 'Tatabahasa 0-shot': 24.567474048442904,
223
+ 'Tatabahasa 1-shot': 24.641833810888254,
224
+ 'Tatabahasa 3-shots': 28.653295128939828,
225
+ },
226
+ {
227
+ 'T': '🟒',
228
+ 'model': '[mesolitica/mallam-5B-4096](https://huggingface.co/mesolitica/mallam-5B-4096)',
229
+ 'Tatabahasa 0-shot': 24.074074074074073,
230
+ 'Tatabahasa 1-shot': 27.793696275071632,
231
+ 'Tatabahasa 3-shots': 28.653295128939828,
232
+ },
233
+ {
234
+ 'T': '🟒',
235
+ 'model': '[sail/Sailor-0.5B](https://huggingface.co/sail/Sailor-0.5B)',
236
+ 'Tatabahasa 0-shot': 17.191977077363894,
237
+ 'Tatabahasa 1-shot': 23.78223495702006,
238
+ 'Tatabahasa 3-shots': 25.501432664756447,
239
+ },
240
+ {
241
+ 'T': '🟒',
242
+ 'model': '[sail/Sailor-1.8B](https://huggingface.co/sail/Sailor-1.8B)',
243
+ 'Tatabahasa 0-shot': 29.512893982808023,
244
+ 'Tatabahasa 1-shot': 27.507163323782237,
245
+ 'Tatabahasa 3-shots': 24.92836676217765,
246
+ },
247
+ {
248
+ 'T': '🟒',
249
+ 'model': '[sail/Sailor-4B](https://huggingface.co/sail/Sailor-4B)',
250
+ 'Tatabahasa 0-shot': 31.51862464183381,
251
+ 'Tatabahasa 1-shot': 36.10315186246418,
252
+ 'Tatabahasa 3-shots': 27.507163323782237,
253
+ },
254
+ {
255
+ 'T': '🟒',
256
+ 'model': '[sail/Sailor-7B](https://huggingface.co/sail/Sailor-7B)',
257
+ 'Tatabahasa 0-shot': 55.30085959885387,
258
+ 'Tatabahasa 1-shot': 54.72779369627507,
259
+ 'Tatabahasa 3-shots': 59.02578796561605,
260
+ },
261
+ {
262
+ 'T': '🟒',
263
+ 'model': '[mesolitica/mallam-5B-4096](https://huggingface.co/mesolitica/mallam-5B-4096)',
264
+ 'Tatabahasa 0-shot': 24.074074074074073,
265
+ 'Tatabahasa 1-shot': 27.793696275071632,
266
+ 'Tatabahasa 3-shots': 28.653295128939828,
267
+ },
268
+ {
269
+ 'T': '🟒',
270
+ 'model': '[mesolitica/gemma-2B-8192-fpf](https://huggingface.co/mesolitica/gemma-2B-8192-fpf)',
271
+ 'Tatabahasa 0-shot': 14.613180515759314,
272
+ 'Tatabahasa 1-shot': 25.501432664756447,
273
+ 'Tatabahasa 3-shots': 23.49570200573066,
274
+ },
275
+ {
276
+ 'T': '🟒',
277
+ 'model': '[mesolitica/Qwen1.5-0.5B-4096-fpf](https://huggingface.co/mesolitica/Qwen1.5-0.5B-4096-fpf)',
278
+ 'Tatabahasa 0-shot': 13.753581661891118,
279
+ 'Tatabahasa 1-shot': 21.20343839541547,
280
+ 'Tatabahasa 3-shots': 22.636103151862464,
281
+ },
282
+ {
283
+ 'T': 'β­•',
284
+ 'model': '[mesolitica/mallam-1.1b-20k-instructions](https://huggingface.co/mesolitica/mallam-1.1b-20k-instructions)',
285
+ 'Tatabahasa 0-shot': 26.923076923076923,
286
+ 'Tatabahasa 1-shot': 28.939828080229223,
287
+ 'Tatabahasa 3-shots': 21.776504297994272,
288
+ },
289
+ ]
290
 
291
+ data = pd.DataFrame(close_source + open_source)
292
 
293
  demo = gr.Blocks(css=custom_css)
294
  with demo:
295
  gr.HTML(TITLE)
296
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
297
+ gr.DataFrame(data, datatype = 'markdown')
298
 
299
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app_demo.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
+ import pandas as pd
4
+ from apscheduler.schedulers.background import BackgroundScheduler
5
+ from huggingface_hub import snapshot_download
6
+
7
+ from src.about import (
8
+ CITATION_BUTTON_LABEL,
9
+ CITATION_BUTTON_TEXT,
10
+ EVALUATION_QUEUE_TEXT,
11
+ INTRODUCTION_TEXT,
12
+ LLM_BENCHMARKS_TEXT,
13
+ TITLE,
14
+ )
15
+ from src.display.css_html_js import custom_css
16
+ from src.display.utils import (
17
+ BENCHMARK_COLS,
18
+ COLS,
19
+ EVAL_COLS,
20
+ EVAL_TYPES,
21
+ AutoEvalColumn,
22
+ ModelType,
23
+ fields,
24
+ WeightType,
25
+ Precision
26
+ )
27
+ from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
28
+ from src.populate import get_evaluation_queue_df, get_leaderboard_df
29
+ from src.submission.submit import add_new_eval
30
+
31
+
32
+ def restart_space():
33
+ API.restart_space(repo_id=REPO_ID)
34
+
35
+ ### Space initialisation
36
+ try:
37
+ print(EVAL_REQUESTS_PATH)
38
+ snapshot_download(
39
+ repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
40
+ )
41
+ except Exception:
42
+ restart_space()
43
+ try:
44
+ print(EVAL_RESULTS_PATH)
45
+ snapshot_download(
46
+ repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
47
+ )
48
+ except Exception:
49
+ restart_space()
50
+
51
+
52
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
53
+
54
+ (
55
+ finished_eval_queue_df,
56
+ running_eval_queue_df,
57
+ pending_eval_queue_df,
58
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
59
+
60
+ def init_leaderboard(dataframe):
61
+ if dataframe is None or dataframe.empty:
62
+ raise ValueError("Leaderboard DataFrame is empty or None.")
63
+ return Leaderboard(
64
+ value=dataframe,
65
+ datatype=[c.type for c in fields(AutoEvalColumn)],
66
+ select_columns=SelectColumns(
67
+ default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
68
+ cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
69
+ label="Select Columns to Display:",
70
+ ),
71
+ search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
72
+ hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
73
+ filter_columns=[
74
+ ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
75
+ ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
76
+ ColumnFilter(
77
+ AutoEvalColumn.params.name,
78
+ type="slider",
79
+ min=0.01,
80
+ max=150,
81
+ label="Select the number of parameters (B)",
82
+ ),
83
+ ColumnFilter(
84
+ AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
85
+ ),
86
+ ],
87
+ bool_checkboxgroup_label="Hide models",
88
+ interactive=False,
89
+ )
90
+
91
+
92
+ demo = gr.Blocks(css=custom_css)
93
+ with demo:
94
+ gr.HTML(TITLE)
95
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
+
97
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
+ with gr.TabItem("πŸ… LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
+ leaderboard = init_leaderboard(LEADERBOARD_DF)
100
+
101
+ with gr.TabItem("πŸ“ About", elem_id="llm-benchmark-tab-table", id=2):
102
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
+
104
+ with gr.TabItem("πŸš€ Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
+ with gr.Column():
106
+ with gr.Row():
107
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
+
109
+ with gr.Column():
110
+ with gr.Accordion(
111
+ f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
112
+ open=False,
113
+ ):
114
+ with gr.Row():
115
+ finished_eval_table = gr.components.Dataframe(
116
+ value=finished_eval_queue_df,
117
+ headers=EVAL_COLS,
118
+ datatype=EVAL_TYPES,
119
+ row_count=5,
120
+ )
121
+ with gr.Accordion(
122
+ f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
123
+ open=False,
124
+ ):
125
+ with gr.Row():
126
+ running_eval_table = gr.components.Dataframe(
127
+ value=running_eval_queue_df,
128
+ headers=EVAL_COLS,
129
+ datatype=EVAL_TYPES,
130
+ row_count=5,
131
+ )
132
+
133
+ with gr.Accordion(
134
+ f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
+ open=False,
136
+ ):
137
+ with gr.Row():
138
+ pending_eval_table = gr.components.Dataframe(
139
+ value=pending_eval_queue_df,
140
+ headers=EVAL_COLS,
141
+ datatype=EVAL_TYPES,
142
+ row_count=5,
143
+ )
144
+ with gr.Row():
145
+ gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text")
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ model_name_textbox = gr.Textbox(label="Model name")
150
+ revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
+ model_type = gr.Dropdown(
152
+ choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
+ label="Model type",
154
+ multiselect=False,
155
+ value=None,
156
+ interactive=True,
157
+ )
158
+
159
+ with gr.Column():
160
+ precision = gr.Dropdown(
161
+ choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
+ label="Precision",
163
+ multiselect=False,
164
+ value="float16",
165
+ interactive=True,
166
+ )
167
+ weight_type = gr.Dropdown(
168
+ choices=[i.value.name for i in WeightType],
169
+ label="Weights type",
170
+ multiselect=False,
171
+ value="Original",
172
+ interactive=True,
173
+ )
174
+ base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
+
176
+ submit_button = gr.Button("Submit Eval")
177
+ submission_result = gr.Markdown()
178
+ submit_button.click(
179
+ add_new_eval,
180
+ [
181
+ model_name_textbox,
182
+ base_model_name_textbox,
183
+ revision_name_textbox,
184
+ precision,
185
+ weight_type,
186
+ model_type,
187
+ ],
188
+ submission_result,
189
+ )
190
+
191
+ with gr.Row():
192
+ with gr.Accordion("πŸ“™ Citation", open=False):
193
+ citation_button = gr.Textbox(
194
+ value=CITATION_BUTTON_TEXT,
195
+ label=CITATION_BUTTON_LABEL,
196
+ lines=20,
197
+ elem_id="citation-button",
198
+ show_copy_button=True,
199
+ )
200
+
201
+ scheduler = BackgroundScheduler()
202
+ scheduler.add_job(restart_space, "interval", seconds=1800)
203
+ scheduler.start()
204
+ demo.queue(default_concurrency_limit=40).launch()
src/about.py CHANGED
@@ -21,7 +21,7 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">Thai Sentence Embedding Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """