Upload 4 files
- app.py +58 -194
- envs.py +25 -0
- leaderboard_data.jsonl +11 -0
- utils.py +268 -0
app.py
CHANGED
@@ -1,204 +1,68 @@
Previous version (replaced by this commit; lines elided in the diff view are restored from the stock Hugging Face leaderboard template that this Space was forked from):

import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    COLS,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    ModelType,
    fields,
    WeightType,
    Precision
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space():
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
try:
    print(EVAL_REQUESTS_PATH)
    snapshot_download(
        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()
try:
    print(EVAL_RESULTS_PATH)
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
    )
except Exception:
    restart_space()


LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)

(
    finished_eval_queue_df,
    running_eval_queue_df,
    pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)


def init_leaderboard(dataframe):
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(
        value=dataframe,
        datatype=[c.type for c in fields(AutoEvalColumn)],
        select_columns=SelectColumns(
            default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
            cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
            label="Select Columns to Display:",
        ),
        search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
        hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
        filter_columns=[
            ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
            ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
            ColumnFilter(
                AutoEvalColumn.params.name,
                type="slider",
                min=0.01,
                max=150,
                label="Select the number of parameters (B)",
            ),
            ColumnFilter(
                AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
            ),
        ],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )


demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

                with gr.Column():
                    with gr.Accordion(
                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            finished_eval_table = gr.components.Dataframe(
                                value=finished_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
                    with gr.Accordion(
                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            running_eval_table = gr.components.Dataframe(
                                value=running_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )

                    with gr.Accordion(
                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
                        open=False,
                    ):
                        with gr.Row():
                            pending_eval_table = gr.components.Dataframe(
                                value=pending_eval_queue_df,
                                headers=EVAL_COLS,
                                datatype=EVAL_TYPES,
                                row_count=5,
                            )
            with gr.Row():
                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")

            with gr.Row():
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name")
                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                    model_type = gr.Dropdown(
                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                        label="Model type",
                        multiselect=False,
                        value=None,
                        interactive=True,
                    )

                with gr.Column():
                    precision = gr.Dropdown(
                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                        label="Precision",
                        multiselect=False,
                        value="float16",
                        interactive=True,
                    )
                    weight_type = gr.Dropdown(
                        choices=[i.value.name for i in WeightType],
                        label="Weights type",
                        multiselect=False,
                        value="Original",
                        interactive=True,
                    )
                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")

            submit_button = gr.Button("Submit Eval")
            submission_result = gr.Markdown()
            submit_button.click(
                add_new_eval,
                [
                    model_name_textbox,
                    base_model_name_textbox,
                    revision_name_textbox,
                    precision,
                    weight_type,
                    model_type,
                ],
                submission_result,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.queue(default_concurrency_limit=40).launch()
New version:

import gradio as gr
from gradio_leaderboard import Leaderboard, SelectColumns, ColumnFilter
from pathlib import Path

from utils import LLM_BENCHMARKS_ABOUT_TEXT, LLM_BENCHMARKS_SUBMIT_TEXT, custom_css, jsonl_to_dataframe, add_average_column_to_df, apply_markdown_format_for_columns, submit, PART_LOGO, sort_dataframe_by_column


abs_path = Path(__file__).parent

# Any pandas-compatible data
leaderboard_df = jsonl_to_dataframe(str(abs_path / "leaderboard_data.jsonl"))

average_column_name = "Average Accuracy"

all_columns = ["Model", average_column_name, "Precision", "#Params (B)", "MMLU", "GSM8K", "TruthfulQA", "Winogrande", "ARC Easy", "Hellaswag", "Belebele"]
columns_to_average = ["MMLU", "GSM8K", "TruthfulQA", "Winogrande", "ARC Easy", "Hellaswag", "Belebele"]

# Add the per-model average over the seven benchmarks, make model names clickable, and sort by the average
leaderboard_df = add_average_column_to_df(leaderboard_df, columns_to_average, index=3, average_column_name=average_column_name)
leaderboard_df = apply_markdown_format_for_columns(df=leaderboard_df, model_column_name="Model")
leaderboard_df = sort_dataframe_by_column(leaderboard_df, column_name=average_column_name)

columns_data_type = ["markdown" for i in range(len(leaderboard_df.columns))]
# "str", "number", "bool", "date", "markdown"
# columns_data_type[0] = "markdown"

NUM_MODELS = len(leaderboard_df)

with gr.Blocks(css=custom_css) as demo:
    gr.Markdown("""
    # Open Lithuanian LLM Leaderboard
    """)

    gr.Markdown(f"""
    - **Total Models**: {NUM_MODELS}
    """)

    with gr.Tab("🎖️ Lithuanian Leaderboard"):
        Leaderboard(
            value=leaderboard_df,
            datatype=columns_data_type,
            select_columns=SelectColumns(
                default_selection=all_columns,
                cant_deselect=["Model"],
                label="Select Columns to Show",
            ),
            search_columns=["model_name_for_query"],
            hide_columns=["model_name_for_query",],
            filter_columns=["Precision", "#Params (B)"],
        )
    with gr.TabItem("📝 About"):
        gr.Markdown(LLM_BENCHMARKS_ABOUT_TEXT)

    with gr.Tab("✉️ Submit"):
        gr.Markdown(LLM_BENCHMARKS_SUBMIT_TEXT)
        model_name = gr.Textbox(label="Model name")
        model_id = gr.Textbox(label="Model id (username/model), e.g. neurotechnology/Lt-Llama-2-7b-hf")
        contact_email = gr.Textbox(label="Contact E-Mail")
        submit_btn = gr.Button("Submit")

        submit_btn.click(submit, inputs=[model_name, model_id, contact_email], outputs=[])

    gr.Markdown("""
    Please find more information about Neurotechnology on [www.neurotechnology.com](https://www.neurotechnology.com/natural-language-processing.html)""")

if __name__ == "__main__":
    demo.launch()
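For reference, the data-preparation chain above can be exercised without launching the UI. This is a minimal sketch, assuming the Space's dependencies (gradio, gradio_leaderboard, pandas, huggingface_hub) are installed and the script is run from the repository root next to utils.py and leaderboard_data.jsonl:

# Sketch: build the leaderboard table the same way app.py does, without starting Gradio.
from utils import (
    jsonl_to_dataframe,
    add_average_column_to_df,
    sort_dataframe_by_column,
)

df = jsonl_to_dataframe("leaderboard_data.jsonl")
df = add_average_column_to_df(
    df,
    ["MMLU", "GSM8K", "TruthfulQA", "Winogrande", "ARC Easy", "Hellaswag", "Belebele"],
    index=3,
    average_column_name="Average Accuracy",
)
df = sort_dataframe_by_column(df, column_name="Average Accuracy")
print(df[["Model", "Average Accuracy"]].head())  # inspect the ranking outside the UI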
envs.py
ADDED
@@ -0,0 +1,25 @@
import os

from huggingface_hub import HfApi

# Info to change for your repository
# ----------------------------------
TOKEN = os.environ.get("HF_TOKEN")  # A read/write token for your org

OWNER = "neurotechnology"  # Change to your org - don't forget to create a results and request dataset, with the correct format!
# ----------------------------------

REPO_ID = f"{OWNER}/open-lithuanian-llm-leaderboard"
QUEUE_REPO = f"{OWNER}/leaderboard-requests"
RESULTS_REPO = f"{OWNER}/leaderboard-results"

# If you set up a cache later, just change HF_HOME
CACHE_PATH = os.getenv("HF_HOME", ".")

# Local caches
EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

API = HfApi(token=TOKEN)
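A quick local sanity check of this configuration is possible. The following is a minimal sketch, assuming envs.py is importable from the working directory; HF_TOKEN only needs to be exported when files are actually uploaded:

# Sketch: verify the repo names and local cache paths that envs.py resolves.
import os
from envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH

print(REPO_ID)      # neurotechnology/open-lithuanian-llm-leaderboard
print(QUEUE_REPO)   # neurotechnology/leaderboard-requests
print(RESULTS_REPO) # neurotechnology/leaderboard-results

# submit() in utils.py writes request files under EVAL_REQUESTS_PATH before uploading them.
os.makedirs(EVAL_REQUESTS_PATH, exist_ok=True)
os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)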
leaderboard_data.jsonl
ADDED
@@ -0,0 +1,11 @@
{"Model":"meta-llama/Llama-3.2-1B", "Precision": "bfloat16", "#Params (B)": 1.24, "MMLU": 25.85, "GSM8K": 1.82, "ARC Easy": 29.08, "Winogrande": 49.80, "TruthfulQA": 38.75, "Hellaswag": 28.53, "Belebele": 27.33, "Hub License": "llama3.2", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.2-1B"}
{"Model":"neurotechnology/Lt-Llama-2-7b-hf", "Precision": "bfloat16", "#Params (B)": 6.9, "MMLU": 26.01, "GSM8K": 0.0, "ARC Easy": 43.18, "Winogrande": 53.67, "TruthfulQA": 41.38, "Hellaswag": 33.17, "Belebele": 27.23, "Hub License": "llama2", "Model sha": "main", "model_name_for_query": "neurotechnology/Lt-Llama-2-7b-hf"}
{"Model":"neurotechnology/Lt-Llama-2-13b-hf", "Precision": "bfloat16", "#Params (B)": 12.7, "MMLU": 26.44, "GSM8K": 0.45, "ARC Easy": 54.5, "Winogrande": 61.72, "TruthfulQA": 35.23, "Hellaswag": 40.61, "Belebele": 27.67, "Hub License": "llama2", "Model sha": "main", "model_name_for_query": "neurotechnology/Lt-Llama-2-13b-hf"}
{"Model":"meta-llama/Llama-3.2-3B", "Precision": "bfloat16", "#Params (B)": 3.21, "MMLU": 36.41, "GSM8K": 13.04, "ARC Easy": 39.39, "Winogrande": 51.85, "TruthfulQA": 38.87, "Hellaswag": 31.51, "Belebele": 46.22, "Hub License": "llama3.2", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.2-3B"}
{"Model":"google/gemma-2-2b", "Precision": "bfloat16", "#Params (B)": 2.61, "MMLU": 35.84, "GSM8K": 3.64, "ARC Easy": 45.45, "Winogrande": 51.85, "TruthfulQA": 54.78, "Hellaswag": 34.80, "Belebele": 52.44, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-2b"}
{"Model":"meta-llama/Llama-3.1-8B", "Precision": "bfloat16", "#Params (B)": 8.03, "MMLU": 44.86, "GSM8K": 30.17, "ARC Easy": 48.65, "Winogrande": 54.22, "TruthfulQA": 37.61, "Hellaswag": 35.19, "Belebele": 67.56, "Hub License": "llama3.1", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.1-8B"}
{"Model":"utter-project/EuroLLM-9B", "Precision": "bfloat16", "#Params (B)": 9.15, "MMLU": 51.95, "GSM8K": 31.16, "ARC Easy": 71.55, "Winogrande": 64.17, "TruthfulQA": 42.13, "Hellaswag": 46.32, "Belebele": 69.44, "Hub License": "eurollm", "Model sha": "main", "model_name_for_query": "utter-project/EuroLLM-9B"}
{"Model":"google/gemma-2-9b", "Precision": "bfloat16", "#Params (B)": 9.24, "MMLU": 60.09, "GSM8K": 25.78, "ARC Easy": 68.31, "Winogrande": 65.15, "TruthfulQA": 39.69, "Hellaswag": 45.32, "Belebele": 86.78, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-9b"}
{"Model":"meta-llama/Llama-3.1-70B", "Precision": "bfloat16", "#Params (B)": 70, "MMLU": 67.50, "GSM8K": 72.40, "ARC Easy": 70.92, "Winogrande": 64.01, "TruthfulQA": 43.59, "Hellaswag": 46.39, "Belebele": 90.02, "Hub License": "llama3.1", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.1-70B"}
{"Model":"google/gemma-2-27b", "Precision": "bfloat16", "#Params (B)": 27.2, "MMLU": 64.82, "GSM8K": 68.69, "ARC Easy": 77.40, "Winogrande": 66.77, "TruthfulQA": 42.06, "Hellaswag": 50.82, "Belebele": 89.22, "Hub License": "gemma", "Model sha": "main", "model_name_for_query": "google/gemma-2-27b"}
{"Model":"meta-llama/Llama-3.3-70B", "Precision": "bfloat16", "#Params (B)": 70, "MMLU": 71.46, "GSM8K": 80.97, "ARC Easy": 70.66, "Winogrande": 59.83, "TruthfulQA": 45.61, "Hellaswag": 46.05, "Belebele": 89.33, "Hub License": "llama3.3", "Model sha": "main", "model_name_for_query": "meta-llama/Llama-3.3-70B"}
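Each record stores the seven per-task scores; the "Average Accuracy" column shown on the leaderboard is not stored here but derived at load time by add_average_column_to_df in utils.py. A minimal sketch of that arithmetic for the first record, assuming the file is read from the repository root:

# Sketch: recompute the leaderboard's "Average Accuracy" for one JSONL record.
import json

tasks = ["MMLU", "GSM8K", "TruthfulQA", "Winogrande", "ARC Easy", "Hellaswag", "Belebele"]
with open("leaderboard_data.jsonl") as f:
    row = json.loads(f.readline())

average = sum(row[t] for t in tasks) / len(tasks)
print(row["Model"], round(average, 2))  # meta-llama/Llama-3.2-1B 28.74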
utils.py
ADDED
@@ -0,0 +1,268 @@
import json
import os
from datetime import datetime

import gradio as gr
import pandas as pd

from envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO

custom_css = """
@import url('https://fonts.googleapis.com/css2?family=Vazirmatn&display=swap');
body, .gradio-container, .gr-button, .gr-input, .gr-slider, .gr-dropdown, .gr-markdown {
    font-family: 'Vazirmatn', sans-serif !important;
}
.markdown-text {
    font-size: 16px !important;
}
#models-to-add-text {
    font-size: 18px !important;
}
#citation-button span {
    font-size: 16px !important;
}
#citation-button textarea {
    font-size: 16px !important;
}
#citation-button > label > button {
    margin: 6px;
    transform: scale(1.3);
}
#leaderboard-table {
    margin-top: 15px;
    text-align: center;
}
#leaderboard-table,
#leaderboard-table th,
#leaderboard-table td {
    text-align: center;
    vertical-align: middle;
    border-collapse: collapse;
}
#leaderboard-table td:first-child,
#leaderboard-table th:first-child {
    text-align: left;
    max-width: 500px;
}
table > thead {
    white-space: normal;
}
table > thead th,
table > tbody td {
    text-align: center;
    vertical-align: middle;
}
table > tbody td:first-child {
    text-align: left;
    max-width: 500px;
}
#leaderboard-table-lite {
    margin-top: 15px;
}
#search-bar-table-box > div:first-child {
    background: none;
    border: none;
}
#search-bar {
    padding: 0px;
}
.tab-buttons button {
    font-size: 20px;
}
#scale-logo {
    border-style: none !important;
    box-shadow: none;
    display: block;
    margin-left: auto;
    margin-right: auto;
    max-width: 600px;
}
#scale-logo .download {
    display: none;
}
#filter_type {
    border: 0;
    padding-left: 0;
    padding-top: 0;
}
#filter_type label {
    display: flex;
}
#filter_type label > span {
    margin-top: var(--spacing-lg);
    margin-right: 0.5em;
}
#filter_type label > .wrap {
    width: 103px;
}
#filter_type label > .wrap .wrap-inner {
    padding: 2px;
}
#filter_type label > .wrap .wrap-inner input {
    width: 1px;
}
#filter-columns-type {
    border: 0;
    padding: 0.5;
}
#filter-columns-size {
    border: 0;
    padding: 0.5;
}
#box-filter > .form {
    border: 0;
}
"""

LLM_BENCHMARKS_ABOUT_TEXT = f"""## Open Lithuanian LLM Leaderboard (v1.0.1)
> The Open Lithuanian LLM Evaluation Leaderboard, developed by **Part DP AI** in collaboration with **Vilnius University NLP Lab**, provides a comprehensive benchmarking system specifically designed for Lithuanian LLMs. This leaderboard, based on the open-source [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness), offers a unique platform for evaluating the performance of large language models (LLMs) on tasks that demand linguistic proficiency and technical skill in Lithuanian.
> **Note:** This leaderboard is continuously updating its data and models, reflecting the latest developments in Lithuanian LLMs. It is currently in version 1.0.0, serving as the initial benchmark for Lithuanian LLM evaluation, with plans for future enhancements.
## 1. Key Features
> 1. **Open Evaluation Access**
> The leaderboard allows open participation, meaning that developers and researchers working with open-source models can submit evaluation requests for their models. This accessibility encourages the development and testing of Lithuanian LLMs within the broader AI ecosystem.
>
> 2. **Task Diversity**
> Seven specialized tasks have been curated for this leaderboard, each tailored to challenge different aspects of a model’s capabilities. These tasks include:
> - **MMLU**
> - **GSM8K**
> - **TruthfulQA**
> - **ARC Easy**
> - **Winogrande**
> - **Hellaswag**
> - **Belebele**
>
> Each dataset is available in Lithuanian, providing a robust testing ground for models in a non-English setting. The datasets collectively contain over **40k samples** across various categories such as **Common Knowledge**, **Reasoning**, **Summarization**, **Math**, and **Specialized Examinations**, offering comprehensive coverage of diverse linguistic and technical challenges.
>
> 3. **Open-Source Dataset Sample**
> A sample of the evaluation dataset is hosted on [Hugging Face Datasets](https://huggingface.co/datasets/PartAI/llm-leaderboard-datasets-sample), offering the AI community a glimpse of the benchmark content and format. This sample allows developers to pre-assess their models against representative data before a full leaderboard evaluation.
>
> 4. **Collaborative Development**
>
> This leaderboard is developed by [**Neurotechnology**](https://huggingface.co/neurotechnology) and authored by [**Artūras Nakvosas**](https://huggingface.co/artena), leveraging cutting-edge industrial expertise to create a high-quality, open benchmarking tool. The project underscores a commitment to advancing Lithuanian LLMs through innovative solutions and fostering the growth of the local AI ecosystem.
>
> 5. **Comprehensive Evaluation Pipeline**
> By integrating a standardized evaluation pipeline, models are assessed across a variety of data types, including text, mathematical formulas, and numerical data. This multi-faceted approach enhances the evaluation’s reliability and allows for precise, nuanced assessment of model performance across multiple dimensions.
## 2. Background and Goals
> Recent months have seen a notable increase in the development of Lithuanian LLMs by research centers and AI companies in Lithuania. However, the lack of reliable, standardized benchmarks for Lithuanian LLMs has made it challenging to evaluate model quality comprehensively. Global benchmarks typically do not support Lithuanian, resulting in skewed or unreliable results for Lithuanian LLMs.
>
> This leaderboard addresses this gap by providing a locally-focused, transparent system that enables consistent, fair comparisons of Lithuanian LLMs. It is expected to be a valuable tool for Lithuanian-speaking businesses and developers, allowing them to select models best suited to their needs. Researchers and model developers also benefit from the competitive environment, with opportunities to showcase and improve their models based on benchmark rankings.
## 3. Data Privacy and Integrity
> To maintain evaluation integrity and prevent overfitting or data leakage, only part of the benchmark dataset is openly available. This limited access approach upholds model evaluation reliability, ensuring that results are genuinely representative of each model’s capabilities across unseen data.
>
> The leaderboard represents a significant milestone in Lithuanian LLMs and is positioned to become the leading standard for LLM evaluation in the Lithuanian-speaking world.
"""


LLM_BENCHMARKS_SUBMIT_TEXT = """### Submitting a Model for Evaluation

> To submit your open-source model for evaluation, follow these steps:
>
> 1. **Ensure your model is on Hugging Face**: Your model must be publicly available on [Hugging Face](https://huggingface.co/).
>
> 2. **Submit Request**: Send a request with your model's Hugging Face identifier via the provided submission form or email.
>
> 3. **Manual Queue**: Please note that the evaluation process is currently handled manually. Submissions will be queued and processed in the order received.
>
> 4. **Results**: Once the evaluation is complete, your model’s results will be updated on the leaderboard and shared with you.
>
> We appreciate your patience and contributions to the Lithuanian LLM ecosystem!
"""


PART_LOGO = """
<img src="https://avatars.githubusercontent.com/u/39557177?v=4" style="width:30%;display:block;margin-left:auto;margin-right:auto">
<h1 style="font-size: 28px; margin-bottom: 2px;">Part DP AI</h1>
"""


def load_jsonl(input_file):
    data = []
    with open(input_file, 'r') as f:
        for line in f:
            data.append(json.loads(line))
    return data


def jsonl_to_dataframe(input_file):
    data = load_jsonl(input_file)
    return pd.DataFrame(data)


def sort_dataframe_by_column(df, column_name):
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")
    return df.sort_values(by=column_name, ascending=False).reset_index(drop=True)


def add_average_column_to_df(df, columns_to_average, index=3, average_column_name="Average Accuracy"):
    average_column = df[columns_to_average].mean(axis=1)
    df.insert(index, average_column_name, average_column)
    return df


def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'


def make_clickable_model(model_name):
    link = f"https://huggingface.co/{model_name}"
    return model_hyperlink(link, model_name)


def center_align_markdown(text):
    return f'<div align="center">{text}</div>'


def apply_markdown_format_for_columns(df, model_column_name):
    columns = list(df.columns)
    df[model_column_name] = df[model_column_name].apply(make_clickable_model)
    # for column in columns:
    #     if column != model_column_name:
    #         df[column] = df[column].apply(center_align_markdown)
    return df


def submit(model_name, model_id, contact_email):
    if model_name == "" or model_id == "" or contact_email == "":
        gr.Info("Please fill all the fields")
        return
    if "/" not in model_id:
        gr.Info("Please use the full model id, e.g. username/model-name")
        return

    try:
        user_name = model_id.split("/")[0]
        model_path = model_id.split("/")[1]

        eval_entry = {
            "model_name": model_name,
            "model_id": model_id,
            "contact_email": contact_email,
        }

        # Get the current timestamp to add to the filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
        os.makedirs(OUT_DIR, exist_ok=True)

        # Add the timestamp to the filename
        out_path = f"{OUT_DIR}/{user_name}_{model_path}_{timestamp}.json"

        with open(out_path, "w") as f:
            f.write(json.dumps(eval_entry))

        print("Uploading eval file")
        API.upload_file(
            path_or_fileobj=out_path,
            path_in_repo=out_path.split("eval-queue/")[1],
            repo_id=QUEUE_REPO,
            repo_type="dataset",
            commit_message=f"Add {model_name} to eval queue",
        )

        gr.Info("Successfully submitted", duration=10)
        # Remove the local file
        os.remove(out_path)
    except Exception as e:
        # gr.Error must be raised to surface the message in the UI
        raise gr.Error(f"Error submitting the model: {e}")
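As a usage note, make_clickable_model is what turns the plain model id in the "Model" column into a Hub link before the table is rendered as markdown. A minimal sketch of the markup it produces, assuming utils.py is importable from the working directory:

# Sketch: inspect the anchor markup generated for one model id.
from utils import make_clickable_model

html = make_clickable_model("neurotechnology/Lt-Llama-2-7b-hf")
print(html)
# <a target="_blank" href="https://huggingface.co/neurotechnology/Lt-Llama-2-7b-hf" style="...">neurotechnology/Lt-Llama-2-7b-hf</a>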