Add a baseline
app.py CHANGED
@@ -1,13 +1,11 @@
 import os
-import shutil
 import numpy as np
 import gradio as gr
 from huggingface_hub import Repository, HfApi
 from transformers import AutoConfig
 import json
-from apscheduler.schedulers.background import BackgroundScheduler
 import pandas as pd
-import
+from content import CHANGELOG_TEXT
 from utils import get_eval_results_dicts, make_clickable_model
 
 # clone / pull the lmeh eval data
@@ -140,6 +138,19 @@ def get_leaderboard():
     }
     all_data.append(gpt35_values)
 
+    base_line = {
+        "Model": '<p>Baseline</p>',
+        "Revision": "N/A",
+        "8bit": None,
+        "Average ⬆️": 25.0,
+        "ARC (25-shot) ⬆️": 25.0,
+        "HellaSwag (10-shot) ⬆️": 25.0,
+        "MMLU (5-shot) ⬆️": 25.0,
+        "TruthfulQA (0-shot) ⬆️": 25.0,
+    }
+
+    all_data.append(base_line)
+
     df = pd.DataFrame.from_records(all_data)
     df = df.sort_values(by=["Average ⬆️"], ascending=False)
     df = df[COLS]
@@ -323,7 +334,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
 
         """
     )
-    with gr.Accordion("Finished Evaluations", open=False):
+    with gr.Accordion("✅ Finished Evaluations", open=False):
         with gr.Row():
             finished_eval_table = gr.components.Dataframe(
                 value=finished_eval_queue,
@@ -331,7 +342,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
                 datatype=EVAL_TYPES,
                 max_rows=5,
             )
-    with gr.Accordion("Running Evaluation Queue", open=False):
+    with gr.Accordion("🔄 Running Evaluation Queue", open=False):
         with gr.Row():
             running_eval_table = gr.components.Dataframe(
                 value=running_eval_queue,
@@ -340,7 +351,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
                 max_rows=5,
             )
 
-    with gr.Accordion("Pending Evaluation Queue", open=False):
+    with gr.Accordion("⏳ Pending Evaluation Queue", open=False):
         with gr.Row():
             pending_eval_table = gr.components.Dataframe(
                 value=pending_eval_queue,
@@ -378,6 +389,7 @@ We chose these benchmarks as they test a variety of reasoning and general knowle
 
     with gr.Row():
         submit_button = gr.Button("Submit Eval")
+
     with gr.Row():
         submission_result = gr.Markdown()
     submit_button.click(
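
For context, a minimal, standalone sketch of how the new baseline row flows into the leaderboard table. The column names and the base_line values come from the diff; the example model record, the random-guess interpretation of the 25.0 scores, and the final print are illustrative assumptions rather than code from the app.

import pandas as pd

# Illustrative stand-ins for the per-model records that get_leaderboard() builds
# from the evaluation results; only the column names are taken from the diff.
all_data = [
    {
        "Model": "some-model",
        "Revision": "main",
        "8bit": False,
        "Average ⬆️": 48.2,
        "ARC (25-shot) ⬆️": 50.0,
        "HellaSwag (10-shot) ⬆️": 60.1,
        "MMLU (5-shot) ⬆️": 40.3,
        "TruthfulQA (0-shot) ⬆️": 42.4,
    },
]

# The row added in this commit: 25.0 everywhere, presumably a random-guess
# reference point (most of these benchmarks are four-way multiple choice).
base_line = {
    "Model": "<p>Baseline</p>",
    "Revision": "N/A",
    "8bit": None,
    "Average ⬆️": 25.0,
    "ARC (25-shot) ⬆️": 25.0,
    "HellaSwag (10-shot) ⬆️": 25.0,
    "MMLU (5-shot) ⬆️": 25.0,
    "TruthfulQA (0-shot) ⬆️": 25.0,
}
all_data.append(base_line)

# Same steps as the diff: build the frame and sort by the average, descending,
# so the baseline sits below every model that beats chance.
df = pd.DataFrame.from_records(all_data)
df = df.sort_values(by=["Average ⬆️"], ascending=False)
print(df[["Model", "Average ⬆️"]])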
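
The accordion hunks only rename the section titles with status emojis, but for completeness here is a self-contained sketch of the Gradio 3.x pattern those blocks use (max_rows matches the argument in the diff): an Accordion wrapping a Row that holds a Dataframe, plus a submit Button wired to a Markdown output. The queue contents, the model_name_textbox input, and the add_new_eval handler are placeholders, not the app's real logic.

import gradio as gr
import pandas as pd

# Placeholder queue contents; the real app loads these from its eval-requests data.
finished_eval_queue = pd.DataFrame({"model": ["model-a"], "status": ["FINISHED"]})

def add_new_eval(model_name):
    # Placeholder handler; the real app validates the model and files an eval request.
    return f"Queued {model_name} for evaluation."

demo = gr.Blocks()
with demo:
    with gr.Accordion("✅ Finished Evaluations", open=False):
        with gr.Row():
            finished_eval_table = gr.components.Dataframe(
                value=finished_eval_queue, max_rows=5
            )
    with gr.Row():
        model_name_textbox = gr.Textbox(label="Model name")
    with gr.Row():
        submit_button = gr.Button("Submit Eval")
    with gr.Row():
        submission_result = gr.Markdown()
    submit_button.click(add_new_eval, [model_name_textbox], submission_result)

demo.launch()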