Spaces:

babylm
/

leaderboard-2024

Running

leaderboard-2024

File size: 4,806 Bytes

6a3b9c1
 
63cb7f9
 
 
 
 
6a3b9c1
63cb7f9
 
 
 
 
 
 
 
 
 
 
 
de60bd6
63cb7f9
4d561ee
63cb7f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d561ee
63cb7f9
 
 
 
 
 
 
de60bd6
63cb7f9
 
de60bd6
938818f
63cb7f9
 
 
 
 
 
 
 
80e4e0d
63cb7f9
 
 
 
 
6a3b9c1
e7e9a2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31b9f7c
 
6a3b9c1
63cb7f9
 
 
 
 
 
 
6a3b9c1
de60bd6
6a3b9c1
de60bd6
6a3b9c1
de60bd6
 
 
63cb7f9
eebfa96
 
 
 
 
a03986e
63cb7f9
 
 
 
 
 
 
 
 
 
 
 
 
de60bd6

import json
import gzip
import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
from io import StringIO

from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT,
    TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    BENCHMARK_COLS,
    BENCHMARK_COLS_MULTIMODAL,
    COLS,
    COLS_MULTIMODAL,
    EVAL_COLS,
    EVAL_TYPES,
    AutoEvalColumn,
    fields,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval


def restart_space() -> None:
    """Ask the Hub API client to restart the Space identified by ``REPO_ID``."""
    API.restart_space(repo_id=REPO_ID)

### Space initialisation
# Download a snapshot of the queue dataset and then the results dataset into
# their local directories. If anything goes wrong while handling one of them,
# restart_space() is called and the loop moves on to the next one.
for _repo_id, _local_dir in (
    (QUEUE_REPO, EVAL_REQUESTS_PATH),
    (RESULTS_REPO, EVAL_RESULTS_PATH),
):
    try:
        print(_local_dir)
        snapshot_download(
            repo_id=_repo_id,
            local_dir=_local_dir,
            repo_type="dataset",
            tqdm_class=None,
            etag_timeout=30,
            token=TOKEN,
        )
    except Exception:
        restart_space()


# Two leaderboard tables are built from the same results/requests directories:
# one with the default column set and one with the multimodal column set.
LEADERBOARD_DF = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS,
    BENCHMARK_COLS,
)
LEADERBOARD_DF_MULTIMODAL = get_leaderboard_df(
    EVAL_RESULTS_PATH,
    EVAL_REQUESTS_PATH,
    COLS_MULTIMODAL,
    BENCHMARK_COLS_MULTIMODAL,
)

# The queue helper returns three values, unpacked here in the order given.
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(
    EVAL_REQUESTS_PATH, EVAL_COLS
)

def init_leaderboard(dataframe, track):
    """Build the ``Leaderboard`` component for a single track.

    Args:
        dataframe: Leaderboard data; must contain a ``"Track"`` column.
        track: Value of the ``"Track"`` column whose rows are displayed.

    Returns:
        A non-interactive ``Leaderboard`` whose value is the subset of
        ``dataframe`` rows belonging to ``track``.

    Raises:
        ValueError: If ``dataframe`` is ``None`` or empty. The check is made
            on the unfiltered data, before rows are narrowed down to ``track``.
    """
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")

    # Keep only the rows for the requested track.
    track_rows = dataframe.loc[dataframe["Track"] == track]

    # Column metadata drives the datatypes and the show/hide configuration.
    columns = fields(AutoEvalColumn)
    shown_by_default = [col.name for col in columns if col.displayed_by_default]
    always_visible = [col.name for col in columns if col.never_hidden]
    column_picker = SelectColumns(
        default_selection=shown_by_default,
        cant_deselect=always_visible,
        label="Select Columns to Display:",
    )

    return Leaderboard(
        value=track_rows,
        datatype=[col.type for col in columns],
        select_columns=column_picker,
        search_columns=[AutoEvalColumn.model.name],
        hide_columns=[col.name for col in columns if col.hidden],
        bool_checkboxgroup_label="Hide models",
        interactive=False,
    )

def process_json(temp_file):
    """Load an uploaded JSON file, decompressing it first if it is gzipped.

    Args:
        temp_file: The upload to read. Either a filesystem path given as a
            ``str`` or an object exposing the path through ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        The parsed JSON content, or ``{}`` when ``temp_file`` is ``None``.

    Raises:
        gr.Error: If the file cannot be opened, decompressed, decoded or
            parsed; the original exception is attached as the cause.
    """
    if temp_file is None:
        return {}

    try:
        # Accept both a plain path string and a file-like object with `.name`.
        file_path = temp_file if isinstance(temp_file, str) else temp_file.name
        # A ".gz" suffix selects gzip; both openers are used in text mode.
        opener = gzip.open if file_path.endswith(".gz") else open
        # JSON is read as UTF-8 explicitly rather than the platform default.
        with opener(file_path, "rt", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        raise gr.Error(f"Error processing file: {str(e)}") from e

    # gr.Info shows a notification to the user; instantiating gr.Markdown
    # inside a handler would create a component that is never rendered.
    gr.Info("Upload successful!")
    return data


# One entry per leaderboard tab, in display order:
# (tab label, elem_id, track name, source dataframe).
# The position of an entry in this tuple is used as the tab id.
_TRACK_TABS = (
    ("Strict", "strict-benchmark-tab-table", "strict", LEADERBOARD_DF),
    ("Strict-small", "strict-small-benchmark-tab-table", "strict-small", LEADERBOARD_DF),
    ("Multimodal", "multimodal-benchmark-tab-table", "multimodal", LEADERBOARD_DF_MULTIMODAL),
)

with gr.Blocks(css=custom_css) as demo:
    # Page header.
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Leaderboard tabs, one per track.
        for _tab_id, (_tab_label, _tab_elem_id, _track, _frame) in enumerate(_TRACK_TABS):
            with gr.TabItem(_tab_label, elem_id=_tab_elem_id, id=_tab_id):
                leaderboard = init_leaderboard(_frame, _track)

        # Static informational tabs.
        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")

        with gr.TabItem("👶 Submit", elem_id="llm-benchmark-tab-table", id=5):
            with gr.Column():
                with gr.Row():
                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")

    # Collapsed citation box shown below the tabs.
    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=20,
                elem_id="citation-button",
                show_copy_button=True,
            )

# Schedule restart_space() on a background scheduler every 1800 seconds.
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()

# Configure the request queue, then start serving the app.
demo.queue(default_concurrency_limit=40)
demo.launch()