File size: 4,792 Bytes
f2a3e70
bd5b131
 
87ad165
bd5b131
87ad165
 
bd5b131
87ad165
 
 
bd5b131
87ad165
bd5b131
87ad165
 
 
 
 
 
53db359
 
bd5b131
 
 
 
 
 
 
 
f2a3e70
 
 
 
 
bd5b131
 
 
 
 
f2a3e70
bd5b131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53db359
 
 
 
f2a3e70
 
 
 
672a5d6
 
f2a3e70
 
 
 
 
 
 
 
bd5b131
 
 
 
 
ba2508f
 
672a5d6
bd5b131
 
 
 
 
 
 
 
 
 
 
53db359
de1d88f
53db359
de1d88f
bd5b131
f2a3e70
 
bd5b131
 
 
 
53db359
f2a3e70
bd5b131
 
 
 
 
f2a3e70
bd5b131
 
 
 
87ad165
bd5b131
87ad165
bd5b131
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import re

import gradio as gr
import numpy
import pandas as pd

from src.display.css_html_js import custom_css
from src.about import (
    INTRODUCTION_TEXT,
    TITLE,
    AUTHORS,
)
from src.display.formatting import make_clickable_model

# Create the app and enter its context in one statement; the rest of the
# script runs inside this `with` body.
with gr.Blocks(css=custom_css) as demo:
    # Page header: title markup followed by the introduction text.
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # Denominator used further down when converting the parseable-question
    # count of a run into a percentage.
    NUMBER_OF_QUESTIONS = 171.0

    # Load the raw results. Rows are split by hand rather than with
    # pd.read_csv / csv.reader: only the first len(header) - 1 commas of a row
    # act as separators, so any additional commas stay inside the last column
    # instead of producing extra fields.
    leaderboard_df = []
    with open("benchmark_results.csv", "r", encoding="utf-8") as f:
        header = [h.strip() for h in f.readline().strip().split(",")]
        max_splits = len(header) - 1
        for line in f:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at end of file)
                # so they do not become one-field rows.
                continue
            leaderboard_df.append(line.split(",", max_splits))

    # Read the metadata through a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    with open('metadata.json', encoding="utf-8") as metadata_file:
        metadata = json.load(metadata_file)
    # Also register every entry under the part of its key that precedes the
    # first comma, so a lookup by that prefix alone succeeds. Iterate over a
    # snapshot because the dict is modified inside the loop.
    for k, v in list(metadata.items()):
        metadata[k.split(",")[0]] = v

    # Build the results table from the parsed rows and the CSV header.
    leaderboard_df = pd.DataFrame(leaderboard_df, columns=header)
    print(header)

    # Keep only the runs of the two benchmark versions shown on this page.
    # Series.isin builds the boolean row mask in one step.
    leaderboard_df = leaderboard_df[
        leaderboard_df["Benchmark Version"].isin(["eq-bench_v2_pl", "eq-bench_pl"])
    ]

    # Keep only the columns used below. The explicit copy() makes the result
    # an independent frame, so the column assignments that follow do not write
    # into a slice of the filtered table (pandas' SettingWithCopyWarning).
    leaderboard_df = leaderboard_df[
        ["Model Path", "Benchmark Score", "Num Questions Parseable", "Error"]
    ].copy()


    # resolve the parseable-question count of a row (failed runs report it in the error text)
    def parse_parseable(x):
        """Return the parseable-question count recorded for one result row.

        Args:
            x: Row-like mapping with "Num Questions Parseable" and "Error"
                entries.

        Returns:
            The "Num Questions Parseable" value unchanged unless it is the
            literal 'FAILED'. For a failed run, the integer part (as a string)
            of the count that the "Error" text starts with, e.g. '120' for
            '120.0 questions were parseable ...'. If the error text does not
            start with such a count, ``float("nan")`` is returned so a single
            unexpected message cannot abort the whole page.
        """
        parseable = x["Num Questions Parseable"]
        if parseable != 'FAILED':
            return parseable

        # str() guards against a missing (non-string) error value; re.match
        # yields None when the text does not begin with the pattern.
        m = re.match(r'(\d+)\.0 questions were parseable', str(x["Error"]))
        if m is None:
            return float("nan")
        return m.group(1)


    # Run parse_parseable over each row (axis=1) of the two columns it reads
    # and store the result back as the parseable-question count.
    parseable_inputs = leaderboard_df[["Num Questions Parseable", "Error"]]
    leaderboard_df["Num Questions Parseable"] = parseable_inputs.apply(parse_parseable, axis=1)

    def fraction_to_percentage(numerator: float, denominator: float) -> float:
        """Express ``numerator / denominator`` as a percentage.

        Args:
            numerator: Part of the whole.
            denominator: The whole the part is measured against.

        Returns:
            The quotient multiplied by 100.
        """
        ratio = numerator / denominator
        return ratio * 100

    # Convert each parseable-question count into a percentage of
    # NUMBER_OF_QUESTIONS.
    parseable_counts = leaderboard_df["Num Questions Parseable"]
    leaderboard_df["Num Questions Parseable"] = parseable_counts.apply(
        lambda count: fraction_to_percentage(float(count), NUMBER_OF_QUESTIONS)
    )

    def get_params(model_name):
        """Look up the metadata entry stored for ``model_name``.

        Args:
            model_name: Key to look up in the ``metadata`` mapping.

        Returns:
            ``metadata[model_name]`` when the key exists; otherwise the name is
            printed and ``numpy.nan`` is returned.
        """
        try:
            return metadata[model_name]
        except KeyError:
            # No entry for this model: print the name so it can be spotted,
            # and report the value as missing.
            print(model_name)
            return numpy.nan


    # Work on an independent frame so the column assignments below never write
    # into a slice of an earlier selection (pandas' SettingWithCopyWarning).
    leaderboard_df = leaderboard_df.copy()

    # Attach the metadata value of each model (NaN when none is known).
    leaderboard_df["Params"] = leaderboard_df["Model Path"].apply(get_params)

    # Replace the plain model path by its clickable representation.
    leaderboard_df["Model Path"] = leaderboard_df["Model Path"].apply(make_clickable_model)

    # Failed runs carry the literal 'FAILED' instead of a score; treat those
    # scores as missing so the column can be converted to float.
    leaderboard_df["Benchmark Score"] = leaderboard_df["Benchmark Score"].replace('FAILED', numpy.nan)

    # Scale the score by the fraction of parseable questions. The parseable
    # column holds a percentage at this point, hence the division by 100.
    leaderboard_df["Num Questions Parseable"] = leaderboard_df["Num Questions Parseable"].astype(float)
    leaderboard_df["Benchmark Score"] = (
        leaderboard_df["Benchmark Score"].astype(float)
        * (leaderboard_df["Num Questions Parseable"] / 100)
    )

    # Clamp negative scores to 0. Missing scores compare False against 0 and
    # therefore stay NaN.
    leaderboard_df.loc[leaderboard_df["Benchmark Score"] < 0, "Benchmark Score"] = 0

    # Put the columns into display order.
    leaderboard_df = leaderboard_df[["Model Path", "Params", "Benchmark Score", "Num Questions Parseable", 'Error']]

    # Best score first; ties are broken by the higher parseable percentage.
    leaderboard_df = leaderboard_df.sort_values(
        by=["Benchmark Score", "Num Questions Parseable"],
        ascending=[False, False],
    )

    # Rename columns to their display names.
    leaderboard_df = leaderboard_df.rename(
        columns={
            "Model Path": "Model",
            "Num Questions Parseable": "Percentage Questions Parseable",
        }
    )

    # Display format per column: two decimals for the score and the parseable
    # percentage, no decimals for Params.
    column_formats = {
        "Benchmark Score": "{:.2f}",
        "Percentage Questions Parseable": "{:.2f}",
        "Params": "{:.0f}",
    }

    # Apply the "RdYlGn" gradient without a subset first, then give Params the
    # "RdYlGn_r" gradient, and finally attach the number formats.
    leaderboard_df_styled = (
        leaderboard_df.style
        .background_gradient(cmap="RdYlGn")
        .background_gradient(cmap="RdYlGn_r", subset=['Params'])
        .format(column_formats)
    )

    # One datatype per displayed column, in column order: Model, Params,
    # Benchmark Score, Percentage Questions Parseable, Error.
    column_datatypes = ['markdown', 'number', 'number', 'number', 'str']

    # Read-only table showing the styled leaderboard.
    leaderboard_table = gr.components.Dataframe(
        value=leaderboard_df_styled,
        datatype=column_datatypes,
        elem_id="leaderboard-table",
        interactive=False,
        visible=True,
    )

    gr.Markdown(AUTHORS, elem_classes="markdown-text")

    # Configure the queue, then launch whatever queue() returned.
    queued_demo = demo.queue(default_concurrency_limit=40)
    queued_demo.launch()