File size: 4,808 Bytes
da92625
 
 
 
cafba76
da92625
 
 
bfb3ae7
da92625
 
da96aa6
 
37b3751
bfb3ae7
 
 
 
 
37b3751
 
c400723
37b3751
ea6f712
 
37b3751
5db0911
ea6f712
5db0911
 
c400723
cafba76
 
37b3751
cfc0fa8
61d4a75
056a0a0
cfc0fa8
056a0a0
 
cfc0fa8
37b3751
 
4106f16
 
 
37b3751
 
 
 
 
 
 
bfb3ae7
 
ea6f712
bfb3ae7
 
 
 
da92625
 
 
bfb3ae7
da92625
 
 
 
b47be80
 
 
 
 
da92625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import json
import os

import pandas as pd
import numpy as np

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results




def get_model_leaderboard_df(results_path: str, requests_path: str="", cols: list=[], benchmark_cols: list=[], rank_col: list=[]) -> pd.DataFrame:
    """Build the model leaderboard table from the raw model results.

    Args:
        results_path: Location handed to ``get_raw_model_results``.
        requests_path: Unused here; kept so existing callers keep working.
        cols: Unused here; kept so existing callers keep working.
        benchmark_cols: Columns to keep, in display order. Every name must be
            present in the loaded records, otherwise pandas raises ``KeyError``.
        rank_col: When non-empty, its first entry names the column to sort by.
            (The list defaults are only read, never mutated.)

    Returns:
        * With ``rank_col``: only the rows that have a value in every benchmark
          column, sorted ascending by ``rank_col[0]``.
        * Without ``rank_col``: every row, sorted ascending by an
          "Average Rank" column (row-wise mean of all kept columns except the
          first, missing values skipped), with a 1-based "Rank" column in
          front and remaining missing values shown as "--".

        In both cases, columns whose name contains "Std dev" or "Score" are
        multiplied by 100 and rendered as strings with two decimals.
    """
    raw_data = get_raw_model_results(results_path)
    records = [result.to_dict() for result in raw_data]

    # Copy the selection so the column assignments below never write into a
    # slice of the full frame (avoids pandas' SettingWithCopyWarning).
    df = pd.DataFrame.from_records(records)[benchmark_cols].copy()

    if rank_col:
        # Ranking by one column: rows with any missing benchmark are dropped.
        df = df.dropna(subset=benchmark_cols)
        df = df.sort_values(by=[rank_col[0]], ascending=True)
    else:
        # No explicit ranking column: rank by the mean of everything but the
        # first column. mean() skips NaN, so incomplete rows are kept.
        df["Average Rank"] = df.iloc[:, 1:].mean(axis=1)
        df = df.sort_values(by=["Average Rank"], ascending=True)

    # Format while the columns are still numeric. Doing this after the "--"
    # placeholder is filled in would compute '--' * 100 and then fail inside
    # '{:.2f}'.format; na_action="ignore" leaves missing values untouched so
    # they can still be replaced by the placeholder below.
    for col in benchmark_cols:
        if "Std dev" in col or "Score" in col:
            df[col] = (df[col] * 100).map("{:.2f}".format, na_action="ignore")

    if not rank_col:
        df = df.fillna("--")
        df.insert(0, "Rank", np.arange(1, len(df) + 1))

    return df

    

def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Assemble the evaluation leaderboard from the individual result files.

    The records returned by ``get_raw_eval_results`` are turned into a frame
    sorted by the average column (highest first). Each name in ``cols`` is
    either rounded to two decimals when already present or added as an
    all-``None`` column when missing. The result is finally filtered with
    ``has_no_nan_values`` over ``benchmark_cols``.
    """
    eval_results = get_raw_eval_results(results_path, requests_path)
    records = [result.to_dict() for result in eval_results]

    leaderboard = pd.DataFrame.from_records(records).sort_values(
        by=[AutoEvalColumn.average.name], ascending=False
    )

    for column in cols:
        if column in leaderboard.columns:
            leaderboard[column] = leaderboard[column].round(decimals=2)
        else:
            leaderboard[column] = None

    # Keep only rows for which every benchmark has been produced
    # (presumably a boolean row mask -- see has_no_nan_values).
    complete_rows = has_no_nan_values(leaderboard, benchmark_cols)
    return leaderboard[complete_rows]


def _load_eval_request(file_path: str) -> dict:
    """Read one request JSON file and add the model/revision display fields."""
    with open(file_path, encoding="utf-8") as fp:
        data = json.load(fp)

    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
    return data


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the dataframes for the evaluation queue requests.

    Request files are collected from ``save_path`` itself (entries whose name
    contains ".json") and from its direct sub-folders (every regular,
    non-hidden file). Hidden entries and entries whose name contains ".md"
    are ignored, as is any other entry that is not a directory.

    Args:
        save_path: Directory holding the request files.
        cols: Columns of the returned frames, in order.

    Returns:
        ``(finished, running, pending)`` frames restricted to ``cols``:
        finished holds statuses starting with "FINISHED" or equal to
        "PENDING_NEW_EVAL", running holds "RUNNING", pending holds "PENDING"
        and "RERUN".

    Raises:
        KeyError: If a request lacks the "model" or "status" key.
        json.JSONDecodeError: If a collected file is not valid JSON.
    """
    all_evals = []

    for entry in os.listdir(save_path):
        if entry.startswith("."):
            continue

        entry_path = os.path.join(save_path, entry)
        if ".json" in entry:
            all_evals.append(_load_eval_request(entry_path))
        elif ".md" not in entry and os.path.isdir(entry_path):
            for sub_entry in os.listdir(entry_path):
                sub_path = os.path.join(entry_path, sub_entry)
                # Test the full path: a bare file name would be resolved
                # against the current working directory and never match.
                if sub_entry.startswith(".") or not os.path.isfile(sub_path):
                    continue
                all_evals.append(_load_eval_request(sub_path))

    pending_list = [e for e in all_evals if e["status"] in ("PENDING", "RERUN")]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [
        e for e in all_evals
        if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"
    ]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]