File size: 4,715 Bytes
da92625 bfb3ae7 da92625 da96aa6 37b3751 bfb3ae7 37b3751 c400723 37b3751 ea6f712 37b3751 5db0911 ea6f712 5db0911 c400723 37b3751 cfc0fa8 61d4a75 056a0a0 cfc0fa8 056a0a0 cfc0fa8 37b3751 4106f16 37b3751 bfb3ae7 ea6f712 bfb3ae7 da92625 bfb3ae7 da92625 b47be80 da92625 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import json
import os
from typing import Optional

import pandas as pd

from src.display.formatting import has_no_nan_values, make_clickable_model
from src.display.utils import AutoEvalColumn, EvalQueueColumn
from src.leaderboard.read_evals import get_raw_eval_results, get_raw_model_results
def get_model_leaderboard_df(
    results_path: str,
    requests_path: str = "",
    cols: Optional[list] = None,
    benchmark_cols: Optional[list] = None,
    rank_col: Optional[list] = None,
) -> pd.DataFrame:
    """Build the model leaderboard table from the stored model results.

    Args:
        results_path: Location handed to ``get_raw_model_results``.
        requests_path: Not used by this function; kept for call compatibility.
        cols: Not used by this function; kept for call compatibility.
        benchmark_cols: Columns to keep, in display order. Columns whose name
            contains ``"Score"`` or ``"Std dev"`` are multiplied by 100 and
            rendered as strings with two decimals.
        rank_col: When non-empty, rows with a missing value in any of
            ``benchmark_cols`` are dropped and the table is sorted ascending
            by ``rank_col[0]``. When empty, an ``"Average Rank"`` column is
            added (row-wise mean of every kept column except the first) and
            the table is sorted ascending by it.

    Returns:
        The sorted table, with any remaining missing value shown as ``"--"``.
    """
    benchmark_cols = [] if benchmark_cols is None else benchmark_cols

    records = [result.to_dict() for result in get_raw_model_results(results_path)]
    # Copy the column selection so the "Average Rank" assignment below writes
    # to an independent frame instead of a slice of the full table.
    df = pd.DataFrame.from_records(records)[benchmark_cols].copy()

    if rank_col:
        df = df.dropna(subset=benchmark_cols)
        df = df.sort_values(by=[rank_col[0]], ascending=True)
    else:
        # mean() skips NaN, so a row with some missing benchmarks is still
        # ranked on the ones it has rather than being dropped. The first
        # column is excluded (presumably the model name -- confirm with caller).
        df["Average Rank"] = df.iloc[:, 1:].mean(axis=1)
        df = df.sort_values(by=["Average Rank"], ascending=True)

    # Format scores while they are still numeric. Filling the "--" placeholder
    # first would turn missing scores into strings, and multiplying/formatting
    # those raises ValueError.
    for col in benchmark_cols:
        if "Std dev" in col or "Score" in col:
            df[col] = df[col].map(
                lambda value: value if pd.isna(value) else "{:.2f}".format(value * 100)
            )

    return df.fillna("--")
def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
    """Assemble the leaderboard table from every stored evaluation result.

    Rows are ordered by the average column, highest first. Each name in
    ``cols`` that is already present is rounded to two decimals; names that
    are absent are added as columns filled with ``None``. Only rows accepted
    by ``has_no_nan_values`` for ``benchmark_cols`` are returned.
    """
    records = [result.to_dict() for result in get_raw_eval_results(results_path, requests_path)]
    leaderboard = pd.DataFrame.from_records(records).sort_values(
        by=[AutoEvalColumn.average.name], ascending=False
    )

    for column in cols:
        if column in leaderboard.columns:
            leaderboard[column] = leaderboard[column].round(decimals=2)
        else:
            leaderboard[column] = None

    # Keep only the rows for which every benchmark has been produced.
    complete_rows = has_no_nan_values(leaderboard, benchmark_cols)
    return leaderboard[complete_rows]
def _load_eval_request(file_path: str) -> dict:
    """Read one request JSON file and add the model-link and revision display fields."""
    with open(file_path) as fp:
        data = json.load(fp)
    data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
    data[EvalQueueColumn.revision.name] = data.get("revision", "main")
    return data


def get_evaluation_queue_df(save_path: str, cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the dataframes for the evaluation queue requests.

    Request files are read from ``save_path`` itself (entries whose name
    contains ``".json"``) and from its immediate sub-folders (every non-hidden
    regular file). Hidden entries, entries whose name contains ``".md"``, and
    other stray files are ignored.

    Args:
        save_path: Directory holding the request files.
        cols: Columns to expose in each returned dataframe.

    Returns:
        ``(finished, running, pending)`` dataframes, split on each request's
        ``"status"`` value: ``FINISHED*`` or ``PENDING_NEW_EVAL``, ``RUNNING``,
        and ``PENDING`` or ``RERUN`` respectively.
    """
    all_evals = []
    for entry in os.listdir(save_path):
        if entry.startswith("."):
            continue
        entry_path = os.path.join(save_path, entry)

        if ".json" in entry:
            all_evals.append(_load_eval_request(entry_path))
        elif ".md" not in entry and os.path.isdir(entry_path):
            for sub_entry in os.listdir(entry_path):
                sub_path = os.path.join(entry_path, sub_entry)
                # Test the full path: a bare file name would be resolved
                # against the current working directory, not this folder.
                if sub_entry.startswith(".") or not os.path.isfile(sub_path):
                    continue
                all_evals.append(_load_eval_request(sub_path))

    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]
|