import os.path
from typing import List

import pandas as pd

from src.benchmarks import DEFAULT_METRIC_QA, DEFAULT_METRIC_LONG_DOC
from src.display.column_names import COL_NAME_REVISION, COL_NAME_IS_ANONYMOUS, \
    COL_NAME_TIMESTAMP
from src.models import FullEvalResult, LeaderboardDataStore
from src.utils import get_default_cols, get_leaderboard_df

pd.options.mode.copy_on_write = True


def load_raw_eval_results(results_path: str) -> List[FullEvalResult]:
    """
    Load the evaluation results from the `results*.json` files under `results_path`.
    """
    model_result_filepaths = []
    for root, dirs, files in os.walk(results_path):
        if len(files) == 0:
            continue
        # collect only the result files
        for file in files:
            if not (file.startswith("results") and file.endswith(".json")):
                print(f'skip {file}')
                continue
            model_result_filepaths.append(os.path.join(root, file))

    # parse each result file; keep the latest result per timestamp
    eval_results = {}
    for model_result_filepath in model_result_filepaths:
        try:
            eval_result = FullEvalResult.init_from_json_file(model_result_filepath)
        except UnicodeDecodeError:
            print(f"loading file failed: {model_result_filepath}")
            continue
        print(f'file loaded: {model_result_filepath}')
        eval_results[eval_result.timestamp] = eval_result

    # drop results that cannot be converted to a dict
    results = []
    for timestamp, eval_result in eval_results.items():
        try:
            eval_result.to_dict()
            results.append(eval_result)
        except KeyError:
            print(f"loading failed: {timestamp}")
            continue
    return results


def load_leaderboard_datastore(file_path) -> LeaderboardDataStore:
    lb_data_store = LeaderboardDataStore(None, None, None, None, None, None, None, None)
    lb_data_store.raw_data = load_raw_eval_results(file_path)
    print(f'raw data: {len(lb_data_store.raw_data)}')

    # QA leaderboard: build the raw frame, keep the default columns,
    # hide anonymous submissions, and drop the revision/timestamp columns
    lb_data_store.raw_df_qa = get_leaderboard_df(
        lb_data_store.raw_data, task='qa', metric=DEFAULT_METRIC_QA)
    lb_data_store.leaderboard_df_qa = lb_data_store.raw_df_qa.copy()
    # leaderboard_df_qa = leaderboard_df_qa[has_no_nan_values(df, _benchmark_cols)]
    print(f'QA data loaded: {lb_data_store.raw_df_qa.shape}')
    shown_columns_qa, types_qa = get_default_cols(
        'qa', lb_data_store.leaderboard_df_qa.columns, add_fix_cols=True)
    lb_data_store.types_qa = types_qa
    lb_data_store.leaderboard_df_qa = \
        lb_data_store.leaderboard_df_qa[~lb_data_store.leaderboard_df_qa[COL_NAME_IS_ANONYMOUS]][shown_columns_qa]
    lb_data_store.leaderboard_df_qa.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    # Long-Doc leaderboard: same pipeline for the long-document task
    lb_data_store.raw_df_long_doc = get_leaderboard_df(
        lb_data_store.raw_data, task='long-doc', metric=DEFAULT_METRIC_LONG_DOC)
    print(f'Long-Doc data loaded: {len(lb_data_store.raw_df_long_doc)}')
    lb_data_store.leaderboard_df_long_doc = lb_data_store.raw_df_long_doc.copy()
    shown_columns_long_doc, types_long_doc = get_default_cols(
        'long-doc', lb_data_store.leaderboard_df_long_doc.columns, add_fix_cols=True)
    lb_data_store.types_long_doc = types_long_doc
    lb_data_store.leaderboard_df_long_doc = \
        lb_data_store.leaderboard_df_long_doc[~lb_data_store.leaderboard_df_long_doc[COL_NAME_IS_ANONYMOUS]][
            shown_columns_long_doc]
    lb_data_store.leaderboard_df_long_doc.drop([COL_NAME_REVISION, COL_NAME_TIMESTAMP], axis=1, inplace=True)

    # de-duplicated, sorted list of reranking models in the loaded results
    lb_data_store.reranking_models = sorted(
        {eval_result.reranking_model for eval_result in lb_data_store.raw_data})
    return lb_data_store


def load_eval_results(file_path: str):
    output = {}
    versions = ("AIR-Bench_24.04",)
    for version in versions:
        fn = f"{file_path}/{version}"
        output[version] = load_leaderboard_datastore(fn)
    return output
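

# Illustrative usage sketch (not part of the module): assuming the evaluation
# results live under a local "eval-results/" directory containing an
# "AIR-Bench_24.04" subfolder, the per-version datastores could be built and
# inspected like this. The path below is a placeholder, not the location the
# app actually uses.
if __name__ == "__main__":
    datastores = load_eval_results("eval-results")
    for version, datastore in datastores.items():
        print(version,
              datastore.leaderboard_df_qa.shape,
              datastore.leaderboard_df_long_doc.shape)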