ebr / app /backend /data_engine.py
fzliu's picture
Modify UI (#8)
353cfb3 verified
"""
Data service provider
"""
import json
from typing import List
import pandas as pd
from utils.cache_decorator import cache_df_with_custom_key, cache_dict_with_custom_key
from utils.http_utils import get
COLUMNS = ['model_name',
'embd_dtype', 'embd_dim', 'num_params', 'max_tokens', 'similarity',
'query_instruct', 'corpus_instruct',
]
COLUMNS_TYPES = ["markdown",
'str', 'str', 'number', 'number', 'str',
'str', 'str',
]
GIT_URL = "https://raw.githubusercontent.com/embedding-benchmark/ebr/refs/heads/main/results/"
DATASET_URL = f"{GIT_URL}datasets.json"
MODEL_URL = f"{GIT_URL}models.json"
RESULT_URL = f"{GIT_URL}results.json"
class DataEngine:
def __init__(self):
self.df = self.init_dataframe()
@property
@cache_dict_with_custom_key("models")
def models(self):
"""
Get models data
"""
res = get(MODEL_URL)
if res.status_code == 200:
return res.json()
return {}
@property
@cache_dict_with_custom_key("datasets")
def datasets(self):
"""
Get tasks data
"""
res = get(DATASET_URL)
if res.status_code == 200:
return res.json()
return {}
@property
@cache_dict_with_custom_key("results")
def results(self):
"""
Get results data
"""
res = get(RESULT_URL)
if res.status_code == 200:
return res.json()
return {}
def init_dataframe(self):
"""
Initialize DataFrame
"""
d = {"hello": [123], "world": [456]}
return pd.DataFrame(d)
@cache_df_with_custom_key("json_result")
def jsons_to_df(self):
results_list = self.results
df_results_list = []
for result_dict in results_list:
dataset_name = result_dict["dataset_name"]
df_result_row = pd.DataFrame(result_dict["results"])
df_result_row["dataset_name"] = dataset_name
df_results_list.append(df_result_row)
df_result = pd.concat(df_results_list)
df_result = df_result[["model_name", "dataset_name", "ndcg_at_10", "embd_dim", "embd_dtype"]]
df_result["ndcg_at_10"] = (df_result["ndcg_at_10"] * 100).round(2)
df_datasets_list = []
for item in self.datasets:
dataset_names = item["datasets"]
df_dataset_row = pd.DataFrame(
{
"group_name": [item["name"] for _ in range(len(dataset_names))],
"dataset_name": dataset_names,
"leaderboard": [item["leaderboard"] for _ in range(len(dataset_names))]
}
)
df_datasets_list.append(df_dataset_row)
df_dataset = pd.concat(df_datasets_list).drop_duplicates()
models_list = self.models
df_model = pd.DataFrame(models_list)
df = pd.merge(df_result, df_dataset, on=["dataset_name"], how="inner")
# df = pd.merge(df, df_model, on=["model_name"], how="inner")
# dataset_num_map = {}
# grouped_dataset_count = df.groupby(["group_name"]).agg({
# "dataset_name": "nunique"
# }).reset_index()
#
# for _, row in grouped_dataset_count.iterrows():
# dataset_num_map[row["group_name"]] = row["dataset_name"]
grouped_model = df.groupby(["model_name", "group_name", "embd_dim", "embd_dtype"]).agg({
"ndcg_at_10": "mean",
}).reset_index()
pivot = grouped_model.pivot(index=["model_name", "embd_dim", "embd_dtype"], columns="group_name",
values=["ndcg_at_10"]).round(2)
# Rename columns
pivot.columns = list(
map(lambda x: f"{x[1].capitalize()} Average" if x[1] != 'text' else f"Average", pivot.columns))
pivot_dataset = df_result.pivot(index=["model_name", "embd_dim", "embd_dtype"], columns="dataset_name", values="ndcg_at_10")
df = pd.merge(df_model, pivot, on=["model_name", "embd_dim", "embd_dtype"])
df = pd.merge(df, pivot_dataset, on=["model_name", "embd_dim", "embd_dtype"])
if df.empty:
return pd.DataFrame(columns=COLUMNS + ["reference"])
return df
def filter_df(self, group_name: str):
"""
filter_by_providers
"""
df = self.jsons_to_df()
return df[df["group_name"] == group_name][COLUMNS][:]