import json
import os
import shutil

import pandas as pd
from git import Repo

from .dataset_handler import VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES, get_datasets_nickname


class ModelHandler:
    """Load ViDoRe result files from a local checkout of the results repo and tabulate them."""

    def __init__(self):
        # Maps model folder name -> {"meta": dict, "results": {dataset_name: parsed JSON}}.
        self.model_infos = {}

    @staticmethod
    def get_folders(dir_path):
        """Return the sorted names of the immediate sub-directories of ``dir_path``."""
        return sorted(
            entry
            for entry in os.listdir(dir_path)
            if os.path.isdir(os.path.join(dir_path, entry))
        )

    @staticmethod
    def _load_json(path):
        """Parse the JSON file at ``path``, reading it as UTF-8."""
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)

    @classmethod
    def _load_benchmark_results(cls, revision_dir, available_files, dataset_names):
        """Load one benchmark's result files, all-or-nothing.

        Returns ``{dataset_name: parsed JSON}`` when ``<dataset_name>.json`` is
        listed in ``available_files`` for every name in ``dataset_names``;
        otherwise returns an empty dict so partially evaluated models are ignored.
        """
        if not all(f"{name}.json" in available_files for name in dataset_names):
            return {}
        return {
            name: cls._load_json(os.path.join(revision_dir, f"{name}.json"))
            for name in dataset_names
        }

    def get_vidore_data(self, metric="ndcg_at_5"):
        """Refresh the local results checkout and populate ``self.model_infos``.

        Pulls ``./results`` if it already exists, otherwise shallow-clones the
        results repository there. For every model folder, the first revision
        folder (in sorted name order) is read: ``model_meta.json`` becomes the
        model's ``"meta"`` (``{}`` when absent) and the V1 / V2 dataset files
        become its ``"results"``. Model folders without any revision folder
        are skipped.

        Args:
            metric: Not used by this method; kept so existing callers that
                pass it keep working.
        """
        repo_url = "https://github.com/embeddings-benchmark/results.git"
        local_path = "./results"
        folder_of_interest = "results"

        if os.path.exists(local_path):
            Repo(local_path).remotes.origin.pull()
        else:
            Repo.clone_from(repo_url, local_path, depth=1)

        results_root = os.path.join(local_path, folder_of_interest)
        for model_name in self.get_folders(results_root):
            model_dir = os.path.join(results_root, model_name)
            revisions = self.get_folders(model_dir)
            if not revisions:
                # Nothing to read for this model; indexing revisions[0] would raise.
                continue

            revision_dir = os.path.join(model_dir, revisions[0])
            available_files = set(os.listdir(revision_dir))

            if "model_meta.json" in available_files:
                meta = self._load_json(os.path.join(revision_dir, "model_meta.json"))
            else:
                meta = {}

            results = {}
            for dataset_names in (VIDORE_V1_MTEB_NAMES, VIDORE_V2_MTEB_NAMES):
                results.update(
                    self._load_benchmark_results(revision_dir, available_files, dataset_names)
                )

            self.model_infos[model_name] = {"meta": meta, "results": results}

    def filter_models_by_benchmark(self, benchmark_version=1):
        """Return the subset of ``self.model_infos`` that has results for a benchmark.

        Args:
            benchmark_version: ``1`` selects the V1 dataset names; any other
                value selects the V2 dataset names.

        Returns:
            A dict ``{model_name: info}`` containing every model for which at
            least one result key contains one of the selected dataset names.
        """
        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        return {
            model: info
            for model, info in self.model_infos.items()
            if any(keyword in dataset for dataset in info["results"] for keyword in keywords)
        }

    def render_df(self, metric="ndcg_at_5", benchmark_version=1):
        """Build a models x datasets table of ``metric`` scores.

        Args:
            metric: Key looked up in the first entry of each dataset's
                ``["scores"]["test"]`` list.
            benchmark_version: ``1`` for the V1 dataset names, anything else
                for V2.

        Returns:
            A ``pandas.DataFrame`` with one row per model. Columns are
            ``"Model Size (Million Parameters)"`` followed by one column per
            dataset nickname. An empty DataFrame is returned when no model has
            results for the requested benchmark.

        Raises:
            KeyError, IndexError: If a result file lacks the
                ``["scores"]["test"][0][metric]`` entry.
        """
        filtered_model_infos = self.filter_models_by_benchmark(benchmark_version)
        if not filtered_model_infos:
            return pd.DataFrame()

        keywords = VIDORE_V1_MTEB_NAMES if benchmark_version == 1 else VIDORE_V2_MTEB_NAMES
        model_res = {}
        for model, info in filtered_model_infos.items():
            # The key may be present with a null value, so test the value
            # rather than key membership before dividing.
            n_parameters = info["meta"].get("n_parameters")
            dataset_res = {
                "Model Size (Million Parameters)": (
                    n_parameters // 1_000_000 if n_parameters is not None else None
                )
            }

            for dataset, dataset_scores in info["results"].items():
                if not any(keyword in dataset for keyword in keywords):
                    continue
                dataset_res[get_datasets_nickname(dataset)] = dataset_scores["scores"]["test"][0][metric]

            model_res[model] = dataset_res

        return pd.DataFrame(model_res).T