"""Format model names and load per-model JSON result files into a DataFrame."""

import os
import re
from pathlib import Path

import numpy as np
import pandas as pd

try:
    from datasets import load_dataset
except ImportError:
    # Only load_all_data() needs `datasets`; keep the name-formatting helpers
    # importable without it and fail with a clear message at call time.
    load_dataset = None

# Names in this list get a " *" suffix from model_hyperlink().
UNVERIFIED_MODELS = [
    "nvidia/Nemotron-4-340B-Reward",
    "nvidia/Llama3-70B-SteerLM-RM",
    "Cohere May 2024",
    "google/gemini-1.5-pro-0514",
    "google/flame-24b-july-2024",
    "Cohere March 2024",
    "facebook/Self-taught-Llama-3-70B",
    "facebook/Self-taught-evaluator-llama3.1-70B",
    "google/flame-1.0-24B-july-2024",
    "Salesforce/SFR-LLaMa-3.1-70B-Judge-r",
    "Salesforce/SFR-nemo-12B-Judge-r",
    "Salesforce/SFR-LLaMa-3.1-8B-Judge-r",
    "SF-Foundation/TextEval-OffsetBias-12B",
    "SF-Foundation/TextEval-Llama3.1-70B",
    "nvidia/Llama-3.1-Nemotron-70B-Reward",
]

# Names in this list get a " ⚠️" suffix from model_hyperlink().
CONTAMINATED_MODELS = [
    "Skywork/Skywork-Reward-Gemma-2-27B",
    "Skywork/Skywork-Critic-Llama-3.1-70B",
    "LxzGordon/URM-LLaMa-3.1-8B",
    "Skywork/Skywork-Reward-Llama-3.1-8B",
    "Ray2333/GRM-Llama3-8B-rewardmodel-ft",
    "nicolinho/QRM-Llama3.1-8B",
    "nicolinho/QRM-Llama3-8B",
    "general-preference/GPM-Llama-3.1-8B",
    "SF-Foundation/TextEval-Llama3.1-70B",
    "ZiyiYe/Con-J-Qwen2-7B",
    "Ray2333/Gemma-2B-rewardmodel-ft",
    "Ray2333/GRM-Gemma-2B-rewardmodel-ft",
]

# Display names longer than this are shortened and end with "...".
_MAX_NAME_LENGTH = 50

# Matches the first run of text sitting between a ">" and the next "<".
_TAG_TEXT_RE = re.compile(r">([^<]+)<")

# Columns removed from the loaded results before scoring.
_DROPPED_COLUMNS = (
    "chat_template",
    "model_beaker",
    "ref_model",
    "xstest",  # outdated data
    "anthropic",  # outdated data
    "summarize_prompted",  # outdated data
    "pku_better",  # removed from the leaderboard
    "pku_safer",  # removed from the leaderboard
)

# Columns that are kept but are not scores (not scaled, not averaged).
_NON_SCORE_COLUMNS = ("model", "model_type")


# From Open LLM Leaderboard
def model_hyperlink(link, model_name):
    """Return the display label for ``model_name``.

    Names longer than 50 characters are shortened to their first 47
    characters followed by ``"..."``. A ``" *"`` suffix is added for names
    listed in ``UNVERIFIED_MODELS`` and a ``" ⚠️"`` suffix for names listed in
    ``CONTAMINATED_MODELS``.

    Args:
        link: Accepted for interface compatibility; currently unused because
            the label is emitted as plain text.
        model_name: Model identifier, e.g. ``"org/model"``.

    Returns:
        The (possibly shortened) name plus any applicable suffixes.
    """
    # Look the flags up with the full name: a shortened name ends in "..."
    # and could never match an entry of the lists.
    is_unverified = model_name in UNVERIFIED_MODELS
    is_contaminated = model_name in CONTAMINATED_MODELS

    if len(model_name) > _MAX_NAME_LENGTH:
        model_name = model_name[:_MAX_NAME_LENGTH - 3] + "..."

    output = model_name
    if is_unverified:
        output += " *"
    if is_contaminated:
        output += " ⚠️"
    return output


def undo_hyperlink(html_string):
    """Return the first piece of text enclosed between ``>`` and ``<``.

    Args:
        html_string: String to search, e.g. ``'<a href="...">name</a>'``.

    Returns:
        The enclosed text, or ``"No text found"`` when there is none.
    """
    match = _TAG_TEXT_RE.search(html_string)
    if match is None:
        return "No text found"
    return match.group(1)


def _find_result_files(data_dir):
    """Return the ``<org>/<file>.json`` paths located one level below ``data_dir``."""
    result_files = []
    for org_dir in data_dir.iterdir():
        if not org_dir.is_dir():
            continue
        for entry in org_dir.iterdir():
            if entry.is_file() and entry.name.endswith(".json"):
                result_files.append(entry)
    return result_files


def _format_results(df):
    """Clean, scale and reorder the concatenated per-model results."""
    df = df.drop(columns=[c for c in _DROPPED_COLUMNS if c in df.columns])
    if "model" not in df.columns:
        raise ValueError("result files must provide a 'model' column")

    # sort columns alphabetically
    df = df.reindex(sorted(df.columns), axis=1)

    # convert to score (percent) and add the NaN-ignoring row average
    score_cols = [c for c in df.columns if c not in _NON_SCORE_COLUMNS]
    df[score_cols] = df[score_cols] * 100
    df["average"] = np.nanmean(df[score_cols].values, axis=1)

    df["model"] = df["model"].apply(
        lambda name: model_hyperlink(f"https://huggingface.co/{name}", name)
    )

    # final order: model, model_type (when present), average, remaining columns
    leading = [c for c in ("model", "model_type", "average") if c in df.columns]
    df = df.loc[:, leading + [c for c in df.columns if c not in leading]]

    # remove models with DPO Ref. Free as type (future work); the type column
    # may be absent (pref tests may not have it)
    if "model_type" in df.columns:
        is_ref_free = df["model_type"].str.contains(
            "DPO Ref. Free", na=False, regex=False
        )
        df = df[~is_ref_free]
    return df


# Define a function to fetch and process data
def load_all_data(data_repo, subdir: str, subsubsets=False):
    """Load every ``<org>/<model>.json`` file under ``data_repo/subdir``.

    Each file is read with ``datasets.load_dataset("json", ...)`` and the
    rows are stacked into one DataFrame. Bookkeeping and retired columns are
    dropped, the remaining score columns are multiplied by 100, an
    ``average`` column (row mean ignoring NaN) is added, model names are
    passed through ``model_hyperlink`` and rows whose ``model_type`` contains
    ``"DPO Ref. Free"`` are removed.

    Args:
        data_repo: Local directory holding the results (``str`` or ``Path``;
            a trailing slash is not required).
        subdir: Sub-directory of ``data_repo`` that contains one folder per
            organisation.
        subsubsets: Unused; kept for backward compatibility.

    Returns:
        DataFrame with columns ordered ``model``, ``model_type`` (when
        present), ``average``, then the remaining columns alphabetically.

    Raises:
        ValueError: If no ``.json`` file is found or no ``model`` column exists.
        ImportError: If the ``datasets`` package is not installed.
    """
    data_dir = Path(data_repo) / subdir
    result_files = _find_result_files(data_dir)
    if not result_files:
        raise ValueError(f"No .json result files found under {data_dir}")
    if load_dataset is None:
        raise ImportError("The 'datasets' package is required by load_all_data()")

    # load the files one by one to avoid requiring the same entries in each
    frames = [
        pd.DataFrame(load_dataset("json", data_files=str(path), split="train"))
        for path in result_files
    ]
    # Stack in reverse discovery order (last file found comes first) so the
    # row order callers already see is kept.
    df = pd.concat(frames[::-1])
    return _format_results(df)