|
from collections import defaultdict |
|
from dataclasses import dataclass |
|
from typing import Dict, List |
|
|
|
import numpy as np |
|
import pandas as pd |
|
from datasets import load_dataset |
|
|
|
from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE |
|
from utils import make_clickable_model |
|
from visualizations import (get_bootstrap_result, switch_model_a_b, |
|
visualize_battle_count, visualize_bootstrap_scores, |
|
visualize_pairwise_win_fraction, |
|
visualize_rating_count) |
|
|
|
|
|
@dataclass |
|
class EloEvalResult: |
|
model: str |
|
gpt_4_all: int |
|
human_all: int |
|
human_instruct: int |
|
human_code_instruct: int |
|
tie_allowed: bool |
|
|
|
def to_dict(self): |
|
base_model = f"{self.model}" |
|
data_dict = {} |
|
data_dict["Model"] = make_clickable_model(base_model) |
|
data_dict["GPT-4 (all)"] = self.gpt_4_all |
|
data_dict["Human (all)"] = self.human_all |
|
data_dict["Human (instruct)"] = self.human_instruct |
|
data_dict["Human (code-instruct)"] = self.human_code_instruct |
|
|
|
return data_dict |
|
|
|
|
|
def create_eval_df(df, tie_allowed): |
|
responses = [] |
|
for _, row in df.iterrows(): |
|
if row["status"] == "canceled": |
|
continue |
|
|
|
rating = row["response"]["annotations"]["Preference"] |
|
if rating == "NaN": |
|
continue |
|
|
|
scores = row["response"]["responses"] |
|
if any(s["Preference"] == "" for s in scores): |
|
continue |
|
|
|
response = { |
|
"id": row["task_id"], |
|
"prompt": row["params"]["templateVariables"]["prompt"], |
|
"model_a": row["params"]["templateVariables"]["modela"], |
|
"model_b": row["params"]["templateVariables"]["modelb"], |
|
"response_a": row["params"]["templateVariables"]["response1"], |
|
"response_b": row["params"]["templateVariables"]["response2"], |
|
"rating": int(rating), |
|
"ratings": [np.array([s["Preference"] for s in scores], dtype=np.int32)], |
|
} |
|
|
|
if tie_allowed: |
|
response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie" |
|
else: |
|
response["win"] = "model_a" if response["rating"] < 5 else "model_b" |
|
|
|
responses.append(response) |
|
|
|
return pd.DataFrame(responses) |
|
|
|
|
|
def create_eval_df_for_gpt(df, tie_allowed): |
|
responses = [] |
|
for _, row in df.iterrows(): |
|
response = { |
|
"id": row["review_id"], |
|
"prompt": row["question"], |
|
"model_a": row["model1"], |
|
"model_b": row["model2"], |
|
"response_a": row["answer1"], |
|
"response_b": row["answer2"], |
|
"rating": row["score"][0], |
|
} |
|
|
|
if tie_allowed: |
|
response["win"] = "model_a" if response["rating"] < 4 else "model_b" if response["rating"] > 5 else "tie" |
|
else: |
|
response["win"] = "model_a" if response["rating"] < 5 else "model_b" |
|
|
|
responses.append(response) |
|
|
|
return pd.DataFrame(responses) |
|
|
|
|
|
|
|
def compute_elo(df, k=32, scale=400, base=10, initial_rating=1000): |
|
rating = defaultdict(lambda: initial_rating) |
|
|
|
for _, model_a, model_b, win in df[["model_a", "model_b", "win"]].itertuples(): |
|
ra = rating[model_a] |
|
rb = rating[model_b] |
|
ea = 1 / (1 + base ** ((rb - ra) / scale)) |
|
eb = 1 / (1 + base ** ((ra - rb) / scale)) |
|
if win == "model_a": |
|
sa = 1 |
|
elif win == "model_b": |
|
sa = 0 |
|
elif win == "tie" or win == "tie (bothbad)": |
|
sa = 0.5 |
|
else: |
|
raise Exception(f"unexpected vote {win}") |
|
rating[model_a] += k * (sa - ea) |
|
rating[model_b] += k * (1 - sa - eb) |
|
|
|
return rating |
|
|
|
|
|
def convert_rating_from_float_to_int(df): |
|
return {model: int(rating) for model, rating in compute_elo(df).items()} |
|
|
|
|
|
def get_elo_results(df_instruct, df_code_instruct, tie_allowed): |
|
df_all = pd.concat([df_instruct, df_code_instruct]) |
|
|
|
df_gpt_4 = load_dataset( |
|
"gpt_4_evals/data/", split="train", revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846" |
|
).to_pandas() |
|
|
|
dfs = [df_instruct, df_code_instruct, df_all] |
|
elo_ratings = [convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed)) for df in dfs] |
|
|
|
gpt_4_elo_ratings = convert_rating_from_float_to_int(create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)) |
|
elo_ratings.append(gpt_4_elo_ratings) |
|
|
|
results = [ |
|
EloEvalResult( |
|
model=model_name, |
|
gpt_4_all=elo_ratings[3][model_name], |
|
human_all=elo_ratings[2][model_name], |
|
human_instruct=elo_ratings[0][model_name], |
|
human_code_instruct=elo_ratings[1][model_name], |
|
tie_allowed=tie_allowed, |
|
) |
|
for model_name in elo_ratings[0].keys() |
|
] |
|
|
|
return results |
|
|
|
|
|
def get_elo_results_dicts(df_instruct, df_code_instruct, tie_allowed) -> List[Dict]: |
|
eval_results = get_elo_results(df_instruct, df_code_instruct, tie_allowed) |
|
return [r.to_dict() for r in eval_results] |
|
|
|
|
|
def get_elo_plots(df_instruct, df_code_instruct, tie_allowed): |
|
df_instruct = create_eval_df(df_instruct, tie_allowed=tie_allowed) |
|
df_code_instruct = create_eval_df(df_code_instruct, tie_allowed=tie_allowed) |
|
df_all = pd.concat([df_instruct, df_code_instruct]) |
|
game = df_all[["model_a", "model_b", "win"]] |
|
|
|
game_switch = switch_model_a_b(game) |
|
plot_1 = visualize_pairwise_win_fraction(game_switch, PLOT_1_TITLE) |
|
|
|
plot_2 = visualize_battle_count(game_switch, PLOT_2_TITLE) |
|
|
|
BOOTSTRAP_ROUNDS = 1000 |
|
if "bootstrap_elo_lu" not in globals(): |
|
bootstrap_elo_lu = get_bootstrap_result(game_switch, compute_elo, BOOTSTRAP_ROUNDS) |
|
|
|
plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE) |
|
|
|
plot_4 = visualize_rating_count(game, PLOT_4_TITLE) |
|
|
|
return plot_1, plot_2, plot_3, plot_4 |
|
|