|
import math |
|
|
|
import numpy as np |
|
import pandas as pd |
|
import plotly.express as px |
|
|
|
|
|
|
|
def compute_pairwise_win_fraction(battles):
    """Return the matrix of pairwise win fractions between models.

    Args:
        battles: DataFrame with one row per battle and the columns
            ``model_a``, ``model_b`` and ``win``. A ``win`` value of
            ``"model_a"`` or ``"model_b"`` credits that side; any other
            value still counts as a battle but credits nobody.

    Returns:
        A square DataFrame whose cell ``[row, col]`` is the fraction of
        battles between ``row`` and ``col`` (in either seating order) that
        ``row`` won. Pairs that never met (including the diagonal) are NaN.
        Rows and columns share one ordering: mean win fraction per row,
        highest first.
    """
    # Every model seen in either seat. All count tables are reindexed to this
    # square label set: adding a table to a transpose with *different* labels
    # makes pandas align on the union and emit NaN for labels missing on one
    # side, which would silently erase real wins.
    models = pd.Index(
        pd.concat([battles["model_a"], battles["model_b"]]).dropna().unique()
    ).sort_values()

    def _count_table(frame):
        """Count rows of ``frame`` per (model_a, model_b) on the full model grid."""
        if frame.empty:
            return pd.DataFrame(0, index=models, columns=models)
        counts = pd.pivot_table(
            frame,
            index="model_a",
            columns="model_b",
            aggfunc="size",
            fill_value=0,
        )
        return counts.reindex(index=models, columns=models, fill_value=0)

    # Cell [a, b]: battles seated (a, b) that model_a won.
    a_win_ptbl = _count_table(battles[battles["win"] == "model_a"])
    # Cell [a, b]: battles seated (a, b) that model_b won.
    b_win_ptbl = _count_table(battles[battles["win"] == "model_b"])
    # Cell [a, b]: all battles seated (a, b), whatever the outcome.
    num_battles_ptbl = _count_table(battles)

    # Transposing folds the second seating order onto the same (row, col)
    # cell. 0 / 0 for pairs that never met yields NaN, which mean() skips.
    row_wins = a_win_ptbl + b_win_ptbl.T
    pair_totals = num_battles_ptbl + num_battles_ptbl.T
    row_beats_col_freq = row_wins / pair_totals

    prop_wins = row_beats_col_freq.mean(axis=1).sort_values(ascending=False)
    model_names = list(prop_wins.index)
    return row_beats_col_freq.loc[model_names, model_names]
|
|
|
|
|
def visualize_pairwise_win_fraction(battles, title):
    """Build a heatmap figure of the pairwise win-fraction matrix.

    Args:
        battles: Battle records, passed unchanged to
            ``compute_pairwise_win_fraction``.
        title: Figure title.

    Returns:
        The plotly figure produced by ``px.imshow``; rows are labelled
        "Model A" and columns "Model B".
    """
    win_fractions = compute_pairwise_win_fraction(battles)

    heatmap = px.imshow(
        win_fractions,
        title=title,
        text_auto=".2f",
        color_continuous_scale="RdBu",
    )

    # Column labels go above the matrix; the title is placed low (y=0.07)
    # and horizontally centred.
    layout_options = {
        "xaxis_title": "Model B",
        "yaxis_title": "Model A",
        "xaxis_side": "top",
        "title_y": 0.07,
        "title_x": 0.5,
    }
    heatmap.update_layout(**layout_options)

    hover_text = "Model A: %{y}<br>Model B: %{x}<br>Fraction of A Wins: %{z}<extra></extra>"
    heatmap.update_traces(hovertemplate=hover_text)
    return heatmap
|
|
|
|
|
|
|
def switch_model_a_b(df):
    """Return a copy of ``df`` with the two seats swapped in random rows.

    Each row is swapped independently with probability 0.5, using NumPy's
    global random state (one uniform draw per row, in row order). For a
    swapped row the ``model_a`` and ``model_b`` values are exchanged and a
    ``win`` of ``"model_a"`` / ``"model_b"`` is flipped to the other side;
    any other ``win`` value is left as it is.

    Args:
        df: DataFrame with ``model_a``, ``model_b`` and ``win`` columns.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    df_switch = df.copy()

    # One vectorised draw replaces the per-row loop and consumes the same
    # number of uniform samples from the global generator.
    swap = np.random.rand(len(df)) < 0.5
    if not swap.any():
        return df_switch

    # Boolean-mask assignment is positional, so each row is handled on its
    # own even when the index contains duplicate labels (label-based
    # ``.at[i, ...]`` would overwrite every row sharing the label).
    df_switch.loc[swap, ["model_a", "model_b"]] = df.loc[
        swap, ["model_b", "model_a"]
    ].to_numpy()

    # Both masks are computed from the original outcomes before either is
    # written, so a flipped row is never flipped back.
    a_won = swap & (df["win"] == "model_a").to_numpy()
    b_won = swap & (df["win"] == "model_b").to_numpy()
    df_switch.loc[a_won, "win"] = "model_b"
    df_switch.loc[b_won, "win"] = "model_a"
    return df_switch
|
|
|
|
|
def visualize_battle_count(battles, title):
    """Build a heatmap figure of how many battles each model pair played.

    Args:
        battles: DataFrame with one row per battle and ``model_a`` /
            ``model_b`` columns.
        title: Figure title.

    Returns:
        The plotly figure produced by ``px.imshow``. Cell ``[row, col]`` is
        the number of battles between the two models in either seating
        order; models are ordered by total battle count, highest first.
    """
    # Every model seen in either seat. The pivot only has rows for models
    # that sat as model_a and columns for models that sat as model_b;
    # without squaring it up, ``ptbl + ptbl.T`` aligns on the label union
    # and yields NaN for any model missing from one of the seats.
    models = pd.Index(
        pd.concat([battles["model_a"], battles["model_b"]]).dropna().unique()
    ).sort_values()

    ptbl = pd.pivot_table(
        battles,
        index="model_a",
        columns="model_b",
        aggfunc="size",
        fill_value=0,
    ).reindex(index=models, columns=models, fill_value=0)

    # Adding the transpose merges the (a, b) and (b, a) seatings.
    battle_counts = ptbl + ptbl.T
    ordering = battle_counts.sum().sort_values(ascending=False).index

    fig = px.imshow(
        battle_counts.loc[ordering, ordering],
        title=title,
        text_auto=True,
        width=600,
    )
    fig.update_layout(
        xaxis_title="Model B",
        yaxis_title="Model A",
        xaxis_side="top",
        title_y=0.07,
        title_x=0.5,
    )
    fig.update_traces(hovertemplate="Model A: %{y}<br>Model B: %{x}<br>Count: %{z}<extra></extra>")
    return fig
|
|
|
|
|
|
|
def get_bootstrap_result(battles, func_compute_elo, num_round):
    """Run a rating function on bootstrap resamples of ``battles``.

    Args:
        battles: DataFrame of battle records.
        func_compute_elo: Callable applied to each resampled DataFrame; each
            result becomes one row of the output (e.g. a mapping from model
            name to rating).
        num_round: Number of bootstrap rounds, i.e. rows in the result.

    Returns:
        A DataFrame with one row per round, its columns ordered by their
        median value, highest first.
    """
    per_round = []
    for _ in range(num_round):
        # Same size as the input, drawn with replacement.
        resampled = battles.sample(frac=1.0, replace=True)
        per_round.append(func_compute_elo(resampled))

    ratings = pd.DataFrame(per_round)
    column_order = ratings.median().sort_values(ascending=False).index
    return ratings[column_order]
|
|
|
|
|
def visualize_bootstrap_scores(df, title):
    """Build a scatter figure of median ratings with 95% interval error bars.

    Args:
        df: DataFrame with one column per model and one row per bootstrap
            round (the shape returned by ``get_bootstrap_result``).
        title: Figure title.

    Returns:
        The plotly figure produced by ``px.scatter``: one point per model
        at its median, with asymmetric error bars reaching the 2.5% and
        97.5% quantiles, and the median rounded to two decimals as the
        point's text.
    """
    summary = pd.DataFrame(
        {
            "lower": df.quantile(0.025),
            "rating": df.quantile(0.5),
            "upper": df.quantile(0.975),
        }
    )
    # Move the model names out of the index into a "model" column.
    summary = summary.reset_index(names="model")
    summary = summary.sort_values("rating", ascending=False)

    # Error bars are expressed as distances from the median.
    summary["error_y"] = summary["upper"] - summary["rating"]
    summary["error_y_minus"] = summary["rating"] - summary["lower"]
    summary["rating_rounded"] = np.round(summary["rating"], 2)

    scatter = px.scatter(
        summary,
        x="model",
        y="rating",
        error_y="error_y",
        error_y_minus="error_y_minus",
        text="rating_rounded",
        title=title,
    )
    scatter.update_layout(xaxis_title="Model", yaxis_title="Rating")
    return scatter
|
|
|
|
|
|
|
def visualize_rating_count(df, title):
    """Build a bar chart of how many battles each model took part in.

    A model's count is the number of rows in which it appears as either
    ``model_a`` or ``model_b``.

    Side effect: the figure is also written to ``model_counts.html`` in the
    current working directory (as an HTML fragment that loads plotly.js
    from the CDN).

    Args:
        df: DataFrame with ``model_a`` and ``model_b`` columns.
        title: Figure title.

    Returns:
        The plotly figure produced by ``px.bar``.
    """
    df_all_value_counts = pd.concat([df["model_a"], df["model_b"]]).value_counts()
    fig = px.bar(df_all_value_counts, title=title, text_auto=True)
    fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)

    # Zoom the y-axis onto the band the counts occupy, snapped outward to
    # multiples of 100. The lower bound must come from the minimum and the
    # upper bound from the maximum; deriving them the other way round gives
    # a reversed range that cuts off the bars. Skipped for empty input
    # (min/max would be NaN) and when the band collapses to a single value.
    if not df_all_value_counts.empty:
        y_begin = math.floor(df_all_value_counts.min() / 100) * 100
        y_end = math.ceil(df_all_value_counts.max() / 100) * 100
        if y_begin < y_end:
            fig.update_yaxes(range=[y_begin, y_end])

    fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
    return fig
|
|