# leaderboard-public / metric.py
import pandas as pd
from huggingface_hub import hf_hub_download


def _metric(solution_df, submission_df, mode="top_level", admin=False, additional_columns=None):
    """
    Calculate the accuracy of the generated predictions.

    Parameters
    ----------
    solution_df : pandas.DataFrame
        The dataframe containing the solution data.
    submission_df : pandas.DataFrame
        The dataframe containing the submission data. Rows are aligned with
        ``solution_df`` by index.
    mode : str, optional
        The mode of evaluation. Can be "top_level" or "bottom_level".
        The default is "top_level".
    admin : bool, optional
        If True, group by the un-anonymized "source_og" column instead of
        "source". The default is False.
    additional_columns : list of str, optional
        Extra solution columns to break accuracy down by when ``mode`` is not
        "top_level". The default is None.

    Returns
    -------
    dict
        A dictionary with "public_score" and "private_score" entries, each
        mapping score names to accuracy values, plus "total_time" and
        "fail_rate". A small, illustrative example at the bottom of this file
        shows the expected input format.
    """
solution_df["submission_pred"] = submission_df["pred"]
if admin:
source_col = "source_og"
else:
source_col = "source"
cols = ["split","pred", source_col]
solution_df["correct"] = solution_df["pred"] == solution_df["submission_pred"]
acc_all = (
solution_df.groupby(cols)["correct"].mean().reset_index()
.rename(columns={"correct": "accuracy"})
)
acc_all["score_name"] = acc_all["pred"] + "_" + acc_all[source_col]
if additional_columns == None:
additional_columns = []
if not admin:
# annonomize
for c in additional_columns:
vals_lookup = pd.Series({v:f"{c[:1]}_{i:02}" for i,v in enumerate(sorted(solution_df[c].unique()))})
solution_df.loc[:,c] = vals_lookup.loc[solution_df.loc[:,c].values].values
    def acc_by_additional_columns(temp, col):
        # Mean accuracy per value of an additional column, indexed as "<col[:3]>_<value>".
        temp = temp.groupby(col)["correct"].mean().reset_index().rename(columns={"correct": "accuracy"})
        temp["score_name"] = col[:3] + "_" + temp[col]
        return temp.set_index("score_name")["accuracy"].sort_index()

    def acc_by_source(temp):
        # Per-source accuracies plus class-level and balanced-accuracy summaries.
        scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
        scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
        scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
        scores_by_source["balanced_accuracy"] = (
            scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"]
        ) / 2.0
        return scores_by_source
    evaluation = {}

    # Public split: only rows with split == "public".
    split = "public"
    temp = acc_all.query(f"split=='{split}'")
    scores_by_source = acc_by_source(temp)
    if mode == "top_level":
        scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
        evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
    else:
        out = [scores_by_source]
        for col in additional_columns:
            out.append(acc_by_additional_columns(solution_df.query(f"split=='{split}'"), col))
        scores_by_source = pd.concat(out)
        evaluation[f"{split}_score"] = scores_by_source.to_dict()
split = "private"
# private has everything
temp = acc_all
scores_by_source = acc_by_source(temp)
# scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
# scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
# scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
# scores_by_source["balanced_accuracy"] = (scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"])/2.
if mode == "top_level":
scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
else:
out = [scores_by_source]
for col in additional_columns:
out.append(acc_by_additional_columns(solution_df,col))
scores_by_source = pd.concat(out)
evaluation[f"{split}_score"] = scores_by_source.to_dict()
if "time" in submission_df.columns:
solution_df["submission_time"] = submission_df["time"]
split = "public"
evaluation[f"{split}_score"]["total_time"] = float(solution_df.query(f"split=='{split}'")["submission_time"].sum())
split = "private"
evaluation[f"{split}_score"]["total_time"] = float(solution_df["submission_time"].sum())
else:
for split in ["public","private"]:
evaluation[f"{split}_score"]["total_time"] = -1
if "score" in submission_df.columns:
solution_df["submission_score"] = submission_df["score"]
split = "public"
evaluation[f"{split}_score"]["fail_rate"] = float(solution_df.query(f"split=='{split}'")["submission_score"].isna().mean())
split = "private"
evaluation[f"{split}_score"]["fail_rate"] = float(solution_df["submission_score"].isna().mean())
else:
for split in ["public","private"]:
evaluation[f"{split}_score"]["fail_rate"] = -1
return evaluation


def compute(params):
    """
    Download the solution file and the team's submission from the competition
    dataset repo, align them on the submission id column, and return the
    score dictionary produced by ``_metric``.
    """
    solution_file = hf_hub_download(
        repo_id=params.competition_id,
        filename="solution.csv",
        token=params.token,
        repo_type="dataset",
    )
    solution_df = pd.read_csv(solution_file).set_index(params.submission_id_col)

    submission_filename = f"submissions/{params.team_id}-{params.submission_id}.csv"
    submission_file = hf_hub_download(
        repo_id=params.competition_id,
        filename=submission_filename,
        token=params.token,
        repo_type="dataset",
    )
    submission_df = pd.read_csv(submission_file).set_index(params.submission_id_col)

    return _metric(solution_df, submission_df)
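

if __name__ == "__main__":
    # Minimal, illustrative smoke test (not part of the competition pipeline).
    # The column names ("pred", "split", "source") are the ones _metric expects;
    # the row values below are made up, and the real solution/submission files
    # may use different labels.
    toy_solution = pd.DataFrame({
        "pred":   ["generated", "pristine", "generated", "pristine"],
        "split":  ["public",    "public",   "private",   "private"],
        "source": ["model_x",   "camera_y", "model_x",   "camera_y"],
    })
    toy_submission = pd.DataFrame({"pred": ["generated", "generated", "generated", "pristine"]})
    # Rows are aligned by index; both toy frames share the default RangeIndex here.
    print(_metric(toy_solution, toy_submission))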