import pandas as pd
from huggingface_hub import hf_hub_download

def _metric(solution_df, submission_df, mode="top_level", admin=False, additional_columns=None):
    """
    Calculate the accuracy of the submitted predictions against the solution.

    Parameters
    ----------
    solution_df : pandas.DataFrame
        The dataframe containing the solution data.
    submission_df : pandas.DataFrame
        The dataframe containing the submission data.
    mode : str, optional
        The mode of evaluation. Can be "top_level" or "bottom_level". The default is "top_level".
    admin : bool, optional
        If True, score against the original "source_og" column; otherwise use the "source"
        column and anonymize any additional columns. The default is False.
    additional_columns : list of str, optional
        Extra solution columns to break the accuracy down by when mode is not "top_level".
        The default is None.

    Returns
    -------
    dict
        A dictionary with "public_score" and "private_score" entries mapping score names to values.
    """
solution_df["submission_pred"] = submission_df["pred"]
if admin:
source_col = "source_og"
else:
source_col = "source"
cols = ["split","pred", source_col]
solution_df["correct"] = solution_df["pred"] == solution_df["submission_pred"]
acc_all = (
solution_df.groupby(cols)["correct"].mean().reset_index()
.rename(columns={"correct": "accuracy"})
)
acc_all["score_name"] = acc_all["pred"] + "_" + acc_all[source_col]
if additional_columns == None:
additional_columns = []
if not admin:
# annonomize
for c in additional_columns:
vals_lookup = pd.Series({v:f"{c[:1]}_{i:02}" for i,v in enumerate(sorted(solution_df[c].unique()))})
solution_df.loc[:,c] = vals_lookup.loc[solution_df.loc[:,c].values].values

    def acc_by_additional_columns(temp, col):
        # Accuracy broken down by a single additional column, indexed by a prefixed score name.
        temp = temp.groupby(col)["correct"].mean().reset_index().rename(columns={"correct": "accuracy"})
        temp["score_name"] = col[:3] + "_" + temp[col]
        return temp.set_index("score_name")["accuracy"].sort_index()

    def acc_by_source(temp):
        # Per-source accuracies plus the mean accuracy over generated and pristine groups
        # and their balanced accuracy.
        scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
        scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
        scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
        scores_by_source["balanced_accuracy"] = (scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"]) / 2.0
        return scores_by_source
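
    # Scores are computed twice: once on the public split alone, and once on all rows,
    # since the private leaderboard is evaluated on everything (public rows included).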
    evaluation = {}

    split = "public"
    temp = acc_all.query(f"split=='{split}'")
    scores_by_source = acc_by_source(temp)
    if mode == "top_level":
        scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
        evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
    else:
        out = [scores_by_source]
        for col in additional_columns:
            out.append(acc_by_additional_columns(solution_df.query(f"split=='{split}'"), col))
        scores_by_source = pd.concat(out)
        evaluation[f"{split}_score"] = scores_by_source.to_dict()
split = "private"
# private has everything
temp = acc_all
scores_by_source = acc_by_source(temp)
# scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
# scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
# scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
# scores_by_source["balanced_accuracy"] = (scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"])/2.
if mode == "top_level":
scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
else:
out = [scores_by_source]
for col in additional_columns:
out.append(acc_by_additional_columns(solution_df,col))
scores_by_source = pd.concat(out)
evaluation[f"{split}_score"] = scores_by_source.to_dict()
if "time" in submission_df.columns:
solution_df["submission_time"] = submission_df["time"]
split = "public"
evaluation[f"{split}_score"]["total_time"] = float(solution_df.query(f"split=='{split}'")["submission_time"].sum())
split = "private"
evaluation[f"{split}_score"]["total_time"] = float(solution_df["submission_time"].sum())
else:
for split in ["public","private"]:
evaluation[f"{split}_score"]["total_time"] = -1
if "score" in submission_df.columns:
solution_df["submission_score"] = submission_df["score"]
split = "public"
evaluation[f"{split}_score"]["fail_rate"] = float(solution_df.query(f"split=='{split}'")["submission_score"].isna().mean())
split = "private"
evaluation[f"{split}_score"]["fail_rate"] = float(solution_df["submission_score"].isna().mean())
else:
for split in ["public","private"]:
evaluation[f"{split}_score"]["fail_rate"] = -1
return evaluation


def compute(params):
    # Download the solution file and the team's submission from the competition dataset
    # repo on the Hugging Face Hub, align both on the submission id column, and score
    # the submission with _metric.
    solution_file = hf_hub_download(
        repo_id=params.competition_id,
        filename="solution.csv",
        token=params.token,
        repo_type="dataset",
    )
    solution_df = pd.read_csv(solution_file).set_index(params.submission_id_col)
    submission_filename = f"submissions/{params.team_id}-{params.submission_id}.csv"
    submission_file = hf_hub_download(
        repo_id=params.competition_id,
        filename=submission_filename,
        token=params.token,
        repo_type="dataset",
    )
    submission_df = pd.read_csv(submission_file).set_index(params.submission_id_col)
    return _metric(solution_df, submission_df)
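

# A minimal local sketch of how _metric can be exercised, assuming a solution frame with
# "pred", "split", and "source" columns and a submission frame with a matching index and a
# "pred" column. The ids, source names, and labels below are made up for illustration; this
# block is not part of the competition scoring pipeline.
if __name__ == "__main__":
    toy_solution = pd.DataFrame(
        {
            "pred": ["generated", "pristine", "generated", "pristine"],
            "split": ["public", "public", "private", "private"],
            "source": ["source_a", "source_b", "source_a", "source_b"],
        },
        index=["id_0", "id_1", "id_2", "id_3"],
    )
    toy_submission = pd.DataFrame(
        {"pred": ["generated", "generated", "generated", "pristine"]},
        index=["id_0", "id_1", "id_2", "id_3"],
    )
    # Expect a dict with "public_score" and "private_score" entries holding the
    # generated/pristine/balanced accuracies plus the total_time and fail_rate placeholders.
    print(_metric(toy_solution, toy_submission))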