import pandas as pd

from huggingface_hub import hf_hub_download


def _metric(solution_df, submission_df, mode="top_level", admin=False, additional_columns=None):
    """
    Calculate the accuracy of the submitted predictions.

    Parameters
    ----------
    solution_df : pandas.DataFrame
        The dataframe containing the solution data.
    submission_df : pandas.DataFrame
        The dataframe containing the submission data.
    mode : str, optional
        The mode of evaluation. Can be "top_level" or "bottom_level". The default is "top_level".
    admin : bool, optional
        If True, group accuracies by the "source_og" column instead of "source" and keep the
        additional columns un-anonymised. The default is False.
    additional_columns : list of str, optional
        Extra solution columns to report per-value accuracies for when mode is not "top_level".
        The default is None.

    Returns
    -------
    dict
        The evaluation results, keyed by "public_score" and "private_score".
    """
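    # Both dataframes are indexed by the submission id column (see compute()),
    # so this assignment aligns the submitted predictions with the solution rows.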
    solution_df["submission_pred"] = submission_df["pred"]

    if admin:
        source_col = "source_og"
    else:
        source_col = "source"

    cols = ["split", "pred", source_col]
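    # Accuracy table: mean correctness for every (split, prediction label, source) group.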
    solution_df["correct"] = solution_df["pred"] == solution_df["submission_pred"]
    acc_all = (
        solution_df.groupby(cols)["correct"].mean().reset_index()
        .rename(columns={"correct": "accuracy"})
    )
    acc_all["score_name"] = acc_all["pred"] + "_" + acc_all[source_col]
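    # Default to no additional breakdown columns; for non-admin scoring, the raw values of
    # any additional columns are replaced with anonymised codes (first letter of the column
    # name plus a two-digit index over the sorted unique values).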
    if additional_columns is None:
        additional_columns = []

    if not admin:
        for c in additional_columns:
            vals_lookup = pd.Series({v: f"{c[:1]}_{i:02}" for i, v in enumerate(sorted(solution_df[c].unique()))})
            solution_df.loc[:, c] = vals_lookup.loc[solution_df.loc[:, c].values].values
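    # Accuracy broken down by the values of a single additional column,
    # indexed as "<first three letters of the column name>_<value>".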
    def acc_by_additional_columns(temp, col):
        temp = temp.groupby(col)["correct"].mean().reset_index().rename(columns={"correct": "accuracy"})
        temp["score_name"] = col[:3] + "_" + temp[col]
        return temp.set_index("score_name")["accuracy"].sort_index()
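    # Per-source accuracies plus the mean accuracy of the generated and pristine groups
    # and their average, reported as "balanced_accuracy".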
    def acc_by_source(temp):
        scores_by_source = temp.set_index("score_name")["accuracy"].sort_index()
        scores_by_source["generated_accuracy"] = temp.query("pred=='generated'")["accuracy"].mean()
        scores_by_source["pristine_accuracy"] = temp.query("pred=='pristine'")["accuracy"].mean()
        scores_by_source["balanced_accuracy"] = (scores_by_source["generated_accuracy"] + scores_by_source["pristine_accuracy"]) / 2.0
        return scores_by_source
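    # Public score: restricted to the groups computed from the public split.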
    evaluation = {}

    split = "public"
    temp = acc_all.query(f"split=='{split}'")
    scores_by_source = acc_by_source(temp)
    if mode == "top_level":
        scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
        evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
    else:
        out = [scores_by_source]
        for col in additional_columns:
            out.append(acc_by_additional_columns(solution_df.query(f"split=='{split}'"), col))
        scores_by_source = pd.concat(out)
        evaluation[f"{split}_score"] = scores_by_source.to_dict()
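    # Private score: computed from the unfiltered accuracy table, i.e. it aggregates the
    # groups of both splits rather than only the private rows.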
    split = "private"
    temp = acc_all
    scores_by_source = acc_by_source(temp)
    if mode == "top_level":
        scores_to_save = ["generated_accuracy", "pristine_accuracy", "balanced_accuracy"]
        evaluation[f"{split}_score"] = scores_by_source.loc[scores_to_save].to_dict()
    else:
        out = [scores_by_source]
        for col in additional_columns:
            out.append(acc_by_additional_columns(solution_df, col))
        scores_by_source = pd.concat(out)
        evaluation[f"{split}_score"] = scores_by_source.to_dict()
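    # If the submission reports per-row inference times, total them (public rows for the
    # public score, all rows for the private score); otherwise flag the value as -1.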
    if "time" in submission_df.columns:
        solution_df["submission_time"] = submission_df["time"]

        split = "public"
        evaluation[f"{split}_score"]["total_time"] = float(solution_df.query(f"split=='{split}'")["submission_time"].sum())

        split = "private"
        evaluation[f"{split}_score"]["total_time"] = float(solution_df["submission_time"].sum())
    else:
        for split in ["public", "private"]:
            evaluation[f"{split}_score"]["total_time"] = -1
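    # If the submission reports a per-row score, use the share of missing values as the
    # failure rate (public rows vs. all rows); otherwise flag the value as -1.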
    if "score" in submission_df.columns:
        solution_df["submission_score"] = submission_df["score"]

        split = "public"
        evaluation[f"{split}_score"]["fail_rate"] = float(solution_df.query(f"split=='{split}'")["submission_score"].isna().mean())

        split = "private"
        evaluation[f"{split}_score"]["fail_rate"] = float(solution_df["submission_score"].isna().mean())
    else:
        for split in ["public", "private"]:
            evaluation[f"{split}_score"]["fail_rate"] = -1
    return evaluation


def compute(params):
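    """
    Download the solution and the team's submission from the competition dataset repo,
    align them on the submission id column, and return the evaluation from _metric.
    """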
    solution_file = hf_hub_download(
        repo_id=params.competition_id,
        filename="solution.csv",
        token=params.token,
        repo_type="dataset",
    )
    solution_df = pd.read_csv(solution_file).set_index(params.submission_id_col)
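    # Submissions live in the same competition repo under "submissions/<team_id>-<submission_id>.csv".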
    submission_filename = f"submissions/{params.team_id}-{params.submission_id}.csv"
    submission_file = hf_hub_download(
        repo_id=params.competition_id,
        filename=submission_filename,
        token=params.token,
        repo_type="dataset",
    )
    submission_df = pd.read_csv(submission_file).set_index(params.submission_id_col)

    return _metric(solution_df, submission_df)
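

if __name__ == "__main__":
    # Minimal local smoke test for _metric; it is not part of the competition pipeline.
    # The id, split, pred and source values below are made-up examples that only follow
    # the column layout the metric expects ("pred" holding "generated"/"pristine" labels).
    ids = list(range(8))
    solution = pd.DataFrame(
        {
            "id": ids,
            "split": ["public"] * 4 + ["private"] * 4,
            "pred": ["generated", "pristine"] * 4,
            "source": ["model_a", "camera_b"] * 4,
        }
    ).set_index("id")
    submission = pd.DataFrame(
        {
            "id": ids,
            # The last two predictions are deliberately wrong so the accuracies differ.
            "pred": ["generated", "pristine"] * 3 + ["pristine", "generated"],
        }
    ).set_index("id")
    print(_metric(solution, submission))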