import json
import os
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import pytz

import metric

# Scoring callable exposed by the competition's local `metric` module.
_metric = metric._metric


def get_submission(f):
    """Load one submission-info JSON file and return its submissions as a DataFrame."""
    with open(f) as fh:
        submission_info = json.load(fh)
    submissions = pd.DataFrame(submission_info["submissions"])
    submissions["team_id"] = submission_info["id"]
    return submissions
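
# Expected shape of a submission-info file, inferred from the loader above
# (illustrative, not a documented schema):
#   {"id": "<team_id>",
#    "submissions": [{"submission_id": "...", "submission_repo": "...",
#                     "datetime": "...", "status": 3, ...}, ...]}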


def get_submissions_file(f):
    """Load a submission CSV, coercing stringified score arrays to floats."""
    submission_df = pd.read_csv(f).set_index("id")
    if isinstance(submission_df.iloc[0]["score"], str):
        # Scores were serialized as JSON-like arrays; bare decimals such as
        # "1." are invalid JSON, so pad them to "1.0" before parsing.
        submission_df.loc[:, "score"] = submission_df.loc[:, "score"].apply(
            lambda a: (
                float(
                    np.array(
                        json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))
                    ).squeeze()
                )
                if isinstance(a, str)
                else float("nan")
            )
        )
    return submission_df
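
# A quick illustration of the bare-decimal repair above (hypothetical input):
#   re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", "[0.5, 1., 2.]") == "[0.5, 1.0, 2.0]"
# which json.loads can then parse into a list of floats.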


def load_results(local_dir):
    """Assemble a submissions DataFrame from the cached team and submission info."""
    team_info = pd.read_json(Path(local_dir) / "teams.json").T
    # Ensure a "baselines" row exists so baseline submissions resolve to a team name.
    team_info.loc["baselines", "name"] = "baselines"
    submission_info_files = list((Path(local_dir) / "submission_info").glob("*.json"))

    submissions = pd.concat(
        [get_submission(f) for f in submission_info_files], ignore_index=True
    )
    submissions.loc[:, "team"] = team_info.loc[
        submissions["team_id"].values, "name"
    ].values

    # Team submissions live under <local_dir>/submissions/; baseline submissions
    # are read from a local "baselines" directory instead.
    submissions["submission_files"] = submissions.apply(
        lambda a: (
            str(
                Path(local_dir)
                / "submissions"
                / (a["team_id"] + "-" + a["submission_id"] + ".csv")
            )
            if a["team_id"] != "baselines"
            else str(
                Path("baselines") / (a["team_id"] + "-" + a["submission_id"] + ".csv")
            )
        ),
        axis=1,
    )
    # Drop the platform-reported scores; compute_metrics recomputes them locally.
    submissions = submissions.drop(columns=["public_score", "private_score"])
    submissions["submission"] = (
        submissions["team"] + " - " + submissions["submission_repo"]
    )
    return submissions
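
# Expected cache layout, inferred from the loaders above (an assumption, not a
# documented contract):
#   <local_dir>/teams.json
#   <local_dir>/submission_info/*.json
#   <local_dir>/submissions/<team_id>-<submission_id>.csv
#   <local_dir>/solution.csv
#   baselines/baselines-<submission_id>.csv  (relative to the working directory)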


def compute_metrics(submissions, local_dir, admin=True):
    """Re-score successful submissions against the solution file."""
    # Status 3 corresponds to SUCCESS in status_lookup below.
    submissions = submissions.query("status==3.0")

    solution_df = pd.read_csv(Path(local_dir) / "solution.csv").set_index("id")

    results = {"private_score": [], "public_score": []}

    fields = ["team_id", "team", "submission_id", "submission_repo"]
    for _, row in submissions.iterrows():
        r = get_submissions_file(row["submission_files"])
        scores = _metric(
            solution_df,
            r,
            mode="detailed",
            admin=admin,
            additional_columns=(
                ["augmentation"] if "augmentation" in solution_df.columns else None
            ),
        )
        for m in ["private_score", "public_score"]:
            for f in fields:
                scores[m][f] = row[f]
            scores[m]["submission"] = f"{row.team} - {row.submission_repo}"

            scores[m] = pd.Series(scores[m]).to_frame().T
            results[m].append(scores[m])

    for m in ["private_score", "public_score"]:
        temp = pd.concat(results[m], ignore_index=True).T
        temp.index.name = "metric"
        temp = temp.reset_index()

        # Keep the best-scoring entry per (team, repo) pair.
        results[m] = (
            temp.set_index("metric")
            .T.sort_values("balanced_accuracy", ascending=False)
            .drop_duplicates(subset=["team", "submission_repo"])
        )

        if not admin:
            # Public leaderboards show only each team's best submission.
            results[m] = (
                results[m]
                .sort_values(["team", "balanced_accuracy"], ascending=False)
                .drop_duplicates(subset=["team"])
                .sort_values("balanced_accuracy", ascending=False)
            )

        results[m] = results[m].set_index("submission" if admin else "team")

    fields_to_merge = [
        "generated_accuracy",
        "pristine_accuracy",
        "balanced_accuracy",
        "total_time",
        "fail_rate",
    ]

    # Attach the recomputed private-score metrics back onto the submissions table.
    submissions = pd.concat(
        [
            submissions.set_index("submission_id"),
            results["private_score"]
            .reset_index()
            .set_index("submission_id")
            .loc[:, fields_to_merge],
        ],
        axis=1,
    ).reset_index()

    return results, submissions
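
# Typical use, assuming a populated cache directory (see the layout note above):
#   submissions = load_results("../competition_cache/temp_task1")
#   results, submissions = compute_metrics(
#       submissions, "../competition_cache/temp_task1", admin=False
#   )
#   results["public_score"]  # one row per team, best balanced_accuracy first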


# Numeric submission statuses index into this list (so status 3 == SUCCESS,
# which is what compute_metrics filters on).
status_lookup = "NA,QUEUED,PROCESSING,SUCCESS,FAILED".split(",")


def process_data(path, save_path):
    """Score one task's cached submissions and write the leaderboard CSVs."""
    submissions = load_results(path)
    submissions["datetime"] = pd.to_datetime(submissions["datetime"])
    submissions["date"] = submissions["datetime"].dt.date
    submissions["status_reason"] = (
        submissions["status"].astype(int).apply(lambda a: status_lookup[a])
    )
    submissions.loc[
        :, ["submission_id", "datetime", "date", "status", "status_reason"]
    ].to_csv(save_path + "_submissions.csv")

    results, submissions = compute_metrics(submissions, path, admin=False)
    cols_to_drop = ["team_id", "submission_id", "submission_repo", "submission"]
    results["public_score"].drop(columns=cols_to_drop).to_csv(save_path + ".csv")


if __name__ == "__main__":
    path_to_cache = os.environ.get("COMP_CACHE", "../competition_cache")
    process_data(os.path.join(path_to_cache, "temp_task1"), "task1")
    process_data(os.path.join(path_to_cache, "temp_task2"), "task2")
    process_data(os.path.join(path_to_cache, "temp_task3"), "task3")
    process_data(os.path.join(path_to_cache, "temp_practice"), "practice")

    # Record the refresh time in US Eastern time; %Z picks the correct
    # EST/EDT abbreviation rather than a hardcoded "EST".
    est = pytz.timezone("US/Eastern")
    est_time = datetime.now(est)
    today = f"Updated on {est_time.strftime('%Y-%m-%d %H:%M:%S %Z')}"
    with open("updated.txt", "w") as f:
        f.write(today)
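
# Example invocation (the script name here is hypothetical):
#   COMP_CACHE=/path/to/competition_cache python process_results.py
# writes task1.csv, task1_submissions.csv, ... plus updated.txt in the
# current directory.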