import json
import os
import re
from pathlib import Path

import altair as alt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve

import metric
# Shared scoring function provided by the competition's metric module.
_metric = metric._metric
def get_submission(f):
    """Load one team's submission-info JSON into a DataFrame of submissions."""
    with open(f) as fh:
        submission_info = json.load(fh)
    submissions = pd.DataFrame(submission_info["submissions"])
    submissions["team_id"] = submission_info["id"]
    return submissions
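# Illustrative input shape (field names inferred from how they are used below,
# not from a schema shipped with this repo): each submission-info JSON is
# assumed to look roughly like
#   {"id": "<team_id>",
#    "submissions": [{"submission_id": "...", "submission_repo": "...",
#                     "datetime": "...", "status": 3.0,
#                     "public_score": ..., "private_score": ...}, ...]}
# so get_submission returns one row per submission, tagged with its team_id.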
def get_submissions_file(f):
    """Read a per-submission score CSV, coercing stringified scores to floats."""
    submission_df = pd.read_csv(f).set_index("id")
    if isinstance(submission_df.iloc[0]["score"], str):
        # Scores may arrive as JSON-ish strings (e.g. "[1.]"); pad bare trailing
        # decimals so json.loads accepts them, then collapse to a scalar float.
        submission_df.loc[:, "score"] = submission_df.loc[:, "score"].apply(
            lambda a: (
                float(
                    np.array(
                        json.loads(re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", a))
                    ).squeeze()
                )
                if isinstance(a, str)
                else float("nan")
            )
        )
    return submission_df
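# Worked example (illustrative): a bare trailing decimal such as "[1.]" is not
# valid JSON, so the regex pads it before parsing:
#   re.sub(r"\b(\d+)\.(?!\d)", r"\1.0", "[1.]")        -> "[1.0]"
#   float(np.array(json.loads("[1.0]")).squeeze())     -> 1.0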
def load_results(local_dir):
    """Assemble every team's submissions from a competition cache directory."""
    team_file_name = "teams.json"
    team_info = pd.read_json(Path(local_dir) / team_file_name).T
    # Ensure the "baselines" pseudo-team resolves to a display name.
    team_info.loc["baselines", "name"] = "baselines"
submission_info_dir = "submission_info"
submission_info_files = list((Path(local_dir) / submission_info_dir).glob("*.json"))
# submission_info_files += ["baselines/baselines.json"]
submissions = pd.concat(
[get_submission(f) for f in submission_info_files], ignore_index=True
)
submissions.loc[:, "team"] = team_info.loc[
submissions["team_id"].values, "name"
].values
submissions["submission_files"] = submissions.apply(
lambda a: (
str(
Path(local_dir)
/ "submissions"
/ (a["team_id"] + "-" + a["submission_id"] + ".csv")
)
if a["team_id"] != "baselines"
else str(
Path("baselines") / (a["team_id"] + "-" + a["submission_id"] + ".csv")
)
),
axis=1,
)
    # Drop the stored scores; compute_metrics recomputes them from the CSVs.
    submissions = submissions.drop(columns=["public_score", "private_score"])
submissions["submission"] = (
submissions["team"] + " - " + submissions["submission_repo"]
)
return submissions
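# Usage sketch (the cache path is an assumption, mirroring the defaults used
# in __main__ below):
#   submissions = load_results("../competition_cache/temp_task1")
#   submissions[["team", "submission", "status", "submission_files"]].head()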
def compute_metrics(submissions, local_dir, admin=True):
    """Score every successful submission against the solution file."""
    # status == 3.0 is SUCCESS (see status_lookup below).
    submissions = submissions.query("status==3.0")
solution_df = pd.read_csv(Path(local_dir) / "solution.csv").set_index("id")
results = {"private_score": [], "public_score": []}
fields = ["team_id", "team", "submission_id", "submission_repo"]
    # Score each successful submission and tag the result with its metadata.
    for _, row in submissions.iterrows():
        r = get_submissions_file(row["submission_files"])
        scores = _metric(
            solution_df,
            r,
            mode="detailed",
            admin=admin,
            additional_columns=(
                ["augmentation"] if "augmentation" in solution_df.columns else None
            ),
        )
        for m in ["private_score", "public_score"]:
            for f in fields:
                scores[m][f] = row[f]
            scores[m]["submission"] = f"{row.team} - {row.submission_repo}"
            scores[m] = pd.Series(scores[m]).to_frame().T
            results[m].append(scores[m])
for m in ["private_score", "public_score"]:
temp = pd.concat(results[m], ignore_index=True).T
temp.index.name = "metric"
temp = temp.reset_index()
        # One row per (team, repo), keeping that pair's best balanced accuracy.
        results[m] = (
            temp.set_index("metric")
            .T.sort_values("balanced_accuracy", ascending=False)
            .drop_duplicates(subset=["team", "submission_repo"])
        )
        if not admin:
            # Public leaderboard: keep only each team's best-scoring submission.
results[m] = (
results[m]
.sort_values(["team", "balanced_accuracy"], ascending=False)
.drop_duplicates(subset=["team"])
.sort_values("balanced_accuracy", ascending=False)
)
results[m] = results[m].set_index("submission" if admin else "team")
    # Merge the recomputed private-score metrics back onto the submissions table.
    fields_to_merge = [
"generated_accuracy",
"pristine_accuracy",
"balanced_accuracy",
"total_time",
"fail_rate",
]
submissions = pd.concat(
[
submissions.set_index("submission_id"),
results["private_score"]
.reset_index()
.set_index("submission_id")
.loc[:, fields_to_merge],
],
axis=1,
).reset_index()
return results, submissions
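# Assumed contract for metric._metric (inferred from its use above, not from
# its source): with mode="detailed" it returns a dict with "private_score" and
# "public_score" entries, each a mapping that includes at least
# generated_accuracy, pristine_accuracy, balanced_accuracy, total_time and
# fail_rate.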
# Numeric status codes on each submission index into this lookup table.
status_lookup = "NA,QUEUED,PROCESSING,SUCCESS,FAILED".split(",")
def process_data(path, save_path):
    """Build the leaderboard CSVs for one task from its competition cache."""
    submissions = load_results(path)
    submissions["datetime"] = pd.to_datetime(submissions["datetime"])
    submissions["date"] = submissions["datetime"].dt.date
    submissions["status_reason"] = (
        submissions["status"].astype(int).apply(lambda a: status_lookup[a])
    )
submissions.loc[
:, ["submission_id", "datetime", "date", "status", "status_reason"]
].to_csv(save_path + "_submissions.csv")
results, submissions = compute_metrics(submissions, path, admin=False)
cols_to_drop = ["team_id", "submission_id", "submission_repo", "submission"]
results["public_score"].drop(columns=cols_to_drop).to_csv(save_path + ".csv")
if __name__ == "__main__":
    path_to_cache = os.environ.get("COMP_CACHE", "../competition_cache")
    process_data(os.path.join(path_to_cache, "temp_task1"), "task1")
    process_data(os.path.join(path_to_cache, "temp_task2"), "task2")
    process_data(os.path.join(path_to_cache, "temp_task3"), "task3")
    process_data(os.path.join(path_to_cache, "temp_practice"), "practice")
    # Record when this leaderboard refresh ran, in US Eastern time. Using %Z
    # labels the stamp EST or EDT correctly instead of hardcoding "EST".
    from datetime import datetime

    import pytz

    est = pytz.timezone("US/Eastern")
    est_time = datetime.now(est)
    stamp = f"Updated on {est_time.strftime('%Y-%m-%d %H:%M:%S %Z')}"
    with open("updated.txt", "w") as f:
        f.write(stamp)