loodvanniekerkginkgo committed on
Commit 069fb2c · 1 Parent(s): 0fdb208

Added some changes to filtering / dedup submissions

Files changed (4):
  1. app.py +8 -2
  2. constants.py +1 -0
  3. notebooks/pIgGen_example.ipynb +29 -3
  4. utils.py +17 -5
app.py CHANGED
@@ -21,6 +21,9 @@ from utils import fetch_hf_results, show_output_box
 
 
 def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
+    """
+    Format the dataframe for display on the leaderboard. The dataframe comes from utils.fetch_hf_results().
+    """
     df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
     if assay is not None:
         df = df[df["assay"] == assay]
@@ -48,9 +51,9 @@ def get_leaderboard_object(assay: str | None = None):
     # Note(Lood): Would be nice to make it clear that the Search Column is searching on model name
     lb = Leaderboard(
         value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
-        datatype=["str", "str", "str", "number"],
+        datatype=["str", "str", "str", "number", "str"],
         select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
-            ["model", "property", "spearman", "dataset"]
+            ["model", "property", "spearman", "dataset", "user"]
         ),
         search_columns=["Model Name"],
         filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
@@ -139,6 +142,9 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
         """
         # Overall Leaderboard (filter below by property)
        Each property has its own prize, and participants can submit models for any combination of properties.
+
+        **Note**: It is trivial to overfit the public GDPa1 dataset, which results in very high Spearman correlations.
+        We would suggest training using cross-validation a limited number of times to give a better indication of the model's performance on the eventual private test set.
         """
     )
     lb = get_leaderboard_object()
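An aside on the filtering line in format_leaderboard_table: pandas' DataFrame.query resolves names prefixed with @ from the surrounding Python scope, so @ASSAY_RENAME.keys() refers to the rename dict imported from constants.py. A minimal illustration with toy data (the ASSAY_RENAME entries here are hypothetical; the real mapping lives in constants.py):

import pandas as pd

# Hypothetical subset of ASSAY_RENAME; the real keys/values are in constants.py
ASSAY_RENAME = {"HIC": "Hydrophobicity (HIC)", "Tm2": "Thermostability (Tm2)"}

df_results = pd.DataFrame(
    {"assay": ["HIC", "Tm2", "unlisted_assay"], "spearman": [0.7, 0.5, 0.9]}
)

# "@" pulls ASSAY_RENAME from the enclosing scope; rows whose assay is not a
# key of the dict are dropped, exactly as in format_leaderboard_table
df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
print(df)  # keeps the HIC and Tm2 rows only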
constants.py CHANGED
@@ -95,6 +95,7 @@ LEADERBOARD_COLUMNS_RENAME = {
     "model": "Model Name",
     "property": "Property",
 }
+BASELINE_USERNAMES = ["loodvanniekerkginkgo"]
 
 
 def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
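The body of LEADERBOARD_COLUMNS_RENAME_LIST is not part of this diff; a minimal sketch of what it presumably does, given the two visible entries of LEADERBOARD_COLUMNS_RENAME (all other entries are assumptions):

# Hypothetical sketch only: maps raw result columns to display names.
# Only the "model" and "property" entries are visible in this diff.
LEADERBOARD_COLUMNS_RENAME = {
    "model": "Model Name",
    "property": "Property",
    # ... further entries (e.g. "spearman", "dataset", "user") assumed
}

def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
    # Fall back to the raw column name if no display alias is defined
    return [LEADERBOARD_COLUMNS_RENAME.get(col, col) for col in columns]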
notebooks/pIgGen_example.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "7c6c914c",
    "metadata": {},
    "outputs": [],
@@ -21,10 +21,32 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 10,
   "id": "00cfd012",
   "metadata": {},
   "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Index(['antibody_id', 'antibody_name', 'Titer', 'Purity', 'SEC %Monomer',\n",
+     "       'SMAC', 'HIC', 'HAC', 'PR_CHO', 'PR_Ova', 'AC-SINS_pH6.0',\n",
+     "       'AC-SINS_pH7.4', 'Tonset', 'Tm1', 'Tm2', 'hc_subtype', 'lc_subtype',\n",
+     "       'highest_clinical_trial_asof_feb2025', 'est_status_asof_feb2025',\n",
+     "       'vh_protein_sequence', 'hc_protein_sequence', 'hc_dna_sequence',\n",
+     "       'vl_protein_sequence', 'lc_protein_sequence', 'lc_dna_sequence',\n",
+     "       'hierarchical_cluster_fold', 'random_fold',\n",
+     "       'hierarchical_cluster_IgG_isotype_stratified_fold', 'light_aligned_aho',\n",
+     "       'heavy_aligned_aho'],\n",
+     "      dtype='object')\n",
+     "Titer             7\n",
+     "HIC               4\n",
+     "PR_CHO           49\n",
+     "Tm2              53\n",
+     "AC-SINS_pH7.4     4\n",
+     "dtype: int64\n"
+    ]
+   },
    {
     "data": {
      "text/html": [
@@ -276,7 +298,7 @@
      "[5 rows x 30 columns]"
     ]
    },
-   "execution_count": 2,
+   "execution_count": 10,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -285,6 +307,10 @@
    "model_name = \"ollieturnbull/p-IgGen\"\n",
    "df = load_dataset(\"ginkgo-datapoints/GDPa1\")[\"train\"].to_pandas()\n",
    "\n",
+   "print(df.columns)\n",
+   "# Show number of NaNs per assay\n",
+   "print(df[[\"Titer\", \"HIC\", \"PR_CHO\", \"Tm2\", 'AC-SINS_pH7.4']].isna().sum())\n",
+   "\n",
    "# Example: Just predict HIC, so we'll drop NaN rows for that\n",
    "df = df.dropna(subset=[\"HIC\"])\n",
    "df.head()"
utils.py CHANGED
@@ -3,7 +3,7 @@ from datasets import load_dataset
 import gradio as gr
 import hashlib
 from typing import Iterable, Union
-from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
+from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS, BASELINE_USERNAMES
 
 pd.set_option("display.max_columns", None)
 
@@ -26,13 +26,25 @@ def fetch_hf_results():
     assert all(
         col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
     ), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
+
+    df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
+    df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]
     # Show latest submission only
-    df = df.sort_values("submission_time", ascending=False).drop_duplicates(
-        subset=["model", "assay", "user"], keep="first"
+    # For baselines: Keep unique model names
+    df_baseline = df_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
+        subset=["model", "assay", "dataset", "user"], keep="first"
+    )
+    # For users: Just show latest submission
+    df_non_baseline = df_non_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
+        subset=["assay", "dataset", "user"], keep="first"
     )
+    df = pd.concat([df_baseline, df_non_baseline], ignore_index=True)
     df["property"] = df["assay"].map(ASSAY_RENAME)
-
-    # Anonymize the user column at this point
+
+    # Rename baseline username to just "Baseline"
+    df.loc[df["user"].isin(BASELINE_USERNAMES), "user"] = "Baseline"
+    # Note: Could optionally add a column "is_baseline" to the dataframe to indicate whether the model is a baseline model or not. If things get crowded.
+    # Anonymize the user column at this point (so note: users can submit anonymous / non-anonymous and we'll show their latest submission regardless)
     df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
 
     return df
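To make the new dedup behaviour concrete, here is a small self-contained sketch of the same logic on toy data (column names are taken from the diff; the usernames, model names, and timestamps are illustrative): baseline rows keep one entry per model name, while each non-baseline user keeps only their single latest submission per assay and dataset.

import pandas as pd

BASELINE_USERNAMES = ["loodvanniekerkginkgo"]

# Toy submissions: the baseline user uploads two different models,
# while "alice" (hypothetical) resubmits for the same assay/dataset twice.
df = pd.DataFrame(
    {
        "user": ["loodvanniekerkginkgo", "loodvanniekerkginkgo", "alice", "alice"],
        "model": ["baseline-a", "baseline-b", "my-model-v1", "my-model-v2"],
        "assay": ["HIC"] * 4,
        "dataset": ["GDPa1"] * 4,
        "submission_time": pd.to_datetime(
            ["2025-01-01", "2025-01-02", "2025-01-03", "2025-01-04"]
        ),
    }
)

df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]

# Baselines: dedup on (model, assay, dataset, user) -> both baseline models survive
df_baseline = df_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
    subset=["model", "assay", "dataset", "user"], keep="first"
)
# Users: dedup on (assay, dataset, user) -> only alice's latest submission survives
df_non_baseline = df_non_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
    subset=["assay", "dataset", "user"], keep="first"
)

print(pd.concat([df_baseline, df_non_baseline], ignore_index=True))
# Expected rows: baseline-a, baseline-b, and my-model-v2 (alice's latest)

The design choice this illustrates: because non-baseline dedup ignores the model column, a user who resubmits under a new model name still replaces their earlier leaderboard entry, which prevents one participant from occupying multiple rows for the same assay.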