loodvanniekerkginkgo committed on
Commit 069fb2c · 1 Parent(s): 0fdb208

Added some changes to filtering / dedup submissions

Files changed (4):
  1. app.py +8 -2
  2. constants.py +1 -0
  3. notebooks/pIgGen_example.ipynb +29 -3
  4. utils.py +17 -5
app.py CHANGED
@@ -21,6 +21,9 @@ from utils import fetch_hf_results, show_output_box
 
 
 def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
+    """
+    Format the dataframe for display on the leaderboard. The dataframe comes from utils.fetch_hf_results().
+    """
     df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
     if assay is not None:
         df = df[df["assay"] == assay]
@@ -48,9 +51,9 @@ def get_leaderboard_object(assay: str | None = None):
     # Note(Lood): Would be nice to make it clear that the Search Column is searching on model name
     lb = Leaderboard(
         value=format_leaderboard_table(df_results=current_dataframe, assay=assay),
-        datatype=["str", "str", "str", "number"],
+        datatype=["str", "str", "str", "number", "str"],
         select_columns=LEADERBOARD_COLUMNS_RENAME_LIST(
-            ["model", "property", "spearman", "dataset"]
+            ["model", "property", "spearman", "dataset", "user"]
         ),
         search_columns=["Model Name"],
         filter_columns=LEADERBOARD_COLUMNS_RENAME_LIST(filter_columns),
@@ -139,6 +142,9 @@ with gr.Blocks(theme=gr.themes.Default(text_size=sizes.text_lg)) as demo:
         """
         # Overall Leaderboard (filter below by property)
        Each property has its own prize, and participants can submit models for any combination of properties.
+
+        **Note**: It is trivial to overfit the public GDPa1 dataset, which results in very high Spearman correlations.
+        We would suggest training using cross-validation a limited number of times to give a better indication of the model's performance on the eventual private test set.
         """
     )
     lb = get_leaderboard_object()
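An aside on the filtering line in format_leaderboard_table: pandas' DataFrame.query resolves names prefixed with @ from the surrounding Python scope, so @ASSAY_RENAME.keys() refers to the rename dict imported from constants.py. A minimal illustration with toy data (the ASSAY_RENAME entries here are hypothetical; the real mapping lives in constants.py):

import pandas as pd

# Hypothetical subset of ASSAY_RENAME; the real keys/values are in constants.py
ASSAY_RENAME = {"HIC": "Hydrophobicity (HIC)", "Tm2": "Thermostability (Tm2)"}

df_results = pd.DataFrame(
    {"assay": ["HIC", "Tm2", "unlisted_assay"], "spearman": [0.7, 0.5, 0.9]}
)

# "@" pulls ASSAY_RENAME from the enclosing scope; rows whose assay is not a
# key of the dict are dropped, exactly as in format_leaderboard_table
df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
print(df)  # keeps the HIC and Tm2 rows only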
constants.py CHANGED
@@ -95,6 +95,7 @@ LEADERBOARD_COLUMNS_RENAME = {
     "model": "Model Name",
     "property": "Property",
 }
+BASELINE_USERNAMES = ["loodvanniekerkginkgo"]
 
 
 def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
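The body of LEADERBOARD_COLUMNS_RENAME_LIST is not part of this diff; a minimal sketch of what it presumably does, given the two visible entries of LEADERBOARD_COLUMNS_RENAME (all other entries are assumptions):

# Hypothetical sketch only: maps raw result columns to display names.
# Only the "model" and "property" entries are visible in this diff.
LEADERBOARD_COLUMNS_RENAME = {
    "model": "Model Name",
    "property": "Property",
    # ... further entries (e.g. "spearman", "dataset", "user") assumed
}

def LEADERBOARD_COLUMNS_RENAME_LIST(columns: list[str]) -> list[str]:
    # Fall back to the raw column name if no display alias is defined
    return [LEADERBOARD_COLUMNS_RENAME.get(col, col) for col in columns]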
notebooks/pIgGen_example.ipynb CHANGED
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "7c6c914c",
    "metadata": {},
    "outputs": [],
@@ -21,10 +21,32 @@
  },
  {
   "cell_type": "code",
-  "execution_count": null,
+  "execution_count": 10,
   "id": "00cfd012",
   "metadata": {},
   "outputs": [
+   {
+    "name": "stdout",
+    "output_type": "stream",
+    "text": [
+     "Index(['antibody_id', 'antibody_name', 'Titer', 'Purity', 'SEC %Monomer',\n",
+     "       'SMAC', 'HIC', 'HAC', 'PR_CHO', 'PR_Ova', 'AC-SINS_pH6.0',\n",
+     "       'AC-SINS_pH7.4', 'Tonset', 'Tm1', 'Tm2', 'hc_subtype', 'lc_subtype',\n",
+     "       'highest_clinical_trial_asof_feb2025', 'est_status_asof_feb2025',\n",
+     "       'vh_protein_sequence', 'hc_protein_sequence', 'hc_dna_sequence',\n",
+     "       'vl_protein_sequence', 'lc_protein_sequence', 'lc_dna_sequence',\n",
+     "       'hierarchical_cluster_fold', 'random_fold',\n",
+     "       'hierarchical_cluster_IgG_isotype_stratified_fold', 'light_aligned_aho',\n",
+     "       'heavy_aligned_aho'],\n",
+     "      dtype='object')\n",
+     "Titer             7\n",
+     "HIC               4\n",
+     "PR_CHO           49\n",
+     "Tm2              53\n",
+     "AC-SINS_pH7.4     4\n",
+     "dtype: int64\n"
+    ]
+   },
    {
     "data": {
      "text/html": [
@@ -276,7 +298,7 @@
      "[5 rows x 30 columns]"
     ]
    },
-   "execution_count": 2,
+   "execution_count": 10,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -285,6 +307,10 @@
    "model_name = \"ollieturnbull/p-IgGen\"\n",
    "df = load_dataset(\"ginkgo-datapoints/GDPa1\")[\"train\"].to_pandas()\n",
    "\n",
+   "print(df.columns)\n",
+   "# Show number of NaNs per assay\n",
+   "print(df[[\"Titer\", \"HIC\", \"PR_CHO\", \"Tm2\", 'AC-SINS_pH7.4']].isna().sum())\n",
+   "\n",
    "# Example: Just predict HIC, so we'll drop NaN rows for that\n",
    "df = df.dropna(subset=[\"HIC\"])\n",
    "df.head()"
utils.py CHANGED
@@ -3,7 +3,7 @@ from datasets import load_dataset
 import gradio as gr
 import hashlib
 from typing import Iterable, Union
-from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS
+from constants import RESULTS_REPO, ASSAY_RENAME, LEADERBOARD_RESULTS_COLUMNS, BASELINE_USERNAMES
 
 pd.set_option("display.max_columns", None)
 
@@ -26,13 +26,25 @@ def fetch_hf_results():
     assert all(
         col in df.columns for col in LEADERBOARD_RESULTS_COLUMNS
     ), f"Expected columns {LEADERBOARD_RESULTS_COLUMNS} not found in {df.columns}. Missing columns: {set(LEADERBOARD_RESULTS_COLUMNS) - set(df.columns)}"
+
+    df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
+    df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]
     # Show latest submission only
-    df = df.sort_values("submission_time", ascending=False).drop_duplicates(
-        subset=["model", "assay", "user"], keep="first"
+    # For baselines: Keep unique model names
+    df_baseline = df_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
+        subset=["model", "assay", "dataset", "user"], keep="first"
+    )
+    # For users: Just show latest submission
+    df_non_baseline = df_non_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
+        subset=["assay", "dataset", "user"], keep="first"
     )
+    df = pd.concat([df_baseline, df_non_baseline], ignore_index=True)
     df["property"] = df["assay"].map(ASSAY_RENAME)
-
-    # Anonymize the user column at this point
+
+    # Rename baseline username to just "Baseline"
+    df.loc[df["user"].isin(BASELINE_USERNAMES), "user"] = "Baseline"
+    # Note: Could optionally add a column "is_baseline" to the dataframe to indicate whether the model is a baseline model or not. If things get crowded.
+    # Anonymize the user column at this point (so note: users can submit anonymous / non-anonymous and we'll show their latest submission regardless)
     df.loc[df["anonymous"] != False, "user"] = "anon-" + df.loc[df["anonymous"] != False, "user"].apply(readable_hash)
 
     return df
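To make the new dedup behaviour concrete, here is a small self-contained sketch of the same logic on toy data (column names are taken from the diff; the usernames, model names, and timestamps are illustrative): baseline rows keep one entry per model name, while each non-baseline user keeps only their single latest submission per assay and dataset.

import pandas as pd

BASELINE_USERNAMES = ["loodvanniekerkginkgo"]

# Toy submissions: the baseline user uploads two different models,
# while "alice" (hypothetical) resubmits for the same assay/dataset twice.
df = pd.DataFrame(
    {
        "user": ["loodvanniekerkginkgo", "loodvanniekerkginkgo", "alice", "alice"],
        "model": ["baseline-a", "baseline-b", "my-model-v1", "my-model-v2"],
        "assay": ["HIC"] * 4,
        "dataset": ["GDPa1"] * 4,
        "submission_time": pd.to_datetime(
            ["2025-01-01", "2025-01-02", "2025-01-03", "2025-01-04"]
        ),
    }
)

df_baseline = df[df["user"].isin(BASELINE_USERNAMES)]
df_non_baseline = df[~df["user"].isin(BASELINE_USERNAMES)]

# Baselines: dedup on (model, assay, dataset, user) -> both baseline models survive
df_baseline = df_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
    subset=["model", "assay", "dataset", "user"], keep="first"
)
# Users: dedup on (assay, dataset, user) -> only alice's latest submission survives
df_non_baseline = df_non_baseline.sort_values("submission_time", ascending=False).drop_duplicates(
    subset=["assay", "dataset", "user"], keep="first"
)

print(pd.concat([df_baseline, df_non_baseline], ignore_index=True))
# Expected rows: baseline-a, baseline-b, and my-model-v2 (alice's latest)

The design choice this illustrates: because non-baseline dedup ignores the model column, a user who resubmits under a new model name still replaces their earlier leaderboard entry, which prevents one participant from occupying multiple rows for the same assay.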