In [1]:
import os
import sys
from collections import defaultdict
import warnings
import logging
from typing import Literal

# Python does not expand '~' in sys.path entries, so resolve the home directory
# explicitly; otherwise the appended entry is a literal path that never matches.
sys.path.append(os.path.expanduser('~/PROTAC-Degradation-Predictor/protac_degradation_predictor'))
import protac_degradation_predictor as pdp

import pytorch_lightning as pl
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from jsonargparse import CLI
import pandas as pd
# Import tqdm for notebook
from tqdm.notebook import tqdm
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import (
    StratifiedKFold,
    StratifiedGroupKFold,
)


# Labelling configuration: the two cut-offs handed to pdp.is_active, and the
# name of the column that stores the resulting label.
Dmax_threshold = 0.6
pDC50_threshold = 6.0
active_col = 'Active (Dmax 0.6, pDC50 6.0)'

# Load the degradation database and normalise the ligase spelling 'Iap' -> 'IAP'.
protac_df = pd.read_csv('~/PROTAC-Degradation-Predictor/data/PROTAC-Degradation-DB.csv')
protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')

# Label every row from its DC50 / Dmax measurements.
protac_df[active_col] = protac_df.apply(
    lambda row: pdp.is_active(
        row['DC50 (nM)'],
        row['Dmax (%)'],
        pDC50_threshold=pDC50_threshold,
        Dmax_threshold=Dmax_threshold,
    ),
    axis=1,
)

In [18]:
def get_random_split_indices(
    active_df: pd.DataFrame,
    test_split: float,
    random_state: int = 42,
) -> pd.Index:
    """ Get the indices of the test set using a random split.

    Args:
        active_df (pd.DataFrame): The DataFrame to draw the test rows from.
        test_split (float): The fraction of rows (e.g. 0.1 for 10%) to use as
            the test set. It is forwarded to `DataFrame.sample(frac=...)`.
        random_state (int): Seed for the random draw. Defaults to 42, which
            reproduces the previously hard-coded split.

    Returns:
        pd.Index: The index labels of the rows selected for the test set.
    """
    test_df = active_df.sample(frac=test_split, random_state=random_state)
    return test_df.index

# Scale DC50 from nM by 1e-9 and take the negative log10 to obtain pDC50.
protac_df['pDC50'] = -np.log10(protac_df['DC50 (nM)'].mul(1e-9))

# Keep only the entries whose activity label is not null, then draw a random
# fraction of them as the held-out test set.
test_split = 0.1
active_df = protac_df.loc[protac_df[active_col].notna()].copy()
test_indices = get_random_split_indices(active_df, test_split)

# Every labelled entry that was not drawn for the test set goes to train/validation.
train_val_df = active_df.loc[~active_df.index.isin(test_indices)].copy()
len(train_val_df)

771

In [20]:
# Mean pDC50 over all rows of active_df, i.e. every entry with a non-null
# activity label (both active and inactive ones).
mean_pDC50 = active_df.loc[:, 'pDC50'].mean()
mean_pDC50

6.935675466781487

In [19]:
# Show the columns available in the train/validation DataFrame.
train_val_df.columns

Index(['Compound ID', 'Uniprot', 'Smiles', 'E3 Ligase', 'InChI', 'InChI Key',
       'Molecular Weight', 'Heavy Atom Count', 'Ring Count',
       'Rotatable Bond Count', 'Topological Polar Surface Area',
       'Hydrogen Bond Acceptor Count', 'Hydrogen Bond Donor Count',
       'Cell Type', 'Treatment Time (h)', 'DC50 (nM)', 'Dmax (%)', 'Active',
       'Article DOI', 'Comments', 'Database', 'Molecular Formula', 'cLogP',
       'Target', 'PDB', 'Name', 'Assay (DC50/Dmax)', 'Exact Mass', 'XLogP3',
       'Target (Parsed)', 'POI Sequence', 'E3 Ligase Uniprot',
       'E3 Ligase Sequence', 'Cell Line Identifier', 'Active - OR',
       'Active (Dmax 0.6, pDC50 6.0)', 'pDC50'],
      dtype='object')

In [17]:
# Find the SMILES in train_val_df that carry both an active and an inactive label
# (e.g. active in one cell line and inactive in another).
# Such conflicting labels are a sign of a potential error in the data.
active_smiles = train_val_df.loc[train_val_df[active_col] == 1, 'Smiles'].drop_duplicates()
inactive_smiles = train_val_df.loc[train_val_df[active_col] == 0, 'Smiles'].drop_duplicates()

# SMILES present on both sides, and every train/validation row that uses one of them.
common_smiles = active_smiles[active_smiles.isin(inactive_smiles)]
common_df = train_val_df.loc[train_val_df['Smiles'].isin(common_smiles)]

# Disabled inspection loop, kept for reference.
# NOTE(review): the saved output of this cell appears to come from a run with this loop enabled.
# # Group by Smiles and check if the same SMILES is active for one cell and inactive for another
# grouped = common_df.groupby('Smiles')
# for name, group in grouped:
#     if group[active_col].nunique() > 1 and group['Cell Line Identifier'].nunique() > 1:
#         display(group[['Smiles', 'Cell Line Identifier', active_col, 'DC50 (nM)', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']])
#         print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))
#         print()

Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
69,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,H1975/WR,True,5.9,100.0,P00533,VHL,,10.1016/j.ejmech.2020.112199
1229,C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CC...,A431 siYAP,False,2000.0,,P00533,VHL,Degradation of EGFR in A431 cells after 16 h t...,10.1016/j.ejmech.2020.112199


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CCN... &             H1975/WR &                         True & 8.229148 &     100.0 &  P00533 &       VHL &                                                NaN & 10.1016/j.ejmech.2020.112199 \\
 NaN & C=CC(=O)Nc1cccc(-n2c(=O)cc(C)c3cnc(Nc4ccc(N5CCN... &           A431 siYAP &                        False & 5.698970 &       NaN &  P00533 &       VHL & Degradation of EGFR in A431 cells after 16 h tr... & 10.1016/j.ejmech.2020.112199 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1793,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,COLO 205,True,30.28,64.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768
1794,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,HCT 116,False,1000.0,28.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &             COLO 205 &                         True & 7.518844 &      64.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &              HCT 116 &                        False & 6.000000 &      28.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1795,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,COLO 205,True,31.35,72.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768
1796,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,HCT 116,False,1000.0,41.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &             COLO 205 &                         True & 7.503762 &      72.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &              HCT 116 &                        False & 6.000000 &      41.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1771,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,COLO 205,True,4.97,76.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768
1772,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,HCT 116,False,20.3,50.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &             COLO 205 &                         True & 8.303644 &      76.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &              HCT 116 &                        False & 7.692504 &      50.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1789,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,COLO 205,True,7.03,84.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768
1790,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,HCT 116,False,161.0,43.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &             COLO 205 &                         True & 8.153045 &      84.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &              HCT 116 &                        False & 6.793174 &      43.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1781,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,COLO 205,True,3.04,85.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768
1782,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,HCT 116,False,663.0,50.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &             COLO 205 &                         True & 8.517126 &      85.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &              HCT 116 &                        False & 6.178486 &      50.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1799,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,COLO 205,True,21.71,88.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768
1800,CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4cc...,HCT 116,False,1000.0,13.0,P33981,CRBN,Degradation of TTK in COLO-205/HCT-116 cells a...,10.1021/acs.jmedchem.1c01768


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &   pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &             COLO 205 &                         True & 7.66334 &      88.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
 NaN & CCC(=O)N[C@H]1CC[C@@H](n2c(=O)cc(C)c3cnc(Nc4ccc... &              HCT 116 &                        False & 6.00000 &      13.0 &  P33981 &      CRBN & Degradation of TTK in COLO-205/HCT-116 cells af... & 10.1021/acs.jmedchem.1c01768 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
516,COc1cc(-c2cn(C)c(=O)c3cnccc23)cc(OC)c1CN1CCN(C...,IH-1,False,560.0,10.0,Q9H8M2,VHL,,10.1021/acs.jmedchem.8b01413
925,COc1cc(-c2cn(C)c(=O)c3cnccc23)cc(OC)c1CN1CCN(C...,HeLa,True,560.0,80.0,Q9H8M2,VHL,Degradation of BRD9 in HeLa cells after 4 h tr...,10.1021/acs.jmedchem.8b01413


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
 NaN & COc1cc(-c2cn(C)c(=O)c3cnccc23)cc(OC)c1CN1CCN(CC... &                 IH-1 &                        False & 6.251812 &      10.0 &  Q9H8M2 &       VHL &                                                NaN & 10.1021/acs.jmedchem.8b01413 \\
 NaN & COc1cc(-c2cn(C)c(=O)c3cnccc23)cc(OC)c1CN1CCN(CC... &                 HeLa &                         True & 6.251812 &      80.0 &  Q9H8M2 &       VHL & Degradation of BRD9 in HeLa cells after 4 h tre... & 10.1021/acs.jmedchem.8b01413 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1557,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,A549 Cas9,True,243.0,80.0,Q05397,VHL,Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...,10.1021/acs.jmedchem.8b01826
1558,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,,False,631.0,59.0,Q05397,VHL,Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...,10.1021/acs.jmedchem.8b01826
1559,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,SNU-387,True,32.0,98.0,Q05397,VHL,Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...,10.1021/acs.jmedchem.8b01826
1560,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,HLE,True,25.0,85.0,Q05397,VHL,Degradation of Fak in HLE/HuH-7/SNU-423 cells ...,10.1021/acs.jmedchem.8b01826
1561,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,Huh-7,True,100.0,79.0,Q05397,VHL,Degradation of Fak in HLE/HuH-7/SNU-423 cells ...,10.1021/acs.jmedchem.8b01826
1562,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,SNU-423,True,79.0,87.0,Q05397,VHL,Degradation of Fak in HLE/HuH-7/SNU-423 cells ...,10.1021/acs.jmedchem.8b01826
1563,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,HLF,True,40.0,66.0,Q05397,VHL,Degradation of Fak in HLF/SNU-398/HUCCT1 cells...,10.1021/acs.jmedchem.8b01826
1564,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,SNU-398,True,10.0,99.0,Q05397,VHL,Degradation of Fak in HLF/SNU-398/HUCCT1 cells...,10.1021/acs.jmedchem.8b01826
1565,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,HuCC-T1-G10,True,126.0,86.0,Q05397,VHL,Degradation of Fak in HLF/SNU-398/HUCCT1 cells...,10.1021/acs.jmedchem.8b01826
1566,COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@...,HuH-1,True,79.0,79.0,Q05397,VHL,Degradation of Fak in HUH-1/HepG2/SK-Hep-1 cel...,10.1021/acs.jmedchem.8b01826


\begin{tabular}{llllrrllll}
\toprule
   Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
BI-0319 & COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@H... &            A549 Cas9 &                         True & 6.614394 &      80.0 &  Q05397 &       VHL & Degradation of Fak in A549/Hep3B2.1-7/SNU-387 c... & 10.1021/acs.jmedchem.8b01826 \\
BI-0319 & COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@H... &                  NaN &                        False & 6.199971 &      59.0 &  Q05397 &       VHL & Degradation of Fak in A549/Hep3B2.1-7/SNU-387 c... & 10.1021/acs.jmedchem.8b01826 \\
BI-0319 & COc1cc(C(=O)NCCOCCOCCOCC(=O)N[C@H](C(=O)N2C[C@H... &              SNU-387 &                         True & 7.494850 &      98.0 &  Q05397 &       VHL & Degradation of Fak in A549/Hep3B2.1-7/SNU-387 c...

  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1545,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,HLE,True,158.0,79.0,Q05397,CRBN,Degradation of Fak in HLE/HuH-7/SNU-423 cells ...,10.1021/acs.jmedchem.8b01826
1546,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,Huh-7,True,50.0,93.0,Q05397,CRBN,Degradation of Fak in HLE/HuH-7/SNU-423 cells ...,10.1021/acs.jmedchem.8b01826
1547,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,SNU-423,True,13.0,93.0,Q05397,CRBN,Degradation of Fak in HLE/HuH-7/SNU-423 cells ...,10.1021/acs.jmedchem.8b01826
1548,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,HuH-1,False,251.0,50.0,Q05397,CRBN,Degradation of Fak in HUH-1/HepG2/SK-Hep-1 cel...,10.1021/acs.jmedchem.8b01826
1549,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,HepG2 hALR,True,32.0,89.0,Q05397,CRBN,Degradation of Fak in HUH-1/HepG2/SK-Hep-1 cel...,10.1021/acs.jmedchem.8b01826
1550,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,SK-HEP-1,True,32.0,89.0,Q05397,CRBN,Degradation of Fak in HUH-1/HepG2/SK-Hep-1 cel...,10.1021/acs.jmedchem.8b01826
1551,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,A549 Cas9,True,27.0,95.0,Q05397,CRBN,Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...,10.1021/acs.jmedchem.8b01826
1552,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,,True,13.0,96.0,Q05397,CRBN,Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...,10.1021/acs.jmedchem.8b01826
1553,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,SNU-387,True,25.0,90.0,Q05397,CRBN,Degradation of Fak in A549/Hep3B2.1-7/SNU-387 ...,10.1021/acs.jmedchem.8b01826
1555,COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(...,SNU-398,True,3.0,95.0,Q05397,CRBN,Degradation of Fak in HLF/SNU-398/HUCCT1 cells...,10.1021/acs.jmedchem.8b01826


\begin{tabular}{llllrrllll}
\toprule
   Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
BI-3663 & COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(C... &                  HLE &                         True & 6.801343 &      79.0 &  Q05397 &      CRBN & Degradation of Fak in HLE/HuH-7/SNU-423 cells a... & 10.1021/acs.jmedchem.8b01826 \\
BI-3663 & COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(C... &                Huh-7 &                         True & 7.301030 &      93.0 &  Q05397 &      CRBN & Degradation of Fak in HLE/HuH-7/SNU-423 cells a... & 10.1021/acs.jmedchem.8b01826 \\
BI-3663 & COc1cc(C(=O)NCCOCCOCCOCCC(=O)Nc2cccc3c2C(=O)N(C... &              SNU-423 &                         True & 7.886057 &      93.0 &  Q05397 &      CRBN & Degradation of Fak in HLE/HuH-7/SNU-423 cells a...

  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1294,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,HeLa,True,790.0,92.0,O75530,VHL,Degradation of EED in HeLa/DB cells after 24 h...,10.1016/j.chembiol.2019.11.006
1296,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,HeLa,True,300.0,75.0,Q15910,VHL,Degradation of EZH2 in HeLa/DB cells after 24 ...,10.1016/j.chembiol.2019.11.006
1297,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,DB,True,670.0,96.0,Q15910,VHL,Degradation of EZH2 in HeLa/DB cells after 24 ...,10.1016/j.chembiol.2019.11.006
1298,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,HeLa,False,,22.0,Q15022,VHL,Degradation of SUZ12 in HeLa/DB cells after 24...,10.1016/j.chembiol.2019.11.006
1299,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,DB,True,590.0,82.0,Q15022,VHL,Degradation of SUZ12 in HeLa/DB cells after 24...,10.1016/j.chembiol.2019.11.006


\begin{tabular}{llllrrllll}
\toprule
   Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                    Article DOI \\
\midrule
UNC6852 & Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)... &                 HeLa &                         True & 6.102373 &      92.0 &  O75530 &       VHL & Degradation of EED in HeLa/DB cells after 24 h ... & 10.1016/j.chembiol.2019.11.006 \\
UNC6852 & Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)... &                 HeLa &                         True & 6.522879 &      75.0 &  Q15910 &       VHL & Degradation of EZH2 in HeLa/DB cells after 24 h... & 10.1016/j.chembiol.2019.11.006 \\
UNC6852 & Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)... &                   DB &                         True & 6.173925 &      96.0 &  Q15910 &       VHL & Degradation of EZH2 in HeLa/DB cells after 2

  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1435,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,LNCaP,True,50.0,71.0,P10275,VHL,Degradation of AR in LNCaP/VCaP cells after 2-...,10.1021/acsmedchemlett.9b00115
1436,Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O...,VCaP,False,50.0,51.0,P10275,VHL,Degradation of AR in LNCaP/VCaP cells after 2-...,10.1021/acsmedchemlett.9b00115


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &   pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                    Article DOI \\
\midrule
 NaN & Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)... &                LNCaP &                         True & 7.30103 &      71.0 &  P10275 &       VHL & Degradation of AR in LNCaP/VCaP cells after 2-4... & 10.1021/acsmedchemlett.9b00115 \\
 NaN & Cc1ncsc1-c1ccc(CNC(=O)[C@@H]2C[C@@H](O)CN2C(=O)... &                 VCaP &                        False & 7.30103 &      51.0 &  P10275 &       VHL & Degradation of AR in LNCaP/VCaP cells after 2-4... & 10.1021/acsmedchemlett.9b00115 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
4,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,MOLT-4,True,53.0,100.0,Q07817,VHL,,10.1016/j.ejmech.2020.112186
1245,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,MOLT-4,True,63.0,90.8,Q07817,VHL,Degradation of BCL-xL in MOLT-4/platelets cell...,10.1038/s41591-019-0668-z
1246,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,2T60,False,3000.0,26.0,Q07817,VHL,Degradation of BCL-xL in MOLT-4/platelets cell...,10.1038/s41591-019-0668-z


\begin{tabular}{llllrrllll}
\toprule
  Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &    pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                  Article DOI \\
\midrule
   NaN & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &               MOLT-4 &                         True & 7.275724 &     100.0 &  Q07817 &       VHL &                                                NaN & 10.1016/j.ejmech.2020.112186 \\
DT2216 & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &               MOLT-4 &                         True & 7.200659 &      90.8 &  Q07817 &       VHL & Degradation of BCL-xL in MOLT-4/platelets cells... &    10.1038/s41591-019-0668-z \\
DT2216 & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &                 2T60 &                        False & 5.522879 &      26.0 &  Q07817 &       VHL & Degradation of BCL-xL in MOLT-4/platelets cells... &  

  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1479,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,LNCaP,True,50.0,71.0,P10275,VHL,Degradation of AR in LNCaP/VCaP cells using EL...,10.1021/acsmedchemlett.0c00236
1480,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,VCaP,False,50.0,51.0,P10275,VHL,Degradation of AR in LNCaP/VCaP cells using EL...,10.1021/acsmedchemlett.0c00236


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &   pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                    Article DOI \\
\midrule
 NaN & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &                LNCaP &                         True & 7.30103 &      71.0 &  P10275 &       VHL & Degradation of AR in LNCaP/VCaP cells using ELI... & 10.1021/acsmedchemlett.0c00236 \\
 NaN & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &                 VCaP &                        False & 7.30103 &      51.0 &  P10275 &       VHL & Degradation of AR in LNCaP/VCaP cells using ELI... & 10.1021/acsmedchemlett.0c00236 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


Unnamed: 0,Smiles,Cell Line Identifier,"Active (Dmax 0.6, pDC50 6.0)",DC50 (nM),Dmax (%),Uniprot,E3 Ligase,Assay (DC50/Dmax),Article DOI
1477,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,LNCaP,True,50.0,71.0,P10275,VHL,Degradation of AR in LNCaP/VCaP cells using EL...,10.1021/acsmedchemlett.0c00236
1478,Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)...,VCaP,False,50.0,51.0,P10275,VHL,Degradation of AR in LNCaP/VCaP cells using EL...,10.1021/acsmedchemlett.0c00236


\begin{tabular}{llllrrllll}
\toprule
Name &                                             Smiles & Cell Line Identifier & Active (Dmax 0.6, pDC50 6.0) &   pDC50 &  Dmax (\%) & Uniprot & E3 Ligase &                                  Assay (DC50/Dmax) &                    Article DOI \\
\midrule
 NaN & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &                LNCaP &                         True & 7.30103 &      71.0 &  P10275 &       VHL & Degradation of AR in LNCaP/VCaP cells using ELI... & 10.1021/acsmedchemlett.0c00236 \\
 NaN & Cc1ncsc1-c1ccc([C@H](C)NC(=O)[C@@H]2C[C@@H](O)C... &                 VCaP &                        False & 7.30103 &      51.0 &  P10275 &       VHL & Degradation of AR in LNCaP/VCaP cells using ELI... & 10.1021/acsmedchemlett.0c00236 \\
\bottomrule
\end{tabular}




  print(group[['Name', 'Smiles', 'Cell Line Identifier', active_col, 'pDC50', 'Dmax (%)', 'Uniprot', 'E3 Ligase', 'Assay (DC50/Dmax)', 'Article DOI']].to_latex(index=False))


In [29]:
import optuna

def objective(trial: optuna.Trial, verbose: int = 0) -> float:
    """ Score a Morgan fingerprint configuration by collisions and size.

    The trial suggests a Morgan `radius` (1 to 15) and a fingerprint size
    `fpsize` (128 to 2048 in steps of 128). A fingerprint is computed for each
    unique SMILES in `train_val_df`, and a SMILES counts as "overlapping" when
    its fingerprint is identical to that of a SMILES seen earlier in the
    iteration (the first SMILES of each colliding group is not counted).

    Args:
        trial (optuna.Trial): The trial used to suggest `radius` and `fpsize`.
        verbose (int): If non-zero, print the parameters and overlap statistics.

    Returns:
        float: The number of `train_val_df` rows whose SMILES is overlapping,
            plus `radius`, plus `fpsize / 100` (lower is better).
    """
    radius = trial.suggest_int('radius', 1, 15)
    fpsize = trial.suggest_int('fpsize', 128, 2048, step=128)

    morgan_fpgen = AllChem.GetMorganGenerator(
        radius=radius,
        fpSize=fpsize,
        includeChirality=True,
    )

    smiles2fp = {
        smiles: pdp.get_fingerprint(smiles, morgan_fpgen)
        for smiles in train_val_df['Smiles'].unique().tolist()
    }

    # Collect the SMILES whose fingerprint was already produced by an earlier SMILES.
    overlapping_smiles = []
    unique_fps = set()
    for smiles, fp in smiles2fp.items():
        fp_key = tuple(fp)  # hashable form of the fingerprint, built once per SMILES
        if fp_key in unique_fps:
            overlapping_smiles.append(smiles)
        else:
            unique_fps.add(fp_key)
    num_overlaps = int(train_val_df["Smiles"].isin(overlapping_smiles).sum())

    if verbose:
        # Only needed for reporting, so it is not computed on every silent trial.
        num_overlaps_tot = int(protac_df["Smiles"].isin(overlapping_smiles).sum())
        print(f'Radius: {radius}')
        print(f'FP length: {fpsize}')
        print(f'Number of unique SMILES: {len(smiles2fp)}')
        print(f'Number of unique fingerprints: {len(unique_fps)}')
        print(f'Number of SMILES with overlapping fingerprints: {len(overlapping_smiles)}')
        print(f'Number of overlapping SMILES in train_val_df: {num_overlaps}')
        print(f'Number of overlapping SMILES in protac_df: {num_overlaps_tot}')
    # Penalise larger radius and fingerprint size on top of the collision count.
    return num_overlaps + radius + fpsize / 100

In [30]:
# Minimise the objective over 50 trials using a TPE sampler with a fixed seed.
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction='minimize', sampler=sampler)
study.optimize(
    objective,
    n_trials=50,
    show_progress_bar=True,
)

[I 2024-04-29 11:28:05,626] A new study created in memory with name: no-name-4db5d822-6220-4ab8-bc3a-c776b0e5cac2


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-04-29 11:28:07,705] Trial 0 finished with value: 39.480000000000004 and parameters: {'radius': 6, 'fpsize': 2048}. Best is trial 0 with value: 39.480000000000004.
[I 2024-04-29 11:28:09,590] Trial 1 finished with value: 23.8 and parameters: {'radius': 11, 'fpsize': 1280}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:10,474] Trial 2 finished with value: 131.84 and parameters: {'radius': 3, 'fpsize': 384}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:11,978] Trial 3 finished with value: 281.92 and parameters: {'radius': 1, 'fpsize': 1792}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:13,994] Trial 4 finished with value: 25.36 and parameters: {'radius': 10, 'fpsize': 1536}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:15,642] Trial 5 finished with value: 284.48 and parameters: {'radius': 1, 'fpsize': 2048}. Best is trial 1 with value: 23.8.
[I 2024-04-29 11:28:17,154] Trial 6 finished with value: 18.12 and parameters: {'radius': 13, 'fpsize': 51

In [31]:
# Re-run the objective on the study's best trial with verbose output, so the
# overlap statistics for the best parameters are printed.
# NOTE(review): presumably the suggest_* calls on the finished trial return its
# recorded parameters rather than sampling new ones — confirm against the optuna docs.
objective(study.best_trial, verbose=1)

Radius: 10
FP length: 256
Number of unique SMILES: 532
Number of unique fingerprints: 532
Number of SMILES with overlapping fingerprints: 0
Number of overlapping SMILES in train_val_df: 0
Number of overlapping SMILES in protac_df: 0


12.56