ribesstefano committed
Commit ab45a22 · 1 Parent(s): acd572a

Finalized predictor script
notebooks/protac_degradation_predictor.py CHANGED
@@ -1,19 +1,20 @@
1
- import optuna
2
- from optuna.samplers import TPESampler
3
- import h5py
4
  import os
5
  import pickle
6
  import warnings
7
  import logging
8
  import pandas as pd
9
  import numpy as np
10
- import urllib.request
11
 
12
  from rdkit import Chem
13
  from rdkit.Chem import AllChem
14
  from rdkit import DataStructs
15
- from collections import defaultdict
16
- from typing import Literal
17
  from jsonargparse import CLI
18
  from tqdm.auto import tqdm
19
  from imblearn.over_sampling import SMOTE, ADASYN
@@ -44,25 +45,39 @@ warnings.filterwarnings("ignore", ".*FixedLocator*")
44
  # Ignore UserWarning from PyTorch Lightning
45
  warnings.filterwarnings("ignore", ".*does not have many workers.*")
46
 
47
-
48
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
49
- protac_df.head()
50
-
51
- # Get the unique Article IDs of the entries with NaN values in the Active column
52
- nan_active = protac_df[protac_df['Active'].isna()]['Article DOI'].unique()
53
- nan_active
54
 
55
  # Map E3 Ligase Iap to IAP
56
  protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
57
 
58
- cells = sorted(protac_df['Cell Type'].dropna().unique().tolist())
59
- print(f'Number of non-cleaned cell lines: {len(cells)}')
60
-
61
- cells = sorted(protac_df['Cell Line Identifier'].dropna().unique().tolist())
62
- print(f'Number of cleaned cell lines: {len(cells)}')
63
-
64
- unlabeled_df = protac_df[protac_df['Active'].isna()]
65
- print(f'Number of compounds in test set: {len(unlabeled_df)}')
66
 
67
  # ## Load Protein Embeddings
68
 
@@ -101,9 +116,10 @@ for cell_line in protac_df['Cell Line Identifier'].unique():
101
  cell2embedding[cell_line] = np.zeros(emb_shape)
102
 
103
  ## Precompute Molecular Fingerprints
 
104
  morgan_fpgen = AllChem.GetMorganGenerator(
105
  radius=15,
106
- fpSize=1024,
107
  includeChirality=True,
108
  )
109
 
@@ -131,7 +147,7 @@ print(f'Number of overlapping SMILES in protac_df: {len(protac_df[protac_df["Smi
131
  tanimoto_matrix = defaultdict(list)
132
  for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
133
  fp1 = smiles2fp[smiles1]
134
- # TODO: Use BulkTanimotoSimilarity
135
  for j, smiles2 in enumerate(protac_df['Smiles'].unique()):
136
  if j < i:
137
  continue
@@ -153,7 +169,8 @@ class PROTAC_Dataset(Dataset):
153
  smiles2fp=smiles2fp,
154
  use_smote=False,
155
  oversampler=None,
156
- use_ored_activity=False,
 
157
  ):
158
  """ Initialize the PROTAC dataset
159
 
@@ -165,11 +182,13 @@ class PROTAC_Dataset(Dataset):
165
  use_smote (bool): Whether to use SMOTE for oversampling
166
  use_ored_activity (bool): Whether to use the 'Active - OR' column
167
  """
168
- # Filter out examples with NaN in 'Active' column
169
- self.data = protac_df # [~protac_df['Active'].isna()]
170
  self.protein_embeddings = protein_embeddings
171
  self.cell2embedding = cell2embedding
172
  self.smiles2fp = smiles2fp
 
 
173
 
174
  self.smiles_emb_dim = smiles2fp[list(smiles2fp.keys())[0]].shape[0]
175
  self.protein_emb_dim = protein_embeddings[list(
@@ -177,11 +196,18 @@ class PROTAC_Dataset(Dataset):
177
  self.cell_emb_dim = cell2embedding[list(
178
  cell2embedding.keys())[0]].shape[0]
179
 
180
- self.active_label = 'Active - OR' if use_ored_activity else 'Active'
181
 
 
182
  self.use_smote = use_smote
183
  self.oversampler = oversampler
184
- # Apply SMOTE
185
  if self.use_smote:
186
  self.apply_smote()
187
 
@@ -190,15 +216,11 @@ class PROTAC_Dataset(Dataset):
190
  features = []
191
  labels = []
192
  for _, row in self.data.iterrows():
193
- smiles_emb = smiles2fp[row['Smiles']]
194
- poi_emb = protein_embeddings[row['Uniprot']]
195
- e3_emb = protein_embeddings[row['E3 Ligase Uniprot']]
196
- cell_emb = cell2embedding[row['Cell Line Identifier']]
197
  features.append(np.hstack([
198
- smiles_emb.astype(np.float32),
199
- poi_emb.astype(np.float32),
200
- e3_emb.astype(np.float32),
201
- cell_emb.astype(np.float32),
202
  ]))
203
  labels.append(row[self.active_label])
204
 
@@ -231,27 +253,74 @@ class PROTAC_Dataset(Dataset):
231
  })
232
  self.data = df_smote
233
 
234
  def __len__(self):
235
  return len(self.data)
236
 
237
  def __getitem__(self, idx):
238
- if self.use_smote:
239
- # NOTE: We do not need to look up the embeddings anymore
240
- elem = {
241
- 'smiles_emb': self.data['Smiles'].iloc[idx],
242
- 'poi_emb': self.data['Uniprot'].iloc[idx],
243
- 'e3_emb': self.data['E3 Ligase Uniprot'].iloc[idx],
244
- 'cell_emb': self.data['Cell Line Identifier'].iloc[idx],
245
- 'active': self.data[self.active_label].iloc[idx],
246
- }
247
- else:
248
- elem = {
249
- 'smiles_emb': self.smiles2fp[self.data['Smiles'].iloc[idx]].astype(np.float32),
250
- 'poi_emb': self.protein_embeddings[self.data['Uniprot'].iloc[idx]].astype(np.float32),
251
- 'e3_emb': self.protein_embeddings[self.data['E3 Ligase Uniprot'].iloc[idx]].astype(np.float32),
252
- 'cell_emb': self.cell2embedding[self.data['Cell Line Identifier'].iloc[idx]].astype(np.float32),
253
- 'active': 1. if self.data[self.active_label].iloc[idx] else 0.,
254
- }
255
  return elem
256
 
257
 
@@ -260,18 +329,19 @@ class PROTAC_Model(pl.LightningModule):
260
  def __init__(
261
  self,
262
  hidden_dim: int,
263
- smiles_emb_dim: int = 1024,
264
  poi_emb_dim: int = 1024,
265
  e3_emb_dim: int = 1024,
266
  cell_emb_dim: int = 768,
267
  batch_size: int = 32,
268
  learning_rate: float = 1e-3,
269
  dropout: float = 0.2,
270
- join_embeddings: Literal['concat', 'sum'] = 'concat',
271
  train_dataset: PROTAC_Dataset = None,
272
  val_dataset: PROTAC_Dataset = None,
273
  test_dataset: PROTAC_Dataset = None,
274
  disabled_embeddings: list = [],
 
275
  ):
276
  super().__init__()
277
  self.poi_emb_dim = poi_emb_dim
@@ -286,6 +356,7 @@ class PROTAC_Model(pl.LightningModule):
286
  self.val_dataset = val_dataset
287
  self.test_dataset = test_dataset
288
  self.disabled_embeddings = disabled_embeddings
 
289
  # Set our init args as class attributes
290
  self.__dict__.update(locals()) # Add arguments as attributes
291
  # Save the arguments passed to init
@@ -296,19 +367,29 @@ class PROTAC_Model(pl.LightningModule):
296
  ]
297
  self.save_hyperparameters(ignore=ignore_args_as_hyperparams)
298
 
299
- if 'poi' not in self.disabled_embeddings:
300
- self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
301
- if 'e3' not in self.disabled_embeddings:
302
- self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
303
- if 'cell' not in self.disabled_embeddings:
304
- self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
305
- if 'smiles' not in self.disabled_embeddings:
306
- self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
307
-
308
- if self.join_embeddings == 'concat':
309
  joint_dim = hidden_dim * (4 - len(self.disabled_embeddings))
310
  elif self.join_embeddings == 'sum':
311
  joint_dim = hidden_dim
 
 
312
  self.fc1 = nn.Linear(joint_dim, hidden_dim)
313
  self.fc2 = nn.Linear(hidden_dim, hidden_dim)
314
  self.fc3 = nn.Linear(hidden_dim, 1)
@@ -333,25 +414,46 @@ class PROTAC_Model(pl.LightningModule):
333
  model = {1}.load_from_checkpoint('checkpoint.ckpt')
334
  model.{0} = my_{0}
335
  '''
336
 
337
  def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
338
  embeddings = []
339
- if 'poi' not in self.disabled_embeddings:
340
- embeddings.append(self.poi_emb(poi_emb))
341
- if 'e3' not in self.disabled_embeddings:
342
- embeddings.append(self.e3_emb(e3_emb))
343
- if 'cell' not in self.disabled_embeddings:
344
- embeddings.append(self.cell_emb(cell_emb))
345
- if 'smiles' not in self.disabled_embeddings:
346
- embeddings.append(self.smiles_emb(smiles_emb))
347
- if self.join_embeddings == 'concat':
348
  x = torch.cat(embeddings, dim=1)
349
- elif self.join_embeddings == 'sum':
350
- if len(embeddings) > 1:
351
- embeddings = torch.stack(embeddings, dim=1)
352
- x = torch.sum(embeddings, dim=1)
353
- else:
354
- x = embeddings[0]
355
  x = self.dropout(F.relu(self.fc1(x)))
356
  x = self.dropout(F.relu(self.fc2(x)))
357
  x = self.fc3(x)
@@ -391,6 +493,25 @@ class PROTAC_Model(pl.LightningModule):
391
  cell_emb = batch['cell_emb']
392
  smiles_emb = batch['smiles_emb']
393
 
394
  y_hat = self.forward(poi_emb, e3_emb, cell_emb, smiles_emb)
395
  return torch.sigmoid(y_hat)
396
 
@@ -398,6 +519,7 @@ class PROTAC_Model(pl.LightningModule):
398
  if self.train_dataset is None:
399
  format = 'train_dataset', self.__class__.__name__
400
  raise ValueError(self.missing_dataset_error.format(*format))
 
401
  return DataLoader(
402
  self.train_dataset,
403
  batch_size=self.batch_size,
@@ -425,23 +547,25 @@ class PROTAC_Model(pl.LightningModule):
425
  shuffle=False,
426
  )
427
 
428
-
429
  def train_model(
430
- train_df,
431
- val_df,
432
- test_df=None,
433
- hidden_dim=768,
434
- batch_size=8,
435
- learning_rate=2e-5,
436
- max_epochs=50,
437
- smiles_emb_dim=1024,
438
- join_embeddings='concat',
439
- smote_k_neighbors=5,
440
- use_ored_activity=True,
441
- fast_dev_run=False,
442
- use_logger=True,
443
- logger_name='protac',
444
- disabled_embeddings=[],
 
 
 
445
  ) -> tuple:
446
  """ Train a PROTAC model using the given datasets and hyperparameters.
447
 
@@ -455,7 +579,6 @@ def train_model(
455
  max_epochs (int): The maximum number of epochs.
456
  smiles_emb_dim (int): The dimension of the SMILES embeddings.
457
  smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
458
- use_ored_activity (bool): Whether to use the ORED activity column, i.e., "Active - OR" column.
459
  fast_dev_run (bool): Whether to run a fast development run.
460
  disabled_embeddings (list): The list of disabled embeddings.
461
 
@@ -468,16 +591,16 @@ def train_model(
468
  protein_embeddings,
469
  cell2embedding,
470
  smiles2fp,
471
- use_smote=True,
472
- oversampler=oversampler,
473
- use_ored_activity=use_ored_activity,
474
  )
475
  val_ds = PROTAC_Dataset(
476
  val_df,
477
  protein_embeddings,
478
  cell2embedding,
479
  smiles2fp,
480
- use_ored_activity=use_ored_activity,
481
  )
482
  if test_df is not None:
483
  test_ds = PROTAC_Dataset(
@@ -485,7 +608,7 @@ def train_model(
485
  protein_embeddings,
486
  cell2embedding,
487
  smiles2fp,
488
- use_ored_activity=use_ored_activity,
489
  )
490
  logger = pl.loggers.TensorBoardLogger(
491
  save_dir='../logs',
@@ -495,6 +618,18 @@ def train_model(
495
  pl.callbacks.EarlyStopping(
496
  monitor='train_loss',
497
  patience=10,
498
  mode='max',
499
  verbose=True,
500
  ),
@@ -514,6 +649,8 @@ def train_model(
514
  enable_model_summary=False,
515
  enable_checkpointing=False,
516
  enable_progress_bar=False,
 
 
517
  )
518
  model = PROTAC_Model(
519
  hidden_dim=hidden_dim,
@@ -522,8 +659,10 @@ def train_model(
522
  e3_emb_dim=1024,
523
  cell_emb_dim=768,
524
  batch_size=batch_size,
525
- learning_rate=learning_rate,
526
  join_embeddings=join_embeddings,
 
 
 
527
  train_dataset=train_ds,
528
  val_dataset=val_ds,
529
  test_dataset=test_ds if test_df is not None else None,
@@ -541,25 +680,42 @@ def train_model(
541
  # Setup hyperparameter optimization:
542
 
543
  def objective(
544
- trial,
545
- train_df,
546
- val_df,
547
- hidden_dim_options,
548
- batch_size_options,
549
- learning_rate_options,
550
- max_epochs_options,
551
- smote_k_neighbors_options,
552
- fast_dev_run=False,
553
- use_ored_activity=True,
554
- disabled_embeddings=[],
555
  ) -> float:
556
  # Generate the hyperparameters
557
  hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
558
  batch_size = trial.suggest_categorical('batch_size', batch_size_options)
559
  learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
560
- max_epochs = trial.suggest_categorical('max_epochs', max_epochs_options)
561
- join_embeddings = trial.suggest_categorical('join_embeddings', ['concat', 'sum'])
562
  smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
 
 
 
563
 
564
  # Train the model with the current set of hyperparameters
565
  _, _, metrics = train_model(
@@ -569,11 +725,14 @@ def objective(
569
  batch_size=batch_size,
570
  join_embeddings=join_embeddings,
571
  learning_rate=learning_rate,
572
- max_epochs=max_epochs,
 
573
  smote_k_neighbors=smote_k_neighbors,
 
 
574
  use_logger=False,
575
  fast_dev_run=fast_dev_run,
576
- use_ored_activity=use_ored_activity,
577
  disabled_embeddings=disabled_embeddings,
578
  )
579
 
@@ -587,14 +746,14 @@ def objective(
587
 
588
 
589
  def hyperparameter_tuning_and_training(
590
- train_df,
591
- val_df,
592
- test_df,
593
- fast_dev_run=False,
594
- n_trials=20,
595
- logger_name='protac_hparam_search',
596
- use_ored_activity=True,
597
- disabled_embeddings=[],
598
  ) -> tuple:
599
  """ Hyperparameter tuning and training of a PROTAC model.
600
 
@@ -603,6 +762,10 @@ def hyperparameter_tuning_and_training(
603
  val_df (pd.DataFrame): The validation set.
604
  test_df (pd.DataFrame): The test set.
605
  fast_dev_run (bool): Whether to run a fast development run.
606
 
607
  Returns:
608
  tuple: The trained model, the trainer, and the best metrics.
@@ -611,7 +774,6 @@ def hyperparameter_tuning_and_training(
611
  hidden_dim_options = [256, 512, 768]
612
  batch_size_options = [8, 16, 32]
613
  learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
614
- max_epochs_options = [10, 20, 50]
615
  smote_k_neighbors_options = list(range(3, 16))
616
 
617
  # Set the verbosity of Optuna
@@ -624,13 +786,12 @@ def hyperparameter_tuning_and_training(
624
  trial,
625
  train_df,
626
  val_df,
627
- hidden_dim_options,
628
- batch_size_options,
629
- learning_rate_options,
630
- max_epochs_options,
631
  smote_k_neighbors_options=smote_k_neighbors_options,
632
  fast_dev_run=fast_dev_run,
633
- use_ored_activity=use_ored_activity,
634
  disabled_embeddings=disabled_embeddings,
635
  ),
636
  n_trials=n_trials,
@@ -644,7 +805,7 @@ def hyperparameter_tuning_and_training(
644
  use_logger=True,
645
  logger_name=logger_name,
646
  fast_dev_run=fast_dev_run,
647
- use_ored_activity=use_ored_activity,
648
  disabled_embeddings=disabled_embeddings,
649
  **study.best_params,
650
  )
@@ -657,10 +818,11 @@ def hyperparameter_tuning_and_training(
657
 
658
 
659
  def main(
660
- use_ored_activity: bool = True,
661
  n_trials: int = 50,
662
- n_splits: int = 5,
663
  fast_dev_run: bool = False,
 
 
664
  ):
665
  """ Train a PROTAC model using the given datasets and hyperparameters.
666
 
@@ -671,101 +833,178 @@ def main(
671
  fast_dev_run (bool): Whether to run a fast development run.
672
  """
673
  ## Set the Column to Predict
674
- active_col = 'Active - OR' if use_ored_activity else 'Active'
675
- active_name = active_col.replace(' ', '').lower()
676
- active_name = 'active-and' if active_name == 'active' else active_name
677
-
678
- ## Test Sets
679
 
680
- active_df = protac_df[protac_df[active_col].notna()]
681
- # Before starting any training, we isolate a small group of test data. Each element in the test set is selected so that all the following conditions are met:
682
- # * its SMILES appears only once in the dataframe
683
- # * its Uniprot appears only once in the dataframe
684
- # * its (Smiles, Uniprot) pair appears only once in the dataframe
685
- unique_smiles = active_df['Smiles'].value_counts() == 1
686
- unique_uniprot = active_df['Uniprot'].value_counts() == 1
687
- unique_smiles_uniprot = active_df.groupby(['Smiles', 'Uniprot']).size() == 1
688
-
689
- # Get the indices of the unique samples
690
- unique_smiles_idx = active_df['Smiles'].map(unique_smiles)
691
- unique_uniprot_idx = active_df['Uniprot'].map(unique_uniprot)
692
- unique_smiles_uniprot_idx = active_df.set_index(['Smiles', 'Uniprot']).index.map(unique_smiles_uniprot)
693
-
694
- # Cross the indices to get the unique samples
695
- unique_samples = active_df[unique_smiles_idx & unique_uniprot_idx & unique_smiles_uniprot_idx].index
696
- test_df = active_df.loc[unique_samples]
697
- train_val_df = active_df[~active_df.index.isin(unique_samples)]
698
 
699
- ## Cross-Validation Training
 
 
700
 
701
- # Cross validation training with 5 splits. The split operation is done in three different ways:
702
- #
703
- # * Random split
704
- # * POI-wise: some POIs never in both splits
705
- # * Least Tanimoto similarity PROTAC-wise
706
 
707
- # NOTE: When set to 60, it will result in 29 groups, with nice distributions of
708
- # the number of unique groups in the train and validation sets, together with
709
- # the number of active and inactive PROTACs.
710
- n_bins_tanimoto = 60 if active_col == 'Active' else 400
711
 
712
  # Make directory ../reports if it does not exist
713
  if not os.path.exists('../reports'):
714
  os.makedirs('../reports')
715
 
716
- # Seed everything in pytorch lightning
717
- pl.seed_everything(42)
718
-
719
- # Loop over the different splits and train the model:
720
- report = []
721
- for group_type in ['random', 'uniprot', 'tanimoto']:
722
- print('-' * 100)
723
- print(f'Starting CV for group type: {group_type}')
724
- print('-' * 100)
725
- # Setup CV iterator and groups
726
- if group_type == 'random':
727
- kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
728
- groups = None
729
- elif group_type == 'uniprot':
730
- # Split by Uniprot
731
- kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
732
- encoder = OrdinalEncoder()
733
- groups = encoder.fit_transform(train_val_df['Uniprot'].values.reshape(-1, 1))
734
- elif group_type == 'tanimoto':
735
- # Split by tanimoto similarity, i.e., group_type PROTACs with similar Avg Tanimoto
736
- kf = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=42)
737
- tanimoto_groups = pd.cut(train_val_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
738
- encoder = OrdinalEncoder()
739
- groups = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1))
740
  # Start the CV over the folds
741
  X = train_val_df.drop(columns=active_col)
742
  y = train_val_df[active_col].tolist()
743
- for k, (train_index, val_index) in enumerate(kf.split(X, y, groups)):
 
744
  print('-' * 100)
745
- print(f'Starting CV for group type: {group_type}, fold: {k}')
746
  print('-' * 100)
747
  train_df = train_val_df.iloc[train_index]
748
  val_df = train_val_df.iloc[val_index]
749
  stats = {
750
  'fold': k,
751
- 'group_type': group_type,
752
  'train_len': len(train_df),
753
  'val_len': len(val_df),
754
  'train_perc': len(train_df) / len(train_val_df),
755
  'val_perc': len(val_df) / len(train_val_df),
756
- 'train_active_perc': train_df[active_col].sum() / len(train_df),
757
- 'train_inactive_perc': (len(train_df) - train_df[active_col].sum()) / len(train_df),
758
- 'val_active_perc': val_df[active_col].sum() / len(val_df),
759
- 'val_inactive_perc': (len(val_df) - val_df[active_col].sum()) / len(val_df),
760
- 'test_active_perc': test_df[active_col].sum() / len(test_df),
761
- 'test_inactive_perc': (len(test_df) - test_df[active_col].sum()) / len(test_df),
762
- 'num_leaking_uniprot': len(set(train_df['Uniprot']).intersection(set(val_df['Uniprot']))),
763
- 'num_leaking_smiles': len(set(train_df['Smiles']).intersection(set(val_df['Smiles']))),
764
- 'disabled_embeddings': np.nan,
765
  }
766
- if group_type != 'random':
767
- stats['train_unique_groups'] = len(np.unique(groups[train_index]))
768
- stats['val_unique_groups'] = len(np.unique(groups[val_index]))
 
 
769
  # Train and evaluate the model
770
  model, trainer, metrics = hyperparameter_tuning_and_training(
771
  train_df,
@@ -773,8 +1012,8 @@ def main(
773
  test_df,
774
  fast_dev_run=fast_dev_run,
775
  n_trials=n_trials,
776
- logger_name=f'protac_{active_name}_{group_type}_fold_{k}',
777
- use_ored_activity=use_ored_activity,
778
  )
779
  hparams = {p.strip('hparam_'): v for p, v in stats.items() if p.startswith('hparam_')}
780
  stats.update(metrics)
@@ -793,8 +1032,8 @@ def main(
793
  val_df,
794
  test_df,
795
  fast_dev_run=fast_dev_run,
796
- logger_name=f'protac_{active_name}_{group_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
797
- use_ored_activity=use_ored_activity,
798
  disabled_embeddings=disabled_embeddings,
799
  **hparams,
800
  )
@@ -803,11 +1042,11 @@ def main(
803
  del model
804
  del trainer
805
 
806
- report = pd.DataFrame(report)
807
- report.to_csv(
808
- f'../reports/cv_report_hparam_search_{n_splits}-splits_{active_name}.csv',
809
- index=False,
810
- )
811
 
812
 
813
  if __name__ == '__main__':
 
 
 
 
1
  import os
2
  import pickle
3
  import warnings
4
  import logging
5
+ from collections import defaultdict
6
+ from typing import Literal, List, Tuple, Optional
7
+ import urllib.request
8
+
9
+ import optuna
10
+ from optuna.samplers import TPESampler
11
+ import h5py
12
  import pandas as pd
13
  import numpy as np
 
14
 
15
  from rdkit import Chem
16
  from rdkit.Chem import AllChem
17
  from rdkit import DataStructs
 
 
18
  from jsonargparse import CLI
19
  from tqdm.auto import tqdm
20
  from imblearn.over_sampling import SMOTE, ADASYN
 
45
  # Ignore UserWarning from PyTorch Lightning
46
  warnings.filterwarnings("ignore", ".*does not have many workers.*")
47
 
 
48
  protac_df = pd.read_csv('../data/PROTAC-Degradation-DB.csv')
49
 
50
  # Map E3 Ligase Iap to IAP
51
  protac_df['E3 Ligase'] = protac_df['E3 Ligase'].str.replace('Iap', 'IAP')
52
 
53
+ def is_active(DC50: float, Dmax: float, oring=False, pDC50_threshold=7.0, Dmax_threshold=0.8) -> bool:
54
+ """ Check if a PROTAC is active based on DC50 and Dmax.
55
+ Args:
56
+ DC50(float): DC50 in nM
57
+ Dmax(float): Dmax in %
58
+ Returns:
59
+ bool: True if active, False if inactive, np.nan if either DC50 or Dmax is NaN
60
+ """
61
+ pDC50 = -np.log10(DC50 * 1e-9) if pd.notnull(DC50) else np.nan
62
+ Dmax = Dmax / 100
63
+ if pd.notnull(pDC50):
64
+ if pDC50 < pDC50_threshold:
65
+ return False
66
+ if pd.notnull(Dmax):
67
+ if Dmax < Dmax_threshold:
68
+ return False
69
+ if oring:
70
+ if pd.notnull(pDC50):
71
+ return True if pDC50 >= pDC50_threshold else False
72
+ elif pd.notnull(Dmax):
73
+ return True if Dmax >= Dmax_threshold else False
74
+ else:
75
+ return np.nan
76
+ else:
77
+ if pd.notnull(pDC50) and pd.notnull(Dmax):
78
+ return True if pDC50 >= pDC50_threshold and Dmax >= Dmax_threshold else False
79
+ else:
80
+ return np.nan
81
 
82
  # ## Load Protein Embeddings
83
 
 
116
  cell2embedding[cell_line] = np.zeros(emb_shape)
117
 
118
  ## Precompute Molecular Fingerprints
119
+ fingerprint_size = 224
120
  morgan_fpgen = AllChem.GetMorganGenerator(
121
  radius=15,
122
+ fpSize=fingerprint_size,
123
  includeChirality=True,
124
  )
125
 
 
147
  tanimoto_matrix = defaultdict(list)
148
  for i, smiles1 in enumerate(tqdm(protac_df['Smiles'].unique(), desc='Computing Tanimoto similarity')):
149
  fp1 = smiles2fp[smiles1]
150
+ # TODO: Use BulkTanimotoSimilarity for better performance
151
  for j, smiles2 in enumerate(protac_df['Smiles'].unique()):
152
  if j < i:
153
  continue
 
169
  smiles2fp=smiles2fp,
170
  use_smote=False,
171
  oversampler=None,
172
+ active_label='Active',
173
+ include_mol_graphs=False,
174
  ):
175
  """ Initialize the PROTAC dataset
176
 
 
182
  use_smote (bool): Whether to use SMOTE for oversampling
183
  use_ored_activity (bool): Whether to use the 'Active - OR' column
184
  """
185
+ # Filter out examples with NaN in active_col column
186
+ self.data = protac_df # [~protac_df[active_col].isna()]
187
  self.protein_embeddings = protein_embeddings
188
  self.cell2embedding = cell2embedding
189
  self.smiles2fp = smiles2fp
190
+ self.active_label = active_label
191
+ self.include_mol_graphs = include_mol_graphs
192
 
193
  self.smiles_emb_dim = smiles2fp[list(smiles2fp.keys())[0]].shape[0]
194
  self.protein_emb_dim = protein_embeddings[list(
 
196
  self.cell_emb_dim = cell2embedding[list(
197
  cell2embedding.keys())[0]].shape[0]
198
 
199
+ # Look up the embeddings
200
+ self.data = pd.DataFrame({
201
+ 'Smiles': self.data['Smiles'].apply(lambda x: smiles2fp[x].astype(np.float32)).tolist(),
202
+ 'Uniprot': self.data['Uniprot'].apply(lambda x: protein_embeddings[x].astype(np.float32)).tolist(),
203
+ 'E3 Ligase Uniprot': self.data['E3 Ligase Uniprot'].apply(lambda x: protein_embeddings[x].astype(np.float32)).tolist(),
204
+ 'Cell Line Identifier': self.data['Cell Line Identifier'].apply(lambda x: cell2embedding[x].astype(np.float32)).tolist(),
205
+ self.active_label: self.data[self.active_label].astype(np.float32).tolist(),
206
+ })
207
 
208
+ # Apply SMOTE
209
  self.use_smote = use_smote
210
  self.oversampler = oversampler
 
211
  if self.use_smote:
212
  self.apply_smote()
213
 
 
216
  features = []
217
  labels = []
218
  for _, row in self.data.iterrows():
219
  features.append(np.hstack([
220
+ row['Smiles'],
221
+ row['Uniprot'],
222
+ row['E3 Ligase Uniprot'],
223
+ row['Cell Line Identifier'],
224
  ]))
225
  labels.append(row[self.active_label])
226
 
 
253
  })
254
  self.data = df_smote
255
 
256
+ def fit_scaling(self, use_single_scaler=False, **scaler_kwargs) -> dict:
257
+ """ Fit the scalers for the data.
258
+
259
+ Returns:
260
+ dict: The fitted scalers.
261
+ """
262
+ if use_single_scaler:
263
+ scaler = StandardScaler(**scaler_kwargs)
264
+ embeddings = np.hstack([
265
+ np.array(self.data['Smiles'].tolist()),
266
+ np.array(self.data['Uniprot'].tolist()),
267
+ np.array(self.data['E3 Ligase Uniprot'].tolist()),
268
+ np.array(self.data['Cell Line Identifier'].tolist()),
269
+ ])
270
+ scaler.fit(embeddings)
271
+ return scaler
272
+ else:
273
+ scalers = {}
274
+ scalers['Smiles'] = StandardScaler(**scaler_kwargs)
275
+ scalers['Uniprot'] = StandardScaler(**scaler_kwargs)
276
+ scalers['E3 Ligase Uniprot'] = StandardScaler(**scaler_kwargs)
277
+ scalers['Cell Line Identifier'] = StandardScaler(**scaler_kwargs)
278
+
279
+ scalers['Smiles'].fit(np.stack(self.data['Smiles'].to_numpy()))
280
+ scalers['Uniprot'].fit(np.stack(self.data['Uniprot'].to_numpy()))
281
+ scalers['E3 Ligase Uniprot'].fit(np.stack(self.data['E3 Ligase Uniprot'].to_numpy()))
282
+ scalers['Cell Line Identifier'].fit(np.stack(self.data['Cell Line Identifier'].to_numpy()))
283
+
284
+ return scalers
285
+
286
+ def apply_scaling(self, scalers: dict, use_single_scaler=False):
287
+ """ Apply scaling to the data.
288
+
289
+ Args:
290
+ scalers (dict): The scalers for each feature.
291
+ """
292
+ if use_single_scaler:
293
+ embeddings = np.hstack([
294
+ np.array(self.data['Smiles'].tolist()),
295
+ np.array(self.data['Uniprot'].tolist()),
296
+ np.array(self.data['E3 Ligase Uniprot'].tolist()),
297
+ np.array(self.data['Cell Line Identifier'].tolist()),
298
+ ])
299
+ scaled_embeddings = scalers.transform(embeddings)
300
+ self.data = pd.DataFrame({
301
+ 'Smiles': list(scaled_embeddings[:, :self.smiles_emb_dim]),
302
+ 'Uniprot': list(scaled_embeddings[:, self.smiles_emb_dim:self.smiles_emb_dim+self.protein_emb_dim]),
303
+ 'E3 Ligase Uniprot': list(scaled_embeddings[:, self.smiles_emb_dim+self.protein_emb_dim:self.smiles_emb_dim+2*self.protein_emb_dim]),
304
+ 'Cell Line Identifier': list(scaled_embeddings[:, -self.cell_emb_dim:]),
305
+ self.active_label: self.data[self.active_label]
306
+ })
307
+ else:
308
+ self.data['Smiles'] = self.data['Smiles'].apply(lambda x: scalers['Smiles'].transform(x[np.newaxis, :])[0])
309
+ self.data['Uniprot'] = self.data['Uniprot'].apply(lambda x: scalers['Uniprot'].transform(x[np.newaxis, :])[0])
310
+ self.data['E3 Ligase Uniprot'] = self.data['E3 Ligase Uniprot'].apply(lambda x: scalers['E3 Ligase Uniprot'].transform(x[np.newaxis, :])[0])
311
+ self.data['Cell Line Identifier'] = self.data['Cell Line Identifier'].apply(lambda x: scalers['Cell Line Identifier'].transform(x[np.newaxis, :])[0])
312
+
313
  def __len__(self):
314
  return len(self.data)
315
 
316
  def __getitem__(self, idx):
317
+ elem = {
318
+ 'smiles_emb': self.data['Smiles'].iloc[idx],
319
+ 'poi_emb': self.data['Uniprot'].iloc[idx],
320
+ 'e3_emb': self.data['E3 Ligase Uniprot'].iloc[idx],
321
+ 'cell_emb': self.data['Cell Line Identifier'].iloc[idx],
322
+ 'active': self.data[self.active_label].iloc[idx],
323
+ }
324
  return elem
325
 
326
 
 
329
  def __init__(
330
  self,
331
  hidden_dim: int,
332
+ smiles_emb_dim: int = fingerprint_size,
333
  poi_emb_dim: int = 1024,
334
  e3_emb_dim: int = 1024,
335
  cell_emb_dim: int = 768,
336
  batch_size: int = 32,
337
  learning_rate: float = 1e-3,
338
  dropout: float = 0.2,
339
+ join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
340
  train_dataset: PROTAC_Dataset = None,
341
  val_dataset: PROTAC_Dataset = None,
342
  test_dataset: PROTAC_Dataset = None,
343
  disabled_embeddings: list = [],
344
+ apply_scaling: bool = False,
345
  ):
346
  super().__init__()
347
  self.poi_emb_dim = poi_emb_dim
 
356
  self.val_dataset = val_dataset
357
  self.test_dataset = test_dataset
358
  self.disabled_embeddings = disabled_embeddings
359
+ self.apply_scaling = apply_scaling
360
  # Set our init args as class attributes
361
  self.__dict__.update(locals()) # Add arguments as attributes
362
  # Save the arguments passed to init
 
367
  ]
368
  self.save_hyperparameters(ignore=ignore_args_as_hyperparams)
369
 
370
+ # Define "surrogate models" branches
371
+ if self.join_embeddings != 'beginning':
372
+ if 'poi' not in self.disabled_embeddings:
373
+ self.poi_emb = nn.Linear(poi_emb_dim, hidden_dim)
374
+ if 'e3' not in self.disabled_embeddings:
375
+ self.e3_emb = nn.Linear(e3_emb_dim, hidden_dim)
376
+ if 'cell' not in self.disabled_embeddings:
377
+ self.cell_emb = nn.Linear(cell_emb_dim, hidden_dim)
378
+ if 'smiles' not in self.disabled_embeddings:
379
+ self.smiles_emb = nn.Linear(smiles_emb_dim, hidden_dim)
380
+
381
+ # Define hidden dimension for joining layer
382
+ if self.join_embeddings == 'beginning':
383
+ joint_dim = smiles_emb_dim if 'smiles' not in self.disabled_embeddings else 0
384
+ joint_dim += poi_emb_dim if 'poi' not in self.disabled_embeddings else 0
385
+ joint_dim += e3_emb_dim if 'e3' not in self.disabled_embeddings else 0
386
+ joint_dim += cell_emb_dim if 'cell' not in self.disabled_embeddings else 0
387
+ elif self.join_embeddings == 'concat':
388
  joint_dim = hidden_dim * (4 - len(self.disabled_embeddings))
389
  elif self.join_embeddings == 'sum':
390
  joint_dim = hidden_dim
391
+
392
+ self.fc0 = nn.Linear(joint_dim, joint_dim)
393
  self.fc1 = nn.Linear(joint_dim, hidden_dim)
394
  self.fc2 = nn.Linear(hidden_dim, hidden_dim)
395
  self.fc3 = nn.Linear(hidden_dim, 1)
 
414
  model = {1}.load_from_checkpoint('checkpoint.ckpt')
415
  model.{0} = my_{0}
416
  '''
417
+
418
+ # Apply scaling in datasets
419
+ if self.apply_scaling:
420
+ use_single_scaler = True if self.join_embeddings == 'beginning' else False
421
+ self.scalers = self.train_dataset.fit_scaling(use_single_scaler)
422
+ self.train_dataset.apply_scaling(self.scalers, use_single_scaler)
423
+ self.val_dataset.apply_scaling(self.scalers, use_single_scaler)
424
+ if self.test_dataset:
425
+ self.test_dataset.apply_scaling(self.scalers, use_single_scaler)
426
 
427
  def forward(self, poi_emb, e3_emb, cell_emb, smiles_emb):
428
  embeddings = []
429
+ if self.join_embeddings == 'beginning':
430
+ if 'poi' not in self.disabled_embeddings:
431
+ embeddings.append(poi_emb)
432
+ if 'e3' not in self.disabled_embeddings:
433
+ embeddings.append(e3_emb)
434
+ if 'cell' not in self.disabled_embeddings:
435
+ embeddings.append(cell_emb)
436
+ if 'smiles' not in self.disabled_embeddings:
437
+ embeddings.append(smiles_emb)
438
  x = torch.cat(embeddings, dim=1)
439
+ x = self.dropout(F.relu(self.fc0(x)))
440
+ else:
441
+ if 'poi' not in self.disabled_embeddings:
442
+ embeddings.append(self.poi_emb(poi_emb))
443
+ if 'e3' not in self.disabled_embeddings:
444
+ embeddings.append(self.e3_emb(e3_emb))
445
+ if 'cell' not in self.disabled_embeddings:
446
+ embeddings.append(self.cell_emb(cell_emb))
447
+ if 'smiles' not in self.disabled_embeddings:
448
+ embeddings.append(self.smiles_emb(smiles_emb))
449
+ if self.join_embeddings == 'concat':
450
+ x = torch.cat(embeddings, dim=1)
451
+ elif self.join_embeddings == 'sum':
452
+ if len(embeddings) > 1:
453
+ embeddings = torch.stack(embeddings, dim=1)
454
+ x = torch.sum(embeddings, dim=1)
455
+ else:
456
+ x = embeddings[0]
457
  x = self.dropout(F.relu(self.fc1(x)))
458
  x = self.dropout(F.relu(self.fc2(x)))
459
  x = self.fc3(x)
 
493
  cell_emb = batch['cell_emb']
494
  smiles_emb = batch['smiles_emb']
495
 
496
+ if self.apply_scaling:
497
+ if self.join_embeddings == 'beginning':
498
+ embeddings = np.hstack([
499
+ np.array(smiles_emb.tolist()),
500
+ np.array(poi_emb.tolist()),
501
+ np.array(e3_emb.tolist()),
502
+ np.array(cell_emb.tolist()),
503
+ ])
504
+ embeddings = self.scalers.transform(embeddings)
505
+ smiles_emb = embeddings[:, :self.smiles_emb_dim]
506
+ poi_emb = embeddings[:, self.smiles_emb_dim:self.smiles_emb_dim+self.poi_emb_dim]
507
+ e3_emb = embeddings[:, self.smiles_emb_dim+self.poi_emb_dim:self.smiles_emb_dim+2*self.poi_emb_dim]
508
+ cell_emb = embeddings[:, -self.cell_emb_dim:]
509
+ else:
510
+ poi_emb = self.scalers['Uniprot'].transform(poi_emb)
511
+ e3_emb = self.scalers['E3 Ligase Uniprot'].transform(e3_emb)
512
+ cell_emb = self.scalers['Cell Line Identifier'].transform(cell_emb)
513
+ smiles_emb = self.scalers['Smiles'].transform(smiles_emb)
514
+
515
  y_hat = self.forward(poi_emb, e3_emb, cell_emb, smiles_emb)
516
  return torch.sigmoid(y_hat)
517
 
 
519
  if self.train_dataset is None:
520
  format = 'train_dataset', self.__class__.__name__
521
  raise ValueError(self.missing_dataset_error.format(*format))
522
+
523
  return DataLoader(
524
  self.train_dataset,
525
  batch_size=self.batch_size,
 
547
  shuffle=False,
548
  )
549
 
 
550
  def train_model(
551
+ train_df: pd.DataFrame,
552
+ val_df: pd.DataFrame,
553
+ test_df: Optional[pd.DataFrame] = None,
554
+ hidden_dim: int = 768,
555
+ batch_size: int = 8,
556
+ learning_rate: float = 2e-5,
557
+ dropout: float = 0.2,
558
+ max_epochs: int = 50,
559
+ smiles_emb_dim: int = fingerprint_size,
560
+ join_embeddings: Literal['beginning', 'concat', 'sum'] = 'concat',
561
+ smote_k_neighbors:int = 5,
562
+ use_smote: bool = True,
563
+ apply_scaling: bool = False,
564
+ active_label:str = 'Active',
565
+ fast_dev_run: bool = False,
566
+ use_logger: bool = True,
567
+ logger_name: str = 'protac',
568
+ disabled_embeddings: List[str] = [],
569
  ) -> tuple:
570
  """ Train a PROTAC model using the given datasets and hyperparameters.
571
 
 
579
  max_epochs (int): The maximum number of epochs.
580
  smiles_emb_dim (int): The dimension of the SMILES embeddings.
581
  smote_k_neighbors (int): The number of neighbors for the SMOTE oversampler.
 
582
  fast_dev_run (bool): Whether to run a fast development run.
583
  disabled_embeddings (list): The list of disabled embeddings.
584
 
 
591
  protein_embeddings,
592
  cell2embedding,
593
  smiles2fp,
594
+ use_smote=use_smote,
595
+ oversampler=oversampler if use_smote else None,
596
+ active_label=active_label,
597
  )
598
  val_ds = PROTAC_Dataset(
599
  val_df,
600
  protein_embeddings,
601
  cell2embedding,
602
  smiles2fp,
603
+ active_label=active_label,
604
  )
605
  if test_df is not None:
606
  test_ds = PROTAC_Dataset(
 
608
  protein_embeddings,
609
  cell2embedding,
610
  smiles2fp,
611
+ active_label=active_label,
612
  )
613
  logger = pl.loggers.TensorBoardLogger(
614
  save_dir='../logs',
 
618
  pl.callbacks.EarlyStopping(
619
  monitor='train_loss',
620
  patience=10,
621
+ mode='min',
622
+ verbose=True,
623
+ ),
624
+ pl.callbacks.EarlyStopping(
625
+ monitor='val_loss',
626
+ patience=5,
627
+ mode='min',
628
+ verbose=True,
629
+ ),
630
+ pl.callbacks.EarlyStopping(
631
+ monitor='val_acc',
632
+ patience=10,
633
  mode='max',
634
  verbose=True,
635
  ),
 
649
  enable_model_summary=False,
650
  enable_checkpointing=False,
651
  enable_progress_bar=False,
652
+ devices=1,
653
+ num_nodes=1,
654
  )
655
  model = PROTAC_Model(
656
  hidden_dim=hidden_dim,
 
659
  e3_emb_dim=1024,
660
  cell_emb_dim=768,
661
  batch_size=batch_size,
 
662
  join_embeddings=join_embeddings,
663
+ dropout=dropout,
664
+ learning_rate=learning_rate,
665
+ apply_scaling=apply_scaling,
666
  train_dataset=train_ds,
667
  val_dataset=val_ds,
668
  test_dataset=test_ds if test_df is not None else None,
 
680
  # Setup hyperparameter optimization:
681
 
682
  def objective(
683
+ trial: optuna.Trial,
684
+ train_df: pd.DataFrame,
685
+ val_df: pd.DataFrame,
686
+ hidden_dim_options: List[int] = [256, 512, 768],
687
+ batch_size_options: List[int] = [8, 16, 32],
688
+ learning_rate_options: Tuple[float, float] = (1e-5, 1e-3),
689
+ smote_k_neighbors_options: List[int] = list(range(3, 16)),
690
+ dropout_options: Tuple[float, float] = (0.1, 0.5),
691
+ fast_dev_run: bool = False,
692
+ active_label: str = 'Active',
693
+ disabled_embeddings: List[str] = [],
694
  ) -> float:
695
+ """ Objective function for hyperparameter optimization.
696
+
697
+ Args:
698
+ trial (optuna.Trial): The Optuna trial object.
699
+ train_df (pd.DataFrame): The training set.
700
+ val_df (pd.DataFrame): The validation set.
701
+ hidden_dim_options (List[int]): The hidden dimension options.
702
+ batch_size_options (List[int]): The batch size options.
703
+ learning_rate_options (Tuple[float, float]): The learning rate options.
704
+ smote_k_neighbors_options (List[int]): The SMOTE k neighbors options.
705
+ dropout_options (Tuple[float, float]): The dropout options.
706
+ fast_dev_run (bool): Whether to run a fast development run.
707
+ active_label (str): The active label column.
708
+ disabled_embeddings (List[str]): The list of disabled embeddings.
709
+ """
710
  # Generate the hyperparameters
711
  hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
712
  batch_size = trial.suggest_categorical('batch_size', batch_size_options)
713
  learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
714
+ join_embeddings = trial.suggest_categorical('join_embeddings', ['beginning', 'concat', 'sum'])
 
715
  smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
716
+ use_smote = trial.suggest_categorical('use_smote', [True, False])
717
+ apply_scaling = trial.suggest_categorical('apply_scaling', [True, False])
718
+ dropout = trial.suggest_float('dropout', *dropout_options)
719
 
720
  # Train the model with the current set of hyperparameters
721
  _, _, metrics = train_model(
 
725
  batch_size=batch_size,
726
  join_embeddings=join_embeddings,
727
  learning_rate=learning_rate,
728
+ dropout=dropout,
729
+ max_epochs=100,
730
  smote_k_neighbors=smote_k_neighbors,
731
+ apply_scaling=apply_scaling,
732
+ use_smote=use_smote,
733
  use_logger=False,
734
  fast_dev_run=fast_dev_run,
735
+ active_label=active_label,
736
  disabled_embeddings=disabled_embeddings,
737
  )
738
 
 
746
 
747
 
748
  def hyperparameter_tuning_and_training(
749
+ train_df: pd.DataFrame,
750
+ val_df: pd.DataFrame,
751
+ test_df: pd.DataFrame,
752
+ fast_dev_run: bool = False,
753
+ n_trials: int = 50,
754
+ logger_name: str = 'protac_hparam_search',
755
+ active_label: str = 'Active',
756
+ disabled_embeddings: List[str] = [],
757
  ) -> tuple:
758
  """ Hyperparameter tuning and training of a PROTAC model.
759
 
 
762
  val_df (pd.DataFrame): The validation set.
763
  test_df (pd.DataFrame): The test set.
764
  fast_dev_run (bool): Whether to run a fast development run.
765
+ n_trials (int): The number of hyperparameter optimization trials.
766
+ logger_name (str): The name of the logger.
767
+ active_label (str): The active label column.
768
+ disabled_embeddings (List[str]): The list of disabled embeddings.
769
 
770
  Returns:
771
  tuple: The trained model, the trainer, and the best metrics.
 
774
  hidden_dim_options = [256, 512, 768]
775
  batch_size_options = [8, 16, 32]
776
  learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
 
777
  smote_k_neighbors_options = list(range(3, 16))
778
 
779
  # Set the verbosity of Optuna
 
786
  trial,
787
  train_df,
788
  val_df,
789
+ hidden_dim_options=hidden_dim_options,
790
+ batch_size_options=batch_size_options,
791
+ learning_rate_options=learning_rate_options,
 
792
  smote_k_neighbors_options=smote_k_neighbors_options,
793
  fast_dev_run=fast_dev_run,
794
+ active_label=active_label,
795
  disabled_embeddings=disabled_embeddings,
796
  ),
797
  n_trials=n_trials,
 
805
  use_logger=True,
806
  logger_name=logger_name,
807
  fast_dev_run=fast_dev_run,
808
+ active_label=active_label,
809
  disabled_embeddings=disabled_embeddings,
810
  **study.best_params,
811
  )
 
818
 
819
 
820
  def main(
821
+ active_col: str = 'Active (Dmax 0.6, pDC50 6.0)',
822
  n_trials: int = 50,
 
823
  fast_dev_run: bool = False,
824
+ test_split: float = 0.2,
825
+ cv_n_splits: int = 5,
826
  ):
827
  """ Train a PROTAC model using the given datasets and hyperparameters.
828
 
 
833
  fast_dev_run (bool): Whether to run a fast development run.
834
  """
835
  ## Set the Column to Predict
836
+ active_name = active_col.replace(' ', '_').strip('(').strip(')').strip(',')
837
 
838
+ # Get Dmax_threshold from the active_col
839
+ Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())
840
+ pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())
841
 
842
+ protac_df[active_col] = protac_df.apply(
843
+ lambda x: is_active(x['DC50 (nM)'], x['Dmax (%)'], pDC50_threshold=pDC50_threshold, Dmax_threshold=Dmax_threshold), axis=1
844
+ )
845
 
846
+ ## Test Sets
847
 
848
+ test_indeces = {}
849
+
850
+ ### Random Split
851
+
852
+ # Randomly select 20% of the active PROTACs as the test set
853
+ active_df = protac_df[protac_df[active_col].notna()].copy()
854
+ test_df = active_df.sample(frac=test_split, random_state=42)
855
+ test_indeces['random'] = test_df.index
856
+
857
+ ### E3-based Split
858
+
859
+ encoder = OrdinalEncoder()
860
+ protac_df['E3 Group'] = encoder.fit_transform(protac_df[['E3 Ligase']]).astype(int)
861
+ active_df = protac_df[protac_df[active_col].notna()].copy()
862
+ test_df = active_df[(active_df['E3 Ligase'] != 'VHL') & (active_df['E3 Ligase'] != 'CRBN')]
863
+ test_indeces['e3_ligase'] = test_df.index
864
+
865
+ ### Tanimoto-based Split
866
+
867
+ n_bins_tanimoto = 200
868
+ tanimoto_groups = pd.cut(protac_df['Avg Tanimoto'], bins=n_bins_tanimoto).copy()
869
+ encoder = OrdinalEncoder()
870
+ protac_df['Tanimoto Group'] = encoder.fit_transform(tanimoto_groups.values.reshape(-1, 1)).astype(int)
871
+ active_df = protac_df[protac_df[active_col].notna()].copy()
872
+
873
+ test_df = []
874
+ # For each group, get the number of active and inactive entries. Then, add those
875
+ # entries to the test_df if: 1) the test_df length + the group entries is less than
876
+ # 20% of the active_df length, and 2) the percentage of True and False entries
877
+ # in the active_col in test_df is roughly 50%.
878
+ # Start the loop from the groups containing the smallest number of entries.
879
+ for group in reversed(active_df['Tanimoto Group'].value_counts().index):
880
+ group_df = active_df[active_df['Tanimoto Group'] == group]
881
+ if test_df == []:
882
+ test_df.append(group_df)
883
+ continue
884
+
885
+ num_entries = len(group_df)
886
+ num_active_group = group_df[active_col].sum()
887
+ num_inactive_group = num_entries - num_active_group
888
+
889
+ tmp_test_df = pd.concat(test_df)
890
+ num_entries_test = len(tmp_test_df)
891
+ num_active_test = tmp_test_df[active_col].sum()
892
+ num_inactive_test = num_entries_test - num_active_test
893
+
894
+ # Check if the group entries can be added to the test_df
895
+ if num_entries_test + num_entries < test_split * len(active_df):
896
+ # Add anything at the beginning
897
+ if num_entries_test + num_entries < test_split / 2 * len(active_df):
898
+ test_df.append(group_df)
899
+ continue
900
+ # Be more selective and make sure that the percentage of active and
901
+ # inactive is balanced
902
+ if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
903
+ if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
904
+ test_df.append(group_df)
905
+ test_df = pd.concat(test_df)
906
+ # Save to global dictionary of test indices
907
+ test_indeces['tanimoto'] = test_df.index
908
+
909
+ ### Target-based Split
910
+
911
+ encoder = OrdinalEncoder()
912
+ protac_df['Uniprot Group'] = encoder.fit_transform(protac_df[['Uniprot']]).astype(int)
913
+ active_df = protac_df[protac_df[active_col].notna()].copy()
914
+
915
+ test_df = []
916
+ # For each group, get the number of active and inactive entries. Then, add those
917
+ # entries to the test_df if: 1) the test_df length + the group entries is less than
918
+ # 20% of the active_df length, and 2) the percentage of True and False entries
919
+ # in the active_col in test_df is roughly 50%.
920
+ # Start the loop from the groups containing the smallest number of entries.
921
+ for group in reversed(active_df['Uniprot'].value_counts().index):
922
+ group_df = active_df[active_df['Uniprot'] == group]
923
+ if test_df == []:
924
+ test_df.append(group_df)
925
+ continue
926
+
927
+ num_entries = len(group_df)
928
+ num_active_group = group_df[active_col].sum()
929
+ num_inactive_group = num_entries - num_active_group
930
+
931
+ tmp_test_df = pd.concat(test_df)
932
+ num_entries_test = len(tmp_test_df)
933
+ num_active_test = tmp_test_df[active_col].sum()
934
+ num_inactive_test = num_entries_test - num_active_test
935
+
936
+ # Check if the group entries can be added to the test_df
937
+ if num_entries_test + num_entries < test_split * len(active_df):
938
+ # Add anything at the beginning
939
+ if num_entries_test + num_entries < test_split / 2 * len(active_df):
940
+ test_df.append(group_df)
941
+ continue
942
+ # Be more selective and make sure that the percentage of active and
943
+ # inactive is balanced
944
+ if (num_active_group + num_active_test) / (num_entries_test + num_entries) < 0.6:
945
+ if (num_inactive_group + num_inactive_test) / (num_entries_test + num_entries) < 0.6:
946
+ test_df.append(group_df)
947
+ test_df = pd.concat(test_df)
948
+ # Save to global dictionary of test indices
949
+ test_indeces['uniprot'] = test_df.index
950
 
951
+ ## Cross-Validation Training
952
+
953
  # Make directory ../reports if it does not exist
954
  if not os.path.exists('../reports'):
955
  os.makedirs('../reports')
956
 
957
+ for split_type, indeces in test_indeces.items():
958
+ active_df = protac_df[protac_df[active_col].notna()].copy()
959
+ test_df = active_df.loc[indeces]
960
+ train_val_df = active_df[~active_df.index.isin(test_df.index)]
961
+
962
+ if split_type == 'random':
963
+ kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
964
+ group = None
965
+ elif split_type == 'e3_ligase':
966
+ kf = StratifiedKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
967
+ group = train_val_df['E3 Group'].to_numpy()
968
+ elif split_type == 'tanimoto':
969
+ kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
970
+ group = train_val_df['Tanimoto Group'].to_numpy()
971
+ elif split_type == 'uniprot':
972
+ kf = StratifiedGroupKFold(n_splits=cv_n_splits, shuffle=True, random_state=42)
973
+ group = train_val_df['Uniprot Group'].to_numpy()
974
  # Start the CV over the folds
975
  X = train_val_df.drop(columns=active_col)
976
  y = train_val_df[active_col].tolist()
977
+ report = []
978
+ for k, (train_index, val_index) in enumerate(kf.split(X, y, group)):
979
  print('-' * 100)
980
+ print(f'Starting CV for group type: {split_type}, fold: {k}')
981
  print('-' * 100)
982
  train_df = train_val_df.iloc[train_index]
983
  val_df = train_val_df.iloc[val_index]
984
+
985
+ leaking_uniprot = list(set(train_df['Uniprot']).intersection(set(val_df['Uniprot'])))
986
+ leaking_smiles = list(set(train_df['Smiles']).intersection(set(val_df['Smiles'])))
987
+
988
  stats = {
989
  'fold': k,
 
990
  'train_len': len(train_df),
991
  'val_len': len(val_df),
992
  'train_perc': len(train_df) / len(train_val_df),
993
  'val_perc': len(val_df) / len(train_val_df),
994
+ 'train_active (%)': train_df[active_col].sum() / len(train_df) * 100,
995
+ 'train_inactive (%)': (len(train_df) - train_df[active_col].sum()) / len(train_df) * 100,
996
+ 'val_active (%)': val_df[active_col].sum() / len(val_df) * 100,
997
+ 'val_inactive (%)': (len(val_df) - val_df[active_col].sum()) / len(val_df) * 100,
998
+ 'num_leaking_uniprot': len(leaking_uniprot),
999
+ 'num_leaking_smiles': len(leaking_smiles),
1000
+ 'train_leaking_uniprot (%)': len(train_df[train_df['Uniprot'].isin(leaking_uniprot)]) / len(train_df) * 100,
1001
+ 'train_leaking_smiles (%)': len(train_df[train_df['Smiles'].isin(leaking_smiles)]) / len(train_df) * 100,
 
1002
  }
1003
+ if split_type != 'random':
1004
+ stats['train_unique_groups'] = len(np.unique(group[train_index]))
1005
+ stats['val_unique_groups'] = len(np.unique(group[val_index]))
1006
+ report.append(stats)
1007
+
1008
  # Train and evaluate the model
1009
  model, trainer, metrics = hyperparameter_tuning_and_training(
1010
  train_df,
 
1012
  test_df,
1013
  fast_dev_run=fast_dev_run,
1014
  n_trials=n_trials,
1015
+ logger_name=f'protac_{active_name}_{split_type}_fold_{k}',
1016
+ active_label=active_col,
1017
  )
1018
  hparams = {p.strip('hparam_'): v for p, v in stats.items() if p.startswith('hparam_')}
1019
  stats.update(metrics)
 
1032
  val_df,
1033
  test_df,
1034
  fast_dev_run=fast_dev_run,
1035
+ logger_name=f'protac_{active_name}_{split_type}_fold_{k}_disabled-{"-".join(disabled_embeddings)}',
1036
+ active_label=active_col,
1037
  disabled_embeddings=disabled_embeddings,
1038
  **hparams,
1039
  )
 
1042
  del model
1043
  del trainer
1044
 
1045
+ report = pd.DataFrame(report)
1046
+ report.to_csv(
1047
+ f'../reports/cv_report_hparam_search_{cv_n_splits}-splits_{active_name}_test_split_{test_split}.csv',
1048
+ index=False,
1049
+ )
1050
 
1051
 
1052
  if __name__ == '__main__':
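
For reference, a minimal sketch (not part of the commit) of how the new is_active labelling and the threshold parsing in main() behave for the default active_col value shown in the diff; the measurement values below are hypothetical.

import numpy as np

# Default column name from main() in this commit
active_col = 'Active (Dmax 0.6, pDC50 6.0)'

# Thresholds parsed out of the column name, mirroring the string handling in main()
Dmax_threshold = float(active_col.split('Dmax')[1].split(',')[0].strip('(').strip(')').strip())  # 0.6
pDC50_threshold = float(active_col.split('pDC50')[1].strip('(').strip(')').strip())              # 6.0

# is_active() converts DC50 (nM) to pDC50 and Dmax (%) to a fraction before thresholding
DC50, Dmax = 10.0, 95.0                # hypothetical measurements
pDC50 = -np.log10(DC50 * 1e-9)         # 8.0
print(pDC50 >= pDC50_threshold and Dmax / 100 >= Dmax_threshold)  # True -> labelled active
# DC50 = 5000 nM gives pDC50 of about 5.3 < 6.0, so that compound would be labelled inactive;
# with oring=False, a missing DC50 or Dmax that does not already fail a threshold yields NaN (unlabelled).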