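"""Hyperparameter optimization utilities for PROTAC activity models.

Provides Optuna objective functions and tuning-plus-training entry points for
the PyTorch model (via ``train_model``) and for scikit-learn classifiers
(via ``train_sklearn_model``).
"""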
import os
from typing import Literal, List, Tuple, Optional, Dict
from .pytorch_models import train_model
from .sklearn_models import (
train_sklearn_model,
suggest_random_forest,
suggest_logistic_regression,
suggest_svc,
suggest_gradient_boosting,
)
import optuna
from optuna.samplers import TPESampler
import joblib
import pandas as pd
from sklearn.ensemble import (
RandomForestClassifier,
GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


def pytorch_model_objective(
trial: optuna.Trial,
protein2embedding: Dict,
cell2embedding: Dict,
smiles2fp: Dict,
train_df: pd.DataFrame,
val_df: pd.DataFrame,
hidden_dim_options: List[int] = [256, 512, 768],
batch_size_options: List[int] = [8, 16, 32],
learning_rate_options: Tuple[float, float] = (1e-5, 1e-3),
smote_k_neighbors_options: List[int] = list(range(3, 16)),
dropout_options: Tuple[float, float] = (0.1, 0.5),
fast_dev_run: bool = False,
active_label: str = 'Active',
disabled_embeddings: List[str] = [],
max_epochs: int = 100,
) -> float:
""" Objective function for hyperparameter optimization.
Args:
trial (optuna.Trial): The Optuna trial object.
train_df (pd.DataFrame): The training set.
val_df (pd.DataFrame): The validation set.
hidden_dim_options (List[int]): The hidden dimension options.
batch_size_options (List[int]): The batch size options.
learning_rate_options (Tuple[float, float]): The learning rate options.
smote_k_neighbors_options (List[int]): The SMOTE k neighbors options.
dropout_options (Tuple[float, float]): The dropout options.
fast_dev_run (bool): Whether to run a fast development run.
active_label (str): The active label column.
disabled_embeddings (List[str]): The list of disabled embeddings.
"""
# Generate the hyperparameters
hidden_dim = trial.suggest_categorical('hidden_dim', hidden_dim_options)
batch_size = trial.suggest_categorical('batch_size', batch_size_options)
learning_rate = trial.suggest_float('learning_rate', *learning_rate_options, log=True)
join_embeddings = trial.suggest_categorical('join_embeddings', ['beginning', 'concat', 'sum'])
smote_k_neighbors = trial.suggest_categorical('smote_k_neighbors', smote_k_neighbors_options)
use_smote = trial.suggest_categorical('use_smote', [True, False])
apply_scaling = trial.suggest_categorical('apply_scaling', [True, False])
dropout = trial.suggest_float('dropout', *dropout_options)
# Train the model with the current set of hyperparameters
_, _, metrics = train_model(
protein2embedding,
cell2embedding,
smiles2fp,
train_df,
val_df,
hidden_dim=hidden_dim,
batch_size=batch_size,
join_embeddings=join_embeddings,
learning_rate=learning_rate,
dropout=dropout,
max_epochs=max_epochs,
smote_k_neighbors=smote_k_neighbors,
apply_scaling=apply_scaling,
use_smote=use_smote,
use_logger=False,
fast_dev_run=fast_dev_run,
active_label=active_label,
disabled_embeddings=disabled_embeddings,
)
# Metrics is a dictionary containing at least the validation loss
val_loss = metrics['val_loss']
val_acc = metrics['val_acc']
val_roc_auc = metrics['val_roc_auc']
    # Optuna minimizes the objective, so combine the metrics such that lower is better
    return val_loss - val_acc - val_roc_auc


def hyperparameter_tuning_and_training(
protein2embedding: Dict,
cell2embedding: Dict,
smiles2fp: Dict,
train_df: pd.DataFrame,
val_df: pd.DataFrame,
test_df: Optional[pd.DataFrame] = None,
fast_dev_run: bool = False,
n_trials: int = 50,
logger_name: str = 'protac_hparam_search',
active_label: str = 'Active',
disabled_embeddings: List[str] = [],
study_filename: Optional[str] = None,
) -> tuple:
""" Hyperparameter tuning and training of a PROTAC model.
Args:
train_df (pd.DataFrame): The training set.
val_df (pd.DataFrame): The validation set.
test_df (pd.DataFrame): The test set.
fast_dev_run (bool): Whether to run a fast development run.
n_trials (int): The number of hyperparameter optimization trials.
logger_name (str): The name of the logger.
active_label (str): The active label column.
disabled_embeddings (List[str]): The list of disabled embeddings.
Returns:
tuple: The trained model, the trainer, and the best metrics.
"""
# Define the search space
hidden_dim_options = [256, 512, 768]
batch_size_options = [8, 16, 32]
learning_rate_options = (1e-5, 1e-3) # min and max values for loguniform distribution
smote_k_neighbors_options = list(range(3, 16))
# Set the verbosity of Optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Create an Optuna study object
sampler = TPESampler(seed=42, multivariate=True)
study = optuna.create_study(direction='minimize', sampler=sampler)
study_loaded = False
if study_filename:
if os.path.exists(study_filename):
study = joblib.load(study_filename)
study_loaded = True
print(f'Loaded study from {study_filename}')
if not study_loaded:
study.optimize(
lambda trial: pytorch_model_objective(
trial=trial,
protein2embedding=protein2embedding,
cell2embedding=cell2embedding,
smiles2fp=smiles2fp,
train_df=train_df,
val_df=val_df,
hidden_dim_options=hidden_dim_options,
batch_size_options=batch_size_options,
learning_rate_options=learning_rate_options,
smote_k_neighbors_options=smote_k_neighbors_options,
fast_dev_run=fast_dev_run,
active_label=active_label,
disabled_embeddings=disabled_embeddings,
),
n_trials=n_trials,
)
if study_filename:
joblib.dump(study, study_filename)
# Retrain the model with the best hyperparameters
model, trainer, metrics = train_model(
protein2embedding=protein2embedding,
cell2embedding=cell2embedding,
smiles2fp=smiles2fp,
train_df=train_df,
val_df=val_df,
test_df=test_df,
use_logger=True,
logger_name=logger_name,
fast_dev_run=fast_dev_run,
active_label=active_label,
disabled_embeddings=disabled_embeddings,
**study.best_params,
)
# Report the best hyperparameters found
metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
    # Return the best metrics
    return model, trainer, metrics


def sklearn_model_objective(
trial: optuna.Trial,
protein2embedding: Dict,
cell2embedding: Dict,
smiles2fp: Dict,
train_df: pd.DataFrame,
val_df: pd.DataFrame,
model_type: Literal['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'] = 'RandomForest',
active_label: str = 'Active',
) -> float:
""" Objective function for hyperparameter optimization.
Args:
trial (optuna.Trial): The Optuna trial object.
train_df (pd.DataFrame): The training set.
val_df (pd.DataFrame): The validation set.
model_type (str): The model type.
hyperparameters (Dict): The hyperparameters for the model.
fast_dev_run (bool): Whether to run a fast development run.
active_label (str): The active label column.
"""
# Generate the hyperparameters
use_single_scaler = trial.suggest_categorical('use_single_scaler', [True, False])
if model_type == 'RandomForest':
clf = suggest_random_forest(trial)
elif model_type == 'SVC':
clf = suggest_svc(trial)
elif model_type == 'LogisticRegression':
clf = suggest_logistic_regression(trial)
elif model_type == 'GradientBoosting':
clf = suggest_gradient_boosting(trial)
else:
raise ValueError(f'Invalid model type: {model_type}. Available: RandomForest, SVC, LogisticRegression, GradientBoosting.')
# Train the model with the current set of hyperparameters
_, metrics = train_sklearn_model(
clf=clf,
protein2embedding=protein2embedding,
cell2embedding=cell2embedding,
smiles2fp=smiles2fp,
train_df=train_df,
val_df=val_df,
active_label=active_label,
use_single_scaler=use_single_scaler,
)
# Metrics is a dictionary containing at least the validation loss
val_acc = metrics['val_acc']
val_roc_auc = metrics['val_roc_auc']
    # Optuna minimizes the objective, so combine the metrics such that lower is better
    return -val_acc - val_roc_auc


def hyperparameter_tuning_and_training_sklearn(
protein2embedding: Dict,
cell2embedding: Dict,
smiles2fp: Dict,
train_df: pd.DataFrame,
val_df: pd.DataFrame,
test_df: Optional[pd.DataFrame] = None,
model_type: Literal['RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'] = 'RandomForest',
active_label: str = 'Active',
n_trials: int = 50,
logger_name: str = 'protac_hparam_search',
study_filename: Optional[str] = None,
) -> Tuple:
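    """ Hyperparameter tuning and training of a scikit-learn model.
    Args:
        protein2embedding (Dict): Dictionary mapping proteins to embeddings.
        cell2embedding (Dict): Dictionary mapping cell lines to embeddings.
        smiles2fp (Dict): Dictionary mapping SMILES strings to fingerprints.
        train_df (pd.DataFrame): The training set.
        val_df (pd.DataFrame): The validation set.
        test_df (Optional[pd.DataFrame]): The test set.
        model_type (str): The model type. One of: 'RandomForest', 'SVC', 'LogisticRegression', 'GradientBoosting'.
        active_label (str): The active label column.
        n_trials (int): The number of hyperparameter optimization trials.
        logger_name (str): The name of the logger.
        study_filename (Optional[str]): Path for loading/saving the Optuna study with joblib.
    Returns:
        Tuple: The trained model and the best metrics.
    """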
# Set the verbosity of Optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Create an Optuna study object
sampler = TPESampler(seed=42, multivariate=True)
study = optuna.create_study(direction='minimize', sampler=sampler)
study_loaded = False
if study_filename:
if os.path.exists(study_filename):
study = joblib.load(study_filename)
study_loaded = True
print(f'Loaded study from {study_filename}')
if not study_loaded:
study.optimize(
lambda trial: sklearn_model_objective(
trial=trial,
protein2embedding=protein2embedding,
cell2embedding=cell2embedding,
smiles2fp=smiles2fp,
train_df=train_df,
val_df=val_df,
model_type=model_type,
active_label=active_label,
),
n_trials=n_trials,
)
if study_filename:
joblib.dump(study, study_filename)
# Retrain the model with the best hyperparameters
best_hyperparameters = {k.replace('model_', ''): v for k, v in study.best_params.items() if k.startswith('model_')}
if model_type == 'RandomForest':
clf = RandomForestClassifier(random_state=42, **best_hyperparameters)
elif model_type == 'SVC':
clf = SVC(random_state=42, probability=True, **best_hyperparameters)
elif model_type == 'LogisticRegression':
clf = LogisticRegression(random_state=42, max_iter=1000, **best_hyperparameters)
elif model_type == 'GradientBoosting':
clf = GradientBoostingClassifier(random_state=42, **best_hyperparameters)
else:
raise ValueError(f'Invalid model type: {model_type}. Available: RandomForest, SVC, LogisticRegression, GradientBoosting.')
model, metrics = train_sklearn_model(
clf=clf,
protein2embedding=protein2embedding,
cell2embedding=cell2embedding,
smiles2fp=smiles2fp,
train_df=train_df,
val_df=val_df,
test_df=test_df,
active_label=active_label,
use_single_scaler=study.best_params['use_single_scaler'],
)
# Report the best hyperparameters found
metrics.update({f'hparam_{k}': v for k, v in study.best_params.items()})
# Return the best metrics
return model, metrics
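

# Minimal usage sketch (not part of the package API): the file paths and data
# loading below are hypothetical placeholders; only the call to
# `hyperparameter_tuning_and_training` reflects the function defined above.
if __name__ == '__main__':
    # Assumed pre-computed artifacts; replace with the package's actual data loaders.
    protein2embedding = joblib.load('protein2embedding.joblib')
    cell2embedding = joblib.load('cell2embedding.joblib')
    smiles2fp = joblib.load('smiles2fp.joblib')
    train_df = pd.read_csv('train.csv')
    val_df = pd.read_csv('val.csv')

    # Run a short hyperparameter search and retrain on the best configuration.
    model, trainer, metrics = hyperparameter_tuning_and_training(
        protein2embedding,
        cell2embedding,
        smiles2fp,
        train_df,
        val_df,
        n_trials=10,
        study_filename='pytorch_hparam_study.joblib',
    )
    print(metrics)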