from typing import Tuple, Optional, Dict

from .protac_dataset import PROTAC_Dataset

import pandas as pd
from sklearn.base import ClassifierMixin
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import torch
import torch.nn as nn
from torchmetrics import (
    Accuracy,
    AUROC,
    Precision,
    Recall,
    F1Score,
    MetricCollection,
)
import optuna


def train_sklearn_model(
    clf: ClassifierMixin,
    protein2embedding: Dict,
    cell2embedding: Dict,
    smiles2fp: Dict,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    test_df: Optional[pd.DataFrame] = None,
    active_label: str = 'Active',
    use_single_scaler: bool = True,
) -> Tuple[ClassifierMixin, Dict]:
    """ Train a classifier model on train and val sets and evaluate it on a test set.

    Args:
        clf: The classifier model to train and evaluate.
        train_df (pd.DataFrame): The training set.
        val_df (pd.DataFrame): The validation set.
        test_df (Optional[pd.DataFrame]): The test set.

    Returns:
        Tuple[ClassifierMixin, nn.ModuleDict]: The trained model and the metrics.
    """
    # Initialize the datasets
    train_ds = PROTAC_Dataset(
        train_df,
        protein2embedding,
        cell2embedding,
        smiles2fp,
        active_label=active_label,
        use_smote=False,
    )
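    # Fit scaling on the training set only and reuse the same scaler(s)
    # for the validation and test sets to avoid information leakage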
    scaler = train_ds.fit_scaling(use_single_scaler=use_single_scaler)
    train_ds.apply_scaling(scaler, use_single_scaler=use_single_scaler)
    val_ds = PROTAC_Dataset(
        val_df,
        protein2embedding,
        cell2embedding,
        smiles2fp,
        active_label=active_label,
        use_smote=False,
    )
    val_ds.apply_scaling(scaler, use_single_scaler=use_single_scaler)
    if test_df is not None:
        test_ds = PROTAC_Dataset(
            test_df,
            protein2embedding,
            cell2embedding,
            smiles2fp,
            active_label=active_label,
            use_smote=False,
        )
        test_ds.apply_scaling(scaler, use_single_scaler=use_single_scaler)

    # Get the numpy arrays
    X_train, y_train = train_ds.get_numpy_arrays()
    X_val, y_val = val_ds.get_numpy_arrays()
    if test_df is not None:
        X_test, y_test = test_ds.get_numpy_arrays()

    # Train the model
    clf.fit(X_train, y_train)
    # Define the metrics as a module dict
    stages = ['train_metrics', 'val_metrics', 'test_metrics']
    metrics = nn.ModuleDict({s: MetricCollection({
        'acc': Accuracy(task='binary'),
        'roc_auc': AUROC(task='binary'),
        'precision': Precision(task='binary'),
        'recall': Recall(task='binary'),
        'f1_score': F1Score(task='binary'),
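        # composite selection score via torchmetrics metric arithmetic
        # (Metric + Metric yields a CompositionalMetric)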
        'opt_score': Accuracy(task='binary') + F1Score(task='binary'),
        'hp_metric': Accuracy(task='binary'),
    }, prefix=s.replace('metrics', '')) for s in stages})

    # Get the predictions
    metrics_out = {}

    y_pred = torch.tensor(clf.predict_proba(X_train)[:, 1])
    y_true = torch.tensor(y_train)
    metrics['train_metrics'].update(y_pred, y_true)
    metrics_out.update(metrics['train_metrics'].compute())

    y_pred = torch.tensor(clf.predict_proba(X_val)[:, 1])
    y_true = torch.tensor(y_val)
    metrics['val_metrics'].update(y_pred, y_true)
    metrics_out.update(metrics['val_metrics'].compute())

    if test_df is not None:
        y_pred = torch.tensor(clf.predict_proba(X_test)[:, 1])
        y_true = torch.tensor(y_test)
        metrics['test_metrics'].update(y_pred, y_true)
        metrics_out.update(metrics['test_metrics'].compute())

    return clf, metrics_out


def suggest_random_forest(
        trial: optuna.Trial,
) -> ClassifierMixin:
    """ Suggest hyperparameters for a Random Forest classifier.

    Args:
        trial (optuna.Trial): The Optuna trial object.

    Returns:
        ClassifierMixin: The Random Forest classifier with the suggested hyperparameters.
    """
    n_estimators = trial.suggest_int('model_n_estimators', 10, 1000)
    max_depth = trial.suggest_int('model_max_depth', 2, 100)
    min_samples_split = trial.suggest_int('model_min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('model_min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('model_max_features', [None, 'sqrt', 'log2'])
    criterion = trial.suggest_categorical('model_criterion', ['gini', 'entropy'])

    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        criterion=criterion,
        random_state=42,
    )

    return clf


def suggest_logistic_regression(
        trial: optuna.Trial,
) -> ClassifierMixin:
    """ Suggest hyperparameters for a Logistic Regression classifier.

    Args:
        trial (optuna.Trial): The Optuna trial object.

    Returns:
        ClassifierMixin: The Logistic Regression classifier with the suggested hyperparameters.
    """
    # Suggest values for the logistic regression hyperparameters
    C = trial.suggest_float('model_C', 1e-4, 1e2, log=True)
    penalty = trial.suggest_categorical('model_penalty', ['l1', 'l2', 'elasticnet', None])
    solver = trial.suggest_categorical('model_solver', ['newton-cholesky', 'lbfgs', 'liblinear', 'sag', 'saga'])

    # Prune trials with incompatible penalty/solver combinations
    if penalty == 'l1' and solver not in ['liblinear', 'saga']:
        raise optuna.exceptions.TrialPruned()
    if penalty == 'elasticnet' and solver != 'saga':
        raise optuna.exceptions.TrialPruned()
    if penalty is None and solver not in ['newton-cholesky', 'lbfgs', 'sag']:
        raise optuna.exceptions.TrialPruned()

    # Configure the classifier with the trial's suggested parameters;
    # elasticnet additionally requires an explicit l1/l2 mixing ratio
    clf = LogisticRegression(
        C=C,
        penalty=penalty,
        solver=solver,
        l1_ratio=trial.suggest_float('model_l1_ratio', 0.0, 1.0) if penalty == 'elasticnet' else None,
        max_iter=1000,
        random_state=42,
    )

    return clf


def suggest_svc(
        trial: optuna.Trial,
) -> ClassifierMixin:
    """ Suggest hyperparameters for an SVC classifier.

    Args:
        trial (optuna.Trial): The Optuna trial object.

    Returns:
        ClassifierMixin: The SVC classifier with the suggested hyperparameters.
    """
    C = trial.suggest_float('model_C', 1e-4, 1e2, log=True)
    kernel = trial.suggest_categorical('model_kernel', ['linear', 'poly', 'rbf', 'sigmoid'])
    gamma = trial.suggest_categorical('model_gamma', ['scale', 'auto'])
    degree = trial.suggest_int('model_degree', 2, 5) if kernel == 'poly' else 3
    
    clf = SVC(
        C=C,
        kernel=kernel,
        gamma=gamma,
        degree=degree,
        probability=True,
        random_state=42,
    )

    return clf


def suggest_gradient_boosting(
        trial: optuna.Trial,
) -> ClassifierMixin:
    """ Suggest hyperparameters for a Gradient Boosting classifier.

    Args:
        trial (optuna.Trial): The Optuna trial object.

    Returns:
        ClassifierMixin: The Gradient Boosting classifier with the suggested hyperparameters.
    """
    n_estimators = trial.suggest_int('model_n_estimators', 50, 500)
    learning_rate = trial.suggest_float('model_learning_rate', 0.01, 1, log=True)
    max_depth = trial.suggest_int('model_max_depth', 3, 10)
    min_samples_split = trial.suggest_int('model_min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('model_min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('model_max_features', ['sqrt', 'log2', None])
    
    clf = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42,
    )

    return clf
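

# Example usage (illustrative sketch): wiring one of the suggest_* search
# spaces into an Optuna study. The embedding dictionaries and DataFrames
# below are assumed to be prepared by the caller; 'val_acc' follows the
# metric naming produced by train_sklearn_model ('val_' prefix + metric key).
#
# def objective(trial: optuna.Trial) -> float:
#     clf = suggest_random_forest(trial)
#     _, metrics = train_sklearn_model(
#         clf,
#         protein2embedding,
#         cell2embedding,
#         smiles2fp,
#         train_df,
#         val_df,
#     )
#     return metrics['val_acc'].item()
#
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=50)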