Spaces:

samuelinferences
/

TabPFNEvaluationDemo

Build error

File size: 16,495 Bytes

e487255

from catboost import CatBoostClassifier, Pool

import math

from sklearn.impute import SimpleImputer

import xgboost as xgb
from sklearn import neighbors
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
import numpy as np

from scripts import tabular_metrics
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import time

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials , space_eval, rand
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

import autosklearn.classification

CV = 5
MULTITHREAD = 1 # Number of threads baselines are able to use at most
param_grid, param_grid_hyperopt = {}, {}

def get_scoring_direction(metric_used):
    # Not needed
    if metric_used == tabular_metrics.auc_metric:
        return -1
    elif metric_used == tabular_metrics.cross_entropy:
        return 1
    else:
        raise Exception('No scoring string found for metric')

def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"):
    if metric_used == tabular_metrics.auc_metric:
        if usage == 'sklearn_cv':
            return 'roc_auc_ovo'
        elif usage == 'autogluon':
            return 'log_loss' # Autogluon crashes when using 'roc_auc' with some datasets usning logloss gives better scores;
                              # We might be able to fix this, but doesn't work out of box.
                              # File bug report? Error happens with dataset robert and fabert
            if multiclass:
                return 'roc_auc_ovo_macro'
            else:
                return 'roc_auc'
        elif usage == 'autosklearn':
            if multiclass:
                return autosklearn.metrics.log_loss # roc_auc only works for binary, use logloss instead
            else:
                return autosklearn.metrics.roc_auc
        elif usage == 'catboost':
            return 'MultiClass' # Effectively LogLoss, ROC not available
        elif usage == 'xgb':
            return 'logloss'
        return 'roc_auc'
    elif metric_used == tabular_metrics.cross_entropy:
        if usage == 'sklearn_cv':
            return 'neg_log_loss'
        elif usage == 'autogluon':
            return 'log_loss'
        elif usage == 'autosklearn':
            return autosklearn.metrics.log_loss
        elif usage == 'catboost':
            return 'MultiClass' # Effectively LogLoss
        return 'logloss'
    else:
        raise Exception('No scoring string found for metric')

def eval_f(params, clf_, x, y, metric_used, start_time, max_time):
    if time.time() - start_time > max_time:
        return np.nan
    scores = cross_val_score(clf_(**params), x, y, cv=CV, scoring=get_scoring_string(metric_used))

    return -np.nanmean(scores)

def preprocess_impute(x, y, test_x, test_y, impute, one_hot, standardize, cat_features=[]):
    import warnings
    def warn(*args, **kwargs):
        pass

    warnings.warn = warn

    x, y, test_x, test_y = x.cpu().numpy(), y.cpu().long().numpy(), test_x.cpu().numpy(), test_y.cpu().long().numpy()

    if impute:
        imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imp_mean.fit(x)
        x, test_x = imp_mean.transform(x), imp_mean.transform(test_x)

    if one_hot:
        def make_pd_from_np(x):
            data = pd.DataFrame(x)
            for c in cat_features:
                data.iloc[:, c] = data.iloc[:, c].astype('int')
            return data
        x, test_x = make_pd_from_np(x),  make_pd_from_np(test_x)
        transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_features)], remainder="passthrough")
        transformer.fit(x)
        x, test_x = transformer.transform(x), transformer.transform(test_x)

    if standardize:
        scaler = MinMaxScaler()
        scaler.fit(x)
        x, test_x = scaler.transform(x), scaler.transform(test_x)

    return x, y, test_x, test_y

## Auto Gluon
def autogluon_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    from autogluon.tabular import TabularPredictor # Inside function so package can be sued without installation
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
                                             , one_hot=False
                                             , cat_features=cat_features
                                             , impute=False
                                             , standardize=False)
    train_data = pd.DataFrame(np.concatenate([x, y[:, np.newaxis]], 1))
    test_data = pd.DataFrame(np.concatenate([test_x, test_y[:, np.newaxis]], 1))

    # AutoGluon automatically infers datatypes, we don't specify the categorical labels
    predictor = TabularPredictor(
        label=train_data.columns[-1],
        eval_metric=get_scoring_string(metric_used, usage='autogluon', multiclass=(len(np.unique(y)) > 2)),
        problem_type='multiclass' if len(np.unique(y)) > 2 else 'binary'
        ## seed=int(y[:].sum()) doesn't accept seed
    ).fit(
        train_data=train_data,
        time_limit=max_time,
        presets=['best_quality']
        # The seed is deterministic but varies for each dataset and each split of it
    )

    pred = predictor.predict_proba(test_data, as_multiclass=True).values

    metric = metric_used(test_y, pred)

    return metric, pred, predictor.fit_summary()

## AUTO Sklearn
def autosklearn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    return autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=max_time, version=1)

from autosklearn.experimental.askl2 import AutoSklearn2Classifier
from autosklearn.classification import AutoSklearnClassifier
def autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300, version=2):
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
                                             , one_hot=False
                                             , cat_features=cat_features
                                             , impute=False
                                             , standardize=False)

    def make_pd_from_np(x):
        data = pd.DataFrame(x)
        for c in cat_features:
            data.iloc[:, c] = data.iloc[:, c].astype('category')
        return data

    x = make_pd_from_np(x)
    test_x = make_pd_from_np(test_x)

    clf_ = AutoSklearn2Classifier if version == 2 else AutoSklearnClassifier
    clf = clf_(time_left_for_this_task=max_time,
                                                           memory_limit=4000,
                                                           n_jobs=MULTITHREAD,
               seed=int(y[:].sum()),
        # The seed is deterministic but varies for each dataset and each split of it
               metric=get_scoring_string(metric_used, usage='autosklearn', multiclass=len(np.unique(y)) > 2))

    # fit model to data
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, None

param_grid_hyperopt['logistic'] = {
    'penalty': hp.choice('penalty', ['l1', 'l2', 'none'])
    , 'max_iter': hp.randint('max_iter', [50, 500])
    , 'fit_intercept': hp.choice('fit_intercept', [True, False])
    , 'C': hp.loguniform('C', -5, math.log(5.0))}  # 'normalize': [False],

def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
                                             , one_hot=True, impute=True, standardize=True
                                             , cat_features=cat_features)

    def clf_(**params):
        return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params)

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['logistic'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=10000)
    best = space_eval(param_grid_hyperopt['logistic'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best

## KNN
param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1,16)
                              }
def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=True, impute=True, standardize=True,
                                             cat_features=cat_features)

    def clf_(**params):
        return neighbors.KNeighborsClassifier(n_jobs=1, **params)

    start_time = time.time()

    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['knn'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=10000)
    best = space_eval(param_grid_hyperopt['knn'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best

## GP
param_grid_hyperopt['gp'] = {
    'params_y_scale': hp.loguniform('params_y_scale', math.log(0.05), math.log(5.0)),
    'params_length_scale': hp.loguniform('params_length_scale', math.log(0.1), math.log(1.0)),
    'n_jobs': hp.choice('njobs', [1])
}
def gp_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y,
                                             one_hot=True, impute=True, standardize=True,
                                             cat_features=cat_features)

    def clf_(params_y_scale,params_length_scale, **params):
        return GaussianProcessClassifier(kernel= params_y_scale * RBF(params_length_scale), **params)

    start_time = time.time()
    def stop(trial):
        return time.time() - start_time > max_time, []


    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['gp'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=1000)
    best = space_eval(param_grid_hyperopt['gp'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


# Catboost
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf

param_grid_hyperopt['catboost'] = {
    'learning_rate': hp.loguniform('learning_rate', math.log(math.pow(math.e, -5)), math.log(1)),
    'random_strength': hp.randint('random_strength', 1, 20),
    'l2_leaf_reg': hp.loguniform('l2_leaf_reg', math.log(1), math.log(10)),
    'bagging_temperature': hp.uniform('bagging_temperature', 0., 1),
    'leaf_estimation_iterations': hp.randint('leaf_estimation_iterations', 1, 20),
    'iterations': hp.randint('iterations', 100, 4000), # This is smaller than in paper, 4000 leads to ram overusage
}

def catboost_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    print(x)

    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
                                             , one_hot=False
                                             , cat_features=cat_features
                                             , impute=False
                                             , standardize=False)

    # Nans in categorical features must be encoded as separate class
    x[:, cat_features], test_x[:, cat_features] = np.nan_to_num(x[:, cat_features], -1), np.nan_to_num(
        test_x[:, cat_features], -1)

    def make_pd_from_np(x):
        data = pd.DataFrame(x)
        for c in cat_features:
            data.iloc[:, c] = data.iloc[:, c].astype('int')
        return data

    x = make_pd_from_np(x)
    test_x = make_pd_from_np(test_x)

    def clf_(**params):
        return CatBoostClassifier(
                               loss_function=get_scoring_string(metric_used, usage='catboost'),
                               thread_count = MULTITHREAD,
                               used_ram_limit='4gb',
            random_seed=int(y[:].sum()),
                               logging_level='Silent',
                                cat_features=cat_features,
                                  **params)

    start_time = time.time()
    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['catboost'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=1000)
    best = space_eval(param_grid_hyperopt['catboost'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


# XGBoost
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf
param_grid_hyperopt['xgb'] = {
    'learning_rate': hp.loguniform('learning_rate', -7, math.log(1)),
    'max_depth': hp.randint('max_depth', 1, 10),
    'subsample': hp.uniform('subsample', 0.2, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1),
    'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1),
    'min_child_weight': hp.loguniform('min_child_weight', -16, 5),
    'alpha': hp.loguniform('alpha', -16, 2),
    'lambda': hp.loguniform('lambda', -16, 2),
    'gamma': hp.loguniform('gamma', -16, 2),
    'n_estimators': hp.randint('n_estimators', 100, 4000), # This is smaller than in paper
}

def xgb_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300):
    # XGB Documentation:
    # XGB handles categorical data appropriately without using One Hot Encoding, categorical features are experimetal
    # XGB handles missing values appropriately without imputation

    x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y
                                             , one_hot=False
                                             , cat_features=cat_features
                                             , impute=False
                                             , standardize=False)

    def clf_(**params):
        return xgb.XGBClassifier(use_label_encoder=False
                                 , nthread=1
                                 , **params
                                 , eval_metric=get_scoring_string(metric_used, usage='xgb') # AUC not implemented
        )

    start_time = time.time()
    def stop(trial):
        return time.time() - start_time > max_time, []

    best = fmin(
        fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time),
        space=param_grid_hyperopt['xgb'],
        algo=rand.suggest,
        rstate=np.random.RandomState(int(y[:].sum())),
        early_stop_fn=stop,
        # The seed is deterministic but varies for each dataset and each split of it
        max_evals=1000)
    best = space_eval(param_grid_hyperopt['xgb'], best)

    clf = clf_(**best)
    clf.fit(x, y)

    pred = clf.predict_proba(test_x)
    metric = metric_used(test_y, pred)

    return metric, pred, best


clf_dict = {'gp': gp_metric
                , 'knn': knn_metric
                , 'catboost': catboost_metric
                , 'xgb': xgb_metric
                , 'logistic': logistic_metric
           , 'autosklearn': autosklearn_metric
             , 'autosklearn2': autosklearn2_metric
            , 'autogluon': autogluon_metric}