# Baseline tabular classifiers (logistic, KNN, GP, CatBoost, XGBoost,
# auto-sklearn, AutoGluon) with hyperopt-based random search wrappers.
import math
import time

import numpy as np
import pandas as pd
import xgboost as xgb
import autosklearn.classification
from catboost import CatBoostClassifier, Pool
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval, rand
from sklearn import neighbors
from sklearn.compose import ColumnTransformer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from scripts import tabular_metrics
CV = 5 | |
MULTITHREAD = 1 # Number of threads baselines are able to use at most | |
param_grid, param_grid_hyperopt = {}, {} | |
def get_scoring_direction(metric_used): | |
# Not needed | |
if metric_used == tabular_metrics.auc_metric: | |
return -1 | |
elif metric_used == tabular_metrics.cross_entropy: | |
return 1 | |
else: | |
raise Exception('No scoring string found for metric') | |
def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"): | |
if metric_used == tabular_metrics.auc_metric: | |
if usage == 'sklearn_cv': | |
return 'roc_auc_ovo' | |
elif usage == 'autogluon': | |
return 'log_loss' # Autogluon crashes when using 'roc_auc' with some datasets usning logloss gives better scores; | |
# We might be able to fix this, but doesn't work out of box. | |
# File bug report? Error happens with dataset robert and fabert | |
if multiclass: | |
return 'roc_auc_ovo_macro' | |
else: | |
return 'roc_auc' | |
elif usage == 'autosklearn': | |
if multiclass: | |
return autosklearn.metrics.log_loss # roc_auc only works for binary, use logloss instead | |
else: | |
return autosklearn.metrics.roc_auc | |
elif usage == 'catboost': | |
return 'MultiClass' # Effectively LogLoss, ROC not available | |
elif usage == 'xgb': | |
return 'logloss' | |
return 'roc_auc' | |
elif metric_used == tabular_metrics.cross_entropy: | |
if usage == 'sklearn_cv': | |
return 'neg_log_loss' | |
elif usage == 'autogluon': | |
return 'log_loss' | |
elif usage == 'autosklearn': | |
return autosklearn.metrics.log_loss | |
elif usage == 'catboost': | |
return 'MultiClass' # Effectively LogLoss | |
return 'logloss' | |
else: | |
raise Exception('No scoring string found for metric') | |
def eval_f(params, clf_, x, y, metric_used, start_time, max_time): | |
if time.time() - start_time > max_time: | |
return np.nan | |
scores = cross_val_score(clf_(**params), x, y, cv=CV, scoring=get_scoring_string(metric_used)) | |
return -np.nanmean(scores) | |
def preprocess_impute(x, y, test_x, test_y, impute, one_hot, standardize, cat_features=[]): | |
import warnings | |
def warn(*args, **kwargs): | |
pass | |
warnings.warn = warn | |
x, y, test_x, test_y = x.cpu().numpy(), y.cpu().long().numpy(), test_x.cpu().numpy(), test_y.cpu().long().numpy() | |
if impute: | |
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') | |
imp_mean.fit(x) | |
x, test_x = imp_mean.transform(x), imp_mean.transform(test_x) | |
if one_hot: | |
def make_pd_from_np(x): | |
data = pd.DataFrame(x) | |
for c in cat_features: | |
data.iloc[:, c] = data.iloc[:, c].astype('int') | |
return data | |
x, test_x = make_pd_from_np(x), make_pd_from_np(test_x) | |
transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_features)], remainder="passthrough") | |
transformer.fit(x) | |
x, test_x = transformer.transform(x), transformer.transform(test_x) | |
if standardize: | |
scaler = MinMaxScaler() | |
scaler.fit(x) | |
x, test_x = scaler.transform(x), scaler.transform(test_x) | |
return x, y, test_x, test_y | |
## Auto Gluon | |
def autogluon_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
from autogluon.tabular import TabularPredictor # Inside function so package can be sued without installation | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
train_data = pd.DataFrame(np.concatenate([x, y[:, np.newaxis]], 1)) | |
test_data = pd.DataFrame(np.concatenate([test_x, test_y[:, np.newaxis]], 1)) | |
# AutoGluon automatically infers datatypes, we don't specify the categorical labels | |
predictor = TabularPredictor( | |
label=train_data.columns[-1], | |
eval_metric=get_scoring_string(metric_used, usage='autogluon', multiclass=(len(np.unique(y)) > 2)), | |
problem_type='multiclass' if len(np.unique(y)) > 2 else 'binary' | |
## seed=int(y[:].sum()) doesn't accept seed | |
).fit( | |
train_data=train_data, | |
time_limit=max_time, | |
presets=['best_quality'] | |
# The seed is deterministic but varies for each dataset and each split of it | |
) | |
pred = predictor.predict_proba(test_data, as_multiclass=True).values | |
metric = metric_used(test_y, pred) | |
return metric, pred, predictor.fit_summary() | |
## AUTO Sklearn | |
def autosklearn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
return autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=max_time, version=1) | |
from autosklearn.experimental.askl2 import AutoSklearn2Classifier | |
from autosklearn.classification import AutoSklearnClassifier | |
def autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300, version=2): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
def make_pd_from_np(x): | |
data = pd.DataFrame(x) | |
for c in cat_features: | |
data.iloc[:, c] = data.iloc[:, c].astype('category') | |
return data | |
x = make_pd_from_np(x) | |
test_x = make_pd_from_np(test_x) | |
clf_ = AutoSklearn2Classifier if version == 2 else AutoSklearnClassifier | |
clf = clf_(time_left_for_this_task=max_time, | |
memory_limit=4000, | |
n_jobs=MULTITHREAD, | |
seed=int(y[:].sum()), | |
# The seed is deterministic but varies for each dataset and each split of it | |
metric=get_scoring_string(metric_used, usage='autosklearn', multiclass=len(np.unique(y)) > 2)) | |
# fit model to data | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, None | |
param_grid_hyperopt['logistic'] = { | |
'penalty': hp.choice('penalty', ['l1', 'l2', 'none']) | |
, 'max_iter': hp.randint('max_iter', [50, 500]) | |
, 'fit_intercept': hp.choice('fit_intercept', [True, False]) | |
, 'C': hp.loguniform('C', -5, math.log(5.0))} # 'normalize': [False], | |
def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=True, impute=True, standardize=True | |
, cat_features=cat_features) | |
def clf_(**params): | |
return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['logistic'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=10000) | |
best = space_eval(param_grid_hyperopt['logistic'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
## KNN | |
param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1,16) | |
} | |
def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y, | |
one_hot=True, impute=True, standardize=True, | |
cat_features=cat_features) | |
def clf_(**params): | |
return neighbors.KNeighborsClassifier(n_jobs=1, **params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['knn'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=10000) | |
best = space_eval(param_grid_hyperopt['knn'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
## GP | |
param_grid_hyperopt['gp'] = { | |
'params_y_scale': hp.loguniform('params_y_scale', math.log(0.05), math.log(5.0)), | |
'params_length_scale': hp.loguniform('params_length_scale', math.log(0.1), math.log(1.0)), | |
'n_jobs': hp.choice('njobs', [1]) | |
} | |
def gp_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y, | |
one_hot=True, impute=True, standardize=True, | |
cat_features=cat_features) | |
def clf_(params_y_scale,params_length_scale, **params): | |
return GaussianProcessClassifier(kernel= params_y_scale * RBF(params_length_scale), **params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['gp'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=1000) | |
best = space_eval(param_grid_hyperopt['gp'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
# Catboost | |
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf | |
param_grid_hyperopt['catboost'] = { | |
'learning_rate': hp.loguniform('learning_rate', math.log(math.pow(math.e, -5)), math.log(1)), | |
'random_strength': hp.randint('random_strength', 1, 20), | |
'l2_leaf_reg': hp.loguniform('l2_leaf_reg', math.log(1), math.log(10)), | |
'bagging_temperature': hp.uniform('bagging_temperature', 0., 1), | |
'leaf_estimation_iterations': hp.randint('leaf_estimation_iterations', 1, 20), | |
'iterations': hp.randint('iterations', 100, 4000), # This is smaller than in paper, 4000 leads to ram overusage | |
} | |
def catboost_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
print(x) | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
# Nans in categorical features must be encoded as separate class | |
x[:, cat_features], test_x[:, cat_features] = np.nan_to_num(x[:, cat_features], -1), np.nan_to_num( | |
test_x[:, cat_features], -1) | |
def make_pd_from_np(x): | |
data = pd.DataFrame(x) | |
for c in cat_features: | |
data.iloc[:, c] = data.iloc[:, c].astype('int') | |
return data | |
x = make_pd_from_np(x) | |
test_x = make_pd_from_np(test_x) | |
def clf_(**params): | |
return CatBoostClassifier( | |
loss_function=get_scoring_string(metric_used, usage='catboost'), | |
thread_count = MULTITHREAD, | |
used_ram_limit='4gb', | |
random_seed=int(y[:].sum()), | |
logging_level='Silent', | |
cat_features=cat_features, | |
**params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['catboost'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=1000) | |
best = space_eval(param_grid_hyperopt['catboost'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
# XGBoost | |
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf | |
param_grid_hyperopt['xgb'] = { | |
'learning_rate': hp.loguniform('learning_rate', -7, math.log(1)), | |
'max_depth': hp.randint('max_depth', 1, 10), | |
'subsample': hp.uniform('subsample', 0.2, 1), | |
'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1), | |
'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1), | |
'min_child_weight': hp.loguniform('min_child_weight', -16, 5), | |
'alpha': hp.loguniform('alpha', -16, 2), | |
'lambda': hp.loguniform('lambda', -16, 2), | |
'gamma': hp.loguniform('gamma', -16, 2), | |
'n_estimators': hp.randint('n_estimators', 100, 4000), # This is smaller than in paper | |
} | |
def xgb_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
# XGB Documentation: | |
# XGB handles categorical data appropriately without using One Hot Encoding, categorical features are experimetal | |
# XGB handles missing values appropriately without imputation | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
def clf_(**params): | |
return xgb.XGBClassifier(use_label_encoder=False | |
, nthread=1 | |
, **params | |
, eval_metric=get_scoring_string(metric_used, usage='xgb') # AUC not implemented | |
) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['xgb'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=1000) | |
best = space_eval(param_grid_hyperopt['xgb'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
clf_dict = {'gp': gp_metric | |
, 'knn': knn_metric | |
, 'catboost': catboost_metric | |
, 'xgb': xgb_metric | |
, 'logistic': logistic_metric | |
, 'autosklearn': autosklearn_metric | |
, 'autosklearn2': autosklearn2_metric | |
, 'autogluon': autogluon_metric} |