# Baseline tabular classifiers (logistic, KNN, GP, CatBoost, XGBoost,
# auto-sklearn, AutoGluon) with hyperopt-based random search wrappers.
import math
import time

import numpy as np
import pandas as pd
import xgboost as xgb
import autosklearn.classification
from catboost import CatBoostClassifier, Pool
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, space_eval, rand
from sklearn import neighbors
from sklearn.compose import ColumnTransformer
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from scripts import tabular_metrics
CV = 5 | |
MULTITHREAD = 1 # Number of threads baselines are able to use at most | |
param_grid, param_grid_hyperopt = {}, {} | |
def get_scoring_direction(metric_used): | |
# Not needed | |
if metric_used == tabular_metrics.auc_metric: | |
return -1 | |
elif metric_used == tabular_metrics.cross_entropy: | |
return 1 | |
else: | |
raise Exception('No scoring string found for metric') | |
def get_scoring_string(metric_used, multiclass=True, usage="sklearn_cv"): | |
if metric_used == tabular_metrics.auc_metric: | |
if usage == 'sklearn_cv': | |
return 'roc_auc_ovo' | |
elif usage == 'autogluon': | |
return 'log_loss' # Autogluon crashes when using 'roc_auc' with some datasets usning logloss gives better scores; | |
# We might be able to fix this, but doesn't work out of box. | |
# File bug report? Error happens with dataset robert and fabert | |
if multiclass: | |
return 'roc_auc_ovo_macro' | |
else: | |
return 'roc_auc' | |
elif usage == 'autosklearn': | |
if multiclass: | |
return autosklearn.metrics.log_loss # roc_auc only works for binary, use logloss instead | |
else: | |
return autosklearn.metrics.roc_auc | |
elif usage == 'catboost': | |
return 'MultiClass' # Effectively LogLoss, ROC not available | |
elif usage == 'xgb': | |
return 'logloss' | |
return 'roc_auc' | |
elif metric_used == tabular_metrics.cross_entropy: | |
if usage == 'sklearn_cv': | |
return 'neg_log_loss' | |
elif usage == 'autogluon': | |
return 'log_loss' | |
elif usage == 'autosklearn': | |
return autosklearn.metrics.log_loss | |
elif usage == 'catboost': | |
return 'MultiClass' # Effectively LogLoss | |
return 'logloss' | |
else: | |
raise Exception('No scoring string found for metric') | |
def eval_f(params, clf_, x, y, metric_used, start_time, max_time): | |
if time.time() - start_time > max_time: | |
return np.nan | |
scores = cross_val_score(clf_(**params), x, y, cv=CV, scoring=get_scoring_string(metric_used)) | |
return -np.nanmean(scores) | |
def preprocess_impute(x, y, test_x, test_y, impute, one_hot, standardize, cat_features=[]): | |
import warnings | |
def warn(*args, **kwargs): | |
pass | |
warnings.warn = warn | |
x, y, test_x, test_y = x.cpu().numpy(), y.cpu().long().numpy(), test_x.cpu().numpy(), test_y.cpu().long().numpy() | |
if impute: | |
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean') | |
imp_mean.fit(x) | |
x, test_x = imp_mean.transform(x), imp_mean.transform(test_x) | |
if one_hot: | |
def make_pd_from_np(x): | |
data = pd.DataFrame(x) | |
for c in cat_features: | |
data.iloc[:, c] = data.iloc[:, c].astype('int') | |
return data | |
x, test_x = make_pd_from_np(x), make_pd_from_np(test_x) | |
transformer = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), cat_features)], remainder="passthrough") | |
transformer.fit(x) | |
x, test_x = transformer.transform(x), transformer.transform(test_x) | |
if standardize: | |
scaler = MinMaxScaler() | |
scaler.fit(x) | |
x, test_x = scaler.transform(x), scaler.transform(test_x) | |
return x, y, test_x, test_y | |
## Auto Gluon | |
def autogluon_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
from autogluon.tabular import TabularPredictor # Inside function so package can be sued without installation | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
train_data = pd.DataFrame(np.concatenate([x, y[:, np.newaxis]], 1)) | |
test_data = pd.DataFrame(np.concatenate([test_x, test_y[:, np.newaxis]], 1)) | |
# AutoGluon automatically infers datatypes, we don't specify the categorical labels | |
predictor = TabularPredictor( | |
label=train_data.columns[-1], | |
eval_metric=get_scoring_string(metric_used, usage='autogluon', multiclass=(len(np.unique(y)) > 2)), | |
problem_type='multiclass' if len(np.unique(y)) > 2 else 'binary' | |
## seed=int(y[:].sum()) doesn't accept seed | |
).fit( | |
train_data=train_data, | |
time_limit=max_time, | |
presets=['best_quality'] | |
# The seed is deterministic but varies for each dataset and each split of it | |
) | |
pred = predictor.predict_proba(test_data, as_multiclass=True).values | |
metric = metric_used(test_y, pred) | |
return metric, pred, predictor.fit_summary() | |
## AUTO Sklearn | |
def autosklearn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
return autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=max_time, version=1) | |
from autosklearn.experimental.askl2 import AutoSklearn2Classifier | |
from autosklearn.classification import AutoSklearnClassifier | |
def autosklearn2_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300, version=2): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
def make_pd_from_np(x): | |
data = pd.DataFrame(x) | |
for c in cat_features: | |
data.iloc[:, c] = data.iloc[:, c].astype('category') | |
return data | |
x = make_pd_from_np(x) | |
test_x = make_pd_from_np(test_x) | |
clf_ = AutoSklearn2Classifier if version == 2 else AutoSklearnClassifier | |
clf = clf_(time_left_for_this_task=max_time, | |
memory_limit=4000, | |
n_jobs=MULTITHREAD, | |
seed=int(y[:].sum()), | |
# The seed is deterministic but varies for each dataset and each split of it | |
metric=get_scoring_string(metric_used, usage='autosklearn', multiclass=len(np.unique(y)) > 2)) | |
# fit model to data | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, None | |
param_grid_hyperopt['logistic'] = { | |
'penalty': hp.choice('penalty', ['l1', 'l2', 'none']) | |
, 'max_iter': hp.randint('max_iter', [50, 500]) | |
, 'fit_intercept': hp.choice('fit_intercept', [True, False]) | |
, 'C': hp.loguniform('C', -5, math.log(5.0))} # 'normalize': [False], | |
def logistic_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=True, impute=True, standardize=True | |
, cat_features=cat_features) | |
def clf_(**params): | |
return LogisticRegression(solver='saga', tol=1e-4, n_jobs=1, **params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['logistic'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=10000) | |
best = space_eval(param_grid_hyperopt['logistic'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
## KNN | |
param_grid_hyperopt['knn'] = {'n_neighbors': hp.randint('n_neighbors', 1,16) | |
} | |
def knn_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y, | |
one_hot=True, impute=True, standardize=True, | |
cat_features=cat_features) | |
def clf_(**params): | |
return neighbors.KNeighborsClassifier(n_jobs=1, **params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['knn'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=10000) | |
best = space_eval(param_grid_hyperopt['knn'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
## GP | |
param_grid_hyperopt['gp'] = { | |
'params_y_scale': hp.loguniform('params_y_scale', math.log(0.05), math.log(5.0)), | |
'params_length_scale': hp.loguniform('params_length_scale', math.log(0.1), math.log(1.0)), | |
'n_jobs': hp.choice('njobs', [1]) | |
} | |
def gp_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y, | |
one_hot=True, impute=True, standardize=True, | |
cat_features=cat_features) | |
def clf_(params_y_scale,params_length_scale, **params): | |
return GaussianProcessClassifier(kernel= params_y_scale * RBF(params_length_scale), **params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['gp'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=1000) | |
best = space_eval(param_grid_hyperopt['gp'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
# Catboost | |
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf | |
param_grid_hyperopt['catboost'] = { | |
'learning_rate': hp.loguniform('learning_rate', math.log(math.pow(math.e, -5)), math.log(1)), | |
'random_strength': hp.randint('random_strength', 1, 20), | |
'l2_leaf_reg': hp.loguniform('l2_leaf_reg', math.log(1), math.log(10)), | |
'bagging_temperature': hp.uniform('bagging_temperature', 0., 1), | |
'leaf_estimation_iterations': hp.randint('leaf_estimation_iterations', 1, 20), | |
'iterations': hp.randint('iterations', 100, 4000), # This is smaller than in paper, 4000 leads to ram overusage | |
} | |
def catboost_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
print(x) | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
# Nans in categorical features must be encoded as separate class | |
x[:, cat_features], test_x[:, cat_features] = np.nan_to_num(x[:, cat_features], -1), np.nan_to_num( | |
test_x[:, cat_features], -1) | |
def make_pd_from_np(x): | |
data = pd.DataFrame(x) | |
for c in cat_features: | |
data.iloc[:, c] = data.iloc[:, c].astype('int') | |
return data | |
x = make_pd_from_np(x) | |
test_x = make_pd_from_np(test_x) | |
def clf_(**params): | |
return CatBoostClassifier( | |
loss_function=get_scoring_string(metric_used, usage='catboost'), | |
thread_count = MULTITHREAD, | |
used_ram_limit='4gb', | |
random_seed=int(y[:].sum()), | |
logging_level='Silent', | |
cat_features=cat_features, | |
**params) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['catboost'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=1000) | |
best = space_eval(param_grid_hyperopt['catboost'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
# XGBoost | |
# Hyperparameter space: https://arxiv.org/pdf/2106.03253.pdf | |
param_grid_hyperopt['xgb'] = { | |
'learning_rate': hp.loguniform('learning_rate', -7, math.log(1)), | |
'max_depth': hp.randint('max_depth', 1, 10), | |
'subsample': hp.uniform('subsample', 0.2, 1), | |
'colsample_bytree': hp.uniform('colsample_bytree', 0.2, 1), | |
'colsample_bylevel': hp.uniform('colsample_bylevel', 0.2, 1), | |
'min_child_weight': hp.loguniform('min_child_weight', -16, 5), | |
'alpha': hp.loguniform('alpha', -16, 2), | |
'lambda': hp.loguniform('lambda', -16, 2), | |
'gamma': hp.loguniform('gamma', -16, 2), | |
'n_estimators': hp.randint('n_estimators', 100, 4000), # This is smaller than in paper | |
} | |
def xgb_metric(x, y, test_x, test_y, cat_features, metric_used, max_time=300): | |
# XGB Documentation: | |
# XGB handles categorical data appropriately without using One Hot Encoding, categorical features are experimetal | |
# XGB handles missing values appropriately without imputation | |
x, y, test_x, test_y = preprocess_impute(x, y, test_x, test_y | |
, one_hot=False | |
, cat_features=cat_features | |
, impute=False | |
, standardize=False) | |
def clf_(**params): | |
return xgb.XGBClassifier(use_label_encoder=False | |
, nthread=1 | |
, **params | |
, eval_metric=get_scoring_string(metric_used, usage='xgb') # AUC not implemented | |
) | |
start_time = time.time() | |
def stop(trial): | |
return time.time() - start_time > max_time, [] | |
best = fmin( | |
fn=lambda params: eval_f(params, clf_, x, y, metric_used, start_time, max_time), | |
space=param_grid_hyperopt['xgb'], | |
algo=rand.suggest, | |
rstate=np.random.RandomState(int(y[:].sum())), | |
early_stop_fn=stop, | |
# The seed is deterministic but varies for each dataset and each split of it | |
max_evals=1000) | |
best = space_eval(param_grid_hyperopt['xgb'], best) | |
clf = clf_(**best) | |
clf.fit(x, y) | |
pred = clf.predict_proba(test_x) | |
metric = metric_used(test_y, pred) | |
return metric, pred, best | |
clf_dict = {'gp': gp_metric | |
, 'knn': knn_metric | |
, 'catboost': catboost_metric | |
, 'xgb': xgb_metric | |
, 'logistic': logistic_metric | |
, 'autosklearn': autosklearn_metric | |
, 'autosklearn2': autosklearn2_metric | |
, 'autogluon': autogluon_metric} |