Spaces:
Runtime error
Runtime error
File size: 3,061 Bytes
5d396e9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 |
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, SplineTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
class GroupImputer(BaseEstimator, TransformerMixin):
    """
    Class used for imputing missing values in one column of a
    pd.DataFrame using the mean, median, or mode of that column
    computed by groupwise aggregation.

    Parameters:
    -----------
    target : str
        - The name of the column to be imputed
    group_cols : list
        - List of name(s) of columns on which to groupby. The default
          of None is rejected by the constructor, so a list must always
          be supplied.
    strategy : str
        - The method for replacement; can be any of
          ['mean', 'median', 'mode']

    Attributes:
    -----------
    impute_map_ : pd.DataFrame
        - Set by fit(). One row per group: the group_cols values plus
          the fill value for the target column.

    Returns:
    --------
    X : pd.DataFrame
        - The dataframe with imputed values in the target column
    """

    def __init__(self, target, group_cols=None, strategy='median'):
        # NOTE: asserts are kept (rather than raising ValueError/TypeError)
        # so the exception type seen by existing callers does not change.
        assert strategy in ['mean', 'median', 'mode'], \
            "strategy must be in ['mean', 'median', 'mode']"
        assert isinstance(group_cols, list), \
            'group_cols must be a list of column names'
        assert isinstance(target, str), 'target must be a string'
        self.group_cols = group_cols
        self.strategy = strategy
        self.target = target

    @staticmethod
    def _most_frequent(values):
        """Return the most frequent non-null value, or NaN if there is none."""
        # dropna=True so that NaN can never be chosen as the fill value;
        # a group made up entirely of NaN yields an empty result.
        modes = values.mode(dropna=True)
        return modes.iloc[0] if not modes.empty else np.nan

    def fit(self, X, y=None):
        """
        Compute the per-group fill value of the target column.

        Parameters:
        -----------
        X : pd.DataFrame
            - Must contain the target column and every column in group_cols
        y : ignored

        Returns:
        --------
        self : GroupImputer
            - The fitted imputer, with impute_map_ set
        """
        grouped = X.groupby(self.group_cols)[self.target]
        if self.strategy == 'mode':
            fill_values = grouped.agg(self._most_frequent)
        else:
            fill_values = grouped.agg(self.strategy)
        # Keep the group keys as ordinary columns so transform() can
        # merge on them.
        self.impute_map_ = fill_values.reset_index(drop=False)
        return self

    def transform(self, X, y=None):
        """
        Fill missing values of the target column with the fitted
        per-group values.

        Rows whose group was not seen in fit(), or whose group has no
        usable fill value, are left unchanged.

        Parameters:
        -----------
        X : pd.DataFrame
            - Must contain the target column and every column in group_cols
        y : ignored

        Returns:
        --------
        X : pd.DataFrame
            - A copy of the input with the target column imputed
        """
        check_is_fitted(self, 'impute_map_')
        X = X.copy()
        missing = X[self.target].isna().to_numpy()
        if not missing.any():
            return X
        # A single left merge looks up every row's fill value at once.
        # It preserves the row order of X, and the group keys in
        # impute_map_ are unique, so the result lines up positionally
        # with X.
        fill_values = (
            X[self.group_cols]
            .merge(self.impute_map_, on=self.group_cols, how='left')[self.target]
            .to_numpy()
        )
        # Positional boolean mask: works even if X has a non-unique index.
        X.loc[missing, self.target] = fill_values[missing]
        return X
# Sine and cosine transformations
def sin_feature_names(transformer, feature_names):
    """Return the input feature names, each prefixed with 'SIN_'.

    The ``transformer`` argument is accepted but not used.
    """
    prefixed = []
    for name in feature_names:
        prefixed.append(f'SIN_{name}')
    return prefixed
def cos_feature_names(transformer, feature_names):
    """Return the input feature names, each prefixed with 'COS_'.

    The ``transformer`` argument is accepted but not used.
    """
    prefixed = []
    for name in feature_names:
        prefixed.append(f'COS_{name}')
    return prefixed
def _sin_of_period(x, period):
    """Return sin(2*pi*x/period), computed element-wise by numpy."""
    return np.sin(2 * np.pi * x / period)


def sin_transformer(period):
    """
    Build a FunctionTransformer that maps x to sin(2*pi*x/period).

    Parameters:
    -----------
    period : number
        - The value of x at which the sine completes one full cycle

    Returns:
    --------
    FunctionTransformer
        - Output feature names are produced by sin_feature_names
    """
    # A module-level function plus kw_args is used instead of a lambda
    # closing over `period`: a lambda makes the transformer (and any
    # pipeline holding it) impossible to pickle.
    return FunctionTransformer(
        _sin_of_period,
        kw_args={'period': period},
        feature_names_out=sin_feature_names,
    )
def _cos_of_period(x, period):
    """Return cos(2*pi*x/period), computed element-wise by numpy."""
    return np.cos(2 * np.pi * x / period)


def cos_transformer(period):
    """
    Build a FunctionTransformer that maps x to cos(2*pi*x/period).

    Parameters:
    -----------
    period : number
        - The value of x at which the cosine completes one full cycle

    Returns:
    --------
    FunctionTransformer
        - Output feature names are produced by cos_feature_names
    """
    # A module-level function plus kw_args is used instead of a lambda
    # closing over `period`: a lambda makes the transformer (and any
    # pipeline holding it) impossible to pickle.
    return FunctionTransformer(
        _cos_of_period,
        kw_args={'period': period},
        feature_names_out=cos_feature_names,
    )
# Periodic spline transformation
def periodic_spline_transformer(period, n_splines=None, degree=3):
    """
    Build a SplineTransformer with periodic extrapolation over [0, period].

    Parameters:
    -----------
    period : number
        - Upper end of the knot range; knots are evenly spaced from 0 to
          period. Also used as the number of splines when n_splines is None.
    n_splines : int, optional
        - Number of splines; defaults to period
    degree : int
        - Passed through as the spline degree

    Returns:
    --------
    SplineTransformer
        - Configured with n_splines + 1 knots, extrapolation="periodic"
          and include_bias=True
    """
    spline_count = period if n_splines is None else n_splines
    # One more knot than splines: periodic and include_bias is True.
    knot_count = spline_count + 1
    knot_positions = np.linspace(0, period, knot_count).reshape(knot_count, 1)
    return SplineTransformer(
        degree=degree,
        n_knots=knot_count,
        knots=knot_positions,
        extrapolation="periodic",
        include_bias=True,
    )