# File: BikeSaferPA / lib / transform_data.py
# Web-viewer residue from extraction (not part of the module):
# "etweedy's picture", "Upload 22 files", "5d396e9", "raw",
# "history blame", "3.06 kB".
import pandas as pd
import numpy as np
from sklearn.preprocessing import FunctionTransformer, SplineTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
class GroupImputer(BaseEstimator, TransformerMixin):
    """
    Impute missing values in one column of a pd.DataFrame using the
    mean, median, or mode of that column, computed within groups.

    Parameters:
    -----------
    target : str
        - The name of the column to be imputed
    group_cols : list or None
        - List of name(s) of columns on which to groupby; if None,
          the statistic is computed over the whole target column
    strategy : str
        - The method for replacement; can be any of
          ['mean', 'median', 'mode']

    Attributes:
    -----------
    impute_map_ : pd.DataFrame
        - Set by fit.  One row per group seen during fit, holding the
          group_cols values and the fill value in the target column
          (a single row with only the target column when group_cols
          is None)

    Raises:
    -------
    ValueError
        - If strategy is not one of the supported names
    TypeError
        - If group_cols is neither a list nor None, or target is not a str
    """

    _STRATEGIES = ('mean', 'median', 'mode')

    def __init__(self, target, group_cols=None, strategy='median'):
        # Explicit raises instead of asserts: asserts vanish under `python -O`.
        if strategy not in self._STRATEGIES:
            raise ValueError("strategy must be in ['mean', 'median', 'mode']")
        if group_cols is not None and not isinstance(group_cols, list):
            raise TypeError('group_cols must be a list of column names')
        if not isinstance(target, str):
            raise TypeError('target must be a string')
        # Parameters are stored unmodified so get_params/clone round-trip.
        self.group_cols = group_cols
        self.strategy = strategy
        self.target = target

    @staticmethod
    def _mode_ignoring_nan(values):
        """Return the most frequent non-missing value, or NaN if there is none."""
        modes = values.mode(dropna=True)
        # mode() returns the tied values sorted; take the first, as before.
        return modes.iloc[0] if len(modes) else np.nan

    def fit(self, X, y=None):
        """
        Compute the fill value for each group of X.

        Rows whose group_cols contain missing values are dropped by
        groupby and therefore contribute no group.

        Returns:
        --------
        self
        """
        use_mode = self.strategy == 'mode'
        if self.group_cols is None:
            # No grouping: a single statistic over the whole column.
            column = X[self.target]
            if use_mode:
                fill_value = self._mode_ignoring_nan(column)
            else:
                fill_value = column.agg(self.strategy)
            impute_map = pd.DataFrame({self.target: [fill_value]})
        else:
            # NaN must never be chosen as the mode, otherwise the "fill"
            # value for that group would itself be missing.
            aggregate = self._mode_ignoring_nan if use_mode else self.strategy
            impute_map = (
                X.groupby(self.group_cols)[self.target]
                .agg(aggregate)
                .reset_index(drop=False)
            )
        self.impute_map_ = impute_map
        return self

    def transform(self, X, y=None):
        """
        Return a copy of X with missing target values filled per group.

        Rows belonging to a group not seen during fit (or whose fitted
        fill value is itself missing) are left unchanged.

        Returns:
        --------
        X : pd.DataFrame
            - The dataframe with imputed values in the target column
        """
        check_is_fitted(self, 'impute_map_')
        X = X.copy()

        if self.group_cols is None:
            fill_value = self.impute_map_[self.target].iloc[0]
            if not pd.isna(fill_value):
                X[self.target] = X[self.target].fillna(fill_value)
            return X

        # Groups are disjoint, so the set of missing rows can be computed once.
        missing = X[self.target].isna()
        for _, row in self.impute_map_.iterrows():
            fill_value = row[self.target]
            if pd.isna(fill_value):
                continue
            in_group = (X[self.group_cols] == row[self.group_cols]).all(axis=1)
            to_fill = in_group & missing
            if to_fill.any():
                X.loc[to_fill, self.target] = fill_value
        return X
# Sine and cosine transformations
def sin_feature_names(transformer, feature_names):
    """
    Return the given feature names, each prefixed with 'SIN_'.

    The transformer argument is unused; it is accepted so this function
    can be passed as a feature_names_out callable.
    """
    return list(map('SIN_{}'.format, feature_names))
def cos_feature_names(transformer, feature_names):
    """
    Return the given feature names, each prefixed with 'COS_'.

    The transformer argument is unused; it is accepted so this function
    can be passed as a feature_names_out callable.
    """
    return list(map('COS_{}'.format, feature_names))
def _sin_of_cycle(x, period):
    """Return sin(2*pi*x/period) for a cyclic feature x with the given period."""
    return np.sin(2 * np.pi * x / period)


def sin_transformer(period):
    """
    Build a FunctionTransformer mapping x to sin(2*pi*x/period).

    A named module-level function with kw_args is used instead of a
    lambda so the returned transformer can be pickled.

    Parameters:
    -----------
    period : number
        - Length of one full cycle of the feature

    Returns:
    --------
    FunctionTransformer whose output feature names are 'SIN_<name>'
    """
    return FunctionTransformer(
        _sin_of_cycle,
        kw_args={'period': period},
        feature_names_out=sin_feature_names,
    )
def _cos_of_cycle(x, period):
    """Return cos(2*pi*x/period) for a cyclic feature x with the given period."""
    return np.cos(2 * np.pi * x / period)


def cos_transformer(period):
    """
    Build a FunctionTransformer mapping x to cos(2*pi*x/period).

    A named module-level function with kw_args is used instead of a
    lambda so the returned transformer can be pickled.

    Parameters:
    -----------
    period : number
        - Length of one full cycle of the feature

    Returns:
    --------
    FunctionTransformer whose output feature names are 'COS_<name>'
    """
    return FunctionTransformer(
        _cos_of_cycle,
        kw_args={'period': period},
        feature_names_out=cos_feature_names,
    )
# Periodic spline transformation
def periodic_spline_transformer(period, n_splines=None, degree=3):
    """
    Build a SplineTransformer with periodic extrapolation over [0, period].

    Parameters:
    -----------
    period : number
        - Upper end of the knot range; also used as the number of
          splines when n_splines is None
    n_splines : int or None
        - Number of splines; defaults to period
    degree : int
        - Degree passed through to SplineTransformer

    Returns:
    --------
    SplineTransformer with evenly spaced knots from 0 to period
    """
    spline_count = period if n_splines is None else n_splines
    # Periodic extrapolation with include_bias=True: one more knot than splines.
    knot_count = spline_count + 1
    knot_positions = np.linspace(0, period, knot_count).reshape(knot_count, 1)
    return SplineTransformer(
        degree=degree,
        n_knots=knot_count,
        knots=knot_positions,
        extrapolation="periodic",
        include_bias=True,
    )