import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.feature_selection import chi2, SelectKBest, mutual_info_classif, f_classif
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, f1_score, fbeta_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report, precision_recall_curve
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, SplineTransformer, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
# from lightgbm import LGBMClassifier
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.utils.validation import check_is_fitted
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
# RandomOverSampler is required when build_pipeline(over_sample=True) is used;
# note that sampler steps need imblearn's Pipeline rather than sklearn's.
from imblearn.over_sampling import RandomOverSampler
from IPython.display import display  # used to render results tables in randomized_search
from lib.transform_data import *

class ClassifierStudy():
    """
    A class that contains tools for studying a classifier pipeline
    
    Parameters:
    -----------
    classifier : a scikit-learn compatible binary classifier
    X : pd.DataFrame
        dataframe of features
    y : pd.Series
        series of binary target values corresponding to X
    classifier_name : str or None
        if provided, will use as classifier name in pipeline
        if not, will use 'clf' as name
    features : dict
        a dictionary whose keys are the feature types
        'cyc','cat','ord','num','bin' and whose values
        are lists of features of each type.
        
    Methods:
    -------
    set_data, set_features, set_state
        sets or resets attributes of self
    build_pipeline
        builds out pipeline based on supplied specs
    cv_score
        runs k-fold cross validation and reports scores
    randomized_search
        runs randomized search with cross validation
        and reports results
    fit_pipeline
        fits the model pipeline and stores as
        self.pipe_fitted
    predict_proba_pipeline
        uses a fitted pipeline to compute predicted
        probabilities for test or validation set
    score_pipeline
        scores predicted probabilities
    shap_values, shap_plot
        compute and plot SHAP values for a fitted pipeline
    find_best_threshold
        finds the classification threshold optimizing the F_beta score
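
    Example:
    --------
    A hypothetical usage sketch (the classifier settings, data, and feature
    dictionary below are illustrative assumptions):

        study = ClassifierStudy(LogisticRegression(max_iter=1000), X, y,
                                features=features, classifier_name='LR')
        study.build_pipeline(cat_method='onehot', cyc_method='spline')
        study.cv_score(scoring='roc_auc')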
        
    """
    def __init__(self, classifier=None, X = None, y = None,
                 features = None,classifier_name = None,
                 random_state=42):
        self.classifier = classifier
        if X is not None:
            self.X = X.copy()
        if y is not None:
            self.y = y.copy()
        if features is not None:
            self.features = features.copy()
        self.random_state=random_state
        self.pipe, self.pipe_fitted = None, None
        self.classifier_name = classifier_name
        self.X_val, self.y_val = None, None
        self.y_predict_proba = None
        self.best_params, self.best_n_components = None, None
        self.shap_vals = None
    
    def set_data(self,X=None,y=None):
        """Method to set or reset feature and/or target data"""
        if X is not None:
            self.X = X.copy()
        if y is not None:
            self.y = y.copy()
    
    def set_features(self,features):
        """Method to set or reset the feature dictionary"""
        if features is not None:
            self.features = features.copy()        
    
    def set_state(self,random_state):
        """Method to set or reset the random_state"""
        self.random_state = random_state
        
    def build_pipeline(self, cat_method = 'onehot',cyc_method = 'spline',num_ss=True,
                       over_sample = False, pca=False,n_components=None,
                       select_features = False,score_func=None,k='all',
                       poly_features = False, degree=2, interaction_only=False):
        """
        Method to build the model pipeline
        Parameters:
        -----------
        cat_method : str
            specifies whether to encode categorical
            variables as one-hot vectors or ordinals
            must be either 'onehot' or 'ord'
        cyc_method : str
            specifies whether to encode cyclical features
            with sine/cosine encoding or periodic splines
            must be one of 'trig', 'spline', 'interact-trig',
            'interact-spline','onehot', 'ord', or None
            - If 'trig' or 'spline', will set up periodic encoder
              with desired method
            - If 'onehot' or 'ord', will set up appropriate
              categorical encoder
            - If 'interact-{method}', will use <method> encoding for HOUR_OF_DAY,
              encode DAY_OF_WEEK as a binary feature expressing whether
              the day is a weekend day, and then include interaction
              features among this set via PolynomialFeatures.
            - If None, will leave out cyclical features altogether
        num_ss : bool
            Whether or not to apply StandardScaler on the numerical features
        over_sample : bool
            set to True to include imblearn.over_sampling.RandomOverSampler step
        pca : bool
            set to True to include sklearn.decomposition.PCA step
        n_components : int or None
            number of components for sklearn.decomposition.PCA
        select_features : bool
            set to True to include sklearn.feature_selection.SelectKBest step
        score_func : callable
            score function to use for sklearn.feature_selection.SelectKBest
            recommended: chi2, f_classif, or mutual_info_classif
        k : int or 'all'
            number of features for sklearn.feature_selection.SelectKBest
        poly_features : bool
            set to True to include sklearn.preprocessing.PolynomialFeatures step
        degree : int
            max degree for sklearn.preprocessing.PolynomialFeatures
        interaction_only : bool
            whether or not sklearn.preprocessing.PolynomialFeatures will be limited
            to interaction terms only
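
        Example:
        --------
        A hypothetical call (assumes data and a feature dictionary have
        already been supplied; f_classif is imported at module level):

            study.build_pipeline(cat_method='onehot', cyc_method='interact-spline',
                                 select_features=True, score_func=f_classif, k=30)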
        """
        
        # Define transformer for categorical features
        if cat_method == 'onehot':
            cat_encoder = ('ohe',OneHotEncoder(handle_unknown='infrequent_if_exist'))
                                
        elif cat_method == 'ord':
            cat_encoder = ('oe',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))
        else:
            raise ValueError("cat_method must be either 'onehot' or 'ord'")
    
        cat_transform = Pipeline([('si',SimpleImputer(strategy='most_frequent')),cat_encoder])
    
        # Define transformer for cyclic features
        cyc_dict = {'HOUR_OF_DAY':24,'DAY_OF_WEEK':7}
        if cyc_method == 'trig':
            cyc_transform = [(f'{feat}_cos',cos_transformer(cyc_dict[feat]),[feat]) for feat in self.features['cyc']]+\
                        [(f'{feat}_sin',sin_transformer(cyc_dict[feat]),[feat]) for feat in self.features['cyc']]
        elif cyc_method =='spline':
            cyc_transform = [(f'{feat}_cyclic',
                          periodic_spline_transformer(cyc_dict[feat],n_splines=cyc_dict[feat]//2),
                          [feat]) for feat in self.features['cyc']]
        elif cyc_method == 'onehot':
            cyc_encoder = ('ohe_cyc',OneHotEncoder(handle_unknown='infrequent_if_exist'))
            cyc_transform = [('cyc',Pipeline([cyc_encoder]),self.features['cyc'])]
        elif cyc_method == 'ord':
            cyc_encoder = ('oe_cyc',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))
            cyc_transform = [('cyc',Pipeline([cyc_encoder]),self.features['cyc'])]
        elif cyc_method == 'interact-spline':
            hour_transform = ('hour_cyc',periodic_spline_transformer(cyc_dict['HOUR_OF_DAY'],n_splines=12),['HOUR_OF_DAY'])
            wkend_transform = ('wkend',FunctionTransformer(lambda x: (x.isin([1,7])).astype(int)),['DAY_OF_WEEK'])
            cyc_transform = [('cyc',Pipeline([('cyc_col',ColumnTransformer([hour_transform, wkend_transform],
                                                               remainder='drop',verbose_feature_names_out=False)),
                                              ('cyc_poly',PolynomialFeatures(degree=2,interaction_only=True,
                                                                include_bias=False))]),
                             self.features['cyc'])]
        elif cyc_method == 'interact-trig':
            hour_transform = [('HOUR_cos',cos_transformer(cyc_dict['HOUR_OF_DAY']),['HOUR_OF_DAY']),
                              ('HOUR_sin',sin_transformer(cyc_dict['HOUR_OF_DAY']),['HOUR_OF_DAY'])]
            wkend_transform = ('wkend',FunctionTransformer(lambda x: (x.isin([1,7])).astype(int)),['DAY_OF_WEEK'])
            cyc_transform = [('cyc',Pipeline([('cyc_col',ColumnTransformer(hour_transform+[wkend_transform],
                                                               remainder='drop',verbose_feature_names_out=False)),
                                              ('cyc_poly',PolynomialFeatures(degree=2,interaction_only=True,
                                                                include_bias=False))]),
                             self.features['cyc'])]
        elif cyc_method is None:
            cyc_transform = [('cyc','passthrough',[])]
        else:
            raise ValueError("cyc_method must be one of 'trig','spline','interact-trig','interact-spline','onehot','ord', or None")
        
        # Define numerical transform
        num_transform = ('num',StandardScaler(),self.features['num']) if num_ss else\
                        ('num','passthrough',self.features['num'])
        
        # Define column transformer
        col_transform = ColumnTransformer([('cat',cat_transform,self.features['cat']),
                                           ('ord','passthrough',self.features['ord']),
                                           num_transform,
                                           ('bin',SimpleImputer(strategy='most_frequent'),
                                             self.features['bin'])]+\
                                           cyc_transform,
                                           remainder='drop',verbose_feature_names_out=False)
    
        steps = [('col',col_transform)]
    
        if 'AGE' in self.features['num']:
            steps.insert(0,('gi_age',GroupImputer(target = 'AGE', group_cols=['COUNTY'],strategy='median')))
        if 'HOUR_OF_DAY' in self.features['cyc']:
            steps.insert(0,('gi_hour',GroupImputer(target = 'HOUR_OF_DAY', group_cols=['ILLUMINATION','CRASH_MONTH'],strategy='mode')))
        # Insert optional steps as needed
        if over_sample:
            steps.insert(0,('os',RandomOverSampler(random_state=self.random_state)))
        if poly_features:
            steps.append(('pf',PolynomialFeatures(degree=degree,interaction_only=interaction_only)))
        if select_features:
            steps.append(('fs',SelectKBest(score_func = score_func, k = k)))
        if pca:
            steps.append(('pca',PCA(n_components=n_components,random_state=self.random_state)))
        # Append classifier if provided
        if self.classifier is not None:
            if self.classifier_name is not None:
                steps.append((f'{self.classifier_name}_clf',self.classifier))
            else:
                steps.append(('clf',self.classifier))
    
        # Initialize pipeline
        self.pipe = Pipeline(steps)
    
    def cv_score(self, scoring = 'roc_auc', n_splits = 5, n_repeats=3, thresh = 0.5, beta = 1,
                 return_mean_score=False,print_mean_score=True,print_scores=False, n_jobs=-1,
                eval_size=0.1,eval_metric='auc'):
        """
        Method for performing cross validation via RepeatedStratifiedKFold
        
        Parameters:
        -----------
        scoring : str
            scoring function to use.  must be one of
            'roc_auc','acc','f1','fb','f1w'
        thresh : float
            the classification threshold for computing y_pred
            from y_pred_proba
        beta : float
            the beta-value to use in the f_beta score, if chosen
        n_splits, n_repeats : int, int
            number of splits and number of repeat iterations
            for sklearn.model_selection.RepeatedStratifiedKFold
        return_mean_score : bool
            whether or not to return the mean score
        print_mean_score : bool
            whether to print out a report of the mean score
        print_scores : bool
            whether to print out a report of CV scores for all folds
        n_jobs : int or None
            number of CPU cores to use for parallel processing
            -1 uses all available cores, and None defaults to 1
            (currently not used by the manual fold loop below)
        eval_size : float
            Fraction of the training set to use for early stopping eval set
        eval_metric : str
            eval metric to use in early stopping
        Returns: None or mean_score, depending on return_mean_score setting
        --------
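
        Example:
        --------
        A hypothetical call (assumes a pipeline with a classifier step has
        already been built and data have been set):

            mean_auc = study.cv_score(scoring='roc_auc', n_splits=5, n_repeats=3,
                                      return_mean_score=True, print_mean_score=False)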
        """
        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
        assert scoring in ['roc_auc','acc','f1','fb','f1w'],"scoring must be one of 'roc_auc','acc','f1','fb','f1w'"
        
        # Initialize CV iterator
        kf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats=n_repeats,
                                     random_state=self.random_state)
        # Restrict to features supplied in self.features
        X = self.X[[feat for feat_type in self.features for feat in self.features[feat_type]]]
        
        lgb_es=False
        # if isinstance(self.pipe[-1],LGBMClassifier):
        #     if 'early_stopping_round' in self.pipe[-1].get_params():
        #         if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
        #             lgb_es=True

        scores = []
        # Iterate over folds and train, predict, score
        for i,(train_idx,test_idx) in enumerate(kf.split(X,self.y)):
            fold_X_train = X.iloc[train_idx,:]
            fold_X_test = X.iloc[test_idx,:]
            fold_y_train = self.y.iloc[train_idx]
            fold_y_test = self.y.iloc[test_idx]
            
            pipe=clone(self.pipe)
            if lgb_es:
                fold_X_train,fold_X_es,fold_y_train,fold_y_es = train_test_split(fold_X_train,fold_y_train,
                                                                                 stratify=fold_y_train,test_size=eval_size,
                                                                                 random_state=self.random_state)
                trans_pipe = pipe[:-1]
                trans_pipe.fit_transform(fold_X_train)
                fold_X_es = trans_pipe.transform(fold_X_es)
                clf_name = pipe.steps[-1][0]
                fit_params = {f'{clf_name}__eval_set':[(fold_X_es,fold_y_es)],
                              f'{clf_name}__eval_metric':eval_metric,
                              f'{clf_name}__verbose':0}
            else:
                fit_params = {}
            
            pipe.fit(fold_X_train,fold_y_train,**fit_params)
            fold_y_pred_proba = pipe.predict_proba(fold_X_test)[:,1]
            
            if scoring == 'roc_auc':
                fold_score = roc_auc_score(fold_y_test, fold_y_pred_proba)
            else:
                fold_y_pred = (fold_y_pred_proba >= thresh).astype('int')
                if scoring == 'acc':
                    fold_score = accuracy_score(fold_y_test,fold_y_pred)
                elif scoring == 'f1':
                    fold_score = f1_score(fold_y_test,fold_y_pred)
                elif scoring == 'f1w':
                    fold_score = f1_score(fold_y_test,fold_y_pred,average='weighted')
                else:
                    fold_score = fbeta_score(fold_y_test,fold_y_pred,beta=beta)
            scores.append(fold_score)
        
        # Average and report
        mean_score = np.mean(scores)
        if print_scores:
            print(f'CV scores using {scoring} score: {scores} \nMean score: {mean_score}')
        if print_mean_score:
            print(f'Mean CV {scoring} score: {mean_score}')
        if return_mean_score:
            return mean_score
            
    def randomized_search(self, params, n_components = None, n_iter=10,
                          scoring='roc_auc',cv=5,refit=False,top_n=10, n_jobs=-1):
        """
        Method for performing randomized search with cross validation on a given dictionary of parameter distributions
        Also displays a table of results for the best top_n iterations
        
        Parameters:
        ----------
        params : dict
            parameter distributions to use for RandomizedSearchCV
        n_components : int, or list, or None
            number of components for sklearn.decomposition.PCA
            - if int, will reset the PCA layer in self.pipe with provided value
            - if list, must be list of ints, which will be included in
              RandomizedSearchCV parameter distribution
        scoring : str
            scoring function for sklearn.model_selection.cross_val_score
        n_iter : int
            number of iterations to use in RandomizedSearchCV
        refit : bool
            whether to refit a final classifier with best parameters
            - if False, will only set self.best_params and self.best_score
            - if True, will set self.best_estimator in addition
        top_n : int or None
            if int, will display results from top_n best iterations only
            if None, will display all results
        n_jobs : int or None
            number of CPU cores to use for parallel processing
            -1 uses all available cores, and None defaults to 1
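
        Example:
        --------
        A hypothetical call (the parameter name assumes a LogisticRegression
        classifier step; the distribution values are illustrative):

            study.randomized_search({'C': np.logspace(-3, 3, 20)},
                                    n_iter=20, scoring='roc_auc', cv=5)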
        """
        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
        assert (n_components is None)|('pca' in self.pipe.named_steps), 'Your pipeline has no PCA step.  Build a pipeline with PCA first.'
        assert (len(params)>0)|(type(n_components)==list), 'Either pass a parameter distribution or a list of n_components values.'
        
        # Add estimator name prefix to hyperparams
        params = {self.pipe.steps[-1][0]+'__'+key:params[key] for key in params}
        
        # Process supplied n_components
        if type(n_components)==list:
            params['pca__n_components']=n_components
        elif type(n_components)==int:
            self.pipe['pca'].set_params(n_components=n_components)
        
        # Restrict to features supplied in self.features
        X = self.X[[feat for feat_type in self.features for feat in self.features[feat_type]]]
        
        # Initialize rs and fit
        rs = RandomizedSearchCV(self.pipe, param_distributions = params,
                                n_iter=n_iter, scoring = scoring, cv = cv,refit=refit,
                                random_state=self.random_state, n_jobs=n_jobs)
        
        rs.fit(X,self.y)
    
        # Display top n scores
        results = rs.cv_results_
        results_df = pd.DataFrame(results['params'])
        param_names = list(results_df.columns)
        results_df[f'mean cv score ({scoring})']=pd.Series(results['mean_test_score'])
        results_df = results_df.set_index(param_names).sort_values(by=f'mean cv score ({scoring})',ascending=False)
        if top_n is not None:
            display(results_df.head(top_n).style\
                    .highlight_max(axis=0, props='color:white; font-weight:bold; background-color:seagreen;'))
        else:
            display(results_df.style\
                    .highlight_max(axis=0, props='color:white; font-weight:bold; background-color:seagreen;'))
        if refit:
            self.best_estimator = rs.best_estimator_
        best_params = rs.best_params_
        self.best_params = {key.split('__')[-1]:best_params[key] for key in best_params if key.split('__')[0]!='pca'}
        self.best_n_components = next((best_params[key] for key in best_params if key.split('__')[0]=='pca'), None)
        self.best_score = rs.best_score_
        
    def fit_pipeline(self,split_first=False, eval_size=0.1,eval_metric='auc'):
        """
        Method for fitting self.pipeline on self.X,self.y
        Parameters:
        -----------
        split_first : bool
            if True, a train_test_split will be performed first
            and the validation set will be stored
        eval_size : float
            Fraction of the training set to use for the early-stopping eval set
            (only relevant if the commented-out LightGBM early-stopping path is re-enabled)
        eval_metric : str
            eval metric to use in early stopping
            (only relevant if the commented-out LightGBM early-stopping path is re-enabled)
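
        Example:
        --------
        A hypothetical call that holds out a validation set for later scoring:

            study.fit_pipeline(split_first=True)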
        """
        # Need pipe and X to fit
        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
        assert self.X is not None, 'X does not exist.  First set X.'
        
        # If no y provided, then no pipeline steps should require y
        step_list = [step[0] for step in self.pipe.steps]
        assert (('clf' not in step_list[-1])&('fs' not in step_list))|(self.y is not None), 'You must provide targets y if pipeline has a classifier step or feature selection step.'
        
        # Don't need to do a train-test split without a classifier
        assert (split_first==False)|('clf' in step_list[-1]), 'Only need train-test split if you have a classifier.'
                
        if split_first:
            X_train,X_val,y_train,y_val = train_test_split(self.X,self.y,stratify=self.y,
                                                           test_size=0.2,random_state=self.random_state)
            self.X_val = X_val
            self.y_val = y_val
        else:
            X_train = self.X.copy()
            if self.y is not None:
                y_train = self.y.copy()        
        
        # Restrict to features supplied in self.features
        X_train = X_train[[feat for feat_type in self.features for feat in self.features[feat_type]]]
        
        # If LGBM early stopping, then need to split off eval_set and define fit_params
        # if isinstance(self.pipe[-1],LGBMClassifier):
        #     if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
        #         X_train,X_es,y_train,y_es = train_test_split(X_train,y_train,
        #                                                        test_size=eval_size,
        #                                                        stratify=y_train,
        #                                                        random_state=self.random_state)
        #         trans_pipe = self.pipe[:-1]
        #         trans_pipe.fit_transform(X_train)
        #         X_es = trans_pipe.transform(X_es)
        #         clf_name = self.pipe.steps[-1][0]
        #         fit_params = {f'{clf_name}__eval_set':[(X_es,y_es)],
        #                       f'{clf_name}__eval_metric':eval_metric,
        #                      f'{clf_name}__verbose':0}
        #     else:
        #         fit_params = {}
        # else:
        #     fit_params = {}
        fit_params = {}
        # Fit and store fitted pipeline. If no classifier, fit_transform X_train and store transformed version
        pipe = self.pipe
        if 'clf' in step_list[-1]:
            pipe.fit(X_train,y_train,**fit_params)
        else:
            X_transformed = pipe.fit_transform(X_train)
            # X_transformed = pd.DataFrame(X_transformed,columns=pipe.get_feature_names_out())
            self.X_transformed = X_transformed
        self.pipe_fitted = pipe
    
    def predict_proba_pipeline(self, X_test = None):
        """
        Method for using a fitted pipeline to compute predicted
        probabilities for X_test (if supplied) or self.X_val
        Parameters:
        -----------
        X_test : pd.DataFrame or None
            test data input features (if None, will use self.X_val)
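
        Example:
        --------
        A hypothetical call after fit_pipeline(split_first=True):

            study.predict_proba_pipeline()      # uses the stored validation set
            # or, with an external test frame (assumed to exist):
            # study.predict_proba_pipeline(X_test)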
        """
        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
        assert self.pipe_fitted is not None, 'Pipeline is not fitted.  First fit pipeline using fit_pipeline.'
        assert (X_test is not None)|(self.X_val is not None), 'Must either provide X_test or fit the pipeline with split_first=True.'
        
        if X_test is None:
            X_test = self.X_val
            
        # Restrict to features supplied in self.features
        X_test = X_test[[feat for feat_type in self.features for feat in self.features[feat_type]]]
        
        # Save prediction
        self.y_predict_proba = self.pipe_fitted.predict_proba(X_test)[:,1]
        
    def score_pipeline(self,y_test=None,scoring='roc_auc',thresh=0.5, beta = 1,
                       normalize = None, print_score = True):
        """
        Method for scoring self.pipe_fitted on supplied test data and reporting score
        Parameters:
        -----------
        y_test : pd.Series or None
            true binary targets (if None, will use self.y_val)
        scoring : str
            specifies the metric to use for scoring
            must be one of
            'roc_auc', 'roc_plot', 'acc', 'f1', 'f1w', 'fb','mcc','kappa','conf','classif_report'
        thresh : float
            threshold value for computing y_pred
            from y_predict_proba
        beta : float
            the beta parameter in the fb score
        normalize : str or None
            the normalize parameter for the 
            confusion_matrix. must be one of
            'true','pred','all',None
        print_score : bool
            if True, will print a message reporting the score
            if False, will return the score as a float
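
        Example:
        --------
        A hypothetical scoring sequence after fitting with split_first=True
        and running predict_proba_pipeline:

            auc = study.score_pipeline(scoring='roc_auc', print_score=False)
            study.score_pipeline(scoring='conf', thresh=0.3)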
        """
        assert (y_test is not None)|(self.y_val is not None), 'Must either provide y_test or fit the pipeline with split_first=True.'
        assert self.y_predict_proba is not None, 'Predicted probabilities do not exist.  Run predict_proba_pipeline first.'
        
        if y_test is None:
            y_test = self.y_val
        
        # Score and report
        if scoring == 'roc_plot':
            fig = plt.figure(figsize=(4,4))
            ax = fig.add_subplot(111)
            RocCurveDisplay.from_predictions(y_test,self.y_predict_proba,ax=ax)
            plt.show()
        elif scoring == 'roc_auc':
            score = roc_auc_score(y_test, self.y_predict_proba)
        else:
            y_pred = (self.y_predict_proba >= thresh).astype('int')
            if scoring == 'acc':
                score = accuracy_score(y_test,y_pred)
            elif scoring == 'f1':
                score = f1_score(y_test,y_pred)
            elif scoring == 'f1w':
                score = f1_score(y_test,y_pred,average='weighted')
            elif scoring == 'fb':
                score = fbeta_score(y_test,y_pred,beta=beta)
            elif scoring == 'mcc':
                score = matthews_corrcoef(y_test,y_pred)
            elif scoring == 'kappa':
                score = cohen_kappa_score(y_test,y_pred)
            elif scoring == 'conf':
                fig = plt.figure(figsize=(3,3))
                ax = fig.add_subplot(111)
                ConfusionMatrixDisplay.from_predictions(y_test,y_pred,ax=ax,colorbar=False)
                plt.show()
            elif scoring == 'classif_report':
                target_names=['neither seriously injured nor killed','seriously injured or killed']
                print(classification_report(y_test, y_pred,target_names=target_names))
            else:
                raise ValueError("scoring must be one of 'roc_auc', 'roc_plot','acc', 'f1', 'f1w', 'fb','mcc','kappa','conf','classif_report'")
        if scoring not in ['conf','roc_plot','classif_report']:
            if print_score:
                print(f'The {scoring} score is: {score}')
            else:
                return score
    
    def shap_values(self, X_test = None, eval_size=0.1,eval_metric='auc'):
        """
        Method for computing SHAP values for features using a
        stratified train/test split
        A copy of self.pipe is fitted on the training set
        and then SHAP values are computed on test set samples
        Parameters:
        -----------
        X_test : pd.DataFrame
            The test set; if provided, will not perform
            a train/test split before fitting
        eval_size : float
            Fraction of the training set to use for early stopping eval set
        eval_metric : str
            eval metric to use in early stopping
        Returns: None (stores results in self.shap_vals)
        --------
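
        Example:
        --------
        A hypothetical call, followed by plotting the stored values:

            study.shap_values()
            study.shap_plot(max_display=20)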
        """
        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
        
        
        # Clone pipeline, do train/test split if X_test not provided
        pipe = clone(self.pipe)
        X_train = self.X.copy()
        y_train = self.y.copy()
        if X_test is None:
            X_train,X_test,y_train,y_test = train_test_split(X_train,y_train,stratify=y_train,
                                                                 test_size=0.2,random_state=self.random_state)
        # Restrict to features provided in self.features, and fit
        X_train = X_train[[feat for feat_type in self.features for feat in self.features[feat_type]]]
        X_test = X_test[[feat for feat_type in self.features for feat in self.features[feat_type]]]
        
        # If LGBM early stopping, then need to split off eval_set and define fit_params
        # if isinstance(self.pipe[-1],LGBMClassifier):
        #     if 'early_stopping_round' in self.pipe[-1].get_params():
        #         if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
        #             X_train,X_es,y_train,y_es = train_test_split(X_train,y_train,
        #                                                        test_size=eval_size,
        #                                                        stratify=y_train,
        #                                                        random_state=self.random_state)
        #         trans_pipe = self.pipe[:-1]
        #         trans_pipe.fit_transform(X_train)
        #         X_es = trans_pipe.transform(X_es)
        #         clf_name = self.pipe.steps[-1][0]
        #         fit_params = {f'{clf_name}__eval_set':[(X_es,y_es)],
        #                       f'{clf_name}__eval_metric':eval_metric,
        #                      f'{clf_name}__verbose':0}
        #     else:
        #         fit_params = {}
        # else:
        #     fit_params = {}
        fit_params = {}
        pipe.fit(X_train,y_train,**fit_params)
            
        # SHAP will just explain classifier, so need transformed X_train and X_test
        X_train_trans, X_test_trans = pipe[:-1].transform(X_train), pipe[:-1].transform(X_test)
            
        # Need masker for linear model
        masker = shap.maskers.Independent(data=X_train_trans)
            
        # Initialize explainer and compute and store SHAP values as an explainer object
        explainer = shap.Explainer(pipe[-1], masker = masker, feature_names = pipe['col'].get_feature_names_out())
        self.shap_vals = explainer(X_test_trans)
        self.X_shap = X_train_trans
        self.y_shap = y_train
            
    def shap_plot(self,max_display='all'):
        """
        Method for generating plots of SHAP value results
        SHAP values should be already computed previously
        Generates two plots side by side:
            - a beeswarm plot of SHAP values of all samples
            - a barplot of mean absolute SHAP values
        Parameters:
        -----------
        max_display : int or 'all'
            The number of features to show in the plot, in descending
            order by mean absolute SHAP value.  If 'all', then
            all features will be included.
            
        Returns: None (plots displayed)
        --------
        """
        assert self.shap_vals is not None, 'No shap values exist.  First compute shap values.'
        assert (isinstance(max_display,int))|(max_display=='all'), "'max_display' must be 'all' or an integer"
        
        if max_display=='all':
            title_add = ', all features'
            max_display = self.shap_vals.shape[1]
        else:
            title_add = f', top {max_display} features'
            
        # Plot
        fig=plt.figure()
        ax1=fig.add_subplot(121)
        shap.summary_plot(self.shap_vals,plot_type='bar',max_display=max_display,
                          show=False,plot_size=0.2)
        ax2=fig.add_subplot(122)
        shap.summary_plot(self.shap_vals,plot_type='violin',max_display=max_display,
                          show=False,plot_size=0.2)
        fig.set_size_inches(12,max_display/3)
        
        ax1.set_title('Mean absolute SHAP values'+title_add,fontsize='small')
        ax1.set_xlabel('mean(|SHAP value|)',fontsize='x-small')
        ax2.set_title('SHAP values'+title_add,fontsize='small')
        ax2.set_xlabel('SHAP value', fontsize='x-small')
        for ax in [ax1,ax2]:
            ax.set_ylabel('feature name',fontsize='x-small')
            ax.tick_params(axis='y', labelsize='xx-small')
        plt.tight_layout()
        plt.show()
    
    def find_best_threshold(self,beta=1,conf=True,report=True, print_result=True):
        """
        Computes the classification threshold which gives the
        best F_beta score from classifier predictions,
        prints the best threshold and the corresponding F_beta score,
        and displays a confusion matrix and classification report
        corresponding to that threshold

        Parameters:
        -----------
        beta : float
            the desired beta value in the F_beta score
        conf : bool
            whether to display confusion matrix
        report : bool
            whether to display classification report
        print_result : bool
            whether to print a line reporting the best threshold
            and resulting F_beta score
        
        Returns: None (prints results and stores self.best_thresh)
        --------
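
        Example:
        --------
        A hypothetical call favoring recall with beta=2 (requires
        predict_proba_pipeline to have been run first):

            study.find_best_threshold(beta=2, conf=True, report=False)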
        """
        prec,rec,threshs = precision_recall_curve(self.y_val,
                                                  self.y_predict_proba)
        F_betas = (1+beta**2)*(prec*rec)/((beta**2*prec)+rec)
        # Above formula is valid when TP!=0.  When TP==0
        # it gives np.nan whereas F_beta should be 0
        F_betas = np.nan_to_num(F_betas)
        idx = np.argmax(F_betas)
        best_thresh = threshs[idx]
        if print_result:
            print(f'Threshold optimizing F_{beta} score:   {best_thresh}\nBest F_{beta} score:   {F_betas[idx]}')
        if conf:
            self.score_pipeline(scoring='conf',thresh=best_thresh,beta=beta)
        if report:
            self.score_pipeline(scoring='classif_report',thresh=best_thresh,beta=beta)
        self.best_thresh = best_thresh

class LRStudy(ClassifierStudy):
    """
    A child class of ClassifierStudy which has an additional method specific to logistic regression
    """
    def __init__(self, classifier=None, X = None, y = None,
                 features=None,classifier_name = 'LR',
                 random_state=42):
        super().__init__(classifier, X, y,features,classifier_name,random_state)
    
    def plot_coeff(self, print_score = True, print_zero = False, title_add=None):
        """
        Method for doing a train/validation split, fitting the classifier,
        predicting and scoring on the validation set, and plotting
        a bar chart of the logistic regression coefficients corresponding
        to various model features.
        Features with coefficient zero and periodic spline features
        will be excluded from the chart.
        Parameters:
        -----------
        print_score : bool
            if True, the validation score is printed
        print_zero : bool
            if True, the list of features with zero coefficients is printed
        title_add : str or None
            an addendum that is added to the end of the plot title
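
        Example:
        --------
        A hypothetical usage sketch (assumes an LRStudy with data, features,
        and a built pipeline):

            lr_study.plot_coeff(print_zero=True, title_add='L1-regularized model')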
        """
        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
        assert isinstance(self.classifier,LogisticRegression),'Your classifier is not an instance of Logistic Regression.'
        
        # fit and score
        self.fit_pipeline(split_first = True)
        self.predict_proba_pipeline()
        score = roc_auc_score(self.y_val, self.y_predict_proba)
        
        # Retrieve coeff values from fitted pipeline
        coeff = pd.DataFrame({'feature name':self.pipe_fitted['col'].get_feature_names_out(),
                               'coeff value':self.pipe_fitted[-1].coef_.reshape(-1)})\
                            .sort_values(by='coeff value')
        coeff = coeff[~coeff['feature name']\
                .isin([f'HOUR_OF_DAY_sp_{n}' for n in range(12)]\
                        +[f'DAY_OF_WEEK_sp_{n}' for n in range(3)])]\
                .set_index('feature name')
        coeff_zero_features = coeff[coeff['coeff value']==0].index
        coeff = coeff[coeff['coeff value']!=0]
        
        # Plot feature coefficients
        fig = plt.figure(figsize=(30,4))
        ax = fig.add_subplot(111)
        coeff['coeff value'].plot(kind='bar',ylabel='coeff value',ax=ax)
        ax.axhline(y=0, color= 'red', linewidth=2,)
        plot_title = 'PA bicycle collisions, 2002-2021\nLogistic regression model log-odds coefficients'
        if title_add is not None:
            plot_title += f': {title_add}'
        ax.set_title(plot_title)
        ax.tick_params(axis='x', labelsize='x-small')
        plt.show()
        
        if print_score:
            print(f'Score on validation set: {score}')
        if print_zero:
            print(f'Features with zero coefficients in trained model: {list(coeff_zero_features)}')
        
        self.score = score
        self.coeff = coeff
        self.coeff_zero_features = coeff_zero_features
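
# ---------------------------------------------------------------------------
# Hypothetical end-to-end usage sketch, guarded so it never runs on import.
# The CSV path, target column, and feature dictionary below are illustrative
# assumptions, not part of this module.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    crashes = pd.read_csv('data/bicycle_crashes.csv')        # assumed path
    target = crashes.pop('SERIOUS_INJURY_OR_DEATH')          # assumed target column
    feature_dict = {'cyc': ['HOUR_OF_DAY', 'DAY_OF_WEEK'],
                    'cat': ['COUNTY', 'ILLUMINATION', 'CRASH_MONTH'],
                    'ord': ['SPEED_LIMIT'],
                    'num': ['AGE'],
                    'bin': ['WET_ROAD']}
    lr_study = LRStudy(LogisticRegression(max_iter=1000),
                       X=crashes, y=target, features=feature_dict)
    lr_study.build_pipeline(cat_method='onehot', cyc_method='spline')
    lr_study.cv_score(scoring='roc_auc')
    lr_study.fit_pipeline(split_first=True)
    lr_study.predict_proba_pipeline()
    lr_study.score_pipeline(scoring='roc_auc')
    lr_study.find_best_threshold(beta=2)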