Spaces:

etweedy
/

BikeSaferPA

Runtime error

App Files Files Community

etweedy committed on Aug 12, 2023

Commit

5d396e9

1 Parent(s): fdc6c87

Upload 22 files

Browse files

Files changed (22) hide show

app.py +555 -0
app_data.pickle +3 -0
crashes.csv +0 -0
cyclists.csv +0 -0
lib/.DS_Store +0 -0
lib/.ipynb_checkpoints/__init__-checkpoint.py +0 -0
lib/.ipynb_checkpoints/get_data-checkpoint.py +74 -0
lib/.ipynb_checkpoints/study_classif-checkpoint.py +787 -0
lib/.ipynb_checkpoints/transform_data-checkpoint.py +83 -0
lib/.ipynb_checkpoints/vis_data-checkpoint.py +287 -0
lib/__init__.py +0 -0
lib/__pycache__/__init__.cpython-310.pyc +0 -0
lib/__pycache__/get_data.cpython-310.pyc +0 -0
lib/__pycache__/study_class.cpython-310.pyc +0 -0
lib/__pycache__/study_classif.cpython-310.pyc +0 -0
lib/__pycache__/transform_data.cpython-310.pyc +0 -0
lib/__pycache__/vis_data.cpython-310.pyc +0 -0
lib/study_classif.py +787 -0
lib/transform_data.py +83 -0
lib/vis_data.py +287 -0
requirements.txt +77 -0
study.pkl +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,555 @@

+import pandas as pd
+import numpy as np
+import streamlit as st
+import plotly.express as px
+from scipy import stats
+import pickle
+import shap
+import lightgbm as lgb
+from lightgbm import LGBMClassifier
+# from sklearn.ensemble import HistGradientBoostingClassifier
+########################
+### Helper functions ###
+########################
+@st.cache_data
+def get_data(filename):
+    """
+    Read dataframe from CSV
+    """
+    return pd.read_csv(filename)
+#############
+### Setup ###
+#############
+# Load dataframes
+crashes = get_data('crashes.csv')
+cyclists = get_data('cyclists.csv')
+# Load in prepared labeling data for app components
+with open('app_data.pickle', 'rb') as file:
+    period_data, cohort_data, time_cat_data, time_bin_data,\
+    geo_data, county_data, feature_names, ord_features,\
+    cat_features,flag_features,model_cat_data,\
+    model_bin_data,veh_data = pickle.load(file)
+features = cat_features+flag_features+ord_features
+features.sort(key=lambda x:feature_names[x].lower())
+# Load trained classifier study object
+@st.cache_resource(show_spinner=False)
+def load_study():
+    """
+    Load the trained classifier pipeline
+    """
+    with open('study.pkl', 'rb') as file:
+        study = pickle.load(file)
+    return study
+################################
+### Initialize app structure ###
+################################
+st.header('BikeSaferPA: understanding cyclist outcomes')
+tabs = st.tabs([
+    'Welcome',
+    'Crashes over time',
+    'Mapping crashes',
+    'Feature distributions',
+    'BikeSaferPA predictions',
+])
+with tabs[0]:
+    intro_container = st.container()
+with tabs[1]:
+    time_intro_container = st.container()
+    time_settings_container = st.container()
+    time_plot_container = st.container()
+with tabs[2]:
+    map_intro_container = st.container()
+    map_settings_container = st.container()
+    map_plot_container = st.container()
+with tabs[3]:
+    feature_intro_container = st.container()
+    feature_settings_container = st.container()
+    feature_plot_container = st.container()
+with tabs[4]:
+    model_intro_container = st.container()
+    model_settings_container = st.container()
+    model_result_container = st.container()
+    model_shap_container = st.container()
+############################
+### Populate welcome tab ###
+############################
+with intro_container:
+    st.markdown(
+"""
+This app provides a suite of tools to accompany Eamonn Tweedy's [BikeSaferPA project](https://github.com/e-tweedy/BikeSaferPA). These tools allow the user to:
+- Visualize data related to crashes involving bicycles in Pennsylvania during the years 2002-2021, which was collected from a publically available [PENNDOT crash dataset](https://pennshare.maps.arcgis.com/apps/webappviewer/index.html?id=8fdbf046e36e41649bbfd9d7dd7c7e7e).
+- Experiment with the BikeSaferPA model, which was trained on this cyclist crash data and designed to predict severity outcomes for cyclists based on crash data.
+Navigate the tabs using the menu at the top to try them out.
+    """)
+######################################
+### Populate crashes over time tab ###
+######################################
+### Intro text ###
+with time_intro_container:
+    st.subheader('Visualizing bicycle crashes in PA over time')
+    st.markdown("""
+This tool provides plots of cyclist crash counts by year, month of the year, day of the week, or hour of the day and can stratify the counts by various crash features.
+You also have the option to restrict to Philadelpha county only, or the PA counties in the greater Philadelphia area (Bucks, Chester, Delaware, Montgomery, and Philadelphia).
+Expand the toolbox below to choose plot options.
+    """)
+### User input - settings for plot ###
+with time_settings_container:
+    # Expander containing plot option user input
+    with st.expander('Click here to expand or collapse plot options menu'):
+        col1,col2 = st.columns([0.4,0.6])
+        with col1:
+            # Geographic restriction selectbox
+            geo = st.selectbox(
+                'Geographic scope:',
+                list(geo_data.keys()),index=0,
+                format_func = lambda x:geo_data[x][0],
+                key = 'time_geo_select',
+            )
+            # Time period selectbox
+            period = st.selectbox(
+                'Time period:',
+                list(period_data.keys()),index=3,
+                format_func = lambda x:period_data[x][0],
+                key = 'time_period_select',
+            )
+        with col2:
+            # Cyclist cohort selectbox
+            cohort = st.selectbox(
+                'Crash severity:',
+                list(cohort_data.keys()),index=0,
+                format_func = lambda x:cohort_data[x],
+                key = 'time_cohort_select',
+            )
+            # Category stratification selectbox
+            stratify = st.selectbox('Stratify crashes by:',
+                                    ['no']+list(time_cat_data.keys()),index=0,
+                                    key = 'time_cat_stratify_select',
+                                    format_func = lambda x:time_cat_data[x][0]\
+                                        if x!='no' else 'do not stratify',
+                                   )
+        st.markdown('Restrict to crashes containing the following factor(s):')
+        title_add = ''
+        cols = st.columns(len(time_bin_data))
+        # Columns of binary feature checkboxes
+        for k,col in enumerate(cols):
+            with col:
+                for feat in time_bin_data[k]:
+                    # make checkbox
+                    time_bin_data[k][feat][2]=st.checkbox(time_bin_data[k][feat][0],key=f'time_{feat}')
+                    # if checked, filter samples and add feature to plot title addendum
+                    if time_bin_data[k][feat][2]:
+                        crashes = crashes[crashes[time_bin_data[k][feat][1]]==1]
+                        title_add+= ', '+time_bin_data[k][feat][0].split('one ')[-1]
+### Post-process user-selected setting data ###
+# Geographic restriction
+if geo != 'statewide':
+    crashes = crashes[crashes.COUNTY.isin(geo_data[geo][1])]
+# Relegate rare categories to 'other' for plot readability
+if stratify=='int_type':
+    crashes['INTERSECT_TYPE']=crashes['INTERSECT_TYPE']\
+    .replace({cat:'other' for cat in crashes.INTERSECT_TYPE.value_counts().index[3:]})
+if stratify=='coll_type':
+    crashes['COLLISION_TYPE']=crashes['COLLISION_TYPE']\
+    .replace({cat:'other' for cat in crashes.COLLISION_TYPE.value_counts().index[6:]})
+if stratify=='weather':
+    crashes['WEATHER']=crashes['WEATHER']\
+    .replace({cat:'other' for cat in crashes.WEATHER.value_counts().index[5:]})
+if stratify=='tcd':
+    crashes['TCD_TYPE']=crashes['TCD_TYPE']\
+    .replace({cat:'other' for cat in crashes.TCD_TYPE.value_counts().index[3:]})
+crashes=crashes.dropna(subset=period_data[period][1])
+# Order categories in descending order by frequency
+category_orders = {time_cat_data[cat][1]:list(crashes[time_cat_data[cat][1]].value_counts().index) for cat in time_cat_data}
+# Define cohort
+if cohort == 'inj':
+    crashes = crashes[crashes.BICYCLE_SUSP_SERIOUS_INJ_COUNT > 0]
+elif cohort == 'fat':
+    crashes = crashes[crashes.BICYCLE_DEATH_COUNT > 0]
+# Replace day,month numbers with string labels
+if period in ['day','month']:
+    crashes[period_data[period][1]] = crashes[period_data[period][1]].apply(lambda x:period_data[period][2][x-1])
+# Plot title addendum
+if len(title_add)>0:
+    title_add = '<br>with'+title_add.lstrip(',')
+# Category stratification plot settings
+if stratify=='no':
+    color,legend_title = None,None
+else:
+    color,legend_title=time_cat_data[stratify][1],time_cat_data[stratify][2]
+    title_add += f'<br>stratified {time_cat_data[stratify][0]}'
+### Build and display plot ###
+with time_plot_container:
+    # Plot samples if any, else report no samples remain
+    if crashes.shape[0]>0:
+        fig = px.histogram(crashes,
+                           x=period_data[period][1],
+                           color=color,
+                           nbins=len(period_data[period][2]),
+                           title=f'PA bicycle crashes 2002-2021 by {period_data[period][0]} - {cohort_data[cohort]}'+title_add,
+                           category_orders = category_orders,
+                          )
+        fig.update_layout(bargap=0.2,
+                          xaxis_title=period_data[period][0],
+                          legend_title_text=legend_title,
+                         )
+        fig.update_xaxes(categoryorder="array",
+                         categoryarray=period_data[period][2],
+                         dtick=1,
+                        )
+        st.plotly_chart(fig,use_container_width=True)
+    else:
+        st.markdown('#### No samples meet these criteria. Please remove some factors.')
+####################################
+### Populate mapping crashes tab ###
+####################################
+### Intro text ###
+with map_intro_container:
+    st.subheader('Mapping bicycle crashes in PA')
+    st.markdown("""
+This tool provides interactive maps of crash events, either statewide or in one of the more populous counties.  Crash event dots are color-coded based on whether the crash involved serious cyclist injury, cyclist fatality, or neither.
+Expand the menu below to adjust map options.
+    """)
+### User input - settings for map plot ###
+with map_settings_container:
+    # Expander containing plot option user input
+    with st.expander('Click here to expand or collapse map options menu'):
+        # Locale selectbox
+        geo = st.selectbox(
+            'Select either statewide or a particular county to plot:',
+            ['Statewide']+[county+' County' for county in county_data],
+            key = 'map_geo_select',
+        )
+        # Animation status selectbox
+        animate = st.selectbox(
+            'Select how to animate the map:',
+            ['do not animate','by year','by month'],
+            key = 'map_animate_select',
+        )
+### Post-process user-selected setting data ###
+if geo == 'Statewide':
+    county = None
+else:
+    geo = geo.split(' ')[0]
+    county = (county_data[geo],geo)
+color_dots=True
+if animate == 'do not animate':
+    animate = False
+    animate_by=None
+else:
+    animate_by = animate.split(' ')[1]
+    animate = True
+    # If county is not None and animating, check whether first frame has all
+    # injury/fatality status categories.  If not, then we will not color dots
+    # by injury/fatality status.
+    # This is to account for bug/feature in plotly 'animation_frame' and 'color' functionality
+    # which yields unexpected results when all color categories not present in first frame
+    # see e.g. https://github.com/plotly/plotly.py/issues/2259
+    if county is not None:
+        if animate_by == 'year':
+            color_dots = len(crashes.query('COUNTY==@county[0] and CRASH_YEAR==2002')\
+                       .BICYCLE_DEATH_COUNT.unique())+\
+                        len(crashes.query('COUNTY==@county[0] and CRASH_YEAR==2002')\
+                       .BICYCLE_SUSP_SERIOUS_INJ_COUNT.unique()) > 3
+        else:
+            color_dots = len(crashes.query('COUNTY==@county[0] and CRASH_YEAR==2002 and CRASH_MONTH==1')\
+                       .BICYCLE_DEATH_COUNT.unique())+\
+                       len(crashes.query('COUNTY==@county[0] and CRASH_YEAR==2002 and CRASH_MONTH==1')\
+                       .BICYCLE_SUSP_SERIOUS_INJ_COUNT.unique()) > 3
+if color_dots==False:
+    st.markdown("""
+    **Warning:** color-coding by injury/death status is disabled; this feature gives unexpected results
+    when not all classes appear in the first animation frame due to bug/feature in Plotly animate functionality.
+    Injury/death status is still visible in hover-text box.
+    """)
+### Build and display map plot ###
+from lib.vis_data import plot_map
+with map_plot_container:
+    fig = plot_map(
+        df=crashes,county=county,animate=animate,
+        color_dots=color_dots,animate_by=animate_by,
+        show_fig=False,return_fig=True,
+    )
+    st.plotly_chart(fig,use_container_width=True)
+##########################################
+### Populate feature distributions tab ###
+##########################################
+### Intro text ###
+with feature_intro_container:
+    st.subheader('Visualizing crash feature distributions')
+    st.markdown("""
+The tools on this page will demonstrate how distributions of values of various crash and cyclist features vary between two groups:
+- all cyclists involved in crashes, and
+- those cyclists who suffered serious injury or fatality
+Expand the following menu to choose a feature, and the graph will show its distribution of its values (via percentages) over the two groups.  Again you may restrict to Philadelpha county only, or the PA counties in the greater Philadelphia area (Bucks, Chester, Delaware, Montgomery, and Philadelphia).
+Pay particular attention to feature values which become more or less prevalent among cyclists suffering serious injury or death - for instance, 6.2% of all cyclists statewide were involved in a head-on collision, whereas 11.8% of those with serious injury or fatality were in a head-on collision.
+    """)
+### User input - settings for plot ###
+with feature_settings_container:
+    # Expander containing plot option user input
+    with st.expander('Click here to expand or collapse feature selection menu'):
+        # Geographic restriction selectbox
+        geo = st.selectbox(
+            'Geographic scope:',
+            list(geo_data.keys()),index=0,
+            format_func = lambda x:geo_data[x][0],
+            key = 'feature_geo_select',
+        )
+        # Feature selectbox
+        feature = st.selectbox('Show distributions of this feature:',
+                               features,format_func = lambda x:feature_names[x],
+                               key = 'feature_select',
+                              )
+### Post-process user-selected settings data ###
+from lib.vis_data import feat_perc,feat_perc_bar
+# Geographic restriction
+if geo != 'statewide':
+    cyclists = cyclists[cyclists.COUNTY.isin(geo_data[geo][1])]
+# Recast binary and day of week data
+if feature not in ord_features:
+    cyclists[feature]=cyclists[feature].replace({1:'yes',0:'no'})
+if feature == 'DAY_OF_WEEK':
+    cyclists[feature]=cyclists[feature].astype(str)
+### Build and display plot ###
+with feature_plot_container:
+    # Generate plot
+    sort = False if feature in ord_features else True
+    fig = feat_perc_bar(
+        feature,cyclists, feat_name=feature_names[feature],
+        return_fig=True,show_fig=False,sort=sort
+    )
+    # Adjust some colorscale and display settings
+    if feature == 'SPEED_LIMIT':
+        fig.update_coloraxes(colorscale='YlOrRd',cmid=35)
+    if feature == 'HOUR_OF_DAY':
+        fig.update_coloraxes(colorscale='balance')
+    if feature == 'DAY_OF_WEEK':
+        days = ['Sun']+list(cal.day_abbr)[:-1]
+        for idx, day in enumerate(days):
+            fig.data[idx].name = day
+            fig.data[idx].hovertemplate = day
+    # Display plot
+    st.plotly_chart(fig,use_container_width=True)
+    st.markdown('See [this Jupyter notebook](https://e-tweedy.github.io/2_BikeSaferPA_vis.html) for an in-depth data exploration and visualization process.')
+######################################
+### Populate model predictions tab ###
+######################################
+# This section collects a hypothetical cyclist crash sample from user inputs,
+# reports the predicted probability from the trained study object, and shows
+# a SHAP force plot for that single prediction.
+# NOTE(review): ClassifierStudy is not referenced by name in this section; the
+# import is presumably needed so that the pickled study can be loaded - confirm
+from lib.study_classif import ClassifierStudy
+### Intro text ###
+with model_intro_container:
+    st.subheader('Predicting cyclist outcome with BikeSaferPA')
+    st.markdown("""
+An instance of the BikeSaferPA predictive model has been trained in advance on all cyclist samples in the PENNDOT dataset.  This model is a gradient-boosted decision tree classifier model, and the model selection and evaluation process is covered in detail in [this Jupyter notebook](https://e-tweedy.github.io/3_BikeSaferPA_models.html).
+The purpose of this tool is to allow the user to simulate a model prediction on a hypothetical sample, and then explain the model's prediction using SHAP values.
+Expand the following sections to adjust the factors in a hypothetical cyclist crash, and the model will provide a predicted probability that the cyclist involved suffers serious injury or fatality.  You'll find that some factors influence the prediction significantly, and others very little.
+    """)
+### User inputs for model prediction ###
+# Load the trained classifier study object
+study = load_study()
+# Initialize input sample.  User inputs will update values.
+# The columns are the input feature names recorded by the 'col' step of the study's pipeline
+sample = pd.DataFrame(columns = study.pipe['col'].feature_names_in_)
+with model_settings_container:
+    # Expander for numerical inputs
+    with st.expander('Click here to expand or collapse numerical features'):
+        cols = st.columns(3)
+        with cols[0]:
+            sample.loc[0,'AGE'] = st.number_input('Cyclist age (yrs):',
+                            min_value=0,step=1,value=30)
+            sample.loc[0,'SPEED_LIMIT'] = st.number_input('Posted speed limit (mph):',
+                        min_value=0,max_value=100,step=5,value=25)
+            # No explicit starting value is passed for the crash year
+            sample.loc[0,'CRASH_YEAR'] = st.number_input('Year crash took place:',
+                        min_value=2002,max_value=2023,step=1)
+        # Vehicle count inputs; veh_data[k] supplies (display name, column prefix)
+        with cols[1]:
+            for k in [0,1,2]:
+                sample.loc[0,f'{veh_data[k][1]}_COUNT']=st.number_input(
+                    f'# {veh_data[k][0]}s involved:',
+                    min_value=0,step=1,max_value=3
+                )
+        with cols[2]:
+            for k in [3,4]:
+                sample.loc[0,f'{veh_data[k][1]}_COUNT']=st.number_input(
+                    f'# {veh_data[k][0]}s involved:',
+                    min_value=0,step=1,max_value=3
+                )
+    # Expander for categorical inputs
+    # Options come from model_cat_data; format_func only changes the displayed label
+    with st.expander('Click here to expand or collapse categorical features'):
+        cols = st.columns(3)
+        with cols[0]:
+            sample.loc[0,'ILLUMINATION'] = st.selectbox(
+                'Illumination status:',
+                model_cat_data['ILLUMINATION'],
+                format_func= lambda x:x.replace('_',' and '),
+            )
+            sample.loc[0,'URBAN_RURAL'] = st.selectbox(
+                'Collision setting:',
+                model_cat_data['URBAN_RURAL'],
+            )
+            sample.loc[0,'TCD_TYPE'] = st.selectbox(
+                'Traffic control device:',
+                model_cat_data['TCD_TYPE'],
+                format_func= lambda x:x.replace('_',' '),
+            )
+        with cols[1]:
+            sample.loc[0,'VEH_ROLE'] = st.selectbox(
+                'Bicycle role in collision:',
+                model_cat_data['VEH_ROLE'],
+                format_func= lambda x:x.replace('_',' and '),
+            )
+            sample.loc[0,'IMPACT_SIDE'] = st.selectbox(
+                'Bicycle impact side:',
+                model_cat_data['IMPACT_SIDE'],
+                format_func= lambda x:x.replace('_',' '),
+            )
+            sample.loc[0,'GRADE'] = st.selectbox(
+                'Roadway grade:',
+                model_cat_data['GRADE'],
+                format_func= lambda x:x.replace('_',' '),
+            )
+        with cols[2]:
+            sample.loc[0,'RESTRAINT_HELMET'] = st.selectbox(
+                'Cyclist helmet status:',
+                model_cat_data['RESTRAINT_HELMET'],
+                format_func= lambda x:x.replace('_',' ')\
+                .replace('restraint','helmet'),
+            )
+            sample.loc[0,'COLLISION_TYPE'] = st.selectbox(
+                'Collision type:',
+                model_cat_data['COLLISION_TYPE'],
+                format_func= lambda x:x.replace('_',' ')\
+                .replace('dir','direction'),
+            )
+            # FEMALE is stored as 1/0 and displayed as F/M
+            sample.loc[0,'FEMALE'] = st.selectbox(
+                'Cyclist sex:*',[1,0],
+                format_func = lambda x:'F' if x==1 else 'M',
+            )
+            st.markdown('*Note: the PENNDOT dataset only has a binary sex feature.')
+    # Expander for binary inputs
+    # Each checkbox state is stored as 0/1 in the column named by the entry's second item
+    with st.expander('Click here to expand or collapse binary features'):
+        cols = st.columns(len(model_bin_data))
+        for k,col in enumerate(cols):
+            with col:
+                for feat in model_bin_data[k]:
+                    sample.loc[0,model_bin_data[k][feat][1]]=int(st.checkbox(model_bin_data[k][feat][0],
+                                                                             key=f'model_{feat}'))
+### Model prediction and reporting result ###
+with model_result_container:
+    # Fill these columns arbitrarily - they won't affect inference
+    # COUNTY, MUNICIPALITY, HOUR_OF_DAY, CRASH_MONTH used in pipeline for NaN imputation
+    # This version of model doesn't use temporal features as we set cyc_method=None
+    for feat in ['HOUR_OF_DAY','DAY_OF_WEEK','CRASH_MONTH','COUNTY','MUNICIPALITY']:
+        sample.loc[0,feat]=1
+    # Predict and report result
+    # The call stores its result on study.y_predict_proba, which is read on the next line
+    # NOTE(review): float() assumes y_predict_proba holds a single value - confirm
+    study.predict_proba_pipeline(X_test=sample)
+    st.write(f'**BikeSaferPA predicts a :red[{100*float(study.y_predict_proba):.2f}%] probability that a cyclist suffers serious injury or fatality under these conditions.**')
+### SHAP values ####
+with model_shap_container:
+    st.subheader('SHAP analysis for this hypothetical prediction')
+    st.markdown("""
+SHAP (SHapley Additive exPlainer) values provide an excellent method for assessing how various input features influence a model's predictions.  One significant advantage is that SHAP values are 'model agnostic' - they effectively explain the predictions made by many different types of machine learning classifiers.
+The following 'force plot' shows the influence of each feature's SHAP value on the model's predicted probability that the cyclist suffers serious injury or fatality. A feature with a positive (resp. negative) SHAP value indicates that the feature's value pushes the predicted probability higher (resp. lower), which in the force plot corresponds to a push to the right (resp. left).
+The force plot will update as you adjust input features in the menu above.
+    """)
+    # SHAP will just explain classifier, so need transformed X_train and X_test
+    pipe = study.pipe_fitted
+    # Apply every pipeline step except the last one (the classifier) to the sample
+    sample_trans = pipe[:-1].transform(sample)
+    # # Need masker for linear model
+    # masker = shap.maskers.Independent(data=X_train_trans)
+    # Initialize explainer and compute and store SHAP values as an explainer object
+    explainer = shap.TreeExplainer(pipe[-1], feature_names = pipe['col'].get_feature_names_out())
+    shap_values = explainer(sample_trans)
+    # Wrap the transformed sample in a dataframe labeled with the transformed feature names
+    sample_trans = pd.DataFrame(sample_trans,columns=pipe['col'].get_feature_names_out())
+    # def st_shap(plot, height=None):
+    #     shap_html = f"<head>{shap.getjs()}</head><body>{plot.html()}</body>"
+    #     components.html(shap_html, height=height)
+    # NOTE(review): index 1 presumably selects the positive (serious injury or
+    # fatality) class in both the expected value and the SHAP values - confirm
+    fig=shap.plots.force(explainer.expected_value[1],shap_values.values[0][:,1],sample_trans,
+                         figsize=(20,3),show=False,matplotlib=True)
+    st.pyplot(fig)

app_data.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c8882e6c8ec43e8a4e96724b21f6f1c11347cc9e18317d1a0dbbd5621bd93812
+size 4990

crashes.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

cyclists.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

lib/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

lib/.ipynb_checkpoints/__init__-checkpoint.py ADDED Viewed

File without changes

lib/.ipynb_checkpoints/get_data-checkpoint.py ADDED Viewed

	@@ -0,0 +1,74 @@

+import pandas as pd
+def extract_data(year):
+    """
+    A function for loading data corresponding to an individual
+    year from a CSV file.  Data is then preprocessed and the
+    following dataframes are returned:
+    - 'bicycles': samples are bicycle vehicles which were
+       involved in crashes.
+    - 'persons': samples are all individuals involved in
+      crashes involving bicycles.
+    - 'crashes': samples are crash events involving bicycles.
+    - 'roadway': additional features for
+      crash events, related to roadway attributes and conditions.
+    """
+    # Retrieve vehicle samples corresponding to bicycles.
+    # Note that in some samples VEH_TYPE is string, others float
+    vehicles = pd.read_csv(f'data/raw_csv/VEHICLE_{year}_Statewide.csv',encoding='latin')
+    bicycle_filter = vehicles.VEH_TYPE.isin([20,21,'20','21'])
+    cols = ['CRN', 'GRADE', 'IMPACT_POINT',
+            'RDWY_ALIGNMENT','UNIT_NUM',
+            'VEH_MOVEMENT', 'VEH_POSITION','VEH_ROLE', 'VEH_TYPE']
+    bicycles = vehicles[bicycle_filter][cols]
+    del vehicles
+    # Merge onto bicycles dataframe some additional features from cycle
+    cycles = pd.read_csv(f'data/raw_csv/CYCLE_{year}_Statewide.csv',encoding='latin')
+    cols = ['CRN','UNIT_NUM','PC_HDLGHT_IND', 'PC_HLMT_IND','PC_REAR_RFLTR_IND']
+    bicycles = bicycles.merge(cycles[cols],how='left',on=['CRN','UNIT_NUM'])
+    del cycles
+    # Retrieve information about persons involved in crashes involving bikes
+    # (not just the persons riding the bikes)
+    persons = pd.read_csv(f'data/raw_csv/PERSON_{year}_Statewide.csv',encoding='latin')
+    cols = ['AGE','CRN','INJ_SEVERITY','PERSON_TYPE',
+            'RESTRAINT_HELMET','SEX', 'TRANSPORTED', 'UNIT_NUM']
+    persons = persons[persons.CRN.isin(bicycles.CRN)][cols]
+    # Retrieve crash samples involving bikes
+    crashes = pd.read_csv(f'data/raw_csv/CRASH_{year}_Statewide.csv',encoding='latin')
+    cols = ['CRN','ARRIVAL_TM','DISPATCH_TM','COUNTY','MUNICIPALITY','DEC_LAT','DEC_LONG',
+            'BICYCLE_DEATH_COUNT','BICYCLE_SUSP_SERIOUS_INJ_COUNT',
+            'BUS_COUNT','COMM_VEH_COUNT','HEAVY_TRUCK_COUNT','SMALL_TRUCK_COUNT','SUV_COUNT','VAN_COUNT',
+            'CRASH_MONTH', 'CRASH_YEAR','DAY_OF_WEEK','HOUR_OF_DAY',
+            'COLLISION_TYPE','ILLUMINATION','INTERSECT_TYPE',
+            'LOCATION_TYPE','RELATION_TO_ROAD','TIME_OF_DAY',
+            'ROAD_CONDITION','TCD_TYPE','TCD_FUNC_CD','URBAN_RURAL',
+            'WEATHER1','WEATHER2']
+    crashes = crashes[crashes.CRN.isin(bicycles.CRN)][cols]
+    # Retrieve roadway data involving bikes
+    roadway = pd.read_csv(f'data/raw_csv/ROADWAY_{year}_Statewide.csv',encoding='latin')
+    cols = ['CRN','SPEED_LIMIT','RDWY_COUNTY']
+    roadway = roadway[roadway.CRN.isin(bicycles.CRN)][cols]
+    # Merge onto out bicycle_crashes and ped_crashes dataframe
+    # some additional flag features.
+    # Include flag features corresponding to driver impairment,
+    # driver inattention, other driver attributes,relevant road conditions, etc.
+    flags = pd.read_csv(f'data/raw_csv/FLAG_{year}_Statewide.csv',encoding='latin')
+    cols = ['AGGRESSIVE_DRIVING','ALCOHOL_RELATED','ANGLE_CRASH','CELL_PHONE','COMM_VEHICLE',
+            'CRN','CROSS_MEDIAN','CURVED_ROAD','CURVE_DVR_ERROR','DISTRACTED','DRINKING_DRIVER',
+            'DRUGGED_DRIVER','DRUG_RELATED','FATIGUE_ASLEEP','HO_OPPDIR_SDSWP','ICY_ROAD',
+            'ILLUMINATION_DARK','IMPAIRED_DRIVER','INTERSECTION','LANE_DEPARTURE',
+            'NHTSA_AGG_DRIVING','NO_CLEARANCE',
+            'NON_INTERSECTION','REAR_END','RUNNING_RED_LT','RUNNING_STOP_SIGN',
+            'RURAL','SNOW_SLUSH_ROAD','SPEEDING','SPEEDING_RELATED',
+            'SUDDEN_DEER','TAILGATING','URBAN','WET_ROAD','WORK_ZONE',
+            'MATURE_DRIVER','YOUNG_DRIVER']
+    crashes = crashes.merge(flags[cols],how='left',on='CRN')
+    del flags
+    return bicycles, persons, crashes, roadway

lib/.ipynb_checkpoints/study_classif-checkpoint.py ADDED Viewed

	@@ -0,0 +1,787 @@

+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import shap
+from sklearn.feature_selection import chi2, SelectKBest, mutual_info_classif, f_classif
+from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, f1_score, fbeta_score, roc_auc_score
+from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report, precision_recall_curve
+from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, RandomizedSearchCV, StratifiedKFold
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, SplineTransformer, PolynomialFeatures
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
+# from lightgbm import LGBMClassifier
+from sklearn.base import BaseEstimator, TransformerMixin, clone
+from sklearn.utils.validation import check_is_fitted
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.compose import ColumnTransformer, make_column_transformer
+from lib.transform_data import *
+class ClassifierStudy():
+    """
+    A class that contains tools for studying a classifier pipeline
+    Parameters:
+    -----------
+    classifier : a scikit-learn compatible binary classifier
+    X : pd.DataFrame
+        dataframe of features
+    y : pd.Series
+        series of binary target values corresponding to X
+    classifier_name : str or None
+        if provided, will use as classifier name in pipeline
+        if not, will use 'clf' as name
+    features : dict
+        a dictionary whose keys are the feature types
+        'cyc','cat','ord','num','bin' and whose values
+        are lists of features of each type.
+    Methods:
+    -------
+    set_data, set_features, set_state
+        sets or resets attributes of self
+    build_pipeline
+        builds out pipeline based on supplied specs
+    cv_score
+        runs k-fold cross validation and reports scores
+    randomized_search
+        runs randomized search with cross validation
+        and reports results
+    fit_pipeline
+        fits the model pipeline and stores as
+        self.pipe_fitted
+    predict_proba_pipeline
+        uses a fitted pipeline to compute predicted
+        probabilities for test or validation set
+    score_pipeline
+        scores predicted probabilities
+    """
+    def __init__(self, classifier=None, X = None, y = None,
+                 features = None,classifier_name = None,
+                 random_state=42):
+        self.classifier = classifier
+        if X is not None:
+            self.X = X.copy()
+        if y is not None:
+            self.y = y.copy()
+        if features is not None:
+            self.features = features.copy()
+        self.random_state=random_state
+        self.pipe, self.pipe_fitted = None, None
+        self.classifier_name = classifier_name
+        self.X_val, self.y_val = None, None
+        self.y_predict_proba = None
+        self.best_params, self.best_n_components = None, None
+        self.shap_vals = None
+    def set_data(self,X=None,y=None):
+        """Method to set or reset feature and/or target data"""
+        if X is not None:
+            self.X = X.copy()
+        if y is not None:
+            self.y = y.copy()
+    def set_features(self,features):
+        """Method to set or reset the feature dictionary"""
+        if features is not None:
+            self.features = features.copy()
+    def set_state(self,random_state):
+        """Method to set or reset the random_state"""
+        self.random_state = random_state
+    def build_pipeline(self, cat_method = 'onehot',cyc_method = 'spline',num_ss=True,
+                       over_sample = False, pca=False,n_components=None,
+                       select_features = False,score_func=None,k='all',
+                       poly_features = False, degree=2, interaction_only=False):
+        """
+        Method to build the model pipeline
+        Parameters:
+        -----------
+        cat_method : str
+            specifies whether to encode categorical
+            variables as one-hot vectors or ordinals
+            must be either 'onehot' or 'ord'
+        cyc_method : str
+            specifies whether to encode cyclical features
+            with sine/cosine encoding or periodic splines
+            must be one of 'trig', 'spline', 'interact-trig',
+            'interact-spline','onehot', 'ord', or None
+            - If 'trig' or 'spline', will set up periodic encoder
+              with desired method
+            - If 'onehot' or 'ord', will set up appropriate
+              categorical encoder
+            - If 'interact-{method}', will use <method> encoding for HOUR_OF_DAY,
+              encode DAY_OF_WEEK as a binary feature expressing whether
+              the day is a weekend day, and then include interaction
+              features among this set via PolynomialFeatures.
+            - If None, will leave out cyclical features altogether
+        num_ss : bool
+            Whether or not to apply StandardScaler on the numerical features
+        over_sample : bool
+            set to True to include imblearn.over_sampling.RandomOverSampler step
+        pca : bool
+            set to True to include sklearn.decomposition.PCA step
+        n_components : int or None
+            number of components for sklearn.decomposition.PCA
+        select_features : bool
+            set to True to include sklearn.feature_selection.SelectKBest step
+        score_func : callable
+            score function to use for sklearn.feature_selection.SelectKBest
+            recommended: chi2, f_classif, or mutual_info_classif
+        k : int or 'all'
+            number of features for sklearn.feature_selection.SelectKBest
+        poly_features : bool
+            set to True to include sklearn.preprocessing.PolynomialFeatures step
+        degree : int
+            max degree for sklearn.preprocessing.PolynomialFeatures
+        interaction_only : bool
+            whether or not sklearn.preprocessing.PolynomialFeatures will be limited
+            to interaction terms only
+        """
+        # Define transformer for categorical features
+        if cat_method == 'onehot':
+            cat_encoder = ('ohe',OneHotEncoder(handle_unknown='infrequent_if_exist'))
+        elif cat_method == 'ord':
+            cat_encoder = ('oe',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))
+        else:
+            raise ValueError("cat_method must be either 'onehot' or 'ord'")
+        cat_transform = Pipeline([('si',SimpleImputer(strategy='most_frequent')),cat_encoder])
+        # Define transformer for cyclic features
+        cyc_dict = {'HOUR_OF_DAY':24,'DAY_OF_WEEK':7}
+        if cyc_method == 'trig':
+            cyc_transform = [(f'{feat}_cos',cos_transformer(cyc_dict[feat]),[feat]) for feat in self.features['cyc']]+\
+                        [(f'{feat}_sin',sin_transformer(cyc_dict[feat]),[feat]) for feat in self.features['cyc']]
+        elif cyc_method =='spline':
+            cyc_transform = [(f'{feat}_cyclic',
+                          periodic_spline_transformer(cyc_dict[feat],n_splines=cyc_dict[feat]//2),
+                          [feat]) for feat in self.features['cyc']]
+        elif cyc_method == 'onehot':
+            cyc_encoder = ('ohe_cyc',OneHotEncoder(handle_unknown='infrequent_if_exist'))
+            cyc_transform = [('cyc',Pipeline([cyc_encoder]),self.features['cyc'])]
+        elif cyc_method == 'ord':
+            cyc_encoder = ('oe_cyc',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))
+            cyc_transform = [('cyc',Pipeline([cyc_encoder]),self.features['cyc'])]
+        elif cyc_method == 'interact-spline':
+            hour_transform = (f'hour_cyc',periodic_spline_transformer(cyc_dict['HOUR_OF_DAY'],n_splines=12),['HOUR_OF_DAY'])
+            wkend_transform = ('wkend',FunctionTransformer(lambda x: (x.isin([1,7])).astype(int)),['DAY_OF_WEEK'])
+            cyc_transform = [('cyc',Pipeline([('cyc_col',ColumnTransformer([hour_transform, wkend_transform],
+                                                               remainder='drop',verbose_feature_names_out=False)),
+                                              ('cyc_poly',PolynomialFeatures(degree=2,interaction_only=True,
+                                                                include_bias=False))]),
+                             self.features['cyc'])]
+        elif cyc_method == 'interact-trig':
+            hour_transform = [(f'HOUR_cos',cos_transformer(cyc_dict['HOUR_OF_DAY']),['HOUR_OF_DAY']),
+                              (f'HOUR_sin',sin_transformer(cyc_dict['HOUR_OF_DAY']),['HOUR_OF_DAY'])]
+            wkend_transform = ('wkend',FunctionTransformer(lambda x: (x.isin([1,7])).astype(int)),['DAY_OF_WEEK'])
+            cyc_transform = [('cyc',Pipeline([('cyc_col',ColumnTransformer(hour_transform+[wkend_transform],
+                                                               remainder='drop',verbose_feature_names_out=False)),
+                                              ('cyc_poly',PolynomialFeatures(degree=2,interaction_only=True,
+                                                                include_bias=False))]),
+                             self.features['cyc'])]
+        elif cyc_method is None:
+            cyc_transform = [('cyc','passthrough',[])]
+        else:
+            raise ValueError("cyc_method must be one of 'trig','spline','interact','onehot','ord',or None")
+        # Define numerical transform
+        num_transform = ('num',StandardScaler(),self.features['num']) if num_ss else\
+                        ('num','passthrough',self.features['num'])
+        # Define column transformer
+        col_transform = ColumnTransformer([('cat',cat_transform,self.features['cat']),
+                                           ('ord','passthrough',self.features['ord']),
+                                           num_transform,
+                                           ('bin',SimpleImputer(strategy='most_frequent'),
+                                             self.features['bin'])]+\
+                                           cyc_transform,
+                                           remainder='drop',verbose_feature_names_out=False)
+        steps = [('col',col_transform)]
+        if 'AGE' in self.features['num']:
+            steps.insert(0,('gi_age',GroupImputer(target = 'AGE', group_cols=['COUNTY'],strategy='median')))
+        if 'HOUR_OF_DAY' in self.features['cyc']:
+            steps.insert(0,('gi_hour',GroupImputer(target = 'HOUR_OF_DAY', group_cols=['ILLUMINATION','CRASH_MONTH'],strategy='mode')))
+        # Insert optional steps as needed
+        if over_sample:
+            steps.insert(0,('os',RandomOverSampler(random_state=self.random_state)))
+        if poly_features:
+            steps.append(('pf',PolynomialFeatures(degree=degree,interaction_only=interaction_only)))
+        if select_features:
+            steps.append(('fs',SelectKBest(score_func = score_func, k = k)))
+        if pca:
+            steps.append(('pca',PCA(n_components=n_components,random_state=self.random_state)))
+        # Append classifier if provided
+        if self.classifier is not None:
+            if self.classifier_name is not None:
+                steps.append((f'{self.classifier_name}_clf',self.classifier))
+            else:
+                steps.append(('clf',self.classifier))
+        # Initialize pipeline
+        self.pipe = Pipeline(steps)
+    def cv_score(self, scoring = 'roc_auc', n_splits = 5, n_repeats=3, thresh = 0.5, beta = 1,
+                 return_mean_score=False,print_mean_score=True,print_scores=False, n_jobs=-1,
+                eval_size=0.1,eval_metric='auc'):
+        """
+        Method for performing cross validation via RepeatedStratifiedKFold
+        Parameters:
+        -----------
+        scoring : str
+            scoring function to use.  must be one of
+            'roc_auc','acc','f1','','f1w'
+        thresh : float
+            the classification threshold for computing y_pred
+            from y_pred_proba
+        beta : float
+            the beta-value to use in the f_beta score, if chosen
+        n_splits, n_repeats : int, int
+            number of splits and number of repeat iterations
+            for sklearn.model_selection.RepeatedStratifiedKFold
+        return_mean_score : bool
+            whether or not to return the mean score
+        print_mean_score : bool
+            whether to print out a report of the mean score
+        print_scores : bool
+            whether to print out a report of CV scores for all folds
+        n_jobs : int or None
+            number of CPU cores to use for parallel processing
+            -1 uses all available cores, and None defaults to 1
+        eval_size : float
+            Fraction of the training set to use for early stopping eval set
+        eval_metric : str
+            eval metric to use in early stopping
+        Returns: None or mean_score, depending on return_mean_score setting
+        --------
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        assert scoring in ['roc_auc','acc','f1','fb','f1w'],"scoring must be one of 'roc_auc','acc','f1','fb','f1w'"
+        # Initialize CV iterator
+        kf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats=n_repeats,
+                                     random_state=self.random_state)
+        # Restrict to features supplied in self.features
+        X = self.X[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        lgb_es=False
+        # if isinstance(self.pipe[-1],LGBMClassifier):
+        #     if 'early_stopping_round' in self.pipe[-1].get_params():
+        #         if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
+        #             lgb_es=True
+        scores = []
+        # Iterate over folds and train, predict, score
+        for i,(train_idx,test_idx) in enumerate(kf.split(X,self.y)):
+            fold_X_train = X.iloc[train_idx,:]
+            fold_X_test = X.iloc[test_idx,:]
+            fold_y_train = self.y.iloc[train_idx]
+            fold_y_test = self.y.iloc[test_idx]
+            pipe=clone(self.pipe)
+            if lgb_es:
+                fold_X_train,fold_X_es,fold_y_train,fold_y_es = train_test_split(fold_X_train,fold_y_train,
+                                                                                 stratify=fold_y_train,test_size=eval_size,
+                                                                                 random_state=self.random_state)
+                trans_pipe = pipe[:-1]
+                trans_pipe.fit_transform(fold_X_train)
+                fold_X_es = trans_pipe.transform(fold_X_es)
+                clf_name = pipe.steps[-1][0]
+                fit_params = {f'{clf_name}__eval_set':[(fold_X_es,fold_y_es)],
+                              f'{clf_name}__eval_metric':eval_metric,
+                              f'{clf_name}__verbose':0}
+            else:
+                fit_params = {}
+            pipe.fit(fold_X_train,fold_y_train,**fit_params)
+            fold_y_pred_proba = pipe.predict_proba(fold_X_test)[:,1]
+            if scoring == 'roc_auc':
+                fold_score = roc_auc_score(fold_y_test, fold_y_pred_proba)
+            else:
+                fold_y_pred = (fold_y_pred_proba >= thresh).astype('int')
+                if scoring == 'acc':
+                    fold_score = accuracy_score(fold_y_test,fold_y_pred)
+                elif scoring == 'f1':
+                    fold_score = f1_score(fold_y_test,fold_y_pred)
+                elif scoring == 'f1w':
+                    fold_score = f1_score(fold_y_test,fold_y_pred,average='weighted')
+                else:
+                    fold_score = fbeta_score(fold_y_test,fold_y_pred,beta=beta)
+            scores.append(fold_score)
+        # Average and report
+        mean_score = np.mean(scores)
+        if print_scores:
+            print(f'CV scores using {scoring} score: {scores} \nMean score: {mean_score}')
+        if print_mean_score:
+            print(f'Mean CV {scoring} score: {mean_score}')
+        if return_mean_score:
+            return mean_score
+    def randomized_search(self, params, n_components = None, n_iter=10,
+                          scoring='roc_auc',cv=5,refit=False,top_n=10, n_jobs=-1):
+        """
+        Method for performing randomized search with cross validation on a given dictionary of parameter distributions
+        Also displays a table of results the best top_n iterations
+        Parameters:
+        ----------
+        params : dict
+            parameter distributions to use for RandomizedSearchCV
+        n_components : int, or list, or None
+            number of components for sklearn.decomposition.PCA
+            - if int, will reset the PCA layer in self.pipe with provided value
+            - if list, must be list of ints, which will be included in
+              RandomizedSearchCV parameter distribution
+        scoring : str
+            scoring function for sklearn.model_selection.cross_val_score
+        n_iter : int
+            number of iterations to use in RandomizedSearchCV
+        refit : bool
+            whether to refit a final classifier with best parameters
+            - if False, will only set self.best_params and self.best_score
+            - if True, will set self.best_estimator in addition
+        top_n : int or None
+            if int, will display results from top_n best iterations only
+            if None, will display all results
+        n_jobs : int or None
+            number of CPU cores to use for parallel processing
+            -1 uses all available cores, and None defaults to 1
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        assert (n_components is None)|('pca' in self.pipe.named_steps), 'Your pipeline has no PCA step.  Build a pipeline with PCA first.'
+        assert (len(params)>0)|(type(n_components)==list), 'Either pass a parameter distribution or a list of n_components values.'
+        # Add estimator name prefix to hyperparams
+        params = {self.pipe.steps[-1][0]+'__'+key:params[key] for key in params}
+        # Process supplied n_components
+        if type(n_components)==list:
+            params['pca__n_components']=n_components
+        elif type(n_components)==int:
+            self.pipe['pca'].set_params(n_components=n_components)
+        # Restrict to features supplied in self.features
+        X = self.X[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # Initialize rs and fit
+        rs = RandomizedSearchCV(self.pipe, param_distributions = params,
+                                n_iter=n_iter, scoring = scoring, cv = cv,refit=refit,
+                                random_state=self.random_state, n_jobs=n_jobs)
+        rs.fit(X,self.y)
+        # Display top n scores
+        results = rs.cv_results_
+        results_df = pd.DataFrame(results['params'])
+        param_names = list(results_df.columns)
+        results_df[f'mean cv score ({scoring})']=pd.Series(results['mean_test_score'])
+        results_df = results_df.set_index(param_names).sort_values(by=f'mean cv score ({scoring})',ascending=False)
+        if top_n is not None:
+            display(results_df.head(top_n).style\
+                    .highlight_max(axis=0, props='color:white; font-weight:bold; background-color:seagreen;'))
+        else:
+            display(results_df.style\
+                    .highlight_max(axis=0, props='color:white; font-weight:bold; background-color:seagreen;'))
+        if refit:
+            self.best_estimator = rs.best_estimator_
+        best_params = rs.best_params_
+        self.best_params = {key.split('__')[-1]:best_params[key] for key in best_params if key.split('__')[0]!='pca'}
+        self.best_n_components = next((best_params[key] for key in best_params if key.split('__')[0]=='pca'), None)
+        self.best_score = rs.best_score_
+    def fit_pipeline(self,split_first=False, eval_size=0.1,eval_metric='auc'):
+        """
+        Method for fitting self.pipeline on self.X,self.y
+        Parameters:
+        -----------
+        split_first : bool
+            if True, a train_test_split will be performed first
+            and the validation set will be stored
+        early_stopping : bool
+            Indicates whether we will use early_stopping for lightgbm.
+            If true, will split off an eval set prior to k-fold split
+        eval_size : float
+            Fraction of the training set to use for early stopping eval set
+        eval_metric : str
+            eval metric to use in early stopping
+        """
+        # Need pipe and X to fit
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert self.X is not None, 'X does not exist.  First set X.'
+        # If no y provided, then no pipeline steps should require y
+        step_list = [step[0] for step in self.pipe.steps]
+        assert (('clf' not in step_list[-1])&('kf' not in step_list))|(self.y is not None), 'You must provide targets y if pipeline has a classifier step or feature selection step.'
+        # Don't need to do a train-test split without a classifier
+        assert (split_first==False)|('clf' in step_list[-1]), 'Only need train-test split if you have a classifier.'
+        if split_first:
+            X_train,X_val,y_train,y_val = train_test_split(self.X,self.y,stratify=self.y,
+                                                           test_size=0.2,random_state=self.random_state)
+            self.X_val = X_val
+            self.y_val = y_val
+        else:
+            X_train = self.X.copy()
+            if self.y is not None:
+                y_train = self.y.copy()
+        # Restrict to features supplied in self.features
+        X_train = X_train[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # If LGBM early stopping, then need to split off eval_set and define fit_params
+        # if isinstance(self.pipe[-1],LGBMClassifier):
+        #     if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
+        #         X_train,X_es,y_train,y_es = train_test_split(X_train,y_train,
+        #                                                        test_size=eval_size,
+        #                                                        stratify=y_train,
+        #                                                        random_state=self.random_state)
+        #         trans_pipe = self.pipe[:-1]
+        #         trans_pipe.fit_transform(X_train)
+        #         X_es = trans_pipe.transform(X_es)
+        #         clf_name = self.pipe.steps[-1][0]
+        #         fit_params = {f'{clf_name}__eval_set':[(X_es,y_es)],
+        #                       f'{clf_name}__eval_metric':eval_metric,
+        #                      f'{clf_name}__verbose':0}
+        #     else:
+        #         fit_params = {}
+        # else:
+        #     fit_params = {}
+        fit_params = {}
+        # Fit and store fitted pipeline. If no classifier, fit_transform X_train and store transformed version
+        pipe = self.pipe
+        if 'clf' in step_list[-1]:
+            pipe.fit(X_train,y_train,**fit_params)
+        else:
+            X_transformed = pipe.fit_transform(X_train)
+            # X_transformed = pd.DataFrame(X_transformed,columns=pipe[-1].get_column_names_out())
+            self.X_transformed = X_transformed
+        self.pipe_fitted = pipe
+    def predict_proba_pipeline(self, X_test = None):
+        """
+        Method for using a fitted pipeline to compute predicted
+        probabilities for X_test (if supplied) or self.X_val
+        Parameters:
+        -----------
+        X_test : pd.DataFrame or None
+            test data input features (if None, will use self.X_val)
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        assert self.pipe_fitted is not None, 'Pipeline is not fitted.  First fit pipeline using fit_pipeline.'
+        assert (X_test is not None)|(self.X_val is not None), 'Must either provide X_test and y_test or fit the pipeline with split_first=True.'
+        if X_test is None:
+            X_test = self.X_val
+        # Restrict to features supplied in self.features
+        X_test = X_test[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # Save prediction
+        self.y_predict_proba = self.pipe_fitted.predict_proba(X_test)[:,1]
+    def score_pipeline(self,y_test=None,scoring='roc_auc',thresh=0.5, beta = 1,
+                       normalize = None, print_score = True):
+        """
+        Method for scoring self.pipe_fitted on supplied test data and reporting score
+        Parameters:
+        -----------
+        y_test : pd.Series or None
+            true binary targets (if None, will use self.y_val)
+        scoring : str
+            specifies the metric to use for scoring
+            must be one of
+            'roc_auc', 'roc_plot', 'acc', 'f1', 'f1w', 'fb','mcc','kappa','conf','classif_report'
+        thresh : float
+            threshhold value for computing y_pred
+            from y_predict_proba
+        beta : float
+            the beta parameter in the fb score
+        normalize : str or None
+            the normalize parameter for the
+            confusion_matrix. must be one of
+            'true','pred','all',None
+        print_score : bool
+            if True, will print a message reporting the score
+            if False, will return the score as a float
+        """
+        assert (y_test is not None)|(self.y_val is not None), 'Must either provide X_test and y_test or fit the pipeline with split_first=True.'
+        assert self.y_predict_proba is not None, 'Predicted probabilities do not exist.  Run predict_proba_pipeline first.'
+        if y_test is None:
+            y_test = self.y_val
+        # Score and report
+        if scoring == 'roc_plot':
+            fig = plt.figure(figsize=(4,4))
+            ax = fig.add_subplot(111)
+            RocCurveDisplay.from_predictions(y_test,self.y_predict_proba,ax=ax)
+            plt.show()
+        elif scoring == 'roc_auc':
+            score = roc_auc_score(y_test, self.y_predict_proba)
+        else:
+            y_pred = (self.y_predict_proba >= thresh).astype('int')
+            if scoring == 'acc':
+                score = accuracy_score(y_test,y_pred)
+            elif scoring == 'f1':
+                score = f1_score(y_test,y_pred)
+            elif scoring == 'f1w':
+                score = f1_score(y_test,y_pred,average='weighted')
+            elif scoring == 'fb':
+                score = fbeta_score(y_test,y_pred,beta=beta)
+            elif scoring == 'mcc':
+                score = matthews_coffcoeff(y_test,y_pred)
+            elif scoring == 'kappa':
+                score = cohen_kappa_score(y_test,y_pred)
+            elif scoring == 'conf':
+                fig = plt.figure(figsize=(3,3))
+                ax = fig.add_subplot(111)
+                ConfusionMatrixDisplay.from_predictions(y_test,y_pred,ax=ax,colorbar=False)
+                plt.show()
+            elif scoring == 'classif_report':
+                target_names=['neither seriously injured nor killed','seriously injured or killed']
+                print(classification_report(y_test, y_pred,target_names=target_names))
+            else:
+                raise ValueError("scoring must be one of 'roc_auc', 'roc_plot','acc', 'f1', 'f1w', 'fb','mcc','kappa','conf','classif_report'")
+        if scoring not in ['conf','roc_plot','classif_report']:
+            if print_score:
+                print(f'The {scoring} score is: {score}')
+            else:
+                return score
+    def shap_values(self, X_test = None, eval_size=0.1,eval_metric='auc'):
+        """
+        Method for computing and SHAP values for features
+        stratifiedtrain/test split
+        A copy of self.pipe is fitted on the training set
+        and then SHAP values are computed on test set samples
+        Parameters:
+        -----------
+        X_test : pd.DataFrame
+            The test set; if provided, will not perform
+            a train/test split before fitting
+        eval_size : float
+            Fraction of the training set to use for early stopping eval set
+        eval_metric : str
+            eval metric to use in early stopping
+        Returns: None (stores results in self.shap_vals)
+        --------
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        # Clone pipeline, do train/test split if X_test not provided
+        pipe = clone(self.pipe)
+        X_train = self.X.copy()
+        y_train = self.y.copy()
+        if X_test is None:
+            X_train,X_test,y_train,y_test = train_test_split(X_train,y_train,stratify=y_train,
+                                                                 test_size=0.2,random_state=self.random_state)
+        # Restrict to features provided in self.features, and fit
+        X_train = X_train[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        X_test = X_test[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # If LGBM early stopping, then need to split off eval_set and define fit_params
+        # if isinstance(self.pipe[-1],LGBMClassifier):
+        #     if 'early_stopping_round' in self.pipe[-1].get_params():
+        #         if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
+        #             X_train,X_es,y_train,y_es = train_test_split(X_train,y_train,
+        #                                                        test_size=eval_size,
+        #                                                        stratify=y_train,
+        #                                                        random_state=self.random_state)
+        #         trans_pipe = self.pipe[:-1]
+        #         trans_pipe.fit_transform(X_train)
+        #         X_es = trans_pipe.transform(X_es)
+        #         clf_name = self.pipe.steps[-1][0]
+        #         fit_params = {f'{clf_name}__eval_set':[(X_es,y_es)],
+        #                       f'{clf_name}__eval_metric':eval_metric,
+        #                      f'{clf_name}__verbose':0}
+        #     else:
+        #         fit_params = {}
+        # else:
+        #     fit_params = {}
+        fit_params = {}
+        pipe.fit(X_train,y_train,**fit_params)
+        # SHAP will just explain classifier, so need transformed X_train and X_test
+        X_train_trans, X_test_trans = pipe[:-1].transform(X_train), pipe[:-1].transform(X_test)
+        # Need masker for linear model
+        masker = shap.maskers.Independent(data=X_train_trans)
+        # Initialize explainer and compute and store SHAP values as an explainer object
+        explainer = shap.Explainer(pipe[-1], masker = masker, feature_names = pipe['col'].get_feature_names_out())
+        self.shap_vals = explainer(X_test_trans)
+        self.X_shap = X_train_trans
+        self.y_shap = y_train
+    def shap_plot(self,max_display='all'):
+        """
+        Method for generating plots of SHAP value results
+        SHAP values should be already computed previously
+        (stored in self.shap_vals)
+        Generates two plots side by side:
+            - left: a barplot of mean absolute SHAP values
+            - right: a violin-type summary plot of SHAP values of all samples
+        Parameters:
+        -----------
+        max_display : int or 'all'
+            The number of features to show in the plot, in descending
+            order by mean absolute SHAP value.  If 'all', then
+            all features will be included.
+        Returns: None (plots displayed)
+        --------
+        """
+        assert self.shap_vals is not None, 'No shap values exist.  First compute shap values.'
+        assert (isinstance(max_display,int))|(max_display=='all'), "'max_display' must be 'all' or an integer"
+        if max_display=='all':
+            title_add = ', all features'
+            # Second dimension of the stored SHAP values is used as the feature count
+            max_display = self.shap_vals.shape[1]
+        else:
+            title_add = f', top {max_display} features'
+        # Plot
+        # Each subplot is created right before its summary_plot call;
+        # presumably summary_plot draws on the current axes - TODO confirm
+        fig=plt.figure()
+        ax1=fig.add_subplot(121)
+        shap.summary_plot(self.shap_vals,plot_type='bar',max_display=max_display,
+                          show=False,plot_size=0.2)
+        ax2=fig.add_subplot(122)
+        shap.summary_plot(self.shap_vals,plot_type='violin',max_display=max_display,
+                          show=False,plot_size=0.2)
+        # Figure height scales with the number of displayed features
+        fig.set_size_inches(12,max_display/3)
+        ax1.set_title(f'Mean absolute SHAP values'+title_add,fontsize='small')
+        ax1.set_xlabel('mean(|SHAP value|)',fontsize='x-small')
+        ax2.set_title(f'SHAP values'+title_add,fontsize='small')
+        ax2.set_xlabel('SHAP value', fontsize='x-small')
+        for ax in [ax1,ax2]:
+            ax.set_ylabel('feature name',fontsize='x-small')
+            ax.tick_params(axis='y', labelsize='xx-small')
+        plt.tight_layout()
+        plt.show()
+    def find_best_threshold(self,beta=1,conf=True,report=True, print_result=True):
+        """
+        Computes the classification threshold which gives the
+        best F_beta score from classifier predictions,
+        prints the best threshold and the corresponding F_beta score,
+        and displays a confusion matrix and classification report
+        corresponding to that threshold
+        Parameters:
+        -----------
+        beta : float
+            the desired beta value in the F_beta score
+        conf : bool
+            whether to display confusion matrix
+        report : bool
+            whether to display classification report
+        print_result : bool
+            whether to print a line reporting the best threshold
+            and resulting F_beta score
+        Returns: None (prints results and stores self.best_thresh)
+        --------
+        """
+        prec,rec,threshs = precision_recall_curve(self.y_val,
+                                                  self.y_predict_proba)
+        F_betas = (1+beta**2)*(prec*rec)/((beta**2*prec)+rec)
+        # Above formula is valid when TP!=0.  When TP==0
+        # it gives np.nan whereas F_beta should be 0
+        F_betas = np.nan_to_num(F_betas)
+        idx = np.argmax(F_betas)
+        best_thresh = threshs[idx]
+        if print_result:
+            print(f'Threshold optimizing F_{beta} score:   {best_thresh}\nBest F_{beta} score:   {F_betas[idx]}')
+        if conf:
+            self.score_pipeline(scoring='conf',thresh=best_thresh,beta=beta)
+        if report:
+            self.score_pipeline(scoring='classif_report',thresh=best_thresh,beta=beta)
+        self.best_thresh = best_thresh
+class LRStudy(ClassifierStudy):
+    """
+    A child class of ClassifierStudy which has an additional method specific to logistic regression
+    """
+    def __init__(self, classifier=None, X = None, y = None,
+                 features=None,classifier_name = 'LR',
+                 random_state=42):
+        super().__init__(classifier, X, y,features,classifier_name,random_state)
+    def plot_coeff(self, print_score = True, print_zero = False, title_add=None):
+        """
+        Method for doing a train/validation split, fitting the classifier,
+        predicting and scoring on the validation set, and plotting
+        a bar chart of the logistic regression coefficients corresponding
+        to various model features.
+        Features with coefficient zero and periodic spline features
+        will be excluded from the chart.
+        Parameters:
+        -----------
+        print_score : bool
+            if True, the validation score are printed
+        print_zero : bool
+            if True, the list of features with zero coefficients are printed
+        title_add : str or None
+            an addendum that is added to the end of the plot title
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert isinstance(self.classifier,LogisticRegression),'Your classifier is not an instance of Logistic Regression.'
+        # fit and score
+        self.fit_pipeline(split_first = True)
+        self.predict_proba_pipeline()
+        score = roc_auc_score(self.y_val, self.y_predict_proba)
+        # Retrieve coeff values from fitted pipeline
+        coeff = pd.DataFrame({'feature name':self.pipe_fitted['col'].get_feature_names_out(),
+                               'coeff value':self.pipe_fitted[-1].coef_.reshape(-1)})\
+                            .sort_values(by='coeff value')
+        coeff = coeff[~coeff['feature name']\
+                .isin([f'HOUR_OF_DAY_sp_{n}' for n in range(12)]\
+                        +[f'DAY_OF_WEEK_sp_{n}' for n in range(3)])]\
+                .set_index('feature name')
+        coeff_zero_features = coeff[coeff['coeff value']==0].index
+        coeff = coeff[coeff['coeff value']!=0]
+        # Plot feature coefficients
+        fig = plt.figure(figsize=(30,4))
+        ax = fig.add_subplot(111)
+        coeff['coeff value'].plot(kind='bar',ylabel='coeff value',ax=ax)
+        ax.axhline(y=0, color= 'red', linewidth=2,)
+        plot_title = 'PA bicycle collisions, 2002-2021\nLogistic regression model log-odds coefficients'
+        if title_add is not None:
+            plot_title += f': {title_add}'
+        ax.set_title(plot_title)
+        ax.tick_params(axis='x', labelsize='x-small')
+        plt.show()
+        if print_score:
+            print(f'Score on validation set: {score}')
+        if print_zero:
+            print(f'Features with zero coefficients in trained model: {list(coeff_zero)}')
+        self.score = score
+        self.coeff = coeff
+        self.coeff_zero_features = coeff_zero_features

lib/.ipynb_checkpoints/transform_data-checkpoint.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import FunctionTransformer, SplineTransformer
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+class GroupImputer(BaseEstimator, TransformerMixin):
+    """
+    Class used for imputing missing values in a pd.DataFrame
+    using mean, median, or mode by groupwise aggregation,
+    or a constant.
+    Parameters:
+    -----------
+    target : str
+        - The name of the column to be imputed
+    group_cols : list
+        - List of name(s) of columns on which to groupby
+    strategy : str
+         - The method for replacement; can be any of
+          ['mean', 'median', 'mode']
+    Returns:
+    --------
+    X : pd.DataFrame
+        - The dataframe with imputed values in the target column
+    """
+    def __init__(self,target,group_cols=None,strategy='median'):
+        assert strategy in ['mean','median','mode'], "strategy must be in ['mean', 'median', 'mode']'"
+        assert type(group_cols)==list, 'group_cols must be a list of column names'
+        assert type(target) == str, 'target must be a string'
+        self.group_cols = group_cols
+        self.strategy=strategy
+        self.target = target
+    def fit(self,X,y=None):
+        if self.strategy=='mode':
+            impute_map = X.groupby(self.group_cols)[self.target]\
+                            .agg(lambda x: pd.Series.mode(x,dropna=False)[0])\
+                            .reset_index(drop=False)
+        else:
+            impute_map = X.groupby(self.group_cols)[self.target]\
+                        .agg(self.strategy).reset_index(drop=False)
+        self.impute_map_ = impute_map
+        return self
+    def transform(self,X,y=None):
+        check_is_fitted(self,'impute_map_')
+        X=X.copy()
+        for index,row in self.impute_map_.iterrows():
+            ind = (X[self.group_cols] == row[self.group_cols]).all(axis=1)
+            X.loc[ind,self.target] = X.loc[ind,self.target].fillna(row[self.target])
+        return X
+# Sine and cosine transformations
+def sin_feature_names(transformer, feature_names):
+    return [f'SIN_{col}' for col in feature_names]
+def cos_feature_names(transformer, feature_names):
+    return [f'COS_{col}' for col in feature_names]
+def sin_transformer(period):
+    return FunctionTransformer(lambda x: np.sin(2*np.pi*x/period),feature_names_out = sin_feature_names)
+def cos_transformer(period):
+    return FunctionTransformer(lambda x: np.cos(2*np.pi*x/period),feature_names_out = cos_feature_names)
+# Periodic spline transformation
+def periodic_spline_transformer(period, n_splines=None, degree=3):
+    if n_splines is None:
+        n_splines = period
+    n_knots = n_splines + 1  # periodic and include_bias is True
+    return SplineTransformer(
+        degree=degree,
+        n_knots=n_knots,
+        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
+        extrapolation="periodic",
+        include_bias=True,
+    )

lib/.ipynb_checkpoints/vis_data-checkpoint.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import pandas as pd
+import numpy as np
+import plotly.express as px
+from scipy import stats
+def plot_map(df,city=None,county=None,animate=True,color_dots=True,animate_by='year',show_fig=True,return_fig=False):
+    """
+    Displays a plotly.express.scatter_mapbox interactive map
+    of crashes in a municipality if specified, or otherwise
+    statewide.  Can be animated over time or static.
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        dataframe of crash samples
+    city or county : tuple or None
+        if provided, must be a tuple (code,name)
+        - code : str
+            the code corresponding to the desired municipality/county
+            (see the data dictionary)
+        - name : str
+            the name you want to use for the municipality/county
+            in plot title
+        * At most one of these can be not None!
+    animate : bool
+        if animate==True, then the map will animate using
+        the frequency provided in animate_by
+    color_dots : bool
+        if color_dots==True, then dots will be color-coded by
+        'serious injury or death' status.
+        WARNING: if color_dots and animate, then all frames
+        will be missing samples in 'serious injury or death'
+        classes which aren't present in first frame - due to
+        bug in plotly animation_frame implementation.
+        Recommend only using both when geographic
+        area is statewide or at least has all values of
+        'serious injury or death' in first frame
+    animate_by : str
+        the desired animation frequency, must be
+        either 'year' or 'month'
+    show_fig : bool
+        whether to display figure using fig.show()
+    return_fig : bool
+        whether to return the figure object
+   Returns: Either figure or None
+   --------
+    """
+    assert (city is None)|(county is None), 'A city and county cannot both be provided.'
+    # Copy df and create new column for color coding event type
+    df = df.copy()
+    df.loc[df.BICYCLE_SUSP_SERIOUS_INJ_COUNT>0,'Serious cyclist injury or death']='serious injury'
+    df.loc[df.BICYCLE_DEATH_COUNT>0,'Serious cyclist injury or death']='death'
+    df['Serious cyclist injury or death']=df['Serious cyclist injury or death'].fillna('neither')
+    # Set animation parameters
+    if animate:
+        if animate_by == 'year':
+            animation_frame = 'CRASH_YEAR'
+            title_animate = ' by year'
+        elif animate_by == 'month':
+            df['DATE'] = pd.to_datetime((df['CRASH_MONTH'].astype('str')\
+                                         +'-'+df['CRASH_YEAR'].astype('str')),
+                                       format = "%m-%Y")
+            df=df.sort_values(by='DATE')
+            df['DATE']=df['DATE'].astype('str').apply(lambda x: x.rsplit('-',1)[0])
+            animation_frame = 'DATE'
+            title_animate = ' by month'
+        else:
+            raise ValueError("animate_by must be 'year' or 'month'")
+    else:
+        animation_frame = None
+        title_animate = ''
+    if color_dots:
+        color='Serious cyclist injury or death'
+    else:
+        color=None
+    # Adjustments for when city or county are provided
+    if city is not None:
+        df = df[df.MUNICIPALITY==city[0]]
+        # Ignore extreme outlier samples - lat,lon may be incorrect
+        df = df[np.abs(stats.zscore(df.DEC_LAT))<=4]
+        df = df[np.abs(stats.zscore(df.DEC_LONG))<=4]
+        title_place = city[1]+', PA'
+    elif county is not None:
+        df = df[df.COUNTY==county[0]]
+        # Ignore extreme outlier samples - lat,lon may be incorrect
+        df = df[np.abs(stats.zscore(df.DEC_LAT))<=4]
+        df = df[np.abs(stats.zscore(df.DEC_LONG))<=4]
+        title_place = county[1]+' county, PA'
+    else:
+        title_place = 'PA'
+    # Compute default zoom level based on lat,lon ranges.
+    # open-street-map uses
+    max_lat, min_lat = df.DEC_LAT.max(), df.DEC_LAT.min()
+    max_lon, min_lon = df.DEC_LONG.max(), df.DEC_LONG.min()
+    # 2^(zoom) = 360/(longitude width of 1 tile)
+    zoom = np.log2(360/max(max_lon-min_lon,max_lat-min_lat))
+    lat_center = (max_lat+min_lat)/2
+    lon_center = (max_lon+min_lon)/2
+    # Adjust width so that aspect ratio matches shape of state
+    width_mult = (max_lon-min_lon)/(max_lat-min_lat)
+    cols  = ['CRN','DEC_LAT','DEC_LONG','Serious cyclist injury or death','CRASH_YEAR','CRASH_MONTH']
+    if animate_by=='month':
+        cols.append('DATE')
+    # Plot mapbox
+    fig = px.scatter_mapbox(df, lat='DEC_LAT',lon='DEC_LONG',
+                            color=color,
+                            color_discrete_map={'neither':'royalblue','serious injury':'orange','death':'crimson'},
+                            mapbox_style='open-street-map',
+                            animation_frame = animation_frame,
+                            animation_group='CRN',
+                            hover_data = {'DEC_LAT':False,'DEC_LONG':False,
+                                         'CRASH_YEAR':True,'CRASH_MONTH':True,
+                                         'Serious cyclist injury or death':True},
+                            width = width_mult*500,height=700,zoom=zoom,
+                            center={'lat':lat_center,'lon':lon_center},
+                            title=f'Crashes involving bicycles{title_animate}<br> in {title_place}, 2002-2021')
+    fig.update_layout(legend=dict(orientation='h',xanchor='right',yanchor='bottom',x=1,y=-0.12),
+                     legend_title_side='top')
+    if show_fig:
+        fig.show()
+    if return_fig:
+        return fig
+def feat_perc(feat, df, col_name = 'percentage', feat_name = None):
+    """
+    Constructs a single-column dataframe 'perc'
+    containing the value counts in the series
+    df[feat] as percentages of the whole.
+    - 'df' is the input dataframe.
+    - 'feat' is the desired column of df.
+    - 'col_name' is the name of the
+    column of the output dataframe
+    - 'feat_name' is the index name
+    of the output dataframe if provided, otherwise
+    will use 'feat' as index name.
+    """
+    perc = pd.DataFrame({col_name:df[feat].value_counts(normalize=True).sort_index()})
+    if feat_name:
+        perc.index.name=feat_name
+    else:
+        perc.index.name=feat
+    return perc
+def feat_perc_bar(feat,df,feat_name=None,cohort_name=None,show_fig=True,return_fig=False,sort=False):
+    """
+    Makes barplot of two series:
+        - distribution of feature among all cyclists
+        - distribution of feature among cyclists with serious injury or fatality
+    Parameters:
+    -----------
+    feat : str
+        The column name of the desired feature
+    df : pd.DataFrame
+        The input dataframe
+    feat_name : str or None
+        The feature name to use in the
+        x-axis label.  If None, will use feat
+    cohort_name : str or None
+        qualifier to use in front of 'cyclists'
+        in titles, if provided, e.g. 'rural cyclists'
+    show_fig : bool
+        whether to finish with fig.show()
+    return_fig : bool
+        whether to return the fig object
+    sort : bool
+        whether to sort bars. If False, will use default sorting
+        by category name or feature value.  If True, will resort
+        in descending order by percentage
+    Returns: figure or None
+    --------
+    """
+    if feat_name is None:
+        feat_name=feat
+    df_inj = df.query('SERIOUS_OR_FATALITY==1')
+    table = feat_perc(feat,df)
+    table.loc[:,'cohort']='all'
+    ordering = list(table['percentage'].sort_values(ascending=False).index) if sort else None
+    table_inj = feat_perc(feat,df_inj)
+    table_inj.loc[:,'cohort']='seriously injured or killed'
+    table = pd.concat([table,table_inj],axis=0).reset_index()
+    category_orders = {'cohort':['all','seriously injured or killed']}
+    if sort:
+        category_orders[feat]=ordering
+    fig = px.bar(table,y='cohort',x='percentage',color=feat,
+                 barmode='stack',text_auto='.1%',
+                category_orders=category_orders,
+                title=f'Distributions of {feat} values within cyclist cohorts')
+    fig.update_yaxes(tickangle=-90)
+    fig.update_xaxes(tickformat=".0%")
+    if show_fig:
+        fig.show()
+    if return_fig:
+        return fig
+# def feat_perc_comp(feat,df,feat_name=None,cohort_name = None,merge_inj_death=True):
+#     """
+#     Returns a styled dataframe (Styler object)
+#     whose underlying dataframe has three columns
+#     containing value counts of 'feat' among:
+#     - all cyclists involved in crashes
+#     - cyclists suffering serious injury or fatality
+#     each formatted as percentages of the series sum.
+#     Styled with bars comparing percentages
+#     Parameters:
+#     -----------
+#     feat : str
+#         The column name of the desired feature
+#     df : pd.DataFrame
+#         The input dataframe
+#     feat_name : str or None
+#         The feature name to use in the output dataframe
+#         index name.  If None, will use feat
+#     cohort_name : str or None
+#         qualifier to use in front of 'cyclists'
+#         in titles, if provided, e.g. 'rural cyclists'
+#     merge_inj_death : bool
+#         whether to merge seriously injured and killed cohorts
+#     Returns:
+#     --------
+#     perc_comp : pd.Styler object
+#     """
+#     # Need qualifier for titles if restricting cyclist cohort
+#     qualifier = cohort_name if cohort_name is not None else ''
+#     # Two columns or three, depending on merge_inj_death
+#     if merge_inj_death:
+#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
+#                          col_name='all cyclists',)\
+#                 .merge(feat_perc(feat,feat_name=feat_name,
+#                                  df=df.query('SERIOUS_OR_FATALITY==1'),
+#                                  col_name=qualifier+'cyclists with serious injury or fatality'),
+#                       on=feat,how='left')
+#         perc_comp = perc_comp[perc_comp.max(axis=1)>=0.005]
+#     else:
+#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
+#                          col_name='all cyclists')\
+#                 .merge(feat_perc(feat,feat_name=feat_name,
+#                                  df=df.query('INJ_SEVERITY=="susp_serious_injury"'),
+#                                  col_name=qualifier+'cyclists with serious injury'),
+#                       on=feat,how='left')\
+#                 .merge(feat_perc(feat,feat_name=feat_name,
+#                                  df=df.query('INJ_SEVERITY=="killed"'),
+#                                  col_name=qualifier+'cyclists with fatality'),
+#                       on=feat,how='left')
+#     # If feature is not ordinal, sort rows descending by crash counts
+#     if feat not in ['AGE_BINS','SPEED_LIMIT','DAY_OF_WEEK','HOUR_OF_DAY']:
+#         perc_comp=perc_comp.sort_values(by='all cyclists',ascending=False)
+#     # Relabel day numbers with strings
+#     if feat == 'DAY_OF_WEEK':
+#         perc_comp.index=['Sun','Mon','Tues','Wed','Thurs','Fri','Sat']
+#         perc_comp.index.name='DAY_OF_WEEK'
+#     perc_comp=perc_comp.fillna(0)
+#     table_columns = list(perc_comp.columns)
+#     # Define format for displaying floats
+#     format_dict={col:'{:.2%}' for col in perc_comp.columns}
+#     # Define table styles
+#     styles = [dict(selector="caption",
+#                    props=[("text-align", "center"),
+#                           ("font-size", "100%"),
+#                           ("color", 'black'),
+#                           ("text-decoration","underline"),
+#                           ("font-weight","bold")])]
+#     # Return formatted dataframe
+#     if feat_name is None:
+#         feat_name=feat
+#     caption = f'Breakdown of {feat_name} among cyclist groups'
+#     return perc_comp.reset_index().style.set_table_attributes("style='display:inline'")\
+#                                     .format(format_dict).bar(color='powderblue',
+#                                     subset=table_columns).hide().set_caption(caption)\
+#                                     .set_table_styles(styles)

lib/__init__.py ADDED Viewed

File without changes

lib/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (222 Bytes). View file

lib/__pycache__/get_data.cpython-310.pyc ADDED Viewed

Binary file (2.81 kB). View file

lib/__pycache__/study_class.cpython-310.pyc ADDED Viewed

Binary file (28.3 kB). View file

lib/__pycache__/study_classif.cpython-310.pyc ADDED Viewed

Binary file (28.9 kB). View file

lib/__pycache__/transform_data.cpython-310.pyc ADDED Viewed

Binary file (3.88 kB). View file

lib/__pycache__/vis_data.cpython-310.pyc ADDED Viewed

Binary file (6.64 kB). View file

lib/study_classif.py ADDED Viewed

	@@ -0,0 +1,787 @@

+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import shap
+from sklearn.feature_selection import chi2, SelectKBest, mutual_info_classif, f_classif
+from sklearn.metrics import accuracy_score, log_loss, confusion_matrix, f1_score, fbeta_score, roc_auc_score
+from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, classification_report, precision_recall_curve
+from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, RandomizedSearchCV, StratifiedKFold
+from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer, SplineTransformer, PolynomialFeatures
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier
+# from lightgbm import LGBMClassifier
+from sklearn.base import BaseEstimator, TransformerMixin, clone
+from sklearn.utils.validation import check_is_fitted
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline, make_pipeline
+from sklearn.compose import ColumnTransformer, make_column_transformer
+from lib.transform_data import *
+class ClassifierStudy():
+    """
+    A class that contains tools for studying a classifier pipeline
+    Parameters:
+    -----------
+    classifier : a scikit-learn compatible binary classifier
+    X : pd.DataFrame
+        dataframe of features
+    y : pd.Series
+        series of binary target values corresponding to X
+    classifier_name : str or None
+        if provided, will use as classifier name in pipeline
+        if not, will use 'clf' as name
+    features : dict
+        a dictionary whose keys are the feature types
+        'cyc','cat','ord','num','bin' and whose values
+        are lists of features of each type.
+    Methods:
+    -------
+    set_data, set_features, set_state
+        sets or resets attributes of self
+    build_pipeline
+        builds out pipeline based on supplied specs
+    cv_score
+        runs k-fold cross validation and reports scores
+    randomized_search
+        runs randomized search with cross validation
+        and reports results
+    fit_pipeline
+        fits the model pipeline and stores as
+        self.pipe_fitted
+    predict_proba_pipeline
+        uses a fitted pipeline to compute predicted
+        probabilities for test or validation set
+    score_pipeline
+        scores predicted probabilities
+    """
+    def __init__(self, classifier=None, X = None, y = None,
+                 features = None,classifier_name = None,
+                 random_state=42):
+        self.classifier = classifier
+        if X is not None:
+            self.X = X.copy()
+        if y is not None:
+            self.y = y.copy()
+        if features is not None:
+            self.features = features.copy()
+        self.random_state=random_state
+        self.pipe, self.pipe_fitted = None, None
+        self.classifier_name = classifier_name
+        self.X_val, self.y_val = None, None
+        self.y_predict_proba = None
+        self.best_params, self.best_n_components = None, None
+        self.shap_vals = None
+    def set_data(self,X=None,y=None):
+        """Method to set or reset feature and/or target data"""
+        if X is not None:
+            self.X = X.copy()
+        if y is not None:
+            self.y = y.copy()
+    def set_features(self,features):
+        """Method to set or reset the feature dictionary"""
+        if features is not None:
+            self.features = features.copy()
+    def set_state(self,random_state):
+        """Method to set or reset the random_state"""
+        self.random_state = random_state
+    def build_pipeline(self, cat_method = 'onehot',cyc_method = 'spline',num_ss=True,
+                       over_sample = False, pca=False,n_components=None,
+                       select_features = False,score_func=None,k='all',
+                       poly_features = False, degree=2, interaction_only=False):
+        """
+        Method to build the model pipeline
+        Parameters:
+        -----------
+        cat_method : str
+            specifies whether to encode categorical
+            variables as one-hot vectors or ordinals
+            must be either 'onehot' or 'ord'
+        cyc_method : str
+            specifies whether to encode cyclical features
+            with sine/cosine encoding or periodic splines
+            must be one of 'trig', 'spline', 'interact-trig',
+            'interact-spline','onehot', 'ord', or None
+            - If 'trig' or 'spline', will set up periodic encoder
+              with desired method
+            - If 'onehot' or 'ord', will set up appropriate
+              categorical encoder
+            - If 'interact-{method}', will use <method> encoding for HOUR_OF_DAY,
+              encode DAY_OF_WEEK as a binary feature expressing whether
+              the day is a weekend day, and then include interaction
+              features among this set via PolynomialFeatures.
+            - If None, will leave out cyclical features altogether
+        num_ss : bool
+            Whether or not to apply StandardScaler on the numerical features
+        over_sample : bool
+            set to True to include imblearn.over_sampling.RandomOverSampler step
+        pca : bool
+            set to True to include sklearn.decomposition.PCA step
+        n_components : int or None
+            number of components for sklearn.decomposition.PCA
+        select_features : bool
+            set to True to include sklearn.feature_selection.SelectKBest step
+        score_func : callable
+            score function to use for sklearn.feature_selection.SelectKBest
+            recommended: chi2, f_classif, or mutual_info_classif
+        k : int or 'all'
+            number of features for sklearn.feature_selection.SelectKBest
+        poly_features : bool
+            set to True to include sklearn.preprocessing.PolynomialFeatures step
+        degree : int
+            max degree for sklearn.preprocessing.PolynomialFeatures
+        interaction_only : bool
+            whether or not sklearn.preprocessing.PolynomialFeatures will be limited
+            to interaction terms only
+        """
+        # Define transformer for categorical features
+        if cat_method == 'onehot':
+            cat_encoder = ('ohe',OneHotEncoder(handle_unknown='infrequent_if_exist'))
+        elif cat_method == 'ord':
+            cat_encoder = ('oe',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))
+        else:
+            raise ValueError("cat_method must be either 'onehot' or 'ord'")
+        cat_transform = Pipeline([('si',SimpleImputer(strategy='most_frequent')),cat_encoder])
+        # Define transformer for cyclic features
+        cyc_dict = {'HOUR_OF_DAY':24,'DAY_OF_WEEK':7}
+        if cyc_method == 'trig':
+            cyc_transform = [(f'{feat}_cos',cos_transformer(cyc_dict[feat]),[feat]) for feat in self.features['cyc']]+\
+                        [(f'{feat}_sin',sin_transformer(cyc_dict[feat]),[feat]) for feat in self.features['cyc']]
+        elif cyc_method =='spline':
+            cyc_transform = [(f'{feat}_cyclic',
+                          periodic_spline_transformer(cyc_dict[feat],n_splines=cyc_dict[feat]//2),
+                          [feat]) for feat in self.features['cyc']]
+        elif cyc_method == 'onehot':
+            cyc_encoder = ('ohe_cyc',OneHotEncoder(handle_unknown='infrequent_if_exist'))
+            cyc_transform = [('cyc',Pipeline([cyc_encoder]),self.features['cyc'])]
+        elif cyc_method == 'ord':
+            cyc_encoder = ('oe_cyc',OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=np.nan))
+            cyc_transform = [('cyc',Pipeline([cyc_encoder]),self.features['cyc'])]
+        elif cyc_method == 'interact-spline':
+            hour_transform = (f'hour_cyc',periodic_spline_transformer(cyc_dict['HOUR_OF_DAY'],n_splines=12),['HOUR_OF_DAY'])
+            wkend_transform = ('wkend',FunctionTransformer(lambda x: (x.isin([1,7])).astype(int)),['DAY_OF_WEEK'])
+            cyc_transform = [('cyc',Pipeline([('cyc_col',ColumnTransformer([hour_transform, wkend_transform],
+                                                               remainder='drop',verbose_feature_names_out=False)),
+                                              ('cyc_poly',PolynomialFeatures(degree=2,interaction_only=True,
+                                                                include_bias=False))]),
+                             self.features['cyc'])]
+        elif cyc_method == 'interact-trig':
+            hour_transform = [(f'HOUR_cos',cos_transformer(cyc_dict['HOUR_OF_DAY']),['HOUR_OF_DAY']),
+                              (f'HOUR_sin',sin_transformer(cyc_dict['HOUR_OF_DAY']),['HOUR_OF_DAY'])]
+            wkend_transform = ('wkend',FunctionTransformer(lambda x: (x.isin([1,7])).astype(int)),['DAY_OF_WEEK'])
+            cyc_transform = [('cyc',Pipeline([('cyc_col',ColumnTransformer(hour_transform+[wkend_transform],
+                                                               remainder='drop',verbose_feature_names_out=False)),
+                                              ('cyc_poly',PolynomialFeatures(degree=2,interaction_only=True,
+                                                                include_bias=False))]),
+                             self.features['cyc'])]
+        elif cyc_method is None:
+            cyc_transform = [('cyc','passthrough',[])]
+        else:
+            raise ValueError("cyc_method must be one of 'trig','spline','interact','onehot','ord',or None")
+        # Define numerical transform
+        num_transform = ('num',StandardScaler(),self.features['num']) if num_ss else\
+                        ('num','passthrough',self.features['num'])
+        # Define column transformer
+        col_transform = ColumnTransformer([('cat',cat_transform,self.features['cat']),
+                                           ('ord','passthrough',self.features['ord']),
+                                           num_transform,
+                                           ('bin',SimpleImputer(strategy='most_frequent'),
+                                             self.features['bin'])]+\
+                                           cyc_transform,
+                                           remainder='drop',verbose_feature_names_out=False)
+        steps = [('col',col_transform)]
+        if 'AGE' in self.features['num']:
+            steps.insert(0,('gi_age',GroupImputer(target = 'AGE', group_cols=['COUNTY'],strategy='median')))
+        if 'HOUR_OF_DAY' in self.features['cyc']:
+            steps.insert(0,('gi_hour',GroupImputer(target = 'HOUR_OF_DAY', group_cols=['ILLUMINATION','CRASH_MONTH'],strategy='mode')))
+        # Insert optional steps as needed
+        if over_sample:
+            steps.insert(0,('os',RandomOverSampler(random_state=self.random_state)))
+        if poly_features:
+            steps.append(('pf',PolynomialFeatures(degree=degree,interaction_only=interaction_only)))
+        if select_features:
+            steps.append(('fs',SelectKBest(score_func = score_func, k = k)))
+        if pca:
+            steps.append(('pca',PCA(n_components=n_components,random_state=self.random_state)))
+        # Append classifier if provided
+        if self.classifier is not None:
+            if self.classifier_name is not None:
+                steps.append((f'{self.classifier_name}_clf',self.classifier))
+            else:
+                steps.append(('clf',self.classifier))
+        # Initialize pipeline
+        self.pipe = Pipeline(steps)
+    def cv_score(self, scoring = 'roc_auc', n_splits = 5, n_repeats=3, thresh = 0.5, beta = 1,
+                 return_mean_score=False,print_mean_score=True,print_scores=False, n_jobs=-1,
+                eval_size=0.1,eval_metric='auc'):
+        """
+        Method for performing cross validation via RepeatedStratifiedKFold
+        Parameters:
+        -----------
+        scoring : str
+            scoring function to use.  must be one of
+            'roc_auc','acc','f1','','f1w'
+        thresh : float
+            the classification threshold for computing y_pred
+            from y_pred_proba
+        beta : float
+            the beta-value to use in the f_beta score, if chosen
+        n_splits, n_repeats : int, int
+            number of splits and number of repeat iterations
+            for sklearn.model_selection.RepeatedStratifiedKFold
+        return_mean_score : bool
+            whether or not to return the mean score
+        print_mean_score : bool
+            whether to print out a report of the mean score
+        print_scores : bool
+            whether to print out a report of CV scores for all folds
+        n_jobs : int or None
+            number of CPU cores to use for parallel processing
+            -1 uses all available cores, and None defaults to 1
+        eval_size : float
+            Fraction of the training set to use for early stopping eval set
+        eval_metric : str
+            eval metric to use in early stopping
+        Returns: None or mean_score, depending on return_mean_score setting
+        --------
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        assert scoring in ['roc_auc','acc','f1','fb','f1w'],"scoring must be one of 'roc_auc','acc','f1','fb','f1w'"
+        # Initialize CV iterator
+        kf = RepeatedStratifiedKFold(n_splits = n_splits, n_repeats=n_repeats,
+                                     random_state=self.random_state)
+        # Restrict to features supplied in self.features
+        X = self.X[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        lgb_es=False
+        # if isinstance(self.pipe[-1],LGBMClassifier):
+        #     if 'early_stopping_round' in self.pipe[-1].get_params():
+        #         if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
+        #             lgb_es=True
+        scores = []
+        # Iterate over folds and train, predict, score
+        for i,(train_idx,test_idx) in enumerate(kf.split(X,self.y)):
+            fold_X_train = X.iloc[train_idx,:]
+            fold_X_test = X.iloc[test_idx,:]
+            fold_y_train = self.y.iloc[train_idx]
+            fold_y_test = self.y.iloc[test_idx]
+            pipe=clone(self.pipe)
+            if lgb_es:
+                fold_X_train,fold_X_es,fold_y_train,fold_y_es = train_test_split(fold_X_train,fold_y_train,
+                                                                                 stratify=fold_y_train,test_size=eval_size,
+                                                                                 random_state=self.random_state)
+                trans_pipe = pipe[:-1]
+                trans_pipe.fit_transform(fold_X_train)
+                fold_X_es = trans_pipe.transform(fold_X_es)
+                clf_name = pipe.steps[-1][0]
+                fit_params = {f'{clf_name}__eval_set':[(fold_X_es,fold_y_es)],
+                              f'{clf_name}__eval_metric':eval_metric,
+                              f'{clf_name}__verbose':0}
+            else:
+                fit_params = {}
+            pipe.fit(fold_X_train,fold_y_train,**fit_params)
+            fold_y_pred_proba = pipe.predict_proba(fold_X_test)[:,1]
+            if scoring == 'roc_auc':
+                fold_score = roc_auc_score(fold_y_test, fold_y_pred_proba)
+            else:
+                fold_y_pred = (fold_y_pred_proba >= thresh).astype('int')
+                if scoring == 'acc':
+                    fold_score = accuracy_score(fold_y_test,fold_y_pred)
+                elif scoring == 'f1':
+                    fold_score = f1_score(fold_y_test,fold_y_pred)
+                elif scoring == 'f1w':
+                    fold_score = f1_score(fold_y_test,fold_y_pred,average='weighted')
+                else:
+                    fold_score = fbeta_score(fold_y_test,fold_y_pred,beta=beta)
+            scores.append(fold_score)
+        # Average and report
+        mean_score = np.mean(scores)
+        if print_scores:
+            print(f'CV scores using {scoring} score: {scores} \nMean score: {mean_score}')
+        if print_mean_score:
+            print(f'Mean CV {scoring} score: {mean_score}')
+        if return_mean_score:
+            return mean_score
+    def randomized_search(self, params, n_components = None, n_iter=10,
+                          scoring='roc_auc',cv=5,refit=False,top_n=10, n_jobs=-1):
+        """
+        Method for performing randomized search with cross validation on a given dictionary of parameter distributions
+        Also displays a table of results the best top_n iterations
+        Parameters:
+        ----------
+        params : dict
+            parameter distributions to use for RandomizedSearchCV
+        n_components : int, or list, or None
+            number of components for sklearn.decomposition.PCA
+            - if int, will reset the PCA layer in self.pipe with provided value
+            - if list, must be list of ints, which will be included in
+              RandomizedSearchCV parameter distribution
+        scoring : str
+            scoring function for sklearn.model_selection.cross_val_score
+        n_iter : int
+            number of iterations to use in RandomizedSearchCV
+        refit : bool
+            whether to refit a final classifier with best parameters
+            - if False, will only set self.best_params and self.best_score
+            - if True, will set self.best_estimator in addition
+        top_n : int or None
+            if int, will display results from top_n best iterations only
+            if None, will display all results
+        n_jobs : int or None
+            number of CPU cores to use for parallel processing
+            -1 uses all available cores, and None defaults to 1
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        assert (n_components is None)|('pca' in self.pipe.named_steps), 'Your pipeline has no PCA step.  Build a pipeline with PCA first.'
+        assert (len(params)>0)|(type(n_components)==list), 'Either pass a parameter distribution or a list of n_components values.'
+        # Add estimator name prefix to hyperparams
+        params = {self.pipe.steps[-1][0]+'__'+key:params[key] for key in params}
+        # Process supplied n_components
+        if type(n_components)==list:
+            params['pca__n_components']=n_components
+        elif type(n_components)==int:
+            self.pipe['pca'].set_params(n_components=n_components)
+        # Restrict to features supplied in self.features
+        X = self.X[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # Initialize rs and fit
+        rs = RandomizedSearchCV(self.pipe, param_distributions = params,
+                                n_iter=n_iter, scoring = scoring, cv = cv,refit=refit,
+                                random_state=self.random_state, n_jobs=n_jobs)
+        rs.fit(X,self.y)
+        # Display top n scores
+        results = rs.cv_results_
+        results_df = pd.DataFrame(results['params'])
+        param_names = list(results_df.columns)
+        results_df[f'mean cv score ({scoring})']=pd.Series(results['mean_test_score'])
+        results_df = results_df.set_index(param_names).sort_values(by=f'mean cv score ({scoring})',ascending=False)
+        if top_n is not None:
+            display(results_df.head(top_n).style\
+                    .highlight_max(axis=0, props='color:white; font-weight:bold; background-color:seagreen;'))
+        else:
+            display(results_df.style\
+                    .highlight_max(axis=0, props='color:white; font-weight:bold; background-color:seagreen;'))
+        if refit:
+            self.best_estimator = rs.best_estimator_
+        best_params = rs.best_params_
+        self.best_params = {key.split('__')[-1]:best_params[key] for key in best_params if key.split('__')[0]!='pca'}
+        self.best_n_components = next((best_params[key] for key in best_params if key.split('__')[0]=='pca'), None)
+        self.best_score = rs.best_score_
+    def fit_pipeline(self,split_first=False, eval_size=0.1,eval_metric='auc'):
+        """
+        Method for fitting self.pipeline on self.X,self.y
+        Parameters:
+        -----------
+        split_first : bool
+            if True, a train_test_split will be performed first
+            and the validation set will be stored
+        early_stopping : bool
+            Indicates whether we will use early_stopping for lightgbm.
+            If true, will split off an eval set prior to k-fold split
+        eval_size : float
+            Fraction of the training set to use for early stopping eval set
+        eval_metric : str
+            eval metric to use in early stopping
+        """
+        # Need pipe and X to fit
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert self.X is not None, 'X does not exist.  First set X.'
+        # If no y provided, then no pipeline steps should require y
+        step_list = [step[0] for step in self.pipe.steps]
+        assert (('clf' not in step_list[-1])&('kf' not in step_list))|(self.y is not None), 'You must provide targets y if pipeline has a classifier step or feature selection step.'
+        # Don't need to do a train-test split without a classifier
+        assert (split_first==False)|('clf' in step_list[-1]), 'Only need train-test split if you have a classifier.'
+        if split_first:
+            X_train,X_val,y_train,y_val = train_test_split(self.X,self.y,stratify=self.y,
+                                                           test_size=0.2,random_state=self.random_state)
+            self.X_val = X_val
+            self.y_val = y_val
+        else:
+            X_train = self.X.copy()
+            if self.y is not None:
+                y_train = self.y.copy()
+        # Restrict to features supplied in self.features
+        X_train = X_train[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # If LGBM early stopping, then need to split off eval_set and define fit_params
+        # if isinstance(self.pipe[-1],LGBMClassifier):
+        #     if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
+        #         X_train,X_es,y_train,y_es = train_test_split(X_train,y_train,
+        #                                                        test_size=eval_size,
+        #                                                        stratify=y_train,
+        #                                                        random_state=self.random_state)
+        #         trans_pipe = self.pipe[:-1]
+        #         trans_pipe.fit_transform(X_train)
+        #         X_es = trans_pipe.transform(X_es)
+        #         clf_name = self.pipe.steps[-1][0]
+        #         fit_params = {f'{clf_name}__eval_set':[(X_es,y_es)],
+        #                       f'{clf_name}__eval_metric':eval_metric,
+        #                      f'{clf_name}__verbose':0}
+        #     else:
+        #         fit_params = {}
+        # else:
+        #     fit_params = {}
+        fit_params = {}
+        # Fit and store fitted pipeline. If no classifier, fit_transform X_train and store transformed version
+        pipe = self.pipe
+        if 'clf' in step_list[-1]:
+            pipe.fit(X_train,y_train,**fit_params)
+        else:
+            X_transformed = pipe.fit_transform(X_train)
+            # X_transformed = pd.DataFrame(X_transformed,columns=pipe[-1].get_column_names_out())
+            self.X_transformed = X_transformed
+        self.pipe_fitted = pipe
+    def predict_proba_pipeline(self, X_test = None):
+        """
+        Method for using a fitted pipeline to compute predicted
+        probabilities for X_test (if supplied) or self.X_val
+        Parameters:
+        -----------
+        X_test : pd.DataFrame or None
+            test data input features (if None, will use self.X_val)
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        assert self.pipe_fitted is not None, 'Pipeline is not fitted.  First fit pipeline using fit_pipeline.'
+        assert (X_test is not None)|(self.X_val is not None), 'Must either provide X_test and y_test or fit the pipeline with split_first=True.'
+        if X_test is None:
+            X_test = self.X_val
+        # Restrict to features supplied in self.features
+        X_test = X_test[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # Save prediction
+        self.y_predict_proba = self.pipe_fitted.predict_proba(X_test)[:,1]
+    def score_pipeline(self,y_test=None,scoring='roc_auc',thresh=0.5, beta = 1,
+                       normalize = None, print_score = True):
+        """
+        Method for scoring self.pipe_fitted on supplied test data and reporting score
+        Parameters:
+        -----------
+        y_test : pd.Series or None
+            true binary targets (if None, will use self.y_val)
+        scoring : str
+            specifies the metric to use for scoring
+            must be one of
+            'roc_auc', 'roc_plot', 'acc', 'f1', 'f1w', 'fb','mcc','kappa','conf','classif_report'
+        thresh : float
+            threshhold value for computing y_pred
+            from y_predict_proba
+        beta : float
+            the beta parameter in the fb score
+        normalize : str or None
+            the normalize parameter for the
+            confusion_matrix. must be one of
+            'true','pred','all',None
+        print_score : bool
+            if True, will print a message reporting the score
+            if False, will return the score as a float
+        """
+        assert (y_test is not None)|(self.y_val is not None), 'Must either provide X_test and y_test or fit the pipeline with split_first=True.'
+        assert self.y_predict_proba is not None, 'Predicted probabilities do not exist.  Run predict_proba_pipeline first.'
+        if y_test is None:
+            y_test = self.y_val
+        # Score and report
+        if scoring == 'roc_plot':
+            fig = plt.figure(figsize=(4,4))
+            ax = fig.add_subplot(111)
+            RocCurveDisplay.from_predictions(y_test,self.y_predict_proba,ax=ax)
+            plt.show()
+        elif scoring == 'roc_auc':
+            score = roc_auc_score(y_test, self.y_predict_proba)
+        else:
+            y_pred = (self.y_predict_proba >= thresh).astype('int')
+            if scoring == 'acc':
+                score = accuracy_score(y_test,y_pred)
+            elif scoring == 'f1':
+                score = f1_score(y_test,y_pred)
+            elif scoring == 'f1w':
+                score = f1_score(y_test,y_pred,average='weighted')
+            elif scoring == 'fb':
+                score = fbeta_score(y_test,y_pred,beta=beta)
+            elif scoring == 'mcc':
+                score = matthews_coffcoeff(y_test,y_pred)
+            elif scoring == 'kappa':
+                score = cohen_kappa_score(y_test,y_pred)
+            elif scoring == 'conf':
+                fig = plt.figure(figsize=(3,3))
+                ax = fig.add_subplot(111)
+                ConfusionMatrixDisplay.from_predictions(y_test,y_pred,ax=ax,colorbar=False)
+                plt.show()
+            elif scoring == 'classif_report':
+                target_names=['neither seriously injured nor killed','seriously injured or killed']
+                print(classification_report(y_test, y_pred,target_names=target_names))
+            else:
+                raise ValueError("scoring must be one of 'roc_auc', 'roc_plot','acc', 'f1', 'f1w', 'fb','mcc','kappa','conf','classif_report'")
+        if scoring not in ['conf','roc_plot','classif_report']:
+            if print_score:
+                print(f'The {scoring} score is: {score}')
+            else:
+                return score
+    def shap_values(self, X_test = None, eval_size=0.1,eval_metric='auc'):
+        """
+        Method for computing and SHAP values for features
+        stratifiedtrain/test split
+        A copy of self.pipe is fitted on the training set
+        and then SHAP values are computed on test set samples
+        Parameters:
+        -----------
+        X_test : pd.DataFrame
+            The test set; if provided, will not perform
+            a train/test split before fitting
+        eval_size : float
+            Fraction of the training set to use for early stopping eval set
+        eval_metric : str
+            eval metric to use in early stopping
+        Returns: None (stores results in self.shap_vals)
+        --------
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert (self.X is not None)&(self.y is not None), 'X and/or y does not exist.  First supply X and y using set_data.'
+        assert 'clf' in self.pipe.steps[-1][0], 'The pipeline has no classifier.  Build a pipeline with a classifier first.'
+        # Clone pipeline, do train/test split if X_test not provided
+        pipe = clone(self.pipe)
+        X_train = self.X.copy()
+        y_train = self.y.copy()
+        if X_test is None:
+            X_train,X_test,y_train,y_test = train_test_split(X_train,y_train,stratify=y_train,
+                                                                 test_size=0.2,random_state=self.random_state)
+        # Restrict to features provided in self.features, and fit
+        X_train = X_train[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        X_test = X_test[[feat for feat_type in self.features for feat in self.features[feat_type]]]
+        # If LGBM early stopping, then need to split off eval_set and define fit_params
+        # if isinstance(self.pipe[-1],LGBMClassifier):
+        #     if 'early_stopping_round' in self.pipe[-1].get_params():
+        #         if self.pipe[-1].get_params()['early_stopping_rounds'] is not None:
+        #             X_train,X_es,y_train,y_es = train_test_split(X_train,y_train,
+        #                                                        test_size=eval_size,
+        #                                                        stratify=y_train,
+        #                                                        random_state=self.random_state)
+        #         trans_pipe = self.pipe[:-1]
+        #         trans_pipe.fit_transform(X_train)
+        #         X_es = trans_pipe.transform(X_es)
+        #         clf_name = self.pipe.steps[-1][0]
+        #         fit_params = {f'{clf_name}__eval_set':[(X_es,y_es)],
+        #                       f'{clf_name}__eval_metric':eval_metric,
+        #                      f'{clf_name}__verbose':0}
+        #     else:
+        #         fit_params = {}
+        # else:
+        #     fit_params = {}
+        fit_params = {}
+        pipe.fit(X_train,y_train,**fit_params)
+        # SHAP will just explain classifier, so need transformed X_train and X_test
+        X_train_trans, X_test_trans = pipe[:-1].transform(X_train), pipe[:-1].transform(X_test)
+        # Need masker for linear model
+        masker = shap.maskers.Independent(data=X_train_trans)
+        # Initialize explainer and compute and store SHAP values as an explainer object
+        explainer = shap.Explainer(pipe[-1], masker = masker, feature_names = pipe['col'].get_feature_names_out())
+        self.shap_vals = explainer(X_test_trans)
+        self.X_shap = X_train_trans
+        self.y_shap = y_train
+    def shap_plot(self,max_display='all'):
+        """
+        Method for generating plots of SHAP value results
+        SHAP values should be already computed previously
+        Generates two plots side by side:
+            - a beeswarm plot of SHAP values of all samples
+            - a barplot of mean absolute SHAP values
+        Parameters:
+        -----------
+        max_display : int or 'all'
+            The number of features to show in the plot, in descending
+            order by mean absolute SHAP value.  If 'all', then
+            all features will be included.
+        Returns: None (plots displayed)
+        --------
+        """
+        assert self.shap_vals is not None, 'No shap values exist.  First compute shap values.'
+        assert (isinstance(max_display,int))|(max_display=='all'), "'max_display' must be 'all' or an integer"
+        if max_display=='all':
+            title_add = ', all features'
+            max_display = self.shap_vals.shape[1]
+        else:
+            title_add = f', top {max_display} features'
+        # Plot
+        fig=plt.figure()
+        ax1=fig.add_subplot(121)
+        shap.summary_plot(self.shap_vals,plot_type='bar',max_display=max_display,
+                          show=False,plot_size=0.2)
+        ax2=fig.add_subplot(122)
+        shap.summary_plot(self.shap_vals,plot_type='violin',max_display=max_display,
+                          show=False,plot_size=0.2)
+        fig.set_size_inches(12,max_display/3)
+        ax1.set_title(f'Mean absolute SHAP values'+title_add,fontsize='small')
+        ax1.set_xlabel('mean(|SHAP value|)',fontsize='x-small')
+        ax2.set_title(f'SHAP values'+title_add,fontsize='small')
+        ax2.set_xlabel('SHAP value', fontsize='x-small')
+        for ax in [ax1,ax2]:
+            ax.set_ylabel('feature name',fontsize='x-small')
+            ax.tick_params(axis='y', labelsize='xx-small')
+        plt.tight_layout()
+        plt.show()
+    def find_best_threshold(self,beta=1,conf=True,report=True, print_result=True):
+        """
+        Computes the classification threshold which gives the
+        best F_beta score from classifier predictions,
+        prints the best threshold and the corresponding F_beta score,
+        and displays a confusion matrix and classification report
+        corresponding to that threshold
+        Parameters:
+        -----------
+        beta : float
+            the desired beta value in the F_beta score
+        conf : bool
+            whether to display confusion matrix
+        report : bool
+            whether to display classification report
+        print_result : bool
+            whether to print a line reporting the best threshold
+            and resulting F_beta score
+        Returns: None (prints results and stores self.best_thresh)
+        --------
+        """
+        prec,rec,threshs = precision_recall_curve(self.y_val,
+                                                  self.y_predict_proba)
+        F_betas = (1+beta**2)*(prec*rec)/((beta**2*prec)+rec)
+        # Above formula is valid when TP!=0.  When TP==0
+        # it gives np.nan whereas F_beta should be 0
+        F_betas = np.nan_to_num(F_betas)
+        idx = np.argmax(F_betas)
+        best_thresh = threshs[idx]
+        if print_result:
+            print(f'Threshold optimizing F_{beta} score:   {best_thresh}\nBest F_{beta} score:   {F_betas[idx]}')
+        if conf:
+            self.score_pipeline(scoring='conf',thresh=best_thresh,beta=beta)
+        if report:
+            self.score_pipeline(scoring='classif_report',thresh=best_thresh,beta=beta)
+        self.best_thresh = best_thresh
+class LRStudy(ClassifierStudy):
+    """
+    A child class of ClassifierStudy which has an additional method specific to logistic regression
+    """
+    def __init__(self, classifier=None, X = None, y = None,
+                 features=None,classifier_name = 'LR',
+                 random_state=42):
+        super().__init__(classifier, X, y,features,classifier_name,random_state)
+    def plot_coeff(self, print_score = True, print_zero = False, title_add=None):
+        """
+        Method for doing a train/validation split, fitting the classifier,
+        predicting and scoring on the validation set, and plotting
+        a bar chart of the logistic regression coefficients corresponding
+        to various model features.
+        Features with coefficient zero and periodic spline features
+        will be excluded from the chart.
+        Parameters:
+        -----------
+        print_score : bool
+            if True, the validation score are printed
+        print_zero : bool
+            if True, the list of features with zero coefficients are printed
+        title_add : str or None
+            an addendum that is added to the end of the plot title
+        """
+        assert self.pipe is not None, 'No pipeline exists; first build a pipeline using build_pipeline.'
+        assert isinstance(self.classifier,LogisticRegression),'Your classifier is not an instance of Logistic Regression.'
+        # fit and score
+        self.fit_pipeline(split_first = True)
+        self.predict_proba_pipeline()
+        score = roc_auc_score(self.y_val, self.y_predict_proba)
+        # Retrieve coeff values from fitted pipeline
+        coeff = pd.DataFrame({'feature name':self.pipe_fitted['col'].get_feature_names_out(),
+                               'coeff value':self.pipe_fitted[-1].coef_.reshape(-1)})\
+                            .sort_values(by='coeff value')
+        coeff = coeff[~coeff['feature name']\
+                .isin([f'HOUR_OF_DAY_sp_{n}' for n in range(12)]\
+                        +[f'DAY_OF_WEEK_sp_{n}' for n in range(3)])]\
+                .set_index('feature name')
+        coeff_zero_features = coeff[coeff['coeff value']==0].index
+        coeff = coeff[coeff['coeff value']!=0]
+        # Plot feature coefficients
+        fig = plt.figure(figsize=(30,4))
+        ax = fig.add_subplot(111)
+        coeff['coeff value'].plot(kind='bar',ylabel='coeff value',ax=ax)
+        ax.axhline(y=0, color= 'red', linewidth=2,)
+        plot_title = 'PA bicycle collisions, 2002-2021\nLogistic regression model log-odds coefficients'
+        if title_add is not None:
+            plot_title += f': {title_add}'
+        ax.set_title(plot_title)
+        ax.tick_params(axis='x', labelsize='x-small')
+        plt.show()
+        if print_score:
+            print(f'Score on validation set: {score}')
+        if print_zero:
+            print(f'Features with zero coefficients in trained model: {list(coeff_zero)}')
+        self.score = score
+        self.coeff = coeff
+        self.coeff_zero_features = coeff_zero_features

lib/transform_data.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import FunctionTransformer, SplineTransformer
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+class GroupImputer(BaseEstimator, TransformerMixin):
+    """
+    Class used for imputing missing values in a pd.DataFrame
+    using mean, median, or mode by groupwise aggregation,
+    or a constant.
+    Parameters:
+    -----------
+    target : str
+        - The name of the column to be imputed
+    group_cols : list
+        - List of name(s) of columns on which to groupby
+    strategy : str
+         - The method for replacement; can be any of
+          ['mean', 'median', 'mode']
+    Returns:
+    --------
+    X : pd.DataFrame
+        - The dataframe with imputed values in the target column
+    """
+    def __init__(self,target,group_cols=None,strategy='median'):
+        assert strategy in ['mean','median','mode'], "strategy must be in ['mean', 'median', 'mode']'"
+        assert type(group_cols)==list, 'group_cols must be a list of column names'
+        assert type(target) == str, 'target must be a string'
+        self.group_cols = group_cols
+        self.strategy=strategy
+        self.target = target
+    def fit(self,X,y=None):
+        if self.strategy=='mode':
+            impute_map = X.groupby(self.group_cols)[self.target]\
+                            .agg(lambda x: pd.Series.mode(x,dropna=False)[0])\
+                            .reset_index(drop=False)
+        else:
+            impute_map = X.groupby(self.group_cols)[self.target]\
+                        .agg(self.strategy).reset_index(drop=False)
+        self.impute_map_ = impute_map
+        return self
+    def transform(self,X,y=None):
+        check_is_fitted(self,'impute_map_')
+        X=X.copy()
+        for index,row in self.impute_map_.iterrows():
+            ind = (X[self.group_cols] == row[self.group_cols]).all(axis=1)
+            X.loc[ind,self.target] = X.loc[ind,self.target].fillna(row[self.target])
+        return X
+# Sine and cosine transformations
+def sin_feature_names(transformer, feature_names):
+    return [f'SIN_{col}' for col in feature_names]
+def cos_feature_names(transformer, feature_names):
+    return [f'COS_{col}' for col in feature_names]
+def sin_transformer(period):
+    return FunctionTransformer(lambda x: np.sin(2*np.pi*x/period),feature_names_out = sin_feature_names)
+def cos_transformer(period):
+    return FunctionTransformer(lambda x: np.cos(2*np.pi*x/period),feature_names_out = cos_feature_names)
+# Periodic spline transformation
+def periodic_spline_transformer(period, n_splines=None, degree=3):
+    if n_splines is None:
+        n_splines = period
+    n_knots = n_splines + 1  # periodic and include_bias is True
+    return SplineTransformer(
+        degree=degree,
+        n_knots=n_knots,
+        knots=np.linspace(0, period, n_knots).reshape(n_knots, 1),
+        extrapolation="periodic",
+        include_bias=True,
+    )

lib/vis_data.py ADDED Viewed

	@@ -0,0 +1,287 @@

+import pandas as pd
+import numpy as np
+import plotly.express as px
+from scipy import stats
+def plot_map(df,city=None,county=None,animate=True,color_dots=True,animate_by='year',show_fig=True,return_fig=False):
+    """
+    Displays a plotly.express.scatter_mapbox interactive map
+    of crashes in a municipality if specified, or otherwise
+    statewide.  Can be animated over time or static.
+    Parameters:
+    -----------
+    df : pd.DataFrame
+        dataframe of crash samples
+    city or county : tuple or None
+        if provided, must be a tuple (code,name)
+        - code : str
+            the code corresponding to the desired municipality/county
+            (see the data dictionary)
+        - name : str
+            the name you want to use for the municipality/county
+            in plot title
+        * At most one of these can be not None!
+    animate : bool
+        if animate==True, then the map will animate using
+        the frequency provided in animate_by
+    color_dots : bool
+        if color_dots==True, then dots will be color-coded by
+        'serious injury or death' status.
+        WARNING: if color_dots and animate, then all frames
+        will be missing samples in 'serious injury or death'
+        classes which aren't present in first frame - due to
+        bug in plotly animation_frame implementation.
+        Recommend only using both when geographic
+        area is statewide or at least has all values of
+        'serious injury or death' in first frame
+    animate_by : str
+        the desired animation frequency, must be
+        either 'year' or 'month'
+    show_fig : bool
+        whether to display figure using fig.show()
+    return_fig : bool
+        whether to return the figure object
+   Returns: Either figure or None
+   --------
+    """
+    assert (city is None)|(county is None), 'A city and county cannot both be provided.'
+    # Copy df and create new column for color coding event type
+    df = df.copy()
+    df.loc[df.BICYCLE_SUSP_SERIOUS_INJ_COUNT>0,'Serious cyclist injury or death']='serious injury'
+    df.loc[df.BICYCLE_DEATH_COUNT>0,'Serious cyclist injury or death']='death'
+    df['Serious cyclist injury or death']=df['Serious cyclist injury or death'].fillna('neither')
+    # Set animation parameters
+    if animate:
+        if animate_by == 'year':
+            animation_frame = 'CRASH_YEAR'
+            title_animate = ' by year'
+        elif animate_by == 'month':
+            df['DATE'] = pd.to_datetime((df['CRASH_MONTH'].astype('str')\
+                                         +'-'+df['CRASH_YEAR'].astype('str')),
+                                       format = "%m-%Y")
+            df=df.sort_values(by='DATE')
+            df['DATE']=df['DATE'].astype('str').apply(lambda x: x.rsplit('-',1)[0])
+            animation_frame = 'DATE'
+            title_animate = ' by month'
+        else:
+            raise ValueError("animate_by must be 'year' or 'month'")
+    else:
+        animation_frame = None
+        title_animate = ''
+    if color_dots:
+        color='Serious cyclist injury or death'
+    else:
+        color=None
+    # Adjustments for when city or county are provided
+    if city is not None:
+        df = df[df.MUNICIPALITY==city[0]]
+        # Ignore extreme outlier samples - lat,lon may be incorrect
+        df = df[np.abs(stats.zscore(df.DEC_LAT))<=4]
+        df = df[np.abs(stats.zscore(df.DEC_LONG))<=4]
+        title_place = city[1]+', PA'
+    elif county is not None:
+        df = df[df.COUNTY==county[0]]
+        # Ignore extreme outlier samples - lat,lon may be incorrect
+        df = df[np.abs(stats.zscore(df.DEC_LAT))<=4]
+        df = df[np.abs(stats.zscore(df.DEC_LONG))<=4]
+        title_place = county[1]+' county, PA'
+    else:
+        title_place = 'PA'
+    # Compute default zoom level based on lat,lon ranges.
+    # open-street-map uses
+    max_lat, min_lat = df.DEC_LAT.max(), df.DEC_LAT.min()
+    max_lon, min_lon = df.DEC_LONG.max(), df.DEC_LONG.min()
+    # 2^(zoom) = 360/(longitude width of 1 tile)
+    zoom = np.log2(360/max(max_lon-min_lon,max_lat-min_lat))
+    lat_center = (max_lat+min_lat)/2
+    lon_center = (max_lon+min_lon)/2
+    # Adjust width so that aspect ratio matches shape of state
+    width_mult = (max_lon-min_lon)/(max_lat-min_lat)
+    cols  = ['CRN','DEC_LAT','DEC_LONG','Serious cyclist injury or death','CRASH_YEAR','CRASH_MONTH']
+    if animate_by=='month':
+        cols.append('DATE')
+    # Plot mapbox
+    fig = px.scatter_mapbox(df, lat='DEC_LAT',lon='DEC_LONG',
+                            color=color,
+                            color_discrete_map={'neither':'royalblue','serious injury':'orange','death':'crimson'},
+                            mapbox_style='open-street-map',
+                            animation_frame = animation_frame,
+                            animation_group='CRN',
+                            hover_data = {'DEC_LAT':False,'DEC_LONG':False,
+                                         'CRASH_YEAR':True,'CRASH_MONTH':True,
+                                         'Serious cyclist injury or death':True},
+                            width = width_mult*500,height=700,zoom=zoom,
+                            center={'lat':lat_center,'lon':lon_center},
+                            title=f'Crashes involving bicycles{title_animate}<br> in {title_place}, 2002-2021')
+    fig.update_layout(legend=dict(orientation='h',xanchor='right',yanchor='bottom',x=1,y=-0.12),
+                     legend_title_side='top')
+    if show_fig:
+        fig.show()
+    if return_fig:
+        return fig
+def feat_perc(feat, df, col_name = 'percentage', feat_name = None):
+    """
+    Constructs a single-column dataframe 'perc'
+    containing the value counts in the series
+    df[feat] as percentages of the whole.
+    - 'df' is the input dataframe.
+    - 'feat' is the desired column of df.
+    - 'col_name' is the name of the
+    column of the output dataframe
+    - 'feat_name' is the index name
+    of the output dataframe if provided, otherwise
+    will use 'feat' as index name.
+    """
+    perc = pd.DataFrame({col_name:df[feat].value_counts(normalize=True).sort_index()})
+    if feat_name:
+        perc.index.name=feat_name
+    else:
+        perc.index.name=feat
+    return perc
+def feat_perc_bar(feat,df,feat_name=None,cohort_name=None,show_fig=True,return_fig=False,sort=False):
+    """
+    Makes barplot of two series:
+        - distribution of feature among all cyclists
+        - distribution of feature among cyclists with serious injury or fatality
+    Parameters:
+    -----------
+    feat : str
+        The column name of the desired feature
+    df : pd.DataFrame
+        The input dataframe
+    feat_name : str or None
+        The feature name to use in the
+        x-axis label.  If None, will use feat
+    cohort_name : str or None
+        qualifier to use in front of 'cyclists'
+        in titles, if provided, e.g. 'rural cyclists'
+    show_fig : bool
+        whether to finish with fig.show()
+    return_fig : bool
+        whether to return the fig object
+    sort : bool
+        whether to sort bars. If False, will use default sorting
+        by category name or feature value.  If True, will resort
+        in descending order by percentage
+    Returns: figure or None
+    --------
+    """
+    if feat_name is None:
+        feat_name=feat
+    df_inj = df.query('SERIOUS_OR_FATALITY==1')
+    table = feat_perc(feat,df)
+    table.loc[:,'cohort']='all'
+    ordering = list(table['percentage'].sort_values(ascending=False).index) if sort else None
+    table_inj = feat_perc(feat,df_inj)
+    table_inj.loc[:,'cohort']='seriously injured or killed'
+    table = pd.concat([table,table_inj],axis=0).reset_index()
+    category_orders = {'cohort':['all','seriously injured or killed']}
+    if sort:
+        category_orders[feat]=ordering
+    fig = px.bar(table,y='cohort',x='percentage',color=feat,
+                 barmode='stack',text_auto='.1%',
+                category_orders=category_orders,
+                title=f'Distributions of {feat} values within cyclist cohorts')
+    fig.update_yaxes(tickangle=-90)
+    fig.update_xaxes(tickformat=".0%")
+    if show_fig:
+        fig.show()
+    if return_fig:
+        return fig
+# def feat_perc_comp(feat,df,feat_name=None,cohort_name = None,merge_inj_death=True):
+#     """
+#     Returns a styled dataframe (Styler object)
+#     whose underlying dataframe has three columns
+#     containing value counts of 'feat' among:
+#     - all cyclists involved in crashes
+#     - cyclists suffering serious injury or fatality
+#     each formatted as percentages of the series sum.
+#     Styled with bars comparing percentages
+#     Parameters:
+#     -----------
+#     feat : str
+#         The column name of the desired feature
+#     df : pd.DataFrame
+#         The input dataframe
+#     feat_name : str or None
+#         The feature name to use in the output dataframe
+#         index name.  If None, will use feat
+#     cohort_name : str or None
+#         qualifier to use in front of 'cyclists'
+#         in titles, if provided, e.g. 'rural cyclists'
+#     merge_inj_death : bool
+#         whether to merge seriously injured and killed cohorts
+#     Returns:
+#     --------
+#     perc_comp : pd.Styler object
+#     """
+#     # Need qualifier for titles if restricting cyclist cohort
+#     qualifier = cohort_name if cohort_name is not None else ''
+#     # Two columns or three, depending on merge_inj_death
+#     if merge_inj_death:
+#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
+#                          col_name='all cyclists',)\
+#                 .merge(feat_perc(feat,feat_name=feat_name,
+#                                  df=df.query('SERIOUS_OR_FATALITY==1'),
+#                                  col_name=qualifier+'cyclists with serious injury or fatality'),
+#                       on=feat,how='left')
+#         perc_comp = perc_comp[perc_comp.max(axis=1)>=0.005]
+#     else:
+#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
+#                          col_name='all cyclists')\
+#                 .merge(feat_perc(feat,feat_name=feat_name,
+#                                  df=df.query('INJ_SEVERITY=="susp_serious_injury"'),
+#                                  col_name=qualifier+'cyclists with serious injury'),
+#                       on=feat,how='left')\
+#                 .merge(feat_perc(feat,feat_name=feat_name,
+#                                  df=df.query('INJ_SEVERITY=="killed"'),
+#                                  col_name=qualifier+'cyclists with fatality'),
+#                       on=feat,how='left')
+#     # If feature is not ordinal, sort rows descending by crash counts
+#     if feat not in ['AGE_BINS','SPEED_LIMIT','DAY_OF_WEEK','HOUR_OF_DAY']:
+#         perc_comp=perc_comp.sort_values(by='all cyclists',ascending=False)
+#     # Relabel day numbers with strings
+#     if feat == 'DAY_OF_WEEK':
+#         perc_comp.index=['Sun','Mon','Tues','Wed','Thurs','Fri','Sat']
+#         perc_comp.index.name='DAY_OF_WEEK'
+#     perc_comp=perc_comp.fillna(0)
+#     table_columns = list(perc_comp.columns)
+#     # Define format for displaying floats
+#     format_dict={col:'{:.2%}' for col in perc_comp.columns}
+#     # Define table styles
+#     styles = [dict(selector="caption",
+#                    props=[("text-align", "center"),
+#                           ("font-size", "100%"),
+#                           ("color", 'black'),
+#                           ("text-decoration","underline"),
+#                           ("font-weight","bold")])]
+#     # Return formatted dataframe
+#     if feat_name is None:
+#         feat_name=feat
+#     caption = f'Breakdown of {feat_name} among cyclist groups'
+#     return perc_comp.reset_index().style.set_table_attributes("style='display:inline'")\
+#                                     .format(format_dict).bar(color='powderblue',
+#                                     subset=table_columns).hide().set_caption(caption)\
+#                                     .set_table_styles(styles)

requirements.txt ADDED Viewed

	@@ -0,0 +1,77 @@

+altair==5.0.1
+appnope==0.1.3
+asttokens==2.2.1
+attrs==23.1.0
+backcall==0.2.0
+blinker==1.6.2
+cachetools==5.3.1
+certifi==2023.5.7
+charset-normalizer==3.1.0
+click==8.1.3
+cloudpickle==2.2.1
+contourpy==1.1.0
+cycler==0.11.0
+decorator==5.1.1
+executing==1.2.0
+fonttools==4.40.0
+gitdb==4.0.10
+GitPython==3.1.31
+importlib-metadata==6.7.0
+ipython==8.14.0
+jedi==0.18.2
+Jinja2==3.1.2
+joblib==1.2.0
+jsonschema==4.17.3
+kiwisolver==1.4.4
+lightgbm==4.0.0
+llvmlite==0.40.1
+markdown-it-py==3.0.0
+MarkupSafe==2.1.3
+matplotlib==3.7.1
+matplotlib-inline==0.1.6
+mdurl==0.1.2
+numba==0.57.1
+numpy==1.24.1
+pandas==2.0.2
+parso==0.8.3
+pexpect==4.8.0
+pickleshare==0.7.5
+Pillow==9.5.0
+plotly==5.15.0
+prompt-toolkit==3.0.38
+protobuf==4.23.3
+ptyprocess==0.7.0
+pure-eval==0.2.2
+pyarrow==12.0.1
+pydeck==0.8.1b0
+Pygments==2.15.1
+Pympler==1.0.1
+pyparsing==3.1.0
+pyrsistent==0.19.3
+python-dateutil==2.8.2
+pytz==2023.3
+pytz-deprecation-shim==0.1.0.post0
+rich==13.4.2
+scikit-learn==1.2.2
+scipy==1.10.1
+seaborn==0.12.2
+shap==0.42.1
+six==1.16.0
+slicer==0.0.7
+smmap==5.0.0
+stack-data==0.6.2
+streamlit==1.23.1
+tenacity==8.2.2
+threadpoolctl==3.1.0
+toml==0.10.2
+toolz==0.12.0
+tornado==6.3.2
+tqdm==4.65.0
+traitlets==5.9.0
+typing_extensions==4.6.3
+tzdata==2023.3
+tzlocal==4.3.1
+urllib3==2.0.3
+validators==0.20.0
+wcwidth==0.2.6
+zipp==3.15.0

study.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b1f563b55025766dc82eb2f00b0e4ae468caa7368f87dd309be9d8047eace6d9
+size 9935040