import pandas as pd
import numpy as np
import plotly.express as px
from scipy import stats

# Name of the column plot_map adds to colour-code each crash by outcome.
_SEVERITY_COL = 'Serious cyclist injury or death'

# Zoom level used when every plotted crash shares one location, so the
# lat/lon extent gives no information to derive a zoom from (arbitrary choice).
_FALLBACK_ZOOM = 12


def _drop_coordinate_outliers(df, max_z=4):
    """
    Drop rows whose DEC_LAT or DEC_LONG lies more than max_z standard
    deviations from the mean - such lat,lon values may be incorrect.

    A column without spread (single row, constant value, no valid values)
    is left unfiltered: its z-scores are undefined (NaN) and comparing them
    to the threshold would otherwise discard every row.
    """
    for col in ('DEC_LAT', 'DEC_LONG'):
        # Population std (ddof=0) matches the scipy.stats.zscore default
        spread = df[col].std(ddof=0)
        if not spread > 0:
            continue
        # nan_policy='omit': missing coordinates don't poison the other
        # z-scores; rows with a missing coordinate get NaN and are dropped.
        z = np.abs(stats.zscore(df[col].to_numpy(dtype=float), nan_policy='omit'))
        df = df[z <= max_z]
    return df


def _map_view(df):
    """
    Compute the default map view from the lat,lon extent of df.

    Returns a tuple (zoom, center, width_mult) where center is a dict with
    keys 'lat' and 'lon' and width_mult is the lon/lat aspect ratio of the
    plotted region.
    """
    max_lat, min_lat = df.DEC_LAT.max(), df.DEC_LAT.min()
    max_lon, min_lon = df.DEC_LONG.max(), df.DEC_LONG.min()
    lat_span, lon_span = max_lat - min_lat, max_lon - min_lon

    # 2^(zoom) = 360/(longitude width of 1 tile)
    widest = max(lon_span, lat_span)
    zoom = np.log2(360 / widest) if widest > 0 else _FALLBACK_ZOOM

    # Adjust width so that aspect ratio matches shape of the region; a
    # degenerate extent (zero span) falls back to a neutral ratio instead of
    # producing a zero or infinite figure width.
    width_mult = lon_span / lat_span if min(lon_span, lat_span) > 0 else 1.0

    center = {'lat': (max_lat + min_lat) / 2, 'lon': (max_lon + min_lon) / 2}
    return zoom, center, width_mult


def plot_map(df, city=None, county=None, animate=True, color_dots=True,
             animate_by='year', show_fig=True, return_fig=False):
    """
    Displays a plotly.express.scatter_mapbox interactive map
    of crashes in a municipality or county if specified, or
    otherwise statewide.  Can be animated over time or static.

    Parameters:
    -----------
    df : pd.DataFrame
        dataframe of crash samples.  The columns read here are
        BICYCLE_SUSP_SERIOUS_INJ_COUNT, BICYCLE_DEATH_COUNT, DEC_LAT,
        DEC_LONG, CRN, CRASH_YEAR, CRASH_MONTH, and MUNICIPALITY / COUNTY
        when city / county is given.  The input is copied, not modified.
    city or county : tuple or None
        if provided, must be a tuple (code,name)
        - code : str
            the code corresponding to the desired
            municipality/county (see the data dictionary)
        - name : str
            the name you want to use for the
            municipality/county in plot title
        * At most one of these can be not None!
    animate : bool
        if animate==True, then the map will animate using
        the frequency provided in animate_by
    color_dots : bool
        if color_dots==True, then dots will be color-coded by
        'serious injury or death' status.
        WARNING: if color_dots and animate, then all frames
        will be missing samples in 'serious injury or death'
        classes which aren't present in first frame - due to
        bug in plotly animation_frame implementation.
        Recommend only using both when geographic area is
        statewide or at least has all values of
        'serious injury or death' in first frame
    animate_by : str
        the desired animation frequency, must be
        either 'year' or 'month' (only checked when animate is True)
    show_fig : bool
        whether to display figure using fig.show()
    return_fig : bool
        whether to return the figure object

    Returns: Either figure or None
    --------

    Raises:
    -------
    ValueError
        if both city and county are given, if animate is True and
        animate_by is not 'year' or 'month', or if no samples with
        coordinates remain to be plotted.
    """
    if city is not None and county is not None:
        raise ValueError('A city and county cannot both be provided.')

    # Copy df and create new column for color coding event type.
    # 'death' is assigned last so it takes precedence over 'serious injury'.
    df = df.copy()
    df.loc[df.BICYCLE_SUSP_SERIOUS_INJ_COUNT > 0, _SEVERITY_COL] = 'serious injury'
    df.loc[df.BICYCLE_DEATH_COUNT > 0, _SEVERITY_COL] = 'death'
    df[_SEVERITY_COL] = df[_SEVERITY_COL].fillna('neither')

    # Set animation parameters.  Rows are sorted chronologically because
    # animation frames are generated in order of appearance in the data.
    if not animate:
        animation_frame = None
        title_animate = ''
    elif animate_by == 'year':
        df = df.sort_values(by='CRASH_YEAR', kind='stable')
        animation_frame = 'CRASH_YEAR'
        title_animate = ' by year'
    elif animate_by == 'month':
        df['DATE'] = pd.to_datetime(
            df['CRASH_MONTH'].astype('str') + '-' + df['CRASH_YEAR'].astype('str'),
            format="%m-%Y",
        )
        df = df.sort_values(by='DATE')
        # Frame labels of the form 'YYYY-MM'
        df['DATE'] = df['DATE'].dt.strftime('%Y-%m')
        animation_frame = 'DATE'
        title_animate = ' by month'
    else:
        raise ValueError("animate_by must be 'year' or 'month'")

    color = _SEVERITY_COL if color_dots else None

    # Adjustments for when city or county are provided
    if city is not None:
        df = _drop_coordinate_outliers(df[df.MUNICIPALITY == city[0]])
        title_place = city[1] + ', PA'
    elif county is not None:
        df = _drop_coordinate_outliers(df[df.COUNTY == county[0]])
        title_place = county[1] + ' county, PA'
    else:
        title_place = 'PA'

    if df[['DEC_LAT', 'DEC_LONG']].dropna().empty:
        raise ValueError(f'No crash samples with coordinates to plot in {title_place}.')

    # Compute default zoom level, center and aspect ratio
    # based on lat,lon ranges.
    zoom, center, width_mult = _map_view(df)

    # Plot mapbox
    # NOTE(review): '<br>' in the title is plotly's line break; the source
    # string was split across lines at this point - confirm intended markup.
    fig = px.scatter_mapbox(
        df, lat='DEC_LAT', lon='DEC_LONG',
        color=color,
        color_discrete_map={'neither': 'royalblue', 'serious injury': 'orange', 'death': 'crimson'},
        mapbox_style='open-street-map',
        animation_frame=animation_frame,
        animation_group='CRN',
        hover_data={'DEC_LAT': False, 'DEC_LONG': False,
                    'CRASH_YEAR': True, 'CRASH_MONTH': True,
                    _SEVERITY_COL: True},
        width=width_mult * 500, height=700, zoom=zoom,
        center=center,
        title=f'Crashes involving bicycles{title_animate}<br> in {title_place}, 2002-2021',
    )
    fig.update_layout(
        legend=dict(orientation='h', xanchor='right', yanchor='bottom', x=1, y=-0.12),
        legend_title_side='top',
    )
    if show_fig:
        fig.show()
    if return_fig:
        return fig


def feat_perc(feat, df, col_name='percentage', feat_name=None):
    """
    Constructs a single-column dataframe 'perc'
    containing the value counts in the series
    df[feat] as fractions of the whole (summing to 1),
    sorted by index.

    - 'feat' is the desired column of df.
    - 'df' is the input dataframe.
    - 'col_name' is the name of the column of the
    output dataframe
    - 'feat_name' is the index name of the output
    dataframe if provided, otherwise will use 'feat'
    as index name.
    """
    perc = pd.DataFrame({col_name: df[feat].value_counts(normalize=True).sort_index()})
    perc.index.name = feat_name if feat_name else feat
    return perc


def feat_perc_bar(feat, df, feat_name=None, cohort_name=None, show_fig=True, return_fig=False, sort=False):
    """
    Makes a stacked barplot comparing two distributions:
    - distribution of feature among all cyclists
    - distribution of feature among cyclists
    with serious injury or fatality (rows with SERIOUS_OR_FATALITY==1)

    Parameters:
    -----------
    feat : str
        The column name of the desired feature
    df : pd.DataFrame
        The input dataframe
    feat_name : str or None
        The feature name to display in the title and
        legend.  If None, will use feat
    cohort_name : str or None
        qualifier to use in front of 'cyclist'
        in the title, if provided, e.g. 'rural'
    show_fig : bool
        whether to finish with fig.show()
    return_fig : bool
        whether to return the fig object
    sort : bool
        whether to sort bars. If False, will use default sorting
        by category name or feature value. If True, will resort
        in descending order by percentage among all cyclists

    Returns: figure or None
    --------
    """
    if feat_name is None:
        feat_name = feat
    qualifier = f'{cohort_name.strip()} ' if cohort_name else ''

    # Distribution among all cyclists; its ranking defines the bar order
    table = feat_perc(feat, df)
    table['cohort'] = 'all'
    ordering = list(table['percentage'].sort_values(ascending=False).index) if sort else None

    # Distribution among seriously injured or killed cyclists
    table_inj = feat_perc(feat, df.query('SERIOUS_OR_FATALITY==1'))
    table_inj['cohort'] = 'seriously injured or killed'

    table = pd.concat([table, table_inj], axis=0).reset_index()
    category_orders = {'cohort': ['all', 'seriously injured or killed']}
    if sort:
        category_orders[feat] = ordering

    fig = px.bar(
        table, y='cohort', x='percentage', color=feat,
        barmode='stack', text_auto='.1%',
        category_orders=category_orders,
        labels={feat: feat_name},
        title=f'Distributions of {feat_name} values within {qualifier}cyclist cohorts',
    )
    fig.update_yaxes(tickangle=-90)
    fig.update_xaxes(tickformat=".0%")
    if show_fig:
        fig.show()
    if return_fig:
        return fig


# NOTE: disabled Styler-based table variant of feat_perc_bar, kept for reference.
# def feat_perc_comp(feat,df,feat_name=None,cohort_name = None,merge_inj_death=True):
#     """
#     Returns a styled dataframe (Styler object)
#     whose underlying dataframe has three columns
#     containing value counts of 'feat' among:
#     - all cyclists involved in crashes
#     - cyclists suffering serious injury or fatality
#     each formatted as percentages of the series sum.
#     Styled with bars comparing percentages
#     Parameters:
#     -----------
#     feat : str
#         The column name of the desired feature
#     df : pd.DataFrame
#         The input dataframe
#     feat_name : str or None
#         The feature name to use in the output dataframe
#         index name.  If None, will use feat
#     cohort_name : str or None
#         qualifier to use in front of 'cyclists'
#         in titles, if provided, e.g. 'rural cyclists'
#     merge_inj_death : bool
#         whether to merge seriously injured and killed cohorts
#     Returns:
#     --------
#     perc_comp : pd.Styler object
#     """
#     # Need qualifier for titles if restricting cyclist cohort
#     qualifier = cohort_name if cohort_name is not None else ''
#     # Two columns or three, depending on merge_inj_death
#     if merge_inj_death:
#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
#                               col_name='all cyclists',)\
#             .merge(feat_perc(feat,feat_name=feat_name,
#                              df=df.query('SERIOUS_OR_FATALITY==1'),
#                              col_name=qualifier+'cyclists with serious injury or fatality'),
#                    on=feat,how='left')
#         perc_comp = perc_comp[perc_comp.max(axis=1)>=0.005]
#     else:
#         perc_comp = feat_perc(feat,df=df,feat_name=feat_name,
#                               col_name='all cyclists')\
#             .merge(feat_perc(feat,feat_name=feat_name,
#                              df=df.query('INJ_SEVERITY=="susp_serious_injury"'),
#                              col_name=qualifier+'cyclists with serious injury'),
#                    on=feat,how='left')\
#             .merge(feat_perc(feat,feat_name=feat_name,
#                              df=df.query('INJ_SEVERITY=="killed"'),
#                              col_name=qualifier+'cyclists with fatality'),
#                    on=feat,how='left')
#     # If feature is not ordinal, sort rows descending by crash counts
#     if feat not in ['AGE_BINS','SPEED_LIMIT','DAY_OF_WEEK','HOUR_OF_DAY']:
#         perc_comp=perc_comp.sort_values(by='all cyclists',ascending=False)
#     # Relabel day numbers with strings
#     if feat == 'DAY_OF_WEEK':
#         perc_comp.index=['Sun','Mon','Tues','Wed','Thurs','Fri','Sat']
#     perc_comp.index.name='DAY_OF_WEEK'
#     perc_comp=perc_comp.fillna(0)
#     table_columns = list(perc_comp.columns)
#     # Define format for displaying floats
#     format_dict={col:'{:.2%}' for col in perc_comp.columns}
#     # Define table styles
#     styles = [dict(selector="caption",
#                    props=[("text-align", "center"),
#                           ("font-size", "100%"),
#                           ("color", 'black'),
#                           ("text-decoration","underline"),
#                           ("font-weight","bold")])]
#     # Return formatted dataframe
#     if feat_name is None:
#         feat_name=feat
#     caption = f'Breakdown of {feat_name} among cyclist groups'
#     return perc_comp.reset_index().style.set_table_attributes("style='display:inline'")\
#         .format(format_dict).bar(color='powderblue',
#                                  subset=table_columns).hide().set_caption(caption)\
#         .set_table_styles(styles)