BikeSaferPA / lib /.ipynb_checkpoints /get_data-checkpoint.py
etweedy's picture
Upload 22 files
5d396e9
raw
history blame
3.94 kB
from pathlib import Path

import pandas as pd
def _read_raw_table(table, year, data_dir):
    """Read the raw CSV ``<data_dir>/<table>_<year>_Statewide.csv``."""
    csv_path = Path(data_dir) / f'{table}_{year}_Statewide.csv'
    return pd.read_csv(csv_path, encoding='latin')


def _rows_for_crns(frame, crns, cols):
    """Return a copy of the rows of ``frame`` whose CRN is in ``crns``.

    Only the columns listed in ``cols`` are kept.
    """
    # A single .loc selection followed by .copy() yields an independent
    # frame, so callers can modify it without SettingWithCopyWarning.
    return frame.loc[frame['CRN'].isin(crns), cols].copy()


def extract_data(year, data_dir='data/raw_csv'):
    """
    Load the raw crash tables for one year and keep the bicycle-related rows.

    Six CSV files named ``<TABLE>_<year>_Statewide.csv`` (VEHICLE, CYCLE,
    PERSON, CRASH, ROADWAY and FLAG) are read from ``data_dir``.

    Parameters
    ----------
    year :
        Year inserted into each CSV file name.
    data_dir : str or path-like, optional
        Directory containing the raw CSV files. Defaults to
        ``'data/raw_csv'`` (relative to the current working directory).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(bicycles, persons, crashes, roadway)`` where
        - 'bicycles': samples are bicycle vehicles which were
          involved in crashes, with extra columns from the CYCLE table.
        - 'persons': samples are all individuals involved in
          crashes involving bicycles.
        - 'crashes': samples are crash events involving bicycles,
          with flag features from the FLAG table merged on.
        - 'roadway': additional features for crash events,
          related to roadway attributes and conditions.

    Raises
    ------
    FileNotFoundError
        If one of the expected CSV files is missing from ``data_dir``.
    KeyError
        If one of the expected columns is missing from a table.
    """
    # Retrieve vehicle samples corresponding to bicycles.
    # Note that in some samples VEH_TYPE is string, others float,
    # so both representations of the type codes are matched.
    vehicles = _read_raw_table('VEHICLE', year, data_dir)
    is_bicycle = vehicles['VEH_TYPE'].isin([20, 21, '20', '21'])
    bicycle_cols = [
        'CRN', 'GRADE', 'IMPACT_POINT',
        'RDWY_ALIGNMENT', 'UNIT_NUM',
        'VEH_MOVEMENT', 'VEH_POSITION', 'VEH_ROLE', 'VEH_TYPE',
    ]
    bicycles = vehicles.loc[is_bicycle, bicycle_cols].copy()
    del vehicles  # release the full table before loading the next one

    # Merge onto bicycles dataframe some additional features from cycle.
    # A left merge keeps bicycles that have no matching CYCLE row.
    cycles = _read_raw_table('CYCLE', year, data_dir)
    cycle_cols = [
        'CRN', 'UNIT_NUM', 'PC_HDLGHT_IND', 'PC_HLMT_IND', 'PC_REAR_RFLTR_IND',
    ]
    bicycles = bicycles.merge(
        cycles[cycle_cols], how='left', on=['CRN', 'UNIT_NUM'],
    )
    del cycles

    # Crash numbers of every crash that involved at least one bicycle;
    # computed once and reused to filter the remaining tables.
    bicycle_crns = bicycles['CRN'].unique()

    # Retrieve information about persons involved in crashes involving bikes
    # (not just the persons riding the bikes)
    person_cols = [
        'AGE', 'CRN', 'INJ_SEVERITY', 'PERSON_TYPE',
        'RESTRAINT_HELMET', 'SEX', 'TRANSPORTED', 'UNIT_NUM',
    ]
    persons = _rows_for_crns(
        _read_raw_table('PERSON', year, data_dir), bicycle_crns, person_cols,
    )

    # Retrieve crash samples involving bikes
    crash_cols = [
        'CRN', 'ARRIVAL_TM', 'DISPATCH_TM', 'COUNTY', 'MUNICIPALITY',
        'DEC_LAT', 'DEC_LONG',
        'BICYCLE_DEATH_COUNT', 'BICYCLE_SUSP_SERIOUS_INJ_COUNT',
        'BUS_COUNT', 'COMM_VEH_COUNT', 'HEAVY_TRUCK_COUNT',
        'SMALL_TRUCK_COUNT', 'SUV_COUNT', 'VAN_COUNT',
        'CRASH_MONTH', 'CRASH_YEAR', 'DAY_OF_WEEK', 'HOUR_OF_DAY',
        'COLLISION_TYPE', 'ILLUMINATION', 'INTERSECT_TYPE',
        'LOCATION_TYPE', 'RELATION_TO_ROAD', 'TIME_OF_DAY',
        'ROAD_CONDITION', 'TCD_TYPE', 'TCD_FUNC_CD', 'URBAN_RURAL',
        'WEATHER1', 'WEATHER2',
    ]
    crashes = _rows_for_crns(
        _read_raw_table('CRASH', year, data_dir), bicycle_crns, crash_cols,
    )

    # Retrieve roadway data involving bikes
    roadway_cols = ['CRN', 'SPEED_LIMIT', 'RDWY_COUNTY']
    roadway = _rows_for_crns(
        _read_raw_table('ROADWAY', year, data_dir), bicycle_crns, roadway_cols,
    )

    # Merge onto the crashes dataframe some additional flag features.
    # Include flag features corresponding to driver impairment,
    # driver inattention, other driver attributes, relevant road
    # conditions, etc.
    flags = _read_raw_table('FLAG', year, data_dir)
    flag_cols = [
        'AGGRESSIVE_DRIVING', 'ALCOHOL_RELATED', 'ANGLE_CRASH', 'CELL_PHONE',
        'COMM_VEHICLE', 'CRN', 'CROSS_MEDIAN', 'CURVED_ROAD',
        'CURVE_DVR_ERROR', 'DISTRACTED', 'DRINKING_DRIVER',
        'DRUGGED_DRIVER', 'DRUG_RELATED', 'FATIGUE_ASLEEP',
        'HO_OPPDIR_SDSWP', 'ICY_ROAD',
        'ILLUMINATION_DARK', 'IMPAIRED_DRIVER', 'INTERSECTION',
        'LANE_DEPARTURE', 'NHTSA_AGG_DRIVING', 'NO_CLEARANCE',
        'NON_INTERSECTION', 'REAR_END', 'RUNNING_RED_LT', 'RUNNING_STOP_SIGN',
        'RURAL', 'SNOW_SLUSH_ROAD', 'SPEEDING', 'SPEEDING_RELATED',
        'SUDDEN_DEER', 'TAILGATING', 'URBAN', 'WET_ROAD', 'WORK_ZONE',
        'MATURE_DRIVER', 'YOUNG_DRIVER',
    ]
    crashes = crashes.merge(flags[flag_cols], how='left', on='CRN')
    del flags

    return bicycles, persons, crashes, roadway