Spaces:
Runtime error
Runtime error
import pandas as pd | |
def _read_statewide_csv(table, year, data_dir):
    """Load the raw '<table>_<year>_Statewide.csv' file found in data_dir."""
    # Function-scope import so this block does not depend on file-level imports.
    from pathlib import Path

    return pd.read_csv(Path(data_dir) / f'{table}_{year}_Statewide.csv',
                       encoding='latin')


def extract_data(year, data_dir='data/raw_csv'):
    """
    Load one year of raw crash CSV files and keep only bicycle-related rows.

    Six files named '<TABLE>_<year>_Statewide.csv' (VEHICLE, CYCLE, PERSON,
    CRASH, ROADWAY, FLAG) are read from data_dir with 'latin' encoding.

    Parameters
    ----------
    year :
        Value interpolated into the file names (e.g. 2020).
    data_dir : str or path-like, optional
        Directory holding the raw CSV files. Defaults to 'data/raw_csv',
        resolved relative to the current working directory.

    Returns
    -------
    tuple of four DataFrames (bicycles, persons, crashes, roadway):
        - 'bicycles': vehicle rows whose VEH_TYPE is 20 or 21, with the
          PC_* indicator columns from the CYCLE file left-merged on
          ['CRN', 'UNIT_NUM'].
        - 'persons': person rows whose CRN appears in 'bicycles'
          (all persons in those crashes, not just the riders).
        - 'crashes': crash rows whose CRN appears in 'bicycles', with the
          selected FLAG columns left-merged on 'CRN'.
        - 'roadway': roadway rows whose CRN appears in 'bicycles'.

    Raises
    ------
    FileNotFoundError
        If one of the expected CSV files is missing from data_dir.
    KeyError
        If a file lacks one of the columns selected below.
    """
    # Retrieve vehicle samples corresponding to bicycles.
    # VEH_TYPE is a string in some samples and a float in others, so
    # both the numeric and the string spellings are matched.
    vehicles = _read_statewide_csv('VEHICLE', year, data_dir)
    vehicle_cols = ['CRN', 'GRADE', 'IMPACT_POINT',
                    'RDWY_ALIGNMENT', 'UNIT_NUM',
                    'VEH_MOVEMENT', 'VEH_POSITION', 'VEH_ROLE', 'VEH_TYPE']
    is_bicycle = vehicles.VEH_TYPE.isin([20, 21, '20', '21'])
    bicycles = vehicles.loc[is_bicycle, vehicle_cols]
    del vehicles  # release the full statewide table before the next read

    # Merge onto the bicycles dataframe some additional features from CYCLE.
    cycles = _read_statewide_csv('CYCLE', year, data_dir)
    cycle_cols = ['CRN', 'UNIT_NUM', 'PC_HDLGHT_IND', 'PC_HLMT_IND',
                  'PC_REAR_RFLTR_IND']
    bicycles = bicycles.merge(cycles[cycle_cols], how='left',
                              on=['CRN', 'UNIT_NUM'])
    del cycles

    # Crash record numbers of every crash that involved a bicycle; used to
    # filter the remaining tables.
    bike_crns = bicycles.CRN

    # Retrieve information about persons involved in crashes involving bikes
    # (not just the persons riding the bikes).
    persons = _read_statewide_csv('PERSON', year, data_dir)
    person_cols = ['AGE', 'CRN', 'INJ_SEVERITY', 'PERSON_TYPE',
                   'RESTRAINT_HELMET', 'SEX', 'TRANSPORTED', 'UNIT_NUM']
    persons = persons.loc[persons.CRN.isin(bike_crns), person_cols]

    # Retrieve crash samples involving bikes.
    crashes = _read_statewide_csv('CRASH', year, data_dir)
    crash_cols = ['CRN', 'ARRIVAL_TM', 'DISPATCH_TM', 'COUNTY', 'MUNICIPALITY',
                  'DEC_LAT', 'DEC_LONG',
                  'BICYCLE_DEATH_COUNT', 'BICYCLE_SUSP_SERIOUS_INJ_COUNT',
                  'BUS_COUNT', 'COMM_VEH_COUNT', 'HEAVY_TRUCK_COUNT',
                  'SMALL_TRUCK_COUNT', 'SUV_COUNT', 'VAN_COUNT',
                  'CRASH_MONTH', 'CRASH_YEAR', 'DAY_OF_WEEK', 'HOUR_OF_DAY',
                  'COLLISION_TYPE', 'ILLUMINATION', 'INTERSECT_TYPE',
                  'LOCATION_TYPE', 'RELATION_TO_ROAD', 'TIME_OF_DAY',
                  'ROAD_CONDITION', 'TCD_TYPE', 'TCD_FUNC_CD', 'URBAN_RURAL',
                  'WEATHER1', 'WEATHER2']
    crashes = crashes.loc[crashes.CRN.isin(bike_crns), crash_cols]

    # Retrieve roadway data for crashes involving bikes.
    roadway = _read_statewide_csv('ROADWAY', year, data_dir)
    roadway_cols = ['CRN', 'SPEED_LIMIT', 'RDWY_COUNTY']
    roadway = roadway.loc[roadway.CRN.isin(bike_crns), roadway_cols]

    # Merge onto the crashes dataframe some additional flag features:
    # driver impairment, driver inattention, other driver attributes,
    # relevant road conditions, etc.
    flags = _read_statewide_csv('FLAG', year, data_dir)
    flag_cols = ['AGGRESSIVE_DRIVING', 'ALCOHOL_RELATED', 'ANGLE_CRASH',
                 'CELL_PHONE', 'COMM_VEHICLE',
                 'CRN', 'CROSS_MEDIAN', 'CURVED_ROAD', 'CURVE_DVR_ERROR',
                 'DISTRACTED', 'DRINKING_DRIVER',
                 'DRUGGED_DRIVER', 'DRUG_RELATED', 'FATIGUE_ASLEEP',
                 'HO_OPPDIR_SDSWP', 'ICY_ROAD',
                 'ILLUMINATION_DARK', 'IMPAIRED_DRIVER', 'INTERSECTION',
                 'LANE_DEPARTURE',
                 'NHTSA_AGG_DRIVING', 'NO_CLEARANCE',
                 'NON_INTERSECTION', 'REAR_END', 'RUNNING_RED_LT',
                 'RUNNING_STOP_SIGN',
                 'RURAL', 'SNOW_SLUSH_ROAD', 'SPEEDING', 'SPEEDING_RELATED',
                 'SUDDEN_DEER', 'TAILGATING', 'URBAN', 'WET_ROAD', 'WORK_ZONE',
                 'MATURE_DRIVER', 'YOUNG_DRIVER']
    crashes = crashes.merge(flags[flag_cols], how='left', on='CRN')
    del flags

    return bicycles, persons, crashes, roadway