By Mary Nwangwu, Gail Batutis, Mallory Sico, and Annabelle Huether
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time
import folium
from folium import plugins
from folium.plugins import HeatMap
import geopandas as gpd
from sklearn.cluster import DBSCAN
import seaborn as sns
import datetime as dt
all_data = pd.read_csv(r"clean_all_with_zips_buroughs_final_version.csv")
all_data = all_data[(all_data.iloc[:,5] > 0)] #dropped a couple hundred rows with zero lat/long values
all_data.shape
all_data
#Create folium map centered in NYC
m = folium.Map(location=[40.75,-74.125])
#Ask for input for the map design and lowercase it
val = input('Enter one of the following to see a map of the deaths: \n all persons \n pedestrians \n cyclists \n motorists \n')
val = val.lower()
#Convert the text choice to the matching column number
if val == 'all persons':
    val = 9
elif val == 'pedestrians':
    val = 11
elif val == 'cyclists':
    val = 13
elif val == 'motorists':
    val = 15
else:
    print("Unacceptable entry, please try again.")
#Create the deaths dataframe based on the column number above
death = all_data[all_data.iloc[:,val] > 0]
death
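#A name-based alternative (a sketch, not used above, assuming the standard NYC Open Data
#collision column headers); selecting by label avoids the hard-coded column indices:
death_cols = {
    'all persons': 'NUMBER OF PERSONS KILLED',
    'pedestrians': 'NUMBER OF PEDESTRIANS KILLED',
    'cyclists': 'NUMBER OF CYCLIST KILLED',
    'motorists': 'NUMBER OF MOTORIST KILLED',
}
def deaths_by_type(df, kind):
    #Filter to rows where the chosen fatality count is positive
    return df[df[death_cols[kind.lower()]] > 0]
#e.g. deaths_by_type(all_data, 'cyclists')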
#iteratively create markers in folium map, with lat/long and contributing factor vehicle as popup
for i in range(len(death['ZIP CODE'])):
    folium.Marker(
        location=[death['LATITUDE'].iloc[i], death['LONGITUDE'].iloc[i]],
        popup=str(death['CONTRIBUTING FACTOR VEHICLE 1'].iloc[i]),
        icon=folium.Icon(color="red", icon="info-sign"),
    ).add_to(m)
m
#create data for heat map - all injuries (injury column above 0)
heat_data = all_data[all_data.iloc[:,8] > 0]
heat_data = [[row['LATITUDE'],row['LONGITUDE']] for index, row in heat_data.iterrows()]
#create heat map, centered on NYC
heat_map = folium.Map(location=[40.75,-74.125])
HeatMap(heat_data, min_opacity=0.2,blur = 15, radius = 15).add_to(heat_map)
heat_map
#Group all the data by zip code, summing the numerical columns and keeping only the ones we need
all_sum_zip = all_data.groupby(['ZIP CODE']).sum()
all_sum_zip = all_sum_zip.iloc[:,0:11]
#Make the index into a column called ZIPCODE, cast it to a string, and trim the trailing ".0"
all_sum_zip['ZIPCODE'] = all_sum_zip.index.astype(str)
all_sum_zip['ZIPCODE'] = [x[:-2] for x in all_sum_zip['ZIPCODE']]
all_sum_zip
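#Equivalent and slightly safer (a sketch): casting through int avoids relying on the
#float index always printing with a trailing ".0".
all_sum_zip['ZIPCODE'] = all_sum_zip.index.astype(int).astype(str)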
#read in the geojson file
df_places = gpd.read_file(r"zip_code_040114.geojson")
# https://data.beta.nyc/en/dataset/nyc-zip-code-tabulation-areas/resource/894e9162-871c-4552-a09c-c6915d8783fb
df_places
#merge on the geojson file, on the column ZIPCODE in each
df_merged = pd.merge(df_places,all_sum_zip, how = "left", on='ZIPCODE')
df_merged
#plot the geojson file with a color map based on values in the Persons Injured column
f, ax = plt.subplots(1, figsize=(15, 15))
ax = df_merged.plot(column='NUMBER OF PERSONS INJURED', cmap='Reds' ,ax=ax)
plt.show()
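#The same choropleth with a colorbar (a sketch; legend=True is a documented
#GeoDataFrame.plot parameter):
f2, ax2 = plt.subplots(1, figsize=(15, 15))
df_merged.plot(column='NUMBER OF PERSONS INJURED', cmap='Reds', legend=True, ax=ax2)
plt.show()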
#Ask for a zip code input for the DBSCAN calculation, which conditions the resulting dataset
zipcode = input("Enter zip code: ")
prep = (all_data[(all_data['ZIP CODE']==float(zipcode))])
points = (prep.iloc[:,5:7])
#DBSCAN with radius 0.0001 degrees (~36 ft) and a minimum of 5 collisions per cluster
clustering = DBSCAN(eps=0.0001, min_samples=5).fit(points)
#output of DBSCAN
clustering.labels_
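#The eps above is in raw degrees, so distances are slightly anisotropic (a degree of
#longitude is shorter than a degree of latitude at NYC's latitude). A metric-true
#sketch using scikit-learn's haversine distance (inputs in radians; eps in radians
#= meters / Earth radius):
earth_radius_m = 6371000
eps_m = 11  #roughly 36 ft, matching eps=0.0001 degrees of latitude above
coords_rad = np.radians(points.to_numpy())
clustering_m = DBSCAN(eps=eps_m / earth_radius_m, min_samples=5, metric='haversine').fit(coords_rad)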
#Create a boolean mask marking which points are core samples, for graphing
core_samples_mask = np.zeros_like(clustering.labels_, dtype=bool)
core_samples_mask[clustering.core_sample_indices_] = True
labels = clustering.labels_
xy = points
#count number of clusters and number of noise points
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
n_noise_ = list(labels).count(-1)
#print number of clusters and number of noise points
print("Estimated number of clusters: %d" % n_clusters_)
print("Estimated number of noise points: %d" % n_noise_)
# Black is excluded from the colormap and used for noise instead.
unique_labels = set(labels)
colors = [plt.cm.Set3(each) for each in np.linspace(0, 1, len(unique_labels))]
#Iterate through clusters and noise, plotting each with its own color
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]
    class_member_mask = labels == k
    #Plot non-core (border and noise) points with small markers
    xy = points[class_member_mask & ~core_samples_mask]
    plt.plot(
        xy.iloc[:, 0],
        xy.iloc[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=3,
    )
    #Plot core cluster points with large markers
    ab = points[class_member_mask & core_samples_mask]
    plt.plot(
        ab.iloc[:, 0],
        ab.iloc[:, 1],
        "o",
        markerfacecolor=tuple(col),
        markeredgecolor="k",
        markersize=14,
    )
#name plot and show it
plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.show()
#Get core cluster points only (noise points are never core samples), and center the folium map on the first one
cd = points[core_samples_mask]
n = folium.Map(location=[cd.iloc[0,0], cd.iloc[0,1]], max_zoom=18, min_zoom=14.5)
#create and iteratively place cluster markers on folium map
for i in range(len(cd)):
    folium.Marker(
        location=[cd.iloc[i, 0], cd.iloc[i, 1]],
        #Font Awesome icons need prefix="fa" in folium
        icon=folium.Icon(color="red", icon="exclamation-circle", prefix="fa"),
    ).add_to(n)
n
#Pull in all bike data from cleaned csv file
bike_df = pd.read_csv('clean_bike_final.csv')
bike_df
#Check names of all columns of bike dataframe
print(bike_df.columns)
#remove all columns that are old indices from previous iterations of the dataframes
del bike_df['Unnamed: 0.2'], bike_df['Unnamed: 0.1'], bike_df['Unnamed: 0']
bike_df
#Bike data by borough
bikes_bur = bike_df.groupby('BOROUGH').sum()
bikes_bur['TOTAL NUMBER OF CRASHES'] = bike_df['BOROUGH'].value_counts(sort=True)
#print(bikes_bur)
#Bike data by zip code
bikes_zip = bike_df.groupby('ZIP CODE').sum().iloc[:, 2:6]
bikes_zip['TOTAL NUMBER OF CRASHES'] = bike_df['ZIP CODE'].value_counts()
bikes_zip
#Group by Borough and Zip Code
bikes_zips_and_bur = bike_df.groupby(by= ['BOROUGH', 'ZIP CODE'], group_keys=False)
bikes_zips_and_bur
bikes_zips_and_bur.sum()
#Add the total number of commuters to the dataset
#Website with data on bike commuters by borough:
#https://www.nyc.gov/html/dot/html/bicyclists/cyclinginthecity.shtml
total_Number_of_Commuters = {'BRONX': 2699, 'BROOKLYN': 23030,'MANHATTAN': 20859, \
'QUEENS': 8257, 'STATEN ISLAND': 122}
bikes_bur['TOTAL NUMBER OF COMMUTERS'] = [2699, 23030, 20859, 8257, 122]
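#Index-aligned alternative (a sketch): letting pandas match boroughs by name
#avoids depending on the alphabetical order of the hard-coded list above.
bikes_bur['TOTAL NUMBER OF COMMUTERS'] = pd.Series(total_Number_of_Commuters)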
#Create columns for injuries and fatalities normalized by the number of commuters (a per-commuter rate)
bikes_bur['PERCENT CYCLIST INJURED'] = bikes_bur['NUMBER OF CYCLIST INJURED'] / bikes_bur['TOTAL NUMBER OF COMMUTERS']
bikes_bur['PERCENT CYCLIST KILLED'] = bikes_bur['NUMBER OF CYCLIST KILLED'] / bikes_bur['TOTAL NUMBER OF COMMUTERS']
#Visualization of bike data normalized by the number of commuters in each borough
#Note: this assumes that all injured people are distinct
#Note: Staten Island is likely not accurate because there is not as much data there
#Note: borough is not a great metric for determining the safest location
fig= plt.figure(figsize= [4,4])
ax = fig.add_subplot(1, 1, 1)
ax2 = ax.twinx()
width = 0.2
bikes_bur['PERCENT CYCLIST INJURED'].plot.bar(color='paleturquoise', ax=ax, width=width, position=1)
bikes_bur['PERCENT CYCLIST KILLED'].plot(kind='bar', color='darkcyan', ax=ax2, width=width, position=0)
ax.set(title = 'Percentage of Cyclists Injured and Killed \nPer Number of Commuters', ylabel = 'PERCENT CYCLIST INJURED', yticks = np.arange(0,0.9,0.1))
ax2.set(ylabel = 'PERCENT CYCLIST KILLED', yticks = np.arange(0,0.002,0.0002))
ax.legend(loc=2, fontsize = 6.5)
ax2.legend(loc=1, fontsize = 6.5)
#Create plots of the total number of persons and cyclists injured and killed by borough
fig, axes = plt.subplots(nrows = 1, ncols = 2)
bikes_bur.iloc[:, [3, 5, 8]].plot.bar(ax=axes[0], figsize=(11,5), color = ['lightsalmon', 'indianred','darkred'])
bikes_bur.iloc[:, [4, 6]].plot.bar(ax=axes[1], figsize=(10,5), color = ['plum', 'purple'])
axes[0].legend(loc='upper right', fontsize=7)
axes[0].set(title = 'NUMBER OF PERSONS AND CYCLISTS INJURED \nBY BOROUGH', xlabel='BOROUGH')
axes[1].legend(loc='upper right', fontsize=7)
axes[1].set(title = 'NUMBER OF PERSONS AND CYCLISTS KILLED \nBY BOROUGH', xlabel='BOROUGH')
bikes_bur
#Loop of t-tests of injuries by borough
#Brooklyn has a statistically higher mean than all other boroughs
#Assumption: all boroughs have the same variance over zip codes
#Alternative hypothesis: the first borough has a higher mean number of injuries across its zip codes
bur_names = bike_df['BOROUGH'].unique()
for i in range(len(bur_names)):
    for j in range(len(bur_names)):
        if j != i:
            t_stat, p_value = stats.ttest_ind(
                bike_df.groupby('BOROUGH').get_group(bur_names[i]).groupby('ZIP CODE').sum()['NUMBER OF PERSONS INJURED'],
                bike_df.groupby('BOROUGH').get_group(bur_names[j]).groupby('ZIP CODE').sum()['NUMBER OF PERSONS INJURED'],
                alternative='greater')
            if p_value < 0.05:
                print('p-Value: {:.2e}. {} has greater mean than {}'.format(p_value, bur_names[i], bur_names[j]))
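#With 5 boroughs there are 20 ordered pairs, so some p-values can pass 0.05 by
#chance alone. A Bonferroni-corrected threshold (a sketch, not applied above):
alpha_corrected = 0.05 / (len(bur_names) * (len(bur_names) - 1))
print('Bonferroni-corrected threshold: {:.2e}'.format(alpha_corrected))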
#Create bar plots of the number of people injured in each borough, separated by zip code
def plot_borough(x, borough_name, color):
    fig, ax = plt.subplots()
    ax = x.groupby('BOROUGH').get_group(borough_name).groupby('ZIP CODE').sum()['NUMBER OF PERSONS INJURED'].plot.bar(color=color)
    ax.set_title('{} NUMBER OF PEOPLE INJURED'.format(borough_name))
    fig.tight_layout()
color_array = ['mediumorchid', 'blue', 'lightcoral', 'mistyrose', 'bisque']
for bur in bike_df['BOROUGH'].unique():
    plot_borough(bike_df, bur, color_array[np.where(bike_df['BOROUGH'].unique()==bur)[0][0]])
#Make line plots of the data after sorting each borough by zip code in descending order
def bur_df_def(x, borough_name):
    bur_df = x.groupby('BOROUGH').get_group(borough_name).groupby('ZIP CODE').sum()['NUMBER OF PERSONS INJURED'].reset_index()
    bur_df = bur_df.sort_values(['NUMBER OF PERSONS INJURED'], ascending=False)
    bur_df = bur_df.reset_index()
    return np.array(bur_df.index), bur_df['NUMBER OF PERSONS INJURED']
color_array = ['mediumorchid', 'blue', 'lightcoral', 'mistyrose', 'bisque']
for bur in bike_df['BOROUGH'].unique():
    x, y = bur_df_def(bike_df, bur)
    plt.plot(x, y, label=bur, color=color_array[np.where(bike_df['BOROUGH'].unique()==bur)[0][0]])
    plt.fill_between(x, y, alpha=0.5, color=color_array[np.where(bike_df['BOROUGH'].unique()==bur)[0][0]])
plt.legend()
#Scatter Plot of all bike collision locations:
plt.scatter(bike_df['LATITUDE'], bike_df['LONGITUDE'])
plt.xlabel('LATITUDE')
plt.ylabel('LONGITUDE')
plt.title('Location of All Collisions in New York City in 2021')
#List of [latitude, longitude] pairs for all collisions
location = [[bike_df['LATITUDE'][i], bike_df['LONGITUDE'][i]] for i in range(len(bike_df))]
bike_df
#Map visualizations of bike injuries for Staten Island
#List of [latitude, longitude] pairs for all Staten Island collisions
stat_isl = bike_df.groupby('BOROUGH').get_group('STATEN ISLAND')
location = [[stat_isl['LATITUDE'].iloc[i], stat_isl['LONGITUDE'].iloc[i]] for i in range(len(stat_isl))]
#Start the heat maps with Staten Island, which has the least data and renders fastest
m = folium.Map(location=[40.5795,-74.1502], zoom_start=11.75)
#Plot it on the map
HeatMap(location, radius=15).add_to(m)
m
#Heat map of all data points
#List of [latitude, longitude] pairs for all collisions
location = [[bike_df['LATITUDE'][i], bike_df['LONGITUDE'][i]] for i in range(len(bike_df))]
#Heat map centered on lower Manhattan
m = folium.Map(location=[40.7077,-74.0083], zoom_start=11)
#Plot it on the map
HeatMap(location, radius = 15, blur = 0).add_to(m)
m
#Import clean bike data
df_bikes_vs_ebikes = pd.read_csv('clean_bike_final.csv')
#Extract the injury and fatality columns plus vehicle type codes 1-5 (filtered to bikes and e-bikes below)
data = df_bikes_vs_ebikes[['NUMBER OF PERSONS INJURED','NUMBER OF PERSONS KILLED','NUMBER OF CYCLIST INJURED','NUMBER OF CYCLIST KILLED','VEHICLE TYPE CODE 1','VEHICLE TYPE CODE 2','VEHICLE TYPE CODE 3','VEHICLE TYPE CODE 4','VEHICLE TYPE CODE 5']]
#Remove null values
data = data[~data['NUMBER OF PERSONS INJURED'].isnull()]
#Separate bikes and e-bikes (e-bike labels appear under several spellings)
vehicle_cols = ['VEHICLE TYPE CODE 1','VEHICLE TYPE CODE 2','VEHICLE TYPE CODE 3','VEHICLE TYPE CODE 4','VEHICLE TYPE CODE 5']
bikes = data.loc[data[vehicle_cols].isin(['Bike']).any(axis=1)]
ebikes = data.loc[data[vehicle_cols].isin(['E-Bike','E BIKE','E-BIKE','E-bike']).any(axis=1)]
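#The comparisons below use Welch's t-test via stats.ttest_ind(..., equal_var=False).
#For reference, a minimal sketch of the statistic it computes:
def welch_t(a, b):
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    #t = (mean_a - mean_b) / sqrt(s_a^2/n_a + s_b^2/n_b), with sample variances (ddof=1)
    return (a.mean() - b.mean()) / np.sqrt(a.var(ddof=1)/len(a) + b.var(ddof=1)/len(b))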
#Persons Injured and Cyclists Injured
#Calculate Welch t-test p-values, plot histograms, plot bar graphs
columns_injured = ['NUMBER OF PERSONS INJURED','NUMBER OF CYCLIST INJURED']
for col in columns_injured:
    #Extract column for bikes and for e-bikes
    bikes_extract = bikes[col]
    ebikes_extract = ebikes[col]
    #Get values from dataframe and cast as numeric
    column_bike = pd.to_numeric(bikes_extract, errors='coerce').values
    column_ebike = pd.to_numeric(ebikes_extract, errors='coerce').values
    #Welch's t-test (cannot assume equal variance due to unequal group sizes)
    p = stats.ttest_ind(column_bike, column_ebike, equal_var=False)
    print(str(col)+" Welch's t-test p-value: "+str(p[1]))
    #Plot histogram and bar graph
    plt.hist(column_bike, bins=[0.0,1.0,2.0,3.0,4.0,5.0,6.0], label="Bike", alpha=0.5, density=True, color='lightsalmon')
    plt.hist(column_ebike, bins=[0.0,1.0,2.0,3.0,4.0,5.0,6.0], label="E-Bike", alpha=0.5, density=True, color='darkred')
    plt.xlabel(col)
    plt.ylabel("DENSITY")
    plt.legend()
    plt.show()
    plt.bar(["Bike","E-Bike"], [column_bike.sum(), column_ebike.sum()], color='indianred')
    plt.xlabel(col)
    plt.ylabel("COUNT")
    plt.show()
#Persons Killed and Cyclists Killed
#Calculate Welch t-test p-values, plot histograms, plot bar graphs
columns_killed = ['NUMBER OF PERSONS KILLED','NUMBER OF CYCLIST KILLED']
for col in columns_killed:
    #Extract column for bikes and for e-bikes
    bikes_extract = bikes[col]
    ebikes_extract = ebikes[col]
    #Get values from dataframe and cast as numeric
    column_bike = pd.to_numeric(bikes_extract, errors='coerce').values
    column_ebike = pd.to_numeric(ebikes_extract, errors='coerce').values
    #Welch's t-test (cannot assume equal variance due to unequal group sizes)
    p = stats.ttest_ind(column_bike, column_ebike, equal_var=False)
    print(str(col)+" Welch's t-test p-value: "+str(p[1]))
    #Plot histogram and bar graph
    plt.hist(column_bike, bins=[0.0,1.0,2.0,3.0,4.0,5.0,6.0], label="Bike", alpha=0.5, density=True, color='plum')
    plt.hist(column_ebike, bins=[0.0,1.0,2.0,3.0,4.0,5.0,6.0], label="E-Bike", alpha=0.5, density=True, color='purple')
    plt.xlabel(col)
    plt.ylabel("DENSITY")
    plt.legend()
    plt.show()
    plt.bar(["Bike","E-Bike"], [column_bike.sum(), column_ebike.sum()], color='plum')
    plt.xlabel(col)
    plt.ylabel("COUNT")
    plt.show()
#Need to install imbalanced-learn in order to run the next chunk:
#conda install -c conda-forge imbalanced-learn
#Bootstrapping with randomized under-/over-sampling to balance the bike and e-bike data
#When we under- and over-sample, how often are bike injuries/deaths significantly different from e-bike injuries/deaths?
#What percentage of trials show a significant difference?
#Loop through columns (persons injured, persons killed, cyclists injured, cyclists killed)
columns = ['NUMBER OF PERSONS INJURED','NUMBER OF PERSONS KILLED','NUMBER OF CYCLIST INJURED','NUMBER OF CYCLIST KILLED']
for col in columns:
    #Initialize statistical-significance counters
    undersample_count = 0
    oversample_count = 0
    #Run 1000 trials
    for i in range(1000):
        #Create a new dataframe for bike data which includes a column classifying the data as "Bike"
        df_bike_data = pd.DataFrame(bikes[col], columns=[col])
        df_bike_data['Class'] = 'Bike'
        #Create a new dataframe for e-bike data which includes a column classifying the data as "E-Bike"
        df_ebike_data = pd.DataFrame(ebikes[col], columns=[col])
        df_ebike_data['Class'] = 'E-Bike'
        #Concatenate the two dataframes
        df_combined = pd.concat([df_bike_data, df_ebike_data])
        #Cast as numpy arrays and reshape in order to use RandomUnder/OverSampler
        X = np.array(df_combined[col]).reshape(-1,1)
        Y = np.array(df_combined['Class']).reshape(-1,1)
        #Undersample (shrink the larger class randomly):
        #takes the majority class (Bike) and cuts it down to the size of the minority class (E-Bike) by random selection.
        #RandomUnderSampler takes the argument random_state;
        #if None, the random number generator is the RandomState instance used by np.random
        undersample = RandomUnderSampler(sampling_strategy='not minority')
        #Get the new balanced samples (X_res_u is data, Y_res_u is class)
        X_res_u, Y_res_u = undersample.fit_resample(X, Y)
        #Create a new dataframe for the undersampled data
        undersampled_df = pd.DataFrame(X_res_u, columns=[col])
        #Add a column for class
        undersampled_df['Class'] = Y_res_u
        #Split the dataframe into Bikes and E-Bikes and cast the values as arrays for testing
        test_bike_under = undersampled_df[undersampled_df["Class"]=="Bike"][col]
        test_ebike_under = undersampled_df[undersampled_df["Class"]=="E-Bike"][col]
        test_column_bike_under = pd.to_numeric(test_bike_under, errors='coerce').values
        test_column_ebike_under = pd.to_numeric(test_ebike_under, errors='coerce').values
        #Perform an independent t-test between bikes and e-bikes (the groups now have equal sizes, so equal variance is assumed)
        p_val_under = stats.ttest_ind(test_column_bike_under, test_column_ebike_under)
        #If the result is statistically significant, add to the count
        if p_val_under[1] < 0.05:
            undersample_count += 1
        #Oversample (expand the smaller class randomly):
        #takes the minority class (E-Bike) and expands it to the size of the majority class (Bike) by random selection.
        #RandomOverSampler takes the argument random_state;
        #if None, the random number generator is the RandomState instance used by np.random
        oversample = RandomOverSampler(sampling_strategy='not majority')
        #Get the new balanced samples (X_res_o is data, Y_res_o is class)
        X_res_o, Y_res_o = oversample.fit_resample(X, Y)
        #Create a new dataframe for the oversampled data
        oversampled_df = pd.DataFrame(X_res_o, columns=[col])
        #Add a column for class
        oversampled_df['Class'] = Y_res_o
        #Split the dataframe into Bikes and E-Bikes and cast the values as arrays for testing
        test_bike_over = oversampled_df[oversampled_df["Class"]=="Bike"][col]
        test_ebike_over = oversampled_df[oversampled_df["Class"]=="E-Bike"][col]
        test_column_bike_over = pd.to_numeric(test_bike_over, errors='coerce').values
        test_column_ebike_over = pd.to_numeric(test_ebike_over, errors='coerce').values
        #Perform an independent t-test between bikes and e-bikes (the groups now have equal sizes, so equal variance is assumed)
        p_val_over = stats.ttest_ind(test_column_bike_over, test_column_ebike_over)
        #If the result is statistically significant, add to the count
        if p_val_over[1] < 0.05:
            oversample_count += 1
    print(str(col)+":")
    print("Undersample count: "+str(undersample_count))
    print("Undersample significance ratio: "+str(undersample_count/1000))
    print("Oversample count: "+str(oversample_count))
    print("Oversample significance ratio: "+str(oversample_count/1000))
print("100% of the time there is a significant difference between cyclist injuries for bikes and e-bikes involved in motor vehicle collisions.")
print("Bikes tend to be significantly more 'dangerous' - categorized as having more cyclist injuries per crash - than e-bikes.")
print("Most of the time there is a significant difference between total people killed for bikes versus e-bikes.")
print("E-Bikes tend to be more 'dangerous' - categorized as more total deaths occurring when one is involved in a crash - than bikes.")
#Load data
time_df = pd.read_csv('clean_all_with_zips_buroughs_final_version.csv')
#Create a crash datetime column
crash_datetime = time_df['CRASH DATE'] + ' ' + time_df['CRASH TIME']
#Insert crash datetime into first column position
time_df.insert(0,'CRASH DATETIME',crash_datetime)
#Drop columns not being used for time series analysis
time_df = time_df.drop(time_df.columns[[1,2,3,4,5,6,7,8,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27]], axis=1, inplace=False)
#Convert crash datetime to pandas datetime object
time_df['CRASH DATETIME'] = pd.to_datetime(time_df['CRASH DATETIME'])
#Check for null values
# print(time_df.isnull().sum())
#Impute 0 for null values
time_df = time_df.fillna(0)
# display(time_df[time_df['NUMBER OF PERSONS INJURED'].isnull()])
#Convert counts to type int
time_df['NUMBER OF PERSONS INJURED'] = time_df['NUMBER OF PERSONS INJURED'].astype('int64')
time_df['NUMBER OF PERSONS KILLED'] = time_df['NUMBER OF PERSONS KILLED'].astype('int64')
# display(time_df)
# Convert time_df datetime to index
cdt_df = time_df.set_index('CRASH DATETIME')
cdt_df['NUMBER OF ACCIDENTS'] = 1
# display(cdt_df)
# Time of Day (Every Hour)
sum_hour = cdt_df.groupby(cdt_df.index.hour).sum()
# display(sum_hour)
# Day of the Week
dow = ['Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun']
sum_dow = cdt_df.groupby(cdt_df.index.weekday).sum()
sum_dow.index = dow
# display(sum_dow)
# Month
month = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
sum_month = cdt_df.groupby(cdt_df.index.month).sum()
sum_month.index = month
# display(sum_month)
# Season
sum_fall = pd.DataFrame(sum_month.iloc[8:11,:].sum()).transpose()
sum_fall.index = ['Fall']
sum_winter = pd.DataFrame(sum_month.iloc[[11,0,1],:].sum()).transpose()
sum_winter.index = ['Winter']
sum_spring = pd.DataFrame(sum_month.iloc[2:5,:].sum()).transpose()
sum_spring.index = ['Spring']
sum_summer = pd.DataFrame(sum_month.iloc[5:8,:].sum()).transpose()
sum_summer.index = ['Summer']
sum_frames = [sum_fall,sum_winter,sum_spring,sum_summer]
sum_season = pd.concat(sum_frames)
# display(sum_season)
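#An equivalent, more compact route (a sketch): map month numbers straight to
#seasons and group once.
season_map = {12:'Winter', 1:'Winter', 2:'Winter', 3:'Spring', 4:'Spring', 5:'Spring',
              6:'Summer', 7:'Summer', 8:'Summer', 9:'Fall', 10:'Fall', 11:'Fall'}
sum_season_alt = cdt_df.groupby(cdt_df.index.month.map(season_map)).sum()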
# Time of Day (Every Hour) - Visualization
ticks = np.arange(0, 24, step=1)
fig = plt.figure(figsize=(10,8), constrained_layout=True)
plt.subplot(3,1,1)
sum_hour['NUMBER OF ACCIDENTS'].plot(linewidth=2.5, color='mediumseagreen', xticks=ticks, xlabel='', ylabel='Count', title='Number of Accidents (Hourly)', fontsize=13);
plt.subplot(3,1,2)
sum_hour['NUMBER OF PERSONS INJURED'].plot(linewidth=2.5, color='cyan', xticks=ticks, xlabel='', ylabel='Count', title='Number of Persons Injured (Hourly)', fontsize=13);
plt.subplot(3,1,3)
sum_hour['NUMBER OF PERSONS KILLED'].plot(linewidth=2.5, color='palevioletred', xticks=ticks, xlabel='', ylabel='Count', title='Number of Persons Killed (Hourly)', fontsize=13);
# Day of the Week - Visualization
fig = plt.figure(figsize=(10,8), constrained_layout=True)
plt.subplot(3,1,1)
sum_dow['NUMBER OF ACCIDENTS'].plot(linewidth=2.5, color='mediumseagreen', ylabel='Count', title='Number of Accidents (Day of Week)', fontsize=13);
plt.subplot(3,1,2)
sum_dow['NUMBER OF PERSONS INJURED'].plot(linewidth=2.5, color='cyan', ylabel='Count', title='Number of Persons Injured (Day of Week)', fontsize=13);
plt.subplot(3,1,3)
sum_dow['NUMBER OF PERSONS KILLED'].plot(linewidth=2.5, color='palevioletred', ylabel='Count', title='Number of Persons Killed (Day of Week)', fontsize=13);
# Month - Visualization
fig = plt.figure(figsize=(10,8), constrained_layout=True)
plt.subplot(3,1,1)
sum_month['NUMBER OF ACCIDENTS'].plot(linewidth=2.5, color='mediumseagreen', ylabel='Count', title='Number of Accidents (Monthly)', fontsize=13);
plt.subplot(3,1,2)
sum_month['NUMBER OF PERSONS INJURED'].plot(linewidth=2.5,color='cyan', ylabel='Count', title='Number of Persons Injured (Monthly)', fontsize=13);
plt.subplot(3,1,3)
sum_month['NUMBER OF PERSONS KILLED'].plot(linewidth=2.5, color='palevioletred', ylabel='Count', title='Number of Persons Killed (Monthly)', fontsize=13);
# Season - Visualization
fig = plt.figure(figsize=(10,8), constrained_layout=True)
plt.subplot(3,1,1)
sum_season['NUMBER OF ACCIDENTS'].plot(linewidth=2.5, color='mediumseagreen', ylabel='Count', title='Number of Accidents (Season)', fontsize=13);
plt.subplot(3,1,2)
sum_season['NUMBER OF PERSONS INJURED'].plot(linewidth=2.5,color='cyan', ylabel='Count', title='Number of Persons Injured (Season)', fontsize=13);
plt.subplot(3,1,3)
sum_season['NUMBER OF PERSONS KILLED'].plot(linewidth=2.5, color='palevioletred', ylabel='Count', title='Number of Persons Killed (Season)', fontsize=13);
# Holidays - Dataframe and Visualization
day = time_df.groupby(pd.Grouper(key='CRASH DATETIME', freq='1D'))
#Number of occurrences per day
occur_day = day.size().reset_index()
occur_day.rename({'CRASH DATETIME':'CRASH DATE', 0: 'NUMBER OF ACCIDENTS'}, axis=1, inplace=True)
#Persons Injured/Killed per day
persons_day = day.sum().reset_index()
persons_day.rename({'CRASH DATETIME':'CRASH DATE'}, axis=1, inplace=True)
#Merge dataframes
crash_day = pd.merge(occur_day, persons_day)
#display(crash_day)
#New Year's Day
NYD = (crash_day[crash_day['CRASH DATE']=='2021-01-01']).reset_index(drop=True)
NYD = NYD.replace({'2021-01-01':"New Year's Day"})
#Valentine's Day
VD = (crash_day[crash_day['CRASH DATE']=='2021-02-14']).reset_index(drop=True)
VD = VD.replace({'2021-02-14':"Valentine's Day"})
#Memorial Day
MD = (crash_day[crash_day['CRASH DATE']=='2021-05-31']).reset_index(drop=True)
MD = MD.replace({'2021-05-31':"Memorial Day"})
#Independence Day
ID = (crash_day[crash_day['CRASH DATE']=='2021-07-04']).reset_index(drop=True)
ID = ID.replace({'2021-07-04':"Independence Day"})
#Labor Day
LD = (crash_day[crash_day['CRASH DATE']=='2021-09-06']).reset_index(drop=True)
LD = LD.replace({'2021-09-06':"Labor Day"})
#Thanksgiving Day
TD = (crash_day[crash_day['CRASH DATE']=='2021-11-25']).reset_index(drop=True)
TD = TD.replace({'2021-11-25':"Thanksgiving Day"})
#Black Friday
BF = (crash_day[crash_day['CRASH DATE']=='2021-11-26']).reset_index(drop=True)
BF = BF.replace({'2021-11-26':"Black Friday"})
#Christmas Eve
CE = (crash_day[crash_day['CRASH DATE']=='2021-12-24']).reset_index(drop=True)
CE = CE.replace({'2021-12-24':"Christmas Eve"})
#Christmas Day
CD = (crash_day[crash_day['CRASH DATE']=='2021-12-25']).reset_index(drop=True)
CD = CD.replace({'2021-12-25':"Christmas Day"})
#New Year's Eve
NYE = (crash_day[crash_day['CRASH DATE']=='2021-12-31']).reset_index(drop=True)
NYE = NYE.replace({'2021-12-31':"New Year's Eve"})
#Create Holiday Dataframe
holidays = [NYD, VD, MD, ID, LD, TD, BF, CE, CD, NYE]
holidays_df = pd.concat(holidays)
holidays_df.rename({'CRASH DATE':'HOLIDAY'}, axis=1, inplace=True)
holidays_df = holidays_df.set_index('HOLIDAY')
#display(holidays_df)
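#The ten near-identical blocks above can be collapsed into one table-driven pass
#(a sketch, equivalent under the same crash_day frame):
holiday_names = {'2021-01-01': "New Year's Day", '2021-02-14': "Valentine's Day",
                 '2021-05-31': "Memorial Day", '2021-07-04': "Independence Day",
                 '2021-09-06': "Labor Day", '2021-11-25': "Thanksgiving Day",
                 '2021-11-26': "Black Friday", '2021-12-24': "Christmas Eve",
                 '2021-12-25': "Christmas Day", '2021-12-31': "New Year's Eve"}
hd = crash_day[crash_day['CRASH DATE'].isin(pd.to_datetime(list(holiday_names)))].copy()
hd['HOLIDAY'] = hd['CRASH DATE'].dt.strftime('%Y-%m-%d').map(holiday_names)
holidays_df_alt = hd.drop(columns='CRASH DATE').set_index('HOLIDAY')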
#Create Holiday dictionaries
holiday_acc_dict = holidays_df['NUMBER OF ACCIDENTS'].to_dict()
names_acc = list(holiday_acc_dict.keys())
values_acc = list(holiday_acc_dict.values())
holiday_injury_dict = holidays_df['NUMBER OF PERSONS INJURED'].to_dict()
names_injury = list(holiday_injury_dict.keys())
values_injury = list(holiday_injury_dict.values())
holiday_death_dict = holidays_df['NUMBER OF PERSONS KILLED'].to_dict()
names_death = list(holiday_death_dict.keys())
values_death = list(holiday_death_dict.values())
#Visualization - Number of Accidents (Holidays)
colors = ['wheat', 'pink', 'lightsteelblue', 'lightcoral', 'beige', 'chocolate', 'silver', 'thistle', 'lightsalmon', 'powderblue']
f, ax = plt.subplots(figsize = (20,5))
sns.set_style("ticks")
ax.bar(names_acc, values_acc, color=colors)
plt.title('Number of Traffic Accidents (Holidays)', fontsize=15)
plt.xticks(fontsize=13)
plt.ylabel(ylabel='Count', fontsize=15)
plt.yticks(fontsize=13);
#Visualization - Number of Persons Injured (Holidays)
f2, ax = plt.subplots(figsize = (20,5))
sns.set_style("ticks")
ax.bar(names_injury, values_injury, color=colors)
plt.title('Number of Persons Injured (Holidays)', fontsize=15)
plt.xticks(fontsize=13)
plt.ylabel(ylabel='Count', fontsize=15)
plt.yticks(fontsize=13);
#Visualization - Number of Persons Killed (Holidays)
f3, ax = plt.subplots(figsize = (20,5))
sns.set_style("ticks")
ax.bar(names_death, values_death, color=colors)
plt.title('Number of Persons Killed (Holidays)', fontsize=15)
plt.xticks(fontsize=13)
plt.ylabel(ylabel='Count', fontsize=15)
plt.yticks(fontsize=13);