import pandas as pd
df = pd.read_csv('/work/Geographic Factors/crimedata_csv_all_years.csv')
df.sample(5)
df.info()
# 797 566 observations
# some null
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 797566 entries, 0 to 797565
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 TYPE 797566 non-null object
1 YEAR 797566 non-null int64
2 MONTH 797566 non-null int64
3 DAY 797566 non-null int64
4 HOUR 797566 non-null int64
5 MINUTE 797566 non-null int64
6 HUNDRED_BLOCK 797554 non-null object
7 NEIGHBOURHOOD 797442 non-null object
8 X 797493 non-null float64
9 Y 797493 non-null float64
dtypes: float64(2), int64(5), object(3)
memory usage: 60.8+ MB
N = len(df)
(df.isnull().sum()) / N *100
# small percentage of null on 4 variables
#df.describe()
categorical = df.select_dtypes('object').columns
print('% of unique values:')
for cat in categorical:
unique = df[cat].nunique()
unique = round(unique/N*100,2)
print(cat, unique)
# no significant cardinality in categoricals
% of unique values:
TYPE 0.0
HUNDRED_BLOCK 2.88
NEIGHBOURHOOD 0.0
df.head()
type = df.TYPE.value_counts()
type_perc = round((type/N)*100 ,2)
type_perc
# 1/4 of crime are vehicle thefts,
# another 1/4 are other forms of thefts
# major personal injuries such are offense against a person and
# homicide comprise <10%
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
type_perc.plot(kind='bar', rot = 90)
plt.xlabel('Crime type')
plt.ylabel('%')
plt.title('Percentage distribution of Types of Crime');
neigh = df.NEIGHBOURHOOD.value_counts()
neigh_perc = round((neigh/N)*100 ,2)
neigh_perc
## with minimal null, not yet filled
# 1/4 of crimes occur in the CBD
neigh_perc.plot(kind='bar', rot = 90)
plt.xlabel('Neighbourhood')
plt.ylabel('%')
plt.title('Percentage distribution of Neighbourhood crime');
type_neigh = df.groupby('TYPE')['NEIGHBOURHOOD'].value_counts()
(((type_neigh.sort_values(ascending=False))/N)*100).head()
# the leading crimes are thefts and mischief that occur in the CBD,
# accounting for almost 20%
# https://geodash.vpd.ca/opendata/
# Other Theft: Theft of property that includes personal items (purse, wallet, cellphone, laptop, etc.), bicycle, etc.
year = df.YEAR.value_counts()
year_perc = ((year/N)*100).sort_index(ascending=False)
year_perc
year_perc.plot(kind='bar', rot = 70)
plt.xlabel('Year')
plt.ylabel('%')
plt.title('Annual percentage distribution of crime (2003-2021');
# decreasing percentage from 2003 - 2012
# increasing trend 2013-2019
# decreased in 2020 --> ? pandemic effect
# 2021 ongoing
year_type = df.groupby('YEAR')['TYPE'].value_counts()
(((year_type.sort_values(ascending=False))/N)*100).head()
veh = df[df.TYPE == 'Theft from Vehicle']
veh.groupby('YEAR')['NEIGHBOURHOOD'].value_counts().sort_values(ascending=False).head(10)
yr_2021 = df[df.YEAR==2021]
cbd = df[df.NEIGHBOURHOOD == 'Central Business District']
yr_2021_cbd = yr_2021[yr_2021.NEIGHBOURHOOD == 'Central Business District']
yr_2021_cbd_type = yr_2021_cbd.groupby('TYPE')
yr_2021_cbd_type_ct = yr_2021_cbd_type.count()
yr_2021_cbd_type_ct['YEAR'].sort_values(ascending=False)
(yr_2021_cbd_type_ct['YEAR'].sort_values(ascending=False)).plot(kind='bar')
plt.xlabel('Crime type')
plt.ylabel('Frequency of crime')
plt.title('Frequency of Crime types in the Central Business District in 2021');
df2 = pd.read_csv('/work/Amenity_Rooms_and_Assets.csv')
df2.sample(5)
## would have helped to have neighbourhood name info, to correlate with statcan
df2['neighbourhood'] = df2['Postal Code']
df2.neighbourhood.replace({'V5R 5C6': 'Renfrew-Collingwood',
'V5Z 1B3' : 'Fairview',
'V6G 1K1' : 'Stanley Park',
'V6K 1L9': 'Kitsilano',
'V5T 1J9': 'Mount Pleasant',
'V6B 5X8': 'Central Business District',
'V5K 4X8': 'Hastings-Sunrise',
'V6K 3M8': 'Kitsilano',
'V5R 2K6': 'Renfrew-Collingwood',
'V5R 5V6': 'Renfrew-Collingwood',
'V5R 3N3': 'Rendrew-Collingwood',
'V5N 5X1': 'Grandview-Woodland',
'V6J 1P8': 'Kitsilano',
'V5N 2A1': 'Grandview-Woodland',
'V6E 1R2': 'West End',
'V6E 1S2': 'West End',
'V5N 1Y6': 'Kensington-Cedar Cottage',
'V5R 4S2': 'Renfrew-Collingwood',
'V6H 2S2': 'Fairview',
'V6P 4W9': 'Marpole',
'V5N 2B6': 'Kensington-Cedar Cottage',
'V5P 1K5': 'Victoria-Fraserview',
'V5N 1P3': 'Grandview-Woodland',
'V6M 3Y1': 'Kerrisdale'
}, inplace=True)
df2.neighbourhood.unique()
## need to relate postal codes with crime type neighbourhood/x/y
## alternative: derive neighbourhoods from postal code
### not promising, too much overlap
#### alternative: area code https://www.zip-codes.com/canadian/postal-code.asp?postalcode=v6g+1k1
# postal code info https://postalcodeinfo.ca/postal-code/v6g-1k1/
# lacking info filled with google maps search
df.NEIGHBOURHOOD.unique()
df.head()
df2.sample(5)
# with neighbourhood
yr_2021.head()
# df crime
yr_2021_neigh = yr_2021.groupby('NEIGHBOURHOOD')
yr_2021_neigh = yr_2021_neigh['TYPE']
yr_2021_neigh_ct = yr_2021_neigh.count()
yr_2021_neigh_ct
# df2 amenities
df2 = df2.rename(columns= {'neighbourhood': 'NEIGHBOURHOOD'})
df2.head()
df3 = df2.merge(yr_2021_neigh_ct, on='NEIGHBOURHOOD')
df3.head(3)
df3 = df3.rename(columns = {'TYPE': '2021_crime_freq'})
df3.head(2)
df3['2021_crime_freq'].sum()
df3['2021_crime_rate'] = (df3['2021_crime_freq'] / df3['2021_crime_freq'].sum()) *100
df3['2021_crime_rate'] = round(df3['2021_crime_rate'], 2)
df3.head()
bldg_neigh_crime = df3[['Building Names', '2021_crime_rate']]
bldg_neigh_crime.set_index('Building Names', inplace=True)
bldg_neigh_crime.head()
plt.figure(figsize=(10,5))
bldg_neigh_crime.plot(kind='bar');
plt.ylabel('% crime rate')
plt.title('Crime rates per Building, 2021');
# high crime rates in Bridgeview Place neighbourhood
# lowest in Gordon Fahrni neighbourhood