import pandas as pd
# Traffic-stop records; first look at size, missingness, and dtypes.
ri = pd.read_csv('police.csv')
ri.head()
ri.shape
ri.isnull().sum()
ri.dtypes
ri.is_arrested.unique()
# RI = Rhode Island
ri.state.unique()
# 'state' appears to be a single constant value (inspected above) and
# county_name carries no useful information -- drop both columns in place.
ri.drop(['state', 'county_name'], axis='columns', inplace=True)
# Rows without driver_gender are unusable for the gender comparisons below.
ri.dropna(subset=['driver_gender'], inplace=True)
ri.isnull().sum()
# NOTE(review): astype('bool') silently maps any remaining NaN to True --
# assumes is_arrested has no nulls after the dropna above; verify with the
# isnull() check printed just before this line.
ri['is_arrested'] = ri.is_arrested.astype('bool')
# Fuse the separate stop_date and stop_time strings into one datetime
# column, then promote it to the index so the data can be grouped and
# resampled by time later in the script.
ri['stop_datetime'] = pd.to_datetime(ri.stop_date.str.cat(ri.stop_time, sep=' '))
ri.set_index('stop_datetime', inplace=True)
ri.index
# Overall distribution of violation types, as proportions.
ri.violation.value_counts(normalize=True)
# Split the stops by driver gender and compare the violation mixes.
women = ri.loc[ri.driver_gender == 'F']
men = ri.loc[ri.driver_gender == 'M']
women.violation.value_counts(normalize=True)
men.violation.value_counts(normalize=True)
# Restrict to speeding stops only and compare outcomes by gender.
women_speeding = ri.loc[(ri.driver_gender == 'F') & (ri.violation == 'Speeding')]
men_speeding = ri.loc[(ri.driver_gender == 'M') & (ri.violation == 'Speeding')]
women_speeding.stop_outcome.value_counts(normalize=True)
men_speeding.stop_outcome.value_counts(normalize=True)
# How often is a search conducted -- overall, by gender, and by
# (violation, gender) pair?
ri['search_conducted'].value_counts(normalize=True)
ri.groupby('driver_gender')['search_conducted'].mean()
ri.groupby(['violation', 'driver_gender'])['search_conducted'].mean()
ri['search_type'].value_counts()
# Include:
#Probable Cause,Protective Frisk; Incident to Arrest,Protective Frisk; Incident to Arrest,Inventory,Protective Frisk ...
# A stop counts as a frisk when 'Protective Frisk' appears anywhere in
# search_type; stops with no search (NaN) are treated as False.
ri['frisk'] = ri['search_type'].str.contains('Protective Frisk', na=False)
ri['frisk'].sum()
# Among stops where a search actually happened, compare frisk rates by gender.
searched_stops = ri.loc[ri['search_conducted'] == True]
searched_stops.groupby('driver_gender')['frisk'].mean()
import seaborn as sns
sns.set()
#Calculating the hourly arrest rate
# Group by hour of day (0-23) taken from the DatetimeIndex; the mean of the
# boolean is_arrested column is the fraction of stops ending in arrest.
hourly_arrest_rate = ri.groupby(ri.index.hour).is_arrested.mean()
import matplotlib.pyplot as plt
hourly_arrest_rate.plot()
plt.xlabel('Hour')
plt.ylabel('Arrest Rate')
plt.title('Arrest Rate by Time of Day')
plt.show()
# Annual (year-end) rate of drug-related stops.
# NOTE(review): the 'A' resample alias is deprecated in pandas >= 2.2 in
# favour of 'YE' -- update when the project's pandas version allows.
annual_drug_rate = ri.drugs_related_stop.resample('A').mean()
annual_drug_rate.plot()
plt.xlabel('Year')
plt.ylabel('Drug-related stops Rate')
plt.title('Drug-related stops Rate by Year')
plt.show()
# Compare the drug-stop trend with the annual search rate, side by side
# as stacked subplots sharing the time axis.
annual_search_rate = ri.search_conducted.resample('A').mean()
annual = pd.concat([annual_drug_rate, annual_search_rate], axis='columns')
annual.plot(subplots=True)
plt.xlabel('Year')
plt.show()
# Tallying violations by district
# Cross-tabulate stop counts per (district, violation) pair, then keep only
# the K zones; note the label-based .loc slice is inclusive of 'Zone K3'.
all_zones = pd.crosstab(ri.district, ri.violation)
k_zones = all_zones.loc['Zone K1':'Zone K3']
k_zones.plot(kind='bar', stacked=True)
plt.ylabel('Count of violations')
plt.title("Violations for K's zones")
plt.show()
# Convert the categorical stop_duration labels to representative minute
# values so durations can be averaged numerically.
duration_minutes = {'0-15 Min': 8, '16-30 Min': 23, '30+ Min': 45}
ri['stop_minutes'] = ri['stop_duration'].map(duration_minutes)
# Mean stop length per raw violation description, plotted shortest-first.
avg_minutes = ri.groupby('violation_raw')['stop_minutes'].mean()
avg_minutes.sort_values().plot(kind='barh')
plt.show()
# Load the daily weather records that will later be merged with the stops,
# and take a first look at size and missingness.
weather = pd.read_csv('weather.csv')
weather.head()
weather.shape
weather.isnull().sum()
# Sanity-check the temperature columns (TMIN <= TAVG <= TMAX should hold).
weather[['TAVG', 'TMIN', 'TMAX']].describe()
weather[['TMIN', 'TAVG', 'TMAX']].plot(kind='box')
plt.show()
# Daily temperature range.
weather['TDIFF'] = weather['TMAX'] - weather['TMIN']
weather.TDIFF.plot(kind='hist', bins=20)
# Fix: this was the only chart in the script not followed by plt.show();
# without it, the figure never renders in script execution and the next
# histogram is drawn onto the same axes.
plt.show()
# Each of the WT01..WT22 indicator columns flags one adverse weather event
# for the day; bad_conditions counts how many such events were recorded.
weather['bad_conditions'] = weather.loc[:, 'WT01':'WT22'].sum(axis='columns')
# Fill any missing totals with zero and store as plain integers.
weather['bad_conditions'] = weather['bad_conditions'].fillna(0).astype(int)
weather['bad_conditions'].plot(kind='hist')
plt.show()
# Distribution of the adverse-event counts, in ascending order.
weather['bad_conditions'].value_counts().sort_index()
# Bucket the counts into a coarse three-level weather rating.
condition_rating = {0: 'good',
                    1: 'bad', 2: 'bad', 3: 'bad', 4: 'bad',
                    5: 'worse', 6: 'worse', 7: 'worse', 8: 'worse', 9: 'worse'}
weather['rating'] = weather['bad_conditions'].map(condition_rating)
weather['rating'].value_counts()
# Make rating an ordered categorical (good < bad < worse) so sorting and
# comparisons respect severity instead of alphabetical order.
from pandas.api.types import CategoricalDtype
rating_dtype = CategoricalDtype(categories=['good', 'bad', 'worse'], ordered=True)
weather['rating'] = weather['rating'].astype(rating_dtype)
# Move stop_datetime back to a column so the stops can be joined to the
# weather table on the date string, then restore the datetime index.
ri.reset_index(inplace=True)
daily_rating = weather[['DATE', 'rating']]
ri_weather = ri.merge(daily_rating, left_on='stop_date', right_on='DATE', how='left')
ri_weather.set_index('stop_datetime', inplace=True)
# Arrest rate under each weather rating, overall and per violation type.
ri_weather.groupby('rating').is_arrested.mean()
ri_weather.pivot_table(index='violation', columns='rating', values='is_arrested')