!pip install ipywidgets
! pip install graphviz
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import beta, binom
import itertools
from ipywidgets import interact, interactive
import pymc3 as pm
pm.__version__
EDA
# reads in current dataset from Washington Post github
data = pd.read_csv('https://raw.githubusercontent.com/washingtonpost/data-police-shootings/master/fatal-police-shootings-data.csv')
data.head()
counts_by_race = data[['race']].groupby('race').size()
counts_by_race
# O:'Other', N: 'Native American', A: 'Asian', H: 'Hispanic', B: 'Black, non-Hispanic', W: 'White, non-Hispanic'
sns.barplot(counts_by_race.index,
counts_by_race,
order=['O', 'N', 'A', 'H', 'B', 'W'],
palette = 'Blues_d')
plt.xlabel('Race')
plt.ylabel('# Of Police Shootings');
plt.title('Number of Police Shootings by Race');
plt.savefig('1.jpeg')
# Drop null values
armed_race_data = data[['armed', 'race']].dropna()
# Get races used
races = list(armed_race_data['race'].unique())
# group by armed status and race
armed_race_data = armed_race_data.groupby(['race', 'armed']).size().sort_index(ascending=False).reset_index()
# change 'armed' column to 'armed' or 'unarmed'
armed_race_data['armed'] = ['unarmed' if x == 'unarmed' else 'armed' for x in armed_race_data['armed']]
# Rename column
armed_race_data = armed_race_data.rename(columns={0: 'count'})
armed_race_data.head(3)
# O:'Other', N: 'Native American', A: 'Asian', H: 'Hispanic', B: 'Black, non-Hispanic', W: 'White, non-Hispanic'
sns.barplot(x='race',
y='count',
hue = 'armed',
data = armed_race_data,
order=['O', 'N', 'A', 'H', 'B', 'W'],
palette = 'Blues_d')
plt.xlabel('Race')
plt.ylabel('# Of Police Shootings');
plt.title('Number of Police Shootings by \nRace and Armed Status');
plt.savefig('2.jpeg')
deaths_by_month = data[['date']]
deaths_by_month['date'] = [re.findall(r'\d{4}-\d{2}', x)[0] for x in deaths_by_month['date']]
deaths_by_month = deaths_by_month.groupby('date').size().to_frame().reset_index().rename(columns={0: 'deaths'})
deaths_by_month.head(3)
plot = sns.scatterplot(x='date', y='deaths', data=deaths_by_month);
[l.set_visible(False) for (i,l) in enumerate(plot.xaxis.get_ticklabels()) if i % 12 != 0]
plot.set_xlabel('Year-Month')
plot.set_ylabel('# Deaths per month');
plot.set_title('Number of Police Shootings by Month');
plot.figure.savefig('3.jpeg')
def by_quarter(date):
'''
Splits months into quarters (not as crowded as months and more precision than years)
date: a string in the form YYYY-MM-DD
'''
year = re.findall(r'\d{4}', date)[0]
month = int(re.findall(r'\d{4}-(\d{2})', date)[0])
if 1 <= month < 4:
return year + '-' + 'Q1'
if 4 <= month < 7:
return year + '-' + 'Q2'
if 7 <= month < 10:
return year + '-' + 'Q3'
if 10 <= month < 13:
return year + '-' + 'Q4'
deaths_by_month = data[['date', 'race']]
deaths_by_month['date'] = [by_quarter(x) for x in deaths_by_month['date']]
deaths_by_month = deaths_by_month.groupby(['date', 'race']).size().to_frame().reset_index().rename(columns={0: 'deaths'}).reset_index()
deaths_by_month.head(3)
# O:'Other', N: 'Native American', A: 'Asian', H: 'Hispanic', B: 'Black, non-Hispanic', W: 'White, non-Hispanic'
plot = sns.lineplot(x='date', y='deaths', hue='race', data=deaths_by_month);
[l.set_visible(False) for (i,l) in enumerate(plot.xaxis.get_ticklabels()) if i % 12 != 0]
plot.set_xlabel('Year-Month')
plot.set_ylabel('# Deaths per month');
plot.set_title('Number of Police Shootings by Month and Race');
plot.figure.savefig('4.jpeg')
# Data set with only race
race_only = data[['race']].dropna().groupby('race').size().to_frame().rename(columns={0: 'decedents'})
# Data set with race and armed status
# Drop null values
race_armed = data[['armed', 'race']].dropna()
# change 'armed' column to 'armed' or 'unarmed'
race_armed['armed'] = ['unarmed' if x == 'unarmed' else 'armed' for x in race_armed['armed']]
# group by armed status and race
race_armed = race_armed.groupby(['race', 'armed']).size().sort_index(ascending=False).reset_index()
# Rename column
race_armed = race_armed.rename(columns={0: 'count'})
# Data set with race and camera
race_camera = data[['race', 'body_camera']].dropna()
race_camera = race_camera.groupby(['race', 'body_camera']).size().sort_index(ascending=False).reset_index()
race_camera = race_camera.rename(columns = {0: 'count', 'body_camera': 'camera'})
total_shot = sum(race_only['decedents'])
print('Percentage African American: ' + str(race_only.loc[['B']]['decedents'][0] / total_shot))
p_race_only = 1-binom.cdf(1557, 5855, 0.142)
p_race_only
p_armed = 1-binom.cdf(1364, 5284, 0.142)
p_armed
p_unarmed = 1-binom.cdf(137, 410, 0.142)
p_unarmed
p_NoCamera = 1-binom.cdf(1258, 5032, 0.142)
p_NoCamera
p_Camera = 1-binom.cdf(299, 832, 0.142)
p_Camera
p_val_list = np.sort([p_race_only, p_armed, p_unarmed, p_NoCamera, p_Camera])
p_val_list
# rank * alpha / M
values = np.array([i * 0.05 / 5 for i in range(1,6)])
for i in np.arange(4,-1,-1):
if p_val_list[i] < values[i]:
cutoff = p_val_list[i]
cutoff
# drop the undefined races
data = data.dropna(subset = ['race'])
# select features
Xf = data[['armed','age','gender','signs_of_mental_illness', 'threat_level', 'flee', 'body_camera']]
Xf['is_black'] = data['race'] == 'B'
# One hot encoding features
Xf = pd.get_dummies(Xf,dummy_na=True,drop_first=True)
# impute missing features
Xf.loc[Xf['age'].isna(),'age'] = Xf['age'].mean()
def pooled_inference(alpha, beta, study_df):
"""
Creates and fits a PyMC3 model corresponding to the graphical model above
Inputs:
alpha_value, beta_value : floats, parameters of the prior Beta Distribution
study_df : DataFrame containing study data
Outputs: (model, trace)
"""
with pm.Model() as model:
theta = pm.Beta('theta', alpha=alpha, beta=beta)
X = pm.Bernoulli('is_black', p=theta, observed=study_df['is_black'])
trace = pm.sample(1000, tune=1000, target_accept=0.95)
return (model, trace)
# assume the true theta is around beta(1,2) with mean of 0.33> 0.266
mdl, trace = pooled_inference(3, 6, Xf)
# Trace plot
pm.traceplot(trace)
# report mean and credible interval
mean = np.mean(trace['theta'])
low = np.percentile(trace['theta'], 2.5)
high = np.percentile(trace['theta'], 97.5)
print(f"Estimated theta {mean:.3f}")
print(f"95% BCI: [{low:.3f}, {high:.3f}]")
counts_by_race = data.groupby(['race'])['body_camera'].mean()
sns.barplot(counts_by_race.index,
counts_by_race,
order=['O', 'N', 'A', 'H', 'B', 'W'],
palette = 'Blues_d')
plt.xlabel('Race')
plt.ylabel('Rate Body Camera Open');
plt.title('Rate of Body Camera');
def hierarchical_inference(alpha, beta, study_df):
"""
Creates and fits a PyMC3 model corresponding to the graphical model above
Inputs:
alpha_value, beta_value : floats, parameters of the prior Beta Distribution
study_df : DataFrame containing study data
Outputs: (model, trace)
"""
with pm.Model() as model:
body_camera = pm.Data("no_camera", 1-study_df["body_camera"])
theta = pm.Beta('theta', alpha=alpha, beta=beta)
beta = pm.Uniform('beta')
X = pm.Bernoulli('is_black', p=theta * np.exp(-beta*body_camera), observed=study_df['is_black'])
trace = pm.sample(1000, tune=1000, target_accept=0.95)
return (model, trace)
# assume the true thea is around beta(1,2) with mean of 0.33> 0.266
mdl, trace = hierarchical_inference(3, 6, Xf)
pm.model_to_graphviz(mdl)
# Trace plot
pm.traceplot(trace)
# report mean and credible interval
mean = np.mean(trace['theta'])
low = np.percentile(trace['theta'], 2.5)
high = np.percentile(trace['theta'], 97.5)
print(f"Estimated theta {mean:.3f}")
print(f"95% BCI: [{low:.3f}, {high:.3f}]")
# report mean and credible interval
mean = 1- np.exp(-np.mean(trace['beta']))
high = 1- np.exp(- np.percentile(trace['beta'], 2.5))
low =1-np.exp(-np.percentile(trace['beta'], 97.5))
print(f"Estimated probability of under-report if no camera present {mean:.3f}")
print(f"95% BCI: [{low:.3f}, {high:.3f}]")