import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pymc3 as pm
import arviz as az
import matplotlib.lines as mlines
import warnings
warnings.filterwarnings('ignore')
from collections import OrderedDict
import theano
import theano.tensor as tt
import itertools
from IPython.core.pylabtools import figsize
pd.set_option('display.max_columns', 30)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
df = pd.read_csv('banking.csv')
df.head()
sns.stripplot(x="y", y="age", data=df, jitter=True)
plt.show();
sns.stripplot(x="y", y="euribor3m", data=df, jitter=True)
plt.show();
df['education'].value_counts()
df['job'].value_counts()
df.poutcome.unique()
def replace_education(education):
"""
This function codes the highest education attained.
"""
if education == 'university.degree':
return 8
elif education == 'professional.course':
return 7
elif education == 'high.school':
return 6
elif education == 'basic.9y':
return 5
elif education == 'basic.6y':
return 4
elif education =='basic.4y':
return 3
elif education == 'unknown':
return 2
elif education == 'illiterate':
return 1
def replace_job(job):
"""
This function codes the highest job earned.
"""
if job == 'management':
return 12
elif job == 'admin.':
return 11
elif job == 'entrepreneur':
return 10
elif job == 'technician':
return 9
elif job == 'services':
return 8
elif job == 'self-employed':
return 7
elif job == 'blue-collar':
return 6
elif job == 'retired':
return 5
elif job == 'housemaid':
return 4
elif job == 'unemployed':
return 3
elif job == 'unknown':
return 2
elif job == 'student':
return 1
def replace_marital(marital):
if marital == 'married':
return 4
elif marital == 'single':
return 3
elif marital == 'divorced':
return 2
elif marital == 'unknown':
return 1
def replace_default(default):
if default == 'no':
return 0
elif default == 'yes':
return 1
elif default == 'unknown':
return 2
def replace_housing(housing):
if housing == 'no':
return 0
elif housing == 'yes':
return 1
elif housing == 'unknown':
return 2
def replace_loan(loan):
if loan == 'no':
return 0
elif loan == 'yes':
return 1
elif loan == 'unknown':
return 2
def replace_contact(contact):
if contact == 'cellular':
return 1
elif contact == 'telephone':
return 2
def replace_poutcome(poutcome):
if poutcome == 'failure':
return 0
elif poutcome == 'success':
return 1
elif poutcome == 'nonexistent':
return 2
df['education'] = df['education'].apply(lambda x: replace_education(x))
df['job'] = df['job'].apply(lambda x: replace_job(x))
df['marital'] = df['marital'].apply(lambda x: replace_marital(x))
df['default'] = df['default'].apply(lambda x: replace_default(x))
df['housing'] = df['housing'].apply(lambda x: replace_housing(x))
df['loan'] = df['loan'].apply(lambda x: replace_loan(x))
df['contact'] = df['contact'].apply(lambda x: replace_contact(x))
df['poutcome'] = df['poutcome'].apply(lambda x: replace_poutcome(x))
look_up = {'aug': 8, 'nov': 11, 'jun': 6, 'apr': 4, 'jul': 7,
'may': 5, 'oct': 10, 'mar': 3, 'sep': 9, 'dec': 12}
df['month'] = df['month'].apply(lambda x: look_up[x])
look_up = {'thu': 4, 'fri': 5, 'tue': 2, 'mon': 1, 'wed': 3}
df['day_of_week'] = df['day_of_week'].apply(lambda x: look_up[x])
df.head(3)
sns.stripplot(x="y", y="education", data=df, jitter=True)
plt.show();
sns.stripplot(x="y", y="job", data=df, jitter=True)
plt.show();
df.head(3)
outcome = df['y']
data = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'euribor3m']]
data['outcome'] = outcome
data.corr()['outcome'].sort_values(ascending=False)
y_simple = data['outcome']
x_n = 'duration'
x_0 = data[x_n].values
x_c = x_0 - x_0.mean()
with pm.Model() as model_simple:
α = pm.Normal('α', mu=0, sd=10)
β = pm.Normal('β', mu=0, sd=10)
μ = α + pm.math.dot(x_c, β)
θ = pm.Deterministic('θ', pm.math.sigmoid(μ))
bd = pm.Deterministic('bd', -α/β)
y_1 = pm.Bernoulli('y_1', p=θ, observed=y_simple)
trace_simple = pm.sample(1000, tune=1000)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [β, α]
Sampling 2 chains: 100%|██████████| 4000/4000 [00:40<00:00, 66.44draws/s]
theta = trace_simple['θ'].mean(axis=0)
idx = np.argsort(x_c)
plt.plot(x_c[idx], theta[idx], color='C2', lw=3)
plt.vlines(trace_simple['bd'].mean(), 0, 1, color='k')
bd_hpd = az.hpd(trace_simple['bd'])
plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5)
plt.scatter(x_c, np.random.normal(y_simple, 0.02),
marker='.', color=[f'C{x}' for x in y_simple])
az.plot_hpd(x_c, trace_simple['θ'], color='C2')
plt.xlabel(x_n)
plt.ylabel('θ', rotation=0)
locs, _ = plt.xticks()
plt.xticks(locs, np.round(locs + x_0.mean(), 1));
az.summary(trace_simple, var_names=['α', 'β'])
ppc = pm.sample_ppc(trace_simple, model=model_simple, samples=500)
preds = np.rint(ppc['y_1'].mean(axis=0)).astype('int')
100%|██████████| 500/500 [00:02<00:00, 240.47it/s]
print('Accuracy of the simplest model:', accuracy_score(preds, data['outcome']))
print('f1 score of the simplest model:', f1_score(preds, data['outcome']))
Accuracy of the simplest model: 0.8933670000971157
f1 score of the simplest model: 0.2591093117408907
outcome.value_counts()
plt.figure(figsize=(15, 15))
corr = data.corr()
mask = np.tri(*corr.shape).T
sns.heatmap(corr.abs(), mask=mask, annot=True, cmap='viridis');
# Scale age by 10, it helps with model convergence.
data['age'] = data['age'] / 10
data['age2'] = np.square(data['age'])
with pm.Model() as logistic_model:
pm.glm.GLM.from_formula('outcome ~ age + age2 + job + marital + education + default + housing + loan + contact + month + day_of_week + duration + campaign + pdays + previous + euribor3m', data, family = pm.glm.families.Binomial())
trace = pm.sample(1000, tune = 1000, init = 'adapt_diag')
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [euribor3m, previous, pdays, campaign, duration, day_of_week, month, contact, loan, housing, default, education, marital, job, age2, age, Intercept]
Sampling 2 chains: 100%|██████████| 4000/4000 [26:25<00:00, 3.53draws/s]
There were 28 divergences after tuning. Increase `target_accept` or reparameterize.
There were 30 divergences after tuning. Increase `target_accept` or reparameterize.
az.plot_trace(trace);
def lm_full(trace, age, education, marital):
shape = np.broadcast(age, education, marital).shape
x_norm = np.asarray([np.broadcast_to(x, shape) for x in [age/10., education, marital]])
return 1 / (1 + np.exp(-(trace['Intercept'] +
trace['age']*x_norm[0] +
trace['age2']*(x_norm[0]**2) +
trace['education']*x_norm[1] +
trace['marital']*x_norm[2])))
lm = lambda x, samples: lm_full(samples, x, 1., 4.)
lm2 = lambda x, samples: lm_full(samples, x, 5., 4.)
lm3 = lambda x, samples: lm_full(samples, x, 8., 4.)
pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm, samples=100, color='blue', alpha=.8)
pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm2, samples=100, color='green', alpha=.8)
pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm3, samples=100, color='red', alpha=.8)
blue_line = mlines.Line2D(['lm'], [], color='b', label='illiterate')
green_line = mlines.Line2D(['lm2'], [], color='g', label='basic.9y')
red_line = mlines.Line2D(['lm3'], [], color='r', label='university.degree')
plt.legend(handles=[blue_line, green_line, red_line], loc='lower right')
plt.ylabel("P(Outcome = Yes)")
plt.xlabel("Age")
plt.show();
b = trace['education']
plt.hist(np.exp(b), bins=20, normed=True)
plt.xlabel("Odds Ratio")
plt.show();
lb, ub = np.percentile(b, 2.5), np.percentile(b, 97.5)
print("P(%.3f < Odds Ratio < %.3f) = 0.95" % (np.exp(lb), np.exp(ub)))
P(1.058 < Odds Ratio < 1.109) = 0.95
stat_df = pm.summary(trace)
stat_df['odds_ratio'] = np.exp(stat_df['mean'])
stat_df['percentage_effect'] = 100 * (stat_df['odds_ratio'] - 1)
stat_df
az.plot_forest(trace);
pm.plot_posterior(trace);
def run_models(df, upper_order=5):
'''
Convenience function:
Fit a range of pymc3 models of increasing polynomial complexity.
Suggest limit to max order 5 since calculation time is exponential.
'''
models, traces = OrderedDict(), OrderedDict()
for k in range(1,upper_order+1):
nm = 'k{}'.format(k)
fml = create_poly_modelspec(k)
with pm.Model() as models[nm]:
print('\nRunning: {}'.format(nm))
pm.glm.GLM.from_formula(fml, df,
family=pm.glm.families.Binomial())
traces[nm] = pm.sample(1000, tune=1000, init='adapt_diag')
return models, traces
def create_poly_modelspec(k=1):
'''
Convenience function:
Create a polynomial modelspec string for patsy
'''
return ('outcome ~ age + job + marital + education + default + housing + loan + contact + month + day_of_week + duration + campaign + pdays + previous + euribor3m ' + ' '.join(['+ np.power(age,{})'.format(j)
for j in range(2,k+1)])).strip()
models_lin, traces_lin = run_models(data, 3)
Running: k1
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [euribor3m, previous, pdays, campaign, duration, day_of_week, month, contact, loan, housing, default, education, marital, job, age, Intercept]
Sampling 2 chains: 100%|██████████| 4000/4000 [15:43<00:00, 4.93draws/s]
There were 25 divergences after tuning. Increase `target_accept` or reparameterize.
There were 16 divergences after tuning. Increase `target_accept` or reparameterize.
Running: k2
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [np.power(age, 2), euribor3m, previous, pdays, campaign, duration, day_of_week, month, contact, loan, housing, default, education, marital, job, age, Intercept]
Sampling 2 chains: 100%|██████████| 4000/4000 [25:02<00:00, 3.76draws/s]
There were 46 divergences after tuning. Increase `target_accept` or reparameterize.
There were 42 divergences after tuning. Increase `target_accept` or reparameterize.
Running: k3
Auto-assigning NUTS sampler...
Initializing NUTS using adapt_diag...
Multiprocess sampling (2 chains in 2 jobs)
NUTS: [np.power(age, 3), np.power(age, 2), euribor3m, previous, pdays, campaign, duration, day_of_week, month, contact, loan, housing, default, education, marital, job, age, Intercept]
Sampling 2 chains: 100%|██████████| 4000/4000 [1:32:48<00:00, 1.39s/draws]
There were 26 divergences after tuning. Increase `target_accept` or reparameterize.
There were 39 divergences after tuning. Increase `target_accept` or reparameterize.
model_trace_dict = dict()
for nm in ['k1', 'k2', 'k3']:
models_lin[nm].name = nm
model_trace_dict.update({models_lin[nm]: traces_lin[nm]})
dfwaic = pm.compare(model_trace_dict, ic='WAIC')
print(dfwaic)
pm.compareplot(dfwaic);
WAIC pWAIC dWAIC weight SE dSE var_warn
k3 18273 20.28 0 0.91 227.6 0 0
k2 18302.9 19.45 29.83 0.04 227.66 12.12 0
k1 18448.7 17.87 175.66 0.05 228.27 28.4 0
dfwaic
ppc = pm.sample_ppc(trace, model=logistic_model, samples=500)
100%|██████████| 500/500 [00:02<00:00, 191.77it/s]
ppc['y'].mean(axis=0)
preds = np.rint(ppc['y'].mean(axis=0)).astype('int')
def plot_confusion_matrix(cm, classes = ['Not subscribe', 'Subscribe'],
title='Subscription Confusion matrix',
cmap=plt.cm.Reds):
# Display the matrix in text form
print('Confusion matrix')
print(cm)
figsize(8, 8)
# Show the matrix using the imshow functionality
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, size = 20)
# Tick marks show classes
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, size = 12)
plt.yticks(tick_marks, classes, rotation = 90, size = 12)
# Formatting for text labels on plot
fmt1 = 's'
fmt2 = 'd'
thresh = cm.max() / 2.
# Four types of classifications
types = [['True Negative', 'False Positive'],
['False Negative', 'True Positive']]
# Add the actual numbers and the types onto the heatmap plot
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i - 0.05, format(types[i][j], fmt1),
horizontalalignment="center", size = 18,
color="white" if cm[i, j] > thresh else "black")
plt.text(j, i + 0.15, format(cm[i, j], fmt2),
horizontalalignment="center", size = 24,
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label', size = 16)
plt.xlabel('Predicted Label', size = 16)
cm = confusion_matrix(data['outcome'], preds)
plot_confusion_matrix(cm)
Confusion matrix
[[35648 900]
[ 2867 1773]]
print('Accuracy of the full model: ', accuracy_score(preds, data['outcome']))
print('f1 score of the full model: ', f1_score(preds, data['outcome']))
Accuracy of the full model: 0.908541322715354
f1 score of the full model: 0.4848899220566115