import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pymc3 as pm
import arviz as az
import matplotlib.lines as mlines
import warnings
warnings.filterwarnings('ignore')
from collections import OrderedDict
import theano
import theano.tensor as tt
import itertools
from IPython.core.pylabtools import figsize
pd.set_option('display.max_columns', 30)
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
df = pd.read_csv('banking.csv')
df.head()
sns.stripplot(x="y", y="age", data=df, jitter=True)
plt.show();
sns.stripplot(x="y", y="euribor3m", data=df, jitter=True)
plt.show();
df['education'].value_counts()
df['job'].value_counts()
df.poutcome.unique()
def replace_education(education):
"""
This function codes the highest education attained.
"""
if education == 'university.degree':
return 8
elif education == 'professional.course':
return 7
elif education == 'high.school':
return 6
elif education == 'basic.9y':
return 5
elif education == 'basic.6y':
return 4
elif education =='basic.4y':
return 3
elif education == 'unknown':
return 2
elif education == 'illiterate':
return 1
def replace_job(job):
"""
This function codes the highest job earned.
"""
if job == 'management':
return 12
elif job == 'admin.':
return 11
elif job == 'entrepreneur':
return 10
elif job == 'technician':
return 9
elif job == 'services':
return 8
elif job == 'self-employed':
return 7
elif job == 'blue-collar':
return 6
elif job == 'retired':
return 5
elif job == 'housemaid':
return 4
elif job == 'unemployed':
return 3
elif job == 'unknown':
return 2
elif job == 'student':
return 1
def replace_marital(marital):
if marital == 'married':
return 4
elif marital == 'single':
return 3
elif marital == 'divorced':
return 2
elif marital == 'unknown':
return 1
def replace_default(default):
if default == 'no':
return 0
elif default == 'yes':
return 1
elif default == 'unknown':
return 2
def replace_housing(housing):
if housing == 'no':
return 0
elif housing == 'yes':
return 1
elif housing == 'unknown':
return 2
def replace_loan(loan):
if loan == 'no':
return 0
elif loan == 'yes':
return 1
elif loan == 'unknown':
return 2
def replace_contact(contact):
if contact == 'cellular':
return 1
elif contact == 'telephone':
return 2
def replace_poutcome(poutcome):
if poutcome == 'failure':
return 0
elif poutcome == 'success':
return 1
elif poutcome == 'nonexistent':
return 2
df['education'] = df['education'].apply(lambda x: replace_education(x))
df['job'] = df['job'].apply(lambda x: replace_job(x))
df['marital'] = df['marital'].apply(lambda x: replace_marital(x))
df['default'] = df['default'].apply(lambda x: replace_default(x))
df['housing'] = df['housing'].apply(lambda x: replace_housing(x))
df['loan'] = df['loan'].apply(lambda x: replace_loan(x))
df['contact'] = df['contact'].apply(lambda x: replace_contact(x))
df['poutcome'] = df['poutcome'].apply(lambda x: replace_poutcome(x))
look_up = {'aug': 8, 'nov': 11, 'jun': 6, 'apr': 4, 'jul': 7,
'may': 5, 'oct': 10, 'mar': 3, 'sep': 9, 'dec': 12}
df['month'] = df['month'].apply(lambda x: look_up[x])
look_up = {'thu': 4, 'fri': 5, 'tue': 2, 'mon': 1, 'wed': 3}
df['day_of_week'] = df['day_of_week'].apply(lambda x: look_up[x])
df.head(3)
sns.stripplot(x="y", y="education", data=df, jitter=True)
plt.show();
sns.stripplot(x="y", y="job", data=df, jitter=True)
plt.show();
df.head(3)
outcome = df['y']
data = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'euribor3m']]
data['outcome'] = outcome
data.corr()['outcome'].sort_values(ascending=False)
y_simple = data['outcome']
x_n = 'duration'
x_0 = data[x_n].values
x_c = x_0 - x_0.mean()
with pm.Model() as model_simple:
α = pm.Normal('α', mu=0, sd=10)
β = pm.Normal('β', mu=0, sd=10)
μ = α + pm.math.dot(x_c, β)
θ = pm.Deterministic('θ', pm.math.sigmoid(μ))
bd = pm.Deterministic('bd', -α/β)
y_1 = pm.Bernoulli('y_1', p=θ, observed=y_simple)
trace_simple = pm.sample(1000, tune=1000)
theta = trace_simple['θ'].mean(axis=0)
idx = np.argsort(x_c)
plt.plot(x_c[idx], theta[idx], color='C2', lw=3)
plt.vlines(trace_simple['bd'].mean(), 0, 1, color='k')
bd_hpd = az.hpd(trace_simple['bd'])
plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5)
plt.scatter(x_c, np.random.normal(y_simple, 0.02),
marker='.', color=[f'C{x}' for x in y_simple])
az.plot_hpd(x_c, trace_simple['θ'], color='C2')
plt.xlabel(x_n)
plt.ylabel('θ', rotation=0)
locs, _ = plt.xticks()
plt.xticks(locs, np.round(locs + x_0.mean(), 1));
az.summary(trace_simple, var_names=['α', 'β'])
ppc = pm.sample_ppc(trace_simple, model=model_simple, samples=500)
preds = np.rint(ppc['y_1'].mean(axis=0)).astype('int')
print('Accuracy of the simplest model:', accuracy_score(preds, data['outcome']))
print('f1 score of the simplest model:', f1_score(preds, data['outcome']))
outcome.value_counts()
plt.figure(figsize=(15, 15))
corr = data.corr()
mask = np.tri(*corr.shape).T
sns.heatmap(corr.abs(), mask=mask, annot=True, cmap='viridis');
# Scale age by 10, it helps with model convergence.
data['age'] = data['age'] / 10
data['age2'] = np.square(data['age'])
with pm.Model() as logistic_model:
pm.glm.GLM.from_formula('outcome ~ age + age2 + job + marital + education + default + housing + loan + contact + month + day_of_week + duration + campaign + pdays + previous + euribor3m', data, family = pm.glm.families.Binomial())
trace = pm.sample(1000, tune = 1000, init = 'adapt_diag')
az.plot_trace(trace);
def lm_full(trace, age, education, marital):
shape = np.broadcast(age, education, marital).shape
x_norm = np.asarray([np.broadcast_to(x, shape) for x in [age/10., education, marital]])
return 1 / (1 + np.exp(-(trace['Intercept'] +
trace['age']*x_norm[0] +
trace['age2']*(x_norm[0]**2) +
trace['education']*x_norm[1] +
trace['marital']*x_norm[2])))
lm = lambda x, samples: lm_full(samples, x, 1., 4.)
lm2 = lambda x, samples: lm_full(samples, x, 5., 4.)
lm3 = lambda x, samples: lm_full(samples, x, 8., 4.)
pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm, samples=100, color='blue', alpha=.8)
pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm2, samples=100, color='green', alpha=.8)
pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm3, samples=100, color='red', alpha=.8)
blue_line = mlines.Line2D(['lm'], [], color='b', label='illiterate')
green_line = mlines.Line2D(['lm2'], [], color='g', label='basic.9y')
red_line = mlines.Line2D(['lm3'], [], color='r', label='university.degree')
plt.legend(handles=[blue_line, green_line, red_line], loc='lower right')
plt.ylabel("P(Outcome = Yes)")
plt.xlabel("Age")
plt.show();
b = trace['education']
plt.hist(np.exp(b), bins=20, normed=True)
plt.xlabel("Odds Ratio")
plt.show();
lb, ub = np.percentile(b, 2.5), np.percentile(b, 97.5)
print("P(%.3f < Odds Ratio < %.3f) = 0.95" % (np.exp(lb), np.exp(ub)))
stat_df = pm.summary(trace)
stat_df['odds_ratio'] = np.exp(stat_df['mean'])
stat_df['percentage_effect'] = 100 * (stat_df['odds_ratio'] - 1)
stat_df
az.plot_forest(trace);
pm.plot_posterior(trace);
def run_models(df, upper_order=5):
'''
Convenience function:
Fit a range of pymc3 models of increasing polynomial complexity.
Suggest limit to max order 5 since calculation time is exponential.
'''
models, traces = OrderedDict(), OrderedDict()
for k in range(1,upper_order+1):
nm = 'k{}'.format(k)
fml = create_poly_modelspec(k)
with pm.Model() as models[nm]:
print('\nRunning: {}'.format(nm))
pm.glm.GLM.from_formula(fml, df,
family=pm.glm.families.Binomial())
traces[nm] = pm.sample(1000, tune=1000, init='adapt_diag')
return models, traces
def create_poly_modelspec(k=1):
'''
Convenience function:
Create a polynomial modelspec string for patsy
'''
return ('outcome ~ age + job + marital + education + default + housing + loan + contact + month + day_of_week + duration + campaign + pdays + previous + euribor3m ' + ' '.join(['+ np.power(age,{})'.format(j)
for j in range(2,k+1)])).strip()
models_lin, traces_lin = run_models(data, 3)
model_trace_dict = dict()
for nm in ['k1', 'k2', 'k3']:
models_lin[nm].name = nm
model_trace_dict.update({models_lin[nm]: traces_lin[nm]})
dfwaic = pm.compare(model_trace_dict, ic='WAIC')
print(dfwaic)
pm.compareplot(dfwaic);
dfwaic
ppc = pm.sample_ppc(trace, model=logistic_model, samples=500)
ppc['y'].mean(axis=0)
preds = np.rint(ppc['y'].mean(axis=0)).astype('int')
def plot_confusion_matrix(cm, classes = ['Not subscribe', 'Subscribe'],
title='Subscription Confusion matrix',
cmap=plt.cm.Reds):
# Display the matrix in text form
print('Confusion matrix')
print(cm)
figsize(8, 8)
# Show the matrix using the imshow functionality
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title, size = 20)
# Tick marks show classes
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45, size = 12)
plt.yticks(tick_marks, classes, rotation = 90, size = 12)
# Formatting for text labels on plot
fmt1 = 's'
fmt2 = 'd'
thresh = cm.max() / 2.
# Four types of classifications
types = [['True Negative', 'False Positive'],
['False Negative', 'True Positive']]
# Add the actual numbers and the types onto the heatmap plot
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i - 0.05, format(types[i][j], fmt1),
horizontalalignment="center", size = 18,
color="white" if cm[i, j] > thresh else "black")
plt.text(j, i + 0.15, format(cm[i, j], fmt2),
horizontalalignment="center", size = 24,
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label', size = 16)
plt.xlabel('Predicted Label', size = 16)
cm = confusion_matrix(data['outcome'], preds)
plot_confusion_matrix(cm)
print('Accuracy of the full model: ', accuracy_score(preds, data['outcome']))
print('f1 score of the full model: ', f1_score(preds, data['outcome']))