Bayesian Logistic Regression Bank Marketing

import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import pymc3 as pm import arviz as az import matplotlib.lines as mlines import warnings warnings.filterwarnings('ignore') from collections import OrderedDict import theano import theano.tensor as tt import itertools from IPython.core.pylabtools import figsize pd.set_option('display.max_columns', 30) from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

df = pd.read_csv('banking.csv')

df.head()

sns.stripplot(x="y", y="age", data=df, jitter=True) plt.show();

sns.stripplot(x="y", y="euribor3m", data=df, jitter=True) plt.show();

df['education'].value_counts()

df['job'].value_counts()

df.poutcome.unique()

def replace_education(education): """ This function codes the highest education attained. """ if education == 'university.degree': return 8 elif education == 'professional.course': return 7 elif education == 'high.school': return 6 elif education == 'basic.9y': return 5 elif education == 'basic.6y': return 4 elif education =='basic.4y': return 3 elif education == 'unknown': return 2 elif education == 'illiterate': return 1 def replace_job(job): """ This function codes the highest job earned. """ if job == 'management': return 12 elif job == 'admin.': return 11 elif job == 'entrepreneur': return 10 elif job == 'technician': return 9 elif job == 'services': return 8 elif job == 'self-employed': return 7 elif job == 'blue-collar': return 6 elif job == 'retired': return 5 elif job == 'housemaid': return 4 elif job == 'unemployed': return 3 elif job == 'unknown': return 2 elif job == 'student': return 1 def replace_marital(marital): if marital == 'married': return 4 elif marital == 'single': return 3 elif marital == 'divorced': return 2 elif marital == 'unknown': return 1 def replace_default(default): if default == 'no': return 0 elif default == 'yes': return 1 elif default == 'unknown': return 2 def replace_housing(housing): if housing == 'no': return 0 elif housing == 'yes': return 1 elif housing == 'unknown': return 2 def replace_loan(loan): if loan == 'no': return 0 elif loan == 'yes': return 1 elif loan == 'unknown': return 2 def replace_contact(contact): if contact == 'cellular': return 1 elif contact == 'telephone': return 2 def replace_poutcome(poutcome): if poutcome == 'failure': return 0 elif poutcome == 'success': return 1 elif poutcome == 'nonexistent': return 2 df['education'] = df['education'].apply(lambda x: replace_education(x)) df['job'] = df['job'].apply(lambda x: replace_job(x)) df['marital'] = df['marital'].apply(lambda x: replace_marital(x)) df['default'] = df['default'].apply(lambda x: replace_default(x)) df['housing'] = df['housing'].apply(lambda x: replace_housing(x)) df['loan'] = df['loan'].apply(lambda x: replace_loan(x)) df['contact'] = df['contact'].apply(lambda x: replace_contact(x)) df['poutcome'] = df['poutcome'].apply(lambda x: replace_poutcome(x))

look_up = {'aug': 8, 'nov': 11, 'jun': 6, 'apr': 4, 'jul': 7, 'may': 5, 'oct': 10, 'mar': 3, 'sep': 9, 'dec': 12} df['month'] = df['month'].apply(lambda x: look_up[x])

look_up = {'thu': 4, 'fri': 5, 'tue': 2, 'mon': 1, 'wed': 3} df['day_of_week'] = df['day_of_week'].apply(lambda x: look_up[x])

df.head(3)

sns.stripplot(x="y", y="education", data=df, jitter=True) plt.show();

sns.stripplot(x="y", y="job", data=df, jitter=True) plt.show();

df.head(3)

outcome = df['y'] data = df[['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'euribor3m']] data['outcome'] = outcome data.corr()['outcome'].sort_values(ascending=False)

y_simple = data['outcome'] x_n = 'duration' x_0 = data[x_n].values x_c = x_0 - x_0.mean() with pm.Model() as model_simple: α = pm.Normal('α', mu=0, sd=10) β = pm.Normal('β', mu=0, sd=10) μ = α + pm.math.dot(x_c, β) θ = pm.Deterministic('θ', pm.math.sigmoid(μ)) bd = pm.Deterministic('bd', -α/β) y_1 = pm.Bernoulli('y_1', p=θ, observed=y_simple) trace_simple = pm.sample(1000, tune=1000)

theta = trace_simple['θ'].mean(axis=0) idx = np.argsort(x_c) plt.plot(x_c[idx], theta[idx], color='C2', lw=3) plt.vlines(trace_simple['bd'].mean(), 0, 1, color='k') bd_hpd = az.hpd(trace_simple['bd']) plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5) plt.scatter(x_c, np.random.normal(y_simple, 0.02), marker='.', color=[f'C{x}' for x in y_simple]) az.plot_hpd(x_c, trace_simple['θ'], color='C2') plt.xlabel(x_n) plt.ylabel('θ', rotation=0) locs, _ = plt.xticks() plt.xticks(locs, np.round(locs + x_0.mean(), 1));

az.summary(trace_simple, var_names=['α', 'β'])

ppc = pm.sample_ppc(trace_simple, model=model_simple, samples=500) preds = np.rint(ppc['y_1'].mean(axis=0)).astype('int')

print('Accuracy of the simplest model:', accuracy_score(preds, data['outcome'])) print('f1 score of the simplest model:', f1_score(preds, data['outcome']))

outcome.value_counts()

plt.figure(figsize=(15, 15)) corr = data.corr() mask = np.tri(*corr.shape).T sns.heatmap(corr.abs(), mask=mask, annot=True, cmap='viridis');

# Scale age by 10, it helps with model convergence. data['age'] = data['age'] / 10 data['age2'] = np.square(data['age'])

with pm.Model() as logistic_model: pm.glm.GLM.from_formula('outcome ~ age + age2 + job + marital + education + default + housing + loan + contact + month + day_of_week + duration + campaign + pdays + previous + euribor3m', data, family = pm.glm.families.Binomial()) trace = pm.sample(1000, tune = 1000, init = 'adapt_diag')

az.plot_trace(trace);

def lm_full(trace, age, education, marital): shape = np.broadcast(age, education, marital).shape x_norm = np.asarray([np.broadcast_to(x, shape) for x in [age/10., education, marital]]) return 1 / (1 + np.exp(-(trace['Intercept'] + trace['age']*x_norm[0] + trace['age2']*(x_norm[0]**2) + trace['education']*x_norm[1] + trace['marital']*x_norm[2]))) lm = lambda x, samples: lm_full(samples, x, 1., 4.) lm2 = lambda x, samples: lm_full(samples, x, 5., 4.) lm3 = lambda x, samples: lm_full(samples, x, 8., 4.)

pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm, samples=100, color='blue', alpha=.8) pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm2, samples=100, color='green', alpha=.8) pm.plot_posterior_predictive_glm(trace, eval=np.linspace(25,75,1000), lm=lm3, samples=100, color='red', alpha=.8) blue_line = mlines.Line2D(['lm'], [], color='b', label='illiterate') green_line = mlines.Line2D(['lm2'], [], color='g', label='basic.9y') red_line = mlines.Line2D(['lm3'], [], color='r', label='university.degree') plt.legend(handles=[blue_line, green_line, red_line], loc='lower right') plt.ylabel("P(Outcome = Yes)") plt.xlabel("Age") plt.show();

b = trace['education'] plt.hist(np.exp(b), bins=20, normed=True) plt.xlabel("Odds Ratio") plt.show();

lb, ub = np.percentile(b, 2.5), np.percentile(b, 97.5) print("P(%.3f < Odds Ratio < %.3f) = 0.95" % (np.exp(lb), np.exp(ub)))

stat_df = pm.summary(trace) stat_df['odds_ratio'] = np.exp(stat_df['mean']) stat_df['percentage_effect'] = 100 * (stat_df['odds_ratio'] - 1) stat_df

az.plot_forest(trace);

pm.plot_posterior(trace);

def run_models(df, upper_order=5): ''' Convenience function: Fit a range of pymc3 models of increasing polynomial complexity. Suggest limit to max order 5 since calculation time is exponential. ''' models, traces = OrderedDict(), OrderedDict() for k in range(1,upper_order+1): nm = 'k{}'.format(k) fml = create_poly_modelspec(k) with pm.Model() as models[nm]: print('\nRunning: {}'.format(nm)) pm.glm.GLM.from_formula(fml, df, family=pm.glm.families.Binomial()) traces[nm] = pm.sample(1000, tune=1000, init='adapt_diag') return models, traces def create_poly_modelspec(k=1): ''' Convenience function: Create a polynomial modelspec string for patsy ''' return ('outcome ~ age + job + marital + education + default + housing + loan + contact + month + day_of_week + duration + campaign + pdays + previous + euribor3m ' + ' '.join(['+ np.power(age,{})'.format(j) for j in range(2,k+1)])).strip() models_lin, traces_lin = run_models(data, 3)

model_trace_dict = dict() for nm in ['k1', 'k2', 'k3']: models_lin[nm].name = nm model_trace_dict.update({models_lin[nm]: traces_lin[nm]}) dfwaic = pm.compare(model_trace_dict, ic='WAIC') print(dfwaic) pm.compareplot(dfwaic);

dfwaic

ppc = pm.sample_ppc(trace, model=logistic_model, samples=500)

ppc['y'].mean(axis=0)

preds = np.rint(ppc['y'].mean(axis=0)).astype('int')

def plot_confusion_matrix(cm, classes = ['Not subscribe', 'Subscribe'], title='Subscription Confusion matrix', cmap=plt.cm.Reds): # Display the matrix in text form print('Confusion matrix') print(cm) figsize(8, 8) # Show the matrix using the imshow functionality plt.imshow(cm, interpolation='nearest', cmap=cmap) plt.title(title, size = 20) # Tick marks show classes tick_marks = np.arange(len(classes)) plt.xticks(tick_marks, classes, rotation=45, size = 12) plt.yticks(tick_marks, classes, rotation = 90, size = 12) # Formatting for text labels on plot fmt1 = 's' fmt2 = 'd' thresh = cm.max() / 2. # Four types of classifications types = [['True Negative', 'False Positive'], ['False Negative', 'True Positive']] # Add the actual numbers and the types onto the heatmap plot for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])): plt.text(j, i - 0.05, format(types[i][j], fmt1), horizontalalignment="center", size = 18, color="white" if cm[i, j] > thresh else "black") plt.text(j, i + 0.15, format(cm[i, j], fmt2), horizontalalignment="center", size = 24, color="white" if cm[i, j] > thresh else "black") plt.tight_layout() plt.ylabel('True label', size = 16) plt.xlabel('Predicted Label', size = 16)

cm = confusion_matrix(data['outcome'], preds) plot_confusion_matrix(cm)

print('Accuracy of the full model: ', accuracy_score(preds, data['outcome'])) print('f1 score of the full model: ', f1_score(preds, data['outcome']))