!pip install statsmodels==0.13.0
Collecting statsmodels==0.13.0
Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 18.9 MB/s
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.2.5)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.19.5)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 42.1 MB/s
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.0) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.0) (2021.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels==0.13.0) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.0
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
# Load the Framingham train/test splits from CSV files in the working directory.
train, test = (pd.read_csv(f'framingham_{part}.csv') for part in ('train', 'test'))
# Summarize columns, dtypes, and non-null counts of the training set.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2560 entries, 0 to 2559
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 male 2560 non-null int64
1 age 2560 non-null int64
2 education 2560 non-null object
3 currentSmoker 2560 non-null int64
4 cigsPerDay 2560 non-null int64
5 BPMeds 2560 non-null int64
6 prevalentStroke 2560 non-null int64
7 prevalentHyp 2560 non-null int64
8 diabetes 2560 non-null int64
9 totChol 2560 non-null int64
10 sysBP 2560 non-null float64
11 diaBP 2560 non-null float64
12 BMI 2560 non-null float64
13 heartRate 2560 non-null int64
14 glucose 2560 non-null int64
15 TenYearCHD 2560 non-null int64
dtypes: float64(3), int64(12), object(1)
memory usage: 320.1+ KB
train.head()
def accuracy(tn, fp, fn, tp):
    """Fraction of correct predictions: (TN + TP) / (TN + FP + FN + TP)."""
    correct = tn + tp
    total = correct + fp + fn
    return correct / total
2a i) ii)
# Full logistic regression: TenYearCHD on every available predictor.
predictors = [
    'male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
    'diaBP', 'BMI', 'heartRate', 'glucose',
]
logreg = smf.logit(formula='TenYearCHD ~ ' + ' + '.join(predictors),
                   data=train).fit()
print(logreg.summary())
Optimization terminated successfully.
Current function value: 0.365281
Iterations 7
Logit Regression Results
==============================================================================
Dep. Variable: TenYearCHD No. Observations: 2560
Model: Logit Df Residuals: 2542
Method: MLE Df Model: 17
Date: Tue, 05 Oct 2021 Pseudo R-squ.: 0.1331
Time: 17:29:11 Log-Likelihood: -935.12
converged: True LL-Null: -1078.7
Covariance Type: nonrobust LLR p-value: 5.181e-51
===============================================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------
Intercept -9.2740 0.882 -10.516 0.000 -11.002 -7.546
education[T.High school/GED] -0.1053 0.217 -0.485 0.628 -0.531 0.321
education[T.Some college/vocational school] -0.1025 0.241 -0.425 0.671 -0.575 0.370
education[T.Some high school] 0.0610 0.202 0.302 0.762 -0.334 0.456
male 0.5621 0.134 4.189 0.000 0.299 0.825
age 0.0689 0.008 8.303 0.000 0.053 0.085
currentSmoker 0.1539 0.189 0.816 0.415 -0.216 0.524
cigsPerDay 0.0155 0.007 2.077 0.038 0.001 0.030
BPMeds 0.1528 0.281 0.544 0.587 -0.398 0.704
prevalentStroke 0.8209 0.570 1.441 0.150 -0.296 1.938
prevalentHyp 0.2075 0.167 1.246 0.213 -0.119 0.534
diabetes -0.2975 0.395 -0.753 0.452 -1.072 0.477
totChol 0.0020 0.001 1.445 0.148 -0.001 0.005
sysBP 0.0181 0.005 3.900 0.000 0.009 0.027
diaBP -0.0045 0.008 -0.584 0.560 -0.020 0.011
BMI 0.0136 0.016 0.867 0.386 -0.017 0.044
heartRate -0.0046 0.005 -0.888 0.374 -0.015 0.006
glucose 0.0096 0.003 3.439 0.001 0.004 0.015
===============================================================================================================
# male (sex), age, sysBP, and glucose show the smallest p-values in the summary.
# To gauge how much age contributes, compare accuracy with and without it.
# This run keeps every predictor; the classification threshold is the default 0.5.
y_prob = logreg.predict(test)
y_pred = (y_prob > 0.5).astype(int)
# Confusion matrix for the full model (every predictor included).
from sklearn.metrics import confusion_matrix

# Ground-truth labels for the held-out set.
y_test = test['TenYearCHD']
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n", cm)
Confusion Matrix :
[[912 11]
[155 20]]
# Unpack counts (true neg, false pos, false neg, true pos) and report accuracy.
tn, fp, fn, tp = cm.ravel()
(tn, fp, fn, tp)
accuracy(tn, fp, fn, tp)
# Same model with every predictor except age.
# Drop the age column from copies of both splits.
test2 = test.drop(columns=["age"])
train2 = train.drop(columns=["age"])
# Fit on train2 — the original fit on `train` and left train2 unused; the
# coefficients are identical either way since the formula already omits age,
# but using train2 keeps the code consistent with the drop above.
logreg_age = smf.logit(formula = 'TenYearCHD ~ male + education + currentSmoker + cigsPerDay + BPMeds + prevalentStroke + prevalentHyp + diabetes + totChol + sysBP + diaBP + BMI + heartRate + glucose',
                       data = train2).fit()
y_prob2 = logreg_age.predict(test2)
# Classify with the default 0.5 probability threshold.
y_pred2 = pd.Series([1 if X > 1/2 else 0 for X in y_prob2], index=y_prob2.index)
Optimization terminated successfully.
Current function value: 0.379328
Iterations 7
from sklearn.metrics import confusion_matrix

# Evaluate the no-age model on the held-out labels.
y_test2 = test2['TenYearCHD']
cm2 = confusion_matrix(y_test2, y_pred2)
print("Confusion Matrix : \n", cm2)
# Unpack true negatives, false positives, false negatives, true positives.
tn2, fp2, fn2, tp2 = cm2.ravel()
(tn2, fp2, fn2, tp2)
Confusion Matrix :
[[913 10]
[160 15]]
accuracy(tn2, fp2, fn2, tp2)
2a iv)
# Re-classify the full-model probabilities with the threshold derived in 2a iii).
# Lowering the cutoff to 0.126 trades extra false positives for more true positives.
y_pred3 = (y_prob > 0.126).astype(int)
cm3 = confusion_matrix(y_test, y_pred3)
print("Confusion Matrix : \n", cm3)
tn3, fp3, fn3, tp3 = cm3.ravel()
(tn3, fp3, fn3, tp3)
Confusion Matrix :
[[569 354]
[ 56 119]]
accuracy(tn3, fp3, fn3, tp3)
2a v): see the code in section 2a i) ii) for the confusion matrix.
2a vi)
# With a cutoff of probability >= 1, no patient is flagged as high risk,
# so this is effectively the all-negative classifier.
y_pred4 = (y_prob >= 1).astype(int)
cm4 = confusion_matrix(y_test, y_pred4)
print("Confusion Matrix : \n", cm4)
tn4, fp4, fn4, tp4 = cm4.ravel()
(tn4, fp4, fn4, tp4)
Confusion Matrix :
[[923 0]
[175 0]]
accuracy(tn4, fp4, fn4, tp4)
2a vii)
# One-row frame describing the hypothetical patient from part 2a vii).
a7_test = pd.DataFrame({
    'male': [0],
    'age': [45],
    'education': ['College'],
    'currentSmoker': [1],
    'cigsPerDay': [9],
    'BPMeds': [1],
    'prevalentStroke': [1],
    'prevalentHyp': [0],
    'diabetes': [1],
    'totChol': [220],
    'sysBP': [140],
    'diaBP': [100],
    'BMI': [33],
    'heartRate': [69],
    'glucose': [74],
})
a7_test.head()
# Predicted 10-year CHD probability for the hypothetical patient (full model).
y_prob2a7 = logreg.predict(a7_test)
y_prob2a7
2 b
# Separate each split into the CHD label (y) and the remaining features (x).
y_train = train['TenYearCHD']
x_train = train.drop(columns=['TenYearCHD'])
y_test = test['TenYearCHD']
x_test = test.drop(columns=['TenYearCHD'])
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the full logistic model versus the naive diagonal baseline.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.title('ROC Curve', fontsize=18)
plt.xlabel('FPR', fontsize=16)
plt.ylabel('TPR', fontsize=16)
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3,
         label=f'Logistic Regression (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--',
         label='Naive Baseline (area = 0.50)')
plt.legend(loc='lower right', fontsize=14)
plt.show()