!pip install statsmodels==0.13.0
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
# Load the Framingham training and test sets from the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('framingham_train.csv', 'framingham_test.csv')
)
# Quick look at the training data: column summary first, then the first rows.
train.info()
train.head()
def accuracy(tn, fp, fn, tp):
    """Return the fraction of correct predictions from confusion-matrix counts.

    Args:
        tn: Number of true negatives.
        fp: Number of false positives.
        fn: Number of false negatives.
        tp: Number of true positives.

    Returns:
        (tn + tp) / (tn + fp + fn + tp).

    The four counts must not all be zero, otherwise the denominator is zero.
    """
    correct = tn + tp
    total = tn + fp + fn + tp
    return correct / total
2a i) ii)
# Fit a logistic regression for TenYearCHD on all 15 predictor terms.
logreg = smf.logit(
    formula=(
        'TenYearCHD ~ male + age + education + currentSmoker + cigsPerDay'
        ' + BPMeds + prevalentStroke + prevalentHyp + diabetes + totChol'
        ' + sysBP + diaBP + BMI + heartRate + glucose'
    ),
    data=train,
).fit()
print(logreg.summary())
# Observation from the summary: male (sex), age, sysBP and glucose have the
# lowest p-values.
# Pick the age variable to see how much accuracy changes with and without it.
# The evaluation below uses the full model (all 15 predictor terms) and the
# default probability threshold of 0.5.
y_prob = logreg.predict(test)
# Vectorized thresholding: 1 where the predicted probability exceeds 0.5, else 0.
y_pred = (y_prob > 0.5).astype(int)
from sklearn.metrics import confusion_matrix
y_test = test['TenYearCHD']
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below cannot
# fail when a class happens to be absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix : \n", cm)
# Row-major order of the 2x2 matrix: tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
(tn, fp, fn, tp)
accuracy(tn, fp, fn, tp)
# Same evaluation with the age variable removed (the remaining 14 predictor terms).
test2 = test.drop(columns=["age"])
train2 = train.drop(columns=["age"])
# Fit on train2 (the frame without age) so the reduced data set built above is
# the one actually used; the formula itself no longer references age.
logreg_age = smf.logit(
    formula=(
        'TenYearCHD ~ male + education + currentSmoker + cigsPerDay'
        ' + BPMeds + prevalentStroke + prevalentHyp + diabetes + totChol'
        ' + sysBP + diaBP + BMI + heartRate + glucose'
    ),
    data=train2,
).fit()
y_prob2 = logreg_age.predict(test2)
# Vectorized thresholding at the default 0.5 cut-off.
y_pred2 = (y_prob2 > 0.5).astype(int)
from sklearn.metrics import confusion_matrix
y_test2 = test2['TenYearCHD']
# labels=[0, 1] pins the matrix to 2x2 so the four-way unpack below cannot
# fail when a class happens to be absent from both y_test2 and y_pred2.
cm2 = confusion_matrix(y_test2, y_pred2, labels=[0, 1])
print("Confusion Matrix : \n", cm2)
# Row-major order of the 2x2 matrix: tn, fp, fn, tp.
tn2, fp2, fn2, tp2 = cm2.ravel()
(tn2, fp2, fn2, tp2)
accuracy(tn2, fp2, fn2, tp2)
2a iv)
# Re-classify the full-model probabilities with the threshold derived in 2a iii).
y_pred3 = pd.Series(
    [int(prob > 0.126) for prob in y_prob],
    index=y_prob.index,
)
cm3 = confusion_matrix(y_test, y_pred3)
print("Confusion Matrix : \n", cm3)
# Reuse the matrix computed above; its flattened order is tn, fp, fn, tp.
tn3, fp3, fn3, tp3 = cm3.ravel()
(tn3, fp3, fn3, tp3)
accuracy(tn3, fp3, fn3, tp3)
2a v): Please see the code under 2a i) ii) for the confusion matrix.
2a vi)
# Baseline: none of the patients are flagged as high risk (threshold >= 1).
y_pred4 = (y_prob >= 1).astype(int)
# The predictions here are expected to be a single class, so the 2x2 shape
# would otherwise depend on y_test alone containing both labels.
# labels=[0, 1] fixes the shape so the four-way unpack below is always valid.
cm4 = confusion_matrix(y_test, y_pred4, labels=[0, 1])
print("Confusion Matrix : \n", cm4)
# Row-major order of the 2x2 matrix: tn, fp, fn, tp.
tn4, fp4, fn4, tp4 = cm4.ravel()
(tn4, fp4, fn4, tp4)
accuracy(tn4, fp4, fn4, tp4)
2a vii)
# One-row frame holding the predictor values of the single patient in 2a vii).
a7_test = pd.DataFrame([{
    'male': 0,
    'age': 45,
    'education': 'College',
    'currentSmoker': 1,
    'cigsPerDay': 9,
    'BPMeds': 1,
    'prevalentStroke': 1,
    'prevalentHyp': 0,
    'diabetes': 1,
    'totChol': 220,
    'sysBP': 140,
    'diaBP': 100,
    'BMI': 33,
    'heartRate': 69,
    'glucose': 74,
}])
a7_test.head()
# Predicted probability for this patient from the full model.
y_prob2a7 = logreg.predict(a7_test)
y_prob2a7
2b
# Separate the response column (TenYearCHD) from the remaining columns
# for both the training and the test set.
y_train, y_test = train['TenYearCHD'], test['TenYearCHD']
x_train = train.drop(columns=['TenYearCHD'])
x_test = test.drop(columns=['TenYearCHD'])
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# ROC curve of the full-model probabilities against the test labels, plus its AUC.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))

# Draw the model curve first and the diagonal reference second, so the
# legend order and default colour assignment stay the same.
plt.plot(
    fpr,
    tpr,
    lw=3,
    label='Logistic Regression (area = {:0.2f})'.format(roc_auc),
)
plt.plot(
    [0, 1],
    [0, 1],
    color='navy',
    lw=3,
    linestyle='--',
    label='Naive Baseline (area = 0.50)',
)

# Axes decoration: title, axis labels and explicit limits.
plt.title('ROC Curve', fontsize=18)
plt.xlabel('FPR', fontsize=16)
plt.ylabel('TPR', fontsize=16)
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])

plt.legend(loc='lower right', fontsize=14)
plt.show()