!pip install statsmodels==0.13.0
Collecting statsmodels==0.13.0
Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 18.9 MB/s
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.2.5)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.19.5)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 42.1 MB/s
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.0) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.0) (2021.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels==0.13.0) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.0
import pandas as pd
import numpy as np
import os
import statsmodels.formula.api as smf
# Load the Framingham train/test splits from CSV files in the working directory.
train, test = (pd.read_csv(f'framingham_{part}.csv') for part in ('train', 'test'))
# Summarize columns, dtypes, and non-null counts of the training set.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2560 entries, 0 to 2559
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 male 2560 non-null int64
1 age 2560 non-null int64
2 education 2560 non-null object
3 currentSmoker 2560 non-null int64
4 cigsPerDay 2560 non-null int64
5 BPMeds 2560 non-null int64
6 prevalentStroke 2560 non-null int64
7 prevalentHyp 2560 non-null int64
8 diabetes 2560 non-null int64
9 totChol 2560 non-null int64
10 sysBP 2560 non-null float64
11 diaBP 2560 non-null float64
12 BMI 2560 non-null float64
13 heartRate 2560 non-null int64
14 glucose 2560 non-null int64
15 TenYearCHD 2560 non-null int64
dtypes: float64(3), int64(12), object(1)
memory usage: 320.1+ KB
train.head()
def accuracy(tn, fp, fn, tp):
    """Fraction of correct predictions: (TN + TP) / (TN + FP + FN + TP)."""
    correct = tn + tp
    total = correct + fp + fn
    return correct / total
2a i) ii)
# Full logistic regression: TenYearCHD on every available predictor.
predictors = [
    'male', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
    'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
    'diaBP', 'BMI', 'heartRate', 'glucose',
]
logreg = smf.logit(formula='TenYearCHD ~ ' + ' + '.join(predictors),
                   data=train).fit()
print(logreg.summary())
Optimization terminated successfully.
Current function value: 0.365281
Iterations 7
Logit Regression Results
==============================================================================
Dep. Variable: TenYearCHD No. Observations: 2560
Model: Logit Df Residuals: 2542
Method: MLE Df Model: 17
Date: Tue, 05 Oct 2021 Pseudo R-squ.: 0.1331
Time: 17:29:11 Log-Likelihood: -935.12
converged: True LL-Null: -1078.7
Covariance Type: nonrobust LLR p-value: 5.181e-51
===============================================================================================================
coef std err z P>|z| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------
Intercept -9.2740 0.882 -10.516 0.000 -11.002 -7.546
education[T.High school/GED] -0.1053 0.217 -0.485 0.628 -0.531 0.321
education[T.Some college/vocational school] -0.1025 0.241 -0.425 0.671 -0.575 0.370
education[T.Some high school] 0.0610 0.202 0.302 0.762 -0.334 0.456
male 0.5621 0.134 4.189 0.000 0.299 0.825
age 0.0689 0.008 8.303 0.000 0.053 0.085
currentSmoker 0.1539 0.189 0.816 0.415 -0.216 0.524
cigsPerDay 0.0155 0.007 2.077 0.038 0.001 0.030
BPMeds 0.1528 0.281 0.544 0.587 -0.398 0.704
prevalentStroke 0.8209 0.570 1.441 0.150 -0.296 1.938
prevalentHyp 0.2075 0.167 1.246 0.213 -0.119 0.534
diabetes -0.2975 0.395 -0.753 0.452 -1.072 0.477
totChol 0.0020 0.001 1.445 0.148 -0.001 0.005
sysBP 0.0181 0.005 3.900 0.000 0.009 0.027
diaBP -0.0045 0.008 -0.584 0.560 -0.020 0.011
BMI 0.0136 0.016 0.867 0.386 -0.017 0.044
heartRate -0.0046 0.005 -0.888 0.374 -0.015 0.006
glucose 0.0096 0.003 3.439 0.001 0.004 0.015
===============================================================================================================
# male (sex), age, sysBP, and glucose show the smallest p-values in the summary.
# To gauge how much age contributes, compare accuracy with and without it.
# This run keeps every predictor; the classification threshold is the default 0.5.
y_prob = logreg.predict(test)
y_pred = (y_prob > 0.5).astype(int)
# Confusion matrix for the full model (every predictor included).
from sklearn.metrics import confusion_matrix

# Ground-truth labels for the held-out set.
y_test = test['TenYearCHD']
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix : \n", cm)
Confusion Matrix :
[[912 11]
[155 20]]
# Unpack counts (true neg, false pos, false neg, true pos) and report accuracy.
tn, fp, fn, tp = cm.ravel()
(tn, fp, fn, tp)
accuracy(tn, fp, fn, tp)
# Same model with every predictor except age.
# Drop the age column from copies of both splits.
test2 = test.drop(columns=["age"])
train2 = train.drop(columns=["age"])
# Fit on train2 — the original fit on `train` and left train2 unused; the
# coefficients are identical either way since the formula already omits age,
# but using train2 keeps the code consistent with the drop above.
logreg_age = smf.logit(formula = 'TenYearCHD ~ male + education + currentSmoker + cigsPerDay + BPMeds + prevalentStroke + prevalentHyp + diabetes + totChol + sysBP + diaBP + BMI + heartRate + glucose',
                       data = train2).fit()
y_prob2 = logreg_age.predict(test2)
# Classify with the default 0.5 probability threshold.
y_pred2 = pd.Series([1 if X > 1/2 else 0 for X in y_prob2], index=y_prob2.index)
Optimization terminated successfully.
Current function value: 0.379328
Iterations 7
from sklearn.metrics import confusion_matrix

# Evaluate the no-age model on the held-out labels.
y_test2 = test2['TenYearCHD']
cm2 = confusion_matrix(y_test2, y_pred2)
print("Confusion Matrix : \n", cm2)
# Unpack true negatives, false positives, false negatives, true positives.
tn2, fp2, fn2, tp2 = cm2.ravel()
(tn2, fp2, fn2, tp2)
Confusion Matrix :
[[913 10]
[160 15]]
accuracy(tn2, fp2, fn2, tp2)
2a iv)
# Re-classify the full-model probabilities with the threshold derived in 2a iii).
# Lowering the cutoff to 0.126 trades extra false positives for more true positives.
y_pred3 = (y_prob > 0.126).astype(int)
cm3 = confusion_matrix(y_test, y_pred3)
print("Confusion Matrix : \n", cm3)
tn3, fp3, fn3, tp3 = cm3.ravel()
(tn3, fp3, fn3, tp3)
Confusion Matrix :
[[569 354]
[ 56 119]]
accuracy(tn3, fp3, fn3, tp3)
2a v): see the code in section 2a i) ii) for the confusion matrix.
2a vi)
# With a cutoff of probability >= 1, no patient is flagged as high risk,
# so this is effectively the all-negative classifier.
y_pred4 = (y_prob >= 1).astype(int)
cm4 = confusion_matrix(y_test, y_pred4)
print("Confusion Matrix : \n", cm4)
tn4, fp4, fn4, tp4 = cm4.ravel()
(tn4, fp4, fn4, tp4)
Confusion Matrix :
[[923 0]
[175 0]]
accuracy(tn4, fp4, fn4, tp4)
2a vii)
# One-row frame describing the hypothetical patient from part 2a vii).
a7_test = pd.DataFrame({
    'male': [0],
    'age': [45],
    'education': ['College'],
    'currentSmoker': [1],
    'cigsPerDay': [9],
    'BPMeds': [1],
    'prevalentStroke': [1],
    'prevalentHyp': [0],
    'diabetes': [1],
    'totChol': [220],
    'sysBP': [140],
    'diaBP': [100],
    'BMI': [33],
    'heartRate': [69],
    'glucose': [74],
})
a7_test.head()
# Predicted 10-year CHD probability for the hypothetical patient (full model).
y_prob2a7 = logreg.predict(a7_test)
y_prob2a7
2 b
# Separate each split into the CHD label (y) and the remaining features (x).
y_train = train['TenYearCHD']
x_train = train.drop(columns=['TenYearCHD'])
y_test = test['TenYearCHD']
x_test = test.drop(columns=['TenYearCHD'])
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC curve of the full logistic model versus the naive diagonal baseline.
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.title('ROC Curve', fontsize=18)
plt.xlabel('FPR', fontsize=16)
plt.ylabel('TPR', fontsize=16)
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3,
         label=f'Logistic Regression (area = {roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--',
         label='Naive Baseline (area = 0.50)')
plt.legend(loc='lower right', fontsize=14)
plt.show()