Mini Project #4 - Predictive Analytics on Dementia
1. Problem Statement
For our project, we chose the topic area of dementia. Dementia is a collection of medical conditions expressed as symptoms of cognitive decline, such as forgetfulness or sustained confusion. These impairments primarily affect memory, communication, and overall decision making.
2. Approach
2.1 Algorithm Selection
Analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
# Dataset Import
dataset = "oasis_longitudinal.csv"
df = pd.read_csv(dataset)
df.head()
# Data Analysis and Cleaning
input_features = ['M/F', 'Age', 'EDUC', 'SES',
'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF']
target_feature = 'Group'
# Only include features of importance
df = df[input_features + [target_feature]]
print(df.columns)
# Convert categorical columns into numerical values
print(df[target_feature].unique())
df['Group'] = df['Group'].replace(['Converted'], ['Demented'])
df['Group'] = df['Group'].replace(['Nondemented', 'Demented'], [0,1])
print(df[target_feature].unique())
print(df["M/F"].unique())
df['M/F'] = df['M/F'].replace(['M', 'F'], [0,1])
print(df["M/F"].unique())
Index(['M/F', 'Age', 'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF',
'Group'],
dtype='object')
['Nondemented' 'Demented' 'Converted']
[0 1]
['M' 'F']
[0 1]
# Missing Values
pd.isnull(df).sum()
# Describe and plot with missing values
print(df['SES'].describe())
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(221)
df['SES'].plot(kind='hist',ax=ax1, grid=True)
ax1.set_title('Histogram of SES')
plt.show()
count 354.000000
mean 2.460452
std 1.134005
min 1.000000
25% 2.000000
50% 2.000000
75% 3.000000
max 5.000000
Name: SES, dtype: float64
print(df['MMSE'].describe())
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(221)
df['MMSE'].plot(kind='hist',ax=ax1, grid=True)
ax1.set_title('Histogram of MMSE')
plt.show()
count 371.000000
mean 27.342318
std 3.683244
min 4.000000
25% 27.000000
50% 29.000000
75% 30.000000
max 30.000000
Name: MMSE, dtype: float64
df = df.dropna(axis=0, how='any')
pd.isnull(df).sum()
corr_matrix = df.corr()
rcParams['figure.figsize'] = 15, 10
sns.heatmap(corr_matrix, annot = True)
'''
Given our correlation matrix, we can see that CDR is highly correlated with
our classification target feature "Group", so we will explore it further.
'''
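# Supplementary check (a sketch using the corr_matrix computed above):
# print each feature's correlation with the target, strongest first.
print(corr_matrix['Group'].drop('Group').sort_values(ascending=False))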
# CDR on all data
print(df['CDR'].describe())
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(221)
df['CDR'].plot(kind='hist',ax=ax1, grid=True)
ax1.set_title('Histogram of CDR')
plt.show()
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(212)
sns.kdeplot(df['CDR'], ax=ax1)
ax1.set_title('Distribution of CDR')
plt.show()
count 354.000000
mean 0.271186
std 0.370537
min 0.000000
25% 0.000000
50% 0.000000
75% 0.500000
max 2.000000
Name: CDR, dtype: float64
# CDR on Nondemented Group
df_nondemented = df.loc[df['Group'] == 0]
print(df_nondemented['CDR'].describe())
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(221)
df_nondemented['CDR'].plot(kind='hist',ax=ax1, grid=True)
ax1.set_title('Histogram of CDR for Nondemented Group')
plt.show()
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(212)
sns.kdeplot(df_nondemented['CDR'], ax=ax1)
ax1.set_title('Distribution of CDR for Nondemented Group')
plt.show()
count 190.000000
mean 0.005263
std 0.051163
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 0.500000
Name: CDR, dtype: float64
# CDR on Demented Group
df_demented = df.loc[df['Group'] == 1]
print(df_demented['CDR'].describe())
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(221)
df_demented['CDR'].plot(kind='hist',ax=ax1, grid=True)
ax1.set_title('Histogram of CDR for Demented Group')
plt.show()
fig = plt.figure(figsize=(16,12))
ax1 = fig.add_subplot(212)
sns.kdeplot(df_demented['CDR'], ax=ax1)
ax1.set_title('Distribution of CDR for Demented Group')
plt.show()
count 164.000000
mean 0.579268
std 0.341141
min 0.000000
25% 0.500000
50% 0.500000
75% 0.500000
max 2.000000
Name: CDR, dtype: float64
df['Group'].value_counts()
# Data Modeling and Performance
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix
df_y = df['Group'].values
df_x = df.drop(columns='Group')
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y)
# Random Forest Classifier
rf = RandomForestClassifier(random_state=0)
rf.fit(x_train, y_train)
pred = rf.predict(x_test)
print("--- Random Forrest Results ---")
print(classification_report(y_test, pred))
--- Random Forest Results ---
precision recall f1-score support
0 0.98 1.00 0.99 55
1 1.00 0.97 0.99 34
accuracy 0.99 89
macro avg 0.99 0.99 0.99 89
weighted avg 0.99 0.99 0.99 89
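Since confusion_matrix is already imported above, a quick sketch like the following (output not shown here) would make the raw misclassification counts behind these scores explicit:
# Rows are true labels (0 = Nondemented, 1 = Demented), columns are predicted labels
print(confusion_matrix(y_test, pred))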
# Analysis to find best k for kNN
error_rate = []
for i in range(2,10):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(x_train, y_train)
    pred_i = knn.predict(x_test)
    error_rate.append(np.mean(pred_i != y_test))
plt.figure(figsize=(10,6))
plt.plot(range(2,10),error_rate,color='blue', linestyle='dashed', marker='o',
markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
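# Supplementary sketch (assumes the error_rate list built above): read the
# lowest-error k off programmatically instead of only from the plot.
best_k = 2 + int(np.argmin(error_rate))   # k values in the loop start at 2
print("Lowest error rate observed at k =", best_k)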
# Based on our analysis, using 4 neighbors works the best
knn = KNeighborsClassifier(n_neighbors=4)
knn.fit(x_train, y_train)
pred = knn.predict(x_test)
print("--- kNN Results ---")
print(classification_report(y_test, pred))
--- kNN Results ---
precision recall f1-score support
0 0.71 0.75 0.73 55
1 0.55 0.50 0.52 34
accuracy 0.65 89
macro avg 0.63 0.62 0.62 89
weighted avg 0.65 0.65 0.65 89
Outcome
To demonstrate the capabilities of our model, this section presents a Python-backed web interface where a physician or patient can input the following information: gender, age, years of education, wealth, Mini Mental State Examination score, Clinical Dementia Rating, estimated total intracranial volume, normalized whole brain volume, and atlas scaling factor.
For Wealth, 1 is low and 5 is high net worth. For the Mini Mental State Examination, the range is 1 to 30 inclusive. For the Clinical Dementia Rating, the range is 0 to 2 inclusive.
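As a minimal sketch of how those values could be gathered (the actual project uses a web form, so the command-line prompts below are only illustrative; the variable names match those used in the prediction code that follows):
# Hypothetical input collection; the real interface is a web form.
Gender = input("Gender (M/F): ")
Age = float(input("Age: "))
YearsofEducation = float(input("Years of education: "))
Wealth = float(input("Wealth (1 = low, 5 = high): "))
MiniMentalStateExamination = float(input("Mini Mental State Examination score (1-30): "))
ClinicalDementiaRating = float(input("Clinical Dementia Rating (0-2): "))
EstimatedTotalIntracranialVolume = float(input("Estimated total intracranial volume (eTIV): "))
NormalizeWholeBrainVolume = float(input("Normalized whole brain volume (nWBV): "))
AtlasScalingFactor = float(input("Atlas scaling factor (ASF): "))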
gender = 0 if Gender == 'M' else 1
feature = np.array([gender, Age, YearsofEducation, Wealth, MiniMentalStateExamination,
ClinicalDementiaRating, EstimatedTotalIntracranialVolume,
NormalizeWholeBrainVolume, AtlasScalingFactor]).reshape(1, -1)
final = 'Nondemented' if rf.predict(feature)[0] == 0 else 'Demented'
print("Our analysis shows that according to the information provided, your experimental diagnosis is " + final + ".")
Our analysis shows that according to the information provided, your experimental diagnosis is Demented.