Group 43 - Anna Bialas, Eric Helmold, Maxime Laasri, Marius Merkle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
mortality_data = pd.read_csv("../Data/Dataset.csv")
mortality_data_train = pd.read_csv("../Data/mortality_data_train.csv")
mortality_data_val = pd.read_csv("../Data/mortality_data_val.csv")
mortality_data_test = pd.read_csv("../Data/mortality_data_test.csv")
Description of Data
Missing Values
df_train = mortality_data_train
df_train.drop(columns = ["Survived", "died", "length_in_study", "weight_per_height", "min_age_died", "lived_min_5_years"], inplace = True)
df_train.isnull().sum()
percentage_missing = df_train.isnull().sum() / len(df_train.index)
_, ax = plt.subplots(figsize = (20, 8))
ax.bar(df_train.columns, percentage_missing.values)
ax.set_xlabel("Feature")
ax.set_ylabel("Proportion of data points")
ax.set_title("Proportion of data points missing each feature")
plt.xticks(rotation = 90)
plt.show()
try:
import missingno as msno
except:
!pip install missingno
import missingno as msno
msno.matrix(df_train);
msno.heatmap(df_train);
msno.dendrogram(df_train);
EDA
Variable Correlation
Variable Exploration
### Perform EDA on the Training Data Set: mortality_data_train ###
# Generate 2d scatter plots to visualize how features correspond to mortality at different time horizons
def add_survival_indicator(time_span, data_frame):
"""
Returns a dataframe with with an binary indicator variable added which denotes each record's survival (1 or 0)
over the specified time_span since first entering the study. E.g. for a time_span of 3, a column will be appended
to the input data_frame indicating which individuals survived for 3 or more years since entering the study.
In general:
0 < y < time_span observations would be assigned a value of 0 indicating they passed away within time_span years
for y >= time_span or y < (-1)time_span, observations would be assigned a value of 1 indicating they did survive time_span years
for (-1)time_span < y < 0 we will set these values of y to NA since we cannot say for certain if they survived or not
"""
data_frame=data_frame.copy()
data_frame["time_span_indicator"] = (1)*((data_frame.loc[:,'y']<=(-1)*time_span)|(data_frame.loc[:,'y']>=time_span))
data_frame.loc[((data_frame.loc[:,'y']<0)&(data_frame.loc[:,'y']>(-1)*time_span)),"time_span_indicator"] = np.NaN
return data_frame
#display(new_df.head());display(mortality_data_train.head())
def generate_2d_scatter(feature1,feature2,data_frame,color_indicator,random_selection=False):
"""
Creates a 2d scatter plot of x = feature1 and y = feature2 with color labels corresponding to 1 for survived
and 0 for deceased to visually plot how the response variable differs over a particular covariate feature space
Plot produced is one typically seen in a bi-variate classification setting
"""
if random_selection:
data_frame = data_frame.sample(n=random_selection,replace=False)
plt.figure(figsize=(12,7))
plt.scatter(x=data_frame.loc[data_frame[color_indicator]==1,feature1],
y=data_frame.loc[data_frame[color_indicator]==1,feature2],
label="Survived",color="blue",s=5)
plt.scatter(x=data_frame.loc[data_frame[color_indicator]==0,feature1],
y=data_frame.loc[data_frame[color_indicator]==0,feature2],
label="Deceased",color="orange",s=5)
plt.xlabel(feature1);plt.ylabel(feature2);plt.title(str(feature2)+" vs "+str(feature1)+" Scatterplot")
plt.grid(color='lightgrey');plt.legend();plt.show()
def generate_histogram_plots(features, data_frame, color_indicator):
"""
Creates a density estimation comparison plot for each feature in the features list comparing the density of the feature
for those where survived and those become deceased. These plots help illistrate how individuals may differ along a
particular feature given their survival outcome label
"""
# Plot a density-estimate/histogram for each feature, one for those who survived and one for those who are now deceased
filter_vector = data_frame[color_indicator]==1 # Create a boolean vector to filter the data by survival
if len(features) % 2 == 1:
n = len(features)// 3 + 1
else:
n = len(features)//3
f, axs = plt.subplots(nrows=n, ncols=3, figsize=(14, 3*n))
#for i, feature in enumerate(features):
for i, ax in enumerate(axs.flat):
sns.kdeplot(data_frame.loc[filter_vector, features[i]],label="Survived",ax=ax)
sns.kdeplot(data_frame.loc[~filter_vector, features[i]],label="Deceased",ax=ax)
ax.legend();ax.set_xlabel(features[i])
ax.set_title(str(features[i])+" - Density Estimates - Training Set")
f.tight_layout();plt.show()
new_df = add_survival_indicator(5, mortality_data_train)
generate_2d_scatter("age_entered","systolic_blood_pressure",new_df,"time_span_indicator",2000)
# Create density plots for long term mortality
new_df = add_survival_indicator(5, mortality_data_train)
notable_EDA_cols = ["age_entered", "poverty_index", "uric_acid", "sedimentation_rate", "pulse_pressure", "weight_per_height"]
generate_histogram_plots(notable_EDA_cols,new_df,"time_span_indicator")
Research Question
The research question that we seek to explore is: What factors from our data set are predictive of an individuals longevity? Or in other words, what factors from our data set are predictive of living at least x years into the future from an individual's initial examination? In our analysis, we would like to address this question not only for a particular time horizon chosen for x e.g. 5 or 10 years, but also examine how feature importance may differ at different time horizons.
Baseline Model: Logistic Regression for Classification
Data-Preprocessing
mortality_data = pd.read_csv("../Data/Dataset.csv")
mortality_data_train = pd.read_csv("../Data/mortality_data_train.csv")
mortality_data_val = pd.read_csv("../Data/mortality_data_val.csv")
mortality_data_test = pd.read_csv("../Data/mortality_data_test.csv")
# Specify covariates and the response variable, drop datapoints for which no prediction can be made
predictors = ['age_entered', 'poverty_index', 'pulse_pressure', 'uric_acid', 'sedimentation_rate', 'weight_per_height']
response = 'lived_min_5_years'
mortality_data_train = mortality_data_train[(mortality_data_train['y'] > 0) | (mortality_data_train['y'] < -5)]
mortality_data_val = mortality_data_val[(mortality_data_val['y'] > 0) | (mortality_data_val['y'] < -5)]
x_train = mortality_data_train[predictors]
y_train = mortality_data_train[response]
x_val = mortality_data_val[predictors]
y_val = mortality_data_val[response]
# Imputation for missing values: to be replaced with PDF imputer
mean_imputer = SimpleImputer(strategy = 'mean')
mean_imputer.fit(x_train)
standardizer = StandardScaler()
standardizer.fit(x_train)
x_train = mean_imputer.transform(x_train)
x_val = mean_imputer.transform(x_val)
x_train = standardizer.transform(x_train)
x_val = standardizer.transform(x_val)
Model Setup
# Model training and prediction
classifier = LogisticRegression(penalty = 'none', max_iter = 1000, class_weight = 'None', random_state = 209)
classifier.fit(x_train, y_train)
y_train_pred = classifier.predict(x_train)
y_val_pred = classifier.predict(x_val)
Analysis
def evaluate_model(y_true, y_pred):
# Model evaluation
accuracy = accuracy_score(y_true, y_pred)
true_negative, false_positive, false_negative, true_positive = confusion_matrix(y_true, y_pred).ravel()
# sensitivity, recall, precision true positive rate: true predicted positives/all true positives
report = classification_report(y_true, y_pred, output_dict = True)
sensitivity = true_positive/(true_positive + false_negative)
#sensitivity_train = report['True']['recall']
specificity = true_negative/(true_negative + false_positive)
#f1_score = report['True']['f1-score']
print(f'The model achieves an overall accuracy of {100*accuracy:.3f}%.')
#print(f'The model achieves an F1 score of {100*f1_score:.3f}%.')
print(f'The model achieves a true positive rate (sensitivity) of {100*sensitivity:.3f}%.')
print(f'The model achieves a true negative rate (specificity) of {100*specificity:.3f}%.')
return
print('Performance on the Training Set')
evaluate_model(y_train, y_train_pred)
print('\nPerformance on the Validation Set')
evaluate_model(y_val, y_val_pred)
intercept_baseline = classifier.intercept_[0]
coefficients_baseline = classifier.coef_[0]
N_train = mortality_data_train.shape[0]
N_train_negative, N_train_positive = mortality_data_train.groupby(['lived_min_5_years']).size()
negative_fraction = N_train_negative/N_train
positive_fraction = N_train_positive/N_train
print(f"The fraction of people that lived at least 5 years in the dataset is {100*positive_fraction:.1f}%.")
print(f"The fraction of people that died within the first 5 years in the dataset is {100*negative_fraction:.1f}%.")
# Model training and prediction
classifier = LogisticRegression(penalty = 'none', max_iter = 1000, class_weight = 'balanced', random_state = 209)
classifier.fit(x_train, y_train)
y_train_pred = classifier.predict(x_train)
y_val_pred = classifier.predict(x_val)
print('Performance on the Training Set')
evaluate_model(y_train, y_train_pred)
print('\nPerformance on the Validation Set')
evaluate_model(y_val, y_val_pred)
intercept_balanced = classifier.intercept_[0]
coefficients_balanced = classifier.coef_[0]
width = 0.4
indices = np.arange(7)
fig = plt.figure(figsize = (20, 5))
plt.bar(indices, np.hstack((intercept_baseline, coefficients_baseline)), width, color = 'blue', label = 'Baseline')
plt.bar(indices + width, np.hstack((intercept_balanced, coefficients_balanced)), width, color = 'orange', label = 'Weighted')
plt.xticks(indices + width / 2, ('intercept', 'age_entered', 'poverty_index', 'pulse_pressure', 'uric_acid', 'sedimentation_rate', 'weight_per_height'))
plt.xlabel('Predictor')
plt.ylabel('Coefficient value')
plt.title('Comparison of Coefficient Values for an Unweighted and Balanced Logistic Regression Model')
plt.legend(loc='best')
plt.grid()
plt.show()
Outlook
### Build a basic chart showing the usage of this time_span_indicator variable, how many people fall into each catagory
# as a function of x over time
def compute_proportion_survival(y_vector, time_span):
# Computes the % of people who survive and who do no survive throughout the study on a rolling basis from when they
# enter the study. Also computes the number who are indeterminant due to them having a negative y values that is
# less than the magnitude of time_span
output_labels = (1)*((y_vector<=(-1)*time_span)|(y_vector>=time_span))
output_labels[((y_vector<0)&(y_vector>(-1)*time_span))] = np.NaN
NA_indicies = np.isnan(output_labels)
# Return the % of 1s in the output_labels for this given time_span along with the % of the data set represented
return sum(output_labels[np.invert(NA_indicies)])/(len(output_labels)-sum(NA_indicies)), 1-sum(NA_indicies)/len(output_labels)
time_span_range = list(range(1,23))
data_coverage = [compute_proportion_survival(mortality_data_train["y"],i) for i in time_span_range]
plt.figure(figsize = (15, 6))
plt.plot(time_span_range, [item[0] for item in data_coverage],label="% Survived")
plt.plot(time_span_range, [item[1] for item in data_coverage],label="% of Data Set Represented")
plt.title("Survived/Deceased % by Time Span Threshold and Data Set Inclusion");plt.xlabel("Time Span Cutoff (Years)");plt.ylabel("Proportion")
plt.legend();plt.grid(color="lightgrey");plt.show()