Group 43 - Anna Bialas, Eric Helmold, Maxime Laasri, Marius Merkle

import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.impute import SimpleImputer from sklearn.preprocessing import StandardScaler from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score from sklearn.metrics import confusion_matrix from sklearn.metrics import classification_report

mortality_data = pd.read_csv("../Data/Dataset.csv") mortality_data_train = pd.read_csv("../Data/mortality_data_train.csv") mortality_data_val = pd.read_csv("../Data/mortality_data_val.csv") mortality_data_test = pd.read_csv("../Data/mortality_data_test.csv")

Description of Data

Missing Values

df_train = mortality_data_train df_train.drop(columns = ["Survived", "died", "length_in_study", "weight_per_height", "min_age_died", "lived_min_5_years"], inplace = True) df_train.isnull().sum()

percentage_missing = df_train.isnull().sum() / len(df_train.index) _, ax = plt.subplots(figsize = (20, 8)) ax.bar(df_train.columns, percentage_missing.values) ax.set_xlabel("Feature") ax.set_ylabel("Proportion of data points") ax.set_title("Proportion of data points missing each feature") plt.xticks(rotation = 90) plt.show()

try: import missingno as msno except: !pip install missingno import missingno as msno

msno.matrix(df_train);

msno.heatmap(df_train);

msno.dendrogram(df_train);

EDA

Variable Correlation

Variable Exploration

### Perform EDA on the Training Data Set: mortality_data_train ### # Generate 2d scatter plots to visualize how features correspond to mortality at different time horizons def add_survival_indicator(time_span, data_frame): """ Returns a dataframe with with an binary indicator variable added which denotes each record's survival (1 or 0) over the specified time_span since first entering the study. E.g. for a time_span of 3, a column will be appended to the input data_frame indicating which individuals survived for 3 or more years since entering the study. In general: 0 < y < time_span observations would be assigned a value of 0 indicating they passed away within time_span years for y >= time_span or y < (-1)time_span, observations would be assigned a value of 1 indicating they did survive time_span years for (-1)time_span < y < 0 we will set these values of y to NA since we cannot say for certain if they survived or not """ data_frame=data_frame.copy() data_frame["time_span_indicator"] = (1)*((data_frame.loc[:,'y']<=(-1)*time_span)|(data_frame.loc[:,'y']>=time_span)) data_frame.loc[((data_frame.loc[:,'y']<0)&(data_frame.loc[:,'y']>(-1)*time_span)),"time_span_indicator"] = np.NaN return data_frame #display(new_df.head());display(mortality_data_train.head()) def generate_2d_scatter(feature1,feature2,data_frame,color_indicator,random_selection=False): """ Creates a 2d scatter plot of x = feature1 and y = feature2 with color labels corresponding to 1 for survived and 0 for deceased to visually plot how the response variable differs over a particular covariate feature space Plot produced is one typically seen in a bi-variate classification setting """ if random_selection: data_frame = data_frame.sample(n=random_selection,replace=False) plt.figure(figsize=(12,7)) plt.scatter(x=data_frame.loc[data_frame[color_indicator]==1,feature1], y=data_frame.loc[data_frame[color_indicator]==1,feature2], label="Survived",color="blue",s=5) plt.scatter(x=data_frame.loc[data_frame[color_indicator]==0,feature1], y=data_frame.loc[data_frame[color_indicator]==0,feature2], label="Deceased",color="orange",s=5) plt.xlabel(feature1);plt.ylabel(feature2);plt.title(str(feature2)+" vs "+str(feature1)+" Scatterplot") plt.grid(color='lightgrey');plt.legend();plt.show() def generate_histogram_plots(features, data_frame, color_indicator): """ Creates a density estimation comparison plot for each feature in the features list comparing the density of the feature for those where survived and those become deceased. These plots help illistrate how individuals may differ along a particular feature given their survival outcome label """ # Plot a density-estimate/histogram for each feature, one for those who survived and one for those who are now deceased filter_vector = data_frame[color_indicator]==1 # Create a boolean vector to filter the data by survival if len(features) % 2 == 1: n = len(features)// 3 + 1 else: n = len(features)//3 f, axs = plt.subplots(nrows=n, ncols=3, figsize=(14, 3*n)) #for i, feature in enumerate(features): for i, ax in enumerate(axs.flat): sns.kdeplot(data_frame.loc[filter_vector, features[i]],label="Survived",ax=ax) sns.kdeplot(data_frame.loc[~filter_vector, features[i]],label="Deceased",ax=ax) ax.legend();ax.set_xlabel(features[i]) ax.set_title(str(features[i])+" - Density Estimates - Training Set") f.tight_layout();plt.show()

new_df = add_survival_indicator(5, mortality_data_train) generate_2d_scatter("age_entered","systolic_blood_pressure",new_df,"time_span_indicator",2000)

# Create density plots for long term mortality new_df = add_survival_indicator(5, mortality_data_train) notable_EDA_cols = ["age_entered", "poverty_index", "uric_acid", "sedimentation_rate", "pulse_pressure", "weight_per_height"] generate_histogram_plots(notable_EDA_cols,new_df,"time_span_indicator")

Research Question

The research question that we seek to explore is: What factors from our data set are predictive of an individuals longevity? Or in other words, what factors from our data set are predictive of living at least x years into the future from an individual's initial examination? In our analysis, we would like to address this question not only for a particular time horizon chosen for x e.g. 5 or 10 years, but also examine how feature importance may differ at different time horizons.

Baseline Model: Logistic Regression for Classification

Data-Preprocessing

# Specify covariates and the response variable, drop datapoints for which no prediction can be made predictors = ['age_entered', 'poverty_index', 'pulse_pressure', 'uric_acid', 'sedimentation_rate', 'weight_per_height'] response = 'lived_min_5_years' mortality_data_train = mortality_data_train[(mortality_data_train['y'] > 0) | (mortality_data_train['y'] < -5)] mortality_data_val = mortality_data_val[(mortality_data_val['y'] > 0) | (mortality_data_val['y'] < -5)] x_train = mortality_data_train[predictors] y_train = mortality_data_train[response] x_val = mortality_data_val[predictors] y_val = mortality_data_val[response]

# Imputation for missing values: to be replaced with PDF imputer mean_imputer = SimpleImputer(strategy = 'mean') mean_imputer.fit(x_train) standardizer = StandardScaler() standardizer.fit(x_train) x_train = mean_imputer.transform(x_train) x_val = mean_imputer.transform(x_val) x_train = standardizer.transform(x_train) x_val = standardizer.transform(x_val)

Model Setup

# Model training and prediction classifier = LogisticRegression(penalty = 'none', max_iter = 1000, class_weight = 'None', random_state = 209) classifier.fit(x_train, y_train) y_train_pred = classifier.predict(x_train) y_val_pred = classifier.predict(x_val)

Analysis

def evaluate_model(y_true, y_pred): # Model evaluation accuracy = accuracy_score(y_true, y_pred) true_negative, false_positive, false_negative, true_positive = confusion_matrix(y_true, y_pred).ravel() # sensitivity, recall, precision true positive rate: true predicted positives/all true positives report = classification_report(y_true, y_pred, output_dict = True) sensitivity = true_positive/(true_positive + false_negative) #sensitivity_train = report['True']['recall'] specificity = true_negative/(true_negative + false_positive) #f1_score = report['True']['f1-score'] print(f'The model achieves an overall accuracy of {100*accuracy:.3f}%.') #print(f'The model achieves an F1 score of {100*f1_score:.3f}%.') print(f'The model achieves a true positive rate (sensitivity) of {100*sensitivity:.3f}%.') print(f'The model achieves a true negative rate (specificity) of {100*specificity:.3f}%.') return

print('Performance on the Training Set') evaluate_model(y_train, y_train_pred) print('\nPerformance on the Validation Set') evaluate_model(y_val, y_val_pred)

intercept_baseline = classifier.intercept_[0] coefficients_baseline = classifier.coef_[0] N_train = mortality_data_train.shape[0] N_train_negative, N_train_positive = mortality_data_train.groupby(['lived_min_5_years']).size() negative_fraction = N_train_negative/N_train positive_fraction = N_train_positive/N_train print(f"The fraction of people that lived at least 5 years in the dataset is {100*positive_fraction:.1f}%.") print(f"The fraction of people that died within the first 5 years in the dataset is {100*negative_fraction:.1f}%.")

# Model training and prediction classifier = LogisticRegression(penalty = 'none', max_iter = 1000, class_weight = 'balanced', random_state = 209) classifier.fit(x_train, y_train) y_train_pred = classifier.predict(x_train) y_val_pred = classifier.predict(x_val)

print('Performance on the Training Set') evaluate_model(y_train, y_train_pred) print('\nPerformance on the Validation Set') evaluate_model(y_val, y_val_pred)

intercept_balanced = classifier.intercept_[0] coefficients_balanced = classifier.coef_[0] width = 0.4 indices = np.arange(7) fig = plt.figure(figsize = (20, 5)) plt.bar(indices, np.hstack((intercept_baseline, coefficients_baseline)), width, color = 'blue', label = 'Baseline') plt.bar(indices + width, np.hstack((intercept_balanced, coefficients_balanced)), width, color = 'orange', label = 'Weighted') plt.xticks(indices + width / 2, ('intercept', 'age_entered', 'poverty_index', 'pulse_pressure', 'uric_acid', 'sedimentation_rate', 'weight_per_height')) plt.xlabel('Predictor') plt.ylabel('Coefficient value') plt.title('Comparison of Coefficient Values for an Unweighted and Balanced Logistic Regression Model') plt.legend(loc='best') plt.grid() plt.show()

Outlook

### Build a basic chart showing the usage of this time_span_indicator variable, how many people fall into each catagory # as a function of x over time def compute_proportion_survival(y_vector, time_span): # Computes the % of people who survive and who do no survive throughout the study on a rolling basis from when they # enter the study. Also computes the number who are indeterminant due to them having a negative y values that is # less than the magnitude of time_span output_labels = (1)*((y_vector<=(-1)*time_span)|(y_vector>=time_span)) output_labels[((y_vector<0)&(y_vector>(-1)*time_span))] = np.NaN NA_indicies = np.isnan(output_labels) # Return the % of 1s in the output_labels for this given time_span along with the % of the data set represented return sum(output_labels[np.invert(NA_indicies)])/(len(output_labels)-sum(NA_indicies)), 1-sum(NA_indicies)/len(output_labels)

time_span_range = list(range(1,23)) data_coverage = [compute_proportion_survival(mortality_data_train["y"],i) for i in time_span_range]

plt.figure(figsize = (15, 6)) plt.plot(time_span_range, [item[0] for item in data_coverage],label="% Survived") plt.plot(time_span_range, [item[1] for item in data_coverage],label="% of Data Set Represented") plt.title("Survived/Deceased % by Time Span Threshold and Data Set Inclusion");plt.xlabel("Time Span Cutoff (Years)");plt.ylabel("Proportion") plt.legend();plt.grid(color="lightgrey");plt.show()

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Group 43 - Anna Bialas, Eric Helmold, Maxime Laasri, Marius Merkle

Description of Data

Missing Values

EDA

Variable Correlation

Variable Exploration

Research Question

Baseline Model: Logistic Regression for Classification

Data-Preprocessing

Model Setup

Analysis

Outlook

Group 43 - Anna Bialas, Eric Helmold, Maxime Laasri, Marius Merkle