import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
import statsmodels.api as sm
# 3.1
# Load the diabetes dataset, split it into target Y and features X,
# and visualize the pairwise feature correlations.
diabetes_data = pd.read_excel('datasets/Diabetes_Data.xlsx')
diabetes_data.head()
Y = diabetes_data['Y']
X = diabetes_data.drop(['Y'], axis=1)
# correlation matrix of the independent variables
corr_matrix_X = X.corr()
# heatmap plot
# FIX: the figure (and its size) must be created BEFORE drawing the heatmap;
# the original called plt.figure(figsize=...) after sns.heatmap, so the
# heatmap landed on a default-sized figure and an empty 15x10 figure was
# opened on top of it.
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix_X, annot=True, cmap='coolwarm')
plt.show()
# 3.3
# Multivariate linear regression on all features (in-sample evaluation).
lin_reg = LinearRegression()
model1 = lin_reg.fit(X, Y)
# print(model1.coef_)
# in-sample predictions and mean squared error
Y_pred = model1.predict(X)
mse = mean_squared_error(Y, Y_pred)
print("MSE:", mse)
# Adjusted R-squared: penalizes plain R^2 for the number of predictors,
# adj = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
n_obs = len(Y)
n_feat = X.shape[1]
r2_plain = r2_score(Y, Y_pred)
adj_r2 = 1 - (1 - r2_plain) * (n_obs - 1) / (n_obs - n_feat - 1)
print("Adjusted R-squared:", adj_r2)
# MSE: 2859.69634758675
# Adjusted R-squared: 0.5065592904853232
# 3.5
# forward selection
def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """Greedy forward feature selection based on OLS p-values.

    Starting from an empty feature set, repeatedly fit an OLS model for
    each still-excluded column added to the current set, and include the
    column whose coefficient has the smallest p-value — as long as that
    p-value is below ``threshold_in``.  Stops when no remaining column
    qualifies.  Returns the list of selected column names in the order
    they were added.
    """
    included = []
    while True:
        # columns not yet selected
        candidates = [col for col in X.columns if col not in included]
        pvals = pd.Series(index=candidates, dtype=float)
        for col in candidates:
            fitted = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [col]]))).fit()
            pvals[col] = fitted.pvalues[col]
        # pvals.min() is NaN when no candidates remain, so the comparison
        # fails and the loop terminates
        if not (pvals.min() < threshold_in):
            break
        winner = pvals.idxmin()
        included.append(winner)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(winner, pvals.min()))
    return included
# Run forward selection ONCE and reuse the result -- the original called
# forward_regression(X, Y, 0.05) twice (for the print and again for new_X),
# refitting every candidate OLS model a second time for no benefit.
selected_features = forward_regression(X, Y, 0.05)
print("Useful variables using forward selection: ", selected_features)
# Useful variables using forward selection:  ['BMI', 'S5', 'BP', 'S1', 'SEX', 'S2']
# new independent variables from forward selection
new_X = X[selected_features]
# model2: linear regression restricted to the selected features
lin_reg = LinearRegression()
model2 = lin_reg.fit(new_X, Y)
# in-sample mean squared error of the reduced model
Y_pred = model2.predict(new_X)
mse = mean_squared_error(Y, Y_pred)
print("New MSE:", mse)
# r-squared of the reduced model
# FIX: label changed to "New R-squared:" to match the recorded output below
r2 = r2_score(Y, Y_pred)
print("New R-squared:", r2)
# New MSE: 2876.683251787016
# New R-squared: 0.5148837959256445
# 4
# load dataset
titanic_data = pd.read_csv('datasets/titanic3.csv')
# passengers with survived = 1
titanic_data_survived = titanic_data[titanic_data['survived'] == 1]
# overall probability of survival = survivors / all passengers
survived_prob = len(titanic_data_survived) / len(titanic_data)
print("Probability of survival for a passenger:", survived_prob)
# Probability of survival for a passenger: 0.3819709702062643
"""
First class
"""
# first class passengers
titanic_data_first_class = titanic_data.loc[titanic_data['pclass'] == 1]
# survivors within first class
first_class_survived = titanic_data_first_class.loc[titanic_data_first_class['survived'] == 1]
# P(survived | pclass == 1) = survivors / passengers in the class
first_class_survived_prob = len(first_class_survived) / len(titanic_data_first_class)
print("Probability of survival in first class:", first_class_survived_prob)
"""
Second class
"""
# second class passengers
titanic_data_second_class = titanic_data.loc[titanic_data['pclass'] == 2]
# survivors within second class
second_class_survived = titanic_data_second_class.loc[titanic_data_second_class['survived'] == 1]
# P(survived | pclass == 2)
second_class_survived_prob = len(second_class_survived) / len(titanic_data_second_class)
print("Probability of survival in second class:", second_class_survived_prob)
"""
Third class
"""
# third class passengers
titanic_data_third_class = titanic_data.loc[titanic_data['pclass'] == 3]
# survivors within third class
third_class_survived = titanic_data_third_class.loc[titanic_data_third_class['survived'] == 1]
# P(survived | pclass == 3)
third_class_survived_prob = len(third_class_survived) / len(titanic_data_third_class)
print("Probability of survival in third class:", third_class_survived_prob)
# Probability of survival in first class: 0.6191950464396285
# Probability of survival in second class: 0.4296028880866426
# Probability of survival in third class: 0.2552891396332863
"""
Female passengers
"""
# female passengers
titanic_female = titanic_data.loc[titanic_data['sex'] == 'female']
# survivors among the women
female_survived = titanic_female.loc[titanic_female['survived'] == 1]
# P(survived | sex == 'female')
female_survived_prob = len(female_survived) / len(titanic_female)
print("Female passengers survival rate: ", female_survived_prob)
"""
Male passengers
"""
# male passengers
titanic_male = titanic_data.loc[titanic_data['sex'] == 'male']
# survivors among the men
male_survived = titanic_male.loc[titanic_male['survived'] == 1]
# P(survived | sex == 'male')
male_survived_prob = len(male_survived) / len(titanic_male)
print("Male passengers survival rate: ", male_survived_prob)
# Female passengers survival rate:  0.7274678111587983
# Male passengers survival rate:  0.19098457888493475
"""
Age
"""
# Fill missing ages with the column mean.
# FIX: the original ran titanic_age.fillna(..., inplace=True) on a Series
# extracted from the DataFrame.  That chained inplace fill is deprecated in
# pandas and, depending on version/copy-on-write mode, may or may not write
# through to titanic_data.  Assigning the filled Series to the local name is
# unambiguous; only this Series is used for the age masks below, so
# titanic_data itself is deliberately left unmodified here.
titanic_age = titanic_data['age']
mean_age = titanic_age.mean()
titanic_age = titanic_age.fillna(value=mean_age)
# under 5 of age
u_5_titanic = titanic_data[titanic_age < 5]
u_5_survived = u_5_titanic[u_5_titanic['survived'] == 1]
u_5_survived_prob = len(u_5_survived) / len(u_5_titanic)
print("Probability of survival in under 5:", u_5_survived_prob)
# under 18 of age
u_18_titanic = titanic_data[titanic_age < 18]
u_18_survived = u_18_titanic[u_18_titanic['survived'] == 1]
u_18_survived_prob = len(u_18_survived) / len(u_18_titanic)
print("Probability of survival in under 18:", u_18_survived_prob)
# under 30 of age
u_30_titanic = titanic_data[titanic_age < 30]
u_30_survived = u_30_titanic[u_30_titanic['survived'] == 1]
u_30_survived_prob = len(u_30_survived) / len(u_30_titanic)
print("Probability of survival in under 30:", u_30_survived_prob)
# under 50 of age
u_50_titanic = titanic_data[titanic_age < 50]
u_50_survived = u_50_titanic[u_50_titanic['survived'] == 1]
u_50_survived_prob = len(u_50_survived) / len(u_50_titanic)
print("Probability of survival in under 50:", u_50_survived_prob)
# under 90 of age (covers every passenger, per the recorded output which
# matches the overall survival rate)
u_90_titanic = titanic_data[titanic_age < 90]
u_90_survived = u_90_titanic[u_90_titanic['survived'] == 1]
u_90_survived_prob = len(u_90_survived) / len(u_90_titanic)
print("Probability of survival in under 90:", u_90_survived_prob)
# Probability of survival in under 5: 0.6470588235294118
# Probability of survival in under 18: 0.525974025974026
# Probability of survival in under 30: 0.36778846153846156
# Probability of survival in under 50: 0.3803169307756464
# Probability of survival in under 90: 0.3819709702062643
# logistic regression
# Encode gender as an integer label so it can enter the model numerically.
le = LabelEncoder()
titanic_data['sex'] = le.fit_transform(titanic_data['sex'])
# Fill missing ages with the mean and WRITE THE RESULT BACK to the DataFrame.
# FIX: the original filled via titanic_age.fillna(..., inplace=True) on the
# extracted column -- deprecated chained assignment in pandas that may leave
# titanic_data['age'] with NaNs, in which case sm.Logit raises on missing
# data.  Assigning the filled column back is version-proof.
mean_age = titanic_data['age'].mean()
titanic_data['age'] = titanic_data['age'].fillna(value=mean_age)
titanic_age = titanic_data['age']  # keep the name the original defined
X = titanic_data[['pclass', 'age', 'sex']]
y = titanic_data['survived']
# Logit needs an explicit intercept column (add_constant).
log_reg = sm.Logit(y, sm.add_constant(X))
model1 = log_reg.fit()
print(model1.summary())
# Optimization terminated successfully.
#          Current function value: 0.469029
#          Iterations 6
#                            Logit Regression Results
# ==============================================================================
# Dep. Variable:               survived   No. Observations:                 1309
# Model:                          Logit   Df Residuals:                     1305
# Method:                           MLE   Df Model:                            3
# Date:                Tue, 02 Nov 2021   Pseudo R-squ.:                  0.2947
# Time:                        23:26:48   Log-Likelihood:                -613.96
# converged:                       True   LL-Null:                       -870.51
# Covariance Type:            nonrobust   LLR p-value:                6.892e-111
# ==============================================================================
#                  coef    std err          z      P>|z|      [0.025      0.975]
# ------------------------------------------------------------------------------
# const          4.3634      0.366     11.936      0.000       3.647       5.080
# pclass        -1.0653      0.096    -11.122      0.000      -1.253      -0.878
# age           -0.0320      0.006     -5.294      0.000      -0.044      -0.020
# sex           -2.4979      0.149    -16.793      0.000      -2.789      -2.206
# ==============================================================================
# Performance of the model: 0.786096256684492
# coefficient of estimates
# Exponentiating the fitted logit coefficients yields odds ratios: exp(beta)
# is the multiplicative change in the odds of survival per one-unit increase
# in the corresponding regressor (model1 is the sm.Logit fit from above).
coef_est = np.exp(model1.params)
print(coef_est)
# const     78.520886
# pclass     0.344632
# age        0.968529
# sex        0.082261
# dtype: float64
# split training and test set
# NOTE(review): this split is never used -- model1 was fitted on the FULL
# dataset and pred_table() below is therefore an IN-SAMPLE confusion matrix,
# not a held-out evaluation.  The split (which is also unseeded, hence
# nondeterministic) is kept only because later code may reference these names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# confusion matrix of the fitted logit at the default 0.5 threshold
conf_table = model1.pred_table()
# accuracy = (TN + TP) / (TN + TP + FP + FN)
# FIX: the original comment said "TN + FP" in the numerator; the code has
# always computed the correct (TN + TP) accuracy.
perf_model = (conf_table[0][0] + conf_table[1][1]) / (conf_table[0][0] + conf_table[1][1] + conf_table[0][1] + conf_table[1][0])
# FIX: corrected "Perfomance" typo in the printed label
print("Performance of the model:", perf_model)
# Performance of the model: 0.786096256684492