import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
import statsmodels.api as sm
# 3.1
# Load the diabetes dataset, split it into target Y and features X,
# and visualize the pairwise feature correlations.
diabetes_data = pd.read_excel('datasets/Diabetes_Data.xlsx')
diabetes_data.head()
Y = diabetes_data['Y']
X = diabetes_data.drop(['Y'], axis=1)
# correlation matrix of the independent variables
corr_matrix_X = X.corr()
# heatmap plot
# FIX: the figure (and its size) must be created BEFORE drawing the heatmap;
# the original called plt.figure(figsize=...) after sns.heatmap, so the
# heatmap landed on a default-sized figure and an empty 15x10 figure was
# opened on top of it.
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix_X, annot=True, cmap='coolwarm')
plt.show()
# 3.3
# Multivariate linear regression on all features (in-sample evaluation).
lin_reg = LinearRegression()
model1 = lin_reg.fit(X, Y)
# print(model1.coef_)
# in-sample predictions and mean squared error
Y_pred = model1.predict(X)
mse = mean_squared_error(Y, Y_pred)
print("MSE:", mse)
# Adjusted R-squared: penalizes plain R^2 for the number of predictors,
# adj = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
n_obs = len(Y)
n_feat = X.shape[1]
r2_plain = r2_score(Y, Y_pred)
adj_r2 = 1 - (1 - r2_plain) * (n_obs - 1) / (n_obs - n_feat - 1)
print("Adjusted R-squared:", adj_r2)
# MSE: 2859.69634758675
# Adjusted R-squared: 0.5065592904853232
# 3.5
# forward selection
def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """Greedy forward feature selection based on OLS p-values.

    Starting from an empty feature set, repeatedly fit an OLS model for
    each still-excluded column added to the current set, and include the
    column whose coefficient has the smallest p-value — as long as that
    p-value is below ``threshold_in``.  Stops when no remaining column
    qualifies.  Returns the list of selected column names in the order
    they were added.
    """
    included = []
    while True:
        # columns not yet selected
        candidates = [col for col in X.columns if col not in included]
        pvals = pd.Series(index=candidates, dtype=float)
        for col in candidates:
            fitted = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included + [col]]))).fit()
            pvals[col] = fitted.pvalues[col]
        # pvals.min() is NaN when no candidates remain, so the comparison
        # fails and the loop terminates
        if not (pvals.min() < threshold_in):
            break
        winner = pvals.idxmin()
        included.append(winner)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(winner, pvals.min()))
    return included
# Run forward selection ONCE and reuse the result -- the original called
# forward_regression(X, Y, 0.05) twice (for the print and again for new_X),
# refitting every candidate OLS model a second time for no benefit.
selected_features = forward_regression(X, Y, 0.05)
print("Useful variables using forward selection: ", selected_features)
# Useful variables using forward selection:  ['BMI', 'S5', 'BP', 'S1', 'SEX', 'S2']
# new independent variables from forward selection
new_X = X[selected_features]
# model2: linear regression restricted to the selected features
lin_reg = LinearRegression()
model2 = lin_reg.fit(new_X, Y)
# in-sample mean squared error of the reduced model
Y_pred = model2.predict(new_X)
mse = mean_squared_error(Y, Y_pred)
print("New MSE:", mse)
# r-squared of the reduced model
# FIX: label changed to "New R-squared:" to match the recorded output below
r2 = r2_score(Y, Y_pred)
print("New R-squared:", r2)
# New MSE: 2876.683251787016
# New R-squared: 0.5148837959256445
# 4
# load dataset
titanic_data = pd.read_csv('datasets/titanic3.csv')
# passengers with survived = 1
titanic_data_survived = titanic_data[titanic_data['survived'] == 1]
# overall probability of survival = survivors / all passengers
survived_prob = len(titanic_data_survived) / len(titanic_data)
print("Probability of survival for a passenger:", survived_prob)
# Probability of survival for a passenger: 0.3819709702062643
"""
First class
"""
# first class passengers
titanic_data_first_class = titanic_data.loc[titanic_data['pclass'] == 1]
# survivors within first class
first_class_survived = titanic_data_first_class.loc[titanic_data_first_class['survived'] == 1]
# P(survived | pclass == 1) = survivors / passengers in the class
first_class_survived_prob = len(first_class_survived) / len(titanic_data_first_class)
print("Probability of survival in first class:", first_class_survived_prob)
"""
Second class
"""
# second class passengers
titanic_data_second_class = titanic_data.loc[titanic_data['pclass'] == 2]
# survivors within second class
second_class_survived = titanic_data_second_class.loc[titanic_data_second_class['survived'] == 1]
# P(survived | pclass == 2)
second_class_survived_prob = len(second_class_survived) / len(titanic_data_second_class)
print("Probability of survival in second class:", second_class_survived_prob)
"""
Third class
"""
# third class passengers
titanic_data_third_class = titanic_data.loc[titanic_data['pclass'] == 3]
# survivors within third class
third_class_survived = titanic_data_third_class.loc[titanic_data_third_class['survived'] == 1]
# P(survived | pclass == 3)
third_class_survived_prob = len(third_class_survived) / len(titanic_data_third_class)
print("Probability of survival in third class:", third_class_survived_prob)
# Probability of survival in first class: 0.6191950464396285
# Probability of survival in second class: 0.4296028880866426
# Probability of survival in third class: 0.2552891396332863
"""
Female passengers
"""
# female passengers
titanic_female = titanic_data.loc[titanic_data['sex'] == 'female']
# survivors among the women
female_survived = titanic_female.loc[titanic_female['survived'] == 1]
# P(survived | sex == 'female')
female_survived_prob = len(female_survived) / len(titanic_female)
print("Female passengers survival rate: ", female_survived_prob)
"""
Male passengers
"""
# male passengers
titanic_male = titanic_data.loc[titanic_data['sex'] == 'male']
# survivors among the men
male_survived = titanic_male.loc[titanic_male['survived'] == 1]
# P(survived | sex == 'male')
male_survived_prob = len(male_survived) / len(titanic_male)
print("Male passengers survival rate: ", male_survived_prob)
# Female passengers survival rate:  0.7274678111587983
# Male passengers survival rate:  0.19098457888493475
"""
Age
"""
# Fill missing ages with the column mean.
# FIX: the original ran titanic_age.fillna(..., inplace=True) on a Series
# extracted from the DataFrame.  That chained inplace fill is deprecated in
# pandas and, depending on version/copy-on-write mode, may or may not write
# through to titanic_data.  Assigning the filled Series to the local name is
# unambiguous; only this Series is used for the age masks below, so
# titanic_data itself is deliberately left unmodified here.
titanic_age = titanic_data['age']
mean_age = titanic_age.mean()
titanic_age = titanic_age.fillna(value=mean_age)
# under 5 of age
u_5_titanic = titanic_data[titanic_age < 5]
u_5_survived = u_5_titanic[u_5_titanic['survived'] == 1]
u_5_survived_prob = len(u_5_survived) / len(u_5_titanic)
print("Probability of survival in under 5:", u_5_survived_prob)
# under 18 of age
u_18_titanic = titanic_data[titanic_age < 18]
u_18_survived = u_18_titanic[u_18_titanic['survived'] == 1]
u_18_survived_prob = len(u_18_survived) / len(u_18_titanic)
print("Probability of survival in under 18:", u_18_survived_prob)
# under 30 of age
u_30_titanic = titanic_data[titanic_age < 30]
u_30_survived = u_30_titanic[u_30_titanic['survived'] == 1]
u_30_survived_prob = len(u_30_survived) / len(u_30_titanic)
print("Probability of survival in under 30:", u_30_survived_prob)
# under 50 of age
u_50_titanic = titanic_data[titanic_age < 50]
u_50_survived = u_50_titanic[u_50_titanic['survived'] == 1]
u_50_survived_prob = len(u_50_survived) / len(u_50_titanic)
print("Probability of survival in under 50:", u_50_survived_prob)
# under 90 of age (covers every passenger, per the recorded output which
# matches the overall survival rate)
u_90_titanic = titanic_data[titanic_age < 90]
u_90_survived = u_90_titanic[u_90_titanic['survived'] == 1]
u_90_survived_prob = len(u_90_survived) / len(u_90_titanic)
print("Probability of survival in under 90:", u_90_survived_prob)
# Probability of survival in under 5: 0.6470588235294118
# Probability of survival in under 18: 0.525974025974026
# Probability of survival in under 30: 0.36778846153846156
# Probability of survival in under 50: 0.3803169307756464
# Probability of survival in under 90: 0.3819709702062643
# logistic regression
# Encode gender as an integer label so it can enter the model numerically.
le = LabelEncoder()
titanic_data['sex'] = le.fit_transform(titanic_data['sex'])
# Fill missing ages with the mean and WRITE THE RESULT BACK to the DataFrame.
# FIX: the original filled via titanic_age.fillna(..., inplace=True) on the
# extracted column -- deprecated chained assignment in pandas that may leave
# titanic_data['age'] with NaNs, in which case sm.Logit raises on missing
# data.  Assigning the filled column back is version-proof.
mean_age = titanic_data['age'].mean()
titanic_data['age'] = titanic_data['age'].fillna(value=mean_age)
titanic_age = titanic_data['age']  # keep the name the original defined
X = titanic_data[['pclass', 'age', 'sex']]
y = titanic_data['survived']
# Logit needs an explicit intercept column (add_constant).
log_reg = sm.Logit(y, sm.add_constant(X))
model1 = log_reg.fit()
print(model1.summary())
# Optimization terminated successfully.
#          Current function value: 0.469029
#          Iterations 6
#                            Logit Regression Results
# ==============================================================================
# Dep. Variable:               survived   No. Observations:                 1309
# Model:                          Logit   Df Residuals:                     1305
# Method:                           MLE   Df Model:                            3
# Date:                Tue, 02 Nov 2021   Pseudo R-squ.:                  0.2947
# Time:                        23:26:48   Log-Likelihood:                -613.96
# converged:                       True   LL-Null:                       -870.51
# Covariance Type:            nonrobust   LLR p-value:                6.892e-111
# ==============================================================================
#                  coef    std err          z      P>|z|      [0.025      0.975]
# ------------------------------------------------------------------------------
# const          4.3634      0.366     11.936      0.000       3.647       5.080
# pclass        -1.0653      0.096    -11.122      0.000      -1.253      -0.878
# age           -0.0320      0.006     -5.294      0.000      -0.044      -0.020
# sex           -2.4979      0.149    -16.793      0.000      -2.789      -2.206
# ==============================================================================
# Performance of the model: 0.786096256684492
# coefficient of estimates
# Exponentiating the fitted logit coefficients yields odds ratios: exp(beta)
# is the multiplicative change in the odds of survival per one-unit increase
# in the corresponding regressor (model1 is the sm.Logit fit from above).
coef_est = np.exp(model1.params)
print(coef_est)
# const     78.520886
# pclass     0.344632
# age        0.968529
# sex        0.082261
# dtype: float64
# split training and test set
# NOTE(review): this split is never used -- model1 was fitted on the FULL
# dataset and pred_table() below is therefore an IN-SAMPLE confusion matrix,
# not a held-out evaluation.  The split (which is also unseeded, hence
# nondeterministic) is kept only because later code may reference these names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# confusion matrix of the fitted logit at the default 0.5 threshold
conf_table = model1.pred_table()
# accuracy = (TN + TP) / (TN + TP + FP + FN)
# FIX: the original comment said "TN + FP" in the numerator; the code has
# always computed the correct (TN + TP) accuracy.
perf_model = (conf_table[0][0] + conf_table[1][1]) / (conf_table[0][0] + conf_table[1][1] + conf_table[0][1] + conf_table[1][0])
# FIX: corrected "Perfomance" typo in the printed label
print("Performance of the model:", perf_model)
# Performance of the model: 0.786096256684492