import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
import statsmodels.api as sm
# 3.1
# Load the diabetes data; column 'Y' is the response, every other column is a predictor.
diabetes_data = pd.read_excel('datasets/Diabetes_Data.xlsx')
diabetes_data.head()  # result is discarded, so this shows nothing when run as a script
Y = diabetes_data['Y']
X = diabetes_data.drop(['Y'], axis=1)
# correlation matrix of the predictors
corr_matrix_X = X.corr()
# heatmap plot
# Create the figure BEFORE drawing: the heatmap is drawn on the current axes,
# so sizing the figure afterwards would leave the heatmap at the default size
# and open a second, empty figure.
plt.figure(figsize=(15, 10))
sns.heatmap(corr_matrix_X, annot=True, cmap='coolwarm')
plt.show()
# 3.3
# multivariate model: ordinary least squares on all predictors
lin_reg = LinearRegression()
model1 = lin_reg.fit(X, Y)
# mean squared error (computed on the same rows the model was fitted on)
Y_pred = model1.predict(X)
mse = mean_squared_error(Y, Y_pred)
print("MSE:", mse)
# adjusted r-squared: 1 - (1 - R^2) * (n - 1) / (n - p - 1),
# with n observations and p predictors
n_obs = len(Y)
n_predictors = X.shape[1]
adj_r2 = 1 - (1-r2_score(Y, Y_pred)) * (n_obs-1)/(n_obs-n_predictors-1)
print("Adjusted R-squared:", adj_r2)
# 3.5
# forward selection
def forward_regression(X, y,
                       threshold_in,
                       verbose=False):
    """Pick predictors by forward stepwise selection on OLS p-values.

    Each pass fits one ``sm.OLS`` model per column of ``X`` that has not been
    selected yet (the already selected columns plus that candidate, with a
    constant added) and records the candidate's p-value. The candidate with
    the smallest p-value is added when that p-value is below ``threshold_in``;
    the search ends on the first pass that adds nothing.

    Args:
        X: DataFrame whose columns are the candidate predictors.
        y: Dependent variable handed to ``sm.OLS``.
        threshold_in: A candidate is admitted only when its p-value is
            strictly below this value.
        verbose: When true, print every added column with its p-value.

    Returns:
        list: Labels of the selected columns, in the order they were added.
    """
    included = []
    while True:
        # Walk the columns in their original order (instead of a set
        # difference) so the fitting order, and therefore the winner among
        # equal p-values, is the same on every run.
        excluded = [column for column in X.columns if column not in included]
        if not excluded:
            # Every column is already selected (or X has no columns).
            break
        new_pval = pd.Series(index=excluded, dtype=float)
        for new_column in excluded:
            design = sm.add_constant(pd.DataFrame(X[included + [new_column]]))
            model = sm.OLS(y, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        # Written as "not <" so a NaN minimum also stops the search.
        if not best_pval < threshold_in:
            break
        best_feature = new_pval.idxmin()
        included.append(best_feature)
        if verbose:
            print('Add {:30} with p-value {:.6}'.format(best_feature, best_pval))
    return included
# Run the forward selection once and reuse the result: a second call would
# repeat every OLS fit and is not guaranteed to return the same list that was
# printed.
selected_features = forward_regression(X, Y, 0.05)
print("Useful variables using forward selection: ", selected_features)
# new independent variables from forward selection
new_X = X[selected_features]
# model2: linear regression restricted to the selected predictors
lin_reg = LinearRegression()
model2 = lin_reg.fit(new_X, Y)
# mean squared error (computed on the same rows the model was fitted on)
Y_pred = model2.predict(new_X)
mse = mean_squared_error(Y, Y_pred)
print("New MSE:", mse)
# r-squared
# NOTE(review): this is the plain R-squared, whereas the full model above
# reports the adjusted R-squared; the two figures are not directly comparable.
r2 = r2_score(Y, Y_pred)
print("R-squared:", r2)
# 4
# Read the Titanic passenger table.
titanic_data = pd.read_csv('datasets/titanic3.csv')
# Keep only the rows whose 'survived' flag equals 1.
titanic_data_survived = titanic_data.loc[titanic_data['survived'].eq(1)]
# Overall survival probability = number of survivor rows / number of rows.
survived_prob = len(titanic_data_survived) / len(titanic_data)
print("Probability of survival for a passenger:", survived_prob)
# ---------------------------------------------------------------------------
# First class
# ---------------------------------------------------------------------------
# Passengers travelling in class 1, then the survivors among them.
titanic_data_first_class = titanic_data.loc[titanic_data['pclass'].eq(1)]
first_class_survived = titanic_data_first_class.loc[
    titanic_data_first_class['survived'].eq(1)
]
# Survival probability in class 1 = survivors / passengers of that class.
first_class_survived_prob = len(first_class_survived) / len(titanic_data_first_class)
print("Probability of survival in first class:", first_class_survived_prob)
# ---------------------------------------------------------------------------
# Second class
# ---------------------------------------------------------------------------
# Passengers travelling in class 2, then the survivors among them.
titanic_data_second_class = titanic_data.loc[titanic_data['pclass'].eq(2)]
second_class_survived = titanic_data_second_class.loc[
    titanic_data_second_class['survived'].eq(1)
]
# Survival probability in class 2 = survivors / passengers of that class.
second_class_survived_prob = len(second_class_survived) / len(titanic_data_second_class)
print("Probability of survival in second class:", second_class_survived_prob)
# ---------------------------------------------------------------------------
# Third class
# ---------------------------------------------------------------------------
# Passengers travelling in class 3, then the survivors among them.
titanic_data_third_class = titanic_data.loc[titanic_data['pclass'].eq(3)]
third_class_survived = titanic_data_third_class.loc[
    titanic_data_third_class['survived'].eq(1)
]
# Survival probability in class 3 = survivors / passengers of that class.
third_class_survived_prob = len(third_class_survived) / len(titanic_data_third_class)
print("Probability of survival in third class:", third_class_survived_prob)
# ---------------------------------------------------------------------------
# Female passengers
# ---------------------------------------------------------------------------
# Rows whose 'sex' is 'female', then the survivors among them.
titanic_female = titanic_data.loc[titanic_data['sex'].eq('female')]
female_survived = titanic_female.loc[titanic_female['survived'].eq(1)]
# Survival rate = surviving women / all women.
female_survived_prob = len(female_survived) / len(titanic_female)
print("Female passengers survival rate: ", female_survived_prob)
# ---------------------------------------------------------------------------
# Male passengers
# ---------------------------------------------------------------------------
# Rows whose 'sex' is 'male', then the survivors among them.
titanic_male = titanic_data.loc[titanic_data['sex'].eq('male')]
male_survived = titanic_male.loc[titanic_male['survived'].eq(1)]
# Survival rate = surviving men / all men.
male_survived_prob = len(male_survived) / len(titanic_male)
print("Male passengers survival rate: ", male_survived_prob)
# ---------------------------------------------------------------------------
# Age
# ---------------------------------------------------------------------------
# filling nan values with mean
# The filled column is assigned back to the frame. An in-place fillna on a
# Series pulled out of the frame is a chained in-place update and is not
# guaranteed to reach the frame (pandas Copy-on-Write).
mean_age = titanic_data['age'].mean()
titanic_data['age'] = titanic_data['age'].fillna(mean_age)
titanic_age = titanic_data['age']
# NOTE(review): passengers with an unknown age now carry the mean age, so they
# are counted in every bucket below whose limit is above that mean.
# Each bucket is cumulative ("younger than the limit"), not an age band.
# under 5 of age
u_5_titanic = titanic_data[titanic_age < 5]
u_5_survived = u_5_titanic[u_5_titanic['survived'] == 1]
u_5_survived_prob = len(u_5_survived) / len(u_5_titanic)
print("Probability of survival in under 5:", u_5_survived_prob)
# under 18 of age
u_18_titanic = titanic_data[titanic_age < 18]
u_18_survived = u_18_titanic[u_18_titanic['survived'] == 1]
u_18_survived_prob = len(u_18_survived) / len(u_18_titanic)
print("Probability of survival in under 18:", u_18_survived_prob)
# under 30 of age
u_30_titanic = titanic_data[titanic_age < 30]
u_30_survived = u_30_titanic[u_30_titanic['survived'] == 1]
u_30_survived_prob = len(u_30_survived) / len(u_30_titanic)
print("Probability of survival in under 30:", u_30_survived_prob)
# under 50 of age
u_50_titanic = titanic_data[titanic_age < 50]
u_50_survived = u_50_titanic[u_50_titanic['survived'] == 1]
u_50_survived_prob = len(u_50_survived) / len(u_50_titanic)
print("Probability of survival in under 50:", u_50_survived_prob)
# under 90 of age
u_90_titanic = titanic_data[titanic_age < 90]
u_90_survived = u_90_titanic[u_90_titanic['survived'] == 1]
u_90_survived_prob = len(u_90_survived) / len(u_90_titanic)
print("Probability of survival in under 90:", u_90_survived_prob)
# logistic regression
# labelling gender: replace the 'sex' strings with integer codes
le = LabelEncoder()
titanic_data['sex'] = le.fit_transform(titanic_data['sex'])
# filling missing values
# Assign the filled column back to the frame. An in-place fillna on a Series
# pulled out of the frame is not guaranteed to reach the frame (pandas
# Copy-on-Write), which would leave NaN ages in X below.
mean_age = titanic_data['age'].mean()
titanic_data['age'] = titanic_data['age'].fillna(mean_age)
titanic_age = titanic_data['age']
X = titanic_data[['pclass', 'age', 'sex']]
y = titanic_data['survived']
# Model on all rows: used for the summary table and the coefficient estimates.
log_reg = sm.Logit(y, sm.add_constant(X))
model1 = log_reg.fit()
print(model1.summary())
# exponentiated coefficient estimates (odds ratios)
coef_est = np.exp(model1.params)
print(coef_est)
# split training and test set (seeded so the reported figure is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Measure performance on the held-out rows: refit on the training part only,
# then classify the test part with the same 0.5 cut-off pred_table() uses by
# default. has_constant='add' forces the intercept column even if a predictor
# happens to be constant within a split.
holdout_model = sm.Logit(y_train, sm.add_constant(X_train, has_constant='add')).fit(disp=False)
test_prob = holdout_model.predict(sm.add_constant(X_test, has_constant='add'))
y_test_pred = (test_prob > 0.5).astype(int)
# rows = actual class, columns = predicted class
conf_table = confusion_matrix(y_test, y_test_pred, labels=[0, 1])
# performance (accuracy) = (TN + TP) / (TN + FP + FN + TP)
perf_model = (conf_table[0][0] + conf_table[1][1]) / (conf_table[0][0] + conf_table[1][1] + conf_table[0][1] + conf_table[1][0])
print("Perfomance of the model:", perf_model)