!pip install statsmodels==0.12.2
Requirement already satisfied: statsmodels==0.12.2 in /root/venv/lib/python3.7/site-packages (0.12.2)
Requirement already satisfied: patsy>=0.5 in /root/venv/lib/python3.7/site-packages (from statsmodels==0.12.2) (0.5.1)
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.6.3)
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.19.5)
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.2.4)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels==0.12.2) (2021.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels==0.12.2) (2.8.1)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels==0.12.2) (1.16.0)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy import stats
from scipy.stats import probplot
# Use seaborn's "whitegrid" theme for the plots drawn after this call.
sns.set_style(style="whitegrid")
# Load the passenger data.
titanicData = pd.read_csv('Bijl-deBraal-data.csv')
# Discard rows in which every field is empty (e.g. a blank line at the end of
# the CSV). Removing them explicitly keeps the row count and the per-column
# missing-value counts exact, instead of correcting each count by a fixed -1,
# which is only right when exactly one such row exists.
titanicData = titanicData.dropna(how="all")
# Independent snapshot of the data before any column or row is removed;
# .copy() guarantees later in-place changes to titanicData cannot leak into it.
titanicOriginal = titanicData.copy()

# Summarize keys with the associated number of missing values
print("The data set contains:", len(titanicData), "data points")
for key, n_missing in titanicData.isna().sum().items():
    print(key, ": ", n_missing, "missing value(s).")

# Remove the "cabin", "embarked", "boat", "body" and "home.dest" attributes
titanicData = titanicData.drop(
    columns=["cabin", "embarked", "boat", "body", "home.dest"]
)

# Keep the rows that still contain a missing value for inspection, then
# remove them from the working data set.
missingValues = titanicData[titanicData.isna().any(axis=1)]
titanicData = titanicData.dropna()
The data set contains: 1310 data points
pclass : 0 missing value(s).
survived : 0 missing value(s).
name : 0 missing value(s).
sex : 0 missing value(s).
age : 263 missing value(s).
sibsp : 0 missing value(s).
parch : 0 missing value(s).
ticket : 0 missing value(s).
fare : 1 missing value(s).
cabin : 1014 missing value(s).
embarked : 2 missing value(s).
boat : 823 missing value(s).
body : 1188 missing value(s).
home.dest : 564 missing value(s).
# Checking normality of the response variable (fare): density histogram with a
# kernel density estimate, before and after the data cleaning.
# histplot(kde=True, stat="density") is the axes-level replacement for the
# deprecated distplot.
fig, axs = plt.subplots(1, 2, sharey='row')
sns.histplot(x=titanicOriginal["fare"], kde=True, stat="density", ax=axs[0])
axs[0].set_title('Old dataset')
sns.histplot(x=titanicData["fare"], kde=True, stat="density", ax=axs[1])
axs[1].set_title('New dataset')
plt.show()
print("Figure 1: A histogram of the titanic dataset, representing the population distribution")
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
/shared-libs/python3.7/py/lib/python3.7/site-packages/seaborn/distributions.py:2557: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
Figure 1: A histogram of the titanic dataset, representing the population distribution
# Check for linearity and constant variability of the model fare ~ sex.
# Encode sex as a 0/1 indicator so it enters the regression as a number.
# Series.map returns a numeric column; a value missing from the mapping would
# become NaN instead of being passed through unchanged.
dummy_coding = {'male': 0, 'female': 1}
sex_dummy = titanicData['sex'].map(dummy_coding)
titanicData['sex_dummy'] = sex_dummy
formula_string = "fare ~ sex_dummy"
model = sm.formula.ols(formula=formula_string, data=titanicData)
model_fitted = model.fit()
# Residuals of the fitted model: observed fare minus predicted fare.
e = model_fitted.resid
# residplot regresses y on x and plots the residuals against x, so the
# predictor goes in x and the response (fare) in y.
sns.residplot(x=titanicData['sex_dummy'], y=titanicData['fare'])
plt.show()
print("Figure 2: Linearity and constant variability between sex and fare")
Figure 2: Linearity and constant variability between sex and fare
# Normal probability (Q-Q) plot of the residuals of the fare ~ sex model,
# i.e. observed fare minus the fare predicted by model_fitted.
residual_values = titanicData['fare'].sub(model_fitted.predict())
probplot(residual_values, plot=plt)
plt.show()
print("Figure 3: The normal distribution of the variable sex")
Figure 3: The normal distribution of the variable sex
# Check for linearity and constant variability of the model fare ~ age.
formula_string = "fare ~ age"
model = sm.formula.ols(formula=formula_string, data=titanicData)
model_fitted_age = model.fit()
# Fare predicted by the fitted model for every passenger, so that
# `titanicData['fare'] - b` is the residual.
b = model_fitted_age.fittedvalues
# residplot regresses y on x and plots the residuals against x, so the
# predictor goes in x and the response (fare) in y.
sns.residplot(x=titanicData['age'], y=titanicData['fare'])
plt.show()
print("Figure 4: Linearity and constant variability between age and fare")
Figure 4: Linearity and constant variability between age and fare
# Normal probability (Q-Q) plot of the residuals of the fare ~ age model.
# The residuals are taken straight from the fitted model (observed fare minus
# predicted fare) so this cell does not depend on what was stored in `b`.
residual_values_age = titanicData['fare'] - model_fitted_age.predict()
stats.probplot(residual_values_age, plot=plt)
plt.show()
print("Figure 5: The normal distribution of the variable age")
Figure 5: The normal distribution of the variable age
# Check for linearity and constant variability of the model fare ~ sibsp.
formula_string = "fare ~ sibsp"
model = sm.formula.ols(formula=formula_string, data=titanicData)
model_fitted_sibsp = model.fit()
# Fare predicted by the fitted model for every passenger, so that
# `titanicData['fare'] - y` is the residual.
y = model_fitted_sibsp.fittedvalues
# residplot regresses y on x and plots the residuals against x, so the
# predictor goes in x and the response (fare) in y.
sns.residplot(x=titanicData['sibsp'], y=titanicData['fare'])
plt.show()
print("Figure 6: Linearity and constant variability between siblings/spouces aboard and fare")
Figure 6: Linearity and constant variability between siblings/spouces aboard and fare
# Normal probability (Q-Q) plot of the residuals of the fare ~ sibsp model.
# The residuals are taken straight from the fitted model (observed fare minus
# predicted fare) so this cell does not depend on what was stored in `y`.
residual_values_sibsp = titanicData['fare'] - model_fitted_sibsp.predict()
stats.probplot(residual_values_sibsp, plot=plt)
plt.show()
print("Figure 7: The normal distribution of the variable siblings/spouces aboard")
Figure 7: The normal distribution of the variable siblings/spouces aboard
# Check for linearity and constant variability of the model fare ~ parch.
formula_string = "fare ~ parch"
model = sm.formula.ols(formula=formula_string, data=titanicData)
model_fitted_parch = model.fit()
# Fare predicted by the fitted model for every passenger, so that
# `titanicData['fare'] - a` is the residual.
a = model_fitted_parch.fittedvalues
# residplot regresses y on x and plots the residuals against x, so the
# predictor goes in x and the response (fare) in y.
sns.residplot(x=titanicData['parch'], y=titanicData['fare'])
plt.show()
print("Figure 8: Linearity and constant variability between parents/children and fare")
Figure 8: Linearity and constant variability between parents/children and fare
# Normal probability (Q-Q) plot of the residuals of the fare ~ parch model.
# The residuals are taken straight from the fitted model (observed fare minus
# predicted fare) so this cell does not depend on what was stored in `a`.
residual_values_parch = titanicData['fare'] - model_fitted_parch.predict()
stats.probplot(residual_values_parch, plot=plt)
plt.show()
print("Figure 9: The normal distribution of the variable parents/children")
Figure 9: The normal distribution of the variable parents/children
# Scatterplot of sex and fare with the fitted regression line on top.
plt.figure(figsize=(10, 8))
sns.scatterplot(x=titanicData['sex_dummy'], y=titanicData['fare'])
# Regression line taken from the fitted fare ~ sex_dummy model instead of
# hand-typed coefficients, so it always matches the data in use.
fare = model_fitted.fittedvalues
# Draw the predicted fares (not the observed ones) so the line is the
# regression line; a contrasting colour keeps it visible over the points.
sns.lineplot(x=titanicData['sex_dummy'], y=fare, color="red")
plt.title("sex vs fare", fontsize="xx-large")
plt.show()
print("Figure 10: scatterplot how the sex affects the fare")
Figure 10: scatterplot how the sex affects the fare
# Scatterplot of age and fare with the fitted regression line on top.
plt.figure(figsize=(10, 8))
sns.scatterplot(x=titanicData['age'], y=titanicData['fare'])
# Regression line taken from the fitted fare ~ age model instead of
# hand-typed coefficients, so it always matches the data in use.
fare = model_fitted_age.fittedvalues
# Draw the predicted fares (not the observed ones) so the line is the
# regression line; a contrasting colour keeps it visible over the points.
sns.lineplot(x=titanicData['age'], y=fare, color="red")
plt.title("age vs fare", fontsize="xx-large")
plt.show()
print("Figure 11: scatterplot how the age affects the fare")
Figure 11: scatterplot how the age affects the fare
# Scatterplot of siblings/spouses aboard and fare with the fitted regression
# line on top.
plt.figure(figsize=(10, 8))
sns.scatterplot(x=titanicData['sibsp'], y=titanicData['fare'])
# Regression line taken from the fitted fare ~ sibsp model instead of
# hand-typed coefficients, so it always matches the data in use.
fare = model_fitted_sibsp.fittedvalues
# Draw the predicted fares (not the observed ones) so the line is the
# regression line; a contrasting colour keeps it visible over the points.
sns.lineplot(x=titanicData['sibsp'], y=fare, color="red")
plt.title("sibsp vs fare", fontsize="xx-large")
plt.show()
print("Figure 12: scatterplot how the amount of siblings/spouses aboard affects the fare")
Figure 12: scatterplot how the amount of siblings/spouses aboard affects the fare
# Scatterplot of parents/children aboard and fare with the fitted regression
# line on top.
plt.figure(figsize=(10, 8))
sns.scatterplot(x=titanicData['parch'], y=titanicData['fare'])
# Regression line taken from the fitted fare ~ parch model instead of
# hand-typed coefficients, so it always matches the data in use.
fare = model_fitted_parch.fittedvalues
# Draw the predicted fares (not the observed ones) so the line is the
# regression line; a contrasting colour keeps it visible over the points.
sns.lineplot(x=titanicData['parch'], y=fare, color="red")
plt.title("parch vs fare", fontsize="xx-large")
plt.show()
print("Figure 13: scatterplot how the amount of parents/children aboard affects the fare")
Figure 13: scatterplot how the amount of parents/children aboard affects the fare
# Multiple linear regression of fare on sex, age, siblings/spouses aboard and
# parents/children aboard, followed by the OLS summary table.
m_full = sm.formula.ols(
    'fare ~ sex_dummy + age + sibsp + parch',
    titanicData,
)
multi_reg = m_full.fit()
print(multi_reg.summary())
OLS Regression Results
==============================================================================
Dep. Variable: fare R-squared: 0.127
Model: OLS Adj. R-squared: 0.124
Method: Least Squares F-statistic: 37.89
Date: Sat, 05 Jun 2021 Prob (F-statistic): 1.28e-29
Time: 15:56:36 Log-Likelihood: -5612.7
No. Observations: 1045 AIC: 1.124e+04
Df Residuals: 1040 BIC: 1.126e+04
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -6.6397 4.346 -1.528 0.127 -15.168 1.889
sex_dummy 17.6567 3.426 5.153 0.000 10.933 24.380
age 0.9443 0.116 8.142 0.000 0.717 1.172
sibsp 7.3701 1.949 3.781 0.000 3.545 11.195
parch 11.5681 2.119 5.458 0.000 7.409 15.727
==============================================================================
Omnibus: 954.166 Durbin-Watson: 1.106
Prob(Omnibus): 0.000 Jarque-Bera (JB): 34628.428
Skew: 4.172 Prob(JB): 0.00
Kurtosis: 29.938 Cond. No. 94.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Fare predicted by the multiple regression for every passenger. Taken from
# the fitted model rather than from hand-copied, rounded coefficients, so the
# predictions stay in sync with the model whenever the data changes.
fare_guessed = multi_reg.fittedvalues
# Actual fare on the x-axis against predicted fare on the y-axis.
plt.scatter(titanicData["fare"], fare_guessed)
plt.xlabel("actual fare")
plt.ylabel("predicted fare")
plt.title("predicted vs actual fare", fontsize="xx-large")
plt.show()
print('Figure 14: the actual fare on the x-axis and the fare we predicted on the y-axis')
Figure 14: the actual fare on the x-axis and the fare we predicted on the y-axis