# Restyle the notebook with the jupyterthemes 'gruvboxd' theme.
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('gruvboxd')
#Importing useful libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from copy import deepcopy
from scipy.stats import shapiro
from statsmodels.graphics.gofplots import qqplot
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from warnings import filterwarnings
filterwarnings(action = 'ignore')
%matplotlib inline
# Load the survey data; `raw` is kept untouched while `df` is the working copy.
raw = pd.read_csv('surveyofbodyfat.csv')
df = raw.copy(deep=True)

# Correlation of every column with BodyFat, largest first, shown as one wide row.
# The first sorted row is skipped (presumably BodyFat's own correlation with itself).
# Features whose coefficient lies beyond +-0.5 are considered the most influential.
bodyfat_corr = df.corr()['BodyFat'].to_frame()
bodyfat_corr.sort_values(by='BodyFat', ascending=False).iloc[1:].T

# Histogram of the target variable.
sns.displot(df['BodyFat'], bins=20, color='grey')
plt.show()
def normality_visual(data):
    """Draw one Q-Q plot per column of *data* to eyeball normality.

    Every column is passed to statsmodels' ``qqplot`` with ``fit=True`` and a
    45-degree reference line; each plot is resized to 15x8 inches, labelled
    with the column name and shown immediately.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are plotted one after another.
    """
    for name in data.columns:
        figure = qqplot(data[name], line='45', fit=True)
        figure.set_size_inches(15, 8)
        # qqplot has just created the figure, so the current axes belong to it.
        axes = plt.gca()
        axes.set_xlabel('Theoretical Quantiles', fontsize=13)
        axes.set_ylabel(f'Sample Quantiles of the {name} column', fontsize=13)
        plt.show()
# Q-Q plot for every column of the working frame.
normality_visual(df)

# Correlation heatmap with the upper triangle (diagonal included) masked out,
# so every pair of columns is shown only once.
sns.set(rc={'figure.figsize': (10, 7)})
corr_matrix = df.corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True)
plt.show()

# Third row of the describe() summary (the standard deviations), as one wide row.
summary = df.describe()
summary.iloc[2].reset_index().T

# Number of fully duplicated rows.
df.duplicated().sum()
class handle_vif():
    """Compute variance inflation factors (VIF) and prune collinear features.

    The instance keeps a reference to ``data`` (no copy is made), so
    ``drop_high_vif`` removes columns from the caller's DataFrame in place.
    """

    def __init__(self, data):
        """Remember the DataFrame whose columns will be analysed."""
        self.data = data

    def compute_vif(self, considered_features: list):
        """Return a table with the VIF of each feature in *considered_features*.

        A constant ``intercept`` column is appended before calling statsmodels'
        ``variance_inflation_factor`` and is removed from the result again.

        Parameters
        ----------
        considered_features : list
            Column names of ``self.data`` to evaluate.

        Returns
        -------
        pandas.DataFrame
            Columns ``Variable`` and ``VIF``, one row per considered feature.
            The same table is stored on ``self.vif``.
        """
        self.considered_features = list(considered_features)
        if not self.considered_features:
            # Nothing to evaluate: hand back an empty table with the usual
            # columns so callers can still sort and filter it.
            self.vif = pd.DataFrame({"Variable": [], "VIF": []})
            return self.vif

        # assign() builds a new frame, so the constant column is never written
        # into a slice of self.data.
        X = self.data[self.considered_features].assign(intercept=1)
        values = X.values
        self.vif = pd.DataFrame({
            "Variable": X.columns,
            "VIF": [variance_inflation_factor(values, i)
                    for i in range(values.shape[1])],
        })
        self.vif = self.vif[self.vif["Variable"] != "intercept"]
        return self.vif

    @staticmethod
    def _ranked(table):
        """Return *table* sorted by descending VIF with a fresh 0..n-1 index."""
        return table.sort_values(by="VIF", ascending=False).reset_index(drop=True)

    def drop_high_vif(self, threshold=5):
        """Drop the highest-VIF feature repeatedly until none exceeds *threshold*.

        After each removal the VIFs of the remaining considered features are
        recomputed, because removing one column changes the VIF of the others.
        Columns are removed from ``self.data`` in place.

        Parameters
        ----------
        threshold : float, default 5
            Largest VIF that is still tolerated.

        Returns
        -------
        pandas.DataFrame
            ``self.data.head()`` after pruning.

        Raises
        ------
        AttributeError
            If ``compute_vif`` has not been called yet.
        """
        if not hasattr(self, "vif"):
            raise AttributeError("compute_vif() must be called before drop_high_vif()")

        self.vif_table = self._ranked(self.vif)
        # The emptiness check prevents an IndexError once every feature is gone.
        while not self.vif_table.empty and self.vif_table["VIF"].iloc[0] > threshold:
            worst = self.vif_table["Variable"].iloc[0]
            self.data.drop(worst, axis=1, inplace=True)
            # Recompute on the instance's own features rather than on a global
            # frame or a hard-coded target column name.
            remaining = [col for col in self.considered_features if col != worst]
            self.vif_table = self._ranked(self.compute_vif(remaining))
        return self.data.head()
# Compute the VIF of every column except the target, then let the helper drop
# the high-VIF columns. The helper holds a reference to df, so columns are
# removed from df itself.
vif = handle_vif(df)
predictors = [name for name in df.columns if name != 'BodyFat']
vif.compute_vif(predictors)
vif.drop_high_vif()

# Correlation of the remaining columns with BodyFat, largest first.
remaining_corr = df.corr()['BodyFat'].to_frame()
remaining_corr.sort_values(by='BodyFat', ascending=False).iloc[1:].T
# Keep only the features whose absolute correlation with BodyFat exceeds 0.5.
# BodyFat is excluded from the selection and appended explicitly as the last
# column, so the layout does not depend on where BodyFat sits in the frame.
selected = [f for f in df.columns
            if f != 'BodyFat' and abs(df[f].corr(df['BodyFat'])) > 0.5]
df = df[selected + ['BodyFat']]
X = df.iloc[:, :-1]
y = df['BodyFat']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fit the scaler on the training split only and reuse those statistics for the
# test split. Fitting a second scaler on the test data would scale it with its
# own statistics instead of the ones the model was trained on.
scaler = RobustScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))

# Ordinary least squares baseline.
lin_mod = LinearRegression()
lin_mod.fit(X_train, y_train)
preds = lin_mod.predict(X_test)

# Hold-out metrics: RMSE, R^2 and MAE.
rmse = np.sqrt(mean_squared_error(y_test, preds))
rmse
r2_score(y_test, preds)
mean_absolute_error(y_test, preds)

# RMSE divided by the observed range of the target, taken from the data
# instead of the previously hard-coded 47.5 - 0.
normalized_rmse = rmse / (y.max() - y.min())
normalized_rmse