# Regression Model

import matplotlib.pyplot as plt  # for visualization
import numpy as np  # for mathematical operations
import pandas as pd  # for reading and manipulation of information
import seaborn as sns  # for visualization

# Read the insurance dataset into a DataFrame and preview its first two rows.
# (The bare `head` expression only renders in an interactive/notebook session.)
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df.head(n=2)

# Histogram of the charges over the whole dataset, using 40 bins.
all_charges = df['charges']
sns.histplot(all_charges, bins=40)
plt.xlabel('Charges')
plt.show()

# Zoom in on the upper tail: histogram of the charges strictly above 30000.
high_charges = df.loc[df['charges'] > 30000, 'charges']
sns.histplot(high_charges, bins=40)
plt.xlabel('Charges')
plt.show()

# Pairwise scatter plots / distributions of the numeric columns.
sns.set(style='whitegrid')

# Numeric columns of interest; reused by the correlation heatmap further down.
cols = ['age', 'bmi', 'children', 'charges']

sns.pairplot(data=df[cols], height=2)
plt.show()

# Correlation heatmap of the numeric columns listed in `cols`.
sns.set(font_scale=1.1)  # make the font a little larger

# The correlation matrix is computed with pandas and then drawn annotated.
corr_matrix = df[cols].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cbar=True,
    xticklabels=cols,
    yticklabels=cols,
)

# There are other ways to compute the matrix, for example through numpy:
#     cm = np.corrcoef(df[cols].values.T)
#     sns.heatmap(cm, annot=True, yticklabels=cols, xticklabels=cols)
plt.show()

# One-hot encode the categorical columns. drop_first=True drops one level per
# variable so that, e.g., `sex` does not produce two redundant columns (one
# for male and another for female).
categorical_cols = ['sex', 'smoker', 'region']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head(2)

from sklearn.linear_model import LinearRegression  # for the linear regression model
from sklearn.model_selection import train_test_split  # to divide the train and test dataset
from sklearn.preprocessing import StandardScaler  # for scaling data

# Define the columns of interest: `charges` is the dependent variable and
# every other column is a predictor. A list comprehension (rather than a set
# difference) keeps the DataFrame's column order, so the feature order -- and
# therefore the coefficient order in later summaries -- is the same on every run.
x_cols = [col for col in df.columns if col != 'charges']
y_cols = ['charges']

X = df[x_cols].values
y = df[y_cols].values

# train_test_split is used with its default parameters, so the train set is
# 75% of the data and the test set is the remaining 25%.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# StandardScaler computes the mean and standard deviation used to centre and
# scale the data. It is fitted on the TRAINING split only: fitting it on the
# full dataset would leak test-set statistics into the preprocessing and make
# the evaluation optimistic.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)

X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)

# Finally, the linear regression model is fitted on the train data and
# evaluated on the test data.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

import sklearn.metrics as metrics

# Evaluate the first model on the (standardized) test split.
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

# Use the built-in round(): depending on the scikit-learn version the metrics
# come back as numpy scalars or as plain Python floats, and the latter have no
# `.round` method.
print("r2", round(r2, 4))
print("mse:", round(mse, 4))

from regressors import stats

# The model was fitted against a 2-D target, so its intercept and coefficients
# carry an extra axis. Flatten everything before building the summary table
# (presumably stats.summary expects 1-D arrays and a scalar intercept).
y_test = y_test.ravel()
y_pred = y_pred.ravel()
model.coef_ = model.coef_.ravel()
model.intercept_ = model.intercept_[0]

print("--------------Summary---------------------")
stats.summary(model, X_test, y_test, x_cols)

# Residual plot for the first model: prediction error against the prediction.
residuals = y_test - y_pred

plt.scatter(y_pred, residuals)
plt.ylabel('Residuals')
plt.xlabel('Predictions')
plt.show()

# Second model: work on a copy of the encoded DataFrame and add three
# engineered columns.
df_second = df.copy()

# Squared age.
df_second['age2'] = df_second['age'].pow(2)

# 1 when bmi is at least 30, otherwise 0.
df_second['overweight'] = df_second['bmi'].ge(30).astype(int)

# Interaction between the overweight flag and the smoker indicator.
df_second['overweight*smoker'] = df_second['overweight'].mul(df_second['smoker_yes'])

df_second.head(2)

# Second model: a hand-picked subset of predictors instead of every column.
X_cols = ['overweight*smoker', 'smoker_yes', 'age2', 'children']
y_col = ['charges']

X = df_second[X_cols].values
y = df_second[y_col].values

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the scalers on the TRAINING split only, so that no test-set statistics
# leak into the preprocessing.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)

X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)

# No intercept is estimated: the scalers above centre both the training
# features and the training target, so the fitted line passes through the
# origin of the standardized space.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)

# Flatten the (n, 1) predictions to 1-D for the metrics and plots that follow.
y_pred = model.predict(X_test).reshape(-1)

# Evaluate the second model on the (standardized) test split.
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)

# Use the built-in round(): depending on the scikit-learn version the metrics
# come back as numpy scalars or as plain Python floats, and the latter have no
# `.round` method.
print("r2: ", round(r2, 4))
print("mse: ", round(mse, 4))

# Flatten the coefficients and the test target before building the summary.
# Unlike the first model, the intercept is left untouched -- presumably
# because this model was created with fit_intercept=False, so there is no
# per-target intercept array to unwrap.
model.coef_ = model.coef_.ravel()
y_test = y_test.ravel()

print(X_test.shape, y_test.shape)
print("-----------summary-----------")
stats.summary(model, X_test, y_test, X_cols)

# Residual plot for the second model: prediction error against the prediction.
residuals = np.subtract(y_test, y_pred)

# x and y must be passed by keyword: recent seaborn releases accept only
# `data` positionally and raise TypeError for positional x/y vectors.
sns.scatterplot(x=y_pred, y=residuals)
plt.ylabel('Residuals')
plt.xlabel('Predictions')
plt.show()