import seaborn as sns # for visualization
import numpy as np # for mathematical operations
import matplotlib.pyplot as plt # for visualization
import pandas as pd # for reading and manipulation of information
# Read the insurance data from the working directory and preview its first two rows.
df = pd.read_csv('insurance.csv')
df.head(n=2)
ageint64
sexobject
0
19
female
1
18
male
# Distribution of the charges over the whole dataset, using 40 bins.
sns.histplot(df['charges'], bins=40)
plt.xlabel('Charges')
plt.show()

# Same histogram, restricted to the rows whose charges exceed 30000.
high_charges = df.loc[df['charges'] > 30000, 'charges']
sns.histplot(high_charges, bins=40)
plt.xlabel('Charges')
plt.show()
# Switch to the white-grid theme and draw pairwise plots of the numeric columns.
sns.set(style='whitegrid')
cols = ['age', 'bmi', 'children', 'charges']
numeric_view = df[cols]
sns.pairplot(numeric_view, height=2)
plt.show()
# Slightly larger fonts so the annotated heatmap cells stay readable.
sns.set(font_scale=1.1)

# Correlation matrix of the selected columns, computed by pandas and drawn
# as an annotated heatmap with the column names on both axes.
corr_matrix = df[cols].corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cbar=True,
    xticklabels=cols,
    yticklabels=cols,
)
# The correlations can also be computed in other ways, for example with numpy:
#     cm = np.corrcoef(df[cols].values.T)
#     sns.heatmap(cm, annot=True, yticklabels=cols, xticklabels=cols)
plt.show()
# One-hot encode the categorical columns. drop_first=True drops one level per
# column, so a two-level column such as sex yields a single indicator column
# instead of one column for male and another for female.
categorical_cols = ['sex', 'smoker', 'region']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head(n=2)
ageint64
bmifloat64
0
19
27.9
1
18
33.77
from sklearn.preprocessing import StandardScaler # for escaling data
from sklearn.linear_model import LinearRegression # for the linear regression model
from sklearn.model_selection import train_test_split # to divide the train and test dataset
"""
Define the columns of interest: 'charges' is the dependent variable, and all the
remaining columns are used as predictors.
"""
# Target column, and every other column as a predictor.
# The predictor list is built by filtering df.columns instead of taking a set
# difference: iterating a set of strings yields a different order on each
# interpreter run, which reshuffled the columns of X (and the rows of the
# coefficient table printed later) from one execution to the next.
y_cols = ['charges']
x_cols = [col for col in df.columns if col not in y_cols]
X = df[x_cols].values
y = df[y_cols].values
"""
train_test_split:
Splits all the data in X (and y) into a training set and a test set.
The split uses scikit-learn's default proportions: the training set holds 75% of the
data and the test set the remaining 25%.
"""
# test_size=0.25 spells out the 75% / 25% split; random_state seeds the shuffle
# so the same rows land in each set (and the metrics repeat) on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)
"""
StandardScaler computes the mean and the standard deviation of the data.
These statistics are then used to transform the data: the transformation consists of
centering and scaling it.
"""
# Standardize features and target ("sc" stands for standard scaler).
# The scalers are fitted on the training split only: fitting them on the full
# X / y would let test-set statistics leak into preprocessing and make the
# test metrics optimistic. The test split is transformed with the statistics
# learned from the training split.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
"""
Finally, the linear regression model is fitted on the training data and evaluated on the
test data.
"""
# Fit the linear regression on the training split, then predict the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
import sklearn.metrics as metrics
# Score the predictions against the (standardized) test targets.
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
# Built-in round() works whether the metric comes back as a numpy scalar or a
# plain Python float; calling .round() on a Python float raises AttributeError.
print("r2", round(r2, 4))
print("mse:", round(mse, 4))
r2 0.7595
mse: 0.2296
from regressors import stats
# Flatten the fitted parameters and the target/prediction arrays before the
# report: the intercept becomes a scalar and the other arrays become 1-D.
# NOTE(review): presumably this is the layout regressors.stats expects — confirm.
model.coef_ = model.coef_.ravel()
model.intercept_ = model.intercept_[0]
y_pred = y_pred.ravel()
y_test = y_test.ravel()

print("--------------Summary---------------------")
stats.summary(model, X_test, y_test, x_cols)
--------------Summary---------------------
Residuals:
Min 1Q Median 3Q Max
-2.0791 -0.1168 0.0841 0.2477 0.8794
Coefficients:
Estimate Std. Error t value p value
_intercept 0.003648 0.026237 0.1390 0.889506
age 0.293727 0.027421 10.7119 0.000000
smoker_yes 0.793940 0.026256 30.2384 0.000000
bmi 0.180364 0.027316 6.6029 0.000000
region_northwest -0.005209 0.030057 -0.1733 0.862514
region_southwest -0.036205 0.030853 -1.1735 0.241437
sex_male -0.007929 0.026288 -0.3016 0.763133
region_southeast -0.030317 0.032061 -0.9456 0.345043
children 0.042177 0.026573 1.5872 0.113411
---
R-squared: 0.75945, Adjusted R-squared: 0.75355
F-statistic: 128.66 on 8 features
# Residuals (observed minus predicted) plotted against the predictions.
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.ylabel('Residuals')
plt.xlabel('Predictions')
plt.show()
# Second model: work on a copy of the encoded dataframe and add three
# engineered columns.
df_second = df.copy()
# Squared age.
df_second['age2'] = df_second['age'] ** 2
# 1 when bmi is at least 30, otherwise 0.
df_second['overweight'] = df_second['bmi'].ge(30).astype(int)
# Product of the overweight flag and the smoker indicator.
df_second['overweight*smoker'] = df_second['overweight'] * df_second['smoker_yes']
df_second.head(n=2)
ageint64
bmifloat64
0
19
27.9
1
18
33.77
# Hand-picked predictors for the second model (the alternative would be to use
# every column of df_second except 'charges').
y_col = ['charges']
X_cols = ['overweight*smoker', 'smoker_yes', 'age2', 'children']
X = df_second.loc[:, X_cols].values
y = df_second.loc[:, y_col].values
# 75% / 25% train/test split; random_state seeds the shuffle so the split
# (and therefore the reported metrics) is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the scalers on the training split only, so that no test-set statistics
# leak into preprocessing, then apply the same transformation to both splits.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
# Linear regression without an intercept term.
# NOTE(review): presumably the intercept is dropped because features and target
# were standardized beforehand — confirm this is intended.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)
# Flatten the predictions to a 1-D array.
y_pred = model.predict(X_test).reshape(-1)

mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
# Built-in round() works whether the metric comes back as a numpy scalar or a
# plain Python float; calling .round() on a Python float raises AttributeError.
print("r2: ", round(r2, 4))
print("mse: ", round(mse, 4))
r2: 0.8184
mse: 0.1748
# Flatten the coefficients and the test targets to 1-D before the report.
# The intercept is left untouched here (this model was built with
# fit_intercept=False).
y_test = y_test.ravel()
model.coef_ = model.coef_.ravel()

print(X_test.shape, y_test.shape)
print("-----------summary-----------")
stats.summary(model, X_test, y_test, X_cols)
(335, 4) (335,)
-----------summary-----------
Residuals:
Min 1Q Median 3Q Max
-1.9705 0.0638 0.1051 0.1374 0.3214
Coefficients:
Estimate Std. Error t value p value
_intercept 0.000000 0.022897 0.0000 1.000000
overweight*smoker 0.516240 0.029788 17.3303 0.000000
smoker_yes 0.440324 0.027493 16.0158 0.000000
age2 0.309880 0.022848 13.5628 0.000000
children 0.071842 0.023090 3.1114 0.002023
---
R-squared: 0.81836, Adjusted R-squared: 0.81616
F-statistic: 371.69 on 4 features
# Residuals (observed minus predicted) plotted against the predictions.
residuals = np.subtract(y_test, y_pred)
# x and y are passed by keyword: seaborn deprecated positional x/y (see the
# FutureWarning this cell used to emit) and from version 0.12 the only valid
# positional argument is `data`.
sns.scatterplot(x=y_pred, y=residuals)
plt.ylabel('Residuals')
plt.xlabel('Predictions')
plt.show()
/shared-libs/python3.9/py/lib/python3.9/site-packages/seaborn/_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
warnings.warn(