import seaborn as sns # for visualization
import numpy as np # for mathematical operations
import matplotlib.pyplot as plt # for visualization
import pandas as pd # for reading and manipulation of information
# Load the insurance dataset. The bare head() call only displays something in
# a notebook/REPL session; when run as a script its result is discarded.
df = pd.read_csv('insurance.csv')
df.head(2)

# Distribution of the target variable over every row.
charges = df['charges']
sns.histplot(charges, bins=40)
plt.xlabel('Charges')
plt.show()

# Same histogram restricted to the upper tail (charges above 30000).
high_charges = charges[charges > 30000]
sns.histplot(high_charges, bins=40)
plt.xlabel('Charges')
plt.show()
# Pairwise scatter plots / distributions for the numeric columns.
sns.set(style='whitegrid')
cols = ['age', 'bmi', 'children', 'charges']
numeric_df = df[cols]
sns.pairplot(numeric_df, height=2)
plt.show()

# Correlation heatmap. The correlation matrix is computed with pandas; the
# same matrix could be obtained through NumPy instead, for example:
#     cm = np.corrcoef(df[cols].values.T)
#     sns.heatmap(cm, annot=True, yticklabels=cols, xticklabels=cols)
sns.set(font_scale=1.1)  # make the font slightly larger
corr_matrix = numeric_df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,
    cbar=True,
    yticklabels=cols,
    xticklabels=cols,
)
plt.show()
# One-hot encode the categorical columns. drop_first=True drops one level per
# category so that redundant columns are avoided: 'sex', for instance, ends up
# as a single indicator column rather than one for male and one for female.
categorical_cols = ['sex', 'smoker', 'region']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head(2)
from sklearn.preprocessing import StandardScaler # for escaling data
from sklearn.linear_model import LinearRegression # for the linear regression model
from sklearn.model_selection import train_test_split # to divide the train and test dataset
"""
Let us define the columns that are of interest. In this case, the charges will be dependent
of the rest of variables.
"""
# Every column except 'charges' is a predictor. The comprehension keeps the
# DataFrame's own column order; a set difference would yield an arbitrary
# order that can differ from run to run (string hashing is randomized),
# which makes the printed coefficient tables hard to compare between runs.
x_cols = [col for col in df.columns if col != 'charges']
y_cols = ['charges']
X = df[x_cols].values
y = df[y_cols].values  # selected with a list, so y stays 2-D for the scaler

# train_test_split:
# Build a train set and a test set from the data. The default parameters are
# used, so the train set holds 75% of the rows and the test set the
# remaining 25%.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# StandardScaler computes the mean and standard deviation of the data it is
# fitted on and uses them to centre and scale whatever it transforms.
# The scalers are fitted on the TRAINING split only: fitting them on the full
# dataset would leak test-set statistics into the model and make the test
# metrics optimistic.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)

# Finally, fit the linear regression model on the train data and predict on
# the held-out test data.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
import sklearn.metrics as metrics
# Evaluate the predictions on the test set. The target was standardized
# before fitting, so the MSE is expressed in standardized units.
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
# Use the built-in round(): it works for NumPy scalars and for plain Python
# floats alike, whereas the .round() method exists only on NumPy scalars and
# raises AttributeError when the metric comes back as a plain float.
print("r2", round(r2, 4))
print("mse:", round(mse, 4))
from regressors import stats
# The model was fitted on a 2-D target, so its parameters carry an extra
# dimension. Flatten them (and the targets/predictions) before handing them to
# stats.summary -- presumably the scalar / 1-D form the regressors package
# expects; confirm against its documentation.
model.coef_ = model.coef_.ravel()
model.intercept_ = model.intercept_[0]
y_pred = y_pred.ravel()
y_test = y_test.ravel()
print("--------------Summary---------------------")
stats.summary(model, X_test, y_test, x_cols)

# Residuals against predictions.
residuals = y_test - y_pred
plt.scatter(y_pred, residuals)
plt.ylabel('Residuals')
plt.xlabel('Predictions')
plt.show()
# Second model: work on a copy of the encoded frame and add three engineered
# columns to it.
df_second = df.copy()

# Squared age.
df_second['age2'] = df_second['age'] ** 2

# 1 when bmi is 30 or more, 0 otherwise.
bmi_flag = df_second['bmi'] >= 30
df_second['overweight'] = bmi_flag.astype(int)

# Interaction between the bmi flag and the smoker indicator.
df_second['overweight*smoker'] = df_second['overweight'] * df_second['smoker_yes']
df_second.head(2)
# Only a hand-picked subset of the columns is used as predictors for the
# second model (not every column other than 'charges').
X_cols = ['overweight*smoker', 'smoker_yes', 'age2', 'children']
y_col = ['charges']
X = df_second[X_cols].values
y = df_second[y_col].values  # selected with a list, so y stays 2-D

# Default split: 75% train / 25% test.
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Fit the scalers on the TRAINING split only; fitting them on the full dataset
# would leak test-set statistics into the model and bias the test metrics.
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)

# No intercept is fitted: the training features and target are both centred
# by scalers fitted on them, so the least-squares intercept would be zero.
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)
y_pred = model.predict(X_test).reshape(-1)
# Evaluate the second model on the test set (standardized target units).
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
# Built-in round() handles both NumPy scalars and plain Python floats; the
# .round() method is NumPy-only and fails when a plain float is returned.
print("r2: ", round(r2, 4))
print("mse: ", round(mse, 4))
# The model was created with fit_intercept=False, in which case scikit-learn
# sets intercept_ to a scalar, so there is nothing to unwrap here; only the
# coefficients and the targets are flattened for stats.summary.
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print(X_test.shape,y_test.shape)
print("-----------summary-----------")
stats.summary(model, X_test, y_test, X_cols)

# Residuals against predictions. x and y are passed by keyword: recent seaborn
# releases accept only `data` positionally and raise TypeError otherwise.
residuals = np.subtract(y_test, y_pred)
sns.scatterplot(x=y_pred, y=residuals)
plt.ylabel('Residuals')
plt.xlabel('Predictions')
plt.show()