# Pin setuptools so the (older) regressors package installs cleanly
!pip install setuptools==58
!pip install regressors
import pandas as pd
import seaborn as sns
sns.set(style='whitegrid', context='notebook')
# Load the medical insurance dataset and inspect the first rows
df = pd.read_csv('insurance.csv')
df.head()
print(df.shape)
# The distribution of charges has a long right tail; the handful of
# rows above 50,000 look like outliers, so they are dropped
df.charges.hist(bins=40)
df[df.charges > 50000]
df = df[df.charges < 50000]
df.charges.hist(bins=40)
import matplotlib.pyplot as plt
# Pairwise scatter plots of all numeric columns
sns.pairplot(df, height=2.5)
plt.show()
import numpy as np
# Correlation matrix of the numeric columns
numeric_cols = ['age', 'bmi', 'children', 'charges']
cm = np.corrcoef(df[numeric_cols].values.T)
sns.set(font_scale=1.5)
sns.heatmap(cm, annot=True, yticklabels=numeric_cols, xticklabels=numeric_cols)
# One-hot encode the categorical columns, dropping the first level
# of each to avoid perfect collinearity
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region'], drop_first=True)
df.head()
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Keep column order deterministic (a set difference reorders arbitrarily)
X_cols = [c for c in df.columns if c != 'charges']
y_col = ['charges']
X = df[X_cols].values
y = df[y_col].values
# Hold out a test set; the random_state is fixed so later models are
# compared on the same split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Standardize features and target, fitting the scalers on the training
# split only so no test-set statistics leak into the model
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
# Ordinary least squares on the standardized data
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred.shape
import sklearn.metrics as metrics
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print("r2", r2.round(4))
print("mse: ", mse.round(4))
from regressors import stats
# stats.summary expects a scalar intercept and 1-D coefficient and
# target arrays, so flatten what scikit-learn returns for a 2-D y
model.intercept_ = model.intercept_[0]
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print("==========Summary==========")
stats.summary(model, X_test, y_test, X_cols)
# Residuals vs. predictions; a visible pattern here would suggest the
# model is missing structure in the data
residuals = np.subtract(y_test, y_pred.reshape(-1))
plt.scatter(y_pred, residuals)
plt.show()
# Second model: add a quadratic age term, an obesity flag (BMI >= 30),
# and its interaction with smoking
df_second = df.copy()
df_second['age2'] = df_second.age**2
df_second['sobrepeso'] = (df_second.bmi >= 30).astype(int)
df_second['sobrepeso*fumador'] = df_second.sobrepeso * df_second.smoker_yes
X_cols = [c for c in df_second.columns if c != 'charges']
y_col = ['charges']
X = df_second[X_cols].values
y = df_second[y_col].values
# Same procedure as before: split, then fit scalers on the training data only
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print("r2", r2.round(4))
print("mse: ", mse.round(4))
# Flatten again for stats.summary
model.intercept_ = model.intercept_[0]
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print("==========Summary==========")
stats.summary(model, X_test, y_test, X_cols)
residuals = np.subtract(y_test, y_pred.reshape(-1))
plt.scatter(y_pred, residuals)
plt.show()
Looking at the p-values in the summary lets us drop the variables that are not statistically significant.
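As a sketch of how that step could be automated: the regressors package also exposes the p-values directly through stats.coef_pval, whose first entry is the intercept's p-value, followed by one per column of X_cols. The 0.05 cutoff is the usual convention, an assumption rather than something the summary dictates.
# Sketch: recover the significant predictors programmatically instead of
# reading them off the printed summary (the 0.05 threshold is an assumption)
pvals = stats.coef_pval(model, X_test, y_test)
significant = [col for col, p in zip(X_cols, pvals[1:]) if p < 0.05]
print(significant)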
# Keep only the significant predictors
X_cols = ['children', 'sobrepeso*fumador', 'smoker_yes', 'age2']
y_col = ['charges']
X = df_second[X_cols].values
y = df_second[y_col].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
sc_x = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
# With standardized X and y the intercept is zero by construction,
# so it can be dropped
model = LinearRegression(fit_intercept=False)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print("r2", r2.round(4))
print("mse: ", mse.round(4))
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print("==========Summary==========")
stats.summary(model, X_test, y_test, X_cols)
residuals = np.subtract(y_test, y_pred.reshape(-1))
plt.scatter(y_pred, residuals)
plt.show()
We can see that removing the non-significant variables improves the model's predictive power.
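A single train/test split is noisy, so as a quick sanity check of that claim (a sketch, not part of the original analysis) both feature sets can be scored on the same cross-validation folds; wrapping the scaler in a Pipeline refits it inside each fold, avoiding leakage.
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# Compare the full and reduced feature sets on identical 5-fold splits
full_cols = [c for c in df_second.columns if c != 'charges']
reduced_cols = ['children', 'sobrepeso*fumador', 'smoker_yes', 'age2']
pipe = make_pipeline(StandardScaler(), LinearRegression())
for cols in (full_cols, reduced_cols):
    scores = cross_val_score(pipe, df_second[cols].values,
                             df_second.charges.values, cv=5, scoring='r2')
    print(len(cols), "features, mean r2:", scores.mean().round(4))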