!pip install setuptools==58
Collecting setuptools==58
Downloading setuptools-58.0.0-py3-none-any.whl (816 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 816.3/816.3 KB 43.1 MB/s eta 0:00:00
Installing collected packages: setuptools
Attempting uninstall: setuptools
Found existing installation: setuptools 58.1.0
Uninstalling setuptools-58.1.0:
Successfully uninstalled setuptools-58.1.0
Successfully installed setuptools-58.0.0
WARNING: You are using pip version 22.0.4; however, version 22.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
!pip install regressors
Collecting regressors
Downloading regressors-0.0.3.tar.gz (24 kB)
Preparing metadata (setup.py) ... done
Requirement already satisfied: numpy>=1.6.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from regressors) (1.23.4)
Requirement already satisfied: scipy>=0.9 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from regressors) (1.9.3)
Requirement already satisfied: matplotlib in /shared-libs/python3.9/py/lib/python3.9/site-packages (from regressors) (3.6.0)
Requirement already satisfied: scikit-learn>=0.17 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from regressors) (1.1.2)
Collecting statsmodels>=0.6.1
Downloading statsmodels-0.13.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.9 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 9.9/9.9 MB 58.7 MB/s eta 0:00:00
Requirement already satisfied: seaborn in /shared-libs/python3.9/py/lib/python3.9/site-packages (from regressors) (0.12.1)
Requirement already satisfied: pandas in /shared-libs/python3.9/py/lib/python3.9/site-packages (from regressors) (1.2.5)
Requirement already satisfied: joblib>=1.0.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from scikit-learn>=0.17->regressors) (1.2.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from scikit-learn>=0.17->regressors) (3.1.0)
Requirement already satisfied: packaging>=21.3 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from statsmodels>=0.6.1->regressors) (21.3)
Collecting patsy>=0.5.2
Downloading patsy-0.5.3-py2.py3-none-any.whl (233 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 233.8/233.8 KB 55.6 MB/s eta 0:00:00
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from pandas->regressors) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from pandas->regressors) (2022.5)
Requirement already satisfied: contourpy>=1.0.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->regressors) (1.0.5)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->regressors) (9.2.0)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->regressors) (0.11.0)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from matplotlib->regressors) (3.0.9)
Requirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->regressors) (4.37.4)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.9/py/lib/python3.9/site-packages (from matplotlib->regressors) (1.4.4)
Requirement already satisfied: six in /shared-libs/python3.9/py-core/lib/python3.9/site-packages (from patsy>=0.5.2->statsmodels>=0.6.1->regressors) (1.16.0)
Building wheels for collected packages: regressors
Building wheel for regressors (setup.py) ... done
Created wheel for regressors: filename=regressors-0.0.3-py2.py3-none-any.whl size=12590 sha256=dbf984287bdae83ddecb1f9194eb937fadc582dac48f798c320a4bad0443142a
Stored in directory: /root/.cache/pip/wheels/c9/5a/a9/c70338832ecebeabf81897762da766a8cb64ad31e4a5542725
Successfully built regressors
Installing collected packages: patsy, statsmodels, regressors
Successfully installed patsy-0.5.3 regressors-0.0.3 statsmodels-0.13.4
WARNING: You are using pip version 22.0.4; however, version 22.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import pandas as pd
import seaborn as sns
sns.set(style = 'whitegrid', context = 'notebook')
df = pd.read_csv('insurance.csv')
df.head()
   age     sex  ...
0   19  female  ...
1   18    male  ...
2   28    male  ...
3   33    male  ...
4   32    male  ...
print(df.shape)
df.charges.hist(bins = 40)
(1338, 7)
df[df.charges > 50000]
      age     sex  ...
34     28    male  ...
543    54  female  ...
577    31  female  ...
819    33  female  ...
1146   60    male  ...
1230   52    male  ...
1300   45    male  ...
df = df[df.charges < 50000]
df.charges.hist(bins = 40)
import matplotlib.pyplot as plt
sns.pairplot(df,height = 2.5)
plt.show()
import numpy as np
numeric_cols = ['age', 'bmi', 'children', 'charges']
cm = np.corrcoef(df[numeric_cols].values.T)
sns.set(font_scale = 1.5)
sns.heatmap(cm, annot = True, yticklabels = numeric_cols, xticklabels = numeric_cols)
df = pd.get_dummies(df, columns = ['sex','smoker','region'], drop_first = True)
df.head()
   age     bmi  ...
0   19  27.9    ...
1   18  33.77   ...
2   28  33.0    ...
3   33  22.705  ...
4   32  28.88   ...
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
X_cols = list(set(df.columns)-set(['charges']))
y_col = ['charges']
X = df[X_cols].values
y = df[y_col].values
X_train, X_test, y_train, y_test = train_test_split(X,y)
sc_x = StandardScaler().fit(X)  # note: the scalers are fit on the full X and y, not only on the training split
sc_y = StandardScaler().fit(y)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
y_pred.shape
import sklearn.metrics as metrics
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print("r2", r2.round(4))
print("mse: ", mse.round(4))
r2 0.7677
mse: 0.2233
from regressors import stats
model.intercept_ = model.intercept_[0]
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print("==========Summary==========")
stats.summary(model, X_test, y_test, X_cols)
==========Summary==========
Residuals:
Min 1Q Median 3Q Max
-2.1051 -0.1005 0.0769 0.2077 0.86
Coefficients:
Estimate Std. Error t value p value
_intercept -0.002255 0.025985 -0.0868 0.930913
children 0.059319 0.025023 2.3706 0.018329
sex_male 0.006099 0.026039 0.2342 0.814943
bmi 0.165003 0.028317 5.8270 0.000000
region_northwest -0.003162 0.030406 -0.1040 0.917235
smoker_yes 0.788375 0.026610 29.6265 0.000000
region_southwest -0.036120 0.030033 -1.2027 0.229966
region_southeast -0.024368 0.031686 -0.7691 0.442398
age 0.307269 0.025535 12.0334 0.000000
---
R-squared: 0.76771, Adjusted R-squared: 0.76197
F-statistic: 133.85 on 8 features
residuals = np.subtract(y_test, y_pred.reshape(-1))
plt.scatter(y_pred, residuals)
plt.show()
df_second = df.copy()
df_second['age2'] = df_second.age**2
df_second['sobrepeso'] = (df_second.bmi >= 30).astype(int)
df_second['sobrepeso*fumador'] = df_second.sobrepeso * df_second.smoker_yes
X_cols = list(set(df_second.columns)-set(['charges']))
y_col = ['charges']
X = df_second[X_cols].values
y = df_second[y_col].values
X_train, X_test, y_train, y_test = train_test_split(X,y)
sc_x = StandardScaler().fit(X)
sc_y = StandardScaler().fit(y)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
model = LinearRegression()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print("r2", r2.round(4))
print("mse: ", mse.round(4))
r2 0.8319
mse: 0.1575
model.intercept_ = model.intercept_[0]
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print("==========Summary==========")
stats.summary(model, X_test, y_test, X_cols)
==========Summary==========
Residuals:
Min 1Q Median 3Q Max
-2.101 0.0464 0.0937 0.131 0.3343
Coefficients:
Estimate Std. Error t value p value
_intercept -0.007079 0.021932 -0.3228 0.747063
children 0.064253 0.022377 2.8713 0.004350
sex_male -0.024803 0.022082 -1.1232 0.262164
bmi 0.066148 0.035615 1.8573 0.064148
region_northwest -0.012973 0.025450 -0.5097 0.610573
smoker_yes 0.464264 0.026525 17.5031 0.000000
region_southwest -0.030845 0.026121 -1.1808 0.238515
region_southeast -0.027773 0.025927 -1.0712 0.284852
age2 0.338483 0.110364 3.0670 0.002340
sobrepeso -0.060433 0.034971 -1.7281 0.084901
age -0.016315 0.112384 -0.1452 0.884663
sobrepeso*fumador 0.499941 0.030581 16.3483 0.000000
---
R-squared: 0.83193, Adjusted R-squared: 0.82617
F-statistic: 144.44 on 11 features
residuals = np.subtract(y_test, y_pred.reshape(-1))
plt.scatter(y_pred, residuals)
plt.show()
Analyzing the p-values in the summary allows me to remove the non-significant variables.
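The same screening can be done programmatically. Below is a minimal sketch, assuming stats.coef_pval(model, X, y) returns the p-values shown by stats.summary, with the intercept's p-value in the first position; the 0.05 threshold is my choice.
# Programmatic version of the p-value screening done by eye above.
# Assumption: stats.coef_pval returns the intercept's p-value first, followed by
# one p-value per column of X in the same order as X_cols.
p_values = stats.coef_pval(model, X_test, y_test)[1:]
significant_cols = [col for col, p in zip(X_cols, p_values) if p < 0.05]
print(significant_cols)
# Given the summary above, this should pick out children, smoker_yes, age2 and
# sobrepeso*fumador (the order depends on X_cols, which was built from a set).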
X_cols = ['children','sobrepeso*fumador','smoker_yes','age2']
y_col = ['charges']
X = df_second[X_cols].values
y = df_second[y_col].values
X_train, X_test, y_train, y_test = train_test_split(X,y)
sc_x = StandardScaler().fit(X)
sc_y = StandardScaler().fit(y)
X_train = sc_x.transform(X_train)
X_test = sc_x.transform(X_test)
y_train = sc_y.transform(y_train)
y_test = sc_y.transform(y_test)
model = LinearRegression(fit_intercept=False)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)
mse = metrics.mean_squared_error(y_test, y_pred)
r2 = metrics.r2_score(y_test, y_pred)
print("r2", r2.round(4))
print("mse: ", mse.round(4))
r2 0.8473
mse: 0.1528
model.coef_ = model.coef_.reshape(-1)
y_test = y_test.reshape(-1)
print("==========Summary==========")
stats.summary(model, X_test, y_test, X_cols)
==========Summary==========
Residuals:
Min 1Q Median 3Q Max
-1.9342 0.0656 0.1106 0.1293 0.3868
Coefficients:
Estimate Std. Error t value p value
_intercept 0.000000 0.021484 0.0000 1.000000
children 0.060726 0.020977 2.8949 0.004044
sobrepeso*fumador 0.491888 0.026040 18.8896 0.000000
smoker_yes 0.465124 0.024984 18.6169 0.000000
age2 0.315998 0.020478 15.4313 0.000000
---
R-squared: 0.84734, Adjusted R-squared: 0.84548
F-statistic: 455.14 on 4 features
residuals = np.subtract(y_test, y_pred.reshape(-1))
plt.scatter(y_pred, residuals)
plt.show()
We can see that by removing the non-significant variables we improve the model's predictive power.
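Since this comparison rests on a single random train/test split, a quick cross-validation of the reduced model is a reasonable extra check. A minimal sketch, keeping the scaling inside a pipeline so it is refit on each training fold (the target is left unscaled here, since R² is scale-invariant):
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated R^2 for the reduced model.
X_cv = df_second[['children', 'sobrepeso*fumador', 'smoker_yes', 'age2']].values
y_cv = df_second['charges'].values
cv_pipeline = make_pipeline(StandardScaler(), LinearRegression())
cv_scores = cross_val_score(cv_pipeline, X_cv, y_cv, cv=5, scoring='r2')
print(cv_scores.round(4), cv_scores.mean().round(4))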