# import libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# read in the test and train csv files
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
# notice that test_df does not have a SalePrice column
test_df.columns
# this is our training data set with SalePrice column
# most of the time, we will only work with train_df since we do not want any data leakage
train_df.columns
# first, check the correlations to see which numerical columns correlate most strongly with sale price
train_df.corr()['SalePrice'].sort_values()
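# for readability, the same correlations restricted to the strongest ones
# (a convenience sketch added here, not part of the original flow):
train_df.corr()['SalePrice'].sort_values(ascending=False).head(10)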
sns.scatterplot(x='OverallQual',y='SalePrice',data=train_df)
# there are two outliers visible in the previous figure
train_df[(train_df['OverallQual']==10)&(train_df['SalePrice']<300000)]
# output: two matching rows -- index 523 (Id 524) and index 1298 (Id 1299), both MSSubClass 60
# create a variable to store indices of two houses
drop_index = train_df[(train_df['OverallQual']==10)&(train_df['SalePrice']<300000)].index
# drop those rows by index
train_df = train_df.drop(drop_index,axis=0)
# now we do not see outliers anymore
sns.scatterplot(x='OverallQual',y='SalePrice',data=train_df)
# let's do the same for the column with the second-highest correlation, GrLivArea (above-grade living area)
sns.scatterplot(x='GrLivArea',y='SalePrice',data=train_df)
# next, we check the 3rd feature, GarageCars (size of garage in car capacity)
sns.scatterplot(x='GarageCars',y='SalePrice',data=train_df)
# there are 5 outliers in the figure above
# notice these houses all have a sale price below 300,000
train_df[(train_df['GarageCars']==4.0)]['SalePrice']
# we drop these outliers
drop_index_garage = train_df[(train_df['GarageCars']==4.0)].index
train_df = train_df.drop(drop_index_garage,axis=0)
sns.scatterplot(x='GarageCars',y='SalePrice',data=train_df)
# this shows the percentage of missing values (NaN) for each column
100*train_df.isnull().sum()/len(train_df)
# we define a function to get features with missing values and their percentage
def percent_missing(df):
    percent = 100*df.isnull().sum()/len(df)
    percent = percent[percent>0].sort_values(ascending=False)
    return percent
missing_percent = percent_missing(train_df)
missing_percent
# we can visualize with barplot
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90)
plt.ylim(0,5);
# MasVnrArea, MasVnrType, and Electrical are missing in very few rows, so we simply drop those rows
train_df = train_df.dropna(axis=0,subset=['MasVnrArea','MasVnrType','Electrical'])
# now we take another look at the remaining features with missing values
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
test_df[test_df['BsmtExposure'].isnull()][['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1','BsmtFinSF1','BsmtFinSF2']]
# output: ten rows (indices 27, 125, 133, ...) where the basement columns are all NaN,
# except index 27, whose BsmtFinType2 is 'Unf'
train_df['BsmtExposure'].unique()
# Basement string columns: NaN here means no basement --> fillna 'NA'
bsmt_str = ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']
train_df[bsmt_str] = train_df[bsmt_str].fillna('NA')
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
test_df[test_df['GarageType'].isnull()][[ 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual','GarageCond','GarageArea','YearRemodAdd']]
# output: rows (indices 53, 71, 79, ...) where every garage column is NaN, i.e. no garage
np.round(train_df['GarageYrBlt'].mean(),0)
# Garage String Columns --> fillna 'NA'
garage_str = ['GarageType', 'GarageFinish', 'GarageQual','GarageCond']
train_df[garage_str] = train_df[garage_str].fillna('NA')
# Garage Year Built --> fillna mean_year
mean_year = np.round(train_df['GarageYrBlt'].mean(),0)
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(mean_year)
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# PoolArea is 0 whenever PoolQC is missing
train_df[train_df['PoolQC'].isnull()][['PoolQC','PoolArea']]
# output: PoolArea is 0 for every row where PoolQC is NaN (indices 190-200 shown)
# MiscVal is 0 whenever MiscFeature is missing
train_df[train_df['MiscFeature'].isnull()][['MiscFeature','MiscVal']]
# output: MiscVal is 0 for every row where MiscFeature is NaN (indices 44-54 shown)
# similarly, Fireplaces is 0 whenever FireplaceQu is missing
train_df[train_df['FireplaceQu'].isnull()][['FireplaceQu','Fireplaces']]
# output: Fireplaces is 0 for every row where FireplaceQu is NaN (indices 0, 5, 10, ... shown)
# String Columns --> fillna 'NA'
missing_str = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train_df[missing_str] = train_df[missing_str].fillna('NA')
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
train_df['LotFrontage'].describe()
# box plot of LotFrontage
# essentially a visualization of the .describe() output above
sns.boxplot(x='LotFrontage',data=train_df)
# LotFrontage --> fillna mean_LotFrontage
mean_LotFrontage = np.round(train_df['LotFrontage'].mean(),0)
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(mean_LotFrontage)
# now we do not have any more missing values
missing_percent = percent_missing(train_df)
missing_percent
train_df = pd.get_dummies(train_df,drop_first=False)
train_df.head()
# output: first five rows of the encoded training data (Id 1-5, MSSubClass 60/20/60/70/60, plus dummy columns)
# now we have 303 features instead of 79 features
train_df.shape
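# a minimal illustration (added sketch, not part of the original notebook) of what
# drop_first changes: it drops one dummy level per category, removing the perfect
# collinearity among a category's dummy columns
demo = pd.DataFrame({'color':['red','blue','green']})
print(pd.get_dummies(demo).columns.tolist())                   # ['color_blue', 'color_green', 'color_red']
print(pd.get_dummies(demo,drop_first=True).columns.tolist())   # ['color_green', 'color_red']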
missing_percent.index
from sklearn.model_selection import train_test_split
X = train_df.drop('SalePrice',axis=1)
y = train_df['SalePrice']
# remember, we will come back to test_df only once we have finished choosing the model
# X_test here means the hold-out split of the training data, not the Kaggle test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)
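# sanity-check sketch (added here): the scaler's statistics come from X_train only,
# so the hold-out split contributes nothing to them -- this is the "no data leakage"
# point made above
print(scaler.mean_.shape)      # one mean per feature, all learned from X_train
print(scaled_X_train.mean())   # approximately 0 by construction
print(scaled_X_test.mean())    # close to, but not exactly, 0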
X_test
# output: the shuffled 30% hold-out rows (indices 414, 317, 1043, ...), Id range 11-1460, MSSubClass 20-190
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error,mean_squared_error
sgd = SGDRegressor(max_iter=10000,random_state=42,penalty='elasticnet')
sgd_param = {'alpha':[.01,0.1,1,5,10,100],'l1_ratio':[.1,.5,.9,.99,1]}
grid_sgd = GridSearchCV(estimator=sgd,param_grid=sgd_param,scoring='neg_mean_squared_error',cv=5)
grid_sgd.fit(scaled_X_train,y_train)
grid_sgd.best_params_
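# side note (added sketch): GridSearchCV maximizes its scoring function, which is
# why sklearn exposes the *negative* MSE; negate before the square root to recover
# the best model's cross-validated RMSE
np.sqrt(-grid_sgd.best_score_)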
sgd_preds = grid_sgd.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(sgd_preds,y_test))
# root mean squared log error
rmsle = {}
rmsle['SGD']=np.sqrt(mean_squared_log_error(sgd_preds,y_test))
rmsle
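# for reference, a manual sketch (added here) of what the metric computes, matching
# sklearn's definition of mean_squared_log_error
def rmsle_manual(y_true, y_pred):
    # RMSLE = sqrt(mean((log(1 + y_true) - log(1 + y_pred))**2))
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))
rmsle_manual(y_test, sgd_preds)  # should match the sklearn value above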
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(max_iter=10000,random_state=42)
elastic_param = {'alpha':[.01,0.1,1,5,10,100],'l1_ratio':[.1,.5,.9,.99,1]}
grid_elastic = GridSearchCV(estimator=elastic,param_grid=elastic_param,scoring='neg_mean_squared_error',cv=5)
grid_elastic.fit(scaled_X_train,y_train)
# (output: many repeated sklearn ConvergenceWarnings during the grid search --
# "Objective did not converge. You might want to increase the number of iterations,
# check the scale of the features or consider increasing regularisation.")
elastic_preds = grid_elastic.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(elastic_preds,y_test))
rmsle['Elastic']=np.sqrt(mean_squared_log_error(elastic_preds,y_test))
from sklearn.linear_model import Ridge
ridge = Ridge(max_iter=10000,random_state=42)
ridge_param = {'alpha':[.01,0.1,1,5,10,100]}
grid_ridge = GridSearchCV(estimator=ridge,param_grid=ridge_param,scoring='neg_mean_squared_error',cv=5)
grid_ridge.fit(scaled_X_train,y_train)
ridge_preds = grid_ridge.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(ridge_preds,y_test))
rmsle['Ridge']=np.sqrt(mean_squared_log_error(ridge_preds,y_test))
from sklearn.svm import SVR
svr = SVR(max_iter=10000)
svr_param = {'C':[0.001,0.01,0.1,0.5,1],
             'kernel':['linear','rbf'],
             'gamma':['scale','auto'], # for rbf, check both gamma settings
             'epsilon':[0,0.01,0.1,0.5,1,2]}
grid_svr = GridSearchCV(estimator=svr,param_grid=svr_param,scoring='neg_mean_squared_error',cv=5)
grid_svr.fit(scaled_X_train,y_train)
svr_preds = grid_svr.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(svr_preds,y_test))
rmsle['SVR']=np.sqrt(mean_squared_log_error(svr_preds,y_test))
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(random_state=42)
gb_param = {'n_estimators':[50,100],
            'learning_rate':[0.1,0.05,0.2],
            'max_depth':[3,4,5]}
grid_gb = GridSearchCV(estimator=gb,param_grid=gb_param,scoring='neg_mean_squared_error',cv=5)
grid_gb.fit(scaled_X_train,y_train)
gb_preds = grid_gb.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(gb_preds,y_test))
rmsle['GradientBoosting']=np.sqrt(mean_squared_log_error(gb_preds,y_test))
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf_param = {'n_estimators':[50,100],
            'max_depth':[3,4,5]}
grid_rf = GridSearchCV(estimator=rf,param_grid=rf_param,scoring='neg_mean_squared_error',cv=5)
grid_rf.fit(scaled_X_train,y_train)
rf_preds = grid_rf.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(rf_preds,y_test))
rmsle['RandomForest']=np.sqrt(mean_squared_log_error(rf_preds,y_test))
rmsle
RMSLE_df = pd.DataFrame.from_dict(data=rmsle,orient='index',columns=['RMSLE'])
RMSLE_df
# output:
#                          RMSLE
# SGD               0.1334498016
# Elastic           0.1415488752
# Ridge             0.1420691044
# SVR               0.3298503335
# GradientBoosting  0.1303618012
# RandomForest      0.1705726585
plt.figure(figsize=(12,8))
sns.scatterplot(data=RMSLE_df);
grid_gb.best_params_
gb = GradientBoostingRegressor(random_state=42)
gb_param = {'n_estimators':[50,100,150,1000],
            'learning_rate':[0.05,0.1,0.2],
            'max_depth':[3,4,5]}
new_grid_gb = GridSearchCV(estimator=gb,param_grid=gb_param,scoring='neg_mean_squared_error',cv=5)
new_grid_gb.fit(scaled_X_train,y_train)
new_gb_preds = new_grid_gb.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(new_gb_preds,y_test))
new_grid_gb.best_params_
grid_gb.best_params_
final_model = GradientBoostingRegressor(learning_rate=0.05,max_depth=3,n_estimators=1000,random_state=42)
# remember, train_df now contains the dummy features
# so we first separate the features and the label,
# then scale X before fitting the final model
X_df = train_df.drop('SalePrice',axis=1)
scaled_X_df = scaler.fit_transform(X_df)
y_df = train_df['SalePrice']
final_model.fit(scaled_X_df,y_df)
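# an alternative sketch (added here, not the notebook's approach): a sklearn Pipeline
# bundles the scaler and the model, so the scaler is refit only on whatever data
# .fit() receives, which avoids scaling mistakes later on
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('gb', GradientBoostingRegressor(learning_rate=0.05, max_depth=3,
                                     n_estimators=1000, random_state=42)),
])
pipe.fit(X_df, y_df)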
# note: dropping rows from the Kaggle test set means those Ids will get no prediction;
# filling these columns instead would keep the submission complete
test_df = test_df.dropna(axis=0,subset=['MasVnrArea','MasVnrType','Electrical'])
missing_percent = percent_missing(test_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# GarageCars and GarageArea --> 0
test_df[['GarageCars', 'GarageArea']] = test_df[['GarageCars', 'GarageArea']].fillna(0)
# GarageYrBlt --> mean year
mean_year = np.round(test_df['GarageYrBlt'].mean(),0)
test_df['GarageYrBlt'] = test_df['GarageYrBlt'].fillna(mean_year)
# Bsmt numerical columns --> 0
bsmt_num = ['BsmtFullBath','BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
test_df[bsmt_num] = test_df[bsmt_num].fillna(0)
# Lot Frontage --> mean_LotFrontage
mean_LotFrontage = np.round(test_df['LotFrontage'].mean(),0)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(mean_LotFrontage)
missing_percent = percent_missing(test_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
missing_percent.index
# fill the remaining missing categorical values with 'NA'
missing_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageCond',
                    'GarageQual', 'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
                    'MSZoning', 'Functional', 'Utilities', 'KitchenQual', 'Exterior2nd',
                    'Exterior1st', 'SaleType']
test_df[missing_features] = test_df[missing_features].fillna('NA')
test_df.shape
# one-hot encode the test set first, then align its columns with the training set
test_df = pd.get_dummies(test_df,drop_first=False)
# get the dummy columns that appear in the training set but not in the test set
missing_cols = set(train_df.columns) - set(test_df.columns)
# add each missing column to the test set with a default value of 0
for c in missing_cols:
    test_df[c] = 0
# put the test-set columns in the same order as the training set, dropping any
# test-only dummy columns, then remove the SalePrice placeholder added above
test_df = test_df[train_df.columns].drop('SalePrice',axis=1)
# use the scaler already fitted on the training features; do not refit on the test set
scaled_test_df = scaler.transform(test_df)
predictions = final_model.predict(scaled_test_df)
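# finally, a sketch (added here) of writing a Kaggle-style submission, assuming the
# expected 'Id' and 'SalePrice' columns; note that the rows dropped from test_df
# earlier will be missing from this file
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
submission.to_csv('submission.csv', index=False)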