# import libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# read in the test and train csv files
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
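# a quick sanity check on the shapes (a sketch); the test set should have one
# fewer column, since it lacks the SalePrice label
train_df.shape, test_df.shape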
# notice that test_df does not have a SalePrice column
test_df.columns
# this is our training data set, which has the SalePrice column
# most of the time we will work only with train_df, since we do not want any data leakage
train_df.columns
# first, check the correlations and see which numerical columns have the greatest correlation with SalePrice
train_df.corr(numeric_only=True)['SalePrice'].sort_values()
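# optional sketch: visualize the ten most strongly correlated numerical features
# (top_corr is just a throwaway name for this illustration)
top_corr = train_df.corr(numeric_only=True)['SalePrice'].sort_values(ascending=False).head(10)
sns.barplot(x=top_corr.index, y=top_corr.values)
plt.xticks(rotation=90);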
sns.scatterplot(x='OverallQual',y='SalePrice',data=train_df)
# there are two outliers visible in the previous figure
train_df[(train_df['OverallQual']==10)&(train_df['SalePrice']<300000)]
# store the indices of these two houses
drop_index = train_df[(train_df['OverallQual']==10)&(train_df['SalePrice']<300000)].index
# drop those rows by index
train_df = train_df.drop(drop_index,axis=0)
# now we do not see outliers anymore
sns.scatterplot(x='OverallQual',y='SalePrice',data=train_df)
# let's do the same for the column with the second-highest correlation, GrLivArea (above-grade living area)
sns.scatterplot(x='GrLivArea',y='SalePrice',data=train_df)
# we will check for 3rd feature, GarageCars (size of garage in car capacity)
sns.scatterplot(x='GarageCars',y='SalePrice',data=train_df)
# there are 5 outliers in the figure above
# notice these houses all have sale prices below 300,000
train_df[(train_df['GarageCars']==4.0)]['SalePrice']
# we drop these outliers as before
drop_index_garage = train_df[(train_df['GarageCars']==4.0)].index
train_df = train_df.drop(drop_index_garage,axis=0)
sns.scatterplot(x='GarageCars',y='SalePrice',data=train_df)
# this shows the percentage of missing values (NaN) for each column
100*train_df.isnull().sum()/len(train_df)
# we define a function that returns the features with missing values and their percentages
def percent_missing(df):
    percent = 100*df.isnull().sum()/len(df)
    percent = percent[percent>0].sort_values(ascending=False)
    return percent
missing_percent = percent_missing(train_df)
missing_percent
# we can visualize with barplot
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# zoom in on the features with low missing percentages
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90)
plt.ylim(0,5);
# these three features are missing in only a handful of rows, so we simply drop those rows
train_df = train_df.dropna(axis=0,subset=['MasVnrArea','MasVnrType','Electrical'])
# now we take a look at other features with low missing percentage
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# inspect the rows where BsmtExposure is missing, alongside the other basement columns
train_df[train_df['BsmtExposure'].isnull()][['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1','BsmtFinSF1','BsmtFinSF2']]
train_df['BsmtExposure'].unique()
# Basement String Columns --> fillna 'NA'
bsmt_str = ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']
train_df[bsmt_str] = train_df[bsmt_str].fillna('NA')
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# inspect the rows where GarageType is missing, alongside the other garage columns
train_df[train_df['GarageType'].isnull()][['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual','GarageCond','GarageArea','YearRemodAdd']]
# the rounded mean of GarageYrBlt, which we will use to fill its missing values
np.round(train_df['GarageYrBlt'].mean(),0)
# Garage String Columns --> fillna 'NA'
garage_str = ['GarageType', 'GarageFinish', 'GarageQual','GarageCond']
train_df[garage_str] = train_df[garage_str].fillna('NA')
# Garage Year Built --> fillna mean_year
mean_year = np.round(train_df['GarageYrBlt'].mean(),0)
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(mean_year)
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# PoolArea is 0 wherever PoolQC is missing
train_df[train_df['PoolQC'].isnull()][['PoolQC','PoolArea']]
# MiscVal is 0 wherever MiscFeature is missing
train_df[train_df['MiscFeature'].isnull()][['MiscFeature','MiscVal']]
# Fireplaces is 0 wherever FireplaceQu is missing
train_df[train_df['FireplaceQu'].isnull()][['FireplaceQu','Fireplaces']]
# String Columns --> fillna 'NA'
missing_str = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train_df[missing_str] = train_df[missing_str].fillna('NA')
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
train_df['LotFrontage'].describe()
# box plot of LotFrontage
# it is essentially a visualization of the .describe() output
sns.boxplot(x='LotFrontage',data=train_df)
# LotFrontage --> fillna mean_LotFrontage
mean_LotFrontage = np.round(train_df['LotFrontage'].mean(),0)
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(mean_LotFrontage)
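# optional sketch: a common alternative is to impute LotFrontage from the median
# of each house's Neighborhood instead of a global mean; shown for reference
# only, since it is a no-op here (the NaNs were already filled above)
train_df['LotFrontage'] = train_df.groupby('Neighborhood')['LotFrontage'].transform(lambda s: s.fillna(s.median()))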
# now we do not have any more missing values
missing_percent = percent_missing(train_df)
missing_percent
train_df = pd.get_dummies(train_df,drop_first=False)
train_df.head()
# now we have 303 features instead of 79 features
train_df.shape
from sklearn.model_selection import train_test_split
X = train_df.drop('SalePrice',axis=1)
y = train_df['SalePrice']
# remember, we come back to test_df only after we finish choosing the model
# here X_test is the hold-out split of the training data, not test.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit the scaler on the training split only, so no information leaks from the hold-out set
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)
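# optional sanity check (a sketch): the scaled training columns should have mean ~0
# and, apart from near-constant dummy columns, standard deviation ~1
scaled_X_train.mean(axis=0).round(2), scaled_X_train.std(axis=0).round(2)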
# first model: SGDRegressor with an elastic-net penalty
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error,mean_squared_error
sgd = SGDRegressor(max_iter=10000,random_state=42,penalty='elasticnet')
sgd_param = {'alpha':[.01,0.1,1,5,10,100],'l1_ratio':[.1,.5,.9,.99,1]}
grid_sgd = GridSearchCV(estimator=sgd,param_grid=sgd_param,scoring='neg_mean_squared_error',cv=5)
grid_sgd.fit(scaled_X_train,y_train)
grid_sgd.best_params_
sgd_preds = grid_sgd.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,sgd_preds))
# root mean squared log error (RMSLE)
rmsle = {}
rmsle['SGD'] = np.sqrt(mean_squared_log_error(y_test,sgd_preds))
rmsle
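# the same evaluate-and-record pattern repeats for every model below; as a small
# sketch, a hypothetical helper like record_rmsle could keep it in one place
# (the cells below keep the explicit version)
def record_rmsle(name, grid, X, y):
    preds = grid.predict(X)
    rmsle[name] = np.sqrt(mean_squared_log_error(y, preds))
    return rmsle[name]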
# next model: ElasticNet regression
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(max_iter=10000,random_state=42)
elastic_param = {'alpha':[.01,0.1,1,5,10,100],'l1_ratio':[.1,.5,.9,.99,1]}
grid_elastic = GridSearchCV(estimator=elastic,param_grid=elastic_param,scoring='neg_mean_squared_error',cv=5)
grid_elastic.fit(scaled_X_train,y_train)
elastic_preds = grid_elastic.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,elastic_preds))
rmsle['Elastic'] = np.sqrt(mean_squared_log_error(y_test,elastic_preds))
# next model: Ridge regression
from sklearn.linear_model import Ridge
ridge = Ridge(max_iter=10000,random_state=42)
ridge_param = {'alpha':[.01,0.1,1,5,10,100]}
grid_ridge = GridSearchCV(estimator=ridge,param_grid=ridge_param,scoring='neg_mean_squared_error',cv=5)
grid_ridge.fit(scaled_X_train,y_train)
ridge_preds = grid_ridge.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,ridge_preds))
rmsle['Ridge'] = np.sqrt(mean_squared_log_error(y_test,ridge_preds))
# next model: support vector regression
from sklearn.svm import SVR
svr = SVR(max_iter=10000)
svr_param = {'C':[0.001,0.01,0.1,0.5,1],
             'kernel':['linear','rbf'],
             'gamma':['scale','auto'],  # gamma only matters for the 'rbf' kernel
             'epsilon':[0,0.01,0.1,0.5,1,2]}
grid_svr = GridSearchCV(estimator=svr,param_grid=svr_param,scoring='neg_mean_squared_error',cv=5)
grid_svr.fit(scaled_X_train,y_train)
svr_preds = grid_svr.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,svr_preds))
rmsle['SVR'] = np.sqrt(mean_squared_log_error(y_test,svr_preds))
# next model: gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(random_state=42)
gb_param = {'n_estimators':[50,100],
            'learning_rate':[0.1,0.05,0.2],
            'max_depth':[3,4,5]}
grid_gb = GridSearchCV(estimator=gb,param_grid=gb_param,scoring='neg_mean_squared_error',cv=5)
grid_gb.fit(scaled_X_train,y_train)
gb_preds = grid_gb.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,gb_preds))
rmsle['GradientBoosting'] = np.sqrt(mean_squared_log_error(y_test,gb_preds))
# next model: random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf_param = {'n_estimators':[50,100],
            'max_depth':[3,4,5]}
grid_rf = GridSearchCV(estimator=rf,param_grid=rf_param,scoring='neg_mean_squared_error',cv=5)
grid_rf.fit(scaled_X_train,y_train)
rf_preds = grid_rf.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,rf_preds))
rmsle['RandomForest'] = np.sqrt(mean_squared_log_error(y_test,rf_preds))
rmsle
RMSLE_df = pd.DataFrame.from_dict(data=rmsle,orient='index',columns=['RMSLE'])
RMSLE_df
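# a quick sketch: sorting makes the model ranking explicit
RMSLE_df.sort_values('RMSLE')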
plt.figure(figsize=(12,8))
sns.scatterplot(data=RMSLE_df);
# we refine the gradient boosting grid with a wider search around its best parameters
grid_gb.best_params_
gb = GradientBoostingRegressor(random_state=42)
gb_param = {'n_estimators':[50,100,150,1000],
            'learning_rate':[0.05,0.1,0.2],
            'max_depth':[3,4,5]}
new_grid_gb = GridSearchCV(estimator=gb,param_grid=gb_param,scoring='neg_mean_squared_error',cv=5)
new_grid_gb.fit(scaled_X_train,y_train)
new_gb_preds = new_grid_gb.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,new_gb_preds))
# compare the best parameters of the refined grid with the original grid
new_grid_gb.best_params_
grid_gb.best_params_
# build the final model with the best parameters from the refined search
final_model = GradientBoostingRegressor(learning_rate=0.05,max_depth=3,n_estimators=1000,random_state=42)
# remember, train_df already has the dummy features,
# so we first separate the features from the label
# and then scale X before fitting the final model
X_df = train_df.drop('SalePrice',axis=1)
scaled_X_df = scaler.fit_transform(X_df)
y_df = train_df['SalePrice']
final_model.fit(scaled_X_df,y_df)
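# optional sketch: cross-validated RMSLE of the final configuration on the full
# training data (this assumes the predictions stay positive so the log is defined)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_model, scaled_X_df, y_df,
                         scoring='neg_mean_squared_log_error', cv=5)
np.sqrt(-scores).mean()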
# now apply the same missing-value handling to the test set,
# starting by dropping the rows missing these three features
test_df = test_df.dropna(axis=0,subset=['MasVnrArea','MasVnrType','Electrical'])
missing_percent = percent_missing(test_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# GarageCars and GarageArea --> fillna 0
test_df[['GarageCars', 'GarageArea']] = test_df[['GarageCars', 'GarageArea']].fillna(0)
# Garage Year Built --> fillna mean_year
mean_year = np.round(test_df['GarageYrBlt'].mean(),0)
test_df['GarageYrBlt'] = test_df['GarageYrBlt'].fillna(mean_year)
# Bsmt numerical columns --> 0
bsmt_num = ['BsmtFullBath','BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
test_df[bsmt_num] = test_df[bsmt_num].fillna(0)
# Lot Frontage --> fillna mean_LotFrontage
mean_LotFrontage = np.round(test_df['LotFrontage'].mean(),0)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(mean_LotFrontage)
missing_percent = percent_missing(test_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# the remaining features with missing values are all categorical
missing_percent.index
# fill the remaining missing categorical features with 'NA'
missing_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageCond',
                    'GarageQual', 'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
                    'MSZoning', 'Functional', 'Utilities', 'KitchenQual', 'Exterior2nd',
                    'Exterior1st', 'SaleType']
test_df[missing_features] = test_df[missing_features].fillna('NA')
# one-hot encode the test set the same way as the training set
test_df = pd.get_dummies(test_df,drop_first=False)
# get the columns present in the dummied training set but missing from the test set
# (excluding the label, which the test set does not have)
missing_cols = set(train_df.columns) - set(test_df.columns) - {'SalePrice'}
# add each missing column to the test set with a default value of 0
for c in missing_cols:
    test_df[c] = 0
# ensure the test set has the same feature columns, in the same order, as the training set
test_df = test_df[X_df.columns]
# scale with the scaler already fitted on the training data; refitting on the
# test set would leak test statistics into the scaling
scaled_test_df = scaler.transform(test_df)
predictions = final_model.predict(scaled_test_df)
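# finally, a sketch of writing a Kaggle-style submission file; this assumes the
# competition format with an 'Id' column (note that any test rows dropped above
# will be absent from the submission)
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
submission.to_csv('submission.csv', index=False)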