# import libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# read in the test and train csv files
test_df = pd.read_csv('test.csv')
train_df = pd.read_csv('train.csv')
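# a quick sanity check on the shapes (a sketch); the test set should have one
# fewer column, since it lacks the SalePrice label
train_df.shape, test_df.shape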
# notice that test_df does not have a SalePrice column
test_df.columns
# this is our training data set, which has the SalePrice column
# most of the time we will work only with train_df, since we do not want any data leakage
train_df.columns
# first, check the correlations and see which numerical columns have the greatest correlation with SalePrice
train_df.corr(numeric_only=True)['SalePrice'].sort_values()
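# optional sketch: visualize the ten most strongly correlated numerical features
# (top_corr is just a throwaway name for this illustration)
top_corr = train_df.corr(numeric_only=True)['SalePrice'].sort_values(ascending=False).head(10)
sns.barplot(x=top_corr.index, y=top_corr.values)
plt.xticks(rotation=90);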
sns.scatterplot(x='OverallQual',y='SalePrice',data=train_df)
# there are two outliers visible in the previous figure
train_df[(train_df['OverallQual']==10)&(train_df['SalePrice']<300000)]
# store the indices of these two houses
drop_index = train_df[(train_df['OverallQual']==10)&(train_df['SalePrice']<300000)].index
# drop those rows by index
train_df = train_df.drop(drop_index,axis=0)
# now we do not see outliers anymore
sns.scatterplot(x='OverallQual',y='SalePrice',data=train_df)
# let's do the same for the column with the second-highest correlation, GrLivArea (above-grade living area)
sns.scatterplot(x='GrLivArea',y='SalePrice',data=train_df)
# we will check for 3rd feature, GarageCars (size of garage in car capacity)
sns.scatterplot(x='GarageCars',y='SalePrice',data=train_df)
# there are 5 outliers in the figure above
# notice these houses all have sale prices below 300,000
train_df[(train_df['GarageCars']==4.0)]['SalePrice']
# we drop these outliers as before
drop_index_garage = train_df[(train_df['GarageCars']==4.0)].index
train_df = train_df.drop(drop_index_garage,axis=0)
sns.scatterplot(x='GarageCars',y='SalePrice',data=train_df)
# this shows the percentage of missing values (NaN) for each column
100*train_df.isnull().sum()/len(train_df)
# we define a function that returns the features with missing values and their percentages
def percent_missing(df):
    percent = 100*df.isnull().sum()/len(df)
    percent = percent[percent>0].sort_values(ascending=False)
    return percent
missing_percent = percent_missing(train_df)
missing_percent
# we can visualize with barplot
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# zoom in on the features with low missing percentages
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90)
plt.ylim(0,5);
# these three features are missing in only a handful of rows, so we simply drop those rows
train_df = train_df.dropna(axis=0,subset=['MasVnrArea','MasVnrType','Electrical'])
# now we take a look at other features with low missing percentage
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# inspect the rows where BsmtExposure is missing, alongside the other basement columns
train_df[train_df['BsmtExposure'].isnull()][['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1','BsmtFinSF1','BsmtFinSF2']]
train_df['BsmtExposure'].unique()
# Basement String Columns --> fillna 'NA'
bsmt_str = ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1']
train_df[bsmt_str] = train_df[bsmt_str].fillna('NA')
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# inspect the rows where GarageType is missing, alongside the other garage columns
train_df[train_df['GarageType'].isnull()][['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual','GarageCond','GarageArea','YearRemodAdd']]
# the rounded mean of GarageYrBlt, which we will use to fill its missing values
np.round(train_df['GarageYrBlt'].mean(),0)
# Garage String Columns --> fillna 'NA'
garage_str = ['GarageType', 'GarageFinish', 'GarageQual','GarageCond']
train_df[garage_str] = train_df[garage_str].fillna('NA')
# Garage Year Built --> fillna mean_year
mean_year = np.round(train_df['GarageYrBlt'].mean(),0)
train_df['GarageYrBlt'] = train_df['GarageYrBlt'].fillna(mean_year)
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# PoolArea is 0 wherever PoolQC is missing
train_df[train_df['PoolQC'].isnull()][['PoolQC','PoolArea']]
# MiscVal is 0 wherever MiscFeature is missing
train_df[train_df['MiscFeature'].isnull()][['MiscFeature','MiscVal']]
# Fireplaces is 0 wherever FireplaceQu is missing
train_df[train_df['FireplaceQu'].isnull()][['FireplaceQu','Fireplaces']]
# String Columns --> fillna 'NA'
missing_str = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu']
train_df[missing_str] = train_df[missing_str].fillna('NA')
missing_percent = percent_missing(train_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
train_df['LotFrontage'].describe()
# box plot of LotFrontage
# it is essentially a visualization of the .describe() output
sns.boxplot(x='LotFrontage',data=train_df)
# LotFrontage --> fillna mean_LotFrontage
mean_LotFrontage = np.round(train_df['LotFrontage'].mean(),0)
train_df['LotFrontage'] = train_df['LotFrontage'].fillna(mean_LotFrontage)
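# optional sketch: a common alternative is to impute LotFrontage from the median
# of each house's Neighborhood instead of a global mean; shown for reference
# only, since it is a no-op here (the NaNs were already filled above)
train_df['LotFrontage'] = train_df.groupby('Neighborhood')['LotFrontage'].transform(lambda s: s.fillna(s.median()))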
# now we do not have any more missing values
missing_percent = percent_missing(train_df)
missing_percent
train_df = pd.get_dummies(train_df,drop_first=False)
train_df.head()
# now we have 303 features instead of 79 features
train_df.shape
from sklearn.model_selection import train_test_split
X = train_df.drop('SalePrice',axis=1)
y = train_df['SalePrice']
# remember, we come back to test_df only after we finish choosing the model
# here X_test is the hold-out split of the training data, not test.csv
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# fit the scaler on the training split only, so no information leaks from the hold-out set
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)
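# optional sanity check (a sketch): the scaled training columns should have mean ~0
# and, apart from near-constant dummy columns, standard deviation ~1
scaled_X_train.mean(axis=0).round(2), scaled_X_train.std(axis=0).round(2)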
# first model: SGDRegressor with an elastic-net penalty
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_log_error,mean_squared_error
sgd = SGDRegressor(max_iter=10000,random_state=42,penalty='elasticnet')
sgd_param = {'alpha':[.01,0.1,1,5,10,100],'l1_ratio':[.1,.5,.9,.99,1]}
grid_sgd = GridSearchCV(estimator=sgd,param_grid=sgd_param,scoring='neg_mean_squared_error',cv=5)
grid_sgd.fit(scaled_X_train,y_train)
grid_sgd.best_params_
sgd_preds = grid_sgd.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,sgd_preds))
# root mean squared log error (RMSLE)
rmsle = {}
rmsle['SGD'] = np.sqrt(mean_squared_log_error(y_test,sgd_preds))
rmsle
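# the same evaluate-and-record pattern repeats for every model below; as a small
# sketch, a hypothetical helper like record_rmsle could keep it in one place
# (the cells below keep the explicit version)
def record_rmsle(name, grid, X, y):
    preds = grid.predict(X)
    rmsle[name] = np.sqrt(mean_squared_log_error(y, preds))
    return rmsle[name]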
# next model: ElasticNet regression
from sklearn.linear_model import ElasticNet
elastic = ElasticNet(max_iter=10000,random_state=42)
elastic_param = {'alpha':[.01,0.1,1,5,10,100],'l1_ratio':[.1,.5,.9,.99,1]}
grid_elastic = GridSearchCV(estimator=elastic,param_grid=elastic_param,scoring='neg_mean_squared_error',cv=5)
grid_elastic.fit(scaled_X_train,y_train)
elastic_preds = grid_elastic.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,elastic_preds))
rmsle['Elastic'] = np.sqrt(mean_squared_log_error(y_test,elastic_preds))
# next model: Ridge regression
from sklearn.linear_model import Ridge
ridge = Ridge(max_iter=10000,random_state=42)
ridge_param = {'alpha':[.01,0.1,1,5,10,100]}
grid_ridge = GridSearchCV(estimator=ridge,param_grid=ridge_param,scoring='neg_mean_squared_error',cv=5)
grid_ridge.fit(scaled_X_train,y_train)
ridge_preds = grid_ridge.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,ridge_preds))
rmsle['Ridge'] = np.sqrt(mean_squared_log_error(y_test,ridge_preds))
# next model: support vector regression
from sklearn.svm import SVR
svr = SVR(max_iter=10000)
svr_param = {'C':[0.001,0.01,0.1,0.5,1],
             'kernel':['linear','rbf'],
             'gamma':['scale','auto'],  # gamma only matters for the 'rbf' kernel
             'epsilon':[0,0.01,0.1,0.5,1,2]}
grid_svr = GridSearchCV(estimator=svr,param_grid=svr_param,scoring='neg_mean_squared_error',cv=5)
grid_svr.fit(scaled_X_train,y_train)
svr_preds = grid_svr.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,svr_preds))
rmsle['SVR'] = np.sqrt(mean_squared_log_error(y_test,svr_preds))
# next model: gradient boosting
from sklearn.ensemble import GradientBoostingRegressor
gb = GradientBoostingRegressor(random_state=42)
gb_param = {'n_estimators':[50,100],
            'learning_rate':[0.1,0.05,0.2],
            'max_depth':[3,4,5]}
grid_gb = GridSearchCV(estimator=gb,param_grid=gb_param,scoring='neg_mean_squared_error',cv=5)
grid_gb.fit(scaled_X_train,y_train)
gb_preds = grid_gb.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,gb_preds))
rmsle['GradientBoosting'] = np.sqrt(mean_squared_log_error(y_test,gb_preds))
# next model: random forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf_param = {'n_estimators':[50,100],
            'max_depth':[3,4,5]}
grid_rf = GridSearchCV(estimator=rf,param_grid=rf_param,scoring='neg_mean_squared_error',cv=5)
grid_rf.fit(scaled_X_train,y_train)
rf_preds = grid_rf.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,rf_preds))
rmsle['RandomForest'] = np.sqrt(mean_squared_log_error(y_test,rf_preds))
rmsle
RMSLE_df = pd.DataFrame.from_dict(data=rmsle,orient='index',columns=['RMSLE'])
RMSLE_df
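# a quick sketch: sorting makes the model ranking explicit
RMSLE_df.sort_values('RMSLE')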
plt.figure(figsize=(12,8))
sns.scatterplot(data=RMSLE_df);
# we refine the gradient boosting grid with a wider search around its best parameters
grid_gb.best_params_
gb = GradientBoostingRegressor(random_state=42)
gb_param = {'n_estimators':[50,100,150,1000],
            'learning_rate':[0.05,0.1,0.2],
            'max_depth':[3,4,5]}
new_grid_gb = GridSearchCV(estimator=gb,param_grid=gb_param,scoring='neg_mean_squared_error',cv=5)
new_grid_gb.fit(scaled_X_train,y_train)
new_gb_preds = new_grid_gb.predict(scaled_X_test)
np.sqrt(mean_squared_log_error(y_test,new_gb_preds))
# compare the best parameters of the refined grid with the original grid
new_grid_gb.best_params_
grid_gb.best_params_
# build the final model with the best parameters from the refined search
final_model = GradientBoostingRegressor(learning_rate=0.05,max_depth=3,n_estimators=1000,random_state=42)
# remember, train_df already has the dummy features,
# so we first separate the features from the label
# and then scale X before fitting the final model
X_df = train_df.drop('SalePrice',axis=1)
scaled_X_df = scaler.fit_transform(X_df)
y_df = train_df['SalePrice']
final_model.fit(scaled_X_df,y_df)
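# optional sketch: cross-validated RMSLE of the final configuration on the full
# training data (this assumes the predictions stay positive so the log is defined)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_model, scaled_X_df, y_df,
                         scoring='neg_mean_squared_log_error', cv=5)
np.sqrt(-scores).mean()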
# now apply the same missing-value handling to the test set,
# starting by dropping the rows missing these three features
test_df = test_df.dropna(axis=0,subset=['MasVnrArea','MasVnrType','Electrical'])
missing_percent = percent_missing(test_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# GarageCars and GarageArea --> fillna 0
test_df[['GarageCars', 'GarageArea']] = test_df[['GarageCars', 'GarageArea']].fillna(0)
# Garage Year Built --> fillna mean_year
mean_year = np.round(test_df['GarageYrBlt'].mean(),0)
test_df['GarageYrBlt'] = test_df['GarageYrBlt'].fillna(mean_year)
# Bsmt numerical columns --> 0
bsmt_num = ['BsmtFullBath','BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF']
test_df[bsmt_num] = test_df[bsmt_num].fillna(0)
# Lot Frontage --> fillna mean_LotFrontage
mean_LotFrontage = np.round(test_df['LotFrontage'].mean(),0)
test_df['LotFrontage'] = test_df['LotFrontage'].fillna(mean_LotFrontage)
missing_percent = percent_missing(test_df)
plt.figure(figsize=(12,8))
sns.barplot(x=missing_percent.index,y=missing_percent)
plt.xticks(rotation=90);
# the remaining features with missing values are all categorical
missing_percent.index
# fill the remaining missing categorical features with 'NA'
missing_features = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 'GarageCond',
                    'GarageQual', 'GarageFinish', 'GarageType', 'BsmtCond', 'BsmtQual',
                    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType',
                    'MSZoning', 'Functional', 'Utilities', 'KitchenQual', 'Exterior2nd',
                    'Exterior1st', 'SaleType']
test_df[missing_features] = test_df[missing_features].fillna('NA')
# one-hot encode the test set the same way as the training set
test_df = pd.get_dummies(test_df,drop_first=False)
# get the columns present in the dummied training set but missing from the test set
# (excluding the label, which the test set does not have)
missing_cols = set(train_df.columns) - set(test_df.columns) - {'SalePrice'}
# add each missing column to the test set with a default value of 0
for c in missing_cols:
    test_df[c] = 0
# ensure the test set has the same feature columns, in the same order, as the training set
test_df = test_df[X_df.columns]
# scale with the scaler already fitted on the training data; refitting on the
# test set would leak test statistics into the scaling
scaled_test_df = scaler.transform(test_df)
predictions = final_model.predict(scaled_test_df)
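# finally, a sketch of writing a Kaggle-style submission file; this assumes the
# competition format with an 'Id' column (note that any test rows dropped above
# will be absent from the submission)
submission = pd.DataFrame({'Id': test_df['Id'], 'SalePrice': predictions})
submission.to_csv('submission.csv', index=False)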