House sales price - Reduction and prediction
Notebook under renovation! Sorry for the inconvenience. (All the code works, but I'm still working on the storytelling.)
Libraries
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, RobustScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from scipy.stats import norm, skew, kurtosis
from scipy import stats
Data
# Load the training data, using the 'Id' column as the row index.
data = pd.read_csv('train.csv', index_col='Id')
# Work on a copy so `data` keeps the values exactly as loaded.
df_train = data.copy()
# Quick look: dimensions, per-column info, and summary statistics.
df_train.shape
df_train.info()
df_train.describe()
SalePrice
# Summary statistics of the raw target.
df_train['SalePrice'].describe()
# Box plot and distribution plot of the raw target.
# NOTE(review): sns.distplot is reported as deprecated in recent seaborn
# releases -- confirm against the installed version.
sns.boxplot(df_train['SalePrice'])
sns.distplot(df_train['SalePrice'])
# Skewness and kurtosis of the raw target.
df_train['SalePrice'].skew()
df_train['SalePrice'].kurt()
# Replace the target with log(1 + SalePrice); every later use of
# df_train['SalePrice'] is therefore on the log scale.
df_train['SalePrice'] = np.log1p(df_train['SalePrice'])
# Re-plot the transformed target, passing scipy's `norm` as the `fit` distribution.
sns.distplot(df_train['SalePrice'],fit=norm)
Numeric
# Keep only the numeric columns of the training frame.
# select_dtypes is the public pandas API for this; the previous call used the
# private DataFrame._get_numeric_data(), which may change without notice.
# 'bool' is listed as well so that boolean columns, if any, are still treated
# as numeric (NOTE(review): believed to match the private helper -- confirm).
df_num = df_train.select_dtypes(include=['number', 'bool'])
df_num
Dropping uncorrelated features
# Pairwise correlation matrix of the numeric columns, shown as a heatmap.
corrmat = df_num.corr()
f, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corrmat, vmax=1, square=True);
corrmat.columns
corrmat.shape
# Correlation of each column with SalePrice, largest first (up to 37 rows).
corrmat.nlargest(37, 'SalePrice')['SalePrice']
# Keep the 20 columns with the largest (signed) correlation to SalePrice.
# SalePrice itself is among them, and strongly negative correlations are
# not selected by this rule.
cols = corrmat.nlargest(20, 'SalePrice')['SalePrice'].index
# Alternative selection rule, left disabled: absolute correlation above 0.3.
# cols = corrmat.index[abs(corrmat['SalePrice']) > 0.3]
cols
dropping redundant
# Restrict the numeric frame to the selected columns and redraw the
# correlation heatmap, this time annotated with the coefficient values.
df_num = df_num[cols]
corrmat = df_num.corr()
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corrmat, annot=True, ax=ax)

# For every selected column, print its three largest correlations
# (the column itself plus its two closest companions).
for column in cols:
    top_three = corrmat.nlargest(3, column)[column]
    print(top_three)
    print()

# Columns judged redundant with another retained column.
redundant_cols = [
    'TotRmsAbvGrd',
    'GarageCars',
    '1stFlrSF',
    'GarageYrBlt',
    'YearRemodAdd',
    '2ndFlrSF',
]
df_num = df_num.drop(columns=redundant_cols)
df_num.columns

# Correlation of each remaining column (all but the first) with the target.
sale_price = df_num['SalePrice']
for column in df_num.columns[1:]:
    print(column, sale_price.corr(df_num[column]))
other skewed features
numeric_vars = df_num.columns


def diagnostic(df, var):
    """Show a histogram and a normal probability plot of ``df[var]``.

    Args:
        df: Frame holding the column to inspect.
        var: Label of the column to plot.

    Opens a new 17x6 figure with two side-by-side panels and displays it.
    """
    series = df[var]
    plt.figure(figsize=(17, 6))

    # Left panel: histogram with 40 bins.
    plt.subplot(1, 2, 1)
    series.hist(bins=40)
    plt.title("Distribution of {}".format(var))

    # Right panel: probability plot against the normal distribution.
    plt.subplot(1, 2, 2)
    stats.probplot(series, dist="norm", plot=plt)
    plt.ylabel("Quantiles")

    plt.show()


# Plot every numeric column except the first one.
for var in numeric_vars[1:]:
    diagnostic(df_num, var)
# Skewness and kurtosis of each retained numeric column, computed on the
# values held in df_train and sorted ascending.
vars_skewed = df_train[df_num.columns].apply(skew).sort_values()
vars_skewed
vars_kurtosis = df_train[df_num.columns].apply(kurtosis).sort_values()
vars_kurtosis

# log1p-transform the features in df_num.
# SalePrice is skipped by name: it was already log1p-transformed when the
# target was prepared. The previous `index[1:]` slice skipped whichever column
# happened to sort first by skewness, so the target could be transformed twice.
for var in vars_skewed.index.drop('SalePrice'):
    df_num[var] = np.log1p(df_num[var])
# Re-plot the diagnostics now that the features have been log-transformed.
# `diagnostic` is already defined earlier in this notebook with an identical
# body, so it is reused here instead of being redefined.
numeric_vars = df_num.columns
for var in numeric_vars[1:]:
    diagnostic(df_num, var)
Numeric data is ready...
# Display the prepared numeric frame.
df_num
Categorical
# Columns of df_train that are not numeric are treated as categorical.
# The numeric column set is computed once, through the public select_dtypes
# API; the earlier version called the private _get_numeric_data() again for
# every column inside the comprehension.
# NOTE(review): 'bool' is included on the assumption that the private helper
# also counted boolean columns as numeric -- confirm.
numeric_cols = df_train.select_dtypes(include=['number', 'bool']).columns
cat_cols = [c for c in df_train.columns if c not in numeric_cols]
df_cate = df_train[cat_cols]
df_cate.shape
Dropping unhelpful columns
# Categorical columns excluded from the model.
unused_cat_cols = [
    'Alley', 'Neighborhood', 'Condition2', 'ExterCond', 'MiscFeature',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior2nd',
    'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageCond', 'PavedDrive', 'Functional', 'SaleType', 'SaleCondition',
]
df_cate = df_cate.drop(columns=unused_cat_cols)
df_cate.shape
df_cate.columns
Encoding
# Encoding plan for the remaining categorical columns:
#   1. Recategorize to 0/1: PoolQC, Fence
#   2. Ordinal: FireplaceQu, GarageQual, BsmtQual, ExterQual, HeatingQC,
#      KitchenQual
#   3. One-hot: MSZoning, LotShape, LandContour, LotConfig, LandSlope,
#      Condition1, BldgType, Exterior1st, MasVnrType, Foundation, Heating,
#      Electrical, GarageType, GarageFinish
#      (two-level columns Street, Utilities, CentralAir go through the same
#      encoder with drop='if_binary')

# 1.- Recategorizing: PoolQC, Fence
# The listed labels become 1 and missing values become 0.
presence_labels = {
    'PoolQC': ['Ex', 'Fa', 'Gd'],
    'Fence': ['MnPrv', 'GdWo', 'GdPrv', 'MnWw'],
}

# Values before recoding.
for feature in presence_labels:
    print(df_cate[feature].unique())

for feature, labels in presence_labels.items():
    df_cate[feature] = df_cate[feature].replace(labels, 1)
for feature in presence_labels:
    df_cate[feature] = df_cate[feature].fillna(0)

# Values after recoding.
for feature in presence_labels:
    print(df_cate[feature].unique())
# 2.- Ordinal
# Quality-style columns share one ordered scale; a missing value is recorded
# as 0 and placed at the bottom of that scale.
ord_features = [
    'FireplaceQu',
    'GarageQual',
    'BsmtQual',
    'ExterQual',
    'HeatingQC',
    'KitchenQual',
]
df_cate[ord_features] = df_cate[ord_features].fillna(0)

# One copy of the scale per encoded column, as OrdinalEncoder expects.
quality_scale = [0, 'Po', 'Fa', 'TA', 'Gd', 'Ex']
categories = [quality_scale] * len(ord_features)
ord_enc = OrdinalEncoder(categories=categories)
categories

encoded_quality = ord_enc.fit_transform(df_cate[ord_features])
df_cate[ord_features] = encoded_quality
df_cate[ord_features]
#3.-one-hot encoder:
# Nominal columns are expanded into indicator columns; columns with exactly
# two categories keep a single indicator (drop='if_binary').
oh_features = ['MSZoning','LotShape','LandContour','LotConfig','LandSlope',
               'Condition1','BldgType','Exterior1st','MasVnrType','Foundation',
               'Heating','Electrical','GarageType','GarageFinish','Street',
               'Utilities','CentralAir']
oh_enc = OneHotEncoder(drop='if_binary').fit(df_cate[oh_features])
oh_result = oh_enc.transform(df_cate[oh_features]).toarray()

# Give the indicator columns real names ("<feature>_<category>") instead of
# the default integer labels. Integer labels end up mixed with the string
# labels of the other columns, and recent scikit-learn estimators refuse
# frames whose column names mix str and int.
try:
    oh_columns = oh_enc.get_feature_names_out(oh_features)
except AttributeError:  # older scikit-learn without get_feature_names_out
    oh_columns = oh_enc.get_feature_names(oh_features)

# Reuse df_cate's own index so rows line up in the concat below even when the
# Ids are not exactly 1..n.
oh_result = pd.DataFrame(oh_result, index=df_cate.index, columns=oh_columns)

df_cate = df_cate.drop(oh_features, axis=1)
df_cate = pd.concat([df_cate, oh_result], axis=1)
df_cate.shape
df_cate
back to train...
# Rebuild the training frame: numeric columns first (SalePrice is the first of
# them), followed by the encoded categorical columns, aligned on the row index.
df_train = pd.concat([df_num,df_cate], axis=1)
df_train
outliers
# Drop every row that is an IQR outlier (outside Q1 - 1.5*IQR .. Q3 + 1.5*IQR)
# in at least one column of df_num.
outlier_ids = set()
for col in df_num.columns:
    # nanpercentile ignores missing values. With np.percentile a single NaN
    # makes both quartiles NaN, every comparison below is then False, and the
    # column is silently left unfiltered (missing values are only filled in a
    # later step).
    Q1 = np.nanpercentile(df_num[col], 25)
    Q3 = np.nanpercentile(df_num[col], 75)
    IQR = Q3 - Q1
    up_limit = Q3 + 1.5 * IQR
    lo_limit = Q1 - 1.5 * IQR
    outside = (df_num[col] < lo_limit) | (df_num[col] > up_limit)
    outlier_ids.update(df_num.index[outside.to_numpy()])
    # print(up_limit,lo_limit)

# Each offending row is listed once, however many columns flagged it.
rows = list(outlier_ids)
df_train = df_train.drop(rows, axis=0)
df_train
df_train.shape
dealing with na
# Per-column count and share of missing values, most affected columns first.
null_mask = df_train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

# Fill the missing entries of these two columns with 0.
for na_col in ('LotFrontage', 'MasVnrArea'):
    df_train[na_col] = df_train[na_col].fillna(0)

# Largest remaining per-column count of missing values.
df_train.isnull().sum().max()
It's done
# Display the cleaned training frame and its dimensions.
df_train
df_train.shape
PCA
# Project every column except the first (SalePrice) onto 25 principal
# components.
features = df_train.iloc[:, 1:]
pca = PCA(n_components=25)
pca.fit(features)
# Share of the total variance captured by the retained components.
pca.explained_variance_ratio_.sum()
reduced_scaled = pca.transform(features)

# Give the component frame and df_train the same fresh 1..n index so the
# target column lines up row by row in the concat below.
df_model = pd.DataFrame(reduced_scaled, index=np.arange(1, len(reduced_scaled) + 1))
df_train.index = np.arange(1, len(df_train) + 1)
df_model
df_train

# Final modelling frame: target first, then the components.
df_model = pd.concat([df_train['SalePrice'], df_model], axis=1)
df_model
model
# Hold out 30% of the rows for testing; the first column of df_model is the
# target and the remaining columns are the predictors.
predictors = df_model.iloc[:, 1:]
target = df_model['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, target, test_size=0.3, random_state=0
)

# Standardise the predictors, then fit a random forest.
pipeline = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(random_state=0),
)
pipeline.fit(X_train, y_train)

# Predictions on both splits.
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

# Mean squared error on each split.
mse_train = mean_squared_error(y_train, y_train_pred)
mse_test = mean_squared_error(y_test, y_test_pred)
print('MSE train: %.3f, test: %.3f' % (mse_train, mse_test))

# R-squared on each split.
r2_train = r2_score(y_train, y_train_pred)
r2_test = r2_score(y_test, y_test_pred)
print('R^2 train: %.3f, test: %.3f' % (r2_train, r2_test))
# Same train/test split as before, this time with a linear regression after
# standardising the predictors.
pipeline2 = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)
pipeline2.fit(X_train, y_train)

# Predictions on both splits (these replace the earlier prediction arrays).
y_train_pred = pipeline2.predict(X_train)
y_test_pred = pipeline2.predict(X_test)

# Mean squared error on each split.
mse_scores = (
    mean_squared_error(y_train, y_train_pred),
    mean_squared_error(y_test, y_test_pred),
)
print('MSE train: %.3f, test: %.3f' % mse_scores)

# R-squared on each split.
r2_scores = (
    r2_score(y_train, y_train_pred),
    r2_score(y_test, y_test_pred),
)
print('R^2 train: %.3f, test: %.3f' % r2_scores)