House sales price - Reduction and prediction

Notebook under renovation! Sorry for the inconvenience. (All the code works, but I'm still working on the storytelling.)

Libraries

# import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from scipy.stats import kurtosis, norm, skew
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_squared_log_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler, StandardScaler

Data

# Load the training set, using the 'Id' column as the row index.
data = pd.read_csv('train.csv', index_col='Id')

# Work on a copy so the frame read from disk stays untouched.
df_train = data.copy()
df_train.shape

df_train.info()

df_train.describe()

SalePrice

# Summary statistics of the target column.
df_train['SalePrice'].describe()

# Pass the series by keyword so the plot orientation does not depend on how
# the installed seaborn version interprets a lone positional argument.
sns.boxplot(x=df_train['SalePrice'])

# histplot replaces the deprecated distplot; stat='density' puts the bars on
# the same scale as the KDE curve.
sns.histplot(df_train['SalePrice'], kde=True, stat='density')

df_train['SalePrice'].skew()

df_train['SalePrice'].kurt()

# Replace the target with log(1 + SalePrice). Everything downstream
# (correlations, outlier rule, model metrics) works on this transformed scale.
df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

# Histogram + KDE of the transformed target with a fitted normal density
# overlaid (the equivalent of the deprecated distplot(..., fit=norm)).
ax = sns.histplot(df_train['SalePrice'], kde=True, stat='density')
mu, sigma = norm.fit(df_train['SalePrice'])
grid = np.linspace(*ax.get_xlim(), 200)
ax.plot(grid, norm.pdf(grid, mu, sigma), color='black')

Numeric

# Keep only the numeric columns, using the public select_dtypes API rather
# than the private DataFrame._get_numeric_data().
# NOTE(review): unlike the private helper, include='number' does not pick up
# bool columns -- confirm the data has none.
df_num = df_train.select_dtypes(include='number')
df_num

dropping un-correlated

# Pairwise correlations of the numeric columns, drawn as a heatmap.
corrmat = df_num.corr()
f, ax = plt.subplots(figsize=(15, 12))
sns.heatmap(corrmat, vmax=1, square=True);

corrmat.columns

corrmat.shape

# Correlation of each column with SalePrice, largest first (up to 37 rows).
corrmat.nlargest(37, 'SalePrice')['SalePrice']

# Keep the 20 columns with the largest correlation to SalePrice.
# Threshold-based alternative considered by the author:
# cols = corrmat.index[abs(corrmat['SalePrice']) > 0.3]
cols = corrmat.nlargest(20, 'SalePrice')['SalePrice'].index
cols

dropping redundant

# Restrict the numeric frame to the selected columns and recompute the
# correlation matrix on that subset.
df_num = df_num[cols]
corrmat = df_num.corr()

fig, ax = plt.subplots(figsize=(12,10))
sns.heatmap(corrmat, annot=True, ax=ax)

# For every selected column, print its three largest correlations (the
# column itself plus its two closest companions).
for column in cols:
    print(corrmat.nlargest(3,column)[column])
    print()

# Drop one column out of each strongly inter-correlated group.
redundant_columns = ['TotRmsAbvGrd','GarageCars','1stFlrSF','GarageYrBlt','YearRemodAdd','2ndFlrSF']
df_num = df_num.drop(columns=redundant_columns)

df_num.columns

# Correlation of each remaining column (all but the first) with SalePrice.
for column in df_num.columns[1:]:
    print(column, df_num['SalePrice'].corr(df_num[column]))

other skewed features

def diagnostic(df, var):
    """Show a histogram and a normal probability plot of ``df[var]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to inspect.
    var : str
        Name of the column to plot.
    """
    plt.figure(figsize = (17, 6))
    plt.subplot(1,2,1)
    df[var].hist(bins = 40)
    plt.title("Distribution of {}".format(var))
    plt.subplot(1,2,2)
    stats.probplot(df[var], dist = "norm", plot = plt)
    plt.ylabel("Quantiles")
    plt.show()


# Inspect every column except the first one.
numeric_vars = df_num.columns
for var in numeric_vars[1:]:
    diagnostic(df_num, var)

# Skewness per column, ascending. Missing values are dropped first because
# scipy's skew() otherwise returns NaN for any column that contains one.
vars_skewed = df_train[df_num.columns].apply(lambda x: skew(x.dropna())).sort_values()
vars_skewed

# Same for kurtosis.
vars_kurtosis = df_train[df_num.columns].apply(lambda x: kurtosis(x.dropna())).sort_values()
vars_kurtosis

# Log-transform every feature but leave the target alone: SalePrice was
# already replaced by log1p(SalePrice) earlier in the notebook. The target is
# excluded by name because vars_skewed is sorted, so slicing its index with
# [1:] would skip the lowest-skew column instead of SalePrice.
for var in df_num.columns.drop('SalePrice'):
    df_num[var] = np.log1p(df_num[var])

# Re-inspect the distributions after the transform.
numeric_vars = df_num.columns
for var in numeric_vars[1:]:
    diagnostic(df_num, var)

Numeric data is ready...

# Display the prepared numeric frame.
df_num

Categorical

# Names of the non-numeric columns. The numeric column set is computed once
# with the public select_dtypes API; the previous version called the private
# _get_numeric_data() again for every column inside the comprehension.
numeric_cols = set(df_train.select_dtypes(include='number').columns)
cat_cols = [c for c in df_train.columns if c not in numeric_cols]

df_cate = df_train[cat_cols]
df_cate.shape

dropping unhelpful

# Categorical columns that are not carried into the model.
unused_categoricals = [
    'Alley','Neighborhood','Condition2','ExterCond','MiscFeature',
    'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior2nd',
    'BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
    'GarageCond','PavedDrive','Functional','SaleType','SaleCondition',
]
df_cate = df_cate.drop(columns=unused_categoricals)
df_cate.shape

df_cate.columns

Encoding

# Encoding plan
# 1.- recategorizing: PoolQC, Fence
# 2.- ordinal: 'FireplaceQu','GarageQual','BsmtQual','ExterQual','HeatingQC','KitchenQual'
# 3.- one-hot encoder: 'MSZoning','LotShape','LandContour','LotConfig','LandSlope',
#     'Condition1','BldgType','Exterior1st','MasVnrType','Foundation',
#     'Heating','Electrical','GarageType','GarageFinish',
#     binary (if_binary): Street, Utilities, CentralAir

# 1.- Recategorizing: PoolQC, Fence
print(df_cate['PoolQC'].unique())
print(df_cate['Fence'].unique())

# Collapse both columns to a 0/1 flag: 1 when the value is one of the listed
# labels, 0 otherwise (missing values included). isin() yields an integer
# column directly instead of relying on replace() + fillna() to downcast an
# object column, whose resulting dtype depends on the pandas version.
df_cate['PoolQC'] = df_cate['PoolQC'].isin(['Ex','Fa','Gd']).astype(int)
df_cate['Fence'] = df_cate['Fence'].isin(['MnPrv', 'GdWo', 'GdPrv', 'MnWw']).astype(int)

print(df_cate['PoolQC'].unique())
print(df_cate['Fence'].unique())

# 2.- Ordinal
ord_features = ['FireplaceQu','GarageQual','BsmtQual','ExterQual','HeatingQC','KitchenQual']

# Missing values become an explicit 'NA' level so every column holds strings
# only (the previous int sentinel 0 mixed ints and strings in one column).
df_cate[ord_features] = df_cate[ord_features].fillna('NA')

# Levels in ascending order; 'NA' is encoded as 0, 'Ex' as 5. The same list
# is used for every ordinal feature.
quality_levels = ['NA','Po','Fa','TA','Gd','Ex']
categories = [quality_levels] * len(ord_features)
ord_enc = OrdinalEncoder(categories=categories)
categories

df_cate[ord_features] = ord_enc.fit_transform(df_cate[ord_features])

df_cate[ord_features]

# 3.- one-hot encoder
oh_features = ['MSZoning','LotShape','LandContour','LotConfig','LandSlope',
               'Condition1','BldgType','Exterior1st','MasVnrType','Foundation',
               'Heating','Electrical','GarageType','GarageFinish','Street',
               'Utilities','CentralAir']

# drop='if_binary' keeps a single indicator column for two-level features.
oh_enc = OneHotEncoder(drop='if_binary').fit(df_cate[oh_features])

# Build the indicator frame with string column names and the same row index
# as df_cate. Integer column names would mix with the string names of the
# other columns after the concat below, which scikit-learn estimators reject
# in recent versions; reusing df_cate.index also avoids assuming the row
# labels are exactly 1..n.
oh_result = oh_enc.transform(df_cate[oh_features]).toarray()
oh_result = pd.DataFrame(
    oh_result,
    index=df_cate.index,
    columns=oh_enc.get_feature_names_out(oh_features),
)

df_cate = df_cate.drop(oh_features, axis=1)

df_cate = pd.concat([df_cate, oh_result], axis=1)
df_cate.shape

df_cate

back to train...

# Put the numeric and the encoded categorical columns back side by side,
# aligned on the row index.
df_train = pd.concat([df_num, df_cate], axis=1)
df_train

outliers

# Flag rows that fall outside the 1.5 * IQR fences in at least one numeric
# column. DataFrame.quantile skips missing values; np.percentile returns NaN
# for a column containing any NaN, which made both fences NaN and silently
# disabled the rule for such columns.
Q1 = df_num.quantile(0.25)
Q3 = df_num.quantile(0.75)
IQR = Q3 - Q1
up_limit = Q3 + 1.5 * IQR
lo_limit = Q1 - 1.5 * IQR

# Comparisons against NaN are False, so rows with a missing value in a column
# are never flagged because of that column.
outlier_mask = (df_num.lt(lo_limit, axis=1) | df_num.gt(up_limit, axis=1)).any(axis=1)
rows = df_num.index[outlier_mask].tolist()

df_train = df_train.drop(rows, axis=0)

df_train

df_train.shape

dealing with na

# Count and share of missing values per column, largest first.
null_mask = df_train.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data

# Fill the missing values of these two columns with 0.
for column in ('LotFrontage', 'MasVnrArea'):
    df_train[column] = df_train[column].fillna(0)

# Largest per-column count of remaining missing values.
df_train.isnull().sum().max()

it's done

# Display the cleaned training frame and its dimensions.
df_train

df_train.shape

PCA

# Reduce the feature columns to 25 principal components.
pca = PCA(n_components=25)

# iloc[:, 1:] skips the first column -- presumably SalePrice, since the
# numeric columns were ordered by correlation with it; verify.
# NOTE(review): the features are not standardized before this fit and the fit
# uses all rows, before the train/test split -- confirm both are intended.
pca.fit(df_train.iloc[:,1:])

# Share of the total variance retained by the 25 components.
pca.explained_variance_ratio_.sum()

# Project the same feature columns onto the fitted components.
reduced_scaled = pca.transform(df_train.iloc[:,1:])

# Wrap the components in a frame and give both frames the same 1..n row
# labels so the concat below lines the rows up by position.
df_model = pd.DataFrame(reduced_scaled, index=np.arange(1, len(reduced_scaled) + 1))
df_train.index = np.arange(1, len(df_train) + 1)

df_model

df_train

# Target first, principal components after it.
df_model = pd.concat([df_train['SalePrice'], df_model], axis=1)
df_model

model

def _fit_and_report(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and print train/test MSE and R^2.

    Parameters
    ----------
    model : estimator or pipeline exposing ``fit`` and ``predict``.
    X_train, X_test : feature matrices of the two splits.
    y_train, y_test : matching targets.

    Returns
    -------
    tuple
        ``(y_train_pred, y_test_pred)`` -- predictions on both splits.
    """
    model.fit(X_train, y_train)

    # Predictions for the training and the test split.
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Mean squared error
    print('MSE train: %.3f, test: %.3f' % (mean_squared_error(y_train, train_pred),
                                           mean_squared_error(y_test, test_pred)))
    # R-squared
    print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, train_pred),
                                           r2_score(y_test, test_pred)))
    return train_pred, test_pred


# 70/30 split: every column after the first is a feature, SalePrice is the
# target. The metrics below are therefore on the transformed SalePrice scale.
X_train,X_test,y_train,y_test = train_test_split(df_model.iloc[:,1:],
                                                 df_model['SalePrice'],
                                                 random_state=0,
                                                 test_size=0.3)

# Random forest on standardized components.
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=0))
y_train_pred, y_test_pred = _fit_and_report(pipeline, X_train, X_test, y_train, y_test)

# Linear regression on the same features, for comparison.
pipeline2 = make_pipeline(StandardScaler(), LinearRegression())
y_train_pred, y_test_pred = _fit_and_report(pipeline2, X_train, X_test, y_train, y_test)

House sales price - Reduction and prediction

Libraries

Data

SalePrice

Numeric

dropping un-correlated

dropping redundant

other skewed features

Numeric data is ready...

Categorical

dropping unhelpful

Encoding

back to train...

outliers

dealing with na

it's done

PCA

model

House sales price - Reduction and prediction