# Introduction

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

%matplotlib inline

# Load the submission template and the test/train tables from the working
# directory.
sample_submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
# `sample` holds the same table as `sample_submission`; take an independent
# copy of the frame already in memory instead of parsing the file twice.
sample = sample_submission.copy()

train.head()

test.head()

# Correlation of every numeric feature with SalePrice, weakest to strongest.
# Only numeric columns are passed to corr(): recent pandas versions raise when
# the frame still contains string columns instead of silently skipping them.
train.select_dtypes(include='number').corr()['SalePrice'].sort_values()

# Overall quality vs. sale price; the red line marks SalePrice = 200000.
sns.scatterplot(data=train, x='OverallQual', y='SalePrice')
plt.axhline(y=200000, color='r')

# Houses rated above 8 in overall quality that sold for less than 200000.
train[(train['OverallQual'] > 8) & (train['SalePrice'] < 200000)][['SalePrice', 'OverallQual']]

# Living area vs. sale price, with guide lines at SalePrice = 200000 and
# GrLivArea = 4000.
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')

# Outliers: living area above 4000 combined with a sale price below 400000.
# The mask is built once and reused for both the preview and the drop.
outlier_mask = (train['GrLivArea'] > 4000) & (train['SalePrice'] < 400000)
train[outlier_mask][['SalePrice', 'GrLivArea']]

index_drop = train[outlier_mask].index
train = train.drop(index_drop, axis=0)

# Same two scatter plots again, now without the dropped rows.
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')

sns.scatterplot(x='OverallQual', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')

# Distribution of sale price within each overall-quality level.
sns.boxplot(x='OverallQual', y='SalePrice', data=train)

# Remove the 'Id' column from the training frame.
train = train.drop('Id', axis=1)

# Boolean mask of missing cells.
train.isnull()

# How many values are missing in each feature?
train.isnull().sum()

# The percentage of missing values in each feature:
100 * (train.isnull().sum() / len(train))

def missing_percent(train):
    """Return the percentage of missing values per column, sorted ascending.

    Columns that have no missing value at all are left out of the result.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are inspected for missing values.

    Returns
    -------
    pandas.Series
        Indexed by column name; each value is the share of missing rows in
        that column expressed in percent (0-100), in increasing order.
    """
    nan_percent = 100 * (train.isnull().sum() / len(train))
    # Keep only the columns that actually have something missing.
    nan_percent = nan_percent[nan_percent > 0].sort_values()
    return nan_percent

nan_percent = missing_percent(train)

nan_percent

# Plot the features that have missing data, showing the percentage missing.
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Every feature with missing data must be checked.
# We choose a threshold of 1%: if less than 1% of a feature's values are
# missing, we consider simply dropping those rows.
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
# Limit the y-axis to the 1% threshold:
plt.ylim(0, 1)

# Features below the 1% threshold, then just their names.
nan_percent[nan_percent < 1]

nan_percent[nan_percent < 1].index

# A feature with exactly one missing row has this percentage of missing data.
100 / len(train)

# Inspect the rows where 'Electrical' or 'GarageArea' is missing.
train[train['Electrical'].isnull()]

train[train['GarageArea'].isnull()]

# Drop every row in which any of these three columns is missing.
train = train.dropna(axis=0, subset=['Electrical', 'GarageArea', 'MasVnrArea'])

# Recompute the missing percentages and re-plot, limited to the 0-1% range.
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
plt.ylim(0, 1)

# Inspect the rows with missing basement measurements.
train[train['TotalBsmtSF'].isnull()]

train[train['BsmtHalfBath'].isnull()]

train[train['BsmtFullBath'].isnull()]

# According to the data documentation, values are missing in the basement
# features because those houses have no basement.
# Decision: fill per column type (numeric vs. descriptive string).
# Numeric columns are filled with 0:
bsmt_num_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']
train[bsmt_num_cols] = train[bsmt_num_cols].fillna(0)
# String columns are filled with 'None':
bsmt_str_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
train[bsmt_str_cols] = train[bsmt_str_cols].fillna('None')

# Recompute the missing percentages and re-plot, limited to the 0-1% range.
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
plt.ylim(0, 1)

# Masonry veneer: a missing type becomes "None" and a missing area becomes 0.
train["MasVnrType"] = train["MasVnrType"].fillna("None")
train["MasVnrArea"] = train["MasVnrArea"].fillna(0)

# Recompute the missing percentages and re-plot (full y-range this time).
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Look at the garage-related columns.
train[['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']]

# Fill the missing values: 'None' for the descriptive columns, 0 for the
# year the garage was built.
Gar_str_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
train[Gar_str_cols] = train[Gar_str_cols].fillna('None')
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)

# Recompute the missing percentages and re-plot.
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Names of the features that still have missing values.
nan_percent.index

train[['LotFrontage', 'FireplaceQu', 'Fence', 'Alley', 'MiscFeature', 'PoolQC']]

# Remove these four columns from the frame entirely.
train = train.drop(['Fence', 'Alley', 'MiscFeature', 'PoolQC'], axis=1)

nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Fill in fireplace quality based on the dataset documentation:
train['FireplaceQu'] = train['FireplaceQu'].fillna('None')

nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

train['Neighborhood'].unique()

# Distribution of LotFrontage within each neighborhood.
plt.figure(figsize=(8, 12))
sns.boxplot(data=train, x='LotFrontage', y='Neighborhood')

# Impute missing data based on other columns:
train.groupby('Neighborhood')['LotFrontage']

train.groupby('Neighborhood')['LotFrontage'].mean()

# Preview: within each neighborhood, missing LotFrontage values are replaced
# by that neighborhood's mean LotFrontage.
train.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))

# Apply the same per-neighborhood fill to the frame.
train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage'].transform(lambda val: val.fillna(val.mean()))

# Whatever is still missing after the group-wise fill is set to 0.
train['LotFrontage'] = train['LotFrontage'].fillna(0)

nan_percent = missing_percent(train)

nan_percent

train['MSSubClass']

train.info()

train['MSSubClass'].unique()

# Convert to string so the column is selected with the object-typed columns
# below rather than with the numeric ones:
train['MSSubClass'] = train['MSSubClass'].apply(str)

train.info()  # or: train['MSSubClass'].dtype

train.select_dtypes(include='object')

# Split the frame into non-object columns and object (string) columns.
df_num = train.select_dtypes(exclude='object')
df_obj = train.select_dtypes(include='object')

df_num.info()

df_obj.info()

# Convert the string columns to dummy/indicator columns, dropping the first
# level of each feature:
df_obj = pd.get_dummies(df_obj, drop_first=True)

df_obj.shape

# Put the numeric columns and the dummy columns back together side by side.
Final_df = pd.concat([df_num, df_obj], axis=1)

Final_df.head()

Final_df.isnull()

# Features are every column except the target; the target is SalePrice.
X = Final_df.drop(['SalePrice'], axis=1)
y = Final_df['SalePrice']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

!pip install xgboost

import xgboost as xgb

xg_reg = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.1, max_depth=6, n_estimators=1000, verbosity=3)

xg_reg.fit(X_train, y_train)

y_pred = xg_reg.predict(X_test)

metrics.mean_absolute_error(y_test, y_pred)

metrics.r2_score(y_test, y_pred)

train['SalePrice'].mean()

train['SalePrice'].std()

xgb.plot_importance(xg_reg)