# Introduction
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
# Load the input CSV files from the working directory.
sample_submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")
# `sample` used to be produced by reading sample_submission.csv a second time.
# Copying the frame that is already in memory yields the same independent
# DataFrame without the redundant disk read.
sample = sample_submission.copy()
# Preview the first rows of the training and test sets.
train.head()
test.head()
# Correlation of each numeric feature with SalePrice, sorted ascending.
# The frame still contains string columns here; on pandas >= 2.0
# DataFrame.corr() raises on those instead of skipping them, so restrict the
# frame to numeric columns first (this form also works on older pandas).
train.select_dtypes(include='number').corr()['SalePrice'].sort_values()

# Overall quality vs. sale price, with a reference line at 200000.
sns.scatterplot(data=train, x='OverallQual', y='SalePrice')
plt.axhline(y=200000, color='r')
# Rows with OverallQual above 8 that sold for less than 200000.
train.loc[
    (train['OverallQual'] > 8) & (train['SalePrice'] < 200000),
    ['SalePrice', 'OverallQual'],
]

# Living area vs. sale price, with reference lines at 200000 and 4000.
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')

# Rows with GrLivArea above 4000 that sold for less than 400000 are treated as
# outliers. Compute their index once, inspect the rows, then drop them.
index_drop = train[(train['GrLivArea'] > 4000) & (train['SalePrice'] < 400000)].index
train.loc[index_drop, ['SalePrice', 'GrLivArea']]
train = train.drop(index_drop, axis=0)

# Re-draw the same plots on the reduced frame.
sns.scatterplot(x='GrLivArea', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
plt.axvline(x=4000, color='r')
sns.scatterplot(x='OverallQual', y='SalePrice', data=train)
plt.axhline(y=200000, color='r')
sns.boxplot(x='OverallQual', y='SalePrice', data=train)
# Remove the Id column from the training frame.
train = train.drop(columns='Id')
# Boolean frame flagging every missing cell.
train.isna()
# Number of missing values in each feature.
train.isna().sum()
# Percentage of missing values in each feature.
100 * (train.isna().sum() / len(train))
# Helper: percentage of missing data per column, sorted ascending.
def missing_percent(train: pd.DataFrame) -> pd.Series:
    """Return the percentage of missing values for each column that has any.

    Args:
        train: Data frame to inspect.

    Returns:
        A Series indexed by column name holding the percentage (0-100) of
        missing entries in that column. Columns without missing values are
        left out, and the result is sorted in ascending order.
    """
    percent_missing = 100 * (train.isnull().sum() / len(train))
    # Keep only the columns that actually contain missing values.
    return percent_missing[percent_missing > 0].sort_values()
# Missing-data percentages for the current training frame.
nan_percent = missing_percent(train)
nan_percent

# Bar chart of the missing-data percentage for every affected feature.
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Every feature with missing data has to be checked.
# A threshold of 1% is used: if less than 1% of a feature's values are
# missing, simply dropping the affected rows is considered.
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
# Limit the y-axis to the 0-1% band to make the threshold visible.
plt.ylim(0, 1)

# Features below the 1% threshold, and their names.
nan_percent[nan_percent < 1]
nan_percent[nan_percent < 1].index

# Percentage that a single row represents; a feature with exactly one
# missing row shows up with this value.
100 / len(train)

# Inspect the rows where Electrical or GarageArea is missing.
train.loc[train['Electrical'].isna()]
train.loc[train['GarageArea'].isna()]

# Drop every row that lacks a value in any of these three features.
train = train.dropna(subset=['Electrical', 'GarageArea', 'MasVnrArea'])

# Recompute and re-plot the percentages, again zoomed to 0-1%.
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
plt.ylim(0, 1)

# Inspect the rows with missing basement values.
train.loc[train['TotalBsmtSF'].isna()]
train.loc[train['BsmtHalfBath'].isna()]
train.loc[train['BsmtFullBath'].isna()]
# The original analysis notes, based on the data documentation, that the
# missing values (two rows) in the basement features occur because those
# houses have no basement.
# Decision: fill by column type - numeric basement columns with 0,
# descriptive string columns with 'None'.

# Numeric basement columns -> 0.
bsmt_num_cols = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtFullBath',
    'BsmtHalfBath',
]
train[bsmt_num_cols] = train[bsmt_num_cols].fillna(0)

# String basement columns -> 'None'.
bsmt_str_cols = [
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
]
train[bsmt_str_cols] = train[bsmt_str_cols].fillna('None')

# Re-check what is still missing, zoomed to the 0-1% band.
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
plt.ylim(0, 1)

# Masonry veneer: missing type becomes 'None', missing area becomes 0.
train['MasVnrType'] = train['MasVnrType'].fillna('None')
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Look at the garage features before filling them.
train[['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond']]

# Garage: string columns get 'None', the build year gets 0.
Gar_str_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
train[Gar_str_cols] = train[Gar_str_cols].fillna('None')
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(0)

# Remaining features with missing data.
nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)
nan_percent.index
# Inspect the features that still contain missing values.
train[['LotFrontage', 'FireplaceQu', 'Fence', 'Alley', 'MiscFeature', 'PoolQC']]

# Remove these four features from the frame entirely.
train = train.drop(columns=['Fence', 'Alley', 'MiscFeature', 'PoolQC'])

nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Fireplace quality: fill missing entries with 'None', following the
# dataset documentation as stated by the original analysis.
train['FireplaceQu'] = train['FireplaceQu'].fillna('None')

nan_percent = missing_percent(train)
plt.figure(figsize=(12, 6))
sns.barplot(x=nan_percent.index, y=nan_percent)
plt.xticks(rotation=90)

# Distribution of LotFrontage within each neighborhood.
train['Neighborhood'].unique()
plt.figure(figsize=(8, 12))
sns.boxplot(data=train, x='LotFrontage', y='Neighborhood')

# Impute LotFrontage from another column: use the mean of the row's
# neighborhood. The first three expressions only inspect intermediate results.
train.groupby('Neighborhood')['LotFrontage']
train.groupby('Neighborhood')['LotFrontage'].mean()
train.groupby('Neighborhood')['LotFrontage'].transform(
    lambda val: val.fillna(val.mean())
)
train['LotFrontage'] = train.groupby('Neighborhood')['LotFrontage'].transform(
    lambda val: val.fillna(val.mean())
)
# Anything still missing after the group-wise fill is set to 0.
train['LotFrontage'] = train['LotFrontage'].fillna(0)

# Check what is left.
nan_percent = missing_percent(train)
nan_percent
# Inspect MSSubClass and the overall column dtypes.
train['MSSubClass']
train.info()
train['MSSubClass'].unique()

# Convert MSSubClass to strings so it is handled with the object columns below.
train['MSSubClass'] = train['MSSubClass'].apply(str)
train.info()
# Alternative check: look at the dtype of the column directly.

# Split the frame into object-typed columns and all other columns.
train.select_dtypes(include='object')
df_num = train.select_dtypes(exclude='object')
df_obj = train.select_dtypes(include='object')
df_num.info()
df_obj.info()

# One-hot encode the object columns, dropping the first level of each.
df_obj = pd.get_dummies(df_obj, drop_first=True)
df_obj.shape

# Re-assemble the non-object and encoded columns side by side.
Final_df = pd.concat([df_num, df_obj], axis=1)
Final_df.head()
Final_df.isna()

# Features and target, then an 80/20 train/test split with a fixed seed.
X = Final_df.drop(columns=['SalePrice'])
y = Final_df['SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
!pip install xgboost
import xgboost as xgb
xg_reg = xgb.XGBRegressor(colsample_bytree=0.8, learning_rate=0.1, max_depth=6, n_estimators=1000, verbosity=3)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
metrics.mean_absolute_error(y_test, y_pred)
metrics.r2_score(y_test, y_pred)
train['SalePrice'].mean()
train['SalePrice'].std()
xgb.plot_importance(xg_reg)