# Load the Ames training and test CSV files from the working directory.
train = pd.read_csv("ames_train.csv")
test = pd.read_csv("ames_test.csv")

# Quick look at the training data: first rows, dimensions, and summary
# statistics of the target column.
train.head()
train.shape
train['SalePrice'].describe()

# Distribution of the target. `sns.displot` is a figure-level function that
# always creates its own figure, so a preceding `plt.figure(figsize=(8, 8))`
# only leaves an empty figure behind. The 8x8 inch size is requested through
# `height` / `aspect` instead.
# NOTE: despite its name, `ax` holds the FacetGrid returned by displot.
ax = sns.displot(train['SalePrice'], height=8, aspect=1)
ax.set_xticklabels(rotation=90)

# Living area against sale price with a fitted regression line; the scatter
# markers are drawn small and semi-transparent.
sns.jointplot(x='Gr_Liv_Area', y='SalePrice', data=train, kind='reg',
              scatter_kws={'s': 3, 'alpha': 0.5})
def remove_outlier(df, area):
    """Return the rows of ``df`` whose ``Gr_Liv_Area`` is at most ``area``.

    The input frame is left untouched; a filtered frame is returned. Rows
    with a missing ``Gr_Liv_Area`` fail the comparison and are dropped too.
    """
    within_limit = df['Gr_Liv_Area'] <= area
    return df.loc[within_limit]
# Drop the rows whose Gr_Liv_Area exceeds 5000 before any modeling.
train = remove_outlier(train, 5000)

# Correlation heatmap of the remaining columns, excluding 'Order' and 'PID'.
# Non-numeric columns (e.g. 'Neighborhood') are filtered out explicitly:
# DataFrame.corr() stopped dropping them silently in pandas 2.0 and raises on
# string columns instead. select_dtypes works the same on older versions.
plt.figure(figsize=(12,12), tight_layout=True)
sns.heatmap(
    train.drop(['Order', 'PID'], axis=1)
         .select_dtypes(include=['number', 'bool'])
         .corr(),
    cmap='viridis',
)
We begin feature selection by choosing the features that are most highly correlated with the sale price (`SalePrice`).
#building a pipeline for processing data
def select_column(data, col):
    """Return a new frame holding exactly the columns listed in ``col``.

    Columns appear in the order given. Labels that are absent from ``data``
    do not raise: the reindex creates them filled with NaN.
    """
    return data.reindex(col, axis='columns')
def process_data_gm(data):
    """Turn a raw frame into a ``(features, target)`` pair for the model.

    Steps: drop rows with ``Gr_Liv_Area`` above 5000, keep only the target
    and the chosen feature columns, and replace missing values with 0. A
    column missing from ``data`` altogether (``select_column`` reindexes, so
    it comes back as all-NaN) therefore ends up as a column of zeros.

    Returns
    -------
    x : frame of feature columns (everything except ``SalePrice``)
    y : the ``SalePrice`` column
    """
    feature_cols = ['Overall_Qual', 'Total_Bsmt_SF', '1st_Flr_SF', 'Gr_Liv_Area', 'Garage_Area', 'is_rich']
    trimmed = remove_outlier(data, 5000)
    frame = select_column(trimmed, ['SalePrice'] + feature_cols).fillna(0)
    return frame[feature_cols], frame['SalePrice']
def rmse(y, y_hat):
    """Return the root-mean-squared error between ``y`` and ``y_hat``.

    The difference is taken with plain ``y - y_hat``, so the inputs must
    support element-wise subtraction (NumPy arrays, pandas Series, ...).
    """
    residual = y - y_hat
    mean_squared = np.mean(np.square(residual))
    return np.sqrt(mean_squared)
def modeling(train):
    """Fit a linear regression on an 80/20 split and return the hold-out RMSE.

    ``train`` is split with a fixed random seed, both parts go through
    ``process_data_gm``, an ``lm.LinearRegression`` is fitted on the larger
    part and scored on the held-out part. As a side effect, a joint plot of
    the residuals (prediction minus actual) against the actual prices is drawn.
    """
    fit_part, holdout = train_test_split(train, test_size=0.2, random_state=42)
    fit_x, fit_y = process_data_gm(fit_part)
    holdout_x, holdout_y = process_data_gm(holdout)

    regressor = lm.LinearRegression()
    regressor.fit(fit_x, fit_y)
    predicted = regressor.predict(holdout_x)
    error = rmse(holdout_y, predicted)

    # Residual plot: prediction minus actual, against the actual price.
    residual = predicted - holdout_y
    sns.jointplot(x=holdout_y, y=residual)
    plt.xlabel("actual sales price")
    plt.ylabel('residual')
    return error
# Baseline fit. This script only assigns train['is_rich'] further down, so
# unless the CSV already carries that column, the reindex inside
# process_data_gm creates it as NaN and fillna(0) turns it into all zeros.
modeling(train)
The residual plot indicates that the model systematically underestimates the prices of higher-value homes. Let's address that by incorporating some of the categorical variables.
# List the distinct neighborhood labels, then draw one SalePrice box per
# neighborhood.
train['Neighborhood'].unique()
ax = sns.boxplot(x = 'Neighborhood', y='SalePrice', data=train)
# Re-apply the current x tick labels rotated by 90 degrees. The trailing
# semicolon presumably suppresses the echoed return value in a notebook.
ax.set_xticklabels(ax.get_xticklabels(), rotation=90);
def rich_neighborhood(df, n = 3):
    """Return the ``n`` neighborhoods with the highest median ``SalePrice``.

    Parameters
    ----------
    df : DataFrame with ``Neighborhood`` and ``SalePrice`` columns.
    n : number of neighborhoods to return (default 3).

    Returns
    -------
    list of neighborhood labels, ordered from highest to lowest median price.
    """
    median_price = df.groupby('Neighborhood')['SalePrice'].median()
    ranked = median_price.sort_values(ascending=False)
    # Honor ``n``: the count used to be hard-coded to 3, ignoring the argument.
    return ranked.head(n).index.to_list()
# Show the neighborhoods that rich_neighborhood returns for the training data.
rich_neighborhood(train)
# Binary indicator column: 1 when the row's neighborhood is in that list,
# 0 otherwise.
train['is_rich'] = train['Neighborhood'].isin(rich_neighborhood(train)).astype(int)
train.head()
# Fit again now that 'is_rich' is populated, for comparison with the earlier
# modeling(train) run.
modeling(train)