Housing Price Prediction

# Load the Ames training and test splits from CSV files in the working directory.
# NOTE(review): `pd` is presumably pandas, imported outside the visible source.
train, test = map(pd.read_csv, ("ames_train.csv", "ames_test.csv"))

# Preview the first rows of the training frame.
train.head()

# (number of rows, number of columns) of the training frame.
train.shape

# Summary statistics of the SalePrice column, which is the regression target below.
train['SalePrice'].describe()

# Distribution of sale prices.
# displot is a figure-level function: it always creates its own figure, so the
# size is requested through height/aspect (8 x 8 inches). A preceding
# plt.figure(figsize=...) would not size this plot and would only leave an
# empty extra figure behind.
ax = sns.displot(train['SalePrice'], height=8, aspect=1)  # a FacetGrid, not an Axes
# Rotate the x tick labels so the long price values do not overlap.
ax.set_xticklabels(rotation=90)

# Above-ground living area against sale price, with a fitted regression line
# (kind='reg'). Small, semi-transparent markers keep dense regions readable.
sns.jointplot(
    data=train,
    x='Gr_Liv_Area',
    y='SalePrice',
    kind='reg',
    scatter_kws=dict(s=3, alpha=0.5),
)

def remove_outlier(df, area, column='Gr_Liv_Area'):
    """Return the rows of ``df`` whose ``column`` value is at most ``area``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``.
    area : scalar
        Inclusive upper bound; rows with a larger value are dropped.
    column : str, default 'Gr_Liv_Area'
        Column the bound is applied to.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the kept rows; ``df`` itself is not modified.
        Rows where ``column`` is missing are dropped as well, because a
        comparison against NaN evaluates to False.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    keep = df[column] <= area
    # Explicit copy so callers can add columns to the result without
    # triggering pandas' SettingWithCopyWarning on a filtered frame.
    return df.loc[keep].copy()

# Drop the rows whose Gr_Liv_Area exceeds 5000.
train = remove_outlier(train, 5000)

# Correlation heatmap of the numeric features. Order and PID are excluded
# (presumably row identifiers -- confirm against the data dictionary).
# The frame is restricted to numeric columns explicitly: DataFrame.corr()
# raises on string columns in pandas >= 2.0 instead of silently skipping them.
plt.figure(figsize=(12, 12), tight_layout=True)
sns.heatmap(
    train.drop(['Order', 'PID'], axis=1).select_dtypes(include='number').corr(),
    cmap='viridis',
)

We begin feature selection by choosing the features that are highly correlated with sales price.

# Data-processing pipeline helpers.
def select_column(data, col):
    """Return ``data`` restricted to the columns listed in ``col``.

    Built on ``reindex``, so a requested column that is absent from ``data``
    is created and filled with NaN instead of raising.
    """
    return data.reindex(col, axis='columns')


def process_data_gm(data):
    """Turn a raw frame into the ``(x, y)`` pair used by the linear model.

    Steps, in order: drop rows with Gr_Liv_Area above 5000, keep the target
    plus the chosen predictors, replace every missing value with 0, then
    split the target off. A predictor that is absent from ``data`` (for
    example 'is_rich' before it has been added) therefore becomes an
    all-zero column.

    Returns
    -------
    tuple
        ``(x, y)``: ``x`` holds the predictor columns, ``y`` is the
        'SalePrice' column.
    """
    target = 'SalePrice'
    predictors = [
        'Overall_Qual',
        'Total_Bsmt_SF',
        '1st_Flr_SF',
        'Gr_Liv_Area',
        'Garage_Area',
        'is_rich',
    ]
    prepared = (
        data.pipe(remove_outlier, 5000)
        .pipe(select_column, [target] + predictors)
        .fillna(0)
    )
    return prepared.drop(columns=target), prepared[target]

def rmse(y, y_hat):
    """Return the root-mean-squared error between ``y`` and ``y_hat``.

    Both arguments must support element-wise subtraction with each other
    (e.g. NumPy arrays or pandas Series).
    """
    deviation = y - y_hat
    mean_squared = np.mean(np.square(deviation))
    return np.sqrt(mean_squared)

def modeling(train):
    """Fit a linear regression on a split of ``train`` and plot its residuals.

    The frame is split 80/20 with a fixed random state, both parts go through
    ``process_data_gm``, a ``lm.LinearRegression`` is fitted on the larger
    part and evaluated on the held-out part. A joint plot of the residuals
    (prediction minus actual price) against the actual price is drawn as a
    side effect.

    NOTE(review): ``train_test_split`` and ``lm`` are presumably scikit-learn's
    splitter and ``sklearn.linear_model`` -- confirm against the imports.

    Returns
    -------
    The RMSE of the predictions on the held-out part.
    """
    fit_part, holdout_part = train_test_split(train, test_size=0.2, random_state=42)
    x_fit, y_fit = process_data_gm(fit_part)
    x_holdout, y_holdout = process_data_gm(holdout_part)

    regressor = lm.LinearRegression()
    regressor.fit(x_fit, y_fit)
    predicted = regressor.predict(x_holdout)

    # Residual plot: a negative value means the model predicted too low.
    sns.jointplot(x=y_holdout, y=predicted - y_holdout)
    plt.xlabel("actual sales price")
    plt.ylabel('residual')

    return rmse(y_holdout, predicted)

# Baseline run. 'is_rich' has not been added to train at this point, so
# process_data_gm fills that column with zeros.
modeling(train)

The residual plot indicates that the model systematically underestimates the prices of higher-value homes. Let's address that by incorporating some of the categorical variables.

# Distinct neighborhood labels present in the training data.
train['Neighborhood'].unique()

# Sale-price distribution per neighborhood.
ax = sns.boxplot(x='Neighborhood', y='SalePrice', data=train)
# Rotate the labels through tick_params: it only changes the tick styling,
# whereas set_xticklabels(get_xticklabels(), ...) re-sets the label texts,
# a pattern the matplotlib documentation discourages.
ax.tick_params(axis='x', labelrotation=90)

def rich_neighborhood(df, n=3):
    """Return the ``n`` neighborhoods with the highest median sale price.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Neighborhood' and 'SalePrice' columns.
    n : int, default 3
        Number of neighborhoods to return.

    Returns
    -------
    list
        Neighborhood labels ordered from highest to lowest median price.
        Fewer than ``n`` labels are returned when ``df`` has fewer
        distinct neighborhoods.
    """
    median_price = df.groupby('Neighborhood')['SalePrice'].median()
    ranked = median_price.sort_values(ascending=False)
    # head(n) rather than a fixed count, so the parameter actually controls
    # how many neighborhoods are returned.
    return ranked.head(n).index.to_list()


rich_neighborhood(train)

# Flag homes located in one of the top-priced neighborhoods (1) or not (0).
# NOTE(review): the neighborhood ranking is computed from SalePrice over all
# of train, including rows that modeling() later holds out for validation --
# confirm this leakage is acceptable.
in_rich_area = train['Neighborhood'].isin(rich_neighborhood(train))
train['is_rich'] = in_rich_area.astype(int)
train.head()

# Re-run the model now that the 'is_rich' feature is populated.
modeling(train)