%matplotlib inline
import os
import pprint
import urllib.request

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Remote location of the AB_NYC_2019 CSV used throughout this notebook.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

# Download only once; later runs reuse the local copy.
if not os.path.exists("AB_NYC_2019.csv"):
    # Standard-library download: unlike the `!wget` shell escape this does not
    # require IPython or an external `wget` binary on the PATH.
    urllib.request.urlretrieve(data, "AB_NYC_2019.csv")
# Columns kept for the analysis; 'price' is the prediction target and is
# split off from the feature matrix later on.
features = [
    'latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
    'reviews_per_month', 'calculated_host_listings_count', 'availability_365',
]
df = pd.read_csv('AB_NYC_2019.csv')
df = df[features]
df.head()
df.dtypes
print(df.shape, len(df))
sns.histplot(df.price, bins=50)

# Question 1: count the missing values per column and report the count for
# 'reviews_per_month' from the data instead of hard-coding it in the message.
missing_counts = df.isnull().sum()
missing_counts
print(f"[ANSWER-1] The feature 'reviews_per_month' has {missing_counts['reviews_per_month']} missing values")

# Question 2: median of 'minimum_nights'.
df.minimum_nights.describe()
print(f"[ANSWER-2] The median of th 'minimum_nights' feature is: {df.minimum_nights.median()}")
# Shuffle the row positions reproducibly (seed 42) and split them into
# train / validation / test: 20% validation, 20% test, the remainder train.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
print(f"n_train:{n_train}, n_val:{n_val}, n_test:{n_test}")

idx = np.arange(n)
np.random.seed(42)
np.random.shuffle(idx)

val_end = n_train + n_val
df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
df_val = df.iloc[idx[n_train:val_end]].reset_index(drop=True)
df_test = df.iloc[idx[val_end:]].reset_index(drop=True)

# The target is log(1 + price); `pop` removes the column from each frame so
# that the price cannot leak into the feature matrix.
y_train = np.log1p(df_train.pop('price').values)
y_val = np.log1p(df_val.pop('price').values)
y_test = np.log1p(df_test.pop('price').values)

sns.histplot(y_train, bins=50)
df_train

# Computing the training mean
reviews_train_mean_value = df_train['reviews_per_month'].mean()
reviews_train_mean_value
def prepare_data(df, replace_value=None, column='reviews_per_month'):
    """Return the feature matrix of *df* as a NumPy array.

    Args:
        df: DataFrame of features. It is copied first, so the caller's
            frame is never modified.
        replace_value: Value substituted for missing entries of *column*.
            With ``None`` (the default) nothing is imputed and missing
            values are passed through unchanged.
        column: Name of the column to impute. Defaults to
            ``'reviews_per_month'``.

    Returns:
        The ``.values`` array of the (optionally imputed) copy of *df*.
    """
    df_temp = df.copy()
    if replace_value is not None:
        df_temp[column] = df_temp[column].fillna(replace_value)
    return df_temp.values
# Build the training feature matrix with missing 'reviews_per_month' values
# filled by the training mean. The result is not stored; presumably this was
# the last line of a notebook cell, used to eyeball the array.
prepare_data(df_train, reviews_train_mean_value)
def train_linear_regression(X, y):
    """Fit a linear regression with an intercept via the normal equation.

    Args:
        X: 2-D feature matrix with one row per sample.
        y: 1-D target vector with one entry per row of *X*.

    Returns:
        A tuple ``(w0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: If the Gram matrix is singular.
    """
    # Prepend a column of ones so the first weight acts as the intercept.
    X = np.column_stack([np.ones(X.shape[0]), X])
    XTX = X.T.dot(X)
    # Solve (X^T X) w = X^T y directly instead of forming the explicit
    # inverse: it is cheaper and loses less precision when the Gram matrix
    # is ill-conditioned.
    w_full = np.linalg.solve(XTX, X.T.dot(y))
    return w_full[0], w_full[1:]
def rmse(y, y_pred):
    """Return the root mean squared error between *y* and *y_pred*.

    Both arguments may be any array-like (lists, tuples, NumPy arrays). They
    are converted to float arrays and compared element by element by
    position.

    Args:
        y: Observed target values.
        y_pred: Predicted values, broadcastable against *y*.

    Returns:
        The square root of the mean squared difference.
    """
    # Converting to float first lets plain sequences work and prevents
    # wrap-around when unsigned integer arrays are subtracted.
    y = np.asarray(y, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y - y_pred) ** 2))
# Question 3: compare two ways of imputing 'reviews_per_month'.

# Option A: fill the missing values with 0.
X_train = prepare_data(df_train, 0)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_data(df_val, 0)
y_pred = w0 + X_val.dot(w)
rmse_zero_fill = round(rmse(y_val, y_pred), 2)
rmse_zero_fill

# Option B: fill with the mean computed on the training split only, and reuse
# that same training mean for the validation split.
X_train = prepare_data(df_train, reviews_train_mean_value)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_data(df_val, reviews_train_mean_value)
y_pred = w0 + X_val.dot(w)
rmse_mean_fill = round(rmse(y_val, y_pred), 2)
rmse_mean_fill

# Derive the conclusion from the two scores instead of asserting it, so the
# message cannot contradict the numbers if the data or the split changes.
if rmse_zero_fill == rmse_mean_fill:
    print(f"[ANSWER-3] Both cases are giving the same RMSE score: {rmse_zero_fill}")
else:
    better_fill = "0" if rmse_zero_fill < rmse_mean_fill else "the training mean"
    print(
        f"[ANSWER-3] Filling with {better_fill} gives the better RMSE score: "
        f"{min(rmse_zero_fill, rmse_mean_fill)} "
        f"(zero fill: {rmse_zero_fill}, mean fill: {rmse_mean_fill})"
    )
def train_linear_regression_reg(X, y, r=0.001):
    """Fit a regularized linear regression with an intercept.

    Solves ``(X^T X + r * I) w = X^T y`` where *X* has been extended with a
    leading column of ones. The identity matrix spans every column, so the
    penalty is applied to the intercept as well as to the feature weights.

    Args:
        X: 2-D feature matrix with one row per sample.
        y: 1-D target vector with one entry per row of *X*.
        r: Amount added to the diagonal of the Gram matrix. ``0`` gives the
            unregularized solution.

    Returns:
        A tuple ``(w0, w)``: the intercept and the array of feature weights.

    Raises:
        numpy.linalg.LinAlgError: If the regularized Gram matrix is singular.
    """
    # Prepend a column of ones so the first weight acts as the intercept.
    X = np.column_stack([np.ones(X.shape[0]), X])
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    # Solve the linear system directly instead of forming the explicit
    # inverse: it is cheaper and loses less precision.
    w_full = np.linalg.solve(XTX, X.T.dot(y))
    return w_full[0], w_full[1:]
# Question 4: sweep the regularization strength `r` with zero-filled data and
# keep the value that gives the lowest validation RMSE.
r_candidates = [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]

X_train = prepare_data(df_train, 0)
X_val = prepare_data(df_val, 0)

results = {}     # r -> RMSE rounded to 2 decimals (for display)
raw_scores = {}  # r -> unrounded RMSE (for picking the winner)
for r in r_candidates:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    raw_rmse = rmse(y_val, y_pred)
    raw_scores[r] = raw_rmse
    results[r] = round(raw_rmse, 2)

# `min` returns the first key holding the smallest score, so ties resolve to
# the earliest (smallest) r in the candidate list.
best_rmse_arg = min(raw_scores, key=raw_scores.get)
best_rmse = raw_scores[best_rmse_arg]

pprint.pprint(results)
print(f"[ANSWER-4] Best RMSE is {best_rmse} for r value: {best_rmse_arg}")
# Question 5: repeat the split with ten different shuffle seeds and measure
# how much the validation RMSE varies between them.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
val_end = n_train + n_val

scores = []
for seed in range(10):
    idx = np.arange(n)
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_train = df.iloc[idx[:n_train]].reset_index(drop=True)
    df_val = df.iloc[idx[n_train:val_end]].reset_index(drop=True)
    df_test = df.iloc[idx[val_end:]].reset_index(drop=True)

    # log(1 + price) targets; `pop` also drops the column from the features.
    y_train = np.log1p(df_train.pop('price').values)
    y_val = np.log1p(df_val.pop('price').values)
    y_test = np.log1p(df_test.pop('price').values)

    # Unregularized model on zero-filled data, scored on validation.
    X_train = prepare_data(df_train, 0)
    w0, w = train_linear_regression(X_train, y_train)
    X_val = prepare_data(df_val, 0)
    y_pred = w0 + X_val.dot(w)
    scores.append(rmse(y_val, y_pred))

print(f"[ANSWER-5] The standard deviation of all the scores is: {round(np.std(scores), 3)}")
# Question 6: with seed 9, train on train + validation combined
# (zero fill, r=0.001) and report the RMSE on the held-out test split.
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - n_val - n_test
n_full_train = n_train + n_val

idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_full_train]].reset_index(drop=True)
df_test = df.iloc[idx[n_full_train:]].reset_index(drop=True)

# log(1 + price) targets; `pop` also drops the column from the features.
y_train = np.log1p(df_train.pop('price').values)
y_test = np.log1p(df_test.pop('price').values)

X_train = prepare_data(df_train, 0)
w0, w = train_linear_regression_reg(X_train, y_train, r=0.001)

X_test = prepare_data(df_test, 0)
y_pred = w0 + X_test.dot(w)
print(f"[ANSWER-6] The RMSE score in test dataset is: {round(rmse(y_test, y_pred), 2)}")