%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
import pprint
# Download the NYC Airbnb 2019 dataset once; skip if it is already cached locally.
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'
if not os.path.exists("AB_NYC_2019.csv"):
# NOTE(review): `!wget` is IPython shell magic — this file only runs inside a
# notebook/IPython. The line also appears to have lost its indentation under
# the `if` above during export; confirm against the original notebook.
!wget $data
# Columns used for this homework: two categorical features, the numeric
# features, and `price` (the target).
features = ['neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews',
            'reviews_per_month', 'calculated_host_listings_count', 'availability_365']
df = pd.read_csv('AB_NYC_2019.csv')
df = df[features]
print(df.shape, len(df))
# Normalize every string (object-dtype) column: lowercase, spaces -> underscores.
strings = list(df.dtypes[df.dtypes == 'object'].index)
for col in strings:
    df[col] = df[col].str.lower().str.replace(" ", "_")
# Replace missing values with 0 (assignment form instead of inplace=True,
# which pandas discourages).
df = df.fillna(0)
# Fix: derive BOTH the mode and its count from the data. Previously the count
# was computed against a hard-coded 'manhattan' literal, which could silently
# disagree with the mode actually printed.
mode_group = df['neighbourhood_group'].mode().values[0]
mode_count = int((df['neighbourhood_group'] == mode_group).sum())
print(f"[ANSWER-1] The mode of the column 'neighbourhood_group' is {mode_group} with {mode_count} aparitions")
# 60/20/20 split: carve off 20% as test, then take 25% of the remaining 80%
# as validation (0.25 * 0.8 = 0.2 of the full data). Seeds fixed for
# reproducibility.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)
# Reset indices, pull out the target vectors, and drop `price` from the
# feature frames so it cannot leak into the models.
df_train, df_val, df_test = (part.reset_index(drop=True) for part in (df_train, df_val, df_test))
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values
for part in (df_train, df_val, df_test):
    del part['price']
# Numeric feature names (everything except the two categoricals and price).
numerical = ['latitude', 'longitude',
             'minimum_nights', 'number_of_reviews', 'reviews_per_month',
             'calculated_host_listings_count', 'availability_365']
# Absolute pairwise correlation matrix (numeric features + price), shown as a heatmap.
dfCorr = df_full_train[numerical + ['price']].corr().abs()
plt.figure(figsize=(30, 10))
sns.heatmap(dfCorr, annot=True, cmap="Reds")
plt.show()
print("[ANSWER-2] The two features with biggest correlation are: number_of_reviews and reviews_per_month")
df.price.mean()
# Binarize the target: 1 when price >= 152, else 0.
# NOTE(review): 152 is hard-coded; df.price.mean() is computed just above but
# its value is discarded — confirm 152 matches the intended "average".
above_average, above_average_val, above_average_test = (
    (y >= 152).astype('int') for y in (y_train, y_val, y_test)
)
# Mutual information between each categorical feature and the binarized
# price target (higher = more informative).
categorical = ['neighbourhood_group', 'room_type']


def mutual_info_above_average_score(series):
    """Mutual information of *series* vs. the global above_average labels, rounded to 2 dp."""
    return round(mutual_info_score(series, above_average), 2)


mi = pd.Series({col: mutual_info_above_average_score(df_train[col]) for col in categorical})
mi.sort_values(ascending=False)
print("[ANSWER-3] The variable 'room_type' has bigger mutual information score (0.14)")
# One-hot encode the categoricals via DictVectorizer (numeric columns pass
# through unchanged) and train a logistic regression that classifies
# above/below-average price.
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)
# Fix: DictVectorizer.get_feature_names() was removed in scikit-learn 1.2;
# prefer get_feature_names_out() and fall back for older versions.
if hasattr(dv, 'get_feature_names_out'):
    feature_names = dv.get_feature_names_out()
else:
    feature_names = dv.get_feature_names()
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
model.fit(X_train, above_average)
# Predicted probability of the positive class (price above average).
y_pred = model.predict_proba(X_val)[:, 1]
# model.score is equivalent to thresholding predict_proba at 0.5 and taking
# the mean agreement with the labels.
all_features_accuracy = model.score(X_val, above_average_val)
print(f"[ANSWER-4] The accuracy of the model on the validation dataset is: {round(all_features_accuracy, 2)}")
# Feature-elimination analysis: retrain with one feature removed at a time and
# record how much validation accuracy drops relative to the full model.
# Fix: the original loop reused the module-level names dv / X_train / X_val /
# model / train_dict / val_dict, clobbering the full-model objects from the
# previous section; the per-feature training now lives in its own scope.


def _accuracy_without(feature):
    """Validation accuracy of a logistic regression retrained with *feature* dropped."""
    cols = [c for c in df_train.columns if c != feature]
    vec = DictVectorizer(sparse=False)
    X_tr = vec.fit_transform(df_train[cols].to_dict(orient='records'))
    X_va = vec.transform(df_val[cols].to_dict(orient='records'))
    clf = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    clf.fit(X_tr, above_average)
    return clf.score(X_va, above_average_val)


differences = dict()
total_features = numerical + strings
for feature in total_features:
    # Positive difference => removing the feature hurts accuracy.
    differences[feature] = round(all_features_accuracy - _accuracy_without(feature), 4)
for key, value in differences.items():
    print(f"{key}:{value}")
print("[ANSWER-5] The feature with smallest difference (of the list) is: reviews_per_month")
def rmse(y, y_pred):
    """Root-mean-squared error between targets *y* and predictions *y_pred*."""
    return np.sqrt(np.mean((y - y_pred) ** 2))
# Ridge regression on log1p(price): the log transform compresses the
# long-tailed price distribution. (The classification labels above were
# computed before this in-place transform, so they are unaffected.)
y_train = np.log1p(y_train)
y_val = np.log1p(y_val)
y_test = np.log1p(y_test)
# Re-vectorize the full feature frames for the regression models.
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)
# Try several regularization strengths; keep the first alpha that achieves
# the lowest raw (unrounded) validation RMSE.
results = dict()
best_rmse = None
best_rmse_arg = None
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    raw_rmse = rmse(y_val, y_pred)
    if best_rmse is None or raw_rmse < best_rmse:
        best_rmse, best_rmse_arg = raw_rmse, a
    results[a] = round(raw_rmse, 3)
print(f"[ANSWER-6] Best RMSE is {round(best_rmse, 3)} for r value: {best_rmse_arg}")