!pip install beautifulsoup4
!pip install nltk
# Number of reviews to keep for this (down-sampled) experiment.
num_total = 200

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

reviews_fname = 'reviews.csv'
reviews_df = pd.read_csv(reviews_fname)
print(reviews_df.shape)
# Keep only the first num_total reviews.  head() replaces the old
# drop(range(num_total, 286427)), which hard-coded the full row count and
# broke whenever the CSV changed size; the recorded output shows the
# truncation to (200, 6) was intended.
reviews_df = reviews_df.head(num_total)
print(reviews_df.shape)
(286427, 6)
(200, 6)
listing_fname = 'listings.csv'
listing_df = pd.read_csv(listing_fname)

# Attach listing attributes to each review, drop the now-redundant join
# key, and restore readable names for the two id columns the merge suffixed.
merged_df = (
    pd.merge(reviews_df, listing_df, left_on='listing_id', right_on='id')
      .drop(columns=['listing_id'])
      .rename(columns={'id_x': 'reviews_id', 'id_y': 'listing_id'})
)

# Nightly price is the regression target; everything else is a feature.
y = merged_df['price']
X = merged_df.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from bs4 import BeautifulSoup
import nltk
# Fetch the tokenizer model and stop-word lists used below (no-op if they
# are already present in the NLTK data directory).
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# English stop words as a set for O(1) membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
def lower_case(text):
    """Return *text* with every character converted to lowercase."""
    lowered = text.lower()
    return lowered
from string import punctuation

# Translation table built once: maps every ASCII punctuation character to None.
_PUNCT_TABLE = str.maketrans('', '', punctuation)

def remove_punctuation(document):
    """Return *document* with every ASCII punctuation character removed.

    str.translate strips the characters in a single C-level pass instead
    of the previous per-character Python list comprehension.
    """
    return document.translate(_PUNCT_TABLE)
def remove_digit(document):
    """Return *document* with all digit characters stripped out."""
    return ''.join(ch for ch in document if not ch.isdigit())
def remove_stopwords(document):
    """Return the tokens of *document* that are not English stop words.

    NOTE(review): stop_words (module level) holds lower-case entries, so
    tokens should be lower-cased before this filter — confirm the calling
    pipeline does so.
    """
    # `word not in` is the idiomatic negated-membership test.
    return [word for word in document
            if word not in stop_words]
# One stemmer instance reused for every token.
porter = PorterStemmer()

def stemmer(document):
    """Return the Porter-stemmed form of every token in *document*."""
    return list(map(porter.stem, document))
def combine_changes(dataframe, field):
    """Run the full text-cleaning pipeline over dataframe[field].

    Lower-cases, strips punctuation and digits, tokenizes, removes stop
    words, and stems.  The helpers lower_case / remove_punctuation /
    remove_digit were defined above but never wired in; without
    lower_case first, the stop-word filter misses capitalized stop words
    ('The', 'And', ...) because stop_words holds lower-case entries.
    Rows with missing text are dropped, so the result's index can be a
    strict subset of dataframe.index.
    """
    return (dataframe[field]
            .dropna()
            .apply(lower_case)
            .apply(remove_punctuation)
            .apply(remove_digit)
            .apply(word_tokenize)
            .apply(remove_stopwords)
            .apply(stemmer))

X_train_reviews = combine_changes(X_train, 'comments')
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Re-join each token list into a plain string so CountVectorizer can consume it.
X_train_reviews_detok = X_train_reviews.apply(TreebankWordDetokenizer().detokenize)
countvec = CountVectorizer()
sparse_dtm = countvec.fit_transform(X_train_reviews_detok)
# get_feature_names() was deprecated in sklearn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement.  Use the detokenized series'
# own index: combine_changes drops NaN comments, so X_train.index could be
# longer than the matrix and raise a length mismatch.
dtm = pd.DataFrame(sparse_dtm.toarray(),
                   columns=countvec.get_feature_names_out(),
                   index=X_train_reviews_detok.index)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
warnings.warn(msg, category=FutureWarning)
# Corpus-wide term counts, most frequent first.
frequencies = dtm.sum(axis=0).sort_values(ascending=False)
common_terms = frequencies[frequencies > 100]
print(common_terms)
holli 137
stay 105
br 101
dtype: int64
import seaborn as sns

# Distribution of counts for the terms appearing more than 50 times.
plt.figure(figsize=(8, 6))
freq = pd.DataFrame(frequencies).transpose()
ax = plt.hist(frequencies[frequencies > 50])
plt.xlabel('terms')
plt.ylabel(' ')
plt.show()
X_test_reviews = combine_changes(X_test, 'comments')
X_test_reviews_detok = X_test_reviews.apply(TreebankWordDetokenizer().detokenize)
# Use transform, NOT fit_transform: the test set must be encoded with the
# vocabulary fitted on the training data.  Refitting here produced a
# different column set, which the intersect/join patch further down then
# had to repair.
sparse_dtm_test = countvec.transform(X_test_reviews_detok)
dtm_test = pd.DataFrame(sparse_dtm_test.toarray(),
                        columns=countvec.get_feature_names_out(),
                        index=X_test_reviews_detok.index)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
warnings.warn(msg, category=FutureWarning)
# Align the test DTM to the training DTM: same column set AND same column
# order, with 0 counts for vocabulary unseen in the test split.  reindex
# replaces the old intersection/join/fillna dance (the join only ever
# contributed NaN, since train and test row indices are disjoint) and
# additionally fixes the column-order mismatch that sklearn warns about
# at predict time.
dtm_test = dtm_test.reindex(columns=dtm.columns, fill_value=0)
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# 10-fold CV over 50 evenly spaced cost-complexity pruning strengths.
grid_values = {'ccp_alpha': np.linspace(0, 0.1, 50)}
base_tree = DecisionTreeRegressor(random_state=88)
dtr_cv = GridSearchCV(base_tree, param_grid=grid_values, cv=10).fit(dtm, y_train)
ccp_alpha = dtr_cv.cv_results_['param_ccp_alpha'].data
ACC_scores = dtr_cv.cv_results_['mean_test_score']

plt.figure(figsize=(8, 6))
plt.xlabel('ccp_alpha', fontsize=16)
# GridSearchCV's default score for a regressor is R^2, not accuracy —
# label the axis accordingly.
plt.ylabel('CV R2 score', fontsize=16)
plt.scatter(ccp_alpha, ACC_scores, s=3)
plt.plot(ccp_alpha, ACC_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best ccp_alpha', dtr_cv.best_params_)
Best ccp_alpha {'ccp_alpha': 0.0}
from sklearn.tree import plot_tree

print('Node count =', dtr_cv.best_estimator_.tree_.node_count)
plt.figure(figsize=(20, 10))
# The tree was fitted on the document-term matrix, so its feature names
# are dtm's vocabulary columns — not X_train's raw columns (wrong length
# and wrong labels).  class_names is dropped: it only applies to
# classifiers, and this is a regressor.
plot_tree(dtr_cv.best_estimator_,
          feature_names=list(dtm.columns),
          filled=True,
          impurity=False,
          fontsize=12)
plt.show()
Node count = 1
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the pruned tree on the held-out document-term matrix.
y_pred = dtr_cv.predict(dtm_test)
mae_dtr_cv = mean_absolute_error(y_test, y_pred)
mse_dtr_cv = mean_squared_error(y_test, y_pred)
r2_dtr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_dtr_cv, mse_dtr_cv, r2_dtr_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
# NOTE(review): this cell duplicates the plot-tree cell above — consider
# deleting one of the two.
from sklearn.tree import plot_tree

print('Node count =', dtr_cv.best_estimator_.tree_.node_count)
plt.figure(figsize=(20, 10))
# Same fixes as above: the model was fitted on dtm, so use its vocabulary
# columns; class_names does not apply to a regressor.
plot_tree(dtr_cv.best_estimator_,
          feature_names=list(dtm.columns),
          filled=True,
          impurity=False,
          fontsize=12)
plt.show()
Node count = 1
# NOTE(review): this cell duplicates the metrics cell above.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = dtr_cv.predict(dtm_test)
mae_dtr_cv = mean_absolute_error(y_test, y_pred)
mse_dtr_cv = mean_squared_error(y_test, y_pred)
r2_dtr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_dtr_cv, mse_dtr_cv, r2_dtr_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
from sklearn.ensemble import RandomForestRegressor
import time

# Random forest: tune only max_features; the other hyper-parameters are fixed.
grid_values = {'max_features': np.linspace(1, 100, 50, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [50],
               'random_state': [88]}

tic = time.time()
rf_cv = GridSearchCV(RandomForestRegressor(), param_grid=grid_values, cv=5)
rf_cv.fit(dtm, y_train)
toc = time.time()
print('time:', round(toc - tic, 2), 's')
time: 32.06 s
max_features = rf_cv.cv_results_['param_max_features'].data
ACC_scores = rf_cv.cv_results_['mean_test_score']

plt.figure(figsize=(8, 6))
plt.xlabel('max_features', fontsize=16)
# The regressor's CV score is R^2, not accuracy — label the axis accordingly.
plt.ylabel('CV R2 score', fontsize=16)
plt.scatter(max_features, ACC_scores, s=3)
plt.plot(max_features, ACC_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best parameters', rf_cv.best_params_)
Best parameters {'max_features': 1, 'min_samples_leaf': 5, 'n_estimators': 50, 'random_state': 88}
# Held-out performance of the tuned random forest.
y_pred = rf_cv.predict(dtm_test)
mae_rf_cv = mean_absolute_error(y_test, y_pred)
mse_rf_cv = mean_squared_error(y_test, y_pred)
r2_rf_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_rf_cv, mse_rf_cv, r2_rf_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
from sklearn.ensemble import GradientBoostingRegressor

# Boosting: jointly sweep ensemble size and leaf-node budget with a slow
# learning rate; max_depth is set high so max_leaf_nodes is the binding limit.
grid_values = {'n_estimators': np.linspace(1, 100, 50, dtype='int32'),
               'learning_rate': [0.01],
               'max_leaf_nodes': np.linspace(2, 10, 8, dtype='int32'),
               'max_depth': [100],
               'min_samples_leaf': [10],
               'random_state': [88]}

tic = time.time()
gbr_cv = GridSearchCV(GradientBoostingRegressor(), param_grid=grid_values, cv=5)
gbr_cv.fit(dtm, y_train)
toc = time.time()
print('time:', round(toc - tic, 2), 's')
time: 98.13 s
n_estimators = gbr_cv.cv_results_['param_n_estimators'].data
cv_acc_scores = gbr_cv.cv_results_['mean_test_score']

plt.figure(figsize=(12, 8))
plt.xlabel('n estimators', fontsize=16)
# The regressor's CV score is R^2, not accuracy — label the axis accordingly.
plt.ylabel('CV R2 score', fontsize=16)
plt.grid(True, which='both')

# One curve per max_leaf_nodes value.  NOTE(review): the slicing relies on
# GridSearchCV enumerating the grid with parameter names sorted
# alphabetically and later names varying fastest, so each consecutive run
# of M entries shares one max_leaf_nodes value — verify if sklearn changes
# its ParameterGrid ordering.
N = len(grid_values['max_leaf_nodes'])
M = len(grid_values['n_estimators'])
for i in range(N):
    plt.scatter(n_estimators[(M*i):(M*i)+M], cv_acc_scores[(M*i):(M*i)+M], s=30)
    plt.plot(n_estimators[(M*i):(M*i)+M], cv_acc_scores[(M*i):(M*i)+M], linewidth=2,
             label='max leaf nodes = '+str(grid_values['max_leaf_nodes'][i]))
plt.legend(loc='lower right')
plt.show()
# Held-out performance of the tuned gradient-boosted ensemble.
y_pred = gbr_cv.predict(dtm_test)
mae_gbr_cv = mean_absolute_error(y_test, y_pred)
mse_gbr_cv = mean_squared_error(y_test, y_pred)
r2_gbr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_gbr_cv, mse_gbr_cv, r2_gbr_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
models = (dtr_cv, rf_cv, gbr_cv)
# Side-by-side comparison of the three tuned models.  Labels corrected:
# all three are regressors predicting price (the old table said
# "Gradient Boosting Classifier" and headed the column "Multiclass Model").
model_name = ["Decision Tree Regressor CV",
              "Random Forest Regressor CV",
              "Gradient Boosting Regressor CV"]
mae = [mae_dtr_cv, mae_rf_cv, mae_gbr_cv]
mse = [mse_dtr_cv, mse_rf_cv, mse_gbr_cv]
r2 = [r2_dtr_cv, r2_rf_cv, r2_gbr_cv]
pd.DataFrame({'Regression Model': model_name, 'MAE': mae, 'MSE': mse, 'R2': r2}).head(200)