!pip install beautifulsoup4
!pip install nltk
# Number of reviews to keep for this (down-sampled) experiment.
num_total = 200

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

reviews_fname = 'reviews.csv'
reviews_df = pd.read_csv(reviews_fname)
print(reviews_df.shape)
# Keep only the first num_total reviews.  head() replaces the old
# drop(range(num_total, 286427)), which hard-coded the full row count and
# broke whenever the CSV changed size; the recorded output shows the
# truncation to (200, 6) was intended.
reviews_df = reviews_df.head(num_total)
print(reviews_df.shape)
(286427, 6)
(200, 6)
listing_fname = 'listings.csv'
listing_df = pd.read_csv(listing_fname)

# Attach listing attributes to each review, drop the now-redundant join
# key, and restore readable names for the two id columns the merge suffixed.
merged_df = (
    pd.merge(reviews_df, listing_df, left_on='listing_id', right_on='id')
      .drop(columns=['listing_id'])
      .rename(columns={'id_x': 'reviews_id', 'id_y': 'listing_id'})
)

# Nightly price is the regression target; everything else is a feature.
y = merged_df['price']
X = merged_df.drop(columns=['price'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from bs4 import BeautifulSoup
import nltk
# Fetch the tokenizer model and stop-word lists used below (no-op if they
# are already present in the NLTK data directory).
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# English stop words as a set for O(1) membership tests in remove_stopwords.
stop_words = set(stopwords.words('english'))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
def lower_case(text):
    """Return *text* with every character converted to lowercase."""
    lowered = text.lower()
    return lowered
from string import punctuation

# Translation table built once: maps every ASCII punctuation character to None.
_PUNCT_TABLE = str.maketrans('', '', punctuation)

def remove_punctuation(document):
    """Return *document* with every ASCII punctuation character removed.

    str.translate strips the characters in a single C-level pass instead
    of the previous per-character Python list comprehension.
    """
    return document.translate(_PUNCT_TABLE)
def remove_digit(document):
    """Return *document* with all digit characters stripped out."""
    return ''.join(ch for ch in document if not ch.isdigit())
def remove_stopwords(document):
    """Return the tokens of *document* that are not English stop words.

    NOTE(review): stop_words (module level) holds lower-case entries, so
    tokens should be lower-cased before this filter — confirm the calling
    pipeline does so.
    """
    # `word not in` is the idiomatic negated-membership test.
    return [word for word in document
            if word not in stop_words]
# One stemmer instance reused for every token.
porter = PorterStemmer()

def stemmer(document):
    """Return the Porter-stemmed form of every token in *document*."""
    return list(map(porter.stem, document))
def combine_changes(dataframe, field):
    """Run the full text-cleaning pipeline over dataframe[field].

    Lower-cases, strips punctuation and digits, tokenizes, removes stop
    words, and stems.  The helpers lower_case / remove_punctuation /
    remove_digit were defined above but never wired in; without
    lower_case first, the stop-word filter misses capitalized stop words
    ('The', 'And', ...) because stop_words holds lower-case entries.
    Rows with missing text are dropped, so the result's index can be a
    strict subset of dataframe.index.
    """
    return (dataframe[field]
            .dropna()
            .apply(lower_case)
            .apply(remove_punctuation)
            .apply(remove_digit)
            .apply(word_tokenize)
            .apply(remove_stopwords)
            .apply(stemmer))

X_train_reviews = combine_changes(X_train, 'comments')
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer

# Re-join each token list into a plain string so CountVectorizer can consume it.
X_train_reviews_detok = X_train_reviews.apply(TreebankWordDetokenizer().detokenize)
countvec = CountVectorizer()
sparse_dtm = countvec.fit_transform(X_train_reviews_detok)
# get_feature_names() was deprecated in sklearn 1.0 and removed in 1.2;
# get_feature_names_out() is the replacement.  Use the detokenized series'
# own index: combine_changes drops NaN comments, so X_train.index could be
# longer than the matrix and raise a length mismatch.
dtm = pd.DataFrame(sparse_dtm.toarray(),
                   columns=countvec.get_feature_names_out(),
                   index=X_train_reviews_detok.index)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
warnings.warn(msg, category=FutureWarning)
# Corpus-wide term counts, most frequent first.
frequencies = dtm.sum(axis=0).sort_values(ascending=False)
common_terms = frequencies[frequencies > 100]
print(common_terms)
holli 137
stay 105
br 101
dtype: int64
import seaborn as sns

# Distribution of counts for the terms appearing more than 50 times.
plt.figure(figsize=(8, 6))
freq = pd.DataFrame(frequencies).transpose()
ax = plt.hist(frequencies[frequencies > 50])
plt.xlabel('terms')
plt.ylabel(' ')
plt.show()
X_test_reviews = combine_changes(X_test, 'comments')
X_test_reviews_detok = X_test_reviews.apply(TreebankWordDetokenizer().detokenize)
# Use transform, NOT fit_transform: the test set must be encoded with the
# vocabulary fitted on the training data.  Refitting here produced a
# different column set, which the intersect/join patch further down then
# had to repair.
sparse_dtm_test = countvec.transform(X_test_reviews_detok)
dtm_test = pd.DataFrame(sparse_dtm_test.toarray(),
                        columns=countvec.get_feature_names_out(),
                        index=X_test_reviews_detok.index)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
warnings.warn(msg, category=FutureWarning)
# Align the test DTM to the training DTM: same column set AND same column
# order, with 0 counts for vocabulary unseen in the test split.  reindex
# replaces the old intersection/join/fillna dance (the join only ever
# contributed NaN, since train and test row indices are disjoint) and
# additionally fixes the column-order mismatch that sklearn warns about
# at predict time.
dtm_test = dtm_test.reindex(columns=dtm.columns, fill_value=0)
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# 10-fold CV over 50 evenly spaced cost-complexity pruning strengths.
grid_values = {'ccp_alpha': np.linspace(0, 0.1, 50)}
base_tree = DecisionTreeRegressor(random_state=88)
dtr_cv = GridSearchCV(base_tree, param_grid=grid_values, cv=10).fit(dtm, y_train)
ccp_alpha = dtr_cv.cv_results_['param_ccp_alpha'].data
ACC_scores = dtr_cv.cv_results_['mean_test_score']

plt.figure(figsize=(8, 6))
plt.xlabel('ccp_alpha', fontsize=16)
# GridSearchCV's default score for a regressor is R^2, not accuracy —
# label the axis accordingly.
plt.ylabel('CV R2 score', fontsize=16)
plt.scatter(ccp_alpha, ACC_scores, s=3)
plt.plot(ccp_alpha, ACC_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best ccp_alpha', dtr_cv.best_params_)
Best ccp_alpha {'ccp_alpha': 0.0}
from sklearn.tree import plot_tree

print('Node count =', dtr_cv.best_estimator_.tree_.node_count)
plt.figure(figsize=(20, 10))
# The tree was fitted on the document-term matrix, so its feature names
# are dtm's vocabulary columns — not X_train's raw columns (wrong length
# and wrong labels).  class_names is dropped: it only applies to
# classifiers, and this is a regressor.
plot_tree(dtr_cv.best_estimator_,
          feature_names=list(dtm.columns),
          filled=True,
          impurity=False,
          fontsize=12)
plt.show()
Node count = 1
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluate the pruned tree on the held-out document-term matrix.
y_pred = dtr_cv.predict(dtm_test)
mae_dtr_cv = mean_absolute_error(y_test, y_pred)
mse_dtr_cv = mean_squared_error(y_test, y_pred)
r2_dtr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_dtr_cv, mse_dtr_cv, r2_dtr_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
# NOTE(review): this cell duplicates the plot-tree cell above — consider
# deleting one of the two.
from sklearn.tree import plot_tree

print('Node count =', dtr_cv.best_estimator_.tree_.node_count)
plt.figure(figsize=(20, 10))
# Same fixes as above: the model was fitted on dtm, so use its vocabulary
# columns; class_names does not apply to a regressor.
plot_tree(dtr_cv.best_estimator_,
          feature_names=list(dtm.columns),
          filled=True,
          impurity=False,
          fontsize=12)
plt.show()
Node count = 1
# NOTE(review): this cell duplicates the metrics cell above.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

y_pred = dtr_cv.predict(dtm_test)
mae_dtr_cv = mean_absolute_error(y_test, y_pred)
mse_dtr_cv = mean_squared_error(y_test, y_pred)
r2_dtr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_dtr_cv, mse_dtr_cv, r2_dtr_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
from sklearn.ensemble import RandomForestRegressor
import time

# Random forest: tune only max_features; the other hyper-parameters are fixed.
grid_values = {'max_features': np.linspace(1, 100, 50, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [50],
               'random_state': [88]}

tic = time.time()
rf_cv = GridSearchCV(RandomForestRegressor(), param_grid=grid_values, cv=5)
rf_cv.fit(dtm, y_train)
toc = time.time()
print('time:', round(toc - tic, 2), 's')
time: 32.06 s
max_features = rf_cv.cv_results_['param_max_features'].data
ACC_scores = rf_cv.cv_results_['mean_test_score']

plt.figure(figsize=(8, 6))
plt.xlabel('max_features', fontsize=16)
# The regressor's CV score is R^2, not accuracy — label the axis accordingly.
plt.ylabel('CV R2 score', fontsize=16)
plt.scatter(max_features, ACC_scores, s=3)
plt.plot(max_features, ACC_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best parameters', rf_cv.best_params_)
Best parameters {'max_features': 1, 'min_samples_leaf': 5, 'n_estimators': 50, 'random_state': 88}
# Held-out performance of the tuned random forest.
y_pred = rf_cv.predict(dtm_test)
mae_rf_cv = mean_absolute_error(y_test, y_pred)
mse_rf_cv = mean_squared_error(y_test, y_pred)
r2_rf_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_rf_cv, mse_rf_cv, r2_rf_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
from sklearn.ensemble import GradientBoostingRegressor

# Boosting: jointly sweep ensemble size and leaf-node budget with a slow
# learning rate; max_depth is set high so max_leaf_nodes is the binding limit.
grid_values = {'n_estimators': np.linspace(1, 100, 50, dtype='int32'),
               'learning_rate': [0.01],
               'max_leaf_nodes': np.linspace(2, 10, 8, dtype='int32'),
               'max_depth': [100],
               'min_samples_leaf': [10],
               'random_state': [88]}

tic = time.time()
gbr_cv = GridSearchCV(GradientBoostingRegressor(), param_grid=grid_values, cv=5)
gbr_cv.fit(dtm, y_train)
toc = time.time()
print('time:', round(toc - tic, 2), 's')
time: 98.13 s
n_estimators = gbr_cv.cv_results_['param_n_estimators'].data
cv_acc_scores = gbr_cv.cv_results_['mean_test_score']

plt.figure(figsize=(12, 8))
plt.xlabel('n estimators', fontsize=16)
# The regressor's CV score is R^2, not accuracy — label the axis accordingly.
plt.ylabel('CV R2 score', fontsize=16)
plt.grid(True, which='both')

# One curve per max_leaf_nodes value.  NOTE(review): the slicing relies on
# GridSearchCV enumerating the grid with parameter names sorted
# alphabetically and later names varying fastest, so each consecutive run
# of M entries shares one max_leaf_nodes value — verify if sklearn changes
# its ParameterGrid ordering.
N = len(grid_values['max_leaf_nodes'])
M = len(grid_values['n_estimators'])
for i in range(N):
    plt.scatter(n_estimators[(M*i):(M*i)+M], cv_acc_scores[(M*i):(M*i)+M], s=30)
    plt.plot(n_estimators[(M*i):(M*i)+M], cv_acc_scores[(M*i):(M*i)+M], linewidth=2,
             label='max leaf nodes = '+str(grid_values['max_leaf_nodes'][i]))
plt.legend(loc='lower right')
plt.show()
# Held-out performance of the tuned gradient-boosted ensemble.
y_pred = gbr_cv.predict(dtm_test)
mae_gbr_cv = mean_absolute_error(y_test, y_pred)
mse_gbr_cv = mean_squared_error(y_test, y_pred)
r2_gbr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_gbr_cv, mse_gbr_cv, r2_gbr_cv]}).head(200)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:488: FutureWarning: The feature names should match those that were passed during fit. Starting version 1.2, an error will be raised.
Feature names must be in the same order as they were in fit.
warnings.warn(message, FutureWarning)
models = (dtr_cv, rf_cv, gbr_cv)
# Side-by-side comparison of the three tuned models.  Labels corrected:
# all three are regressors predicting price (the old table said
# "Gradient Boosting Classifier" and headed the column "Multiclass Model").
model_name = ["Decision Tree Regressor CV",
              "Random Forest Regressor CV",
              "Gradient Boosting Regressor CV"]
mae = [mae_dtr_cv, mae_rf_cv, mae_gbr_cv]
mse = [mse_dtr_cv, mse_rf_cv, mse_gbr_cv]
r2 = [r2_dtr_cv, r2_rf_cv, r2_gbr_cv]
pd.DataFrame({'Regression Model': model_name, 'MAE': mae, 'MSE': mse, 'R2': r2}).head(200)