!pip install beautifulsoup4
!pip install nltk
num_total = 200  # optional cap on the number of reviews (see subsampling below)
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
reviews_fname = 'reviews.csv'
reviews_df = pd.read_csv(reviews_fname)
print(reviews_df.shape)
# Optionally subsample to num_total reviews for faster iteration:
# reviews_df = reviews_df.head(num_total)
print(reviews_df.shape)
listing_fname = 'listings.csv'
listing_df = pd.read_csv(listing_fname)
# Attach listing attributes (including price) to each review; the merge key is
# the review's listing_id against the listing's id.
merged_df = pd.merge(reviews_df, listing_df, left_on='listing_id', right_on='id')
merged_df = merged_df.drop(['listing_id'], axis=1)
merged_df = merged_df.rename(columns={'id_x': 'reviews_id', 'id_y': 'listing_id'})
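# Sanity check (added): an inner merge silently drops reviews whose listing_id
# has no matching listing; comparing counts makes that visible.
print('reviews:', len(reviews_df), '| merged:', len(merged_df))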
y = merged_df['price']
X = merged_df.drop('price', axis=1)
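# Note (assumption): in many Inside Airbnb exports the 'price' column is a
# string such as "$1,250.00". A regressor needs a numeric target, so convert
# if necessary; this guard is a no-op when the column is already numeric.
if y.dtype == object:
    y = (y.str.replace('$', '', regex=False)
          .str.replace(',', '', regex=False)
          .astype(float))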
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
from bs4 import BeautifulSoup
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')   # required by word_tokenize on newer NLTK releases
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
stop_words = set(stopwords.words('english'))
def lower_case(text):
    return text.lower()

from string import punctuation

def remove_punctuation(document):
    # Strip all punctuation characters
    return ''.join(character for character in document
                   if character not in punctuation)

def remove_digit(document):
    # Strip all digit characters
    return ''.join(character for character in document
                   if not character.isdigit())

def remove_stopwords(document):
    # Drop English stop words from a list of tokens
    return [word for word in document if word not in stop_words]

porter = PorterStemmer()

def stemmer(document):
    # Reduce each token to its Porter stem
    return [porter.stem(word) for word in document]
def combine_changes(dataframe, field):
    # Full preprocessing pipeline: lower-case, strip punctuation and digits,
    # tokenize, remove stop words, stem. fillna('') (rather than dropna())
    # keeps the index aligned with y_train/y_test downstream.
    return (dataframe[field].fillna('')
            .apply(lower_case)
            .apply(remove_punctuation)
            .apply(remove_digit)
            .apply(word_tokenize)
            .apply(remove_stopwords)
            .apply(stemmer))
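# Illustrative check of the pipeline on a toy review (hypothetical input):
# lower-case -> strip punctuation/digits -> tokenize -> drop stop words -> stem.
example = pd.Series(['The host was AMAZING, 10/10!'])
print(combine_changes(example.to_frame(name='comments'), 'comments')[0])
# expected: ['host', 'amaz']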
X_train_reviews = combine_changes(X_train, 'comments')
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.feature_extraction.text import CountVectorizer
X_train_reviews_detok = X_train_reviews.apply(TreebankWordDetokenizer().detokenize)
countvec = CountVectorizer()
sparse_dtm = countvec.fit_transform(X_train_reviews_detok)
dtm = pd.DataFrame(sparse_dtm.toarray(),
                   columns=countvec.get_feature_names_out(),
                   index=X_train.index)
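# Quick look at the fitted vocabulary and how sparse the matrix is.
print('Vocabulary size:', len(countvec.get_feature_names_out()))
print('Non-zero entries:', sparse_dtm.nnz, 'of', sparse_dtm.shape[0] * sparse_dtm.shape[1])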
frequencies = dtm.sum().sort_values(ascending=False)
print(frequencies[frequencies > 100])
# Histogram of how often high-frequency terms occur (terms with count > 50)
plt.figure(figsize=(8, 6))
plt.hist(frequencies[frequencies > 50])
plt.xlabel('term frequency')
plt.ylabel('number of terms')
plt.show()
X_test_reviews = combine_changes(X_test, 'comments')
X_test_reviews_detok = X_test_reviews.apply(TreebankWordDetokenizer().detokenize)
# Use transform() (not fit_transform()) so the test matrix is built with the
# vocabulary fitted on the training data; its columns then align with dtm
# automatically and no column reconciliation is needed.
sparse_dtm_test = countvec.transform(X_test_reviews_detok)
dtm_test = pd.DataFrame(sparse_dtm_test.toarray(),
                        columns=countvec.get_feature_names_out(),
                        index=X_test.index)
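# With transform() the columns are guaranteed to match the training matrix;
# this assertion documents the invariant the tree models rely on.
assert list(dtm_test.columns) == list(dtm.columns)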
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
# Tune the cost-complexity pruning parameter with 10-fold cross-validation.
grid_values = {'ccp_alpha': np.linspace(0, 0.1, 50)}
dtr = DecisionTreeRegressor(random_state=88)
dtr_cv = GridSearchCV(dtr, param_grid=grid_values, cv=10).fit(dtm, y_train)
ccp_alpha = dtr_cv.cv_results_['param_ccp_alpha'].data
r2_scores = dtr_cv.cv_results_['mean_test_score']  # a regressor's default score is R^2, not accuracy
plt.figure(figsize=(8, 6))
plt.xlabel('ccp_alpha', fontsize=16)
plt.ylabel('CV R^2', fontsize=16)
plt.scatter(ccp_alpha, r2_scores, s=3)
plt.plot(ccp_alpha, r2_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best ccp_alpha:', dtr_cv.best_params_)
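# GridSearchCV refits the best configuration on the full training set by
# default; best_score_ reports its mean cross-validated R^2.
print('Best CV R^2:', round(dtr_cv.best_score_, 4))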
from sklearn.tree import plot_tree
print('Node count =', dtr_cv.best_estimator_.tree_.node_count)
plt.figure(figsize=(20, 10))
# The tree was fitted on the document-term matrix, so use its columns as
# feature names; class_names does not apply to a regressor.
plot_tree(dtr_cv.best_estimator_,
          feature_names=dtm.columns,
          filled=True,
          impurity=False,
          fontsize=12)
plt.show()
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
y_pred = dtr_cv.predict(dtm_test)
mae_dtr_cv = mean_absolute_error(y_test, y_pred)
mse_dtr_cv = mean_squared_error(y_test, y_pred)
r2_dtr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_dtr_cv, mse_dtr_cv, r2_dtr_cv]})
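# Context for the scores (added): a useful model should beat a trivial
# baseline that always predicts the training-mean price.
from sklearn.dummy import DummyRegressor
baseline = DummyRegressor(strategy='mean').fit(dtm, y_train)
print('Baseline MAE:', mean_absolute_error(y_test, baseline.predict(dtm_test)))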
from sklearn.ensemble import RandomForestRegressor
import time
# Tune the number of features considered at each split; the other
# hyperparameters are held fixed.
grid_values = {'max_features': np.linspace(1, 100, 50, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [50],
               'random_state': [88]}
tic = time.time()
rf = RandomForestRegressor()
rf_cv = GridSearchCV(rf, param_grid=grid_values, cv=5)
rf_cv.fit(dtm, y_train)
toc = time.time()
print('time:', round(toc - tic, 2), 's')
max_features = rf_cv.cv_results_['param_max_features'].data
r2_scores = rf_cv.cv_results_['mean_test_score']
plt.figure(figsize=(8, 6))
plt.xlabel('max_features', fontsize=16)
plt.ylabel('CV R^2', fontsize=16)
plt.scatter(max_features, r2_scores, s=3)
plt.plot(max_features, r2_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best parameters:', rf_cv.best_params_)
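# Which stemmed terms drive the forest's predictions? Impurity-based
# importances from the refitted best estimator (a rough diagnostic).
importances = pd.Series(rf_cv.best_estimator_.feature_importances_, index=dtm.columns)
print(importances.sort_values(ascending=False).head(10))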
y_pred = rf_cv.predict(dtm_test)
mae_rf_cv = mean_absolute_error(y_test, y_pred)
mse_rf_cv = mean_squared_error(y_test, y_pred)
r2_rf_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_rf_cv, mse_rf_cv, r2_rf_cv]})
from sklearn.ensemble import GradientBoostingRegressor
# Jointly tune the number of boosting stages and tree size (max_leaf_nodes);
# learning rate and the remaining parameters are held fixed.
grid_values = {'n_estimators': np.linspace(1, 100, 50, dtype='int32'),  # alternative: np.logspace(6, 12, num=7, base=2, dtype='int32')
               'learning_rate': [0.01],
               'max_leaf_nodes': np.linspace(2, 10, 8, dtype='int32'),
               'max_depth': [100],
               'min_samples_leaf': [10],
               'random_state': [88]}
tic = time.time()
gbr = GradientBoostingRegressor()
gbr_cv = GridSearchCV(gbr, param_grid=grid_values, cv=5)
gbr_cv.fit(dtm, y_train)
toc = time.time()
print('time:', round(toc-tic, 2),'s')
n_estimators = gbr_cv.cv_results_['param_n_estimators'].data
r2_scores = gbr_cv.cv_results_['mean_test_score']
plt.figure(figsize=(12, 8))
plt.xlabel('n_estimators', fontsize=16)
plt.ylabel('CV R^2', fontsize=16)
plt.grid(True, which='both')
# GridSearchCV enumerates the grid with parameter names in alphabetical order,
# the last varying fastest, so each block of M consecutive results holds all
# n_estimators values for one max_leaf_nodes setting.
N = len(grid_values['max_leaf_nodes'])
M = len(grid_values['n_estimators'])
for i in range(N):
    plt.scatter(n_estimators[M*i : M*i + M], r2_scores[M*i : M*i + M], s=30)
    plt.plot(n_estimators[M*i : M*i + M], r2_scores[M*i : M*i + M], linewidth=2,
             label='max leaf nodes = ' + str(grid_values['max_leaf_nodes'][i]))
plt.legend(loc='lower right')
plt.show()
y_pred = gbr_cv.predict(dtm_test)
mae_gbr_cv = mean_absolute_error(y_test, y_pred)
mse_gbr_cv = mean_squared_error(y_test, y_pred)
r2_gbr_cv = r2_score(y_test, y_pred)
pd.DataFrame({'Performance Metric': ['MAE', 'MSE', 'R2'],
              'Value': [mae_gbr_cv, mse_gbr_cv, r2_gbr_cv]})
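# Optional diagnostic: staged_predict yields predictions after each boosting
# stage, showing how test R^2 evolves for the refitted best estimator.
staged_r2 = [r2_score(y_test, pred)
             for pred in gbr_cv.best_estimator_.staged_predict(dtm_test)]
print('Test R^2 by stage (first, last):', round(staged_r2[0], 4), round(staged_r2[-1], 4))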
# Side-by-side comparison of the three cross-validated regression models.
models = (dtr_cv, rf_cv, gbr_cv)
model_names = ["Decision Tree Regressor (CV)",
               "Random Forest Regressor (CV)",
               "Gradient Boosting Regressor (CV)"]
mae = [mae_dtr_cv, mae_rf_cv, mae_gbr_cv]
mse = [mse_dtr_cv, mse_rf_cv, mse_gbr_cv]
r2 = [r2_dtr_cv, r2_rf_cv, r2_gbr_cv]
pd.DataFrame({'Regression Model': model_names, 'MAE': mae, 'MSE': mse, 'R2': r2})