!pip install beautifulsoup4
!pip install nltk

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# Keep only the first `total_num` rows of the training data.
total_num = 14000
fname = 'stack_stats_2020_train.csv'
stack = pd.read_csv(fname)
stack = stack.iloc[:total_num]
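# Binary target: 1 if the post's Score is at least 1, else 0.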
stack['Score_p'] = (stack['Score'] >= 1).astype(int)
stack.head(10)
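# --- Text preprocessing helpers: lower-casing, punctuation/digit removal, HTML stripping ---
# Note: CountVectorizer lowercases by default, so lower_case is optional here.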
def lower_case(text):
    return text.lower()

from string import punctuation

def remove_punctuation(document):
    no_punct = ''.join([character for character in document
                        if character not in punctuation])
    return no_punct

def remove_digit(document):
    no_digit = ''.join([character for character in document
                        if not character.isdigit()])
    return no_digit
def process_field(frame, field):
    # Strip HTML with BeautifulSoup and collapse newlines.
    # Note: iterate over `frame` (not the global `stack`), so the same
    # function works for the test set later.
    frame["{}_text".format(field)] = pd.Series(
        [BeautifulSoup(i, 'html.parser').get_text().strip().replace("\n", " ")
         for i in frame[field]], index=frame.index)
    return frame

process_field(stack, 'Body')
process_field(stack, 'Title')
stack["Tags_text"] = pd.DataFrame([" "
.join(i
.lstrip('<')
.rstrip('>')
.split("><"))]
for i in stack['Tags'])
stack.head(10)
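# --- Tokenise, remove English stopwords, and Porter-stem the cleaned text ---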
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(document):
    return [word for word in document if word not in stop_words]
from nltk.stem import PorterStemmer
porter = PorterStemmer()
def stemmer(document):
    return [porter.stem(word) for word in document]

def combine_changes(dataframe, field):
    # Tokenise -> remove stopwords -> stem, skipping missing values.
    return dataframe[field].dropna().apply(word_tokenize).apply(remove_stopwords).apply(stemmer)
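# Quick sanity check on a hypothetical sentence:
# stemmer(remove_stopwords(word_tokenize("the models are fitting"))) -> ['model', 'fit']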
combine_body = combine_changes(stack, 'Body_text')
# Titles: tokenise and drop stopwords, but leave them unstemmed.
combine_title = stack['Title_text'].apply(word_tokenize).apply(remove_stopwords)
combine_tags = combine_changes(stack, 'Tags_text')
print(combine_body, combine_title, combine_tags)
from nltk.tokenize.treebank import TreebankWordDetokenizer
body_detokenised = combine_body.apply(TreebankWordDetokenizer().detokenize)
title_detokenised = combine_title.apply(TreebankWordDetokenizer().detokenize)
tags_detokenised = combine_tags.apply(TreebankWordDetokenizer().detokenize)
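# --- Bag-of-words features: document-term matrices (DTMs) for body, title, and tags ---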
from sklearn.feature_extraction.text import CountVectorizer
countvec = CountVectorizer()
sparse_dtm = countvec.fit_transform(body_detokenised)
sparse_dtm
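# Rough density check of the sparse DTM (nonzeros / total entries); this
# motivates the min_df filtering below.
print('DTM density: {:.4f}'.format(sparse_dtm.nnz / (sparse_dtm.shape[0] * sparse_dtm.shape[1])))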
dtm = pd.DataFrame(sparse_dtm.toarray(),
                   columns=countvec.get_feature_names_out(),  # get_feature_names() on sklearn < 1.0
                   index=body_detokenised.index)  # align with rows kept after dropna()
frequencies = dtm.sum().sort_values(ascending=False)
print(frequencies[frequencies > 100])
plt.figure(figsize=(8, 6))
plt.hist(frequencies[frequencies > 50])
plt.xlabel('term frequency')
plt.ylabel('number of terms')
plt.show()
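# min_df=0.005 drops terms appearing in fewer than 0.5% of documents;
# min_df=0.2 (further below) keeps only very common terms, giving a compact DTM.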
countvec_2 = CountVectorizer(min_df=0.005)
sparse_dtm_2 = countvec_2.fit_transform(body_detokenised)
dtm_2 = pd.DataFrame(sparse_dtm_2.toarray(),
                     columns=countvec_2.get_feature_names_out(),
                     index=body_detokenised.index)
dtm_2.sum().sort_values(ascending=False)
countvec_3 = CountVectorizer(min_df=0.2)
sparse_dtm_3 = countvec_3.fit_transform(body_detokenised)
dtm_3 = pd.DataFrame(sparse_dtm_3.toarray(),
                     columns=countvec_3.get_feature_names_out(),
                     index=body_detokenised.index)
dtm_3_ = dtm_3.add_suffix('_body')

# Fit separate vectorisers for titles and tags so each keeps its own
# vocabulary; the test set is later *transformed* with these same objects.
countvec_title = CountVectorizer(min_df=0.005)
sparse_dtm_title = countvec_title.fit_transform(title_detokenised)
dtm_title = pd.DataFrame(sparse_dtm_title.toarray(),
                         columns=countvec_title.get_feature_names_out(),
                         index=title_detokenised.index)
dtm_title_ = dtm_title.add_suffix('_title')

countvec_tags = CountVectorizer(min_df=0.005)
sparse_dtm_tags = countvec_tags.fit_transform(tags_detokenised)
dtm_tags = pd.DataFrame(sparse_dtm_tags.toarray(),
                        columns=countvec_tags.get_feature_names_out(),
                        index=tags_detokenised.index)
dtm_tags_ = dtm_tags.add_suffix('_tags')
# Inner-join so only rows present in all three DTMs are kept.
final_dtm = dtm_tags_.join(dtm_title_, how='inner').join(dtm_3_, how='inner')
final_dtm
X_train = final_dtm
# Align labels with the rows that survived preprocessing.
y_train = stack.loc[X_train.index, 'Score_p'].astype('int32')
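# --- Test set: rebuild the same features, transforming with the vectorisers
# fitted on the training data so train and test share identical columns ---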
fname_test = 'stack_stats_2020_test.csv'
stack_test = pd.read_csv(fname_test)
stack_test['Score_p'] = (stack_test['Score'] >= 1).astype(int)
stack_test.shape
process_field(stack_test, 'Body')
process_field(stack_test, 'Title')
stack_test["Tags_text"] = pd.DataFrame([" "
.join(i
.lstrip('<')
.rstrip('>')
.split("><"))]
for i in stack['Tags'])
combine_body_test = combine_changes(stack_test, 'Body')
combine_title_test = combine_changes(stack_test, 'Title')
combine_tags_test = combine_changes(stack_test, 'Tags')
body_detokenised_test = combine_body_test.apply(TreebankWordDetokenizer().detokenize)
title_detokenised_test = combine_title_test.apply(TreebankWordDetokenizer().detokenize)
tags_detokenised_test = combine_tags_test.apply(TreebankWordDetokenizer().detokenize)
# Transform (not fit) with the vectorisers fitted on the training set.
sparse_dtm_test = countvec_3.transform(body_detokenised_test)
dtm_test = pd.DataFrame(sparse_dtm_test.toarray(),
                        columns=countvec_3.get_feature_names_out(),
                        index=body_detokenised_test.index)
dtm_test = dtm_test.add_suffix('_body')

sparse_dtm_title_test = countvec_title.transform(title_detokenised_test)
dtm_title_test = pd.DataFrame(sparse_dtm_title_test.toarray(),
                              columns=countvec_title.get_feature_names_out(),
                              index=title_detokenised_test.index)
dtm_title_test_ = dtm_title_test.add_suffix('_title')

sparse_dtm_tags_test = countvec_tags.transform(tags_detokenised_test)
dtm_tags_test = pd.DataFrame(sparse_dtm_tags_test.toarray(),
                             columns=countvec_tags.get_feature_names_out(),
                             index=tags_detokenised_test.index)
dtm_tags_test_ = dtm_tags_test.add_suffix('_tags')

final_dtm_test = dtm_tags_test_.join(dtm_title_test_, how='inner').join(dtm_test, how='inner')
# Reorder columns to match the training matrix exactly, and align labels.
final_dtm_test = final_dtm_test[X_train.columns]
y_test = stack_test.loc[final_dtm_test.index, 'Score_p'].astype('int32')
final_dtm_test
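# --- Decision tree: tune the cost-complexity pruning parameter ccp_alpha
# with 10-fold cross-validation ---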
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
grid_values = {'ccp_alpha': np.linspace(0, 0.1, 101)}
dtc = DecisionTreeClassifier(random_state=88)
dtc_cv = GridSearchCV(dtc, param_grid=grid_values, cv=10).fit(X_train, y_train)
ccp_alpha = dtc_cv.cv_results_['param_ccp_alpha'].data
ACC_scores = dtc_cv.cv_results_['mean_test_score']
plt.figure(figsize=(8, 6))
plt.xlabel('ccp_alpha', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.scatter(ccp_alpha, ACC_scores, s=3)
plt.plot(ccp_alpha, ACC_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best ccp_alpha', dtc_cv.best_params_)
from sklearn.tree import plot_tree
print('Node count =', dtc_cv.best_estimator_.tree_.node_count)
plt.figure(figsize=(20,10))
plot_tree(dtc_cv.best_estimator_,
          feature_names=X_train.columns,
          class_names=['0', '1'],
          filled=True,
          impurity=False,
          fontsize=12)
plt.show()
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
y_pred = dtc_cv.predict(final_dtm_test)
cm = confusion_matrix(y_test, y_pred)
dtc_cv_acc = accuracy_score(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", dtc_cv_acc)
from sklearn.metrics import roc_curve, auc

def plot_roc(y_true, y_score, label):
    # Shared helper: one ROC curve plus the chance diagonal.
    fpr, tpr, _ = roc_curve(y_true, y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(8, 6))
    plt.title('ROC Curve', fontsize=18)
    plt.xlabel('FPR', fontsize=16)
    plt.ylabel('TPR', fontsize=16)
    plt.xlim([-0.01, 1.00])
    plt.ylim([-0.01, 1.01])
    plt.plot(fpr, tpr, lw=3, label='{} (area = {:0.2f})'.format(label, roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=3, linestyle='--')
    plt.legend(loc='lower right', fontsize=14)
    plt.show()

# Score with predicted probabilities (not hard labels) so the curve traces all thresholds.
plot_roc(y_test, dtc_cv.predict_proba(final_dtm_test)[:, 1], 'Decision Tree CV')
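# --- Random forest: fixed hyperparameters first, then CV over max_features ---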
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_features=5, min_samples_leaf=5, n_estimators=500, random_state=88)
rf.fit(X_train, y_train)
y_pred = rf.predict(final_dtm_test)
cm = confusion_matrix(y_test, y_pred)  # recompute for this model (not the tree's)
print ("Confusion Matrix: \n", cm)
rf_acc = accuracy_score(y_test, y_pred)
print ("\nAccuracy:", rf_acc)
plot_roc(y_test, rf.predict_proba(final_dtm_test)[:, 1], 'Random Forest')
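# Tune max_features (features considered at each split) with 5-fold CV;
# min_samples_leaf and n_estimators are held fixed to keep the search cheap.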
import time
grid_values = {'max_features': np.linspace(1, 100, 50, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [50],
               'random_state': [88]}
tic = time.time()
rf_2 = RandomForestClassifier()
rf_cv = GridSearchCV(rf_2, param_grid=grid_values, cv=5)
rf_cv.fit(X_train, y_train)
toc = time.time()
print('time:', round(toc-tic, 2),'s')
max_features = rf_cv.cv_results_['param_max_features'].data
ACC_scores = rf_cv.cv_results_['mean_test_score']
plt.figure(figsize=(8, 6))
plt.xlabel('max_features', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.scatter(max_features, ACC_scores, s=3)
plt.plot(max_features, ACC_scores, linewidth=3)
plt.grid(True, which='both')
plt.tight_layout()
plt.show()
print('Best parameters', rf_cv.best_params_)
y_pred = rf_cv.predict(final_dtm_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
rf_cv_acc = accuracy_score(y_test, y_pred)
print ("\nAccuracy:", rf_cv_acc)
plot_roc(y_test, rf_cv.predict_proba(final_dtm_test)[:, 1], 'Random Forest CV')
# Ten most important features from the tuned forest (largest at the top).
sorted_idx = rf_cv.best_estimator_.feature_importances_.argsort()
feature_importances = rf_cv.best_estimator_.feature_importances_[sorted_idx[::-1]]
feature_names = X_train.columns[sorted_idx[::-1]]
plt.figure(figsize=(8, 7))
plt.barh(feature_names[:10][::-1], 100 * feature_importances[:10][::-1])
plt.xlabel('Importance (%)', fontsize=16)
plt.show()
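# --- Logistic regression on the same bag-of-words features ---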
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=88, max_iter=1000)  # raise max_iter so the solver converges on the wide DTM
logreg.fit(X_train, y_train)
y_prob = logreg.predict_proba(final_dtm_test)
y_pred = pd.Series([1 if x > 0.5 else 0 for x in y_prob[:,1]], index=y_test.index)
cm = confusion_matrix(y_test, y_pred)
logreg_acc = accuracy_score(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", logreg_acc)
plot_roc(y_test, y_prob[:, 1], 'Logistic Regression')
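# --- Gradient boosting: joint grid over n_estimators and max_leaf_nodes ---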
from sklearn.ensemble import GradientBoostingClassifier
grid_values = {'n_estimators': np.linspace(1, 100, 50, dtype='int32'),
               'learning_rate': [0.01],
               'max_leaf_nodes': np.linspace(2, 10, 8, dtype='int32'),
               'max_depth': [100],
               'min_samples_leaf': [10],
               'random_state': [88]}
tic = time.time()
gbc = GradientBoostingClassifier()
gbc_cv = GridSearchCV(gbc, param_grid=grid_values, cv=5)
gbc_cv.fit(X_train, y_train)
toc = time.time()
print('time:', round(toc-tic, 2),'s')
n_estimators = gbc_cv.cv_results_['param_n_estimators'].data
cv_acc_scores = gbc_cv.cv_results_['mean_test_score']
plt.figure(figsize=(12, 8))
plt.xlabel('n estimators', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.grid(True, which='both')
# One curve per max_leaf_nodes setting; n_estimators varies fastest in
# cv_results_, so each contiguous block of M rows shares one node count.
N = len(grid_values['max_leaf_nodes'])
M = len(grid_values['n_estimators'])
for i in range(N):
    plt.scatter(n_estimators[(M*i):(M*i)+M], cv_acc_scores[(M*i):(M*i)+M], s=30)
    plt.plot(n_estimators[(M*i):(M*i)+M], cv_acc_scores[(M*i):(M*i)+M], linewidth=2,
             label='max leaf nodes = ' + str(grid_values['max_leaf_nodes'][i]))
plt.legend(loc='lower right')
plt.show()
y_pred = gbc_cv.predict(final_dtm_test)
cm = confusion_matrix(y_test, y_pred)
gbc_acc = accuracy_score(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", gbc_acc)
plot_roc(y_test, gbc_cv.predict_proba(final_dtm_test)[:, 1], 'Gradient Boosting CV')
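# --- Linear discriminant analysis ---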
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
y_pred = lda.predict(final_dtm_test)
cm = confusion_matrix(y_test, y_pred)
lda_acc = accuracy_score(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", lda_acc)
plot_roc(y_test, lda.predict_proba(final_dtm_test)[:, 1], 'Linear Discriminant Analysis')
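# --- Summary table of test-set accuracies across all fitted models ---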
models = (dtc_cv, rf, rf_cv, gbc_cv, logreg, lda)
model_name = ["Decision Tree Classifier CV",
              "Random Forest",
              "Random Forest CV",
              "Gradient Boosting Classifier CV",
              "Logistic Regression",
              "Linear Discriminant Analysis"]
accuracy = [dtc_cv_acc, rf_acc, rf_cv_acc, gbc_acc, logreg_acc, lda_acc]
accuracy_rounded = ["{}%".format(round(i * 100, 3)) for i in accuracy]
pd.DataFrame({'Model': model_name, 'Test Accuracy': accuracy_rounded})
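# Bootstrap the test set to attach uncertainty to each accuracy:
# resample test rows with replacement, re-score the fitted model, and
# take the 2.5%/97.5% quantiles as a 95% confidence interval.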
# Number of bootstrap resamples per model.
num_samples = 100

def bootstrap_validation(test_data, test_label, model, metrics_list, sample=500, random_state=66):
    # Resample the test set with replacement and evaluate each metric per resample.
    rng = np.random.default_rng(random_state)  # honour random_state for reproducibility
    n_sample = sample
    n_metrics = len(metrics_list)
    output_array = np.zeros([n_sample, n_metrics])
    output_array[:] = np.nan
    for bs_iter in range(n_sample):
        bs_index = rng.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        for metrics_iter in range(n_metrics):
            metrics = metrics_list[metrics_iter]
            output_array[bs_iter, metrics_iter] = metrics(bs_label, bs_predicted)
    return pd.DataFrame(output_array)
for index, model in enumerate(models):
    bs = bootstrap_validation(final_dtm_test,
                              y_test,
                              model,
                              metrics_list=[accuracy_score],
                              sample=num_samples)
    CI = np.quantile(bs.iloc[:, 0], np.array([0.025, 0.975]))
    fig, axs = plt.subplots(ncols=2, figsize=(12, 5))
    axs[0].set_xlabel("Bootstrap Accuracy", fontsize=16)
    axs[1].set_xlabel("Bootstrap Accuracy - Test Set Accuracy", fontsize=16)
    axs[0].set_ylabel("Count", fontsize=16)
    axs[0].set_title(model_name[index], fontsize=14)
    axs[1].set_title(model_name[index], fontsize=14)
    axs[0].hist(bs.iloc[:, 0], bins=20, edgecolor='black', linewidth=2, color='green')
    axs[1].hist(bs.iloc[:, 0] - accuracy_score(y_test, model.predict(final_dtm_test)),
                bins=20, edgecolor='black', linewidth=2, color='green')
    # Dotted lines mark the 95% bootstrap confidence interval.
    axs[0].vlines(x=CI[0], ymin=0, ymax=5, color="black", linestyle="dotted")
    axs[0].vlines(x=CI[1], ymin=0, ymax=5, color="black", linestyle="dotted")
    plt.show()
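# Pairwise comparison: score two models on the *same* bootstrap resamples
# and histogram the accuracy differences; a difference distribution clear
# of zero suggests a real performance gap.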
def bootstrap_comparison(test_data, test_label, modelA, modelB, metrics_list, sample=500, random_state=66):
    # Per resample: metric(modelA) - metric(modelB) on the same bootstrap sample.
    rng = np.random.default_rng(random_state)
    n_sample = sample
    n_metrics = len(metrics_list)
    output_array = np.zeros([n_sample, n_metrics])
    output_array[:] = np.nan
    for bs_iter in range(n_sample):
        bs_index = rng.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predictedA = modelA.predict(bs_data)
        bs_predictedB = modelB.predict(bs_data)
        for metrics_iter in range(n_metrics):
            metrics = metrics_list[metrics_iter]
            output_array[bs_iter, metrics_iter] = (metrics(bs_label, bs_predictedA)
                                                   - metrics(bs_label, bs_predictedB))
    return pd.DataFrame(output_array)
for model_number in range(len(models)):
    for model_number_2 in range(len(models)):
        if model_number != model_number_2:
            bs_comp = bootstrap_comparison(final_dtm_test, y_test,
                                           models[model_number],
                                           models[model_number_2],
                                           metrics_list=[accuracy_score],
                                           sample=num_samples)
            plt.figure(figsize=(12, 5))
            plt.xlabel("Bootstrap Accuracy Difference", fontsize=16)
            plt.ylabel("Count", fontsize=16)
            plt.title("{} vs {} Accuracy Difference".format(model_name[model_number],
                                                            model_name[model_number_2]), fontsize=14)
            plt.hist(bs_comp.iloc[:, 0], bins=20, edgecolor='black', linewidth=2, color='green')
            plt.show()
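# --- Baseline: always predict class 1, as a reference point for the models above ---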
def baseline(test_data):
    # Constant classifier: predict class 1 for every row.
    return [1] * test_data.shape[0]

baseline_pred = baseline(final_dtm_test)
cm = confusion_matrix(y_test, baseline_pred)
baseline_acc = accuracy_score(y_test, baseline_pred)
print("Confusion matrix: \n", cm)
print("Accuracy: ", baseline_acc)