import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
lakers = pd.read_csv('lakers.csv')
lakers.info()
lakers.describe()
# drop comments whose body was removed or deleted on Reddit
lakers = lakers[lakers['body'] != '[removed]']
lakers = lakers[lakers['body'] != '[deleted]']
text = lakers['body']
lakers.head()
text_lowercase = text.apply(str.lower)
text_lowercase
# Removing punctuation: keep only letters and spaces (digits are dropped as well)
def remove_punctuation(document):
    # whitelist of characters to keep: ASCII letters and spaces
    whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return ''.join(filter(whitelist.__contains__, document))
## apply punctuation removal
text_no_punct = text_lowercase.apply(remove_punctuation)
text_no_punct
# Tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
text_tokenized = text_no_punct.apply(word_tokenize)
text_tokenized.head()
# Removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words
def remove_stopwords(document):
    # drop tokens found in the NLTK English stop-word list
    return [word for word in document if word not in stop_words]
text_no_stop = text_tokenized.apply(remove_stopwords)
text_no_stop
# Stemming
# from nltk.stem import PorterStemmer
# porter = PorterStemmer()
# def stemmer(document):
#     stemmed_document = [porter.stem(word) for word in document]
#     return stemmed_document
# text_stemmed = text_no_stop.apply(stemmer)
# text_stemmed
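# The stemming step above is left commented out. If normalization is wanted
# without Porter's aggressive truncation, one alternative (an assumption, not
# part of the original pipeline) is WordNet lemmatization, kept commented here
# to match the block above:
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()
# text_lemmatized = text_no_stop.apply(lambda doc: [lemmatizer.lemmatize(w) for w in doc])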
# VADER sentiment scoring
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()
# sum per-word VADER compound scores for the first comment
# (.iloc[0] is positional; label 0 may have been dropped by the filtering above)
total_compound = 0
for word in text_no_stop.iloc[0]:
    total_compound = total_compound + sia.polarity_scores(word)['compound']
total_compound
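# VADER models negation and intensifiers across a sentence, so summing
# per-word compound scores is only a rough proxy. A minimal sketch of scoring
# each whole comment instead (assumes text_lowercase from above; this is not
# part of the original pipeline):
sentence_compound = text_lowercase.apply(lambda doc: sia.polarity_scores(doc)['compound'])
sentence_compound.head()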
# repeat the per-word scoring for every comment
compound_scores = []
for comment in text_no_stop:
    total_compound = 0
    for word in comment:
        total_compound = total_compound + sia.polarity_scores(word)['compound']
    compound_scores.append(total_compound)
lakers['compound_score'] = compound_scores
lakers
# label a comment as Negative when its summed compound score is at most -0.1
lakers['Negative'] = (lakers['compound_score'] <= -0.1).astype('int32').astype('object')
lakers.head()
lakers.to_csv('updated_lakers.csv')
# rejoin the cleaned tokens into strings so CountVectorizer can consume them
from nltk.tokenize.treebank import TreebankWordDetokenizer
text_detokenized = text_no_stop.apply(TreebankWordDetokenizer().detokenize)
text_detokenized
from sklearn.feature_extraction.text import CountVectorizer
# keep only tokens that appear in at least 1% of comments
countvec = CountVectorizer(min_df=0.01)
sparse_dtm = countvec.fit_transform(text_detokenized)
sparse_dtm
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=countvec.get_feature_names_out(), index=lakers.index)
dtm.head(5)
dtm.to_csv('dtm1.csv')
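# Quick sanity check on the vocabulary kept by min_df=0.01 (a sketch, not in
# the original; get_feature_names_out requires scikit-learn >= 1.0):
print('Vocabulary size:', len(countvec.get_feature_names_out()))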
from sklearn.model_selection import train_test_split
y = lakers['Negative'].astype('int32')
X = dtm
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=88)
X_train.shape, X_test.shape
y.head()
dtm.head()
X.to_csv('X1.csv')
y.to_csv('y1.csv')
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
y_pred = lda.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=88)
logreg.fit(X_train, y_train)
y_prob = logreg.predict_proba(X_test)
# thresholding P(y=1) at 0.5 is equivalent to logreg.predict(X_test)
y_pred = pd.Series([1 if x > 0.5 else 0 for x in y_prob[:, 1]], index=y_test.index)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dtc = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=88)
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
print('Node count =', dtc.tree_.node_count)
plt.figure(figsize=(15, 10))
plot_tree(dtc,
          feature_names=X_train.columns,
          class_names=['0', '1'],
          filled=True,
          impurity=False,
          rounded=True,
          fontsize=12)
plt.show()
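# A compact text rendering of the same tree can be easier to scan than the
# plot; sklearn.tree.export_text is used here as a sketch (not in the original):
from sklearn.tree import export_text
print(export_text(dtc, feature_names=list(X_train.columns)))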
# # DTC: grid search over ccp_alpha for the best cost-complexity pruning
# from sklearn.model_selection import GridSearchCV
# grid_values = {'ccp_alpha': np.linspace(0, 0.1, 101)}
# dtc = DecisionTreeClassifier(random_state=88)
# dtc_cv = GridSearchCV(dtc, param_grid=grid_values, cv=10).fit(X_train, y_train)
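# # One possible continuation (an assumption, not in the original; kept
# # commented like the search above): read off the best pruning strength
# # and evaluate the refit tree.
# print('Best ccp_alpha:', dtc_cv.best_params_['ccp_alpha'])
# best_dtc = dtc_cv.best_estimator_
# print('Pruned-tree accuracy:', accuracy_score(y_test, best_dtc.predict(X_test)))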
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_features=5, min_samples_leaf=5, n_estimators=500, random_state=88)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=3300, max_leaf_nodes=10)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm2)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
# %pip runs the install inside the active notebook kernel
%pip install bootstrapped
#Bootstrapping
import time
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats

def bootstrap_validation(test_data, test_label, train_label, model, sample=500, random_state=66):
    tic = time.time()  # requires the `import time` added above
    n_sample = sample
    output_array = np.zeros([n_sample, 1])
    output_array[:] = np.nan
    print(output_array.shape)
    for bs_iter in range(n_sample):
        # resample the test set with replacement and score the fitted model on it
        bs_index = np.random.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        output_array[bs_iter] = accuracy_score(bs_label, bs_predicted)
    output_df = pd.DataFrame(output_array)
    return output_df
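# The function above is never called in this section. A minimal usage sketch
# (an assumption, not from the original): bootstrap the random forest's test
# accuracy and read off a 95% percentile interval.
rf_boot = bootstrap_validation(X_test, y_test, y_train, rf, sample=500)
lower, upper = np.percentile(rf_boot[0], [2.5, 97.5])
print('95% bootstrap CI for accuracy: [%.3f, %.3f]' % (lower, upper))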