import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
lakers = pd.read_csv('lakers.csv')
lakers.info()
lakers.describe()
# drop comments whose body was removed or deleted on Reddit
lakers = lakers[lakers['body'] != '[removed]']
lakers = lakers[lakers['body'] != '[deleted]']
text = lakers['body']
lakers.head()
text_lowercase = text.apply(str.lower)
text_lowercase
# Removing punctuation: keep only letters and spaces (digits are dropped as well)
def remove_punctuation(document):
    # whitelist of characters to keep: ASCII letters and spaces
    whitelist = set('abcdefghijklmnopqrstuvwxyz ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    return ''.join(filter(whitelist.__contains__, document))
## apply punctuation removal
text_no_punct = text_lowercase.apply(remove_punctuation)
text_no_punct
# Tokenization
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
text_tokenized = text_no_punct.apply(word_tokenize)
text_tokenized.head()
# Removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
stop_words
def remove_stopwords(document):
    # drop tokens found in the NLTK English stop-word list
    return [word for word in document if word not in stop_words]
text_no_stop = text_tokenized.apply(remove_stopwords)
text_no_stop
# Stemming
# from nltk.stem import PorterStemmer
# porter = PorterStemmer()
# def stemmer(document):
#     stemmed_document = [porter.stem(word) for word in document]
#     return stemmed_document
# text_stemmed = text_no_stop.apply(stemmer)
# text_stemmed
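# The stemming step above is left commented out. If normalization is wanted
# without Porter's aggressive truncation, one alternative (an assumption, not
# part of the original pipeline) is WordNet lemmatization, kept commented here
# to match the block above:
# from nltk.stem import WordNetLemmatizer
# nltk.download('wordnet')
# lemmatizer = WordNetLemmatizer()
# text_lemmatized = text_no_stop.apply(lambda doc: [lemmatizer.lemmatize(w) for w in doc])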
# VADER sentiment scoring
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()
# sum per-word VADER compound scores for the first comment
# (.iloc[0] is positional; label 0 may have been dropped by the filtering above)
total_compound = 0
for word in text_no_stop.iloc[0]:
    total_compound = total_compound + sia.polarity_scores(word)['compound']
total_compound
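# VADER models negation and intensifiers across a sentence, so summing
# per-word compound scores is only a rough proxy. A minimal sketch of scoring
# each whole comment instead (assumes text_lowercase from above; this is not
# part of the original pipeline):
sentence_compound = text_lowercase.apply(lambda doc: sia.polarity_scores(doc)['compound'])
sentence_compound.head()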
# repeat the per-word scoring for every comment
compound_scores = []
for comment in text_no_stop:
    total_compound = 0
    for word in comment:
        total_compound = total_compound + sia.polarity_scores(word)['compound']
    compound_scores.append(total_compound)
lakers['compound_score'] = compound_scores
lakers
# label a comment as Negative when its summed compound score is at most -0.1
lakers['Negative'] = (lakers['compound_score'] <= -0.1).astype('int32').astype('object')
lakers.head()
lakers.to_csv('updated_lakers.csv')
# rejoin the cleaned tokens into strings so CountVectorizer can consume them
from nltk.tokenize.treebank import TreebankWordDetokenizer
text_detokenized = text_no_stop.apply(TreebankWordDetokenizer().detokenize)
text_detokenized
from sklearn.feature_extraction.text import CountVectorizer
# keep only tokens that appear in at least 1% of comments
countvec = CountVectorizer(min_df=0.01)
sparse_dtm = countvec.fit_transform(text_detokenized)
sparse_dtm
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=countvec.get_feature_names_out(), index=lakers.index)
dtm.head(5)
dtm.to_csv('dtm1.csv')
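# Quick sanity check on the vocabulary kept by min_df=0.01 (a sketch, not in
# the original; get_feature_names_out requires scikit-learn >= 1.0):
print('Vocabulary size:', len(countvec.get_feature_names_out()))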
from sklearn.model_selection import train_test_split
y = lakers['Negative'].astype('int32')
X = dtm
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=88)
X_train.shape, X_test.shape
y.head()
dtm.head()
X.to_csv('X1.csv')
y.to_csv('y1.csv')
# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
y_pred = lda.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=88)
logreg.fit(X_train, y_train)
y_prob = logreg.predict_proba(X_test)
# thresholding P(y=1) at 0.5 is equivalent to logreg.predict(X_test)
y_pred = pd.Series([1 if x > 0.5 else 0 for x in y_prob[:, 1]], index=y_test.index)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dtc = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5, random_state=88)
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
print('Node count =', dtc.tree_.node_count)
plt.figure(figsize=(15, 10))
plot_tree(dtc,
          feature_names=X_train.columns,
          class_names=['0', '1'],
          filled=True,
          impurity=False,
          rounded=True,
          fontsize=12)
plt.show()
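# A compact text rendering of the same tree can be easier to scan than the
# plot; sklearn.tree.export_text is used here as a sketch (not in the original):
from sklearn.tree import export_text
print(export_text(dtc, feature_names=list(X_train.columns)))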
# # DTC: grid search over ccp_alpha for the best cost-complexity pruning
# from sklearn.model_selection import GridSearchCV
# grid_values = {'ccp_alpha': np.linspace(0, 0.1, 101)}
# dtc = DecisionTreeClassifier(random_state=88)
# dtc_cv = GridSearchCV(dtc, param_grid=grid_values, cv=10).fit(X_train, y_train)
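# # One possible continuation (an assumption, not in the original; kept
# # commented like the search above): read off the best pruning strength
# # and evaluate the refit tree.
# print('Best ccp_alpha:', dtc_cv.best_params_['ccp_alpha'])
# best_dtc = dtc_cv.best_estimator_
# print('Pruned-tree accuracy:', accuracy_score(y_test, best_dtc.predict(X_test)))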
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(max_features=5, min_samples_leaf=5, n_estimators=500, random_state=88)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(n_estimators=3300, max_leaf_nodes=10)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm2)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
# %pip runs the install inside the active notebook kernel
%pip install bootstrapped
#Bootstrapping
import time
import numpy as np
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats

def bootstrap_validation(test_data, test_label, train_label, model, sample=500, random_state=66):
    tic = time.time()  # requires the `import time` added above
    n_sample = sample
    output_array = np.zeros([n_sample, 1])
    output_array[:] = np.nan
    print(output_array.shape)
    for bs_iter in range(n_sample):
        # resample the test set with replacement and score the fitted model on it
        bs_index = np.random.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        output_array[bs_iter] = accuracy_score(bs_label, bs_predicted)
    output_df = pd.DataFrame(output_array)
    return output_df
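# The function above is never called in this section. A minimal usage sketch
# (an assumption, not from the original): bootstrap the random forest's test
# accuracy and read off a 95% percentile interval.
rf_boot = bootstrap_validation(X_test, y_test, y_train, rf, sample=500)
lower, upper = np.percentile(rf_boot[0], [2.5, 97.5])
print('95% bootstrap CI for accuracy: [%.3f, %.3f]' % (lower, upper))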