import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the Reddit comment export (10k rows; column names below match the
# Reddit comment schema — body, author, subreddit, permalink, ...).
lakers = pd.read_csv('lakers.csv')
lakers.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 53 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 all_awardings 10000 non-null object
1 associated_award 0 non-null float64
2 author 10000 non-null object
3 author_flair_background_color 0 non-null float64
4 author_flair_css_class 2756 non-null object
5 author_flair_richtext 9659 non-null object
6 author_flair_template_id 2223 non-null object
7 author_flair_text 2007 non-null object
8 author_flair_text_color 3128 non-null object
9 author_flair_type 9659 non-null object
10 author_fullname 9659 non-null object
11 author_patreon_flair 9659 non-null object
12 author_premium 9659 non-null object
13 awarders 9545 non-null object
14 body 10000 non-null object
15 collapsed_because_crowd_control 0 non-null float64
16 comment_type 1 non-null object
17 created_utc 10000 non-null int64
18 gildings 10000 non-null object
19 id 10000 non-null object
20 is_submitter 10000 non-null bool
21 link_id 10000 non-null object
22 locked 10000 non-null bool
23 no_follow 10000 non-null bool
24 parent_id 10000 non-null object
25 permalink 10000 non-null object
26 retrieved_on 9545 non-null float64
27 score 10000 non-null int64
28 send_replies 10000 non-null bool
29 stickied 10000 non-null bool
30 subreddit 10000 non-null object
31 subreddit_id 10000 non-null object
32 top_awarded_type 0 non-null float64
33 total_awards_received 10000 non-null int64
34 treatment_tags 10000 non-null object
35 author_cakeday 33 non-null object
36 distinguished 85 non-null object
37 collapsed_reason_code 68 non-null object
38 archived 455 non-null object
39 body_sha1 455 non-null object
40 can_gild 455 non-null object
41 collapsed 455 non-null object
42 collapsed_reason 8 non-null object
43 controversiality 455 non-null float64
44 gilded 455 non-null float64
45 retrieved_utc 455 non-null float64
46 score_hidden 455 non-null object
47 subreddit_name_prefixed 455 non-null object
48 subreddit_type 455 non-null object
49 media_metadata 33 non-null object
50 edited 12 non-null float64
51 unrepliable_reason 0 non-null float64
52 editable 1 non-null object
dtypes: bool(5), float64(10), int64(3), object(35)
memory usage: 3.7+ MB
lakers.describe()
associated_awardfloat64
author_flair_background_colorfloat64
count
0
0
mean
nan
nan
std
nan
nan
min
nan
nan
25%
nan
nan
50%
nan
nan
75%
nan
nan
max
nan
nan
# Drop comments whose body was moderated away, in a single pass
# (equivalent to the two chained != filters, one scan instead of two).
lakers = lakers[~lakers['body'].isin(['[removed]', '[deleted]'])]
text = lakers['body']
lakers.head()
all_awardingsobject
associated_awardfloat64
0
[]
nan
1
[]
nan
2
[]
nan
3
[]
nan
4
[]
nan
# Vectorized lowercasing; the .str accessor propagates missing values as NaN,
# whereas apply(str.lower) raises TypeError on any non-string element.
text_lowercase = text.str.lower()
text_lowercase
Keep only letters and spaces; remove every other character (note: digits are removed too)
##removing punctuation
from string import punctuation
def remove_punctuation(document):
    """Return *document* with everything except ASCII letters and spaces removed.

    Note: despite the name, digits and punctuation are BOTH dropped — only
    a-z, A-Z and ' ' survive (same whitelist as the original hand-typed one,
    now built from string.ascii_letters so it cannot be mistyped).

    Parameters
    ----------
    document : str

    Returns
    -------
    str
    """
    from string import ascii_letters  # local import keeps the helper self-contained
    allowed = frozenset(ascii_letters + ' ')
    return ''.join(ch for ch in document if ch in allowed)
##removing punct
# Apply the character whitelist to every (already lowercased) comment body.
text_no_punct = text_lowercase.apply(remove_punctuation)
text_no_punct
tokenization
import nltk
# Punkt tokenizer models are required by word_tokenize below.
nltk.download('punkt')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
from nltk.tokenize import word_tokenize
# Split each cleaned comment into a list of word tokens.
text_tokenized = text_no_punct.apply(word_tokenize)
text_tokenized.head()
Removing stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Use a set so membership tests in remove_stopwords are O(1) per token.
stop_words = set(stopwords.words('english'))
stop_words
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
def remove_stopwords(document):
    """Return the tokens of *document* that are not English stopwords.

    Relies on the module-level ``stop_words`` set (NLTK English list).

    Parameters
    ----------
    document : list[str]
        Token list produced by word_tokenize.

    Returns
    -------
    list[str]
    """
    # `word not in` is the idiomatic form of the original `not word in`.
    return [word for word in document if word not in stop_words]
text_no_stop = text_tokenized.apply(remove_stopwords)
text_no_stop
Stemming
# from nltk.stem import PorterStemmer
# porter = PorterStemmer()
# def stemmer(document):
# stemmed_document = [porter.stem(word) for word in document]
# return stemmed_document
# text_stemmed = text_no_stop.apply(stemmer)
# text_stemmed
import nltk
# VADER lexicon backs the SentimentIntensityAnalyzer used for scoring below.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
sia = SIA()
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
# Sanity check: summed per-word VADER compound score for the first comment.
total_compound = sum(sia.polarity_scores(word)['compound'] for word in text_no_stop[0])
total_compound
# Score every comment: sum of per-word VADER compound scores
# (comprehension replaces the original manual append loop — same values).
# NOTE(review): scoring word-by-word bypasses VADER's negation/intensifier
# handling; scoring each full comment string may be more faithful — confirm intent.
compound_scores = [
    sum(sia.polarity_scores(word)['compound'] for word in comment)
    for comment in text_no_stop
]
lakers['compound_score'] = compound_scores
lakers
all_awardingsobject
[]99.9%
[{'award_sub_type': 'GLOBAL', 'award_type': 'global', 'awardings_required_to_grant_benefits': None, 'coin_price': 80, 'coin_reward': 0, 'count': 1, 'days_of_drip_extension': 0, 'days_of_premium': 0, 'description': 'Everything is better with a good hug', 'end_date': None, 'giver_coin_reward': 0, 'icon_format': 'PNG', 'icon_height': 2048, 'icon_url': 'https://i.redd.it/award_images/t5_q0gj4/ks45ij6w05f61_oldHugz.png', 'icon_width': 2048, 'id': 'award_8352bdff-3e03-4189-8a08-82501dd8f835', 'is_enabled': True, 'is_new': False, 'name': 'Hugz', 'penny_donate': 0, 'penny_price': 0, 'resized_icons': [{'height': 16, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/ks45ij6w05f61_oldHugz.png?width=16&height=16&auto=webp&s=73a23bf7f08b633508dedf457f2704c522b94a04', 'width': 16}, {'height': 32, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/ks45ij6w05f61_oldHugz.png?width=32&height=32&auto=webp&s=50f2f16e71d2929e3d7275060af3ad6b851dbfb1', 'width': 32}, {'height': 48, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/ks45ij6w05f61_oldHugz.png?width=48&height=48&auto=webp&s=ca487311563425e195699a4d7e4c57a98cbfde8b', 'width': 48}, {'height': 64, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/ks45ij6w05f61_oldHugz.png?width=64&height=64&auto=webp&s=7b4eedcffb1c09a826e7837532c52979760f1d2b', 'width': 64}, {'height': 128, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/ks45ij6w05f61_oldHugz.png?width=128&height=128&auto=webp&s=e4d5ab237eb71a9f02bb3bf9ad5ee43741918d6c', 'width': 128}], 'resized_static_icons': [{'height': 16, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/fpm0r5ryq1361_PolarHugs.png?width=16&height=16&auto=webp&s=69997ace3ef4ffc099b81d774c2c8f1530602875', 'width': 16}, {'height': 32, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/fpm0r5ryq1361_PolarHugs.png?width=32&height=32&auto=webp&s=e9519d1999ef9dce5c8a9f59369cb92f52d95319', 'width': 32}, {'height': 48, 'url': 
'https://preview.redd.it/award_images/t5_q0gj4/fpm0r5ryq1361_PolarHugs.png?width=48&height=48&auto=webp&s=f076c6434fb2d2f9075991810fd845c40fa73fc6', 'width': 48}, {'height': 64, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/fpm0r5ryq1361_PolarHugs.png?width=64&height=64&auto=webp&s=85527145e0c4b754306a30df29e584fd16187636', 'width': 64}, {'height': 128, 'url': 'https://preview.redd.it/award_images/t5_q0gj4/fpm0r5ryq1361_PolarHugs.png?width=128&height=128&auto=webp&s=b8843cdf82c3b741d7af057c14076dcd2621e811', 'width': 128}], 'start_date': None, 'static_icon_height': 2048, 'static_icon_url': 'https://i.redd.it/award_images/t5_q0gj4/fpm0r5ryq1361_PolarHugs.png', 'static_icon_width': 2048, 'subreddit_coin_reward': 0, 'subreddit_id': None, 'tiers_by_required_awardings': None}]0%
7 others0.1%
associated_awardfloat64
NaN - NaN
0
[]
nan
1
[]
nan
2
[]
nan
3
[]
nan
4
[]
nan
5
[]
nan
6
[]
nan
7
[]
nan
8
[]
nan
9
[]
nan
# Label a comment Negative (1) when its summed compound score is <= -0.1.
# NOTE(review): the int32 -> object cast looks unintentional; y is cast
# back to int32 before modeling below — confirm the object dtype is needed.
lakers['Negative'] = (lakers['compound_score'] <= -0.1).astype('int32').astype('object')
lakers.head()
lakers.to_csv('updated_lakers.csv')
from nltk.tokenize.treebank import TreebankWordDetokenizer
# Re-join the stopword-filtered token lists into plain strings for CountVectorizer.
text_detokenized = text_no_stop.apply(TreebankWordDetokenizer().detokenize)
text_detokenized
from sklearn.feature_extraction.text import CountVectorizer
# Keep only tokens appearing in at least 1% of comments.
countvec = CountVectorizer(min_df=0.01)
sparse_dtm = countvec.fit_transform(text_detokenized)
sparse_dtm
# get_feature_names_out replaces get_feature_names (deprecated in sklearn 1.0,
# removed in 1.2) — identical values, silences the FutureWarning logged above.
dtm = pd.DataFrame(sparse_dtm.toarray(), columns=countvec.get_feature_names_out(), index=lakers.index)
dtm.head(5)
dtm.to_csv('dtm1.csv')
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.
warnings.warn(msg, category=FutureWarning)
from sklearn.model_selection import train_test_split
# Target: the Negative flag from the VADER compound score, cast back to int.
y = lakers['Negative'].astype('int32')
X = dtm
# Stratified 70/30 split keeps the negative/non-negative ratio in both sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=88)
X_train.shape, X_test.shape
y.head()
dtm.head()
actuallyint64
adint64
0
0
0
1
0
0
2
0
0
3
0
0
4
0
0
# Persist the design matrix and labels for reuse outside this notebook.
X.to_csv('X1.csv')
y.to_csv('y1.csv')
Linear Discriminant Analysis
##LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Baseline linear classifier on the document-term matrix.
lda = LinearDiscriminantAnalysis()
lda.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Evaluate LDA on the held-out 30%.
y_pred = lda.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
Confusion Matrix:
[[2216 73]
[ 411 199]]
Accuracy: 0.8330458778889273
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(random_state=88)
logreg.fit(X_train, y_train)
y_prob = logreg.predict_proba(X_test)
# Threshold the positive-class probability at 0.5, vectorized — same labels
# as the original per-element list comprehension, one numpy pass.
y_pred = pd.Series((y_prob[:, 1] > 0.5).astype(int), index=y_test.index)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
Confusion Matrix:
[[2224 65]
[ 415 195]]
Accuracy: 0.8344256640220766
#Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
# Shallow tree (depth 3, >=5 samples per leaf) to limit overfitting on the sparse DTM.
dtc = DecisionTreeClassifier(max_depth = 3, min_samples_leaf=5,random_state = 88)
dtc = dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
Confusion Matrix:
[[2253 36]
[ 498 112]]
Accuracy: 0.8157985512245602
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
print('Node count =', dtc.tree_.node_count)
# Visualize the fitted tree (argument lines lost their indentation in this
# export but parse fine inside the call parentheses).
plt.figure(figsize=(15,10))
plot_tree(dtc,
feature_names=X_train.columns,
class_names=['0','1'],
filled=True,
impurity=False,
rounded=True,
fontsize=12)
plt.show()
Node count = 15
# #DTC
# #First GridsearchCV for best ccp
# from sklearn.model_selection import GridSearchCV
# grid_values = {'ccp_alpha': np.linspace(0, 0.1, 101)}
# dtc = DecisionTreeClassifier(random_state=88)
# dtc_cv = GridSearchCV(dtc, param_grid=grid_values, cv=10).fit(X_train, y_train)
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
# 500 trees; 5 candidate features per split, >=5 samples per leaf.
rf = RandomForestClassifier(max_features=5, min_samples_leaf=5, n_estimators=500, random_state=88)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
Confusion Matrix:
[[2275 14]
[ 486 124]]
Accuracy: 0.8275267333563298
#Gradient Boosting Classifier
from sklearn.ensemble import GradientBoostingClassifier
# random_state added for run-to-run reproducibility, consistent with every
# other model in this notebook (all use random_state=88); the logged numbers
# above may shift slightly on re-run as a result.
gbc = GradientBoostingClassifier(n_estimators = 3300, max_leaf_nodes = 10, random_state = 88)
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)
cm2 = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix: \n", cm2)
print ("\nAccuracy:", accuracy_score(y_test, y_pred))
Confusion Matrix:
[[2175 114]
[ 392 218]]
Accuracy: 0.8254570541566058
pip install bootstrapped
Collecting bootstrapped
Downloading bootstrapped-0.0.2.tar.gz (11 kB)
Requirement already satisfied: matplotlib>=1.5.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from bootstrapped) (3.5.1)
Requirement already satisfied: numpy>=1.11.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from bootstrapped) (1.19.5)
Requirement already satisfied: pandas>=0.18.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from bootstrapped) (1.2.5)
Requirement already satisfied: fonttools>=4.22.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (4.28.3)
Requirement already satisfied: packaging>=20.0 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (21.3)
Requirement already satisfied: python-dateutil>=2.7 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (2.8.2)
Requirement already satisfied: cycler>=0.10 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (0.11.0)
Requirement already satisfied: pyparsing>=2.2.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (3.0.6)
Requirement already satisfied: kiwisolver>=1.0.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (1.3.2)
Requirement already satisfied: pillow>=6.2.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from matplotlib>=1.5.3->bootstrapped) (8.4.0)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.18.1->bootstrapped) (2021.3)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7->matplotlib>=1.5.3->bootstrapped) (1.16.0)
Building wheels for collected packages: bootstrapped
Building wheel for bootstrapped (setup.py) ... done
Created wheel for bootstrapped: filename=bootstrapped-0.0.2-py2.py3-none-any.whl size=13954 sha256=851f6ec1841bec0c254731563d4e7fb0b1c7b48ccb5ea7b421aac5cfa8c29102
Stored in directory: /root/.cache/pip/wheels/15/55/6a/9a722f067ac4c3dfab359ed2ec7906b9cc6649156d9886bd59
Successfully built bootstrapped
Installing collected packages: bootstrapped
Successfully installed bootstrapped-0.0.2
WARNING: You are using pip version 20.1.1; however, version 21.3.1 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.
#Bootstrapping
import numpy as np
# NOTE(review): `bootstrapped` is imported but never used by the code visible
# below — bootstrap_validation does its own resampling. Confirm it's needed.
import bootstrapped.bootstrap as bs
import bootstrapped.stats_functions as bs_stats
def bootstrap_validation(test_data, test_label, train_label, model, sample=500, random_state=66):
    """Bootstrap the test-set accuracy of an already-fitted classifier.

    Draws `sample` bootstrap resamples (with replacement) of the test set,
    scores `model` on each, and returns the per-resample accuracies.

    Fixes vs. the original:
    - `time.time()` was called without `time` ever being imported (NameError);
      the unused timer is removed.
    - `random_state` was accepted but never used, so results were not
      reproducible; it now seeds the resampling.

    Parameters
    ----------
    test_data : pd.DataFrame
        Test features; resampled by index label.
    test_label : pd.Series
        True labels aligned with `test_data`'s index.
    train_label : unused
        Kept only for interface compatibility with existing callers.
    model : fitted estimator exposing ``.predict``.
    sample : int
        Number of bootstrap iterations.
    random_state : int
        Seed for the resampling RNG.

    Returns
    -------
    pd.DataFrame
        Single column of `sample` accuracy values in [0, 1].
    """
    rng = np.random.RandomState(random_state)
    output_array = np.full((sample, 1), np.nan)
    print(output_array.shape)
    for bs_iter in range(sample):
        # Resample the test set with replacement, same size as the original.
        bs_index = rng.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        # Plain accuracy; equivalent to sklearn.metrics.accuracy_score for 1-D labels.
        output_array[bs_iter] = np.mean(np.asarray(bs_label) == np.asarray(bs_predicted))
    return pd.DataFrame(output_array)