# general imports
import numpy as np
import pandas as pd
import re, string
import datetime
from random import randint
import random
from collections import Counter
#scipy imports
from scipy.stats import rankdata, hmean, norm
from scipy.sparse import hstack
import time
#plotting imports
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
#Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
#NLTK imports
import nltk
from nltk.tag import pos_tag
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import classify
from nltk import NaiveBayesClassifier
import nltk.classify
# Load the tweet corpus. Column names are passed explicitly through
# `names`, in the order the fields appear in the CSV.
columns = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv("/work/training.1600000.processed.noemoticon.csv", names=columns)

# For every tweet store the position of the first '#' in its text
# (str.find gives -1 when there is none), then keep only the tweets
# that contain a hashtag in a separate frame.
df["hashtag"] = df["text"].map(lambda tweet: str.find(tweet, "#"))
has_hashtag_mask = df["hashtag"].ne(-1)
hashtag_df = df.loc[has_hashtag_mask]
#defining global function
def preproccessing(data):
    """
    Creates and returns a bag of words tf-idf sparse matrix using the given
    data.

    data is an iterable of text documents (for example a pandas Series of
    strings). A new CountVectorizer and a new TfidfTransformer are fitted
    on every call, so the columns of the returned matrix are specific to
    the data passed in; the fitted vectorizer is not returned.
    """
    # Raw token counts per document.
    counts = CountVectorizer().fit_transform(data)
    # Re-weight the counts with tf-idf (default TfidfTransformer settings).
    return TfidfTransformer().fit_transform(counts)
# Frame used for the machine-learning experiments. No copy is made:
# ml_df is the same object as df, so the lower-casing below is also
# visible through df.
ml_df = df
ml_df['text'] = ml_df['text'].map(str.lower)

# The sentiment targets are the labels; tweet text and date strings are
# each turned into a bag-of-words tf-idf matrix so a model can use them.
labels = ml_df['target']
text = preproccessing(ml_df['text'])
date = preproccessing(ml_df['date'])

# Put the text columns and the date columns side by side.
features = hstack([text, date])
#defining global function
def train_and_add(model, data_array, test_size,
                  features_func=features, labels_func=labels):
    """
    Fits `model` on a random train/test split and records its accuracies.

    test_size is a percentage (it is divided by 100 before being handed
    to train_test_split). The row
    [type(model), test_size, training accuracy, testing accuracy]
    is appended to data_array, and that same list is returned.
    features_func / labels_func default to the module-level features and
    labels as they were when this function was defined.
    """
    holdout_fraction = test_size / 100
    split = train_test_split(
        features_func, labels_func, test_size=holdout_fraction
    )
    features_train, features_test, labels_train, labels_test = split

    print("training model")
    clf = model.fit(features_train, labels_train)
    print("finished")

    train_acc = accuracy_score(labels_train, clf.predict(features_train))
    test_acc = accuracy_score(labels_test, clf.predict(features_test))

    result_row = [type(model), test_size, train_acc, test_acc]
    data_array.append(result_row)
    return data_array
"""
Get accuracy data of the LinearSVC model at different test sizes and
save it to a df with columns model, test_size, train, test with train
and test being the training accuracy and test being the testing
accuracy. Then preprocess the dataframe so that you can graph train
and test on one plot.
"""
# One result row per test size, from 10% to 90% in steps of 10.
accuracy_data_svc = []
for test_pct in range(10, 100, 10):
    print(f"In loop for {test_pct}% test size")
    train_and_add(LinearSVC(), accuracy_data_svc, test_pct)

# Wide table: one row per test size with both accuracies.
svc_columns = ['model', 'test_size', 'train', 'test']
accuracy_df_svc = pd.DataFrame(accuracy_data_svc, columns=svc_columns)

# Long table: 'cols' says train or test and 'vals' holds the accuracy,
# which lets both curves be drawn from a single frame.
accuracy_df_svc = accuracy_df_svc.melt(
    id_vars=['test_size', 'model'],
    var_name='cols',
    value_name='vals',
)
In loop for 10% test size
training model
finished
In loop for 20% test size
training model
finished
In loop for 30% test size
training model
finished
In loop for 40% test size
training model
finished
In loop for 50% test size
training model
finished
In loop for 60% test size
training model
finished
In loop for 70% test size
training model
finished
In loop for 80% test size
training model
finished
In loop for 90% test size
training model
finished
"""
Get accuracy data of the MultinomialNB model at different test sizes and
save it to a df with columns model, test_size, train, test with train
and test being the training accuracy and test being the testing
accuracy. Then preprocess the dataframe so that you can graph train
and test on one plot.
"""
# One result row per test size, from 10% to 90% in steps of 10.
accuracy_data_nb = []
for test_pct in range(10, 100, 10):
    print(f"In loop for {test_pct}% test size")
    train_and_add(MultinomialNB(), accuracy_data_nb, test_pct)

# Wide table: one row per test size with both accuracies.
nb_columns = ['model', 'test_size', 'train', 'test']
accuracy_df_nb = pd.DataFrame(accuracy_data_nb, columns=nb_columns)

# Long table: 'cols' says train or test and 'vals' holds the accuracy,
# which lets both curves be drawn from a single frame.
accuracy_df_nb = accuracy_df_nb.melt(
    id_vars=['test_size', 'model'],
    var_name='cols',
    value_name='vals',
)
In loop for 10% test size
training model
finished
In loop for 20% test size
training model
finished
In loop for 30% test size
training model
finished
In loop for 40% test size
training model
finished
In loop for 50% test size
training model
finished
In loop for 60% test size
training model
finished
In loop for 70% test size
training model
finished
In loop for 80% test size
training model
finished
In loop for 90% test size
training model
finished
"""
Graphs the previous data
"""
sns.set()
fig, ax = plt.subplots(1, figsize=(7, 7))

# Colours per model, in the order they are handed to seaborn as palette.
nb_colors = ["red", "peru"]
svc_colors = ["royalblue", "darkslateblue"]

# Naive Bayes curves first, then the (thicker) LinearSVC curves, all on
# the same axes.
sns.lineplot(
    data=accuracy_df_nb,
    x='test_size',
    y='vals',
    hue='cols',
    palette=nb_colors,
    linewidth=2,
    ax=ax,
)
sns.lineplot(
    data=accuracy_df_svc,
    x='test_size',
    y='vals',
    hue='cols',
    palette=svc_colors,
    linewidth=3,
    ax=ax,
)

plt.title("Accuracy V.S. Test size", fontsize=16)
plt.xlabel("Test Size (%)", fontsize=14)
plt.ylabel("Accuracy", fontsize=14)

# Hand-built legend entries so each line carries a model-specific label.
svc_train = mlines.Line2D([], [], color=svc_colors[0], label='SVC Train')
svc_test = mlines.Line2D([], [], color=svc_colors[1], label='SVC Test')
nb_train = mlines.Line2D([], [], color=nb_colors[0], label='NB Train')
nb_test = mlines.Line2D([], [], color=nb_colors[1], label='NB Test')
legend_handles = [svc_train, svc_test, nb_train, nb_test]
plt.legend(handles=legend_handles, shadow=True, fontsize=12)
# testing code for question 1
# Hold out 5% of the rows, fit a LinearSVC on the rest and report its
# accuracy on both parts.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.05
)
clf = LinearSVC().fit(features_train, labels_train)
train_acc = accuracy_score(labels_train, clf.predict(features_train))
test_acc = accuracy_score(labels_test, clf.predict(features_test))
print("The training Accuracy is:", train_acc)
print("The testing accuracy is:", test_acc)
print("------")
# Spot-check three randomly chosen held-out tweets.
for i in range(3):
    # randint includes both end points, so the last valid position in the
    # test split is its length minus one.
    rand_int = randint(0, labels_test.shape[0] - 1)
    # rand_int is a position inside the test split only. labels_test still
    # carries the row labels of ml_df, so the tweet text is looked up by
    # label to make it match the label and prediction printed with it.
    tweet_text = ml_df.loc[labels_test.index[rand_int], 'text']
    print("Real Value of " + tweet_text
          + " is: " + str(labels_test.iloc[rand_int]))
    print("and the predicted value is: "
          + str(clf.predict(features_test[rand_int, :])[0]))
The training Accuracy is: 0.8949625
The testing accuracy is: 0.8150625
------
Real Value of @kelliekk hmmm...let me check it out. i'm sorry is: 4
and the predicted value is: 4
Real Value of znowu jestem housewife... is: 0
and the predicted value is: 4
Real Value of @therealjordin there's no beach here..but i am pretty sure its nice outside..im going to the movies today though.. you're lucky!! is: 0
and the predicted value is: 0
# downloads
# NLTK data packages fetched for this notebook.
for package_name in ('punkt', 'stopwords',
                     'averaged_perceptron_tagger', 'wordnet'):
    nltk.download(package_name)
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data] Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data] /root/nltk_data...
[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Unzipping corpora/wordnet.zip.
"""
cutting the size down for the df
and separate data into positive and negative tweets
"""
# Random 30% sample of df, split by target value: rows with target 4 go
# to positive_tweets and rows with target 0 to negative_tweets.
small_df = df.sample(frac=0.3)
positive_mask = small_df['target'].eq(4)
negative_mask = small_df['target'].eq(0)
positive_tweets = small_df.loc[positive_mask]
negative_tweets = small_df.loc[negative_mask]
"""
converts tweets into tokens
"""
# Tokenise every tweet and store the token list in a 'tokens' column.
tt = TweetTokenizer()
positive_tweets_tokens = positive_tweets['text'].map(tt.tokenize)
negative_tweets_tokens = negative_tweets['text'].map(tt.tokenize)
positive_tweets = positive_tweets.assign(tokens=positive_tweets_tokens)
negative_tweets = negative_tweets.assign(tokens=negative_tweets_tokens)

# Tag each token with its part of speech; the tagged lists go into a
# 'pos_tag' column of new frames.
positive_tweets_tagged = positive_tweets['tokens'].map(pos_tag)
negative_tweets_tagged = negative_tweets['tokens'].map(pos_tag)
positive_tweets_pos_tagged = positive_tweets.assign(
    pos_tag=positive_tweets_tagged
)
negative_tweets_pos_tagged = negative_tweets.assign(
    pos_tag=negative_tweets_tagged
)
"""
Removes all hyperlinks, twitter handles, punctuation-only tokens
and stop words from the given (token, tag) pairs. Lemmatizes and
lower-cases all the remaining tweet tokens (hashtags are kept).
Returns a list of the normalized and lemmatized tweet tokens.
https://www.digitalocean.com/community/tutorials/
how-to-perform-sentiment-analysis-in-python-3-using
-the-natural-language-toolkit-nltk
"""
# Compiled once instead of being looked up for every token. Raw strings
# keep the backslashes in the pattern without invalid-escape warnings.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_HANDLE_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")


def remove_noise(tweet_tokens, stop_words=()):
    """
    Cleans one POS-tagged tweet and returns its normalized tokens.

    tweet_tokens is an iterable of (token, tag) pairs. Hyperlinks and
    twitter handles are stripped from each token, the remainder is
    lemmatized (tags starting with "NN" as nouns, "VB" as verbs, anything
    else as adjectives) and lower-cased. Tokens that end up empty, that
    are found inside string.punctuation, or whose lower-cased form is in
    stop_words are dropped. Hashtag tokens such as "#fail" are kept.

    stop_words is any iterable of lower-case words to exclude.
    Returns a list of the cleaned, lower-cased tokens in original order.
    """
    # Set lookup instead of scanning the stop-word list for every token.
    excluded = frozenset(stop_words)
    # One lemmatizer per call rather than one per token.
    lemmatizer = WordNetLemmatizer()

    cleaned_tokens = []
    for token, tag in tweet_tokens:
        token = _URL_PATTERN.sub('', token)
        token = _HANDLE_PATTERN.sub('', token)
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()
        # `in string.punctuation` is a substring test, so it also drops
        # runs of adjacent punctuation characters such as "./".
        if token and token not in string.punctuation\
                and lowered not in excluded:
            cleaned_tokens.append(lowered)
    return cleaned_tokens
"""
converts the df to a list of lists
"""
# Pull the 'pos_tag' column of each frame out as a plain Python list with
# one entry (the tagged token list) per tweet.
positive_twts = positive_tweets_pos_tagged['pos_tag']\
    .values.flatten().tolist()
negative_twts = negative_tweets_pos_tagged['pos_tag']\
    .values.flatten().tolist()

# Shallow copies of the two lists.
total_positive_twts = list(positive_twts)
total_negative_twts = list(negative_twts)
"""
normalizes the tokens, removes unnecessary info,
and removes all previously given tags
"""
# Run every tagged tweet through remove_noise with the English stop-word
# list; each result is a list of cleaned tokens without the tags.
stop_words = stopwords.words('english')
positive_tokens_clean = [
    remove_noise(tagged_tweet, stop_words)
    for tagged_tweet in total_positive_twts
]
negative_tokens_clean = [
    remove_noise(tagged_tweet, stop_words)
    for tagged_tweet in total_negative_twts
]
"""
collects the tokens of the cleaned tweets
and calculates the most commonly used words
"""
# Each entry of *_tokens_clean is the list of cleaned token strings of one
# tweet (the tags are already gone), so every token of every tweet is
# counted. Positive and negative words are kept in separate lists so that
# freq_dist_pos only reflects positive tweets.
all_pos_words = [
    token for tokens in positive_tokens_clean for token in tokens
]
all_neg_words = [
    token for tokens in negative_tokens_clean for token in tokens
]
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)
"""
Converts list of tokens to dictionary of tokens.
Returns the dictionary of tokens where
the key is the token and True is the value.
"""
def get_tweets_for_model(cleaned_tokens_list):
    """
    Converts each list of tokens into a feature dictionary.

    cleaned_tokens_list is an iterable of token lists. This is a
    generator: for every token list it yields one dictionary whose keys
    are the tokens and whose values are all True. Repeated tokens collapse
    into a single key and an empty token list yields an empty dictionary.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield {token: True for token in tweet_tokens}
# Lazy streams of {token: True} feature dictionaries, one per tweet.
positive_tokens_for_model = get_tweets_for_model(positive_tokens_clean)
negative_tokens_for_model = get_tweets_for_model(negative_tokens_clean)

# Attach the class label to every feature dictionary (this consumes the
# two generators above), merge both classes and shuffle them in place.
positive_dataset = [
    (feature_dict, "Positive") for feature_dict in positive_tokens_for_model
]
negative_dataset = [
    (feature_dict, "Negative") for feature_dict in negative_tokens_for_model
]
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

# The first 200000 shuffled examples train the model, the rest test it.
train_data, test_data = dataset[:200000], dataset[200000:]
"""
trains the classifier and prints the 20
most informative features and their sentiment ratio
"""
classifier = nltk.NaiveBayesClassifier.train(train_data)
# show_most_informative_features prints the table itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None".
classifier.show_most_informative_features(20)
Most Informative Features
farrah = True Negati : Positi = 31.0 : 1.0
depressing = True Negati : Positi = 21.3 : 1.0
sadness = True Negati : Positi = 20.1 : 1.0
infection = True Negati : Positi = 19.6 : 1.0
sad = True Negati : Positi = 19.4 : 1.0
toothache = True Negati : Positi = 19.0 : 1.0
disappointed = True Negati : Positi = 18.8 : 1.0
migraine = True Negati : Positi = 17.2 : 1.0
gahh = True Negati : Positi = 17.0 : 1.0
heartbreaking = True Negati : Positi = 17.0 : 1.0
died = True Negati : Positi = 17.0 : 1.0
congratulation = True Positi : Negati = 17.0 : 1.0
sinus = True Negati : Positi = 16.6 : 1.0
boooo = True Negati : Positi = 16.4 : 1.0
hates = True Negati : Positi = 16.2 : 1.0
#followfriday = True Positi : Negati = 15.9 : 1.0
sadly = True Negati : Positi = 15.7 : 1.0
poisoning = True Negati : Positi = 15.7 : 1.0
bleed = True Negati : Positi = 15.0 : 1.0
strep = True Negati : Positi = 15.0 : 1.0
None
# Preprocess the data into format where we can graph.
# A word is plotted when it occurs more than THRESHOLD times in the
# positive tweets or in the negative tweets.
# NOTE(review): the counts below cover every token of every tweet, so this
# value may need to be raised to keep the scatter plot readable.
THRESHOLD = 90

# Each entry of *_tokens_clean is the list of cleaned token strings of one
# tweet, so all of its tokens are counted.
positive = [token for tokens in positive_tokens_clean for token in tokens]
negative = [token for tokens in negative_tokens_clean for token in tokens]
positve_word_counts = dict(Counter(positive))
negative_word_counts = dict(Counter(negative))

# The 'Postive' spelling is kept on purpose: the plotting code selects the
# column by exactly this key.
combined = {'Postive':positve_word_counts,'Negative':negative_word_counts}
word_count_df = pd.DataFrame(combined)
# A word that never occurs in one class has a count of 0 there.
word_count_df = word_count_df.fillna(0)
greater_negative = word_count_df['Negative'] > THRESHOLD
greater_positive = word_count_df['Postive'] > THRESHOLD
# .copy() so the column added below is written to an independent frame
# rather than to a filtered view of the unfiltered one.
word_count_df = word_count_df[greater_negative | greater_positive].copy()
word_count_df['word'] = word_count_df.index
# Scatter plot of each word's count in positive tweets (x) against its
# count in negative tweets (y); every point is labelled with its word.
fig = (
    px.scatter(word_count_df, x="Postive", y="Negative", text="word")
    .update_traces(textposition='top center')
    .update_layout(
        title_text='Positivity vs Negativity of words in tweets',
        height=500,
    )
)
fig.show()
# Prints the classifier's accuracy on the training data and on the test
# data (labelled so the two numbers can be told apart), then the 20 most
# informative features and their sentiment ratio.
print("Training accuracy:", classify.accuracy(classifier, train_data))
print("Testing accuracy:", classify.accuracy(classifier, test_data))
# show_most_informative_features prints the table itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None".
classifier.show_most_informative_features(20)
Accuracy: 0.821105
Accuracy: 0.7478785714285714
Most Informative Features
freezing = True Negati : Positi = 27.1 : 1.0
sadden = True Negati : Positi = 27.1 : 1.0
farrah = True Negati : Positi = 24.7 : 1.0
depressed = True Negati : Positi = 24.3 : 1.0
carradine = True Negati : Positi = 23.1 : 1.0
#fail = True Negati : Positi = 22.4 : 1.0
#followfriday = True Positi : Negati = 20.9 : 1.0
migraine = True Negati : Positi = 19.9 : 1.0
sad = True Negati : Positi = 18.8 : 1.0
heartbreaking = True Negati : Positi = 18.4 : 1.0
depressing = True Negati : Positi = 18.2 : 1.0
depress = True Negati : Positi = 17.7 : 1.0
boohoo = True Negati : Positi = 17.0 : 1.0
mourn = True Negati : Positi = 16.4 : 1.0
strep = True Negati : Positi = 16.4 : 1.0
laughter = True Positi : Negati = 16.3 : 1.0
disappointing = True Negati : Positi = 15.9 : 1.0
condolence = True Negati : Positi = 15.7 : 1.0
arghhh = True Negati : Positi = 15.0 : 1.0
cmt = True Negati : Positi = 15.0 : 1.0
None
"""
Takes the preproccessed hashtag_df and transforms it into a new dataframe
that connects the hashtag to a label.
"""
# Module-level accumulator of [hashtag word, target] pairs; convert_data
# adds to it as a side effect.
hashtag_to_label = []


def convert_data(row):
    """
    Collects the hashtag words of one dataframe row.

    Every whitespace-separated word of row.text that contains '#' is
    added to hashtag_to_label as [word, row.target]. Returns None.
    """
    tagged_words = [
        [word, row.target] for word in row.text.split() if '#' in word
    ]
    hashtag_to_label.extend(tagged_words)
# Run convert_data over every row of hashtag_df; the result of apply is
# not needed because convert_data fills hashtag_to_label as a side effect.
hashtag_df.apply(convert_data, axis=1)

# One row per collected hashtag word, paired with its tweet's target.
label_hashtag_df = pd.DataFrame(
    hashtag_to_label, columns=['hashtag', 'target']
)
# Intended to set the first 3 rows aside for later testing.
# NOTE(review): despite its name, first_3_hashtags does NOT hold the first
# three rows. drop() removes the rows whose index labels are given, so the
# first statement produces a copy of label_hashtag_df WITHOUT its first
# three rows. The second statement then removes the same three rows from
# label_hashtag_df in place, leaving both frames with the same rows.
# The prediction printing further down pairs row i of first_3_hashtags
# with row i of hashtag_features (built from label_hashtag_df after this
# drop) and therefore depends on the two frames being aligned this way.
# label_hashtag_df.head(3) would give the actual first three rows, but
# confirm the intended hold-out design before changing this.
first_3_hashtags = label_hashtag_df.drop(
label_hashtag_df.head(3).index,inplace=False)
label_hashtag_df.drop(
label_hashtag_df.head(3).index,inplace=True)
"""
Splits the preprocessed data into features and labels then trains a
LinearSVC model on the given data at different test sizes ploting the result.
"""
# Hashtag words become tf-idf features; the tweet targets are the labels.
hashtag_labels = label_hashtag_df['target']
hashtag_features = preproccessing(label_hashtag_df['hashtag'])

# One result row per test size, from 10% to 90% in steps of 10.
hashtag_accuracy_data = []
for test_pct in range(10, 100, 10):
    train_and_add(
        LinearSVC(),
        hashtag_accuracy_data,
        test_pct,
        features_func=hashtag_features,
        labels_func=hashtag_labels,
    )

# Wide table first, then long form ('cols' = train/test, 'vals' =
# accuracy) so both curves can be drawn from one frame.
hashtag_columns = ['model', 'test_size', 'train', 'test']
hashtag_accuracy_df = pd.DataFrame(
    hashtag_accuracy_data, columns=hashtag_columns
)
hashtag_accuracy_df = hashtag_accuracy_df.melt(
    id_vars=['test_size', 'model'],
    var_name='cols',
    value_name='vals',
)

sns.lineplot(
    data=hashtag_accuracy_df,
    x='test_size',
    y='vals',
    hue='cols',
    palette=["red", "peru"],
    linewidth=2,
)
plt.title("Accuracy VS. Test Size (#'s only)")
plt.xlabel("Test Size(%)")
plt.ylabel("Accuracy (out of 1)")
training model
finished
training model
finished
training model
finished
training model
finished
training model
finished
training model
finished
training model
finished
training model
finished
training model
finished
"""
Prints 4 different hashtags, their predicted value and real value (of the
tweet that they were in) in order to show whether taking the sentiment of
a # is a good way to predict tweet sentiment
"""
# Fit a LinearSVC on every hashtag row (no hold-out split here).
clf = LinearSVC()
clf.fit(hashtag_features, hashtag_labels)
def print_predict_real(data, model, features, i):
    """
    Prints the real and the predicted value for one hashtag.

    data is a frame with 'hashtag' and 'target' columns, model is an
    object with a predict method, features is an indexable matrix and i
    is a position: row i of data supplies the hashtag and its real
    value, row i of features is what the model predicts on.
    """
    row = data.iloc[i]
    real_line = "Real Value of " + row['hashtag'] + " is: "
    print(real_line + str(row['target']))
    predicted = model.predict(features[i, :])[0]
    print("and the predicted value is: " + str(predicted))
# Rows 0-2 of first_3_hashtags, each followed by a separator line, then
# the row at position 30000 of label_hashtag_df.
for row_pos in (0, 1, 2):
    print_predict_real(first_3_hashtags, clf, hashtag_features, row_pos)
    print("--------")
print_predict_real(label_hashtag_df, clf, hashtag_features, 30000)
Real Value of #TTSC? is: 0
and the predicted value is: 0
--------
Real Value of #24 is: 0
and the predicted value is: 4
--------
Real Value of #gayforpeavy is: 0
and the predicted value is: 0
--------
Real Value of #mmwanted is: 4
and the predicted value is: 4