import pandas as pd
import re as regex
# Load the raw tweet datasets: one Trump CSV plus two Biden CSVs
# (an older scrape and a newer API export) that are merged below.
trump = pd.read_csv('raw_data/trump_tweets.csv')
old_biden = pd.read_csv('raw_data/biden_tweets_old.csv')
new_biden = pd.read_csv('raw_data/biden_tweets_api.csv')
def tweet_link_only(tweet):
    '''
    Return True if the tweet consists solely of a single t.co link.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    bool
        True when the whole tweet is one http(s)://t.co/... URL,
        False otherwise.
    '''
    # Escape the dot in "t.co" — the original pattern's bare "." matched
    # any character, so e.g. "https://tXco/abc" was wrongly classified
    # as a link-only tweet. Anchors ensure nothing surrounds the link.
    pattern = r'^https?://t\.co/\S+$'
    return bool(regex.match(pattern, tweet))
def transform_tweet(tweet):
    '''
    Normalize a tweet for modelling.

    Strips t.co links, expands common contraction fragments
    ("'re" -> " are ", "'ll" -> " will "), tidies stray "&;" / "&,"
    artifacts, removes any leftover "https://" prefix, and lower-cases
    the result.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, lower-cased tweet text.
    '''
    # (pattern, replacement, max substitutions); count=0 means unlimited.
    substitutions = (
        (r'https?://t.co/(\S)+|\Ws\W', ' ', 0),
        (r'\Wre\W', ' are ', 5),
        (r'\Wll\W', ' will ', 5),
        (r'&[;,]', '&', 5),
    )
    for pattern, replacement, limit in substitutions:
        tweet = regex.sub(pattern, replacement, tweet, count=limit)
    return tweet.replace('https://', '').lower()
# --- Biden: merge the old scrape with the newer API export ---
# Normalize the old scrape's columns to the shared ID/Datetime/Tweet schema.
old_biden = old_biden[['id', 'timestamp', 'tweet']].rename(columns={'id': 'ID', 'timestamp': 'Datetime', 'tweet': 'Tweet'})
# Unit-less astype('datetime64') is rejected by pandas >= 2.0;
# pd.to_datetime is the supported conversion and parses strings too.
old_biden.Datetime = pd.to_datetime(old_biden.Datetime)
new_biden = new_biden.rename(columns={'Time': 'Datetime'})
new_biden.Datetime = pd.to_datetime(new_biden.Datetime)
biden = pd.concat([old_biden, new_biden])
biden['Author'] = 'Joe Biden'
# The two sources overlap: keep one row per tweet ID.
biden.drop_duplicates('ID', inplace=True)
# Drop tweets that are nothing but a link, newest tweets first.
biden = biden[~biden.Tweet.apply(tweet_link_only)].sort_values(by=['Datetime'], ascending=False).reset_index(drop=True)
biden.Tweet = biden.Tweet.apply(transform_tweet)
biden.head(3)
# --- Trump: keep original tweets only (isRetweet flag == 'f') ---
trump = trump.loc[trump.isRetweet=='f', ['id', 'date', 'text']]
trump['Author'] = 'Donald J. Trump'
trump.rename(columns={'id': 'ID', 'text': 'Tweet', 'date': 'Datetime'}, inplace=True)
# Unit-less astype('datetime64') is rejected by pandas >= 2.0;
# pd.to_datetime is the supported conversion and parses strings too.
trump.Datetime = pd.to_datetime(trump.Datetime)
# Drop tweets that are nothing but a link, newest tweets first.
trump = trump[~trump.Tweet.apply(tweet_link_only)].sort_values(by=['Datetime'], ascending=False).reset_index(drop=True)
trump.drop_duplicates('ID', inplace=True)
trump.Tweet = trump.Tweet.apply(transform_tweet)
trump.head(3)
# Combined, cleaned dataset for downstream modelling.
pd.concat([biden, trump]).to_csv('tweets.csv', index=False)
from datetime import datetime

# Restrict both authors to a common window: 2018 onward, capped at the
# last Trump tweet so the Biden set does not extend past it.
start_date = datetime(2018, 1, 1, 1, 1, 1)
end_date = trump.Datetime.max()
biden_trim = biden[(biden.Datetime > start_date) & (biden.Datetime < end_date)]
trump_trim = trump[trump.Datetime > start_date]
# Balanced dataset: downsample Trump to the Biden count. A fixed
# random_state (matching train_test_split below) makes tweets_trim.csv
# reproducible across runs — previously the sample changed every run.
pd.concat([biden_trim, trump_trim.sample(n=len(biden_trim), random_state=1)]).to_csv('tweets_trim.csv', index=False)
# Unbalanced variant with every trimmed tweet from both authors.
pd.concat([biden_trim, trump_trim]).to_csv('tweets_trim_2.csv', index=False)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
from nb_plotter import plot_confusion_matrix, plot_top_features
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
from gensim.parsing.preprocessing import remove_stopwords
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Train/test split on the balanced dataset; labels are the author names.
tweets = pd.read_csv('tweets_trim.csv')
y = tweets.Author
X_train, X_test, y_train, y_test = train_test_split(tweets.Tweet, y, random_state=1, test_size=.33)
# Three feature representations over the same split: TF-IDF, TF-IDF with
# English stop words removed, and raw term counts. max_df/min_df discard
# terms appearing in more than 90% or fewer than 5% of training tweets.
tfidf_vectorizer = TfidfVectorizer(max_df=.9, min_df=.05)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)
tfidf_nsw_vectorizer = TfidfVectorizer(stop_words='english', max_df=.9, min_df=.05)
tfidf_nsw_train = tfidf_nsw_vectorizer.fit_transform(X_train)
tfidf_nsw_test = tfidf_nsw_vectorizer.transform(X_test)
count_vectorizer = CountVectorizer(max_df=.9, min_df=.05)
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)
def _fit_nb(train_features, test_features):
    '''
    Fit a MultinomialNB on the given training features and evaluate it.

    Uses the module-level y_train/y_test labels. Returns the fitted
    model, its test-set predictions, and the accuracy score.
    '''
    model = MultinomialNB()
    model.fit(train_features, y_train)
    pred = model.predict(test_features)
    return model, pred, metrics.accuracy_score(y_test, pred)

# One Naive Bayes classifier per feature representation; the three
# blocks were identical, so the fit/predict/score logic is shared.
tfidf_nb, tfidf_nb_pred, tfidf_nb_score = _fit_nb(tfidf_train, tfidf_test)
tfidf_nsw_nb, tfidf_nsw_nb_pred, tfidf_nsw_nb_score = _fit_nb(tfidf_nsw_train, tfidf_nsw_test)
count_nb, count_nb_pred, count_nb_score = _fit_nb(count_train, count_test)
print('NaiveBayes Tfidf Score: ', tfidf_nb_score)
print('NaiveBayes Tfidf Score (w/o stop words): ', tfidf_nsw_nb_score)
print('NaiveBayes Count Score: ', count_nb_score)
# Confusion matrix per NB model; the explicit labels list fixes the row/
# column order (rows = true author, columns = predicted author).
tfidf_nb_cm = metrics.confusion_matrix(y_test, tfidf_nb_pred, labels=['Donald J. Trump', 'Joe Biden'])
plot_confusion_matrix(tfidf_nb_cm, classes=['Trump', 'Biden'], title="TF-IDF NB Confusion Matrix")
tfidf_nsw_nb_cm = metrics.confusion_matrix(y_test, tfidf_nsw_nb_pred, labels=['Donald J. Trump', 'Joe Biden'])
# NOTE(review): figure appears to select a separate figure window per plot
# (project-local nb_plotter helper) — confirm its default for the first call.
plot_confusion_matrix(tfidf_nsw_nb_cm, classes=['Trump', 'Biden'], title="TF-IDF NB Confusion Matrix, no stop words", figure=1)
count_nb_cm = metrics.confusion_matrix(y_test, count_nb_pred, labels=['Donald J. Trump', 'Joe Biden'])
plot_confusion_matrix(count_nb_cm, classes=['Trump', 'Biden'], title='Count NB Confusion Matrix', figure=2)
# Bug fix: tfidf_svc / tfidf_svc_nsw were referenced here but never
# defined, so these lines raised NameError. LinearSVC is imported above
# yet was unused — train the two SVC models before plotting their
# most significant features.
tfidf_svc = LinearSVC()
tfidf_svc.fit(tfidf_train, y_train)
tfidf_svc_nsw = LinearSVC()
tfidf_svc_nsw.fit(tfidf_nsw_train, y_train)
plot_top_features(tfidf_svc, tfidf_vectorizer, top_features=10, title='Most significant (stop) words')
plot_top_features(tfidf_svc_nsw, tfidf_nsw_vectorizer, top_features=10, title='Most significant key words')