import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
!pip install transformers
!pip install simpletransformers==0.32.3
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
import torch
from collections import Counter
pd.set_option('display.max_colwidth', None)  # -1 is deprecated in recent pandas; None shows full column content
tweets = pd.read_csv('tweets.csv')
display(tweets.head(1))
tweets = tweets[['handle','text','is_retweet']]
print("Number of Tweets : ",len(tweets))
print("Null Count in the 3 columns : ")
print(tweets.isna().sum())
print("Number of Tweets from Doland and Hillary : ")
tweets['handle'].value_counts()
tweets['tweetLen'] = tweets['text'].apply(lambda x : len(x.split(" ")))
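# Quick look at the word-count distribution; tweets are short, which motivates the
# small max_seq_length used for fine-tuning further below.
print(tweets['tweetLen'].describe())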
doland_tweets = tweets[tweets['handle'] == 'realDonaldTrump'].copy()  # .copy() avoids SettingWithCopyWarning when adding columns later
print("Donald tweets:")
display(doland_tweets['text'].head(5))
hillary_tweets = tweets[tweets['handle'] == 'HillaryClinton'].copy()
print("Hillary tweets:")
display(hillary_tweets['text'].head(5))
doland_tweets['tweetLen'].hist(bins=32, alpha=0.6, label='Donald')
hillary_tweets['tweetLen'].hist(bins=32, alpha=0.6, label='Hillary')
plt.legend(); plt.show()
def getWordCloud(df, col):
    # Concatenate all lowercased text in df[col] and render a word cloud
    comment_words = ''
    stopwords = set(STOPWORDS)
    for val in df[col]:
        tokens = str(val).lower().split()
        comment_words += " ".join(tokens) + " "
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=10).generate(comment_words)
    plt.figure(figsize=(5, 5), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
getWordCloud(doland_tweets,'text')
getWordCloud(hillary_tweets,'text')
# Print the 20 most frequently tagged Twitter handles
def getTwitterHandlesTagged(df, col):
    taggedHandlesList = []
    for tweet in df[col].tolist():
        taggedHandlesList += [x for x in tweet.split(" ") if x.startswith('@')]
    print(Counter(taggedHandlesList).most_common(20))
getTwitterHandlesTagged(doland_tweets, 'text')
getTwitterHandlesTagged(hillary_tweets, 'text')
# Print the 20 most frequently used hashtags
def getTags(df, col):
    tagsList = []
    for tweet in df[col].tolist():
        tagsList += [x for x in tweet.split(" ") if x.startswith('#')]
    print(Counter(tagsList).most_common(20))
getTags(doland_tweets, 'text')
getTags(hillary_tweets, 'text')
hillary_tweets['textLwr'] = hillary_tweets['text'].str.lower()
hillary_tweets['hasHillarySubString'] = hillary_tweets['textLwr'].str.contains('hillary')
display(hillary_tweets[hillary_tweets['hasHillarySubString'] == True]['text'].head(10))
# Print the 20 most frequent quote attributions (tokens starting with '—')
def getQuoteAuthor(df, col):
    quoteAuthorList = []
    for tweet in df[col].tolist():
        quoteAuthorList += [x for x in tweet.split(" ") if x.startswith('—')]
    print(Counter(quoteAuthorList).most_common(20))
getQuoteAuthor(hillary_tweets, 'text')
getQuoteAuthor(doland_tweets, 'text')
def removeTagTaggedHandlesQuoteAuthor(text):
    # Drop @mentions, #hashtags, and '—' quote attributions in a single pass
    return " ".join([x for x in text.split(" ") if not x.startswith(("@", "#", "—"))])
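# Minimal check of the cleaner on a made-up tweet (illustrative input, not from the dataset):
print(removeTagTaggedHandlesQuoteAuthor("Thank you @someone for the support #Election2016 —Hillary"))
# expected output: "Thank you for the support"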
hillary_tweets['preProcessedText'] = hillary_tweets['text'].str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
hillary_tweets['preProcessedText'] = hillary_tweets['preProcessedText'].str.replace('\n', '', regex=False)
hillary_tweets['preProcessedText'] = hillary_tweets['preProcessedText'].apply(removeTagTaggedHandlesQuoteAuthor)
hillary_preprocessedtweets = hillary_tweets['preProcessedText']
hillary_preprocessedtweets_train, hillary_preprocessedtweets_eval = train_test_split(hillary_preprocessedtweets, test_size=0.05)
print("Number of tweets in training data:", len(hillary_preprocessedtweets_train))
print("Number of tweets in validation data:", len(hillary_preprocessedtweets_eval))
hillary_preprocessedtweets_train.to_csv('hillary_preprocessedtweets_train.txt', header=False, index=False, sep=' ')
hillary_preprocessedtweets_eval.to_csv('hillary_preprocessedtweets_eval.txt', header=False, index=False, sep=' ')
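# Optional sanity check: each line of the exported file should hold one preprocessed tweet
with open('hillary_preprocessedtweets_train.txt') as f:
    for _ in range(3):
        print(f.readline().strip())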
from simpletransformers.language_modeling import LanguageModelingModel
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.warning("Is this working?")
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 10,
    "train_batch_size": 32,
    "mlm": False,  # GPT-2 is a causal LM, not a masked LM
    "dataset_type": "simple",
    "block_size": 24,  # tweets are short, so small blocks/sequences suffice
    "max_seq_length": 24,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 50,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "early_stopping_patience": 2,
    "use_early_stopping": True,
    "save_optimizer_and_scheduler": False,
    "fp16": False
}
hillary_model = LanguageModelingModel(
    'gpt2',
    'gpt2',
    args=args,
    use_cuda=True,
)
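# Note: use_cuda=True assumes a GPU is available; on a CPU-only machine,
# pass use_cuda=torch.cuda.is_available() instead (training will be slow).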
print("Get Value of all the hyperparameters : ")
for key in hillary_model.args:
print(key, '->', hillary_model.args[key])
hillary_model.train_model("hillary_preprocessedtweets_train.txt", eval_file="hillary_preprocessedtweets_eval.txt")
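# With evaluate_during_training enabled, simpletransformers keeps the checkpoint with the
# best eval loss in ./outputs/best_model (the default best_model_dir), listed below.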
!ls ./outputs/best_model
config_class, model_class, tokenizer_class = GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
BestModel = model_class.from_pretrained('gpt2')
BestModel.load_state_dict(torch.load("./outputs/best_model/pytorch_model.bin", map_location='cpu'))
BestModel.eval()  # disable dropout for generation
prompt_texts = ["I will reduce Gun violence.","Donald will build a wall","I will make our health care system better","Come rally with us","America is in financial stress","We have to preserve secularism","We will win the election"]
tokenizer = tokenizer_class.from_pretrained('gpt2')
for prompt_text in prompt_texts:
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    generated = BestModel.generate(encoded_prompt, max_length=128, num_beams=2,
                                   repetition_penalty=5.0, pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(generated.tolist()[0], clean_up_tokenization_spaces=True)
    print(".".join(text.split(".")[:3]))  # keep only the first three sentences