import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
!pip install transformers
!pip install simpletransformers==0.32.3
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
import torch
from collections import Counter
pd.set_option('display.max_colwidth', None)  # -1 is deprecated in recent pandas; None shows full column content
tweets = pd.read_csv('tweets.csv')
display(tweets.head(1))
tweets = tweets[['handle','text','is_retweet']]
print("Number of Tweets : ",len(tweets))
print("Null Count in the 3 columns : ")
print(tweets.isna().sum())
print("Number of Tweets from Doland and Hillary : ")
tweets['handle'].value_counts()
tweets['tweetLen'] = tweets['text'].apply(lambda x : len(x.split(" ")))
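# Quick look at the word-count distribution; tweets are short, which motivates the
# small max_seq_length used for fine-tuning further below.
print(tweets['tweetLen'].describe())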
doland_tweets = tweets[tweets['handle'] == 'realDonaldTrump'].copy()  # .copy() avoids SettingWithCopyWarning when adding columns later
print("Donald tweets:")
display(doland_tweets['text'].head(5))
hillary_tweets = tweets[tweets['handle'] == 'HillaryClinton'].copy()
print("Hillary tweets:")
display(hillary_tweets['text'].head(5))
doland_tweets['tweetLen'].hist(bins=32, alpha=0.6, label='Donald')
hillary_tweets['tweetLen'].hist(bins=32, alpha=0.6, label='Hillary')
plt.legend(); plt.show()
def getWordCloud(df, col):
    # Concatenate all lowercased text in df[col] and render a word cloud
    comment_words = ''
    stopwords = set(STOPWORDS)
    for val in df[col]:
        tokens = str(val).lower().split()
        comment_words += " ".join(tokens) + " "
    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          stopwords=stopwords,
                          min_font_size=10).generate(comment_words)
    plt.figure(figsize=(5, 5), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
getWordCloud(doland_tweets,'text')
getWordCloud(hillary_tweets,'text')
# Print the 20 most frequently tagged Twitter handles
def getTwitterHandlesTagged(df, col):
    taggedHandlesList = []
    for tweet in df[col].tolist():
        taggedHandlesList += [x for x in tweet.split(" ") if x.startswith('@')]
    print(Counter(taggedHandlesList).most_common(20))
getTwitterHandlesTagged(doland_tweets, 'text')
getTwitterHandlesTagged(hillary_tweets, 'text')
# Print the 20 most frequently used hashtags
def getTags(df, col):
    tagsList = []
    for tweet in df[col].tolist():
        tagsList += [x for x in tweet.split(" ") if x.startswith('#')]
    print(Counter(tagsList).most_common(20))
getTags(doland_tweets, 'text')
getTags(hillary_tweets, 'text')
hillary_tweets['textLwr'] = hillary_tweets['text'].str.lower()
hillary_tweets['hasHillarySubString'] = hillary_tweets['textLwr'].str.contains('hillary')
display(hillary_tweets[hillary_tweets['hasHillarySubString'] == True]['text'].head(10))
# Print the 20 most frequent quote attributions (tokens starting with '—')
def getQuoteAuthor(df, col):
    quoteAuthorList = []
    for tweet in df[col].tolist():
        quoteAuthorList += [x for x in tweet.split(" ") if x.startswith('—')]
    print(Counter(quoteAuthorList).most_common(20))
getQuoteAuthor(hillary_tweets, 'text')
getQuoteAuthor(doland_tweets, 'text')
def removeTagTaggedHandlesQuoteAuthor(text):
    # Drop @mentions, #hashtags, and '—' quote attributions in a single pass
    return " ".join([x for x in text.split(" ") if not x.startswith(("@", "#", "—"))])
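# Minimal check of the cleaner on a made-up tweet (illustrative input, not from the dataset):
print(removeTagTaggedHandlesQuoteAuthor("Thank you @someone for the support #Election2016 —Hillary"))
# expected output: "Thank you for the support"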
hillary_tweets['preProcessedText'] = hillary_tweets['text'].str.replace(r'http\S+|www\.\S+', '', case=False, regex=True)
hillary_tweets['preProcessedText'] = hillary_tweets['preProcessedText'].str.replace('\n', '', regex=False)
hillary_tweets['preProcessedText'] = hillary_tweets['preProcessedText'].apply(removeTagTaggedHandlesQuoteAuthor)
hillary_preprocessedtweets = hillary_tweets['preProcessedText']
hillary_preprocessedtweets_train, hillary_preprocessedtweets_eval = train_test_split(hillary_preprocessedtweets, test_size=0.05)
print("Number of tweets in training data:", len(hillary_preprocessedtweets_train))
print("Number of tweets in validation data:", len(hillary_preprocessedtweets_eval))
hillary_preprocessedtweets_train.to_csv('hillary_preprocessedtweets_train.txt', header=False, index=False, sep=' ')
hillary_preprocessedtweets_eval.to_csv('hillary_preprocessedtweets_eval.txt', header=False, index=False, sep=' ')
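# Optional sanity check: each line of the exported file should hold one preprocessed tweet
with open('hillary_preprocessedtweets_train.txt') as f:
    for _ in range(3):
        print(f.readline().strip())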
from simpletransformers.language_modeling import LanguageModelingModel
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger()
logger.warning("Is this working?")
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "num_train_epochs": 10,
    "train_batch_size": 32,
    "mlm": False,  # GPT-2 is a causal LM, not a masked LM
    "dataset_type": "simple",
    "block_size": 24,  # tweets are short, so small blocks/sequences suffice
    "max_seq_length": 24,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 50,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "early_stopping_patience": 2,
    "use_early_stopping": True,
    "save_optimizer_and_scheduler": False,
    "fp16": False
}
hillary_model = LanguageModelingModel(
    'gpt2',
    'gpt2',
    args=args,
    use_cuda=True,
)
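# Note: use_cuda=True assumes a GPU is available; on a CPU-only machine,
# pass use_cuda=torch.cuda.is_available() instead (training will be slow).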
print("Get Value of all the hyperparameters : ")
for key in hillary_model.args:
print(key, '->', hillary_model.args[key])
hillary_model.train_model("hillary_preprocessedtweets_train.txt", eval_file="hillary_preprocessedtweets_eval.txt")
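# With evaluate_during_training enabled, simpletransformers keeps the checkpoint with the
# best eval loss in ./outputs/best_model (the default best_model_dir), listed below.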
!ls ./outputs/best_model
config_class, model_class, tokenizer_class = GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
BestModel = model_class.from_pretrained('gpt2')
BestModel.load_state_dict(torch.load("./outputs/best_model/pytorch_model.bin", map_location='cpu'))
BestModel.eval()  # disable dropout for generation
prompt_texts = ["I will reduce Gun violence.","Donald will build a wall","I will make our health care system better","Come rally with us","America is in financial stress","We have to preserve secularism","We will win the election"]
tokenizer = tokenizer_class.from_pretrained('gpt2')
for prompt_text in prompt_texts:
    encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors="pt")
    generated = BestModel.generate(encoded_prompt, max_length=128, num_beams=2,
                                   repetition_penalty=5.0, pad_token_id=tokenizer.eos_token_id)
    text = tokenizer.decode(generated.tolist()[0], clean_up_tokenization_spaces=True)
    print(".".join(text.split(".")[:3]))  # keep only the first three sentences