import pandas as pd
import numpy as np
import nltk
# Load the labelled stock-tweet dataset; the file is semicolon-separated.
# NOTE(review): a later read of what looks like the same file uses the
# relative path 'data/stocktweets/...' — confirm which location is current.
tweets = pd.read_csv('/work/tweets_labelled_09042020_16072020.csv', sep=';')
tweets.head()
# Fetch the VADER lexicon (a no-op if it is already cached locally) and
# build a sentiment analyzer.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

# Sanity check: score one sentence and print its four score components
# ('compound', 'neg', 'neu', 'pos') on a single line.
demo_scores = sid.polarity_scores("It was the best of times.")
print(''.join('{0}: {1}, '.format(key, demo_scores[key]) for key in sorted(demo_scores)))
# Small demo frame showing how VADER scores spread onto DataFrame columns.
df = pd.DataFrame({'text':
                   ["This is great!",
                    "This is bad.",
                    "This is neither.",
                    ]})
df
# polarity_scores returns a dict with 'compound', 'neg', 'neu', 'pos'.
df['sent'] = df.text.apply(sid.polarity_scores)
# Unpack the score dict into one numeric column per component.
for i in ['compound', 'neg', 'neu', 'pos']:
    df[i] = df['sent'].apply(lambda x: x[i])
# Bug fix: the original called df.drop(...) without assigning the result,
# so the 'sent' column silently stayed in df. Assign it (matching the
# tweets pipeline later in the file) and display the cleaned frame.
df = df.drop('sent', axis=1)
df
# Re-load the tweet dataset and score every tweet with VADER.
tweets = pd.read_csv('data/stocktweets/tweets_labelled_09042020_16072020.csv', sep=';')
tweets.head()
tweets['sent'] = tweets.text.apply(sid.polarity_scores)
# Spread the per-tweet score dict into separate columns, then drop it.
for component in ['compound', 'neg', 'neu', 'pos']:
    tweets[component] = tweets['sent'].apply(lambda scores: scores[component])
tweets = tweets.drop('sent', axis=1)
# Discretise the compound score: below -0.1 is 'negative', above 0.1 is
# 'positive', and everything in between (inclusive) is 'neutral'.
tweets['vader'] = np.select(
    [tweets.compound < -0.1, tweets.compound > 0.1],
    ['negative', 'positive'],
    default='neutral')
tweets
# Fraction of rows where the VADER label agrees with the human label,
# after dropping rows with any missing value. A boolean Series' mean is
# exactly matches/total, the same ratio as the original len()/len() form.
labelled = tweets.dropna()
(labelled['sentiment'] == labelled['vader']).mean()
# Keep only the tweets with a definite human label (drops 'neutral' and
# missing sentiments); .copy() avoids chained-assignment warnings below.
definite = tweets['sentiment'].isin(['positive', 'negative'])
pos_or_neg = tweets[definite].copy()
from nltk.tokenize import TweetTokenizer

tt = TweetTokenizer()
# Demo on the toy frame: the tokenised result is displayed, not stored.
df['text'].apply(tt.tokenize)
# Lower-case each tweet, then split it into a list of tokens.
pos_or_neg['words'] = pos_or_neg['text'].str.lower().apply(tt.tokenize)
pos_or_neg.head()
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return the WordNet lemma of every token in a tweet's word list."""
    return [lemmatizer.lemmatize(token) for token in tokens]


pos_or_neg['lemmas'] = pos_or_neg['words'].apply(_lemmatize_tokens)
# nltk.download('omw-1.4')  # uncomment if the WordNet data is missing locally
# Split the labelled tweets by class and collect each class's distinct lemmas.
positive = pos_or_neg[pos_or_neg['sentiment'] == 'positive']
negative = pos_or_neg[pos_or_neg['sentiment'] == 'negative']
# Series.explode flattens the per-tweet lemma lists into one long Series.
pos_words = set(positive['lemmas'].explode())
neg_words = set(negative['lemmas'].explode())
len(pos_words)
len(neg_words)
# Lemmas that appear exclusively in one class.
pos_only = pos_words - neg_words
neg_only = neg_words - pos_words