import pandas as pd
import numpy as np
import nltk
# Load the labelled stock tweets (semicolon-delimited CSV).
data_path = '/work/tweets_labelled_09042020_16072020.csv'
tweets = pd.read_csv(data_path, sep=';')
tweets.head()

# VADER's lexicon must be present before a SentimentIntensityAnalyzer is built.
nltk.download('vader_lexicon')
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Smoke-test VADER on one clearly positive sentence and print its scores.
sid = SentimentIntensityAnalyzer()
sentence = "It was the best of times."
scores = sid.polarity_scores(sentence)
for key in sorted(scores):
    print('{0}: {1}, '.format(key, scores[key]), end='')
print()
compound: 0.6369,
neg: 0.0,
neu: 0.543,
pos: 0.457,
# Tiny frame with one clearly positive, one negative and one neutral
# sentence, used to sanity-check the sentiment scorer.
sample_texts = [
    "This is great!",
    "This is bad.",
    "This is neither.",
]
df = pd.DataFrame({'text': sample_texts})
df
textobject
0
This is great!
1
This is bad.
2
This is neither.
# Score each sentence with VADER and expand the returned dict into one
# column per metric.
df['sent'] = df.text.apply(sid.polarity_scores)
for col in ['compound', 'neg', 'neu', 'pos']:
    # apply() runs immediately inside the loop, so closing over `col` is safe
    df[col] = df['sent'].apply(lambda x: x[col])
# Fix: drop() is not in-place — the original discarded its result, leaving the
# intermediate 'sent' column in df. Assign back, matching the tweets version below.
df = df.drop('sent', axis=1)
textobject
compoundfloat64
0
This is great!
0.6588
1
This is bad.
-0.5423
2
This is neither.
0
# Fix: the 'data/stocktweets/...' path raised FileNotFoundError; use the same
# path that loaded successfully at the top of the file.
tweets = pd.read_csv('/work/tweets_labelled_09042020_16072020.csv', sep=';')
tweets.head()
idint64
created_atobject
0
77522
2020-04-15 01:03:46+00:00
1
661634
2020-06-25 06:20:06+00:00
2
413231
2020-06-04 15:41:45+00:00
3
760262
2020-07-03 19:39:35+00:00
4
830153
2020-07-09 14:39:14+00:00
# Score every tweet with VADER, expand the score dict into columns, then
# discard the intermediate dict column.
tweets['sent'] = tweets.text.apply(sid.polarity_scores)
for col in ['compound', 'neg', 'neu', 'pos']:
    tweets[col] = tweets['sent'].apply(lambda x: x[col])
tweets = tweets.drop('sent', axis=1)

# Bucket the compound score: < -0.1 -> negative, > 0.1 -> positive,
# everything in between -> neutral.
tweets['vader'] = np.select(
    [tweets.compound < -0.1, tweets.compound > 0.1],
    ['negative', 'positive'],
    default='neutral')
tweets
idint64
11 - 938084
created_atobject
2020-04-15 01:03:46+00:000%
2020-06-25 06:20:06+00:000%
4998 others100%
0
77522
2020-04-15 01:03:46+00:00
1
661634
2020-06-25 06:20:06+00:00
2
413231
2020-06-04 15:41:45+00:00
3
760262
2020-07-03 19:39:35+00:00
4
830153
2020-07-09 14:39:14+00:00
5
27027
2020-04-12 21:52:56+00:00
6
472959
2020-06-09 05:23:06+00:00
7
392845
2020-06-02 01:12:29+00:00
8
313771
2020-05-07 04:58:41+00:00
9
267894
2020-05-04 15:16:29+00:00
# Fraction of labelled tweets where VADER agrees with the human label.
t = tweets.dropna()
len(t[t['sentiment'] == t['vader']]) / len(t)

# Keep only tweets with a definite human label for the word-level analysis.
pos_or_neg = tweets[tweets['sentiment'].isin(['positive', 'negative'])].copy()
from nltk.tokenize import TweetTokenizer

# Tokenizer tuned for tweets (handles @mentions, #hashtags, emoticons).
tokenizer = TweetTokenizer()
df['text'].apply(tokenizer.tokenize)
pos_or_neg['words'] = pos_or_neg['text'].str.lower().apply(tokenizer.tokenize)
pos_or_neg.head()
idint64
created_atobject
0
77522
2020-04-15 01:03:46+00:00
1
661634
2020-06-25 06:20:06+00:00
2
413231
2020-06-04 15:41:45+00:00
3
760262
2020-07-03 19:39:35+00:00
4
830153
2020-07-09 14:39:14+00:00
from nltk.stem import WordNetLemmatizer

# Fix: lemmatizing raised LookupError because the WordNet corpus was never
# downloaded. Fetch it (and omw-1.4, which newer NLTK versions also require —
# it was left commented out below) before use.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
pos_or_neg['lemmas'] = pos_or_neg['words'].apply(
    lambda words: [lemmatizer.lemmatize(word) for word in words])
LookupError:
**********************************************************************
Resource wordnet not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('wordnet')

For more information see: https://www.nltk.org/data.html
Attempted to load corpora/wordnet
Searched in:
- '/root/nltk_data'
- '/usr/local/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/local/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
# nltk.download('omw-1.4')
# Split the labelled tweets by sentiment class.
positive = pos_or_neg.query('sentiment == "positive"')
negative = pos_or_neg.query('sentiment == "negative"')

# Flatten each class's per-tweet lemma lists into a vocabulary set.
pos_words = set(positive.explode('lemmas')['lemmas'])
neg_words = set(negative.explode('lemmas')['lemmas'])
len(pos_words)
len(neg_words)

# Lemmas that occur exclusively in one sentiment class.
pos_only = pos_words - neg_words
neg_only = neg_words - pos_words