import pandas as pd
import numpy as np
import nltk
# Load the labelled stock tweets (semicolon-delimited CSV).
data_path = '/work/tweets_labelled_09042020_16072020.csv'
tweets = pd.read_csv(data_path, sep=';')
tweets.head()

# VADER's lexicon must be present before a SentimentIntensityAnalyzer is built.
nltk.download('vader_lexicon')
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Smoke-test VADER on one clearly positive sentence and print its scores.
sid = SentimentIntensityAnalyzer()
sentence = "It was the best of times."
scores = sid.polarity_scores(sentence)
for key in sorted(scores):
    print('{0}: {1}, '.format(key, scores[key]), end='')
print()
compound: 0.6369,
neg: 0.0,
neu: 0.543,
pos: 0.457,
# Tiny frame with one clearly positive, one negative and one neutral
# sentence, used to sanity-check the sentiment scorer.
sample_texts = [
    "This is great!",
    "This is bad.",
    "This is neither.",
]
df = pd.DataFrame({'text': sample_texts})
df
textobject
0
This is great!
1
This is bad.
2
This is neither.
# Score each sentence with VADER and expand the returned dict into one
# column per metric.
df['sent'] = df.text.apply(sid.polarity_scores)
for col in ['compound', 'neg', 'neu', 'pos']:
    # apply() runs immediately inside the loop, so closing over `col` is safe
    df[col] = df['sent'].apply(lambda x: x[col])
# Fix: drop() is not in-place — the original discarded its result, leaving the
# intermediate 'sent' column in df. Assign back, matching the tweets version below.
df = df.drop('sent', axis=1)
textobject
compoundfloat64
0
This is great!
0.6588
1
This is bad.
-0.5423
2
This is neither.
0
# Fix: the 'data/stocktweets/...' path raised FileNotFoundError; use the same
# path that loaded successfully at the top of the file.
tweets = pd.read_csv('/work/tweets_labelled_09042020_16072020.csv', sep=';')
tweets.head()
idint64
created_atobject
0
77522
2020-04-15 01:03:46+00:00
1
661634
2020-06-25 06:20:06+00:00
2
413231
2020-06-04 15:41:45+00:00
3
760262
2020-07-03 19:39:35+00:00
4
830153
2020-07-09 14:39:14+00:00
# Score every tweet with VADER, expand the score dict into columns, then
# discard the intermediate dict column.
tweets['sent'] = tweets.text.apply(sid.polarity_scores)
for col in ['compound', 'neg', 'neu', 'pos']:
    tweets[col] = tweets['sent'].apply(lambda x: x[col])
tweets = tweets.drop('sent', axis=1)

# Bucket the compound score: < -0.1 -> negative, > 0.1 -> positive,
# everything in between -> neutral.
tweets['vader'] = np.select(
    [tweets.compound < -0.1, tweets.compound > 0.1],
    ['negative', 'positive'],
    default='neutral')
tweets
idint64
11 - 938084
created_atobject
2020-04-15 01:03:46+00:000%
2020-06-25 06:20:06+00:000%
4998 others100%
0
77522
2020-04-15 01:03:46+00:00
1
661634
2020-06-25 06:20:06+00:00
2
413231
2020-06-04 15:41:45+00:00
3
760262
2020-07-03 19:39:35+00:00
4
830153
2020-07-09 14:39:14+00:00
5
27027
2020-04-12 21:52:56+00:00
6
472959
2020-06-09 05:23:06+00:00
7
392845
2020-06-02 01:12:29+00:00
8
313771
2020-05-07 04:58:41+00:00
9
267894
2020-05-04 15:16:29+00:00
# Fraction of labelled tweets where VADER agrees with the human label.
t = tweets.dropna()
len(t[t['sentiment'] == t['vader']]) / len(t)

# Keep only tweets with a definite human label for the word-level analysis.
pos_or_neg = tweets[tweets['sentiment'].isin(['positive', 'negative'])].copy()
from nltk.tokenize import TweetTokenizer

# Tokenizer tuned for tweets (handles @mentions, #hashtags, emoticons).
tokenizer = TweetTokenizer()
df['text'].apply(tokenizer.tokenize)
pos_or_neg['words'] = pos_or_neg['text'].str.lower().apply(tokenizer.tokenize)
pos_or_neg.head()
idint64
created_atobject
0
77522
2020-04-15 01:03:46+00:00
1
661634
2020-06-25 06:20:06+00:00
2
413231
2020-06-04 15:41:45+00:00
3
760262
2020-07-03 19:39:35+00:00
4
830153
2020-07-09 14:39:14+00:00
from nltk.stem import WordNetLemmatizer

# Fix: lemmatizing raised LookupError because the WordNet corpus was never
# downloaded. Fetch it (and omw-1.4, which newer NLTK versions also require —
# it was left commented out below) before use.
nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()
pos_or_neg['lemmas'] = pos_or_neg['words'].apply(
    lambda words: [lemmatizer.lemmatize(word) for word in words])
LookupError:
**********************************************************************
Resource wordnet not found.
Please use the NLTK Downloader to obtain the resource:
>>> import nltk
>>> nltk.download('wordnet')

For more information see: https://www.nltk.org/data.html
Attempted to load corpora/wordnet
Searched in:
- '/root/nltk_data'
- '/usr/local/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/local/lib/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
**********************************************************************
# nltk.download('omw-1.4')
# Split the labelled tweets by sentiment class.
positive = pos_or_neg.query('sentiment == "positive"')
negative = pos_or_neg.query('sentiment == "negative"')

# Flatten each class's per-tweet lemma lists into a vocabulary set.
pos_words = set(positive.explode('lemmas')['lemmas'])
neg_words = set(negative.explode('lemmas')['lemmas'])
len(pos_words)
len(neg_words)

# Lemmas that occur exclusively in one sentiment class.
pos_only = pos_words - neg_words
neg_only = neg_words - pos_words