import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk # you might need to do pip install nltk once only
nltk.download('book') # you only need to do this once
from nltk.tokenize import word_tokenize

# Load the labelled tweets; the file is read with ';' as the field separator.
tweets = pd.read_csv(
    '/work/stocktweets/tweets_labelled_09042020_16072020.csv',
    sep=';',
)
tweets
tweets.head()

# Lower-case the raw text, then split each tweet into NLTK word tokens.
tweets['text_normalized'] = tweets['text'].str.lower()
tweets['words'] = tweets['text_normalized'].apply(word_tokenize)
tweets

# 'types' holds the distinct tokens of each tweet; 'ttr' (type-token ratio)
# is the number of distinct tokens divided by the total number of tokens.
tweets['types'] = tweets['words'].apply(set)
tweets['ttr'] = tweets['types'].apply(len).div(tweets['words'].apply(len))
tweets
# NOTE(review): the original line here was a bare string expression (a no-op
# at runtime). It looks like a leftover shell command, kept as a comment:
#   /root/venv/bin/python -m pip install --upgrade pip

# The package is spelled `nltk`; importing from `ntlk` raises
# ModuleNotFoundError.
from nltk.corpus import stopwords

# NLTK's English stopword list, loaded once at module level.
STOPWORDS = stopwords.words('english')


def remove_stopwords(words):
    """Return the items of *words* that are not in ``STOPWORDS``.

    Membership is an exact comparison against the entries of ``STOPWORDS``,
    so no case folding happens here. The order of the remaining items is
    preserved and the input is not modified.

    Args:
        words: An iterable of tokens (e.g. one tweet's token list).

    Returns:
        A new list containing the tokens that are not stopwords.
    """
    return [word for word in words if word not in STOPWORDS]
# Filter each tweet's token list through remove_stopwords.
tweets['non_stopwords'] = tweets['words'].apply(remove_stopwords)

# Number of tokens left in each tweet after the stopword filter.
tweets['non_stopwords_count'] = tweets['non_stopwords'].map(len)

# Part-of-speech tag the remaining tokens of each tweet.
# NOTE(review): the tagger is given the stopword-filtered tokens rather than
# the full token sequence in 'words' -- confirm this is intended.
tweets['pos'] = tweets['non_stopwords'].apply(nltk.pos_tag)