# Day 13

import matplotlib.pyplot as plt
import nltk  # you might need to do pip install nltk once only
import numpy as np
import pandas as pd

# Download the NLTK 'book' data collection.
# NOTE(review): presumably this supplies the tokenizer, stopword and tagger
# data that the later cells rely on — confirm against the NLTK data index.
nltk.download('book')  # you only need to do this once

# Load the labelled tweet dataset; the file uses ';' as its field separator.
tweets = pd.read_csv(
    '/work/stocktweets/tweets_labelled_09042020_16072020.csv',
    sep=';',
)
tweets  # notebook-style display of the frame; no effect when run as a script

tweets.head()  # preview of the first rows (also only visible in a notebook)

# Store a lower-cased copy of every tweet's text in a new column so the
# later steps work on case-normalised text.
tweets['text_normalized'] = tweets['text'].str.lower()

from nltk.tokenize import word_tokenize

# Tokenise each lower-cased tweet; every cell of 'words' holds the result of
# word_tokenize for that tweet's text.
tweets['words'] = tweets['text_normalized'].apply(word_tokenize)
tweets  # notebook-style display; no effect when run as a script

# 'types' is the set of distinct tokens in each tweet.
tweets['types'] = tweets['words'].apply(set)

# Type-token ratio: number of distinct tokens divided by the total number of
# tokens in the tweet.
# NOTE(review): a tweet with an empty token list divides 0 by 0 here, which
# pandas should report as NaN rather than raising — confirm if such rows exist.
tweets['ttr'] = tweets['types'].apply(len) / tweets['words'].apply(len)
tweets  # notebook-style display; no effect when run as a script

# NOTE(review): this is a bare string expression, so it does nothing when the
# file runs. It looks like a leftover pip-upgrade hint copied from notebook
# output rather than a command that is meant to execute — TODO confirm and
# remove if so.
'/root/venv/bin/python -m pip install --upgrade pip'

from nltk.corpus import stopwords  # was misspelled 'ntlk', which cannot be imported

# English stopword list from NLTK. Kept as the list NLTK returns so any other
# code that treats STOPWORDS as a list keeps working.
# NOTE(review): membership tests against a list scan it linearly; switching to
# a set would be faster if nothing else depends on the list type.
STOPWORDS = stopwords.words('english')


def remove_stopwords(words):
    """Return the tokens of *words* that do not appear in ``STOPWORDS``.

    Args:
        words: An iterable of tokens (here, one tweet's token list).

    Returns:
        A new list with the remaining tokens in their original order. The
        comparison is an exact match, so tokens are only filtered when their
        casing matches the entries in ``STOPWORDS``.
    """
    return [word for word in words if word not in STOPWORDS]


# Per-tweet token list with stopwords removed, and how many tokens remain.
tweets['non_stopwords'] = tweets['words'].apply(remove_stopwords)
tweets['non_stopwords_count'] = tweets['non_stopwords'].apply(len)

# Run NLTK's part-of-speech tagger over each tweet's stopword-free token list
# and keep the tagger's output for that tweet in the 'pos' column.
filtered_tokens = tweets['non_stopwords']
tweets['pos'] = filtered_tokens.apply(nltk.pos_tag)