# Day 12

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the two semicolon-delimited tweet exports.
# NOTE(review): going by the file names, the first holds the labelled tweets
# and the second the remaining (unlabelled) ones -- confirm against the data.
tweets0 = pd.read_csv(
    '/work/SI_330_Day_12_Text_Processing_II (1)/tweets_labelled_09042020_16072020.csv',
    sep=';',
)
tweets1 = pd.read_csv(
    '/work/SI_330_Day_12_Text_Processing_II (1)/tweets_remaining_09042020_16072020.csv',
    sep=';',
)
tweets1

# Give the second export explicit column names before stacking it under the
# first one (presumably these match the corresponding columns of tweets0 --
# verify against the CSV headers).
tweets1.columns = ['id', 'created_at', 'text']

# ignore_index=True renumbers the combined rows 0..n-1. Without it each source
# frame keeps its own row labels, so labels repeat in the combined frame and
# label-based lookups (tweets.loc[...]) become ambiguous.
tweets = pd.concat([tweets0, tweets1], ignore_index=True)
tweets  # There are 928,673 rows and 4 columns

# Number of tweets whose text begins with 'RT' (presumably retweets).
starts_with_rt = tweets['text'].str.startswith('RT')
starts_with_rt.sum()

# Histogram of tweet lengths in characters.
# .str.len() is vectorized and yields NaN for missing text, whereas
# apply(len) raises TypeError as soon as one text value is NaN.
graph = tweets['text'].str.len().plot(kind='hist')
plt.xlabel('Length of tweets')
plt.title('Tweet length')

# Parse the creation timestamps into a datetime column.
tweets['when'] = pd.to_datetime(tweets.created_at)

# Index by that timestamp and count the non-null values of every column in
# each one-week bin.
tweets_by_time = tweets.set_index('when')
tweets_by_time.resample('1W').count()

# Count the @-mentions ('@' followed by word characters) in each tweet.
# The counts go into their own column: writing them back to 'text' would
# replace the tweet text with integers and make every later .str operation
# on that column fail.
tweets['mention_count'] = tweets['text'].str.count(r'@\w+')

# Distribution of mentions per tweet.
tweets['mention_count'].plot.hist()

# Number of hashtags ('#' followed by word characters) in each tweet.
# The pattern is a raw string so that \w reaches the regex engine instead of
# being treated as an (invalid) string escape sequence.
tweets['hashtag_count'] = tweets['text'].str.count(r'#\w+')

# Show the tweets with the most hashtags first.
tweets.sort_values('hashtag_count', ascending=False)

# NOTE: despite its name, 'hash_count' holds the *list* of hashtags found in
# each tweet, not a number. The name is kept so existing references still work.
tweets['hash_count'] = tweets['text'].str.findall(r'#\w+')

# Ten most frequent hashtags, case-insensitively. Only the hashtag column is
# exploded (one row per hashtag); exploding the whole DataFrame would copy
# every other column for each hashtag just to discard them again.
tweets['hash_count'].explode().str.lower().value_counts().head(10)

# Collect every cashtag in each tweet: '$', one or more letters, and an
# optional dotted suffix (e.g. $AAPL, $F, $BRK.B).
# The pattern is deliberately not anchored with '^': an anchored pattern can
# only match at the very start of the text, so findall would return at most
# one symbol per tweet and miss all symbols that appear later.
tweets['stock_symbols'] = tweets['text'].str.findall(r'\$[a-zA-Z]+(?:\.[a-zA-Z]+)?')

# Ten most frequent symbols, case-insensitively. value_counts() already
# returns counts in descending order, so no extra sort is needed.
tweets['stock_symbols'].explode().str.lower().value_counts().head(10)

# insert your code here