import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# --- Load data ---------------------------------------------------------------
# Both exports are ';'-separated CSV files.
tweets0 = pd.read_csv('/work/SI_330_Day_12_Text_Processing_II (1)/tweets_labelled_09042020_16072020.csv', sep=';')
tweets1 = pd.read_csv('/work/SI_330_Day_12_Text_Processing_II (1)/tweets_remaining_09042020_16072020.csv', sep=';')
tweets1
# Give the second file explicit column names before stacking
# (presumably to match the names used in tweets0 -- verify against the data).
tweets1.columns = ['id', 'created_at', 'text']
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of the two files would be duplicated.
tweets = pd.concat([tweets0, tweets1], ignore_index=True)
tweets
# There are 928,673 rows and 4 columns

# --- Retweets ----------------------------------------------------------------
# Number of tweets whose text starts with 'RT'.
tweets.text.str.startswith('RT').sum()

# --- Tweet length ------------------------------------------------------------
# .str.len() is vectorised and yields NaN for missing text instead of raising
# TypeError the way apply(len) does on a NaN value.
graph = tweets.text.str.len().plot(kind='hist')
plt.xlabel('Length of tweets')
plt.title('Tweet length')

# --- Tweets per week ---------------------------------------------------------
tweets['when'] = pd.to_datetime(tweets['created_at'])
tweets.set_index('when').resample('1W').count()

# --- Mentions ----------------------------------------------------------------
# Store the per-tweet mention count in its own column. The 'text' column must
# stay a string column because the hashtag and stock-symbol steps below use
# the .str accessor on it.
tweets['mention_count'] = tweets['text'].str.count(r'@\w+')
# Start a new figure so this histogram is not drawn on top of the
# tweet-length histogram when the code runs as a single script.
plt.figure()
tweets['mention_count'].plot.hist()

# --- Hashtags ----------------------------------------------------------------
# hashtag_count: number of hashtags per tweet.
tweets['hashtag_count'] = tweets['text'].str.count(r'#\w+')
tweets.sort_values('hashtag_count', ascending=False)
# hash_count: despite its name this column holds the *list* of hashtags found
# in each tweet; the name is kept so existing references keep working.
tweets['hash_count'] = tweets['text'].str.findall(r'#\w+')
# One row per hashtag, case-folded, then the ten most frequent.
tweets.explode('hash_count')['hash_count'].str.lower().value_counts().head(10)

# --- Stock symbols (cashtags) ------------------------------------------------
# '$' followed by letters, optionally one '.suffix' (e.g. $AAPL, $F, $BRK.B).
# The pattern is deliberately not anchored with '^' so symbols anywhere in the
# tweet are found, not only one at the very start of the text.
tweets['stock_symbols'] = tweets['text'].str.findall(r'\$[a-zA-Z]+(?:\.[a-zA-Z]+)?')
# value_counts() already returns counts in descending order.
tweets.explode('stock_symbols')['stock_symbols'].str.lower().value_counts().head(10)
# insert your code here