import emoji
import pandas as pd
#Text processing libraries
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#sklearn
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
#Libraries for plotting
import seaborn as sns
#Modules for plotting
from matplotlib import pyplot as plt
import geopandas as gpd
from shapely.geometry import Point , Polygon
import descartes
from wordcloud import WordCloud, STOPWORDS
#Import Nominatim to transform city names into coordinates
from geopy.geocoders import Nominatim
#Load dataset
df = pd.read_csv('tweets.csv').set_index('id')
df
# Applying a first round of text cleaning techniques to the text column
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Applying the cleaning function
df['text'] = df['text'].apply(lambda x: clean_text(x))
def deEmojify(text):
    '''Remove emojis from the listed Unicode blocks.'''
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)
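# Alternative sketch: the emoji package itself can strip every emoji it knows about,
# which covers more Unicode blocks than the regex above. This assumes emoji>=2.0
# (where replace_emoji is available); it is shown for reference and not used below.
def de_emojify_with_library(text):
    return emoji.replace_emoji(text, replace='')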
df['text'] = df['text'].apply(lambda x: deEmojify(x))
# The line below is commented out because it raises an error: convert_dtypes is a method,
# so it cannot be indexed with ['location'] like this.
#df.convert_dtypes['location'] = df['location'].apply(lambda x: deEmojify(x))
# Let's take a look at the updated text
df
#First we're going to check this column
df.keyword.describe()
# get all the unique values in the 'keyword' column
keywords = df['keyword'].unique()
# sort them alphabetically and then take a closer look
keywords.sort()
keywords
#Change the dtype of the column to str first (missing keywords become the string 'nan', so .replace works on every row)
df.keyword = df.keyword.astype(str)
#Remove the "%20" from the values in the 'keyword' column
df['keyword'] = df['keyword'].apply(lambda x: x.replace("%20"," "))
# get all the unique values in the 'keyword' column
keywords = df['keyword'].unique()
# sort them alphabetically and then take a closer look
keywords.sort()
keywords
#Wrap every value in the "location" column in quotation marks (this will allow us to treat the
# emoji flags later with the emoji library).
df.location = df.location.astype(str)
df['location'] = "'" + df['location'] + "'"
# Let's take a look at the updated text
df
#Transform the emoji flags in the location column
def emojiflag_to_text(flag):
    '''Convert an emoji flag to its text name. An emoji flag is made of two Unicode
    regional-indicator letters (for example 🇲🇽 -> MX), so its length is 2; since every value
    was wrapped in single quotes, a value that is just an emoji flag has length 4.'''
    if len(flag) == 4:
        return emoji.demojize(flag)
    else:
        return flag
# Applying the transform function
df['location'] = df['location'].apply(lambda x: emojiflag_to_text(x))
# Let's take a look at the updated text
df
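# Quick illustrative check of what demojize does to a quoted flag; the exact name
# (':Mexico:' here) comes from the emoji library's data, so treat it as indicative.
print(emoji.demojize("'🇲🇽'"))  # expected: something like "':Mexico:'"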
#Remove the single quotes from all the values in the "location" column.
df['location'] = df['location'].apply(lambda x: x.replace("'",""))
#Remove the ":" left over from the demojized flag names
df['location'] = df['location'].apply(lambda x: x.replace(":",""))
#Replace the literal string "nan" (produced by astype(str)) with an empty string
df['location'] = df['location'].apply(lambda x: x.replace("nan",""))
#Remove all the emojis (other than the already-converted flags) that remain in the 'location' column
df['location'] = df['location'].str.replace(r'[^\w\s#@/:%.,_-]', '', regex=True, flags=re.UNICODE)
# Let's take a look at the updated text
df
df.info()
print('\n')
print('Data Type')
print('__'*12)
df.dtypes
# After astype(str), missing locations are empty strings rather than NaN, so count those instead
nul_locations = (df['location'] == '').sum()
print(f'We have {nul_locations} missing locations')
print('Data Type Converted')
print('__'*12)
df.convert_dtypes().dtypes
# Use nunique to see how many distinct values each column contains
df.nunique()
# Let's see if there is any duplicated text
dup_text = df['text'].duplicated().sum()
print(f'We have {dup_text} duplicated texts')
# Describe the length of the texts: the longest and shortest tweets, and perhaps a relation with the target.
df['length'] = df['text'].str.len()
df['length'].describe()
# Histogram with the distribution of the Tweets
plt.hist(df.length)
plt.xlabel('Length of Tweet')
plt.ylabel('# of Tweets')
plt.show()
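# Sketch of the length/target relation mentioned above (assumes seaborn>=0.11, where histplot exists)
sns.histplot(data=df, x='length', hue='target', bins=30)
plt.xlabel('Length of Tweet')
plt.show()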
# Table with the highest and lowest lengths
display(df[df['length']>130])
print('\n')
display(df[df['length']<20])
location = df['location'].value_counts()
location[location>=10][:20]
# We are going to group some places, so we create a mapping dictionary to replace the locations
mapping = {'United States':'USA',
'New York':'USA',
"London":'UK',
"Los Angeles, CA":'USA',
"Washington, D.C.":'USA',
"California":'USA',
"Chicago, IL":'USA',
"Chicago":'USA',
"New York, NY":'USA',
"California, USA":'USA',
"FLorida":'USA',
"Nigeria":'Africa',
"Kenya":'Africa',
"Everywhere":'Worldwide',
"San Francisco":'USA',
"Florida":'USA',
"United Kingdom":'UK',
"Los Angeles":'USA',
"Toronto":'Canada',
"San Francisco, CA":'USA',
"NYC":'USA',
"Seattle":'USA',
"Earth":'Worldwide',
"Ireland":'UK',
"London, England":'UK',
"New York City":'USA',
"Texas":'USA',
"London, UK":'UK',
"Atlanta, GA":'USA',
"England, United Kingdom":'UK',
"Mumbai, India":'India',
"Melbourne,Victoria":'Australia'}
# Replace the location with its group name if it appears as a key in the mapping dictionary
df['location'] = df['location'].apply(lambda i: mapping.get(i, i))
# save df
df.to_csv('df_transform.csv', index=False)
# Now the grouped values show up in the counts (dropna() returns a new frame, so assign it)
ndf = df.dropna()
location = ndf['location'].value_counts()
location[location>=10]
localizator = Nominatim(user_agent='tweets-analysis') # Creation of the agent
location = list(location.index) #We keep just the name of the cities
# The geocoding below is kept commented out (it is slow and repeatedly calls the Nominatim service); the coordinates were saved to coords.csv and are loaded further down.
# geolocated = list(map(lambda x: [x,localizator.geocode(x,timeout=None)[1] if localizator.geocode(x,timeout=None) else None],location))
# geolocated = pd.DataFrame(geolocated)
# geolocated.columns = ['locat','latlong']
# try:
# geolocated['lat'] = geolocated.latlong.apply(lambda x: x[0])
# geolocated['lon'] = geolocated.latlong.apply(lambda x: x[1])
# geolocated.drop('latlong',axis=1, inplace=True)
# except:
# pass
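# Sketch of how coords.csv could have been produced (an assumption about the original workflow):
# geopy's RateLimiter throttles the calls so the free Nominatim service is queried at most
# once per second. Kept commented out because the results are already saved in coords.csv.
# from geopy.extra.rate_limiter import RateLimiter
# geocode = RateLimiter(localizator.geocode, min_delay_seconds=1)
# rows = []
# for place in location:
#     result = geocode(place)
#     rows.append([place, (result.latitude, result.longitude) if result else None])
# pd.DataFrame(rows, columns=['locat', 'latlong']).to_csv('coords.csv')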
geolocated = pd.read_csv('coords.csv')
geolocated.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1, inplace=True)
geolocated['latlong']=geolocated.latlong.apply(lambda x: x[1:-1].split(','))
geolocated['lat'] = geolocated.latlong.apply(lambda x: x[0])
geolocated['lon'] = geolocated.latlong.apply(lambda x: x[1])
geolocated.drop('latlong',axis=1, inplace=True)
geolocated['lat']=pd.to_numeric(geolocated['lat'])
geolocated['lon']=pd.to_numeric(geolocated['lon'])
world_map = gpd.read_file('World_Countries.shp')
fig,ax = plt.subplots(figsize=(15,15))
world_map.plot(ax=ax)
crs = 'EPSG:4326'  # WGS84 lat/lon (the {'init': ...} dict form is deprecated)
geometry = [Point(xy) for xy in zip(geolocated['lon'],geolocated['lat'])]
geo_df = gpd.GeoDataFrame(geolocated,crs=crs,geometry=geometry)
geo_df.head()
fig,ax = plt.subplots(figsize=(25,25))
ax.set(facecolor='powderblue')
ax.set_alpha(0.2)
world_map.plot(ax=ax, alpha=1, color='white')
geo_df.plot(ax=ax,markersize=30,color='darkviolet',marker='o',alpha=0.3)
countries_mask = df['location'].value_counts()
countries_mask
# countries_mask.index[1] should be 'USA' after the grouping above (index 0 is the empty location string)
df_usa = df[df['location'] == countries_mask.index[1]]
df_usa
plt.rcParams['font.size'] = 15
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['figure.subplot.bottom'] = .1
wc_stopwords = set(STOPWORDS)  # renamed so it does not shadow nltk.corpus.stopwords
wordcloud = WordCloud(
    background_color='white',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_usa['text']))  # join the tweets; str(Series) would only keep a truncated preview
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_usa_positive = df_usa[df_usa['target'] == 1]
df_usa_positive
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_usa_positive['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_usa_negative = df_usa[df_usa['target'] == 0]
df_usa_negative
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='white',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_usa_negative['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
# countries_mask.index[2] should be 'UK' after the grouping above
df_uk = df[df['location'] == countries_mask.index[2]]
df_uk
plt.rcParams['font.size'] = 15
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['figure.subplot.bottom'] = .1
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_uk['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_uk_positive = df_uk[df_uk['target'] == 1]
df_uk_positive
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_uk_positive['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_uk_negative = df_uk[df_uk['target'] == 0]
df_uk_negative
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_uk_negative['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
# countries_mask.index[3] should be 'India' after the grouping above
df_india = df[df['location'] == countries_mask.index[3]]
df_india
plt.rcParams['font.size'] = 15
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['figure.subplot.bottom'] = .1
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='white',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_india['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_india_positive = df_india[df_india['target'] == 1]
df_india_positive
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_india_positive['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_india_negative = df_india[df_india['target'] == 0]
df_india_negative
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_india_negative['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
x = df['target'].value_counts().index
y = df['target'].value_counts()
sns.barplot(x=x ,y=y)
# Let's see what a disaster tweet looks like (target = 1)
disaster_tweets=df[df['target']==1]['text']
disaster_tweets.values[5]
# And what a non-disaster tweet looks like (target = 0)
nondisaster_tweets=df[df['target']==0]['text']
nondisaster_tweets.values[10]
# Applying the text cleaning again (same function as above; re-running it on already-clean text is harmless)
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Applying the cleaning function to the dataset
df['text'] = df['text'].apply(lambda x: clean_text(x))
# Let's take a look at the updated text
df['text'].head()
#Tokenize the dataset
tokenizer=nltk.tokenize.RegexpTokenizer(r'\w+')
df['text']=df['text'].apply(lambda x:tokenizer.tokenize(x))
df['text'].head()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    """
    Remove English stopwords from a list of tokens.
    """
    stop_words = set(stopwords.words('english'))  # build the set once instead of scanning a list per word
    words = [w for w in text if w not in stop_words]
    return words
df['text'] = df['text'].apply(lambda x : remove_stopwords(x))
df.head()
# After preprocessing, the text format
def combine_text(list_of_text):
'''Takes a list of text and combines them into one large chunk of text.'''
combined_text = ' '.join(list_of_text)
return combined_text
df['text'] = df['text'].apply(lambda x : combine_text(x))
df.head()
# Splitting the data into independent and dependent features
X=df['text']
y=df['target']
X.head()
# We need data to train the model and data to test it; here we hold out 20% of the data for testing
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,test_size=0.2,random_state=1)
#Convert a collection of text documents to a matrix of token counts
vectorizer=CountVectorizer()
x_train_vectors=vectorizer.fit_transform(X_train)
x_test_vectors=vectorizer.transform(X_test)
X_train.head()
# todense() shows the sparse document-term matrix as a regular matrix of token counts
x_train_vectors.todense()
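# Optional sketch: TF-IDF weighting is a common alternative to raw counts. TfidfVectorizer
# comes from the same sklearn module; these variables are illustrative and not used by the model below.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x_train_tfidf = tfidf.fit_transform(X_train)
x_test_tfidf = tfidf.transform(X_test)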
clf_naive=MultinomialNB(alpha=0.2,fit_prior=False)
clf_naive.fit(x_train_vectors,y_train)
pred=clf_naive.predict(x_test_vectors)
accuracy_score_train=metrics.accuracy_score(y_train,clf_naive.predict(x_train_vectors))
print(f'Accuracy score train: {accuracy_score_train}')
accuracy_score_test=metrics.accuracy_score(y_test,pred)
print(f'Accuracy score test {accuracy_score_test}')
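# Sketch of a cross-validated estimate on the full corpus, using a Pipeline so the vectorizer
# is refit inside each fold (the pipeline and step names here are illustrative):
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('nb', MultinomialNB(alpha=0.2, fit_prior=False))])
cv_scores = model_selection.cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Cross-validated accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}')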
print('_____'*10+'\n')
classification_report_train=metrics.classification_report(y_train,clf_naive.predict(x_train_vectors))
print(f'-Classification report train:\n {classification_report_train}')
classification_report_test=metrics.classification_report(y_test,pred)
print(f'-Classification report test:\n {classification_report_test}')
roc_auc_score_train=metrics.roc_auc_score(y_train,clf_naive.predict(x_train_vectors))
print(f'Area Under the ROC Curve (train): {roc_auc_score_train}')
roc_auc_score_test=metrics.roc_auc_score(y_test,pred)
print(f'Area Under the ROC Curve (test): {roc_auc_score_test}')
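# ROC AUC is usually computed from predicted probabilities rather than hard class labels;
# a sketch using predict_proba (column 1 is the probability of the positive class):
proba_test = clf_naive.predict_proba(x_test_vectors)[:, 1]
print(f'Area Under the ROC Curve (test, from probabilities): {metrics.roc_auc_score(y_test, proba_test)}')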
print('_____'*10+'\n')
confusion_matrix_train=metrics.confusion_matrix(y_train,clf_naive.predict(x_train_vectors))
print(f'Confusion Matrix Train:\n {confusion_matrix_train}')
confusion_matrix_test=metrics.confusion_matrix(y_test,pred)
print(f'Confusion Matrix Test:\n {confusion_matrix_test}')
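# seaborn is imported above but not used yet; a sketch of the test confusion matrix as a heatmap:
sns.heatmap(confusion_matrix_test, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()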