import emoji
import pandas as pd
#Text processing libraries
import re
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#sklearn
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
#Libraries for plotting
import seaborn as sns
#Modules for plotting
from matplotlib import pyplot as plt
import geopandas as gpd
from shapely.geometry import Point , Polygon
import descartes
from wordcloud import WordCloud, STOPWORDS
#Import Nominatim to transform city names into coordinates
from geopy.geocoders import Nominatim
#Load dataset
df = pd.read_csv('tweets.csv').set_index('id')
df
# Applying a first round of text cleaning techniques to the text column
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Applying the cleaning function
df['text'] = df['text'].apply(lambda x: clean_text(x))
def deEmojify(text):
    '''Remove emojis from the listed Unicode blocks.'''
    regex_pattern = re.compile(pattern="["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+", flags=re.UNICODE)
    return regex_pattern.sub(r'', text)
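# Alternative sketch: the emoji package itself can strip every emoji it knows about,
# which covers more Unicode blocks than the regex above. This assumes emoji>=2.0
# (where replace_emoji is available); it is shown for reference and not used below.
def de_emojify_with_library(text):
    return emoji.replace_emoji(text, replace='')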
df['text'] = df['text'].apply(lambda x: deEmojify(x))
# The line below is commented out because it raises an error: convert_dtypes is a method,
# so it cannot be indexed with ['location'] like this.
#df.convert_dtypes['location'] = df['location'].apply(lambda x: deEmojify(x))
# Let's take a look at the updated text
df
#First we're going to check this column
df.keyword.describe()
# get all the unique values in the 'keyword' column
keywords = df['keyword'].unique()
# sort them alphabetically and then take a closer look
keywords.sort()
keywords
#Change the dtype of the column to str first (missing keywords become the string 'nan', so .replace works on every row)
df.keyword = df.keyword.astype(str)
#Remove the "%20" from the values in the 'keyword' column
df['keyword'] = df['keyword'].apply(lambda x: x.replace("%20"," "))
# get all the unique values in the 'keyword' column
keywords = df['keyword'].unique()
# sort them alphabetically and then take a closer look
keywords.sort()
keywords
#Wrap every value in the "location" column in quotation marks (this will allow us to treat the
# emoji flags later with the emoji library).
df.location = df.location.astype(str)
df['location'] = "'" + df['location'] + "'"
# Let's take a look at the updated text
df
#Transform the emoji flags in the location column
def emojiflag_to_text(flag):
    '''Convert an emoji flag to its text name. An emoji flag is made of two Unicode
    regional-indicator letters (for example 🇲🇽 -> MX), so its length is 2; since every value
    was wrapped in single quotes, a value that is just an emoji flag has length 4.'''
    if len(flag) == 4:
        return emoji.demojize(flag)
    else:
        return flag
# Applying the transform function
df['location'] = df['location'].apply(lambda x: emojiflag_to_text(x))
# Let's take a look at the updated text
df
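# Quick illustrative check of what demojize does to a quoted flag; the exact name
# (':Mexico:' here) comes from the emoji library's data, so treat it as indicative.
print(emoji.demojize("'🇲🇽'"))  # expected: something like "':Mexico:'"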
#Remove the single quotes from all the values in the "location" column.
df['location'] = df['location'].apply(lambda x: x.replace("'",""))
#Remove the ":" left over from the demojized flag names
df['location'] = df['location'].apply(lambda x: x.replace(":",""))
#Replace the literal string "nan" (produced by astype(str)) with an empty string
df['location'] = df['location'].apply(lambda x: x.replace("nan",""))
#Remove all the emojis (other than the already-converted flags) that remain in the 'location' column
df['location'] = df['location'].str.replace(r'[^\w\s#@/:%.,_-]', '', regex=True, flags=re.UNICODE)
# Let's take a look at the updated text
df
df.info()
print('\n')
print('Data Type')
print('__'*12)
df.dtypes
# After astype(str), missing locations are empty strings rather than NaN, so count those instead
nul_locations = (df['location'] == '').sum()
print(f'We have {nul_locations} missing locations')
print('Data Type Converted')
print('__'*12)
df.convert_dtypes().dtypes
# Use nunique to see how many distinct values each column contains
df.nunique()
# Let's see if there is any duplicated text
dup_text = df['text'].duplicated().sum()
print(f'We have {dup_text} duplicated texts')
# Describe the length of the texts: the longest and shortest tweets, and perhaps a relation with the target.
df['length'] = df['text'].str.len()
df['length'].describe()
# Histogram with the distribution of the Tweets
plt.hist(df.length)
plt.xlabel('Length of Tweet')
plt.ylabel('# of Tweets')
plt.show()
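# Sketch of the length/target relation mentioned above (assumes seaborn>=0.11, where histplot exists)
sns.histplot(data=df, x='length', hue='target', bins=30)
plt.xlabel('Length of Tweet')
plt.show()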
# Table with the highest and lowest lengths
display(df[df['length']>130])
print('\n')
display(df[df['length']<20])
location = df['location'].value_counts()
location[location>=10][:20]
# We are going to group some places, so we create a mapping dictionary to replace the locations
mapping = {'United States':'USA',
'New York':'USA',
"London":'UK',
"Los Angeles, CA":'USA',
"Washington, D.C.":'USA',
"California":'USA',
"Chicago, IL":'USA',
"Chicago":'USA',
"New York, NY":'USA',
"California, USA":'USA',
"FLorida":'USA',
"Nigeria":'Africa',
"Kenya":'Africa',
"Everywhere":'Worldwide',
"San Francisco":'USA',
"Florida":'USA',
"United Kingdom":'UK',
"Los Angeles":'USA',
"Toronto":'Canada',
"San Francisco, CA":'USA',
"NYC":'USA',
"Seattle":'USA',
"Earth":'Worldwide',
"Ireland":'UK',
"London, England":'UK',
"New York City":'USA',
"Texas":'USA',
"London, UK":'UK',
"Atlanta, GA":'USA',
"England, United Kingdom":'UK',
"Mumbai, India":'India',
"Melbourne,Victoria":'Australia'}
# Replace the location with its group name if it appears as a key in the mapping dictionary
df['location'] = df['location'].apply(lambda i: mapping.get(i, i))
# save df
df.to_csv('df_transform.csv', index=False)
# Now the grouped values show up in the counts (dropna() returns a new frame, so assign it)
ndf = df.dropna()
location = ndf['location'].value_counts()
location[location>=10]
localizator = Nominatim(user_agent='tweets-analysis') # Creation of the agent
location = list(location.index) #We keep just the name of the cities
# The geocoding below is kept commented out (it is slow and repeatedly calls the Nominatim service); the coordinates were saved to coords.csv and are loaded further down.
# geolocated = list(map(lambda x: [x,localizator.geocode(x,timeout=None)[1] if localizator.geocode(x,timeout=None) else None],location))
# geolocated = pd.DataFrame(geolocated)
# geolocated.columns = ['locat','latlong']
# try:
# geolocated['lat'] = geolocated.latlong.apply(lambda x: x[0])
# geolocated['lon'] = geolocated.latlong.apply(lambda x: x[1])
# geolocated.drop('latlong',axis=1, inplace=True)
# except:
# pass
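# Sketch of how coords.csv could have been produced (an assumption about the original workflow):
# geopy's RateLimiter throttles the calls so the free Nominatim service is queried at most
# once per second. Kept commented out because the results are already saved in coords.csv.
# from geopy.extra.rate_limiter import RateLimiter
# geocode = RateLimiter(localizator.geocode, min_delay_seconds=1)
# rows = []
# for place in location:
#     result = geocode(place)
#     rows.append([place, (result.latitude, result.longitude) if result else None])
# pd.DataFrame(rows, columns=['locat', 'latlong']).to_csv('coords.csv')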
geolocated = pd.read_csv('coords.csv')
geolocated.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1, inplace=True)
geolocated['latlong']=geolocated.latlong.apply(lambda x: x[1:-1].split(','))
geolocated['lat'] = geolocated.latlong.apply(lambda x: x[0])
geolocated['lon'] = geolocated.latlong.apply(lambda x: x[1])
geolocated.drop('latlong',axis=1, inplace=True)
geolocated['lat']=pd.to_numeric(geolocated['lat'])
geolocated['lon']=pd.to_numeric(geolocated['lon'])
world_map = gpd.read_file('World_Countries.shp')
fig,ax = plt.subplots(figsize=(15,15))
world_map.plot(ax=ax)
crs = 'EPSG:4326'  # WGS84 lat/lon (the {'init': ...} dict form is deprecated)
geometry = [Point(xy) for xy in zip(geolocated['lon'],geolocated['lat'])]
geo_df = gpd.GeoDataFrame(geolocated,crs=crs,geometry=geometry)
geo_df.head()
fig,ax = plt.subplots(figsize=(25,25))
ax.set(facecolor='powderblue')
ax.set_alpha(0.2)
world_map.plot(ax=ax, alpha=1, color='white')
geo_df.plot(ax=ax,markersize=30,color='darkviolet',marker='o',alpha=0.3)
countries_mask = df['location'].value_counts()
countries_mask
# countries_mask.index[1] should be 'USA' after the grouping above (index 0 is the empty location string)
df_usa = df[df['location'] == countries_mask.index[1]]
df_usa
plt.rcParams['font.size'] = 15
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['figure.subplot.bottom'] = .1
wc_stopwords = set(STOPWORDS)  # renamed so it does not shadow nltk.corpus.stopwords
wordcloud = WordCloud(
    background_color='white',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_usa['text']))  # join the tweets; str(Series) would only keep a truncated preview
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_usa_positive = df_usa[df_usa['target'] == 1]
df_usa_positive
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_usa_positive['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_usa_negative = df_usa[df_usa['target'] == 0]
df_usa_negative
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='white',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_usa_negative['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
# countries_mask.index[2] should be 'UK' after the grouping above
df_uk = df[df['location'] == countries_mask.index[2]]
df_uk
plt.rcParams['font.size'] = 15
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['figure.subplot.bottom'] = .1
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_uk['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_uk_positive = df_uk[df_uk['target'] == 1]
df_uk_positive
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_uk_positive['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_uk_negative = df_uk[df_uk['target'] == 0]
df_uk_negative
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_uk_negative['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
# countries_mask.index[3] should be 'India' after the grouping above
df_india = df[df['location'] == countries_mask.index[3]]
df_india
plt.rcParams['font.size'] = 15
plt.rcParams['savefig.dpi'] = 100
plt.rcParams['figure.subplot.bottom'] = .1
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='white',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_india['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_india_positive = df_india[df_india['target'] == 1]
df_india_positive
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_india_positive['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
df_india_negative = df_india[df_india['target'] == 0]
df_india_negative
wc_stopwords = set(STOPWORDS)
wordcloud = WordCloud(
    background_color='gray',
    stopwords=wc_stopwords,
    max_words=20,
    max_font_size=40,
    random_state=42,
).generate(' '.join(df_india_negative['text']))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.words_
x = df['target'].value_counts().index
y = df['target'].value_counts()
sns.barplot(x=x ,y=y)
# Let's see what a disaster tweet looks like (target = 1)
disaster_tweets=df[df['target']==1]['text']
disaster_tweets.values[5]
# And what a non-disaster tweet looks like (target = 0)
nondisaster_tweets=df[df['target']==0]['text']
nondisaster_tweets.values[10]
# Applying the text cleaning again (same function as above; re-running it on already-clean text is harmless)
def clean_text(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation
    and remove words containing numbers.'''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text
# Applying the cleaning function to the dataset
df['text'] = df['text'].apply(lambda x: clean_text(x))
# Let's take a look at the updated text
df['text'].head()
#Tokenize the dataset
tokenizer=nltk.tokenize.RegexpTokenizer(r'\w+')
df['text']=df['text'].apply(lambda x:tokenizer.tokenize(x))
df['text'].head()
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
def remove_stopwords(text):
    """
    Remove English stopwords from a list of tokens.
    """
    stop_words = set(stopwords.words('english'))  # build the set once instead of scanning a list per word
    words = [w for w in text if w not in stop_words]
    return words
df['text'] = df['text'].apply(lambda x : remove_stopwords(x))
df.head()
# After preprocessing, the text format
def combine_text(list_of_text):
'''Takes a list of text and combines them into one large chunk of text.'''
combined_text = ' '.join(list_of_text)
return combined_text
df['text'] = df['text'].apply(lambda x : combine_text(x))
df.head()
# Splitting the data into independent and dependent features
X=df['text']
y=df['target']
X.head()
# We need data to train the model and data to test it; here we hold out 20% of the data for testing
X_train,X_test,y_train,y_test = model_selection.train_test_split(X,y,test_size=0.2,random_state=1)
#Convert a collection of text documents to a matrix of token counts
vectorizer=CountVectorizer()
x_train_vectors=vectorizer.fit_transform(X_train)
x_test_vectors=vectorizer.transform(X_test)
X_train.head()
# todense() shows the sparse document-term matrix as a regular matrix of token counts
x_train_vectors.todense()
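# Optional sketch: TF-IDF weighting is a common alternative to raw counts. TfidfVectorizer
# comes from the same sklearn module; these variables are illustrative and not used by the model below.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x_train_tfidf = tfidf.fit_transform(X_train)
x_test_tfidf = tfidf.transform(X_test)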
clf_naive=MultinomialNB(alpha=0.2,fit_prior=False)
clf_naive.fit(x_train_vectors,y_train)
pred=clf_naive.predict(x_test_vectors)
accuracy_score_train=metrics.accuracy_score(y_train,clf_naive.predict(x_train_vectors))
print(f'Accuracy score train: {accuracy_score_train}')
accuracy_score_test=metrics.accuracy_score(y_test,pred)
print(f'Accuracy score test {accuracy_score_test}')
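# Sketch of a cross-validated estimate on the full corpus, using a Pipeline so the vectorizer
# is refit inside each fold (the pipeline and step names here are illustrative):
from sklearn.pipeline import Pipeline
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('nb', MultinomialNB(alpha=0.2, fit_prior=False))])
cv_scores = model_selection.cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print(f'Cross-validated accuracy: {cv_scores.mean():.3f} +/- {cv_scores.std():.3f}')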
print('_____'*10+'\n')
classification_report_train=metrics.classification_report(y_train,clf_naive.predict(x_train_vectors))
print(f'-Classification report train:\n {classification_report_train}')
classification_report_test=metrics.classification_report(y_test,pred)
print(f'-Classification report test:\n {classification_report_test}')
roc_auc_score_train=metrics.roc_auc_score(y_train,clf_naive.predict(x_train_vectors))
print(f'Area Under the ROC Curve (train): {roc_auc_score_train}')
roc_auc_score_test=metrics.roc_auc_score(y_test,pred)
print(f'Area Under the ROC Curve (test): {roc_auc_score_test}')
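# ROC AUC is usually computed from predicted probabilities rather than hard class labels;
# a sketch using predict_proba (column 1 is the probability of the positive class):
proba_test = clf_naive.predict_proba(x_test_vectors)[:, 1]
print(f'Area Under the ROC Curve (test, from probabilities): {metrics.roc_auc_score(y_test, proba_test)}')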
print('_____'*10+'\n')
confusion_matrix_train=metrics.confusion_matrix(y_train,clf_naive.predict(x_train_vectors))
print(f'Confusion Matrix Train:\n {confusion_matrix_train}')
confusion_matrix_test=metrics.confusion_matrix(y_test,pred)
print(f'Confusion Matrix Test:\n {confusion_matrix_test}')
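# seaborn is imported above but not used yet; a sketch of the test confusion matrix as a heatmap:
sns.heatmap(confusion_matrix_test, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()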