import pandas as pd
import numpy as np
!pip install plotly
pd.options.plotting.backend = "plotly"
import plotly.io as pio
import plotly.express as px
pio.templates.default = "plotly_white"
# quick sanity check that the plotly backend is wired up correctly
TestDF = pd.DataFrame(np.random.randn(100, 5),
                      index=pd.date_range('1/1/15', periods=100),
                      columns=['IBM', 'MSFT', 'GOOG', 'VZ', 'AAPL'])
TestDF.cumsum().plot()
sample_submission = pd.read_csv("/datasets/real-or-not-dataset/sample_submission.csv")
test = pd.read_csv("/datasets/real-or-not-dataset/test.csv")
train = pd.read_csv("/datasets/real-or-not-dataset/train.csv")
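A quick shape check confirms the three files loaded as expected; a minimal sanity-check sketch using the frames just read in:
# row/column counts for each file
print("train:", train.shape, "test:", test.shape, "sample_submission:", sample_submission.shape)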
print(train.target.describe())
train.head()
count    7613.00000
mean        0.42966
std         0.49506
min         0.00000
25%         0.00000
50%         0.00000
75%         1.00000
max         1.00000
Name: target, dtype: float64
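Because target is binary, the mean of 0.42966 reads directly as a class balance: roughly 43% of tweets are labeled real. The same split can be read off in one call; a minimal sketch using only the train frame loaded above:
# fraction of tweets per class; target == 1 marks a real disaster tweet
print(train.target.value_counts(normalize=True))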
false_true_counts = train.target.value_counts().rename_axis('target').reset_index(name='counts')
false_true_counts.plot(x='target', y='counts', kind='bar', title='Distribution of Real and Fake tweets')
Cleaning the Data
nan_rows = train[train['location'].isnull()]
nan_rows
is_notreal = nan_rows[nan_rows['target']==0].shape[0]  # target == 1 means real
notReal_noloc = is_notreal / nan_rows.shape[0]
print("Non-real tweets without location: {0:0.2f}%".format(notReal_noloc * 100))
Non-real tweets without location: 57.56%
nan_rows.count()
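The same check extends to every column at once; a minimal sketch of overall missingness, using nothing beyond the train frame:
# null counts per column across the whole training set
print(train.isnull().sum())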
nan_keywords = train[train['keyword'].isnull()]
print("Total tweets without keyword: {}".format(nan_keywords.shape[0]))
Total tweets without keyword: 61
real_nokeyword = (nan_keywords[nan_keywords['target']==1].shape[0]/nan_keywords.shape[0])
print("Real tweets without keyword: {0:0.2f}%".format((real_nokeyword*100)))
Real tweets without keyword: 68.85%
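Missing keywords are rare: 61 of 7613 rows is under 1%, so blank-filling them later should barely affect any aggregate. A quick sketch of that share, reusing nan_keywords from above:
# share of rows with no keyword (61 / 7613 ≈ 0.80%)
print("{0:0.2f}% of tweets lack a keyword".format(100 * nan_keywords.shape[0] / train.shape[0]))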
locations = train[train.location.notnull()]
locations
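The raw location values are free text (nicknames, regions, entries like "Worldwide"), which is why a gazetteer is needed before any country-level aggregation. A quick look at the most frequent raw strings, as a minimal sketch:
# most common raw location strings; note how free-form they are
print(train.location.value_counts().head(10))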
!pip install flashgeotext
# flashgeotext extracts city and country names from free-text location strings
from flashgeotext.geotext import GeoText
geotext = GeoText(use_demo_data=True)
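geotext.extract returns a nested dict keyed by 'cities' and 'countries', where each matched name maps to a dict that includes a 'count' field; the loops below rely on that structure. A tiny illustration on a made-up string:
# made-up example input, only to show the shape of the result
sample = geotext.extract(input_text='Floods in Sydney, Australia', span_info=False)
print(sample['countries'])  # each matched country maps to a dict with a 'count' entry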
true_locations = train[train['target']==1]
false_locations = train[train['target']==0]
all_true_locations = true_locations.location.str.cat(sep=' ')  # concatenate all location strings
geo_dict_true = geotext.extract(input_text=all_true_locations, span_info=False)
temp_dict_1 = {}
for key in geo_dict_true['countries']:
    temp_dict_1[str(key)] = int(geo_dict_true['countries'][key]['count'])
count_true_country_df = pd.DataFrame(temp_dict_1.items(), columns=['Country', 'Occurrences'])
all_false_locations = false_locations.location.str.cat(sep=' ')  # concatenate all location strings
geo_dict_false = geotext.extract(input_text=all_false_locations, span_info=False)
temp_dict_2 = {}
for key in geo_dict_false['countries']:
    temp_dict_2[str(key)] = int(geo_dict_false['countries'][key]['count'])
count_false_country_df = pd.DataFrame(temp_dict_2.items(), columns=['Country', 'Occurrences'])
count_true_country_df.sort_values(['Occurrences'], ascending=False, inplace=True)
count_true_country_df[:8].plot(x='Country', y='Occurrences', kind='bar', title="Top countries with real tweets")
count_false_country_df.sort_values(['Occurrences'], ascending=False, inplace=True)
count_false_country_df[:8].plot(x='Country', y='Occurrences', kind='bar', title="Top countries with non-real tweets")
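Two separate charts make the classes hard to compare directly; a minimal sketch that merges the frames built above into one grouped bar chart (the column suffixes are ours):
# merge real/non-real counts on country and plot side by side
merged = (count_true_country_df.merge(count_false_country_df, on='Country',
                                      suffixes=('_real', '_not_real'))
          .sort_values('Occurrences_real', ascending=False).head(8))
px.bar(merged, x='Country', y=['Occurrences_real', 'Occurrences_not_real'],
       barmode='group', title='Country mentions by class')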
!pip install nltk
import nltk
nltk.download('punkt')      # tokenizer models required by word_tokenize
nltk.download('stopwords')  # stopword lists
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
train['keyword'] = train['keyword'].fillna('')    # avoids chained-assignment warnings
train['location'] = train['location'].fillna('')
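In the Kaggle release of this dataset, multi-word keywords are URL-encoded (e.g. 'airplane%20accident'); if that holds for your copy, decoding them before tokenizing keeps those phrases intact. An optional sketch:
from urllib.parse import unquote
# 'airplane%20accident' -> 'airplane accident'; a no-op if keywords are already plain text
train['keyword'] = train['keyword'].map(unquote)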
keyword_string = train.keyword.str.cat(sep=' ')
tokenized_word = word_tokenize(keyword_string)
stemmer = SnowballStemmer("english", ignore_stopwords=True)
clean_tokenized_word = [stemmer.stem(word) for word in tokenized_word]           # stem first...
clean_tokenized_word = [w for w in clean_tokenized_word if w not in stop_words]  # ...then drop stopwords
fdist = nltk.FreqDist(clean_tokenized_word)
fd = pd.DataFrame(fdist.most_common(30), columns=["Word", "Frequency"]).drop([0]).reset_index(drop=True)  # drop the top row (likely a tokenization artifact)
px.bar(fd, x='Word', y='Frequency', title='Frequency of top keywords')
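The same frequency view is more telling split by label; a minimal sketch that reuses the cleaning steps above (the helper name is ours):
def top_keywords(df, n=15):
    # tokenize, stem, and drop stopwords for one subset, mirroring the steps above
    tokens = word_tokenize(df.keyword.str.cat(sep=' '))
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [w for w in tokens if w not in stop_words]
    return pd.DataFrame(nltk.FreqDist(tokens).most_common(n), columns=['Word', 'Frequency'])

px.bar(top_keywords(train[train.target == 1]), x='Word', y='Frequency',
       title='Top keywords among real tweets')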
train.head()