import pandas as pd
import numpy as np
import time
import seaborn as sns
import matplotlib.pylab as plt
%matplotlib inline
# Read the CSV at dataPath; its first column becomes the DataFrame index.
dataPath = 'nlp-getting-started/train.csv'
data = pd.read_csv(dataPath, index_col=0)
# Print a structural summary (column dtypes and non-null counts).
data.info()
# Non-null count per column. NOTE: as a bare expression this is only
# rendered when it is the last statement of a notebook cell.
data.count()
# Missing-value count per column (same display caveat as above).
data.isna().sum()
# Compare, column by column, the non-null counts of the full frame with the
# counts that remain after dropping every row containing a missing value.
dataNonNull = data.dropna()
# Compute both count Series once instead of once per loop iteration.
allCounts = data.count()
nonNullCounts = dataNonNull.count()
print(f'{"Columns":20}: {"All":10} {"NonNull":10} {"%NonNull":10} {"Difference"}')
for idx, col in enumerate(data.columns):
    # The count Series are indexed by column label, so positional access
    # must go through .iloc; a plain integer key is deprecated there.
    allValue = allCounts.iloc[idx]
    nonNullValue = nonNullCounts.iloc[idx]
    per = nonNullValue*100/allValue
    diff = allValue - nonNullValue
    # Every numeric field uses the 10-character width reserved by the header.
    print(f'{col:20}: {allValue:10} {nonNullValue:10} {np.round(per):10} {diff:10}')
# Replace missing keywords and locations with empty strings so that the
# later per-row string operations (e.g. len on 'keyword') get a str for
# every row.
# The result is assigned back to the column: calling fillna(inplace=True)
# on a column selection is a chained in-place update, which emits a
# FutureWarning in pandas 2.x and leaves `data` unchanged under
# Copy-on-Write.
data['keyword'] = data['keyword'].fillna("")
data['location'] = data['location'].fillna("")
# Name of the column that is converted into readable class labels below.
columnName = 'target'
#----------------------
def getCategoricalColumn(value):
    """Return "Disaster" when value equals 1, otherwise "Not disaster"."""
    return "Disaster" if value == 1 else "Not disaster"
# Label every row via getCategoricalColumn, name the resulting Series
# 'catTarget', and append it to the original columns in a new frame `df`.
CategoricalColumn = data[columnName].apply(getCategoricalColumn)
CategoricalColumn = CategoricalColumn.rename('catTarget')
df = pd.concat(objs=[data, CategoricalColumn], axis=1)
#----------------------
# One record per label with the number of rows carrying that label.
groups = [
    {CategoricalColumn.name: label, 'Count': len(rows)}
    for label, rows in df.groupby(by=CategoricalColumn.name)
]
lenData = data[columnName].count()
dataCategoricalQuality = pd.DataFrame(groups)

# Bar chart of the label counts.
fig, ax = plt.subplots(figsize=(4, 4))
dataCategoricalQuality.plot.bar(x=CategoricalColumn.name, ax=ax)

# Write "<count>: <rounded percentage>%" centred just above each bar.
for position, record in enumerate(groups):
    count = record['Count']
    share = np.round(count*100/lenData)
    ax.text(position, count, f'{count}: {share}%',
            horizontalalignment='center',
            verticalalignment='bottom')

# Cap the y axis at 80% of the non-null target count.
ax.set_ylim(0, lenData - lenData/5)
ax.set_xlabel('target')
ax.set_ylabel('Count')
ax.set_title(f'Sum: {lenData}')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
# Replace every literal '%20' in the keywords with a space.
data['keyword'] = data['keyword'].str.replace('%20', ' ', regex=False)

columnNameA = 'target'
columnNameB = 'keyword'
# Keyword-by-target contingency table; margins=True adds the 'All' totals.
crossTable = pd.crosstab(
    index=data[columnNameB],
    columns=data[columnNameA],
    margins=True,
)
crossTable = crossTable.rename(columns={0: 'Not disaster', 1: 'Disaster'})

# NOTE: the 'All' totals row is part of the table and therefore takes part
# in both rankings below.
print('Most frequent Keywords for Disaster')
crossTable.sort_values(by='Disaster', ascending=False).head(10)
print('Most frequent Keywords for Not disaster')
crossTable.sort_values(by='Not disaster', ascending=False).head(10)
# Number of characters in each keyword.
data['keywordLengthChar'] = data['keyword'].apply(len)

columnNameA = 'target'
columnNameB = 'keywordLengthChar'

# Box plot of keyword length for each target value.
sns.boxplot(x=columnNameA, y=columnNameB, data=data)
plt.tight_layout()
plt.show()

# One 30-bin histogram of keyword length per target value.
g = sns.FacetGrid(data, col=columnNameA)
g.map(sns.histplot, columnNameB, bins=30)
plt.tight_layout()
plt.show()
columnNameA = 'target'
columnNameB = 'location'
# Location-by-target contingency table; margins=True adds the 'All' totals.
crossTable = pd.crosstab(
    index=data[columnNameB],
    columns=data[columnNameA],
    margins=True,
)
crossTable = crossTable.rename(columns={0: 'Not disaster', 1: 'Disaster'})

# NOTE: the 'All' totals row is part of the table and therefore takes part
# in both rankings below.
print('Most frequent Locations for Disaster')
crossTable.sort_values(by='Disaster', ascending=False).head(10)
print('Most frequent Locations for Not disaster')
crossTable.sort_values(by='Not disaster', ascending=False).head(10)
# Number of characters in each text.
data['textLengthChar'] = data['text'].apply(len)

# Box plot of the character length over all rows.
columnName = 'textLengthChar'
ax = data[columnName].plot.box(figsize=(3, 4))
ax.set_ylabel(columnName)
plt.tight_layout()
plt.show()

columnNameA = 'target'
columnNameB = 'textLengthChar'

# One 30-bin histogram of character length per target value.
g = sns.FacetGrid(data, col=columnNameA)
g.map(sns.histplot, columnNameB, bins=30)
plt.tight_layout()
plt.show()

# Box plot of character length for each target value.
sns.boxplot(x=columnNameA, y=columnNameB, data=data)
plt.tight_layout()
plt.show()
def getWordTextLength(text):
    """Return the number of whitespace-separated tokens in text."""
    tokens = text.split()
    return len(tokens)
# Number of whitespace-separated tokens in each text.
data['textLengthWord'] = data['text'].apply(getWordTextLength)

# Box plot of the word count over all rows.
columnName = 'textLengthWord'
ax = data[columnName].plot.box(figsize=(3, 4))
ax.set_ylabel(columnName)
plt.tight_layout()
plt.show()

columnNameA = 'target'
columnNameB = 'textLengthWord'

# One 30-bin histogram of word count per target value.
g = sns.FacetGrid(data, col=columnNameA)
g.map(sns.histplot, columnNameB, bins=30)
plt.tight_layout()
plt.show()

# Box plot of word count for each target value.
sns.boxplot(x=columnNameA, y=columnNameB, data=data)
plt.tight_layout()
plt.show()
# Capture the first http(s) URL of each text; rows without one get NaN.
pattern = r'(https?://\S+)'
data['link'] = data["text"].str.extract(pattern, expand=False)
# Boolean flag: True where a URL was captured.
data['containLink'] = data['link'].notna()

columnNameA = 'target'
columnNameB = 'containLink'
# Link-presence-by-target contingency table; margins=True adds 'All' totals.
crossTable = pd.crosstab(
    index=data[columnNameB],
    columns=data[columnNameA],
    margins=True,
)
crossTable = crossTable.rename(
    columns={0: 'Not disaster', 1: 'Disaster'},
    index={False: 'No link', True: 'Link'},
)
# Share of each target class within every row, in percent of the row total.
for label in ('Not disaster', 'Disaster'):
    crossTable[f'{label} %'] = crossTable[label] * 100 / crossTable['All']
crossTable