# import important modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["axes.labelsize"] = 18
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the dataset from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='/work/data/data.csv')
# Report the table dimensions as a (rows, columns) tuple
print('train data shape :', data.shape)
train data shape : (23524, 13)
# Inspect the data by showing the first five rows
data.head()
countryobject
yearint64
0
Kenya
2018
1
Kenya
2018
2
Kenya
2018
3
Kenya
2018
4
Kenya
2018
# Show the list of column names
list(data.columns)
# Show summary information about the dataset: index range, per-column
# non-null counts and dtypes, and memory usage.
# NOTE: info() writes its report itself and returns None, so wrapping it in
# print() emits an extra "None" line after the report.
print(data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23524 entries, 0 to 23523
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 country 23524 non-null object
1 year 23524 non-null int64
2 uniqueid 23524 non-null object
3 bank_account 23524 non-null object
4 location_type 23524 non-null object
5 cellphone_access 23524 non-null object
6 household_size 23524 non-null int64
7 age_of_respondent 23524 non-null int64
8 gender_of_respondent 23524 non-null object
9 relationship_with_head 23524 non-null object
10 marital_status 23524 non-null object
11 education_level 23524 non-null object
12 job_type 23524 non-null object
dtypes: int64(3), object(10)
memory usage: 2.3+ MB
None
# Tally the null entries in each column (isna is the canonical name of isnull)
print('missing values:', data.isna().sum())
missing values: country 0
year 0
uniqueid 0
bank_account 0
location_type 0
cellphone_access 0
household_size 0
age_of_respondent 0
gender_of_respondent 0
relationship_with_head 0
marital_status 0
education_level 0
job_type 0
dtype: int64
# A frequency table gives the count of each category in the target variable
# (bank_account).
data['bank_account'].value_counts()
# Univariate distributions: one count plot per categorical column.
# Each catplot call creates its own figure, so the plots stay independent.

# Columns drawn with the default x tick labels.
for column in (
    "bank_account",  # the target
    "country",
    "location_type",
    "year",
    "cellphone_access",
    "gender_of_respondent",
):
    sns.catplot(x=column, kind="count", data=data)

# Columns drawn with x tick labels tilted 45 degrees and right-aligned
# (presumably because their category labels are long enough to overlap).
tilted_ticks = dict(
    rotation=45,
    horizontalalignment='right',
    fontweight='light',
    fontsize='x-large',
)
for column in (
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
):
    sns.catplot(x=column, kind="count", data=data)
    # plt.xticks restyles the tick labels of the axes that was just drawn
    plt.xticks(**tilted_ticks)
# Distributions of the numeric columns: one histogram per column, each on
# its own 16x6 figure with a readable x-axis label.
for column, axis_label in (
    ("household_size", 'Household size'),
    ("age_of_respondent", 'Age of Respondent'),
):
    plt.figure(figsize=(16, 6))
    data[column].hist()
    plt.xlabel(axis_label)
# Bivariate exploration: count plot of each categorical feature, with bars
# split by the target (bank_account). Every plot gets its own 16x6 figure.

# Tick styling shared by all plots ...
plain_ticks = dict(fontweight='light', fontsize='x-large')
# ... plus a 45-degree, right-aligned tilt for some features
# (presumably those whose category labels are long enough to overlap).
tilted_ticks = dict(rotation=45, horizontalalignment='right', **plain_ticks)

feature_tick_styles = (
    ('location_type', plain_ticks),
    ('gender_of_respondent', plain_ticks),
    ('cellphone_access', plain_ticks),
    ('relationship_with_head', tilted_ticks),
    ('marital_status', tilted_ticks),
    ('education_level', tilted_ticks),
    ('job_type', tilted_ticks),
)

for feature, tick_style in feature_tick_styles:
    plt.figure(figsize=(16, 6))
    sns.countplot(x=feature, hue='bank_account', data=data)
    # Restyle the x tick labels of the axes that countplot just drew on
    plt.xticks(**tick_style)