# import important modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["axes.labelsize"] = 18
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Load the dataset from disk into a DataFrame.
data = pd.read_csv('/work/data/data.csv')
# Report the (rows, columns) dimensions of the loaded frame.
print('train data shape :', data.shape)
# Inspect the data by showing the first five rows.
data.head()
# Show the list of column names.
list(data.columns)
# Show summary information about the dataset. DataFrame.info() writes its
# report to stdout itself and returns None, so it is called directly rather
# than wrapped in print(), which would emit a stray "None" after the report.
data.info()
# Count the missing values in each column.
print('missing values:', data.isnull().sum())
# Frequency table of the target variable: the count of each bank_account category.
data['bank_account'].value_counts()
# Univariate distributions: draw one count plot per column, starting with
# the target (bank_account) and continuing through the remaining columns
# in the same order they were explored originally.
_count_columns = (
    "bank_account",
    "country",
    "location_type",
    "year",
    "cellphone_access",
    "gender_of_respondent",
)
for _count_column in _count_columns:
    sns.catplot(x=_count_column, kind="count", data=data)
# Univariate distributions for the columns whose tick labels are drawn
# rotated: each count plot is followed by the same x-tick restyling.
_rotated_tick_style = {
    "rotation": 45,
    "horizontalalignment": 'right',
    "fontweight": 'light',
    "fontsize": 'x-large',
}
_rotated_count_columns = (
    "relationship_with_head",
    "marital_status",
    "education_level",
    "job_type",
)
for _rotated_column in _rotated_count_columns:
    sns.catplot(x=_rotated_column, kind="count", data=data)
    # Applies to the current axes, i.e. the plot that was just created.
    plt.xticks(**_rotated_tick_style)
# Histograms of household_size and age_of_respondent: each gets its own
# wide figure and an x-axis label.
_histogram_specs = (
    ('household_size', 'Household size'),
    ('age_of_respondent', 'Age of Respondent'),
)
for _hist_column, _hist_label in _histogram_specs:
    plt.figure(figsize=(16, 6))
    data[_hist_column].hist()
    plt.xlabel(_hist_label)
# Bivariate view: for each column below, plot its category counts split by
# the bank_account target, one wide figure per column.
_plain_tick_style = {"fontweight": 'light', "fontsize": 'x-large'}
_versus_target_columns = (
    'location_type',
    'gender_of_respondent',
    'cellphone_access',
)
for _versus_column in _versus_target_columns:
    plt.figure(figsize=(16, 6))
    sns.countplot(x=_versus_column, hue='bank_account', data=data)
    # Restyle the x tick labels of the figure that was just drawn.
    plt.xticks(**_plain_tick_style)
# Bivariate view for the columns whose tick labels are drawn rotated: plot
# each column's category counts split by the bank_account target, then
# rotate and right-align the x tick labels.
_slanted_tick_style = {
    "rotation": 45,
    "horizontalalignment": 'right',
    "fontweight": 'light',
    "fontsize": 'x-large',
}
_slanted_versus_columns = (
    'relationship_with_head',
    'marital_status',
    'education_level',
    'job_type',
)
for _slanted_column in _slanted_versus_columns:
    plt.figure(figsize=(16, 6))
    sns.countplot(x=_slanted_column, hue='bank_account', data=data)
    plt.xticks(**_slanted_tick_style)