import pandas as pd #for data manipulations
import numpy as np # for computations
import seaborn as sns # for visualization
import matplotlib.pyplot as plt # for visualizations
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Cap the displayed width of each column at 50 characters; longer cell
# values are truncated in the rendered output.
pd.set_option("display.max_colwidth", 50)
Loading data
# Read the main dataset from disk into a DataFrame.
data = pd.read_csv(filepath_or_buffer="data/df.csv")
# Peek at the first five rows to get a feel for the columns.
data.head(n=5)
As we can see, the columns of our data frame already have clear, standard names whose meaning is easy to understand, so there is no need to rename them.
# Inspect the dimensions of the dataset as a (rows, columns) tuple.
data.shape
The dataset contains 23524 records and 13 columns.
# To understand what each column means, load the variable definition file.
var_def = pd.read_csv(filepath_or_buffer="data/VariableDefinitions.csv")
# Display it transposed (rows and columns swapped).
var_def.transpose()
`var_def.T` shows the table transposed (rows and columns swapped), which lets us read each record in a compact space instead of a large, wide one.
# Print a summary of the dataset (columns, non-null counts, dtypes).
data.info()
# Count missing values per column; the result is True if any column has
# at least one missing value.
data.isna().sum().any()
Univariate Analysis
# Histogram of the age_of_respondent column on a wide 14x7 figure.
plt.figure(figsize=(14, 7))
data["age_of_respondent"].hist()
plt.xlabel("age_of_respondent")
plt.title("Age of Respondent Distribution")
plt.show()
# Histogram of the household_size column on a wide 14x7 figure.
plt.figure(figsize=(14, 7))
data["household_size"].hist()
plt.xlabel("household_size")
plt.title("Household Size Distribution")
plt.show()
# Count plot of the target column: number of rows per bank_account value.
sns.catplot(data=data, x="bank_account", kind="count")
plt.title("Bank Account Distribution")
# Count plot: number of rows per gender_of_respondent value.
sns.catplot(data=data, x="gender_of_respondent", kind="count")
plt.title("Gender of Respondent Distribution")
# Count plot: number of rows per country.
sns.catplot(data=data, x="country", kind="count")
plt.title("Country of Respondent Distribution")
# Count plot: number of rows per year.
sns.catplot(data=data, x="year", kind="count")
plt.title("Year Distribution")
# Count plot: number of rows per location_type value.
sns.catplot(data=data, x="location_type", kind="count")
plt.title("Location Distribution")
# Count plot: number of rows per cellphone_access value.
sns.catplot(data=data, x="cellphone_access", kind="count")
plt.title("Cellphone Access Distribution")
# Count plot: number of rows per education_level value.
sns.catplot(data=data, x="education_level", kind="count")
plt.title("Education Level Distribution")
# Tilt the category labels so long names do not overlap.
plt.xticks(
    rotation=45,
    horizontalalignment="right",
    fontweight="light",
    fontsize="x-large",
)
# Count plot: number of rows per job_type value.
sns.catplot(data=data, x="job_type", kind="count")
plt.title("Job Type Distribution")
# Tilt the category labels so long names do not overlap.
plt.xticks(
    rotation=45,
    horizontalalignment="right",
    fontweight="light",
    fontsize="x-large",
)
# Count plot: number of rows per marital_status value.
sns.catplot(data=data, x="marital_status", kind="count")
plt.title("Marital Status Distribution")
# Tilt the category labels so long names do not overlap.
plt.xticks(
    rotation=45,
    horizontalalignment="right",
    fontweight="light",
    fontsize="x-large",
)