#0 EDA - My machine learning pipeline
# Imports
import os
import numpy as np
import pandas as pd
from easydict import EasyDict as edict
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Config
# Pipeline settings, grouped under `config.main`. EasyDict converts nested
# plain dicts into EasyDicts, so attribute access (config.main.TRAIN_FILE)
# works on the result.
config = edict(
    {
        "main": {
            "TRAIN_FILE": "/work/train.csv",
            "TEST_FILE": "/work/test.csv",
            "TARGET_VAR": "target",
        },
    }
)
# Loading data
# Read the train and test CSV files from the paths stored in config.main.
df_train = pd.read_csv(config.main.TRAIN_FILE)
df_test = pd.read_csv(config.main.TEST_FILE)
# EDA
# First look at both frames: leading rows, column/dtype overview, and summary
# statistics. The results are not assigned to anything.
# NOTE(review): these look like former notebook cells; in a plain script the
# values returned by head() and describe() are discarded - confirm intent.
df_train.head()
df_test.head()
df_train.info()
df_test.info()
df_train.describe()
df_test.describe()
# TARGET VARIABLE
# Plot the distribution of the target column in the training data.
# countplot draws one bar per distinct target value (bar height = number of
# rows); barplot given only `x` would aggregate the column into a single bar.
# The column name comes from the config so it is defined in one place.
plt.figure(figsize=(10, 5))
sns.countplot(data=df_train, x=config.main.TARGET_VAR)
plt.show()
# Exact row count per target value, matching the bars plotted above.
df_train[config.main.TARGET_VAR].value_counts()
# MISSING VALUES
def missing_zero_values_table(df):
    """Summarise zero and missing values for each column of ``df``.

    Prints a two-line summary (frame shape and how many columns contain
    missing values) and returns a table indexed by the column names of ``df``.

    The returned table has these columns:

    * ``Zero Values`` - number of cells equal to 0
    * ``Missing Values`` - number of null cells
    * ``% of Total Values`` - missing cells as a percentage of the row count
    * ``Total Zero Missing Values`` - zero cells plus missing cells
    * ``% Total Zero Missing Values`` - the previous value as a percentage
      of the row count
    * ``Data Type`` - dtype of the column in ``df``

    Only columns with at least one missing value are kept. Rows are sorted by
    ``% of Total Values`` in descending order and rounded to one decimal.
    """
    n_rows = len(df)

    zero_val = (df == 0.00).astype(int).sum(axis=0)  # zeros per column
    mis_val = df.isnull().sum()  # nulls per column, computed once and reused
    mis_val_percent = 100 * mis_val / n_rows

    # `keys` names the columns directly instead of renaming 0/1/2 afterwards.
    mz_table = pd.concat(
        [zero_val, mis_val, mis_val_percent],
        axis=1,
        keys=['Zero Values', 'Missing Values', '% of Total Values'],
    )
    mz_table['Total Zero Missing Values'] = (
        mz_table['Zero Values'] + mz_table['Missing Values']
    )
    mz_table['% Total Zero Missing Values'] = (
        100 * mz_table['Total Zero Missing Values'] / n_rows
    )
    mz_table['Data Type'] = df.dtypes

    # Keep only columns that actually have missing values, worst first.
    has_missing = mz_table['Missing Values'] != 0
    mz_table = (
        mz_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(
        f"Your selected dataframe has {df.shape[1]} columns and "
        f"{df.shape[0]} Rows.\n"
        f"There are {mz_table.shape[0]} columns that have missing values."
    )
    return mz_table
# Build the zero/missing-value summary for the train and test frames.
# The returned tables are not assigned.
# NOTE(review): presumably meant to be displayed by a notebook - confirm.
missing_zero_values_table(df_train)
missing_zero_values_table(df_test)
# PANDAS PROFILING
# Build one profiling report per frame. Each report gets its own title so the
# train and test reports can be told apart (the test report was previously
# titled "Train" as well).
profile_train = ProfileReport(
    df_train,
    title='Pandas Train Profiling Report',
    html={'style': {'full_width': True}},
)
# Bare name; NOTE(review): presumably so a notebook cell renders the report.
profile_train
profile_test = ProfileReport(
    df_test,
    title='Pandas Test Profiling Report',
    html={'style': {'full_width': True}},
)
profile_test