Exploratory Data Analysis (EDA) for a Classification Problem

import pandas as pd import matplotlib.pyplot as plt import seaborn as sns

# Load the training data.
dataset = pd.read_csv("/work/train.csv")

# Move the class variable 'OUTCOME' to the last column. The insert position is
# taken from the frame itself instead of a hard-coded index, so the column
# lands last no matter how many feature columns the CSV contains.
outcome = dataset.pop("OUTCOME")
dataset.insert(len(dataset.columns), "OUTCOME", outcome)

# Preview the first ten rows of the dataset.
dataset.head(n=10)

# Report the dimensions of the dataset: the shape tuple, then rows and columns.
n_rows, n_cols = dataset.shape
shape_message = 'Shape of the given dataset is {}.Contains {} rows and {} columns '
print(shape_message.format(dataset.shape, n_rows, n_cols))

# Print the per-column summary of the dataset.
dataset.info()

# Keep only the numeric columns of the dataset.
numerical_data = dataset.select_dtypes(include='number')

# Collect the names of the numeric columns in a plain list.
numerical_features = list(numerical_data.columns)

print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)

# Keep only the categorical (object-dtype) columns of the dataset.
categorical_data = dataset.select_dtypes(include='object')

# Collect the names of the categorical columns in a plain list.
categorical_features = categorical_data.columns.tolist()

# The message names the categorical features (it previously said "numerical").
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

# Descriptive statistics of the numeric features, transposed so that each
# feature occupies one row.
numeric_summary = numerical_data.describe()
numeric_summary.T

# Descriptive statistics of the categorical (object-dtype) features,
# transposed the same way.
categorical_summary = categorical_data.describe(include='object')
categorical_summary.T

# Variance of each numeric feature.
numerical_data.var(axis=0)

# Skewness of each numeric feature.
numerical_data.skew(axis=0)

# Draw one histogram per numeric feature (20 bins each) on a single figure.
hist_options = {"figsize": (12, 12), "bins": 20}
numerical_data.hist(**hist_options)
plt.show()

# Number of distinct values in each numeric feature.
numerical_data.nunique(axis=0)

# Number of missing values in each numeric feature.
numerical_data.isnull().sum(axis=0)

# Report how many distinct values each categorical feature takes.
unique_counts = categorical_data.nunique()

# Iterate over the Series itself instead of a hard-coded range(7): this works
# for any number of categorical features and avoids integer-position lookups
# on a string-labelled Series.
for position, (feature, n_unique) in enumerate(unique_counts.items(), start=1):
    print(position, "{a} has {b} unique values".format(a=feature, b=n_unique))

# For every categorical feature, draw a bar chart of how often each of its
# values occurs, with the values ordered along the x-axis.
for col in categorical_features:
    print()
    print(f"\033[1m{col}\033[0m\n")  # bold column name above the plot

    level_counts = categorical_data[col].value_counts().sort_index()
    level_counts.plot(kind='bar', rot=0, xlabel=col, ylabel='count')
    plt.show()

# Pairwise Pearson correlation coefficients between the numeric features.
from pandas import set_option

# Use the fully-qualified option name: the bare "precision" pattern can match
# more than one pandas option (e.g. "styler.format.precision") and then raises.
set_option("display.precision", 3)

# Restrict to numeric columns; the dataset also holds object-dtype columns,
# which cannot take part in a correlation.
correlation = dataset.select_dtypes(include='number').corr(method='pearson')
correlation

# Plot the correlation matrix of the numeric features as an annotated heatmap.
fig, ax = plt.subplots(figsize=(10, 6))

# Only numeric columns are correlated; the dataset also holds object-dtype
# columns, which corr() cannot process.
numeric_correlation = dataset.select_dtypes(include='number').corr()
sns.heatmap(numeric_correlation, ax=ax, annot=True)
plt.show()

# Correlation of every numeric feature with OUTCOME, computed with three
# different methods and sorted from strongest positive to strongest negative.
corr1, corr2, corr3 = (
    numerical_data.corr(method_name)[['OUTCOME']].sort_values(by='OUTCOME', ascending=False)
    for method_name in ('pearson', 'spearman', 'kendall')
)

# One stacked panel per method.
fig, ax = plt.subplots(3, 1, figsize=(12, 12))
panel_titles = ('Pearson method', 'spearman method', 'Kendall method')

# Title each panel, then draw its annotated heatmap.
for panel, panel_title, outcome_corr in zip(ax, panel_titles, (corr1, corr2, corr3)):
    panel.set_title(panel_title)
    sns.heatmap(outcome_corr, ax=panel, annot=True)

plt.show()

# Check class imbalance: count the rows in each class of the OUTCOME variable
# and express each count as a percentage of all rows.
class_counts = dataset.groupby("OUTCOME").size()

# Derive the total from the data instead of hard-coding the row count, and
# take the class labels from the grouped result instead of assuming [0, 1].
total_rows = len(dataset)

imbalance_df = pd.DataFrame(
    {
        'outcome': class_counts.index.tolist(),
        'count': class_counts.tolist(),
        'percentage': [(n / total_rows) * 100 for n in class_counts.tolist()],
    },
    columns=['outcome', 'count', 'percentage'],
)
imbalance_df

# Bar chart of the percentage of rows in each OUTCOME class.
sns.barplot(data=imbalance_df, x='outcome', y='percentage')
plt.show()

# Density plot of every numeric feature, one subplot per feature on a
# 6x2 grid, each with its own x-axis.
density_options = {
    "kind": 'density',
    "figsize": (14, 14),
    "subplots": True,
    "layout": (6, 2),
    "title": "Density plot of Numerical features",
    "sharex": False,
}
numerical_data.plot(**density_options)
plt.show()