import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the training data from train.csv.
dataset = pd.read_csv("/work/train.csv")
# Move 'OUTCOME' (the class variable) to the last column. The insert position
# is taken from the current column count, so the column lands last no matter
# how many feature columns the file contains.
outcome = dataset.pop("OUTCOME")
dataset.insert(len(dataset.columns), "OUTCOME", outcome)
dataset.head(10)
# Report the overall shape, then the per-column summary from DataFrame.info().
print('Shape of the given dataset is {}.Contains {} rows and {} columns '.format(dataset.shape, dataset.shape[0], dataset.shape[1]))
dataset.info()
# Select the numerical features and keep their column names in a list.
numerical_data = dataset.select_dtypes(include='number')
numerical_features = numerical_data.columns.tolist()
print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)
# Select the categorical (object-dtype) features and keep their column names in a list.
categorical_data = dataset.select_dtypes(include='object')
categorical_features = categorical_data.columns.tolist()
# Label this count as categorical so it cannot be confused with the numerical summary.
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)
# Descriptive statistics, transposed so that each feature is one row.
numerical_data.describe().transpose()
categorical_data.describe(include='object').transpose()
# Variance and skewness of every numerical feature.
numerical_data.var()
numerical_data.skew()
# Histogram (20 bins) of every numerical feature.
numerical_data.hist(bins=20, figsize=(12, 12))
plt.show()
# Distinct-value count and missing-value count per numerical feature.
numerical_data.nunique()
numerical_data.isna().sum()
# Number of unique values in each categorical feature.
unique_counts = categorical_data.nunique()
# Walk the (feature, count) pairs directly so the loop covers however many
# categorical features exist, with no positional indexing into the
# label-indexed Series.
for index, (feature, n_unique) in enumerate(unique_counts.items(), start=1):
    print(index, "{a} has {b} unique values".format(a=feature, b=n_unique))
# Draw one bar chart per categorical feature showing how often each value occurs.
for feature_name in categorical_features:
    print()
    # Column name in bold (ANSI escape codes), printed above its plot.
    print(f"\033[1m{feature_name}\033[0m\n")
    value_frequencies = categorical_data[feature_name].value_counts().sort_index()
    value_frequencies.plot(kind='bar', rot=0, xlabel=feature_name, ylabel='count')
    plt.show()
# Pearson correlation coefficients between the numerical features.
from pandas import set_option
# Use the fully qualified option name: the bare "precision" pattern can match
# more than one pandas option.
set_option("display.precision", 3)
# Correlation is only defined for numeric columns, so restrict to those
# explicitly instead of relying on corr() to skip the object columns.
correlation = dataset.select_dtypes(include='number').corr(method='pearson')
correlation
# Plot the correlation matrix as an annotated heatmap, reusing the matrix
# computed once.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation, ax=ax, annot=True)
plt.show()
fig, ax = plt.subplots(3, 1, figsize=(12, 12))
# Correlation of every numerical feature with OUTCOME under three methods,
# each sorted in descending order of the coefficient.
corr1, corr2, corr3 = [
    numerical_data.corr(method)[['OUTCOME']].sort_values(by='OUTCOME', ascending=False)
    for method in ('pearson', 'spearman', 'kendall')
]
# One titled, annotated heatmap per method, each on its own subplot.
panels = zip(
    ax,
    ('Pearson method', 'spearman method', 'Kendall method'),
    (corr1, corr2, corr3),
)
for axis, title, corr in panels:
    axis.set_title(title)
    sns.heatmap(corr, ax=axis, annot=True)
plt.show()
# Check for class imbalance in the data.
# Count the instances of each class in the OUTCOME variable.
class_counts = dataset.groupby("OUTCOME").size()
columns = ['outcome', 'count', 'percentage']
# Take the class labels and the row total from the data itself so the
# percentages stay correct for any dataset size and any set of classes.
total_rows = len(dataset)
outcome = class_counts.index.tolist()
count = class_counts.tolist()
# Share of each class as a percentage of all rows.
percentage = [(class_count / total_rows) * 100 for class_count in count]
# Collect the calculated values into a dataframe.
imbalance_df = pd.DataFrame(list(zip(outcome, count, percentage)), columns=columns)
imbalance_df
# Bar chart of the percentage of rows in each class.
sns.barplot(data=imbalance_df, x='outcome', y='percentage')
plt.show()
# Density plot of each numerical feature: one subplot per feature in a
# 6x2 grid, with x-axes not shared between subplots.
density_options = dict(
    kind='density',
    subplots=True,
    layout=(6, 2),
    sharex=False,
    figsize=(14, 14),
    title="Density plot of Numerical features",
)
numerical_data.plot(**density_options)
plt.show()