!pip install --upgrade pip
!pip install numpy pandas matplotlib seaborn empiricaldist statsmodels sklearn pyjanitor
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import janitor
%matplotlib inline
# Global plot appearance: 'whitegrid' style, 'notebook' context, and the
# default figure size used by figures that do not pass their own figsize.
sns.set_style('whitegrid')
sns.set_context('notebook')
plt.rcParams.update({'figure.figsize': (11, 9.4)})
# Seaborn: one colour per category level, keyed by the level's label so the
# same dict can be passed as `palette` for species, island or sex.
penguin_color = {
    # species
    'Adelie': '#ff6602ff',
    'Gentoo': '#0f7175ff',
    'Chinstrap': '#c65dc9ff',
    # island
    'Torgersen': '#955FC8',
    'Biscoe': '#94e2c3',
    'Dream': '#345469',
    # sex
    'Female': 'pink',
    'Male': 'skyblue',
}

# Matplotlib: the same colours as plain lists (species, island, sex).
pcolors = [penguin_color[label] for label in ('Adelie', 'Gentoo', 'Chinstrap')]
icolors = [penguin_color[label] for label in ('Torgersen', 'Biscoe', 'Dream')]
scolors = [penguin_color[label] for label in ('Female', 'Male')]
plt_colors = [pcolors, icolors, scolors]
# Load seaborn's 'penguins' example dataset and take a first look at it.
df = sns.load_dataset(name='penguins')
df
df.info()

# Column / label groups referenced by later cells.
penguin_columns = ['Adelie', 'Chinstrap', 'Gentoo']
category_columns = ['species', 'island', 'sex']
numeric_columns = df.select_dtypes(include=[np.number]).columns
# Per-column counts of present and missing values, and their ratio.
df.notnull().sum()
df.isnull().sum()
df.isnull().sum() / df.notnull().sum()

# Row mask: True where a row has at least one missing value.
# `axis` is passed by keyword because pandas 2.x made it keyword-only; the
# previous `any(True)` only selected axis 1 because True == 1.
missing_data = df.isnull().any(axis=1)
missing_data
df[missing_data]
# Complete-case frame: drop every row that contains a missing value.
df2 = df.dropna()

# Check which columns (if any) still contain NaN after the drop.
remaining_nan = df2.isna().any()
print(f'''
{remaining_nan}
''')

# Store the three label columns as pandas categoricals in both frames.
category_dtypes = dict.fromkeys(['species', 'island', 'sex'], 'category')
df = df.astype(category_dtypes)
df2 = df2.astype(category_dtypes)
df2.info()
# Summary table of the numeric columns. Row order: the first describe() row,
# the mode row(s), the median, then the remaining describe() rows.
described = df2.describe(include=np.number)
first_row = described.iloc[0:1]
mode_rows = df2.mode(numeric_only=True).rename(index={0: 'mode'})
median_row = df2.median(numeric_only=True).to_frame(name='median').T
remaining_rows = described.iloc[1:8]

numerical_statistics = pd.concat([first_row, mode_rows, median_row, remaining_rows])
numerical_statistics
# One histogram (with KDE curve) per column of numerical_statistics, each
# annotated with dashed vertical lines at Q1, Q3 and the mean.
stat_columns = list(numerical_statistics.columns)
# squeeze=False keeps a 2-D axes array for any column count; take its only row.
fig, ax = plt.subplots(1, len(stat_columns), figsize=(20, 5), squeeze=False)
ax = ax[0]

# (row label in numerical_statistics, line colour, legend label)
reference_lines = [
    ('25%', '#f26a02', 'Q1'),
    ('75%', '#bd00b0', 'Q3'),
    ('mean', '#f75c6b', 'mean'),
]

for i, col in enumerate(stat_columns):
    # No `palette` here: without a `hue` variable seaborn ignores it, and the
    # bar colour is already fixed by `color`.
    sns.histplot(
        ax=ax[i],
        data=df2,
        x=col,
        bins=50,
        alpha=.55,
        color='#0f7175ff',
        kde=True,
    )
    # The first line on the axes is the KDE curve drawn by histplot.
    ax[i].lines[0].set_color('#4c36f5')
    for stat, line_color, label in reference_lines:
        ax[i].axvline(
            # Label-based lookup; positional `[0]` on a label-indexed Series
            # is deprecated in recent pandas.
            x=numerical_statistics.loc[stat, col],
            color=line_color,
            linestyle='dashed',
            linewidth=2.5,
            label=label,
        )
    ax[i].legend()
# Summary of the categorical columns.
df2.describe(include='category')

# Horizontal count bars: one panel per categorical column, coloured by level.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
for panel, category in zip(ax, category_columns):
    sns.histplot(
        data=df2,
        y=category,
        hue=category,
        palette=penguin_color,
        alpha=0.6,
        ax=panel,
    )
# One stacked bar per categorical column showing each level's share of rows.
# histplot needs a y variable to stack on, so every row gets the same value in
# a helper column. The previous code named that column `category[i]` (a single
# character of the column name), which only worked by accident and would raise
# IndexError for a column name shorter than its position in the list.
fig, ax = plt.subplots(1, len(category_columns), figsize=(20, 5), squeeze=False)
ax = ax[0]
bar_column = '_all_rows'
bar_data = df2.assign(**{bar_column: ','})

for i, category in enumerate(category_columns):
    sns.histplot(
        ax=ax[i],
        data=bar_data,
        y=bar_column,
        palette=penguin_color,
        multiple='fill',  # normalise the stacked segments to fill the bar
        stat='count',
        hue=category,
        alpha=0.6,
    )
    ax[i].set(ylabel=category, xlabel='proportion')
# Boolean row masks over df2, one per level of each categorical column.
male = df2['sex'].eq('Male')
female = ~male  # defined as "not male", not as sex == 'Female'

adelie, chinstrap, gentoo = (
    df2['species'].eq(label) for label in ('Adelie', 'Chinstrap', 'Gentoo')
)
torgersen, dream, biscoe = (
    df2['island'].eq(label) for label in ('Torgersen', 'Dream', 'Biscoe')
)

# The masks grouped per variable.
species = [adelie, chinstrap, gentoo]
islands = [torgersen, dream, biscoe]
sex = [male, female]
# Min / mean / max of each numeric column per (species, island) pair.
# Only the numeric columns are aggregated: df2 also holds the categorical
# 'sex' column, for which 'mean' is undefined (pandas >= 2.0 raises instead of
# silently dropping such columns). observed=True keeps only the combinations
# that occur in the data; dropna() is kept as a guard for all-NaN groups.
(
    df2
    .groupby(['species', 'island'], observed=True)[list(numeric_columns)]
    .agg(['min', 'mean', 'max'])
    .dropna()
)
# Violin grid: one row per hue variable in category_columns, one column per
# numeric measurement; the x axis is always the species.
n_rows, n_cols = 3, len(numeric_columns)
fig, ax = plt.subplots(n_rows, n_cols, figsize=(20, 15))

for row, hue_col in enumerate(category_columns):
    # Split violins only on the third row (index 2, 'sex' in category_columns).
    split_violins = row == 2
    for col, measure in enumerate(numeric_columns):
        panel = ax[row, col]
        sns.violinplot(
            data=df2,
            x='species',
            y=measure,
            hue=hue_col,
            split=split_violins,
            palette=penguin_color,
            ax=panel,
        )
        # Column headers go on the top row only; axis labels are dropped.
        if row == 0:
            panel.set_title(measure)
        panel.set_xlabel(None)
        panel.set_ylabel(None)
# Histogram grid over the whole clean dataset: one row per hue variable in
# category_columns, one column per numeric measurement, each with a KDE curve.
fig, ax = plt.subplots(3, len(numeric_columns), figsize=(20, 15))

for axes_row, hue_col in zip(ax, category_columns):
    for panel, measure in zip(axes_row, numeric_columns):
        sns.histplot(
            data=df2,
            x=measure,
            hue=hue_col,
            bins=40,
            kde=True,
            palette=penguin_color,
            ax=panel,
        )
def _plot_species_histograms(species_mask, title):
    """Draw the histogram grid for the rows of df2 selected by ``species_mask``.

    Rows of the grid are the numeric columns, columns are the hue variables in
    ``category_columns``. Only the first column carries a (horizontal) y label
    naming the measurement; x labels are removed.

    Parameters
    ----------
    species_mask : boolean Series aligned with df2
        Rows to include in the figure.
    title : str
        Figure-level title.

    Returns
    -------
    (fig, ax) : the created figure and its 2-D array of axes.
    """
    fig, ax = plt.subplots(
        len(numeric_columns),
        len(category_columns),  # the inner loop indexes by category column
        figsize=(15, 13),
        squeeze=False,
    )
    subset = df2[species_mask]
    for i, i_col in enumerate(numeric_columns):
        for j, j_col in enumerate(category_columns):
            sns.histplot(
                ax=ax[i][j],
                data=subset,
                x=i_col,
                hue=j_col,
                multiple='layer',
                bins=30,
                kde=True,
                palette=penguin_color,
            )
            if j == 0:
                ax[i][j].set_ylabel(i_col, labelpad=60, rotation=0)
            else:
                ax[i][j].set_ylabel(None)
            ax[i][j].set_xlabel(None)
    fig.suptitle(title)
    # Leave room at the top for the figure title.
    fig.subplots_adjust(top=0.95)
    return fig, ax


# One figure per species; this replaces three copy-pasted cells that differed
# only in the mask and the title.
for species_name, species_mask in (
    ('Adelie', adelie),
    ('Chinstrap', chinstrap),
    ('Gentoo', gentoo),
):
    fig, ax = _plot_species_histograms(species_mask, f'{species_name} species')
# One-hot encode the categorical columns and append them to the clean data.
# The columns are selected by name instead of slicing positions 4:12 of the
# encoded frame, so the result does not depend on column order.
# dtype=int gives 0/1 integers; pandas >= 2.0 returns bool dummies by default.
# NOTE(review): the formula-based model fitted on these columns presumably
# needs numeric rather than boolean columns - confirm.
dummies = pd.get_dummies(df2[category_columns], dtype=int)
dummie_df = pd.concat([df2, dummies], axis=1)
dummie_df
# Pairwise correlations of the numeric measurements and the dummy indicators.
# numeric_only=True leaves out the categorical species/island/sex columns,
# which pandas >= 2.0 no longer drops automatically in corr().
sns.heatmap(
    data=dummie_df.corr(numeric_only=True),
    annot=True,
    cmap=sns.diverging_palette(20, 238, as_cmap=True),
    fmt='.2f',
)
df[missing_data]
from statsmodels.formula.api import logit
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# Share of each species among the rows of dummie_df, largest first.
species_share = dummie_df['species'].value_counts() / len(dummie_df)
species_share = species_share.sort_values(ascending=False)
print(f'''
Proportion Penguins
{species_share}
''')
# Stratified Sampling Function
def data_strat(dummie_df, strat_columns_name, strat_values, prop_strat, random_state=None):
    """Resample ``dummie_df`` so each stratum gets a chosen share of the rows.

    For every value in ``strat_values`` the rows whose ``strat_columns_name``
    equals that value are sampled with replacement. Every stratum except the
    last receives ``int(len(dummie_df) * prop_strat[i])`` rows; the last one
    receives whatever is still needed for the result to have as many rows as
    the input, so its entry in ``prop_strat`` is not read.

    Parameters
    ----------
    dummie_df : pandas.DataFrame
        Source data.
    strat_columns_name : str
        Column that defines the strata.
    strat_values : sequence
        Stratum values to sample, in output order.
    prop_strat : sequence of float
        Desired proportion per stratum, aligned with ``strat_values``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` for every stratum.

    Returns
    -------
    pandas.DataFrame
        The sampled rows of all strata, concatenated in ``strat_values`` order
        with their original index labels. With empty ``strat_values`` an empty
        frame with the input's columns is returned.

    Errors raised by ``DataFrame.sample`` (for example when a stratum has no
    rows to draw from) propagate unchanged.
    """
    total_rows = len(dummie_df)
    last_pos = len(strat_values) - 1
    samples = []
    rows_drawn = 0
    for pos, value in enumerate(strat_values):
        if pos == last_pos:
            # The last stratum absorbs the rounding remainder.
            n_rows = total_rows - rows_drawn
        else:
            n_rows = int(total_rows * prop_strat[pos])
        stratum = dummie_df[dummie_df[strat_columns_name] == value]
        samples.append(stratum.sample(replace=True, n=n_rows, random_state=random_state))
        rows_drawn += n_rows

    if not samples:
        return dummie_df.iloc[0:0].copy()
    # A single concat of real samples keeps the source dtypes; growing an empty
    # frame inside the loop turned every column into object dtype.
    return pd.concat(samples)
# Resample dummie_df so each of the three species gets roughly a third of the rows.
strat_values = ['Adelie', 'Chinstrap', 'Gentoo']
strat_prop = [0.333] * 3
df_strat = data_strat(
    dummie_df,
    'species',
    strat_values,
    strat_prop,
    random_state=42,
)

# Species shares in the resampled data, largest first.
sample_share = df_strat['species'].value_counts() / len(df_strat)
sample_share = sample_share.sort_values(ascending=False)
print(f'''
Proportion sample penguins
{sample_share}
''')
# 50/50 train/test split of the one-hot encoded data. Each part is copied so
# the column assignments below act on independent frames and do not trigger
# pandas' SettingWithCopyWarning.
train, test = (
    part.copy()
    for part in train_test_split(dummie_df, train_size=0.5, random_state=1)
)

# Dummy indicator columns, picked by prefix instead of by position (7:15).
dummy_columns = [
    col for col in dummie_df.columns
    if col.startswith(('species_', 'island_', 'sex_'))
]
# Force the indicators to numeric 0.0/1.0. pd.to_numeric alone leaves boolean
# dummies (the pandas >= 2.0 default of get_dummies) unchanged.
for part in (train, test):
    part[dummy_columns] = part[dummy_columns].apply(pd.to_numeric).astype(float)

# Logistic regression of sex (1 = Male) on the measurements and the
# species/island indicators.
# NOTE(review): every dummy level is included next to the intercept, so the
# design looks collinear - consider dropping one level per variable.
logit_model = logit(
    formula=(
        'sex_Male ~ flipper_length_mm + body_mass_g + bill_length_mm + bill_depth_mm'
        ' + species_Adelie + species_Chinstrap + species_Gentoo'
        ' + island_Biscoe + island_Dream + island_Torgersen'
    ),
    data=train,
).fit()

# Round the predicted probabilities to 0/1 labels and score them on the test set.
test_predict = logit_model.predict(test).round()
accuracy_score(
    test.sex_Male,
    test_predict,
)
# Annotated heatmap of the confusion matrix of the test-set predictions.
test_confusion = confusion_matrix(test['sex_Male'], test_predict)

plt.figure(figsize=(5, 5))
plt.title('Confusion Matrix')
sns.heatmap(test_confusion, annot=True)

# Rows of the original data that still have missing values.
df[missing_data]
# Refit the model on every complete row, then use it to fill in `sex` for the
# rows of df that have missing values.
dummy_columns = [
    col for col in dummie_df.columns
    if col.startswith(('species_', 'island_', 'sex_'))
]
# Numeric 0.0/1.0 indicators regardless of the dtype get_dummies produced.
model_data = dummie_df.astype({col: float for col in dummy_columns})
logit_model = logit(
    formula=(
        'sex_Male ~ flipper_length_mm + body_mass_g + bill_length_mm + bill_depth_mm'
        ' + species_Adelie + species_Chinstrap + species_Gentoo'
        ' + island_Biscoe + island_Dream + island_Torgersen'
    ),
    data=model_data,
).fit()

# Encode the incomplete rows the same way as the training data (dtype=float
# avoids the boolean default of pandas >= 2.0).
rows_to_fill = pd.get_dummies(df[missing_data], dtype=float)
predicted = logit_model.predict(rows_to_fill).round()

# Map 0/1 to the sex labels; anything else (e.g. NaN predictions) stays NaN.
predicted_labels = predicted.map({0: 'Female', 1: 'Male'})
# `prediction_sex` keeps its previous shape: a one-column categorical frame.
prediction_sex = predicted_labels.astype('category').to_frame(name='sex')

# Assign plain labels rather than a categorical, so the assignment does not
# depend on both categoricals having identical categories.
df.loc[predicted_labels.index, 'sex'] = predicted_labels
df[missing_data]