EDA Definition
Razones para hacer un EDA
Pasos de EDA
Tipos de Analitica de datos
Project
Environment Setup
Library installation
!pip install --upgrade pip
!pip install palmerpenguins==0.1.4 numpy==1.23.4 pandas==1.5.1 seaborn==0.12.1 matplotlib==3.6.0 empiricaldist==0.6.7 statsmodels==0.13.5 scikit-learn==1.1.2 pyjanitor==0.23.1 session-info
Library import
import empiricaldist
import janitor
import matplotlib.pyplot as plt
import numpy as np
import palmerpenguins
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as ss
import session_info
Stablish Chart Settings
%matplotlib inline
sns.set_style(style='whitegrid')
sns.set_context(context='notebook')
plt.rcParams['figure.figsize'] = (11, 9.4)
penguin_color = {
'Adelie': '#ff6602ff',
'Gentoo': '#0f7175ff',
'Chinstrap': '#c65dc9ff'
}
Importing Dataset
Raw Data
# Load the raw dataset via palmerpenguins.load_penguins_raw() and display it.
raw_df = palmerpenguins.load_penguins_raw()
raw_df
processed data
# Load the dataset via palmerpenguins.load_penguins(); `df` is the frame used
# by the rest of the analysis.
df = palmerpenguins.load_penguins()
df
Using the seaborn dataset
# Alternative source: seaborn's "penguins" dataset. The result is only
# displayed here, not assigned to a variable.
sns.load_dataset("penguins")
Performing EDA
What are the variables types for this dataset?
# Data type of every column.
df.dtypes
How many variables of each type is present in the dataset?
# Number of columns per data type.
df.dtypes.value_counts()
How many variables and records are in the dataset?
# (number of records, number of variables)
df.shape
Are there null values in the dataset?
# Per column: True if it holds at least one missing value.
df.isna().any()
How many null values are there in each column?
# Count of missing values in each column.
df.isna().sum()
What is the total amount of null values in the dataset?
# Sum the per-column missing counts into one grand total.
df.isna().sum().sum()
What is the null values proportion for each variable?
# Percentage of missing values per column.
df.isna().sum() / len(df) * 100

# Long format of the missing-value mask: one row per (variable, cell).
melted = df.isna().melt()
melted

# Filled bars: share of missing vs. present values for every variable.
sns.displot(data=melted, y="variable", hue="value", multiple="fill", aspect=2)
What are the records that contain the null values in the dataset? (Visualize)
# Transposed missing-value mask: variables become rows, records become columns.
nulls = df.isna().T
nulls

plt.figure(figsize=(7, 7))
sns.heatmap(data=nulls)
# Equivalent one-liner:
# df.isnull().transpose().pipe(lambda x: sns.heatmap(data=x))
How many records will be lost if we delete all the missing values?
# Preview of the dataset with every row containing a missing value removed.
df.dropna()
# Number of records lost by dropping them: original row count minus remaining
# row count. (The original `df.shape` only showed the unchanged size of df.)
df.shape[0] - df.dropna().shape[0]
# Keep the complete-case version for later use.
df_clean = df.dropna()
df_clean
Counts and Proportions
What measures describe our dataset?
# Summary statistics for every column, whatever its type.
df.describe(include="all")
# Numeric columns only.
df.describe(include=["number"])
# Object (string) columns only.
df.describe(include=["object"])
BONUS: Transform an object into a category
# astype accepts a {column name: dtype} mapping; build it for the three text
# columns. The converted frame is only displayed, df itself is not changed.
df.astype({column: "category" for column in ("species", "island", "sex")})
How to visualize counts?
# Bar chart of records per species using the pandas plotting API.
plt.figure(figsize=(6, 6))
df["species"].value_counts().plot.bar();
# The same counts drawn by seaborn with the species palette.
sns.catplot(data=df, x="species", kind="count", palette=penguin_color);
How to visualize proportions
# Add a constant dummy column "x" so all records fall into one bar, then fill
# that bar by species to show each species' share.
sns.displot(
    data=df.add_column("x", ""),
    x="x",
    hue="species",
    multiple="fill",
    palette=penguin_color,
);
Measures of central tendency
Mean
# Restrict to numeric columns explicitly: df also holds text columns, and
# relying on pandas to silently drop them is deprecated (an error in pandas 2).
df.mean(numeric_only=True)
# Mean of a single variable.
df.bill_depth_mm.mean()
Median
# numeric_only=True keeps the text columns out of the reduction explicitly
# instead of relying on the deprecated implicit column dropping.
df.median(numeric_only=True)
Mode
# Most frequent value(s) of every column.
df.mode()
Measures of dispersion
Maximum value for each variable?
# numeric_only=True restricts the reduction to the numeric columns.
df.max(numeric_only=True)
Minimum value for each variable?
# Smallest value of each numeric column.
df.min(numeric_only=True)
Range of each variable
# Range = maximum minus minimum, per numeric column.
df.max(numeric_only=True).sub(df.min(numeric_only=True))
Standart deviation
# numeric_only=True keeps the text columns out of the reduction explicitly
# instead of relying on the deprecated implicit column dropping.
df.std(numeric_only=True)
Interquartile range?
# First and third quartiles of the numeric columns. numeric_only is passed
# explicitly because its implicit default is deprecated in pandas 1.5.
df.quantile(0.25, numeric_only=True)
df.quantile(0.75, numeric_only=True)
# One row per variable with its quartiles and the interquartile range (Q3 - Q1).
(
    df
    .quantile(q=[0.75, 0.50, 0.25], numeric_only=True)
    .transpose()
    .rename_axis("variable")
    .reset_index()
    .assign(
        # Parameter renamed so it no longer shadows the global `df`.
        iqr=lambda quartiles: quartiles[0.75] - quartiles[0.25]
    )
)
Probabilities
Probability Mass function
# Probability mass function built from the observed flipper lengths.
empiricaldist.Pmf.from_seq(df["flipper_length_mm"])
# Evaluate the PMF at a flipper length of 190.
empiricaldist.Pmf.from_seq(df["flipper_length_mm"])(190)
Empirical Cumulative Distribution Function
# Empirical CDF of flipper length.
plt.figure(figsize=(6, 6))
sns.ecdfplot(data=df, x="flipper_length_mm");
# Evaluate the empirical CDF at a flipper length of 200.
empiricaldist.Cdf.from_seq(df["flipper_length_mm"])(200)
Correlations
Pairplot
# Pairwise scatter plots of every variable except "year".
sns.pairplot(data=df.drop(columns=["year"]));
Pairplot per specie
# Same pairplot, coloured by species with the shared palette.
sns.pairplot(
    data=df.drop(columns=["year"]),
    hue="species",
    palette=penguin_color,
);
Swarmplot
# Body mass per island, one point per penguin, coloured by species.
plt.figure(figsize=(6, 6))
sns.swarmplot(data=df, x="island", y="body_mass_g", hue="species", palette=penguin_color);
Is there a linear correlation among any of our variables?
# Correlation matrix of the numeric variables, with "year" removed first.
# numeric_only=True is explicit because the frame also has text columns and the
# implicit default of DataFrame.corr is deprecated (an error in pandas 2).
df.drop(["year"], axis=1).corr(numeric_only=True)

# Annotated heatmap of the same matrix.
plt.figure(figsize=(6, 6))
sns.heatmap(
    data=df.drop(["year"], axis=1).corr(numeric_only=True),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),  # diverging colour palette
    center=0,        # centre the colour scale on zero correlation
    vmin=-1,         # lower bound of the colour scale
    vmax=1,          # upper bound of the colour scale
    linewidths=0.5,
    annot=True
)

# Clustered version of the heatmap.
sns.clustermap(
    data=df.drop(["year"], axis=1).corr(numeric_only=True),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),  # diverging colour palette
    center=0,        # centre the colour scale on zero correlation
    vmin=-1,         # lower bound of the colour scale
    vmax=1,          # upper bound of the colour scale
    linewidths=0.5,
    annot=True,
    figsize=(6, 6)
);
How can i represent a categorical variable as a discrete numerical?
# Encode sex as a number (female -> 0, male -> 1) so it can take part in the
# correlation matrix. The lambda parameter is named `frame` so it does not
# shadow the global `df`.
df = df.assign(
    numeric_sex=lambda frame: frame.sex.replace(["female", "male"], [0, 1])
)
# Rebuild the complete-case frame so it also carries the new column.
df_clean = df.dropna()

# Clustered correlation heatmap, now including numeric_sex.
# numeric_only=True is explicit because the frame still has text columns and
# the implicit default of DataFrame.corr is deprecated (an error in pandas 2).
sns.clustermap(
    data=df.drop(["year"], axis=1).corr(numeric_only=True),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),  # diverging colour palette
    center=0,        # centre the colour scale on zero correlation
    vmin=-1,         # lower bound of the colour scale
    vmax=1,          # upper bound of the colour scale
    linewidths=0.5,
    annot=True,
    figsize=(6, 6)
);
Multiple regression
Model 1
# Fit an OLS regression of body mass on bill length using the complete cases.
model_1 = smf.ols(formula="body_mass_g ~ bill_length_mm", data=df_clean).fit()
# Show the fit summary.
model_1.summary()
Model 2
# Fit an OLS regression of body mass on bill length and bill depth.
model_2 = smf.ols(
    formula="body_mass_g ~ bill_length_mm + bill_depth_mm", data=df_clean
).fit()
# Show the fit summary.
model_2.summary()
Model 3
# Fit an OLS regression of body mass on bill length, bill depth and flipper length.
model_3 = smf.ols(
    formula="body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm",
    data=df_clean,
).fit()
# Show the fit summary.
model_3.summary()
Model 4
# Same predictors as model 3 plus sex, entered as a categorical term via C(sex).
model_4 = smf.ols(
    formula="body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + C(sex)",
    data=df_clean,
).fit()
# Show the fit summary.
model_4.summary()
Model 5
# Reduced model: flipper length plus categorical sex only.
model_5 = smf.ols(
    formula="body_mass_g ~ flipper_length_mm + C(sex)", data=df_clean
).fit()
# Show the fit summary.
model_5.summary()
Multiple Regression Model Comparisons
Creation of a results table
# Collect the observed body mass, the output of predict() (called without
# arguments) for each of the five models, and the species/sex labels.
model_results = pd.DataFrame(
    {
        "actual_value": df_clean.body_mass_g,
        "prediction_model_1": model_1.predict(),
        "prediction_model_2": model_2.predict(),
        "prediction_model_3": model_3.predict(),
        "prediction_model_4": model_4.predict(),
        "prediction_model_5": model_5.predict(),
        "species": df_clean.species,
        "sex": df_clean.sex,
    }
)
model_results
Comparing predicted values vs Actual values (ECDFs)
# ECDFs drawn from the results table passed in wide form.
plt.figure(figsize=(6, 6))
sns.ecdfplot(data=model_results);
Comparing predicted values distributions vs Original values distributions(Kdeplot)
# Kernel density estimates drawn from the results table passed in wide form.
plt.figure(figsize=(6, 6))
sns.kdeplot(data=model_results);
Bonus: Pandas Profiling Report
# Self-contained bonus cell: re-imports what it needs, reloads the dataset and
# builds a pandas-profiling report from it.
# NOTE(review): pandas_profiling is not among the packages pinned in the pip
# install line earlier in this file — confirm it is installed before running.
# NOTE: this rebinds `df` to a fresh load, so columns added earlier in the
# notebook (e.g. numeric_sex) are no longer present afterwards.
import palmerpenguins
import pandas as pd
import pandas_profiling as pp
df = palmerpenguins.load_penguins()
pp.ProfileReport(df)