!pip install --upgrade pip
!pip install palmerpenguins numpy pandas seaborn matplotlib empiricaldist statsmodels sklearn pyjanitor
import empiricaldist
import janitor
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import palmerpenguins
import scipy.stats
import seaborn as sns
import sklearn.metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as ss
%matplotlib inline
# Global plotting defaults shared by every figure below.
sns.set_style(style='whitegrid')
sns.set_context(context='notebook')
plt.rcParams['figure.figsize'] = (11, 9.4)

# One fixed colour per species and per island so hues stay consistent
# from plot to plot.
penguin_color = dict(
    Adelie='#ff6602ff',
    Gentoo='#0f7175ff',
    Chinstrap='#c65dc9ff',
    Torgersen='#955FC8',
    Biscoe='#94e2c3',
    Dream='#345469',
)
# Option 1: load both tables through the palmerpenguins package.
raw_penguins_df = palmerpenguins.load_penguins_raw()
raw_penguins_df.head()
preprocess_penguins_df = palmerpenguins.load_penguins()
preprocess_penguins_df.head()

# Option 2: read the same CSV files straight from the upstream repository.
# These assignments replace the frames loaded above.
raw_penguins_df = pd.read_csv(
    'https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/inst/extdata/penguins_raw.csv'
)
preprocess_penguins_df = pd.read_csv(
    'https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/inst/extdata/penguins.csv'
)
raw_penguins_df.head(3)
preprocess_penguins_df.head(3)
# Option 3 (not used): sns.load_dataset('penguins')

# Column types, and how many columns there are of each type.
preprocess_penguins_df.dtypes
preprocess_penguins_df.dtypes.value_counts()

# Dataset dimensions as (rows, columns).
preprocess_penguins_df.shape

# Missing values: which columns have any, how many per column, grand total.
preprocess_penguins_df.isnull().any()
preprocess_penguins_df.isnull().sum()
preprocess_penguins_df.isnull().sum().sum()
# Share of missing vs. present values for every variable, as filled bars.
null_long_df = preprocess_penguins_df.isnull().melt()
sns.displot(
    data=null_long_df,
    y='variable',
    hue='value',
    multiple='fill',
    aspect=3,
)

# Missingness map: one row per variable, one column per observation.
null_wide_df = preprocess_penguins_df.isnull().transpose()
sns.heatmap(data=null_wide_df)
# Shape before and after discarding rows that contain missing values.
(preprocess_penguins_df.shape, preprocess_penguins_df.dropna().shape)

# Working dataset: complete observations only.
penguins_df = preprocess_penguins_df.dropna()
penguins_df

# Summary statistics: every column, numeric columns only, object columns only.
penguins_df.describe(include='all')
penguins_df.describe(include=np.number)
penguins_df.describe(include=object)

# Preview the dtypes obtained by casting the text columns to 'category'.
# The cast is NOT stored: assign the result back to keep it.
penguins_df.astype(dict.fromkeys(('species', 'island', 'sex'), 'category')).dtypes
penguins_df.dtypes
# Species frequencies drawn with the pandas plotting API.
penguins_df.species.value_counts().plot(kind='bar')

# The same counts with seaborn, using the shared palette.
sns.catplot(data=penguins_df, x='species', kind='count', palette=penguin_color)

# Counts computed explicitly first, then drawn as semi-transparent bars.
species_counts_df = (
    penguins_df.value_counts('species', sort=True).reset_index(name='count')
)
sns.barplot(
    data=species_counts_df,
    x='species',
    y='count',
    palette=penguin_color,
    alpha=0.5,
)

# Single stacked bar with each species' percentage; the constant helper
# column 'x' places every row in the same bar.
single_bar_df = penguins_df.add_column('x', '')
sns.displot(
    data=single_bar_df,
    x='x',
    hue='species',
    multiple='fill',
    palette=penguin_color,
    stat='percent',
    alpha=0.5,
)
# Measures of central tendency.
penguins_df.bill_depth_mm.mean()
penguins_df.median(numeric_only=True)
penguins_df.mode()
penguins_df.describe(include=object)

# Measures of dispersion: range of every numeric column.
# Named `max_values` / `min_values` so the builtins `max` and `min` are not
# shadowed for the rest of the session.
max_values = penguins_df.max(numeric_only=True)
min_values = penguins_df.min(numeric_only=True)
'RANGE', np.subtract(max_values, min_values)
penguins_df.max(numeric_only=True)
penguins_df.min(numeric_only=True)
np.subtract(penguins_df.max(numeric_only=True), penguins_df.min(numeric_only=True))

# Standard deviation, and the interval one standard deviation around the mean.
penguins_df.std(numeric_only=True)
print(f'''
mean + std
{penguins_df.mean(numeric_only=True) + penguins_df.std(numeric_only=True)}
mean - std
{penguins_df.mean(numeric_only=True) - penguins_df.std(numeric_only=True)}
''')
# Quartiles and interquartile range of every numeric variable.
# `numeric_only=True` is required because the frame also holds text columns
# (species, island, sex); without it recent pandas versions raise instead of
# silently skipping them.
(
    penguins_df
    .quantile([.25, .50, .75], numeric_only=True)
    .T
    .rename_axis('variable')
    .assign(
        iqr=lambda df: df[0.75] - df[0.25]
    )
)
# Flipper-length histogram for all penguins, with the overall mean marked.
plt.figure(figsize=(8, 4))
sns.histplot(data=penguins_df, x='flipper_length_mm')
plt.axvline(
    x=penguins_df.flipper_length_mm.mean(),
    color='red',
    linestyle='dashed',
    linewidth=2,
)

# Per-species subsets, reused by later cells.
species = penguins_df.species.unique()
adelie_df = penguins_df[penguins_df.species == 'Adelie']
gentoo_df = penguins_df[penguins_df.species == 'Gentoo']
chinstrap_df = penguins_df[penguins_df.species == 'Chinstrap']
specie = [adelie_df, gentoo_df, chinstrap_df]

# Histogram split by species, with one mean line per species.
plt.figure(figsize=(8, 4))
sns.histplot(
    data=penguins_df,
    x='flipper_length_mm',
    hue='species',
    bins=20,
)
for species_df in specie:
    plt.axvline(
        x=species_df.flipper_length_mm.mean(),
        color='red',
        linestyle='dashed',
        linewidth=2,
    )

# Box plot of flipper length for each species.
plt.figure(figsize=(8, 4))
sns.boxplot(data=penguins_df, x='flipper_length_mm', y='species')
def freedman_diaconis_bindwith(x: pd.Series) -> float:
    """Return the histogram bin width given by the Freedman-Diaconis rule.

    The rule is ``2 * IQR / n ** (1/3)``, where ``IQR`` is the interquartile
    range (75th minus 25th percentile) and ``n`` is the number of
    observations.

    Parameters
    ----------
    x : pd.Series
        Numeric sample. Missing values are ignored, both for the quantiles
        and for the observation count.

    Returns
    -------
    float
        Suggested bin width; ``nan`` when ``x`` has no valid observations.
    """
    iqr = x.quantile(0.75) - x.quantile(0.25)
    # count() skips NaN, which matches what quantile() does.
    n = x.count()
    if n == 0:
        return float('nan')
    return float(2 * iqr / n ** (1 / 3))
# Bin width suggested by the Freedman-Diaconis rule for bill length.
freedman_diaconis_bindwith(penguins_df.bill_length_mm)

# Layered flipper-length histogram per species (seaborn's default binning),
# with each species' mean marked.
sns.histplot(
    data=penguins_df,
    x='flipper_length_mm',
    hue='species',
    multiple='layer',
)
for species_df in specie:
    plt.axvline(
        x=species_df.flipper_length_mm.mean(),
        color='red',
        linestyle='dashed',
        linewidth=2,
    )

# Four panels: all penguins first, then one panel per species, each showing
# the proportion of penguins at every 1 mm flipper-length bin.
fig, ax = plt.subplots(ncols=4, nrows=1, figsize=(20, 5))
for panel, panel_df in zip(ax, [penguins_df, *specie]):
    sns.histplot(
        ax=panel,
        data=panel_df,
        x='flipper_length_mm',
        binwidth=1,
        stat='probability',
        hue='species',
        palette=penguin_color,
    )
# Five most probable flipper lengths.
# normalize=True yields probabilities; False would yield raw frequencies.
top_pmf = empiricaldist.Pmf.from_seq(
    penguins_df.flipper_length_mm,
    normalize=True,
)
top_pmf.sort_values(ascending=False).head()

# Keep the PMF object so it can be plotted and queried.
pmf_flipper_length_mm = empiricaldist.Pmf.from_seq(
    penguins_df.flipper_length_mm,
    normalize=True,
)
# Bar chart of the PMF, the empiricaldist counterpart of the seaborn plot.
pmf_flipper_length_mm.bar()

# Probability of observing a penguin with exactly the given flipper length.
print(f'''
180 = {(pmf_flipper_length_mm(180)*100).round(4)} %
190 = {(pmf_flipper_length_mm(190)*100).round(4)} %
200 = {(pmf_flipper_length_mm(200)*100).round(4)} %
210 = {(pmf_flipper_length_mm(210)*100).round(4)} %
230 = {(pmf_flipper_length_mm(230)*100).round(4)} %
''')

# Empirical CDFs: all penguins (no hue) followed by one panel per species.
fig, ax = plt.subplots(ncols=4, nrows=1, figsize=(20, 5))
panel_frames = [penguins_df, *specie]
panel_hues = [None, 'species', 'species', 'species']
for panel, panel_df, panel_hue in zip(ax, panel_frames, panel_hues):
    sns.ecdfplot(
        ax=panel,
        data=panel_df,
        x='flipper_length_mm',
        hue=panel_hue,
        palette=penguin_color,
    )
# Build the empirical CDF of flipper length and keep it for later queries.
cdf_flipper_length_mm = empiricaldist.Cdf.from_seq(
    penguins_df.flipper_length_mm,
    normalize=True,
)
cdf_flipper_length_mm.plot()

# Forward lookup: cumulative probability at a given flipper length.
q = 200  # quantity
p = cdf_flipper_length_mm.forward(q)  # probability
print('la probabilidad de tener una ala de:' ,q, 'mm es de:',p*100,'%')

# Same lookup drawn on the CDF: dashed guides from both axes to the point.
cdf_flipper_length_mm.plot()
q = 200
p = cdf_flipper_length_mm.forward(q)
dashed_guide = dict(linestyles='dashed', color='black', linewidth=2)
plt.vlines(x=q, ymin=0, ymax=p, **dashed_guide)
plt.hlines(
    y=p,
    xmin=pmf_flipper_length_mm.qs[0],  # smallest observed quantity
    xmax=q,
    **dashed_guide,
)
plt.plot(q, p, 'ro')  # red marker at the intersection

# Inverse lookup: quantities at the 25% and 75% probabilities (the IQR
# limits), drawn on the step version of the CDF.
cdf_flipper_length_mm.step()
p_1 = 0.25
p_2 = 0.75
ps = (0.25, 0.75)
qs = cdf_flipper_length_mm.inverse(ps)
plt.vlines(x=qs, ymin=0, ymax=ps, linestyles='--', color='black')
plt.hlines(
    y=ps,
    xmin=cdf_flipper_length_mm.qs[0],
    xmax=qs,
    linestyles='--',
    color='black',
)
plt.plot(qs, ps, 'ro')  # intersection points
# ECDF of flipper length: the first panel pools all penguins, the remaining
# panels show one species each with the 25% / 75% quantiles marked.
fig, ax = plt.subplots(1, 4, figsize=(20, 8))
p_1 = 0.25  # lower probability of interest
p_2 = 0.75  # upper probability of interest
ps = (p_1, p_2)  # IQR limits
# Iterate over the same list that supplies the data (`specie`), so the number
# of panels and the frames drawn in them cannot drift apart.
for i, panel_df in enumerate([penguins_df, *specie]):
    sns.ecdfplot(
        ax=ax[i],
        data=panel_df,
        x='flipper_length_mm',
        hue='species',
        palette=penguin_color,
    )
    # Build the CDF once per panel and reuse it for the quantiles and the
    # left end of the horizontal guides.
    panel_cdf = empiricaldist.Cdf.from_seq(panel_df.flipper_length_mm, normalize=True)
    qs = panel_cdf.inverse(ps)
    if i == 0:
        # The pooled panel is shown without quantile guides.
        continue
    ax[i].vlines(
        x=qs,
        ymin=0,
        ymax=ps,
        linestyles='--', color='black',
    )
    ax[i].hlines(
        y=ps,
        xmin=panel_cdf.qs[0],
        xmax=qs,
        linestyles='--', color='black',
    )
    ax[i].plot(qs, ps, 'ro')  # intersection points
# Kernel density estimate of flipper length with a narrow bandwidth.
sns.kdeplot(data=penguins_df, x='flipper_length_mm', bw_method=0.1)

# Summary statistics of body mass; they parameterise the normal model below.
stats = penguins_df.body_mass_g.describe()
stats

# Theoretical normal CDF evaluated on an evenly spaced grid between the
# observed minimum and maximum.
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).cdf(xs)
plt.plot(xs, ys, color='black', linestyle='--')

# Theoretical CDF drawn again, this time together with the empirical CDF.
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).cdf(xs)
plt.plot(xs, ys, color='black', linestyle='--')
empiricaldist.Cdf.from_seq(penguins_df.body_mass_g, normalize=True).plot()

# Theoretical normal PDF on the same grid ...
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).pdf(xs)
plt.plot(xs, ys, color='black', linestyle='--')
# ... compared with the density estimated from the observed data.
sns.kdeplot(data=penguins_df, x='body_mass_g')
# A fair six-sided die modelled as a PMF.
dice = empiricaldist.Pmf.from_seq([1, 2, 3, 4, 5, 6])
dice.bar()

# Law of large numbers: roll the die 100, 1000 and 10000 times and compare
# the observed PMF with the theoretical probability 1/6 (dashed red line).
for sample_size in (100, 1000, 10000):
    values = dice.sample(sample_size)
    sample_pmf = empiricaldist.Pmf.from_seq(values)
    plt.figure(figsize=(5, 5))
    sample_pmf.bar()
    plt.axhline(y=1/6, color='r', linestyle='--')
    plt.title(f'sample size: {sample_size}')
# Binomial view of penguin sex: observed proportions as numbers and as bars.
plt.figure(figsize=(5, 5))
(
    penguins_df.sex.value_counts(normalize=True),
    penguins_df.sex.value_counts(normalize=True).plot(kind='bar'),
)

# Encode sex as 1 (male) / 0 (female). An explicit mapping returns an integer
# Series directly, without relying on replace() downcasting the result.
sex_numeric = penguins_df.sex.map({'male': 1, 'female': 0})

number_sample = 1000  # how many samples to draw
sample_size = 35      # observations per sample

# One column per sample (drawn with replacement). The frame is built in a
# single step: inserting 1000 columns one at a time fragments it.
sample_df = pd.DataFrame(
    {
        f'sample {i}': sex_numeric.sample(sample_size, replace=True).to_numpy()
        for i in range(1, number_sample + 1)
    }
)
male_population_mean = sample_df.mean().mean()
print(f'Estimated percentage of male penguins in population is: {male_population_mean*100:.2f}%')

# Distribution of the sample means (central limit theorem).
sample_means_binomial = pd.DataFrame(sample_df.mean(), columns=['sample_mean'])

# Theoretical normal curve with the mean / std of the sample means. The
# statistics come from the column itself so the grid and the curve are 1-D.
stats = sample_means_binomial['sample_mean'].describe()
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).pdf(xs)
plt.plot(xs, ys, color='black', linestyle='--')
# Empirical density of the sample means.
sns.kdeplot(data=sample_means_binomial)
# Proportion of males in the full dataset.
plt.axvline(x=sex_numeric.mean(), color='red', linestyle='--')
# How the estimate of the male proportion converges as more samples are used.
plt.figure(figsize=(5, 5))
# Every sample column has the same length, so the mean of the first i column
# means equals the running (expanding) mean of the column means. Computing it
# that way avoids re-averaging an ever-growing slice of the frame for each i.
running_estimate = sample_df.mean().iloc[:number_sample].expanding().mean()
sample_size_experiment = pd.DataFrame(
    {
        'sample_size': np.arange(1, len(running_estimate) + 1),
        'estimated_mean': running_estimate.to_numpy(),
    }
)
# Estimated mean against the number of samples used.
sns.scatterplot(
    data=sample_size_experiment,
    x='sample_size',
    y='estimated_mean'
)
# Proportion of males in the full dataset.
plt.axhline(
    y=sex_numeric.mean(),
    color='red',
    linestyle='dashed'
)
plt.ylim([sex_numeric.mean() - 0.20, sex_numeric.mean() + 0.20])
# Bill length against bill depth: large, semi-transparent scatter points.
sns.scatterplot(
    data=penguins_df, x='bill_length_mm', y='bill_depth_mm', alpha=1/2, s=100,
);

# Two-dimensional histogram with rug marks along both axes.
sns.displot(
    data=penguins_df, x='bill_length_mm', y='bill_depth_mm', rug=True,
);

# Two-dimensional kernel density estimate, again with rug marks.
sns.displot(
    data=penguins_df, x='bill_length_mm', y='bill_depth_mm', rug=True, kind='kde',
);

# Joint KDE with the marginal distributions on the sides.
sns.jointplot(
    data=penguins_df, x='bill_length_mm', y='bill_depth_mm', kind='kde',
);

# Same joint KDE, split by species.
sns.jointplot(
    data=penguins_df, x='bill_length_mm', y='bill_depth_mm', kind='kde', hue='species',
);
# Flipper length per species as raw points.
sns.scatterplot(data=penguins_df, x='species', y='flipper_length_mm')

# Same points coloured by species.
sns.scatterplot(
    data=penguins_df, x='species', y='flipper_length_mm',
    hue='species', palette=penguin_color,
)

# Strip plot: jittered points per species.
sns.stripplot(
    data=penguins_df, x='species', y='flipper_length_mm',
    hue='species', palette=penguin_color,
);

# Horizontal box plot per species.
sns.boxplot(
    data=penguins_df, x='flipper_length_mm', y='species', palette=penguin_color,
);

# Box plot kept in `ax`, then a dark strip plot drawn on top of it.
ax = sns.boxplot(
    data=penguins_df, x='flipper_length_mm', y='species', palette=penguin_color,
);
ax = sns.stripplot(
    data=penguins_df, x='flipper_length_mm', y='species', color='.3',
);

# Violin plot per species.
ax = sns.violinplot(
    data=penguins_df, x='species', y='flipper_length_mm', palette=penguin_color,
);

# Light-grey violins with the coloured observations over them.
sns.violinplot(data=penguins_df, x='species', y='flipper_length_mm', color='.8')
sns.stripplot(
    data=penguins_df, x='species', y='flipper_length_mm', palette=penguin_color,
)

# Swarm plot: points per species arranged so they do not overlap.
sns.swarmplot(
    data=penguins_df, x='species', y='flipper_length_mm', palette=penguin_color,
)
# Flipper length against body mass.
plt.figure(figsize=(10, 5))
sns.scatterplot(
    data=penguins_df,
    x='flipper_length_mm',
    y='body_mass_g',
    color='.0',
    s=20
);

# Pairwise Pearson correlation of the numeric variables.
# `numeric_only=True` is required: the frame also holds text columns
# (species, island, sex) and recent pandas versions raise on them instead of
# silently dropping them.
penguins_df.corr(numeric_only=True)

# Correlation matrix as an annotated heatmap on a diverging scale centred at 0.
sns.heatmap(
    data=penguins_df.corr(numeric_only=True),
    cmap=sns.diverging_palette(20, 238, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidth=.5,
    annot=True,
)

# Same matrix with hierarchical clustering of the variables.
sns.clustermap(
    data=penguins_df.corr(numeric_only=True),
    cmap=sns.diverging_palette(20, 238, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidth=.5,
    annot=True,
)
# Copy of the data with integer codes for the categorical variables, so they
# can take part in the correlation matrix.
# The codes are given as explicit dictionaries: set literals have no defined
# order, so pairing two sets cannot guarantee which category receives which
# code.
df2 = penguins_df.copy()
df2 = df2.assign(
    # sex: male -> 1, female -> 0
    numeric_sex=lambda df: df.sex.map({'male': 1, 'female': 0}),
    # island: Torgersen -> 0, Biscoe -> 1, Dream -> 2
    numeric_island=lambda df: df.island.map({'Torgersen': 0, 'Biscoe': 1, 'Dream': 2}),
    # species: Adelie -> 0, Gentoo -> 1, Chinstrap -> 2
    numeric_species=lambda df: df.species.map({'Adelie': 0, 'Gentoo': 1, 'Chinstrap': 2}),
)

# Clustered correlation heatmap including the encoded variables.
# `numeric_only=True` skips the original text columns, which would otherwise
# make corr() raise on recent pandas versions.
sns.clustermap(
    data=df2.corr(numeric_only=True),
    cmap=sns.diverging_palette(20, 238, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidth=.5,
    annot=True,
)
from sklearn.metrics import r2_score
# Quadratic relationship plus Gaussian noise: y clearly depends on x, yet
# the relationship is not linear.
plt.figure(figsize=(5, 5))
x = np.linspace(-100, 120, 100)
y = x**2
y = y + np.random.normal(0, 1000, x.size)
plt.scatter(x=x, y=y)
plt.show()
np.corrcoef(x, y)

# Cubic relationship plus Gaussian noise.
plt.figure(figsize=(5, 5))
x = np.linspace(-100, 100, 100)
y = x**3
y = y + np.random.normal(0, 1000, x.size)
plt.scatter(x, y)
plt.show()
np.corrcoef(x, y)
# Bill length against bill depth for all penguins, and their correlation.
plt.figure(figsize=(6, 6))
sns.scatterplot(data=penguins_df, x='bill_length_mm', y='bill_depth_mm')
plt.show()
penguins_df.bill_depth_mm.corr(penguins_df.bill_length_mm)

# The same scatter coloured by species, and the correlation computed
# separately within each species.
sns.scatterplot(
    data=penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    palette=penguin_color,
)
plt.show()
bill_depth_by_species = penguins_df.groupby('species').bill_depth_mm
bill_depth_by_species.corr(penguins_df.bill_length_mm)
# Two synthetic linear datasets with different slopes and noise levels, to
# compare their correlation coefficients.
np.random.seed(42)

# Series 1: slope 0.1 with uniform noise in [-2, 2].
x_1 = np.linspace(0, 100, 100)
y_1 = 0.1 * x_1 + 3 + np.random.uniform(-2, 2, size=x_1.size)
sns.scatterplot(x=x_1, y=y_1)

# Series 2: slope 0.5 with uniform noise in [0, 60].
x_2 = np.linspace(0, 100, 100)
y_2 = 0.5 * x_2 + 1 + np.random.uniform(0, 60, size=x_2.size)
sns.scatterplot(x=x_2, y=y_2)

plt.legend(labels=['1', '2'])
for xs_demo, ys_demo in ((x_1, y_1), (x_2, y_2)):
    print(np.corrcoef(xs_demo, ys_demo))
# One regression line per species: bill length as a function of bill depth.
sns.lmplot(
    data=penguins_df,
    x='bill_depth_mm',
    y='bill_length_mm',
    hue='species',
    palette=penguin_color,
    aspect=1.5,
)

# Variables of interest.
x = penguins_df.bill_length_mm
y = penguins_df.bill_depth_mm

# Simple linear regression fitted in both directions: x as the predictor of
# y, then y as the predictor of x.
res_x_y = scipy.stats.linregress(x=x, y=y)
res_y_x = scipy.stats.linregress(x=y, y=x)
print(f'''
{res_x_y}
{res_y_x}''')

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: y as predictor, with its fitted line between min and max.
sns.scatterplot(ax=ax[0], x=y, y=x)
fx_2 = np.array([y.min(), y.max()])
fy_2 = res_y_x.intercept + res_y_x.slope * fx_2
ax[0].plot(fx_2, fy_2)

# Right panel: x as predictor, with its fitted line between min and max.
sns.scatterplot(ax=ax[1], x=x, y=y)
fx_1 = np.array([x.min(), x.max()])
fy_1 = res_x_y.intercept + res_x_y.slope * fx_1
ax[1].plot(fx_1, fy_1)
# Both fitted lines on the same axes (bill length horizontal, bill depth
# vertical).
sns.scatterplot(
    x=x,
    y=y
)
# Each line carries its own label, so the legend entry is tied to the right
# artist. Passing the labels as a set gives them no defined order and
# attaches them positionally, starting with the scatter points.
plt.plot(fx_1, fy_1, label=f'x:{x.name}')
# The second fit used y as the predictor, so its coordinates are swapped to
# draw it in the same axes.
plt.plot(fy_2, fx_2, label=f'x:{y.name}')
plt.legend()
# Coefficients of bill length regressed on bill depth.
(
    smf.ols(
        formula='bill_length_mm ~ bill_depth_mm',
        data=penguins_df
    )
    .fit()
    .params
)
# Coefficients of the reverse regression: bill depth on bill length.
(
    smf.ols(
        formula='bill_depth_mm ~ bill_length_mm',
        data=penguins_df
    )
    .fit()
    .params
)
# Correlation matrix of the numeric variables. `numeric_only=True` is needed
# because the frame also holds text columns, which make corr() raise on
# recent pandas versions.
penguins_df.corr(numeric_only=True)
# Model 1: body mass explained by bill length only.
model_1 = smf.ols(
    formula='body_mass_g ~ bill_length_mm',
    data=penguins_df,
).fit()
model_1.summary()

# Model 2: adds bill depth.
model_2 = smf.ols(
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm ',
    data=penguins_df,
).fit()
model_2.summary()

# Model 3: adds flipper length.
model_3 = smf.ols(
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm',
    data=penguins_df,
).fit()
model_3.summary()

# Model 4: adds sex.
model_4 = smf.ols(
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + sex',
    data=penguins_df,
).fit()
model_4.summary()

# Model 5: flipper length and sex only, with sex marked as categorical.
model_5 = smf.ols(
    formula='body_mass_g ~ flipper_length_mm + C(sex)',
    data=penguins_df,
).fit()
model_5.summary()
# Highlight strongly correlated numeric pairs (0.8 to 0.99).
# `numeric_only=True` is needed because the frame also holds text columns,
# which make corr() raise on recent pandas versions.
penguins_df.corr(numeric_only=True).style.highlight_between(left=.8, right=.99)

# Observed body mass next to the in-sample predictions of every model, plus
# the grouping variables.
model_result = pd.DataFrame(
    dict(
        actual_value=penguins_df.body_mass_g,
        prediction_model_1=model_1.predict(),  # body_mass_g ~ bill_length_mm
        prediction_model_2=model_2.predict(),  # body_mass_g ~ bill_length_mm + bill_depth_mm
        prediction_model_3=model_3.predict(),  # body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm
        prediction_model_4=model_4.predict(),  # body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + sex
        prediction_model_5=model_5.predict(),  # body_mass_g ~ flipper_length_mm + C(sex)
        species=penguins_df.species,
        sex=penguins_df.sex,
    )
)
# Histograms of the observed values and of each model's predictions.
fig, ax = plt.subplots(1, 6, figsize=(16.9, 1))
value_columns = model_result.iloc[:, :6].columns
for panel, col in zip(ax, value_columns):
    sns.histplot(ax=panel, x=model_result[col])
plt.show()
model_result

# ECDF of the observed values against each model's predictions.
fig, ax = plt.subplots(1, 5, figsize=(30, 5))
prediction_columns = model_result.iloc[:, 1:6].columns
for panel, col in zip(ax, prediction_columns):
    sns.ecdfplot(
        ax=panel,
        data=model_result.select_columns('actual_value', col),
    )

# Density of the observed values against the predictions of models 4 and 5.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
last_two_columns = model_result.iloc[:, 4:6].columns
for panel, col in zip(ax, last_two_columns):
    sns.kdeplot(
        ax=panel,
        data=model_result.select_columns('actual_value', col),
    )

# Same comparison as cumulative densities.
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
last_two_columns = model_result.iloc[:, 4:6].columns
for panel, col in zip(ax, last_two_columns):
    sns.kdeplot(
        ax=panel,
        data=model_result.select_columns('actual_value', col),
        cumulative=True,
    )
# Body mass against flipper length, with one regression line per sex.
sns.lmplot(
    data=penguins_df, x='flipper_length_mm', y='body_mass_g', hue='sex', height=10,
)

# Logistic regression of sex on the body measurements and the island.
# df2 is used because it already carries the numeric encoding `numeric_sex`.
logit_formula = 'numeric_sex ~ flipper_length_mm + bill_length_mm + bill_depth_mm + C(island)'
logit_model = smf.logit(formula=logit_formula, data=df2).fit()
logit_model.summary()

# Difference between two island coefficients (Dream and Torgersen, per the
# original note), with the values typed in by hand.
-1.55 - (-1.03)
# Number of penguins for each island / sex combination.
(
    penguins_df
    .value_counts(['island', 'sex'])
    .reset_index(name='count')
)

# Binary target: 1 for Adelie, 0 for any other species. A comparison cast to
# int always yields an integer column, whereas replacing strings with numbers
# depends on pandas implicitly downcasting the result.
penguins_df_2 = (
    penguins_df
    .assign(is_adelie=lambda df: (df.species == 'Adelie').astype(int))
)

# Logistic regression: is the penguin an Adelie, given flipper length and sex?
model_is_adelie = smf.logit(
    formula='is_adelie ~ flipper_length_mm + sex',
    data=penguins_df_2
).fit()
model_is_adelie.params

# Actual class next to the predicted class (probability rounded to 0 or 1).
is_adelie_model_prediction = pd.DataFrame(
    dict(
        actual_adelie=penguins_df_2.is_adelie,
        predicted_values=model_is_adelie.predict().round()
    )
)
is_adelie_model_prediction

# Count of every (actual, predicted) combination.
(
    is_adelie_model_prediction
    .value_counts(['actual_adelie', 'predicted_values'])
    .reset_index(name='count')
)

# Confusion matrix and overall accuracy of the classifier.
print(
    sklearn.metrics.confusion_matrix(
        is_adelie_model_prediction.actual_adelie,
        is_adelie_model_prediction.predicted_values
    )
)
sklearn.metrics.accuracy_score(
    is_adelie_model_prediction.actual_adelie,
    is_adelie_model_prediction.predicted_values
)
# Pooled scatter of bill length against bill depth.
plt.figure(figsize=(6, 6))
sns.scatterplot(data=penguins_df, x='bill_length_mm', y='bill_depth_mm');

# Same data with a single regression line fitted to all penguins.
plt.figure(figsize=(6, 6))
sns.regplot(data=penguins_df, x='bill_length_mm', y='bill_depth_mm');

# One regression line per species.
sns.lmplot(
    data=penguins_df,
    x='bill_length_mm',
    y="bill_depth_mm",
    hue='species',
    palette=penguin_color,
)

# One regression line per species, in a separate panel for each sex.
sns.lmplot(
    data=penguins_df,
    x='bill_length_mm',
    y="bill_depth_mm",
    hue='species',
    col='sex',
    palette=penguin_color,
)

# Pairwise relationships of all numeric variables, coloured by species, with
# density curves on the diagonal.
sns.pairplot(
    data=penguins_df,
    hue='species',
    diag_kind='kde',
    palette=penguin_color,
)