# Author: Sebastián Avello García
import empiricaldist
#import janitor
!pip install pyjanitor
import janitor
import matplotlib.pyplot as plt
import numpy as np
import palmerpenguins
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as ss
%matplotlib inline
# Global plotting defaults used by every figure in this notebook.
sns.set_style('whitegrid')
sns.set_context('notebook')
plt.rcParams['figure.figsize'] = (11, 9.4)

# One fixed colour per penguin species so figures stay comparable.
penguin_color = dict(
    Adelie='#ff6602ff',
    Gentoo='#0f7175ff',
    Chinstrap='#c65dc9ff',
)
# Load the raw Palmer Penguins table; the result is not stored (notebook display only).
palmerpenguins.load_penguins_raw()
# Load the table returned by `load_penguins()` and keep it as the working frame.
preprocess_penguins_df = palmerpenguins.load_penguins()
preprocess_penguins_df
# seaborn's copy of the dataset; evaluated but not stored.
sns.load_dataset('penguins')
# NOTE(review): this rebinds `preprocess_penguins_df` to the contents of a local
# CSV, discarding the frame loaded from `palmerpenguins` above. The file
# 'penguins.csv' must exist in the working directory - confirm which source is intended.
preprocess_penguins_df = pd.read_csv('penguins.csv')
preprocess_penguins_df
# Structural overview: column dtypes, number of columns per dtype, and shape.
preprocess_penguins_df.info()
preprocess_penguins_df.dtypes.value_counts()
preprocess_penguins_df.shape

# Boolean mask of missing cells, computed once and reused for every summary.
missing_mask = preprocess_penguins_df.isnull()

# Does each column contain at least one missing value?
missing_mask.any()

# Missing values per column, then the grand total for the whole frame.
missing_per_column = missing_mask.sum()
missing_per_column
missing_per_column.sum()

# Long-format and transposed views of the mask.
missing_mask.melt()
missing_mask.transpose()
# Drop every row that contains at least one missing value and KEEP the result.
# All later statements read `processed_penguins_df`; the original expression
# discarded the cleaned frame, leaving that name undefined.
processed_penguins_df = (
    preprocess_penguins_df
    .dropna()
)
# Summary statistics: default view, numeric columns only, object columns only.
processed_penguins_df.describe()
processed_penguins_df.describe(include=[np.number])
processed_penguins_df.describe(include=object)

# Preview of the frame with the three label columns cast to `category`
# (the result is not stored, so `processed_penguins_df` keeps its dtypes).
categorical_dtypes = dict.fromkeys(('species', 'island', 'sex'), 'category')
processed_penguins_df.astype(categorical_dtypes)

# Number of rows per species.
processed_penguins_df['species'].value_counts()
# Count plot straight from the raw rows.
sns.catplot(data=processed_penguins_df, x='species', kind='count')

# Same information from an explicit count table, drawn as a bar plot.
species_counts = (
    processed_penguins_df
    .value_counts('species', sort=True)
    .reset_index(name='count')
)
sns.barplot(
    data=species_counts,
    x='species',
    y='count',
    palette=penguin_color,
)

# Single stacked bar of species proportions: a constant dummy column 'x'
# puts every row in the same bin, and `multiple='fill'` normalises the stack.
with_dummy_axis = processed_penguins_df.add_column('x', '')
sns.displot(
    data=with_dummy_axis,
    x='x',
    hue='species',
    multiple='fill',
    palette=penguin_color,
)
# --- Central tendency -------------------------------------------------------
# Mean of a single column, via pandas and via NumPy.
processed_penguins_df.bill_depth_mm.mean()
np.mean(processed_penguins_df.bill_depth_mm)

# Frame-wide reductions are restricted to numeric columns: the frame also holds
# string columns (species, island, sex) and pandas >= 2.0 no longer drops them
# silently, it raises TypeError instead.
processed_penguins_df.mean(numeric_only=True)
processed_penguins_df.median(numeric_only=True)
# The mode is defined for every dtype, so no restriction is needed.
processed_penguins_df.mode()
processed_penguins_df.describe(include=object)

# --- Dispersion -------------------------------------------------------------
# Maximum, minimum and range.
processed_penguins_df.max(numeric_only=True)
processed_penguins_df.min(numeric_only=True)
processed_penguins_df.max(numeric_only=True) - processed_penguins_df.min(numeric_only=True)

# One standard deviation above and below the mean.
processed_penguins_df.std(numeric_only=True) + processed_penguins_df.mean(numeric_only=True)
processed_penguins_df.mean(numeric_only=True) - processed_penguins_df.std(numeric_only=True)

# Third quartile, first quartile and inter-quartile range.
processed_penguins_df.quantile(0.75, numeric_only=True)
processed_penguins_df.quantile(0.25, numeric_only=True)
(
    processed_penguins_df.quantile(0.75, numeric_only=True)
    - processed_penguins_df.quantile(0.25, numeric_only=True)
)
# Histogram of flipper length (1 mm bins) with the sample mean marked in red,
# followed by a box plot of the same variable.
flipper_mean = processed_penguins_df.flipper_length_mm.mean()

sns.histplot(data=processed_penguins_df, x='flipper_length_mm', binwidth=1)
plt.axvline(x=flipper_mean, color='red', linestyle='dashed', linewidth=2)

sns.boxplot(data=processed_penguins_df, x='flipper_length_mm')
def freedman_diaconis_bindwidth(x: pd.Series) -> float:
    """Return the histogram bin width given by the Freedman-Diaconis rule.

    The rule is ``2 * IQR / n ** (1 / 3)``, where ``IQR`` is the
    inter-quartile range and ``n`` the number of observations.

    Args:
        x: Numeric series. Missing values are ignored.

    Returns:
        The bin width, or ``nan`` when ``x`` has no non-missing values.
    """
    # `quantile` skips missing values, so the observation count must skip them
    # too; `x.size` would also count NaNs and make the bins too narrow.
    n_observations = x.count()
    if n_observations == 0:
        return float('nan')
    iqr = x.quantile(0.75) - x.quantile(0.25)
    return 2 * iqr / n_observations ** (1 / 3)
# Histogram of flipper length scaled so bar heights are probabilities (1 mm bins).
sns.histplot(
data=processed_penguins_df,
x='flipper_length_mm',
binwidth=1,
stat='probability'
)
# Empirical probability mass function built with empiricaldist.
pmf_flipper_lenght_mm = empiricaldist.Pmf.from_seq(
processed_penguins_df.flipper_length_mm,
normalize=True
)
# Probability mass at a flipper length of 190.
pmf_flipper_lenght_mm(190)
processed_penguins_df.flipper_length_mm.max()
pmf_flipper_lenght_mm.bar()
# Empirical cumulative distribution drawn by seaborn.
sns.ecdfplot(
data=processed_penguins_df,
x='flipper_length_mm',
)
# The same cumulative distribution as an empiricaldist Cdf object.
cdf_flipper_length_mm = empiricaldist.Cdf.from_seq(
processed_penguins_df.flipper_length_mm,
normalize=True
)
# compute a value from a given probability
cdf_flipper_length_mm.step()
# specify probabilities
# NOTE(review): p_1 and p_2 are not referenced again in the visible code; `ps` is used instead.
p_1 = 0.25
p_2 = 0.75
# IQR
ps = (0.25, 0.75)
# get the value for the given probability
qs = cdf_flipper_length_mm.inverse(ps)
# Dashed vertical guides from the x-axis up to each (quantile, probability) point.
plt.vlines(
x=qs,
ymin=0,
ymax=ps,
color='black',
linestyles='dashed'
)
# Dashed horizontal guides from the first PMF quantity to each quantile.
plt.hlines(
y=ps,
xmin=pmf_flipper_lenght_mm.qs[0],
xmax=qs,
color='black',
linestyles='dashed'
)
# Mark the two quantile points in red, drawn above the guide lines.
plt.scatter(
x=qs,
y=ps,
color='red',
zorder=2
)
# One ECDF curve per species.
sns.ecdfplot(
data=processed_penguins_df,
x='flipper_length_mm',
hue='species',
palette=penguin_color
)
# Kernel density estimate of flipper length with bw_method=0.1.
sns.kdeplot(
data=processed_penguins_df,
x='flipper_length_mm',
bw_method=0.1
)
# Summary statistics of body mass (count, mean, std, min, quartiles, max).
stats = processed_penguins_df.body_mass_g.describe()
stats

# Theoretical normal CDF with the sample mean and standard deviation,
# evaluated on an even grid between the observed minimum and maximum.
# The grid and the CDF are computed once; the original computed both twice.
xs = np.linspace(stats['min'], stats['max'])
ys = scipy.stats.norm(stats['mean'], stats['std']).cdf(xs)
ys
plt.plot(xs, ys, color='black', linestyle='dashed')
# Law of large numbers: the empirical PMF of a fair die approaches 1/6 per
# face as the number of throws grows.
dice = empiricaldist.Pmf.from_seq([1, 2, 3, 4, 5, 6])
dice.bar()

# Integer sample sizes are written directly instead of casting floats such as
# 1e2 on every iteration. The loop body is indented, which the flattened
# original had lost.
for sample_size in (100, 1_000, 10_000):
    values = dice.sample(sample_size)
    sample_pmf = empiricaldist.Pmf.from_seq(values)

    # One figure per sample size, with the theoretical probability as reference.
    plt.figure(figsize=(5, 5))
    sample_pmf.bar()
    plt.axhline(y=1/6, color='red', linestyle='dashed')
    plt.ylim([0, 0.50])
# Observed proportion of each sex, as a table and as a bar chart.
processed_penguins_df.sex.value_counts(normalize=True)
processed_penguins_df.sex.value_counts(normalize=True).plot(kind='bar')

# Replace the words 'male' / 'female' by 1 / 0 so that the mean of a sample is
# the proportion of males in it.
sex_numeric = processed_penguins_df.sex.replace(['male', 'female'], [1, 0])

number_samples = 1000
sample_size = 35

# Draw `number_samples` samples with replacement, one column per sample.
# The frame is built in one step from a dict: inserting 1000 columns one at a
# time fragments the frame and triggers pandas' PerformanceWarning. Samples are
# drawn in the same order as before, so the seeded results are unchanged.
np.random.seed(42)
samples_df = pd.DataFrame(
    {
        f"sample_{i}": sex_numeric.sample(sample_size, replace=True).to_numpy()
        for i in range(1, number_samples + 1)
    }
)

# Mean of every sample, computed once and reused below.
column_means = samples_df.mean()

male_population_mean = column_means.mean()
print(f"Estimated percentage of male penguins in population is: {male_population_mean * 100:.4f}%")

# Distribution of the sample means, with the observed proportion as reference.
sample_means_binomial = pd.DataFrame(column_means, columns=['sample_mean'])
sns.kdeplot(data=sample_means_binomial)
plt.axvline(x=sex_numeric.mean(), color='red', linestyle='dashed')

# Estimated mean as a function of how many samples are averaged (first i columns).
sample_size_experiment = pd.DataFrame(
    [[i, column_means.iloc[:i].mean()] for i in range(1, number_samples + 1)],
    columns=['sample_size', 'estimated_mean']
)
# Bill length vs. bill depth, one colour per species.
sns.scatterplot(
data=processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
alpha = 1/2, # transparency
s=100, # marker size
hue='species'
)
# Bivariate distribution plot of the same two variables (default `kind`).
sns.displot(
data= processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
rug=True, # draws a marginal rug line for each variable
hue='species'
)
# Same plot using kernel density estimation.
sns.displot(
data= processed_penguins_df,
x='bill_length_mm',
y='bill_depth_mm',
rug=True, # draws a marginal rug line for each variable
kind='kde',
hue='species'
)
# multiple plots
# scatterplot + histogram
sns.jointplot(
data= processed_penguins_df,
x= 'bill_length_mm',
y= 'bill_depth_mm',
hue='species'
)
# Every plot in this section shows flipper length split by species, so the
# shared arguments are declared once.
flipper_by_species = dict(
    data=processed_penguins_df,
    x='species',
    y='flipper_length_mm',
)

# Scatter plot with a distinct colour per category.
sns.scatterplot(**flipper_by_species, hue='species', palette=penguin_color)

# Strip plot of the same data.
sns.stripplot(**flipper_by_species, palette=penguin_color)

# Box plot with the individual points drawn on top in dark grey.
ax = sns.boxplot(**flipper_by_species, palette=penguin_color)
ax = sns.stripplot(**flipper_by_species, color='.3')

# Violin plot with the individual points drawn on top.
ax = sns.violinplot(**flipper_by_species, palette=penguin_color, color='.8')
ax = sns.stripplot(**flipper_by_species, palette=penguin_color)

# Swarm plot of the same data.
sns.swarmplot(**flipper_by_species, palette=penguin_color)
# Correlation matrix of the cleaned frame. `numeric_only=True` is required
# because the frame also holds string columns and pandas >= 2.0 raises on them
# instead of dropping them silently.
processed_penguins_df.corr(numeric_only=True)

# Correlations of the frame before row-wise NaN removal, computed once and
# shared by both figures.
correlation_matrix = preprocess_penguins_df.corr(numeric_only=True)
diverging_cmap = sns.diverging_palette(20, 230, as_cmap=True)

# Annotated heat map on a fixed [-1, 1] scale centred at 0.
sns.heatmap(
    data=correlation_matrix,
    cmap=diverging_cmap,
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot=True
)

# Same matrix passed to clustermap.
sns.clustermap(
    data=correlation_matrix,
    cmap=diverging_cmap,
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot=True
)
def _encode_sex(df):
    """Return the `sex` column with 'female' replaced by 0 and 'male' by 1."""
    return df.sex.replace(['female', 'male'], [0, 1])


# Preview of the frame with the extra `numeric_sex` column (result not stored).
processed_penguins_df.assign(numeric_sex=_encode_sex)

# Same transformation, this time kept.
processed_penguins_df = processed_penguins_df.assign(numeric_sex=_encode_sex)
# Quadratic relationship plus Gaussian noise, with its correlation matrix.
x = np.linspace(-110, 100, 100)
y = x**2 + np.random.normal(0, 100, x.size)
sns.scatterplot(x=x, y=y)
np.corrcoef(x, y)

# Cubic relationship plus Gaussian noise, with its correlation matrix.
x = np.linspace(-110, 100, 100)
y = x**3 + np.random.normal(0, 100, x.size)
sns.scatterplot(x=x, y=y)
np.corrcoef(x, y)

# Bill length against bill depth for the penguin data.
sns.scatterplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
)
# Two synthetic data sets: a shallow slope with little noise (series 1) and a
# steeper slope with much larger noise (series 2).
np.random.seed(42)
x1 = np.linspace(0, 100, 100)
y1 = 0.1 * x1 + 3 + np.random.uniform(-2, 2, size=x1.size)
sns.scatterplot(x=x1, y=y1)

x2 = np.linspace(0, 100, 100)
# Built from x2 (the original used x1, which happens to hold the same values).
y2 = 0.5 * x2 + 1 + np.random.uniform(0, 60, size=x2.size)
sns.scatterplot(x=x2, y=y2)
plt.legend(["1", "2"])

print(np.corrcoef(x1, y1))
print(np.corrcoef(x2, y2))

# Least-squares fits. `res_1` and `res_2` were read below without ever being
# defined, which raised NameError.
res_1 = scipy.stats.linregress(x=x1, y=y1)
res_2 = scipy.stats.linregress(x=x2, y=y2)

# Each fitted line is drawn between the two extreme x values, on top of its data.
sns.scatterplot(x=x1, y=y1)
fx1 = np.array([x1.min(), x1.max()])
fy1 = res_1.intercept + res_1.slope * fx1
plt.plot(fx1, fy1)

sns.scatterplot(x=x2, y=y2)
fx2 = np.array([x2.min(), x2.max()])
fy2 = res_2.intercept + res_2.slope * fx2
plt.plot(fx2, fy2)
plt.legend(["1", "1", "2", "2"])
# Simple linear regression of bill depth on bill length for all penguins.
bill_length = processed_penguins_df.bill_length_mm
bill_depth = processed_penguins_df.bill_depth_mm

sns.scatterplot(processed_penguins_df, x="bill_length_mm", y="bill_depth_mm")
res_penguins = scipy.stats.linregress(x=bill_length, y=bill_depth)
print(res_penguins)

# Fitted line drawn between the smallest and largest bill length.
fx1 = np.array([bill_length.min(), bill_length.max()])
fy1 = res_penguins.intercept + res_penguins.slope * fx1
plt.plot(fx1, fy1)

# The same relationship fitted separately for each species.
sns.lmplot(
    data=processed_penguins_df,
    x='bill_length_mm',
    y='bill_depth_mm',
    height=10,
    hue='species',
)
# Fit the regression in both directions between bill depth (x) and
# bill length (y) and draw the resulting lines.
x = processed_penguins_df.bill_depth_mm
y = processed_penguins_df.bill_length_mm
# res_x_y: y regressed on x; res_y_x: x regressed on y.
res_x_y = scipy.stats.linregress(x=x, y=y)
res_y_x = scipy.stats.linregress(x=y, y=x)
#Fig. 1
# Data with x on the horizontal axis and the y-on-x fitted line.
sns.scatterplot(
x=x,
y=y
)
fx_1= np.array([x.min(), x.max()])
fy_1= res_x_y.intercept + res_x_y.slope * fx_1
plt.plot(fx_1, fy_1);
#Fig. 2
# Axes swapped: y on the horizontal axis and the x-on-y fitted line.
sns.scatterplot(
x=y,
y=x
)
fx_2= np.array([y.min(), y.max()])
fy_2= res_y_x.intercept + res_y_x.slope * fx_2
plt.plot(fx_2, fy_2)
# Fig. 3
# Both fitted lines over the Fig. 1 axes; the second line is plotted with its
# coordinates swapped (fy_2, fx_2) so that it is expressed in the same axes.
sns.scatterplot(
x=x,
y=y
)
plt.plot(fx_1, fy_1)
plt.plot(fy_2, fx_2)
# Fitted OLS parameters for the regression in each direction between
# bill length and bill depth.
length_on_depth = smf.ols(
    formula='bill_length_mm ~ bill_depth_mm ',
    data=processed_penguins_df,
).fit()
length_on_depth.params

depth_on_length = smf.ols(
    formula='bill_depth_mm ~ bill_length_mm',
    data=processed_penguins_df,
).fit()
depth_on_length.params
def _fit_ols(formula):
    """Fit an OLS model for `formula` on `processed_penguins_df`."""
    return smf.ols(formula=formula, data=processed_penguins_df).fit()


# Body mass vs. bill length.
model_1 = _fit_ols('body_mass_g ~ bill_length_mm')
model_1.summary()  # model description
# NOTE(review): the original author reported a bill_length_mm slope of 86.79,
# i.e. about 86 g of body mass per extra millimetre of bill length; re-check
# against the summary produced by the current data.

# Body mass vs. bill length and bill depth.
model_2 = _fit_ols('body_mass_g ~ bill_length_mm + bill_depth_mm')
model_2.summary()
# NOTE(review): the original author reported that the bill_length_mm
# coefficient changes to 74.81 and that each extra millimetre of bill depth is
# associated with a loss of about 145 g; re-check against the current summary.

# Adding flipper length.
model_3 = _fit_ols(
    'body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm'
)
model_3.summary()

# Adding sex as a categorical predictor.
model_4 = _fit_ols(
    'body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + C(sex)'
)
model_4.summary()

# Model using only flipper length and sex.
model_5 = _fit_ols('body_mass_g ~ flipper_length_mm + C(sex)')
model_5.summary()
# Observed body mass next to the in-sample predictions of each fitted model.
# `models_result` was plotted below without ever being defined (NameError).
# `fittedvalues` is indexed like the rows used in the fit, so the columns
# align with the observed values by index.
models_result = pd.DataFrame(
    {
        'actual_value': processed_penguins_df.body_mass_g,
        'prediction_model_1': model_1.fittedvalues,
        'prediction_model_2': model_2.fittedvalues,
        'prediction_model_3': model_3.fittedvalues,
        'prediction_model_4': model_4.fittedvalues,
        'prediction_model_5': model_5.fittedvalues,
    }
)

# Compare the distribution of each model's predictions with the observed one:
# first as empirical CDFs, then as kernel density estimates.
sns.ecdfplot(
    data=models_result
)
sns.kdeplot(
    data=models_result,
    # cumulative=True  # would draw smoothed cumulative curves instead
)
# Body mass vs. flipper length with a separate regression line per sex.
sns.lmplot(
data=processed_penguins_df,
x='flipper_length_mm',
y='body_mass_g',
hue='sex',
height=10
)
# Logistic model of sex vs. flipper length, bill length, bill depth and island.
# Uses the `numeric_sex` column assigned to processed_penguins_df earlier in this file.
smf.logit(
formula='numeric_sex ~ flipper_length_mm + bill_length_mm + bill_depth_mm + C(island)',
data=processed_penguins_df
).fit().summary()
# Visual exploration of the data with seaborn
sns.pairplot(
data=processed_penguins_df,
hue='species',
palette=penguin_color,
)