import empiricaldist
import janitor
import matplotlib.pyplot as plt
import numpy as np
import palmerpenguins
import pandas as pd
import scipy.stats
import seaborn as sns
import sklearn.metrics
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats as ss
import session_info
%matplotlib inline
sns.set_style(style='whitegrid')
sns.set_context(context='notebook')
plt.rcParams['figure.figsize'] = (11, 9.4)
penguin_color = {
'Adelie': '#ff6602ff',
'Gentoo': '#0f7175ff',
'Chinstrap': '#c65dc9ff'
}
# Raw table from the palmerpenguins package, with a preview of the first rows.
penguins_DF = palmerpenguins.load_penguins_raw()
penguins_DF.head()

# Preprocessed table from the same package.
preprocesados_PGDF = palmerpenguins.load_penguins()
preprocesados_PGDF

# seaborn's 'penguins' dataset; the result is only displayed, never stored.
sns.load_dataset(name='penguins')

# The working frame is finally read from a local CSV, replacing the
# palmerpenguins copy assigned above.
preprocesados_PGDF = pd.read_csv(filepath_or_buffer='penguins.csv')
preprocesados_PGDF
# dtype of every column, then how many columns share each dtype.
preprocesados_PGDF.dtypes
preprocesados_PGDF.dtypes.value_counts()

# (rows, columns) of the working frame.
preprocesados_PGDF.shape

# Missing values: per-column flag, per-column count, and grand total.
preprocesados_PGDF.isna().any()
preprocesados_PGDF.isna().sum()
preprocesados_PGDF.isna().sum().sum()
# Share of missing vs. present values per variable: the boolean null mask is
# melted to long form ('variable', 'value') and drawn as filled bars.
sns.displot(
    data=preprocesados_PGDF.isna().melt(),
    y='variable',
    hue='value',
    multiple='fill',
    aspect=3,
)

# Heatmap of the transposed null mask (one row per variable).
sns.heatmap(data=preprocesados_PGDF.isna().T)
# Analysis frame: every row containing at least one missing value is dropped.
procesados_PGDF = preprocesados_PGDF.dropna()

# Summary statistics: all columns, numeric columns only, object columns only.
procesados_PGDF.describe(include='all')
procesados_PGDF.describe(include=[np.number])
procesados_PGDF.describe(include=object)

# Same summary after casting the three text columns to 'category' dtype.
(
    procesados_PGDF
    .astype(dict.fromkeys(('species', 'island', 'sex'), 'category'))
    .describe(include='category')
)
# Bar chart of the number of rows per species (pandas plotting).
procesados_PGDF['species'].value_counts().plot.bar()

# Same counts drawn with seaborn, using the per-species palette.
sns.catplot(
    data=procesados_PGDF,
    kind='count',
    x='species',
    palette=penguin_color,
)
plt.show()

# Species proportions as one filled bar: a constant empty-string column 'x'
# is added so that all rows fall into the same bin.
graf_dispecies = sns.displot(
    data=procesados_PGDF.add_column('x', ''),
    x='x',
    hue='species',
    multiple='fill',
    palette=penguin_color,
)
graf_dispecies.set(title='Distribucion Especie Pinguinos')
plt.show()
# --- Measures of central tendency -------------------------------------------
# Mean of a single column, computed two equivalent ways.
procesados_PGDF.bill_depth_mm.mean()
np.mean(procesados_PGDF.bill_depth_mm)

# Frame-wide reductions are restricted to numeric columns: the frame also
# holds text columns (species, island, sex), and pandas >= 2.0 raises
# instead of silently skipping them when numeric_only is left unset.
procesados_PGDF.mean(numeric_only=True)
procesados_PGDF.median(numeric_only=True)

# Mode is defined for every dtype; for text columns describe() reports the
# most frequent value as well.
procesados_PGDF.mode()
procesados_PGDF.describe(include=object)

# --- Measures of dispersion --------------------------------------------------
# Maximum, minimum and range of each numeric column.
procesados_PGDF.max(numeric_only=True)
procesados_PGDF.min(numeric_only=True)
procesados_PGDF.max(numeric_only=True) - procesados_PGDF.min(numeric_only=True)

# Standard deviation and the mean +/- one standard deviation band.
procesados_PGDF.std(numeric_only=True)
procesados_PGDF.mean(numeric_only=True) + procesados_PGDF.std(numeric_only=True)
procesados_PGDF.mean(numeric_only=True) - procesados_PGDF.std(numeric_only=True)

# Third quartile and interquartile range (Q3 - Q1).
procesados_PGDF.quantile(0.75, numeric_only=True)
(
    procesados_PGDF.quantile(0.75, numeric_only=True)
    - procesados_PGDF.quantile(0.25, numeric_only=True)
)

# Tidy table: one row per numeric variable with Q3, median, Q1 and the IQR.
(
    procesados_PGDF.quantile(q=[0.75, 0.5, 0.25], numeric_only=True)
    .transpose()
    .rename_axis('variable')
    .reset_index()
    .assign(
        iqr=lambda df: df[0.75] - df[0.25]
    )
)
# Histogram of flipper length in 1 mm bins, colored by species.
sns.histplot(
    data=procesados_PGDF,
    x='flipper_length_mm',
    hue='species',
    binwidth=1,
)

# Vertical red reference lines for mean (dotted), mode (dashed) and
# median (solid); for the mode, the first value returned by .mode() is used.
flipper_lengths = procesados_PGDF.flipper_length_mm
reference_lines = (
    (flipper_lengths.mean(), 'dotted', 'Promedio de longitud de aleta'),
    (flipper_lengths.mode()[0], 'dashed', 'Moda de longitud de aleta'),
    (flipper_lengths.median(), 'solid', 'Mediana de longitud de aleta'),
)
for position, line_style, legend_label in reference_lines:
    plt.axvline(
        x=position,
        color='red',
        linestyle=line_style,
        linewidth=2,
        label=legend_label,
    )

plt.xlabel('Longitud de aleta (mm)')
plt.ylabel('Frecuencia')
plt.title('Histograma de la longitud de aleta de los pingüinos')
plt.legend()
plt.show()

# Box plot of the same variable.
sns.boxplot(data=procesados_PGDF, x='flipper_length_mm')
def freedman_diaconis_bindwidth(x: pd.Series) -> float:
    """Return the Freedman-Diaconis bin width ``2 * IQR / n ** (1 / 3)``.

    Missing values are dropped first, so the interquartile range and the
    observation count ``n`` are computed over the same data.

    Args:
        x: Numeric series to be binned.

    Returns:
        The bin width, or ``nan`` when *x* holds no non-missing values.
    """
    observed = x.dropna()
    n_obs = observed.size
    if n_obs == 0:
        # No data: avoid the division by zero and report "undefined".
        return float('nan')
    iqr = observed.quantile(0.75) - observed.quantile(0.25)
    return 2 * iqr / n_obs ** (1 / 3)
# Histogram of flipper length in 1 mm bins, scaled to probabilities.
sns.histplot(
    data=procesados_PGDF,
    x='flipper_length_mm',
    stat='probability',
    binwidth=1,
)
plt.show()

# Empirical PMF of flipper length built with empiricaldist, drawn as bars.
pmf_flipper_length_mm = empiricaldist.Pmf.from_seq(
    procesados_PGDF['flipper_length_mm'],
    normalize=True,
)
pmf_flipper_length_mm.bar()

# PMF evaluated at 190, and the largest flipper length in the data.
pmf_flipper_length_mm(190)
procesados_PGDF['flipper_length_mm'].max()

# Empirical CDF of flipper length drawn with seaborn.
sns.ecdfplot(data=procesados_PGDF, x='flipper_length_mm')
plt.show()
# Empirical CDF of flipper length built with empiricaldist.
cdf_flipper_length_mm = empiricaldist.Cdf.from_seq(
    procesados_PGDF['flipper_length_mm'],
    normalize=True,
)
cdf_flipper_length_mm.plot()

# Forward lookup: cumulative probability p at quantity q.
q = 200
p = cdf_flipper_length_mm.forward(q)

# Dashed black guides from the axes to the point (q, p); the horizontal guide
# starts at the first quantity of the PMF.
guide_style = dict(color='black', linestyle='dashed')
plt.vlines(x=q, ymin=0, ymax=p, **guide_style)
plt.hlines(y=p, xmin=pmf_flipper_length_mm.qs[0], xmax=q, **guide_style)

# Red dot at (q, p).
plt.plot(q, p, 'ro')
# Step plot of the empirical CDF of flipper length.
cdf_flipper_length_mm.step()

# Probabilities that delimit the interquartile range. The original values
# (8.25 / 8.75) were not valid probabilities and were never used; they are
# corrected here and feed `ps` directly.
p_1 = 0.25
P_2 = 0.75
ps = (p_1, P_2)  # IQR

# Inverse lookup: quantities matching each probability in `ps`.
qs = cdf_flipper_length_mm.inverse(ps)

# Dashed black guides from the axes to each (q, p) point; the horizontal
# guides start at the first quantity of the PMF.
plt.vlines(
    x=qs,
    ymin=0,
    ymax=ps,
    color='black',
    linestyle='dashed',
)
plt.hlines(
    y=ps,
    xmin=pmf_flipper_length_mm.qs[0],
    xmax=qs,
    color='black',
    linestyle='dashed',
)

# Red markers at the quartile points, drawn with zorder=2.
plt.scatter(
    x=qs,
    y=ps,
    color='red',
    zorder=2,
)
# One empirical CDF of flipper length per species, using the species palette.
sns.ecdfplot(
    data=procesados_PGDF,
    x='flipper_length_mm',
    palette=penguin_color,
    hue='species',
)
plt.show()

# Kernel density estimate of flipper length with bw_method=0.1.
sns.kdeplot(data=procesados_PGDF, x='flipper_length_mm', bw_method=0.1)
# Summary statistics of body mass (count, mean, std, min, quartiles, max).
stats = procesados_PGDF.body_mass_g.describe()
stats

# Normal distribution with the sample mean and standard deviation, evaluated
# on an evenly spaced grid between the observed minimum and maximum.
normal_fit = scipy.stats.norm(loc=stats['mean'], scale=stats['std'])
xs = np.linspace(stats['min'], stats['max'])

# Theoretical CDF (dashed black) against the empirical CDF.
ys = normal_fit.cdf(xs)
plt.plot(xs, ys, color='black', linestyle='dashed')
empiricaldist.Cdf.from_seq(
    procesados_PGDF.body_mass_g,
    normalize=True,
).plot()

# Theoretical PDF (dashed black) against the kernel density estimate.
ys = normal_fit.pdf(xs)
plt.plot(xs, ys, color='black', linestyle='dashed')
sns.kdeplot(data=procesados_PGDF, x='body_mass_g')
# PMF built from the six faces 1..6, each listed once.
dado = empiricaldist.Pmf.from_seq([1, 2, 3, 4, 5, 6])
dado.bar()

# Draw samples of growing size from that PMF and plot each sample's PMF
# against the 1/6 reference line.
for sample_size in (100, 1_000, 10_000):
    rolls = dado.sample(sample_size)
    sample_pmf = empiricaldist.Pmf.from_seq(rolls)

    plt.figure(figsize=(5, 5))
    sample_pmf.bar()
    plt.axhline(y=1 / 6, color='red', linestyle='dashed')
    plt.ylim([0, 0.50])
    plt.title(f'Sample size: {sample_size}')
# Relative frequency of each sex in the data.
procesados_PGDF.sex.value_counts(normalize=True)

# Numeric encoding of sex: male -> 1, female -> 0.
sex_numeric = procesados_PGDF.sex.replace({'male': 1, 'female': 0})

number_samples = 1000
sample_size = 35

# Seeded so the resampling below is reproducible.
np.random.seed(42)

# Draw `number_samples` samples with replacement, each of `sample_size` values.
samples_list = [
    sex_numeric.sample(sample_size, replace=True).to_numpy()
    for _ in range(number_samples)
]

# One column per sample (sample_1 ... sample_N), one row per drawn value.
sample_names = [f"sample_{i}" for i in range(1, number_samples + 1)]
samples_df = pd.DataFrame(np.column_stack(samples_list), columns=sample_names)

# Mean of the per-sample means.
male_population_mean = samples_df.mean().mean()
print(f"Estimated percentage of male penguins in population is: {male_population_mean * 100:.4f}%")
# Bill length vs. bill depth, colored by species, semi-transparent markers.
sns.scatterplot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    alpha=0.5,
    s=100,
)

# Bivariate histogram of the same pair, with rug marks on the axes.
sns.displot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    rug=True,
)

# Bivariate kernel density estimate, with rug marks on the axes.
sns.displot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    kind='kde',
    rug=True,
)

# Joint plot with filled KDE contours in the "Reds" colormap.
sns.jointplot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    kind='kde',
    fill=True,
    cmap="Reds",
)
plt.show()
# Flipper length per species as a plain scatter plot.
sns.scatterplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    palette=penguin_color,
    hue='species',
)

# Same data as a strip plot.
sns.stripplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    palette=penguin_color,
)

# Box plot per species with the raw observations drawn in gray.
ax = sns.boxplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    palette=penguin_color,
)
ax = sns.stripplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    color='.4',
)

# Violin plot per species with the raw observations drawn in dark gray.
ax = sns.violinplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    palette=penguin_color,
)
ax = sns.stripplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    color='.1',
)
# Violin plot of flipper length per species. seaborn's parameter is `width`;
# the original `widths` is the matplotlib Axes.violinplot spelling, which
# seaborn does not define.
ax = sns.violinplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    width=0.5,
)

# Make the violins translucent so the points drawn next stay visible.
for patch in ax.collections:
    patch.set_alpha(0.2)  # lower values give more transparent violins

# Individual observations laid out as a swarm.
ax = sns.swarmplot(
    data=procesados_PGDF,
    x='species',
    y='flipper_length_mm',
    color='.1',
)

# Show the figure.
plt.show()
# Correlation matrix of the numeric columns. The frame also holds text columns
# (species, island, sex), and pandas >= 2.0 raises when corr() meets them, so
# the numeric columns are selected explicitly first.
procesados_PGDF.select_dtypes(include=np.number).corr()

# Annotated heatmap of the matrix on a diverging palette centered at 0.
sns.heatmap(
    data=procesados_PGDF.select_dtypes(include=np.number).corr(),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot=True,
)
plt.show()

# Same matrix as a clustermap.
sns.clustermap(
    data=procesados_PGDF.select_dtypes(include=np.number).corr(),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot=True,
)
plt.show()

# Add a numeric encoding of sex (female -> 0, male -> 1).
procesados_PGDF = (
    procesados_PGDF
    .assign(
        num_sex=lambda df: df.sex.replace(['female', 'male'], [0, 1])
    )
)

# Clustermap again, now on the extended frame. num_sex enters the matrix only
# if the replaced column ends up with a numeric dtype.
sns.clustermap(
    data=procesados_PGDF.select_dtypes(include=np.number).corr(),
    cmap=sns.diverging_palette(20, 230, as_cmap=True),
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.5,
    annot=True,
)
plt.show()
# Quadratic relationship plus Gaussian noise (sd 1000): scatter plot and
# correlation coefficient matrix.
x = np.linspace(-100, 100, 100)
y = x ** 2 + np.random.normal(0, 1000, x.size)
sns.scatterplot(x=x, y=y)
np.corrcoef(x, y)

# Cubic relationship plus the same kind of noise.
x = np.linspace(-100, 100, 100)
y = x ** 3 + np.random.normal(0, 1000, x.size)
sns.scatterplot(x=x, y=y)
np.corrcoef(x, y)

# Real data for comparison: bill length vs. bill depth.
sns.scatterplot(data=procesados_PGDF, x='bill_length_mm', y='bill_depth_mm')
# Seeded so both synthetic series are reproducible.
np.random.seed(42)

# Series 1: slope 0.1, intercept 3, uniform noise in [-2, 2).
x1 = np.linspace(0, 100, 100)
y1 = 0.1 * x1 + 3 + np.random.uniform(-2, 2, size=x1.size)
sns.scatterplot(x=x1, y=y1)

# Series 2: slope 0.5, intercept 1, uniform noise in [0, 60).
# y2 is now derived from x2; the original used x1, which only worked because
# both grids happen to be identical.
x2 = np.linspace(0, 100, 100)
y2 = 0.5 * x2 + 1 + np.random.uniform(0, 60, size=x2.size)
sns.scatterplot(x=x2, y=y2)

plt.legend(["1", "2"])

# Correlation coefficient matrix of each series.
print(np.corrcoef(x1, y1))
print(np.corrcoef(x2, y2))
# Per-species linear fits of bill depth on bill length.
sns.lmplot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    hue='species',
    height=10,
)
plt.show()

# Simple linear regression of each synthetic series (x1, y1) and (x2, y2).
res_1 = scipy.stats.linregress(x=x1, y=y1)
res_2 = scipy.stats.linregress(x=x2, y=y2)
print(res_1, res_2, sep='\n')

# Series 1: points plus the fitted line drawn between the x extremes.
sns.scatterplot(x=x1, y=y1)
fx1 = np.array([x1.min(), x1.max()])
fy1 = res_1.slope * fx1 + res_1.intercept
plt.plot(fx1, fy1)

# Series 2: same construction.
sns.scatterplot(x=x2, y=y2)
fx2 = np.array([x2.min(), x2.max()])
fy2 = res_2.slope * fx2 + res_2.intercept
plt.plot(fx2, fy2)

plt.legend(["1", "1", "2", "2"])
# Regress bill depth on bill length and the reverse, to compare both fits.
x = procesados_PGDF.bill_length_mm
y = procesados_PGDF.bill_depth_mm

res_x_y = scipy.stats.linregress(x=x, y=y)
res_y_x = scipy.stats.linregress(x=y, y=x)
print(res_x_y, res_y_x, sep='\n')

# Depth as a function of length: points plus fitted line.
sns.scatterplot(x=x, y=y)
fx_1 = np.array([x.min(), x.max()])
fy_1 = res_x_y.slope * fx_1 + res_x_y.intercept
plt.plot(fx_1, fy_1)

# Length as a function of depth: points plus fitted line.
sns.scatterplot(x=y, y=x)
fx_2 = np.array([y.min(), y.max()])
fy_2 = res_y_x.slope * fx_2 + res_y_x.intercept
plt.plot(fx_2, fy_2)
# OLS coefficients for bill length explained by bill depth ...
smf.ols(
    formula='bill_length_mm ~ bill_depth_mm ',
    data=procesados_PGDF,
).fit().params

# ... and for bill depth explained by bill length.
smf.ols(
    formula='bill_depth_mm ~ bill_length_mm',
    data=procesados_PGDF,
).fit().params
# Five OLS models for body mass with different predictor sets. Each model is
# fitted on the full analysis frame and its summary is displayed.

# Model 1: bill length only.
model_1 = smf.ols(
    formula='body_mass_g ~ bill_length_mm',
    data=procesados_PGDF,
).fit()
model_1.summary()  # model description

# Model 2: bill length and bill depth.
model_2 = smf.ols(
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm',
    data=procesados_PGDF,
).fit()
model_2.summary()  # model description

# Model 3: adds flipper length.
model_3 = smf.ols(
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm',
    data=procesados_PGDF,
).fit()
model_3.summary()

# Model 4: adds sex as a categorical term.
model_4 = smf.ols(
    formula='body_mass_g ~ bill_length_mm + bill_depth_mm + flipper_length_mm + C(sex)',
    data=procesados_PGDF,
).fit()
model_4.summary()

# Model 5: flipper length and categorical sex only.
model_5 = smf.ols(
    formula='body_mass_g ~ flipper_length_mm + C(sex)',
    data=procesados_PGDF,
).fit()
model_5.summary()
# Observed body mass next to the in-sample predictions of the five models,
# plus species and sex for reference.
models_result = pd.DataFrame(
    {
        'actual_value': procesados_PGDF.body_mass_g,
        'prediction_model_1': model_1.predict(),
        'prediction_model_2': model_2.predict(),
        'prediction_model_3': model_3.predict(),
        'prediction_model_4': model_4.predict(),
        'prediction_model_5': model_5.predict(),
        'species': procesados_PGDF.species,
        'sex': procesados_PGDF.sex,
    }
)
models_result

# Empirical CDF of the observed values against model 5's predictions.
sns.ecdfplot(
    data=models_result.select_columns(['actual_value', 'prediction_model_5'])
)

# Cumulative KDE curves for the columns of the results table.
sns.kdeplot(
    data=models_result,
    cumulative=True,  # draws smoothed cumulative curves
)

# Body mass vs. flipper length with one linear fit per sex.
sns.lmplot(
    data=procesados_PGDF,
    x='flipper_length_mm',
    y='body_mass_g',
    hue='sex',
    height=10,
)
# Logistic model of sex (num_sex) against flipper length, bill length,
# bill depth and island (categorical).
logit_sex_fit = smf.logit(
    formula='num_sex ~ flipper_length_mm + bill_length_mm + bill_depth_mm + C(island)',
    data=procesados_PGDF,
).fit()
logit_sex_fit.summary()

# Count table for the categorical pair (island, sex).
procesados_PGDF.value_counts(['island', 'sex']).reset_index(name='count')
# Binary target: 1 for Adelie, 0 for Gentoo and Chinstrap.
procesados_PGDF = procesados_PGDF.assign(
    is_adelie=lambda df: df.species.replace(
        {'Adelie': 1, 'Gentoo': 0, 'Chinstrap': 0}
    )
)

# Logistic model: is the penguin an Adelie, given its sex and flipper length?
model_is_adele = smf.logit(
    formula='is_adelie ~ flipper_length_mm + C(sex)',
    data=procesados_PGDF,
).fit()
model_is_adele.params  # only the fitted parameters

# Observed label next to the predicted probability rounded to 0 or 1.
is_adelie_df_prediction = pd.DataFrame(
    {
        'actual_adelie': procesados_PGDF.is_adelie,
        'predicted_values': model_is_adele.predict().round(),
    }
)
is_adelie_df_prediction

# Counts for every (actual, predicted) combination.
(
    is_adelie_df_prediction
    .value_counts(['actual_adelie', 'predicted_values'])
    .reset_index(name='count')
)

# Confusion matrix and accuracy of the rounded predictions.
actual_labels = is_adelie_df_prediction.actual_adelie
predicted_labels = is_adelie_df_prediction.predicted_values
print(sklearn.metrics.confusion_matrix(actual_labels, predicted_labels))
sklearn.metrics.accuracy_score(actual_labels, predicted_labels)
# Bill length vs. bill depth, all species pooled: raw scatter ...
sns.scatterplot(data=procesados_PGDF, x='bill_length_mm', y='bill_depth_mm')
plt.show()

# ... then with a single linear fit over all rows ...
sns.lmplot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    height=10,
)
plt.show()

# ... and with one linear fit per species.
sns.lmplot(
    data=procesados_PGDF,
    x='bill_length_mm',
    y='bill_depth_mm',
    height=10,
    hue='species',
    palette=penguin_color,
)
plt.show()

# Pairwise relationships between the variables, colored by species.
sns.pairplot(data=procesados_PGDF, hue='species', palette=penguin_color)
plt.show()
# Session information (the heading was bare text, which is not valid Python;
# it is kept here as a comment).
session_info.show()