Estadística inferencial con Python: ejemplos de uso paso a paso
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
#Funciones
def bootstrap_replicate_1d(data, func):
    """Generate one bootstrap replicate of 1D data.

    Draws ``len(data)`` values from ``data`` with replacement and applies
    ``func`` to the resampled array.

    Args:
        data: 1D array-like of observations.
        func: Callable that computes a summary statistic from a 1D array
            (for example ``np.mean``).

    Returns:
        Whatever ``func`` returns for the resampled data.
    """
    # np.random.choice samples with replacement by default, which is what
    # makes the resample a bootstrap sample rather than a permutation.
    bs_sample = np.random.choice(data, len(data))
    return func(bs_sample)
def draw_bs_reps(data, func, size=1):
    """Draw bootstrap replicates.

    Args:
        data: 1D array-like of observations to resample.
        func: Callable that computes a summary statistic from a 1D array.
        size: Number of replicates to generate.

    Returns:
        NumPy array of length ``size`` holding one replicate per entry.
    """
    # Preallocate the output; every slot is overwritten in the loop below.
    bs_replicates = np.empty(size)

    # Each iteration resamples the data independently and stores the statistic.
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func)

    return bs_replicates
# Use matplotlib's 'ggplot' style for every figure drawn below.
plt.style.use('ggplot')
# Load the iris dataset through seaborn; the result is bound to `iris`.
iris = sns.load_dataset('iris')
# Print five randomly sampled rows as a quick look at the data.
print(iris.sample(5))
sepal_length sepal_width petal_length petal_width species
92 5.8 2.6 4.0 1.2 versicolor
64 5.6 2.9 3.6 1.3 versicolor
32 5.2 4.1 1.5 0.1 setosa
8 4.4 2.9 1.4 0.2 setosa
69 5.6 2.5 3.9 1.1 versicolor
# Print summary statistics (count, mean, std, min, quartiles, max) per column.
print(iris.describe())
sepal_length sepal_width petal_length petal_width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
Inferencia estadística: intervalos de confianza
# Compare the sepal length distribution across species.
sns.boxplot(x='species', y='sepal_length', data=iris)
plt.show()

# Keep only the sepal lengths of the versicolor rows.
versicolor_sepal_length = iris.query('species == "versicolor"')['sepal_length']
# Bare expression: only displayed when run as the last line of a notebook cell.
versicolor_sepal_length.values

# There is a single variable to plot, so pass the values once as `x`
# instead of supplying the same data through both `data` and `x`.
sns.boxplot(x=versicolor_sepal_length.values)
# The iris measurements are recorded in centimetres, not millimetres.
plt.xlabel('Longitud de sépalo, cm')
plt.title('Longitud de sépalo de Iris Versicolor')
plt.show()
# Bare expression: only displayed when run as the last line of a notebook cell.
versicolor_sepal_length.describe()

# Bootstrap the mean 10000 times and take the 2.5th and 97.5th percentiles
# of the replicates as a 95% confidence interval.
vers_replicates = draw_bs_reps(versicolor_sepal_length.values, np.mean, size=10000)
ci = np.percentile(vers_replicates, [2.5, 97.5])
print(ci)
[5.794 6.078]
¿Cómo interpretamos los intervalos de confianza?
# Histogram of the bootstrap means; keep the bin counts so the shaded band
# can be sized from the data instead of a hard-coded height.
bin_counts, _bin_edges, _patches = plt.hist(vers_replicates, bins=20)
# The iris measurements are recorded in centimetres, not millimetres.
plt.xlabel('Longitud de sépalo, cm')
plt.ylabel('Frecuencias')
plt.title('Réplicas bootstrap de las medias')
# Mark both ends of the confidence interval.
plt.axvline(x=ci[0], color='#395d90')
plt.axvline(x=ci[1], color='#395d90')
# Shade the interval up to the tallest bar, whatever the replicate count
# or number of bins happens to be.
plt.fill_between([ci[0], ci[1]], bin_counts.max(), color='#395d90', alpha=0.6)
plt.show()
Prueba de hipótesis
# Bare expression: displays the DataFrame only when run as a notebook cell.
iris
   sepal_length         sepal_width
   float64, 4.3 - 7.9   float64, 2.0 - 4.4
0  5.1                  3.5
1  4.9                  3
2  4.7                  3.2
3  4.6                  3.1
4  5                    3.6
5  5.4                  3.9
6  4.6                  3.4
7  5                    3.4
8  4.4                  2.9
9  4.9                  3.1
# Box plots of sepal length, one box per species.
sns.boxplot(data=iris, x='species', y='sepal_length')
plt.show()

# Sepal lengths of the two species being compared.
versicolor_sepal_length = iris.query('species == "versicolor"')['sepal_length']
virginica_sepal_length = iris.query('species == "virginica"')['sepal_length']

# Mean of the position-wise differences (virginica minus versicolor).
# The subtraction pairs values by position, so both arrays must have
# the same length.
observed_diffs_means = (
    virginica_sepal_length.values - versicolor_sepal_length.values
).mean()
print(observed_diffs_means)
0.6520000000000001
# Observed test statistic: virginica mean minus versicolor mean.
observed_diffs_means = (
    virginica_sepal_length.values.mean() - versicolor_sepal_length.values.mean()
)
print(observed_diffs_means)
0.6519999999999984
# Pool both species into one array and take the mean of the pooled values.
versicolor_virginica_concatenated = np.concatenate(
    [versicolor_sepal_length.values, virginica_sepal_length.values]
)
mean_length = versicolor_virginica_concatenated.mean()
print(mean_length)
6.2620000000000005
# Shift each group so both share the pooled mean: each sample keeps its own
# spread, but the two means are equal, as the hypothesis under test assumes.
versicolor_shifted = versicolor_sepal_length.values - np.mean(versicolor_sepal_length.values) + mean_length
virginica_shifted = virginica_sepal_length.values - np.mean(virginica_sepal_length.values) + mean_length

# Bootstrap the mean of each shifted group and take the replicate-wise
# difference (virginica minus versicolor), matching the observed statistic.
bs_replicates_versicolor = draw_bs_reps(versicolor_shifted, np.mean, size=100000)
bs_replicates_virginica = draw_bs_reps(virginica_shifted, np.mean, size=100000)
bs_replicates = bs_replicates_virginica - bs_replicates_versicolor

# Plot the simulated differences and mark the observed one.
plt.hist(bs_replicates)
plt.axvline(x=observed_diffs_means, linestyle='--')
plt.fill_between([observed_diffs_means, 0.9], 35000, color='#395d90', alpha=0.6)
plt.text(0.7, 25000, 'P-value')
plt.title('Distribución de diferencia de medias', size=14)
plt.ylabel('Frecuencia')
plt.xlabel('Diferencia de medias')
plt.show()

# p-value: fraction of replicates at least as large as the observed
# difference. The mean of the boolean mask gives that fraction directly,
# so it stays correct if the number of replicates above is changed.
p = np.mean(bs_replicates >= observed_diffs_means)
print(p)
0.0