Estadística inferencial con Python: ejemplos de uso paso a paso
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
#Funciones
def bootstrap_replicate_1d(data, func, rng=None):
    """Compute one bootstrap replicate of a 1D data set.

    Draws ``len(data)`` values from ``data`` with replacement and applies
    ``func`` to that resample.

    Args:
        data: One-dimensional sequence or array of observations.
        func: Callable that reduces a 1D array to a summary statistic
            (for example ``np.mean``).
        rng: Optional random source exposing ``choice(a, size)``, such as
            ``np.random.default_rng(seed)`` or
            ``np.random.RandomState(seed)``. When omitted, NumPy's global
            random state is used, exactly as before this parameter existed.

    Returns:
        Whatever ``func`` returns for the resampled data.
    """
    sampler = np.random if rng is None else rng
    bs_sample = sampler.choice(data, len(data))
    return func(bs_sample)


def draw_bs_reps(data, func, size=1, rng=None):
    """Draw ``size`` bootstrap replicates of ``func`` applied to ``data``.

    Args:
        data: One-dimensional sequence or array of observations.
        func: Callable that reduces a 1D array to a scalar statistic.
        size: Number of replicates to generate.
        rng: Optional random source forwarded to
            ``bootstrap_replicate_1d``; pass a seeded generator to make the
            replicates reproducible. ``None`` keeps the global NumPy state.

    Returns:
        1D float array of length ``size`` holding one replicate per entry.
    """
    # Convert once so list-like input is not re-converted on every draw.
    data = np.asarray(data)

    # Pre-allocate the output; every slot is overwritten in the loop below.
    bs_replicates = np.empty(size)
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1d(data, func, rng)
    return bs_replicates
# Apply the 'ggplot' style sheet to the figures drawn from here on.
plt.style.use('ggplot')

# Load seaborn's 'iris' example dataset into a DataFrame.
iris = sns.load_dataset('iris')

# Quick look at the data: five randomly chosen rows, then the
# per-column summary statistics.
print(iris.sample(n=5))
print(iris.describe())
Inferencia estadística: intervalos de confianza
# Box plot of sepal length for each species.
sns.boxplot(x='species', y='sepal_length', data=iris)
plt.show()

# Keep only the versicolor rows and work with their sepal lengths.
versicolor_sepal_length = iris.query('species == "versicolor"')['sepal_length']
# Bare expression: it only displays the values in an interactive session.
versicolor_sepal_length.values

# Box plot of the versicolor sepal lengths alone.
sns.boxplot(data=versicolor_sepal_length, x=versicolor_sepal_length.values)
# The iris measurements are recorded in centimetres, so label them as cm.
plt.xlabel('Longitud de sépalo, cm')
plt.title('Longitud de sépalo de Iris Versicolor')
plt.show()

# Bare expression: it only displays the summary in an interactive session.
versicolor_sepal_length.describe()

# Bootstrap the sample mean 10,000 times and take the 2.5th and 97.5th
# percentiles of the replicates as a 95 % confidence interval.
vers_replicates = draw_bs_reps(versicolor_sepal_length.values, np.mean, size=10000)
ci = np.percentile(vers_replicates, [2.5, 97.5])
print(ci)
¿Cómo interpretamos los intervalos de confianza?
# Histogram of the bootstrap means with the confidence interval marked.
plt.hist(vers_replicates, bins=20)
# The iris measurements are recorded in centimetres, so label them as cm.
plt.xlabel('Longitud de sépalo, cm')
plt.ylabel('Frecuencias')
plt.title('Réplicas bootstrap de las medias')

# Vertical lines at the lower and upper bounds of the interval.
plt.axvline(x=ci[0], color='#395d90')
plt.axvline(x=ci[1], color='#395d90')

# Shade the interval across the full height of the axes, so the band does
# not depend on a hand-picked bar height for this bin/replicate count.
plt.axvspan(ci[0], ci[1], color='#395d90', alpha=0.6)
plt.show()
Prueba de hipótesis
# Bare expression: it only displays the table in an interactive session.
iris

# Box plot of sepal length for each species.
sns.boxplot(x='species', y='sepal_length', data=iris)
plt.show()

# Sepal lengths of the two species being compared.
versicolor_sepal_length = iris.query('species == "versicolor"')['sepal_length']
virginica_sepal_length = iris.query('species == "virginica"')['sepal_length']

# Observed test statistic, computed two ways. The element-wise form needs
# both arrays to have the same length; the difference of the two means
# does not, and it is the value kept for the rest of the analysis.
observed_diffs_means = np.mean(virginica_sepal_length.values - versicolor_sepal_length.values)
print(observed_diffs_means)
observed_diffs_means = np.mean(virginica_sepal_length.values) - np.mean(versicolor_sepal_length.values)
print(observed_diffs_means)

# Mean of the two samples pooled together.
versicolor_virginica_concatenated = np.concatenate((versicolor_sepal_length.values, virginica_sepal_length.values))
mean_length = np.mean(versicolor_virginica_concatenated)
print(mean_length)

# Shift each sample so that its mean equals the pooled mean. This simulates
# the null hypothesis of equal means while keeping each sample's spread.
versicolor_shifted = versicolor_sepal_length.values - np.mean(versicolor_sepal_length.values) + mean_length
virginica_shifted = virginica_sepal_length.values - np.mean(virginica_sepal_length.values) + mean_length

# One replicate count shared by both draws, so they cannot drift apart.
n_replicates = 100000
bs_replicates_versicolor = draw_bs_reps(versicolor_shifted, np.mean, size=n_replicates)
bs_replicates_virginica = draw_bs_reps(virginica_shifted, np.mean, size=n_replicates)

# Difference of means under the null hypothesis, one value per replicate.
bs_replicates = bs_replicates_virginica - bs_replicates_versicolor

plt.hist(bs_replicates)
plt.axvline(x=observed_diffs_means, linestyle='--')
# Shade from the observed difference to x = 0.9 across the full height of
# the axes, so the band does not depend on the replicate count.
plt.axvspan(observed_diffs_means, 0.9, color='#395d90', alpha=0.6)
# NOTE(review): the label's y position (25000) is in data units and was
# picked for 100,000 replicates; revisit it if n_replicates changes.
plt.text(0.7, 25000, 'P-value')
plt.title('Distribución de diferencia de medias', size=14)
plt.ylabel('Frecuencia')
plt.xlabel('Diferencia de medias')
plt.show()

# One-sided p-value: the fraction of simulated differences at least as
# large as the observed one. The divisor comes from the array itself, so
# it always matches the number of replicates actually drawn.
p = np.sum(bs_replicates >= observed_diffs_means) / bs_replicates.size
print(p)