Sampling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import io
import math
from scipy.stats import norm
from scipy.stats import t
import scipy.stats as stats
import pingouin
econdata = pd.read_csv("econdata.csv")
econdata.head()
Types of sampling
Simple random sampling
rand1 = econdata.sample(n=8)
rand1
rand2 = econdata.sample(n=8, random_state=18900217)
rand2
# Extract a random 25% of your population
prop_25 = econdata.sample(frac = .25)
prop_25
Systematic sampling
# let's take samples each 3 rows
def systematic_sampling(econdata, step):
indexes = np.arange(0, len(econdata), step=step)
systematic_sample = econdata.iloc[indexes]
return systematic_sample
systematic_sample = systematic_sampling(econdata,3)
systematic_sample.head(20)
sample_size = 10
interval = int( len(econdata) / sample_size )
econdata.iloc[::interval]#.reset_index()
Stratified sampling
penguins = sns.load_dataset("penguins")
penguins["species"].value_counts(normalize=True)
prop_sample = penguins.groupby("species").sample(frac=0.1)
prop_sample["species"].value_counts(normalize=True)
equal_sample = penguins.groupby("species").sample(n=15)
equal_sample["species"].value_counts(normalize=True)
condition = penguins["species"] == "Adelie"
penguins["weight"] = np.where(condition, 2, 1)
penguins_weight = penguins.sample(frac=0.1, weights="weight")
penguins_weight["species"].value_counts(normalize=True)
# 1st step: Create your stratification variable
econdata["estratificado"] = econdata["delegacion"] + ", " + econdata["tipo"]
(econdata["estratificado"].value_counts()/len(econdata)).sort_values(ascending=False)
def stratified_data(econdata, strat_cols, strat_values, strat_prop, random_state=None):
df_strat = pd.DataFrame(columns=econdata.columns)
pos = -1
for i in range(len(strat_values)):
pos += 1
if pos == len(strat_values) - 1:
ratio_len = len(econdata) - len(df_strat)
else:
ratio_len = int(len(econdata) * strat_prop[i])
df_filtered = econdata[econdata[strat_cols] == strat_values[i]]
df_temp = df_filtered.sample(replace=True, n=ratio_len, random_state=random_state)
df_strat = pd.concat([df_strat, df_temp])
return df_strat
strat_values = list(econdata["estratificado"].unique())
strat_prop = [0.5, 0.2, 0.1, 0.1, 0.1]
# Using the function with the given parameters
df_strat = stratified_data(econdata, "estratificado", strat_values, strat_prop, random_state=42)
df_strat
(
(df_strat["estratificado"].value_counts()/len(econdata))
.sort_values(ascending=False)
)
Cluster sampling
Bonus: Calculate mean per categories
attrition_pop = pd.read_csv("HR-Employee-Attrition.csv")
attrition_pop.groupby("EducationField")["HourlyRate"].mean()
Relative error
Calculate relative error
# take a 50 rows sample, set the seed to 2022
sample = attrition_pop.sample(n=50, random_state=202)
# mean of the population
pop_mean = attrition_pop["HourlyRate"].mean()
# mean of the sample
sample_mean = sample["HourlyRate"].mean()
# Relative error
(abs(pop_mean-sample_mean)/pop_mean)*100
Relative error vs sample size
population_mean = attrition_pop["HourlyRate"].mean()
sample_mean = []
n = []
for i in range(len(attrition_pop["HourlyRate"])):
sample_mean.append(attrition_pop.sample(n=i+1)["HourlyRate"].mean())
n.append(i+1)
#Turn sample_mean into a numpy array
sample_mean = np.array(sample_mean)
#Calculate the relative error (%)
rel_error_pct = 100 * abs(population_mean-sample_mean)/population_mean
plt.plot(n, rel_error_pct)
plt.xlabel("Sample size (n)")
plt.ylabel("Relative error (%)")
plt.show()
Sampling distribution
mean_hourlyrate = []
for i in range(500):
mean_hourlyrate.append(
attrition_pop.sample(n=60)["HourlyRate"].mean()
)
plt.hist(mean_hourlyrate, bins=16)
plt.show()
Approximating sampling distributions
n_dice = list(range(1, 101))
n_outcomes = []
for n in n_dice:
n_outcomes.append(6**n)
outcomes = pd.DataFrame(
{
"n_dice" : n_dice,
"n_outcomes" : n_outcomes
}
)
outcomes.plot(
x = "n_dice",
y = "n_outcomes",
kind = "scatter"
)
plt.show()
Bootstrapping
Concept
coffee_ratings = pd.read_csv(
"https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv"
)
coffee_focus = coffee_ratings[["variety", "country_of_origin", "flavor"]].reset_index()
coffee_resamp = coffee_focus.sample(frac=1, replace=True)
#frac=1 makes the sample size as big as the dataset
coffee_resamp["index"].value_counts()
len(coffee_ratings) - len(coffee_resamp.drop_duplicates(subset="index"))
np.mean(coffee_resamp["flavor"])
Actually Bootstrapping
means = []
for i in range(1000):
means.append(
np.mean(
coffee_focus.sample(frac=1, replace=True)["flavor"]
)
)
plt.hist(means)
plt.show()
Bootstrap statistics to estimate population parameters
Mean
Standard Deviation
# Population Standard deviation
coffee_ratings["flavor"].std(ddof=0)
# Sample Standard deviation
coffee_sample = coffee_ratings["flavor"].sample(n=500, replace=False)
coffee_sample.std()
# Estimated population stantard deviation
# 1. Bootstrapping the sample
bootstrapped_means = []
for i in range(5000):
bootstrapped_means.append(
np.mean(coffee_sample.sample(frac=1, replace=True))
)
# 2. Calculating standard error
standard_error = np.std(bootstrapped_means, ddof=1) # ddof=1 because it's a sample std
# 3. Calculating n
n = len(coffee_sample)
# 4, population stantard deviation = to std_error * sqrt(n)
standard_error * np.sqrt(n)
Bootstrapping vs sampling distribution to estimate the mean
spotify_population = pd.read_feather("spotify_2000_2020.feather")
spotify_sample = spotify_population.sample(n=10000)
mean_popularity_2000_samp = []
# Generate a sampling distribution of 2000 replicates
for i in range(2000):
mean_popularity_2000_samp.append(
# Sample 500 rows and calculate the mean popularity
spotify_population["popularity"].sample(n=500).mean()
)
# Print the sampling distribution results
plt.hist(mean_popularity_2000_samp)
plt.show()
mean_popularity_2000_boot = []
# Generate a bootstrap distribution of 2000 replicates
for i in range(2000):
mean_popularity_2000_boot.append(
# Resample 500 rows and calculate the mean popularity
spotify_sample["popularity"].sample(frac=1, replace=True).mean()
)
# Print the bootstrap distribution results
plt.hist(mean_popularity_2000_boot)
plt.show()
# Calculate the population mean popularity
pop_mean = spotify_population["popularity"].mean()
# Calculate the original sample mean popularity
samp_mean = spotify_sample["popularity"].mean()
# Calculate the sampling dist'n estimate of mean popularity
samp_distn_mean = np.mean(mean_popularity_2000_samp)
# Calculate the bootstrap dist'n estimate of mean popularity
boot_distn_mean = np.mean(mean_popularity_2000_boot)
# Print the means
print([pop_mean, samp_mean, samp_distn_mean, boot_distn_mean])
dist_media_muestral = []
# Generate a sampling distribution of 2000 replicates
for i in range(2000):
dist_media_muestral.append(
# Sample 500 rows and calculate the mean popularity
coffee_ratings["flavor"].sample(n=500).mean()
)
np.std(dist_media_muestral, ddof=1) * np.sqrt(500)
# Population Standard deviation
coffee_ratings["flavor"].std(ddof=0)
Confidence intervals
Quantile method for confidence intervals
# Generating a standard normal distribution
standard_normal = np.random.normal(0,1,1000000)
np.quantile(standard_normal, 0.025)
np.quantile(standard_normal, 0.975)
standard error method for confidence intervals
cumulative_density_function = []
for i in np.linspace(-5, 5, 1001):
cumulative_density_function.append(norm.cdf(i, loc=0, scale=1))
plt.plot(np.linspace(-5, 5, 1001), cumulative_density_function)
plt.show()
inverse_cumulative_density_function = []
for i in np.linspace(-5, 5, 1001):
inverse_cumulative_density_function.append(norm.ppf(i, loc=0, scale=1))
plt.plot(np.linspace(-5, 5, 1001), inverse_cumulative_density_function)
plt.show()
norm.ppf(0.025, loc= 0, scale=1)
Hypotesis testing
Concept
Assumptions
Sanity check
Hypothesis test (mean)
Steps to perform hypothesis testing
1. Define the null hypothesis and alternative hypothesis
2. Calculate the Z-score
3. Calculate the P-value
Example1. TWO TAILED TEST
context
stack_overflow = pd.read_feather("stack_overflow.feather")
stack_overflow.head()
stack_overflow["converted_comp"].mean()
so_boot_distn = []
for i in range(5000):
so_boot_distn.append(
np.mean(
stack_overflow.sample(frac=1, replace=True)["converted_comp"]
)
)
plt.hist(so_boot_distn, bins=50)
plt.show()
np.std(so_boot_distn, ddof=1)
Calculating Z-score
z_score = ( stack_overflow["converted_comp"].mean() - 110000 ) / np.std(so_boot_distn, ddof=1)
z_score
Calculating P-value
p_value = 2 * ( 1 - norm.cdf(z_score, loc=0, scale=1) )
p_value
Example2. RIGHT TAILED TEST
context
stack_overflow["age_first_code_cut"].head()
Calculating Z score
# Point estimate
prop_child_samp = (stack_overflow["age_first_code_cut"] == "child").mean()
prop_child_samp
# hypothesized statistic
prop_child_hyp = 0.35
# std_error
first_code_boot_distn = []
# bootstrap distribution
for i in range (5000):
first_code_boot_distn.append(
(stack_overflow.sample(frac=1, replace=True)["age_first_code_cut"] == "child").mean()
)
std_error = np.std(first_code_boot_distn, ddof = 1)
std_error
# Z-score
z_score = (prop_child_samp - prop_child_hyp) / std_error
z_score
Calculating P-value
norm.cdf(z_score, loc=0, scale=1)
p_value = 1 - norm.cdf(z_score, loc=0, scale=1)
p_value
Example 3: RIGHT TAILED TEST
context
late_shipments = pd.read_feather("late_shipments.feather")
late_shipments.head()
prop = (late_shipments["late"] == "Yes").mean()
prop
Calculating Z score
late_shipments_boot_distn = []
# bootstrap distribution
for i in range (5000):
late_shipments_boot_distn.append(
(late_shipments.sample(frac=1, replace=True)["late"] == "Yes").mean()
)
# standard error
std_error = np.std(late_shipments_boot_distn, ddof=1)
# remember:
# z_score = (sample statistc - hypothesized statistic) / standard error
z_score = (prop - 0.06) / std_error
z_score
Calculating P-value
# Remember
# For right tailed test, p_value = 1 - norm.cdf(...)
p_value = 1 - norm.cdf(z_score, loc = 0, scale = 1)
p_value
Confirming the results with a confidence interval
# Calculate 95% confidence interval using quantile method
lower = np.quantile(late_shipments_boot_distn, 0.025)
upper = np.quantile(late_shipments_boot_distn, 0.975)
# Print the confidence interval
print((lower, upper))
Hypothesis test (difference of means)
Assumptions for sample size
Example 1. Difference of means (right tailed)
Context
stack_overflow = pd.read_feather("stack_overflow.feather")
stack_overflow.head()
stack_overflow.groupby("age_first_code_cut")["converted_comp"].mean()
Calculating T statistic
xbar = stack_overflow.groupby("age_first_code_cut")["converted_comp"].mean()
xbar_child = xbar[1]
xbar_adult = xbar[0]
s = stack_overflow.groupby("age_first_code_cut")["converted_comp"].std()
s_child = s[1]
s_adult = s[0]
n = stack_overflow.groupby("age_first_code_cut")["converted_comp"].count()
n_child = n[1]
n_adult = n[0]
numerator = xbar_child - xbar_adult
denominator = np.sqrt( ((s_child**2)/n_child) + ((s_adult**2)/n_adult ) )
t_stat = numerator/denominator
t_stat
Calculating P value
# dof = n - 2
degrees_of_freedom = n_child + n_adult - 2
# since it is a right tailed test
P_value = 1 - t.cdf(t_stat, df=degrees_of_freedom)
P_value
Solving with pengouin
# Child compensation
child_compensation = stack_overflow[
stack_overflow["age_first_code_cut"] == "child"]["converted_comp"]
# Adult compensation
adult_compensation = stack_overflow[
stack_overflow["age_first_code_cut"] == "adult"]["converted_comp"]
pingouin.ttest(
x = child_compensation,
y = adult_compensation,
alternative="greater"
)
Example 2. Difference of means (left tailed)
Context
late_shipments = pd.read_feather("late_shipments.feather")
late_shipments.head()
Calculating T statistic
xbar = late_shipments.groupby("late")["weight_kilograms"].mean()
xbar_no = xbar[0]
xbar_yes = xbar[1]
s = late_shipments.groupby("late")["weight_kilograms"].std()
s_no = s[0]
s_yes = s[1]
n = late_shipments.groupby("late")["weight_kilograms"].count()
n_no = n[0]
n_yes = n[1]
# Calculate the numerator of the test statistic
numerator = xbar_no - xbar_yes
# Calculate the denominator of the test statistic
denominator = np.sqrt( ((s_no**2)/n_no) + ((s_yes**2)/n_yes) )
# Calculate the test statistic
t_stat = numerator/denominator
t_stat
Calculating P-value
# Calculate the degrees of freedom
degrees_of_freedom = n_no + n_yes - 2
# Calculate the p-value from the test stat
p_value = t.cdf(t_stat, df = degrees_of_freedom)
p_value
Hypothesis testing (paired data)
Assumptions for sample size
Example 1. Difference of means (left tailed)
Context
republicans = pd.read_feather("repub_votes_potus_08_12.feather")
republicans
sample_data = republicans
sample_data["diff"] = sample_data["repub_percent_08"] - sample_data["repub_percent_12"]
sample_data["diff"].hist(bins=20)
xbar_diff = sample_data["diff"].mean()
xbar_diff
Calculating T statistic
# calculating t
n_diff = len(sample_data)
s_diff = sample_data["diff"].std()
t_stat = (xbar_diff - 0)/np.sqrt((s_diff**2)/n_diff)
t_stat
Calculating P value
p_value = t.cdf(t_stat, df = n_diff-1)
p_value
solving with pingouin
pingouin.ttest(
x = sample_data["diff"],
y = 0,
alternative = "less"
)
pingouin variation of t-test for paired data
pingouin.ttest(
x = sample_data["repub_percent_08"],
y = sample_data["repub_percent_12"],
paired = True, #this argument is very important for paired data
alternative="less"
)
Example 2. Difference of means (two-tailed)
Context
democrats = pd.read_feather("dem_votes_potus_12_16.feather")
democrats.head()
Solving with pingouin
pingouin.ttest(
x = democrats['dem_percent_12'],
y = democrats['dem_percent_16'],
paired = True,
alternative = "two-sided"
)
Hypothesis test (ANOVA)
Assumptions for sample size
When to use ANOVA
stack_overflow = pd.read_feather("stack_overflow.feather")
stack_overflow[["job_sat", "converted_comp"]].head()
sns.boxplot(
x = stack_overflow["converted_comp"],
y = stack_overflow["job_sat"],
data = stack_overflow
)
plt.show()
sns.boxplot(
x = stack_overflow["converted_comp"],
y = stack_overflow["job_sat"],
data = stack_overflow,
showfliers = False
)
plt.show()
pingouin.anova() method
pingouin.anova(
data = stack_overflow,
dv = "converted_comp", #Dependant Variable
between= "job_sat"
)
pairwise tests
pingouin.pairwise_tests(
data = stack_overflow,
dv = "converted_comp",
between= "job_sat",
padjust= "none"
)
false positive danger
p -value adjustment (bonferroni)
pingouin.pairwise_tests(
data = stack_overflow,
dv = "converted_comp",
between= "job_sat",
padjust= "bonf"
)
Example 2 ANOVA (three categories)
late_shipments = pd.read_feather("late_shipments.feather")
late_shipments[["shipment_mode", "pack_price"]].head()
Checking sample size assumptions
# Count the shipment_mode values
counts = late_shipments["shipment_mode"].value_counts()
# Print the result
print(counts)
# Inspect whether the counts are big enough
print((counts >= 30).all())
Exploratory Data Analysis per Group
late_shipments.groupby("shipment_mode")["pack_price"].mean()
late_shipments.groupby("shipment_mode")['pack_price'].std()
sns.boxplot(
x = late_shipments["pack_price"],
y = late_shipments["shipment_mode"],
data = late_shipments
)
plt.show()
Conducting ANOVA
pingouin.anova(
data = late_shipments,
dv = "pack_price", #Dependant Variable
between= "shipment_mode"
)
Pairwise tests
pingouin.pairwise_tests(
data = late_shipments,
dv = "pack_price",
between = "shipment_mode"
)
P-value adjustment (bonferroni)
pingouin.pairwise_tests(
data = late_shipments,
dv = "pack_price",
between = "shipment_mode",
padjust = "bonf"
)
Hypothesis test (tests for proportions)
Assumptions for sample size
Example 1 (two-tailed test)
Context
stack_overflow["age_cat"].value_counts(normalize = True)
Calculate z score
p_hat = (stack_overflow["age_cat"] == "Under 30").mean()
p_0 = 0.5
n = len(stack_overflow)
z_score = (p_hat - p_0) / np.sqrt( (p_0 * (1 - p_0))/ n )
z_score
Calculating p-value
#substract from 1 since p-value is positive
p_value = 2 * (1 - norm.cdf(z_score))
p_value
Example 2 (right tailed test)
late_shipments = pd.read_feather("late_shipments.feather")
Checking for sample size assumptions
# Count the late values
counts = late_shipments["late"].value_counts()
# Print the result
print(counts)
# Inspect whether the counts are big enough
print((counts >= 10).all())
Calculating z-score
# Hypothesize that the proportion of late shipments is 6%
p_0 = 0.06
# Calculate the sample proportion of late shipments
p_hat = (late_shipments['late'] == "Yes").mean()
# Calculate the sample size
n = len(late_shipments)
# Calculate the numerator and denominator of the test statistic
numerator = p_hat - p_0
denominator = np.sqrt(p_0 * (1 - p_0) / n)
# Calculate the test statistic
z_score = numerator / denominator
Calculating p-value
# Calculate the p-value from the z-score
p_value = 1 - norm.cdf(z_score)
# Print the p-value
print(p_value)
Hypothesis test (tests for differences of proportions)
Assumptions for sample size
Example 1 (two-tailed)
Context
stack_overflow.groupby("age_cat")["hobbyist"].value_counts(normalize = True)
Calculating the Z score
p_hats = stack_overflow.groupby("age_cat")["hobbyist"].value_counts(normalize = True)
p_hats
p_hat_at_least_30 = p_hats[("At least 30", "Yes")]
p_hat_under_30 = p_hats[("Under 30", "Yes")]
print(p_hat_at_least_30, p_hat_under_30)
n = stack_overflow.groupby("age_cat")["hobbyist"].count()
n
n_at_least_30 = n["At least 30"]
n_under_30 = n["Under 30"]
print(n_at_least_30, n_under_30)
# calculating p_hat
p_hat = (
(n_at_least_30 * p_hat_at_least_30 + n_under_30 * p_hat_under_30)/
(n_at_least_30 + n_under_30)
)
# calculating standard error
std_error = np.sqrt(p_hat * (1 - p_hat) / n_at_least_30 +
p_hat * (1 - p_hat) / n_under_30)
# calculating z-score
z_score = (p_hat_at_least_30 - p_hat_under_30) / std_error
z_score
Calculating p-value
# p-value for two sided test
2 * norm.cdf(z_score)
Calculating with statsmodels
stack_overflow.groupby("age_cat")["hobbyist"].value_counts()
n_hobbyists = np.array([812, 1021])
n_hobbyists
n_rows = np.array([812 + 238, 1021 + 190])
n_rows
from statsmodels.stats.proportion import proportions_ztest
z_score, p_value = proportions_ztest(count=n_hobbyists,
nobs=n_rows, alternative="two-sided")
print(z_score, p_value)
Example 2 (right tailed)
Context
p_hats = (
late_shipments.
groupby("freight_cost_groups")["late"].value_counts(normalize=True)
).filter(like='Yes', axis=0)
p_hats
# sample size
ns = late_shipments.groupby("freight_cost_groups")["late"].count()
ns
p_hat = (
(p_hats["reasonable"] * ns["reasonable"] +
p_hats["expensive"] * ns["expensive"]) /
(ns["reasonable"] + ns["expensive"])
)
p_hat
# Calculate the standard error
std_error = np.sqrt(
( ( p_hat * (1 - p_hat) ) / ns["expensive"] )+
( ( p_hat * (1 - p_hat) ) / ns["reasonable"] )
)
std_error
Calculating Z score
z_score = (p_hats["expensive"] - p_hats["reasonable"])/std_error
z_score
Calculating p value
# Right tailed test
p_value = 1 - norm.cdf(z_score)
p_value
Calculating with statmodels
# Count the late column values for each freight_cost_group
late_by_freight_cost_group = (
late_shipments.groupby("freight_cost_groups")["late"].value_counts()
)
success_counts
success_counts = np.array(
[
late_by_freight_cost_group[("expensive","Yes")],
late_by_freight_cost_group[("reasonable","Yes")]
]
)
success_counts
n = np.array([
late_by_freight_cost_group["expensive"].sum(),
late_by_freight_cost_group["reasonable"].sum()
])
n
stat, p_value = proportions_ztest(
count = success_counts,
nobs = n,
alternative = "larger"
)
print(stat, p_value)
Chi Squared Tests of independance
Assumptions for sample size
Example 1
props = (
stack_overflow
.groupby("hobbyist")["age_cat"]
.value_counts(normalize = True)
)
wide_props = props.unstack()
wide_props.plot(kind = "bar", stacked = True)
expected, observed, stats = pingouin.chi2_independence(
data = stack_overflow,
x = "hobbyist",
y = "age_cat",
correction= False
)
stats
Example 2
stack_overflow["age_cat"].value_counts()
stack_overflow["job_sat"].value_counts()
props = (
stack_overflow
.groupby("job_sat")["age_cat"]
.value_counts(normalize = True)
)
wide_props = props.unstack()
wide_props.plot(kind = "bar", stacked = True)
expected, observed, stats = pingouin.chi2_independence(
data = stack_overflow,
x = "job_sat",
y = "age_cat",
correction= False
)
stats
Example 3
Checking sample size assumption
# Remove that DDU for convenience (it's only one)
late_shipments = late_shipments[late_shipments["vendor_inco_term"] != "DDU"]
# Count the values of freight_cost_group grouped by vendor_inco_term
counts = (
late_shipments
.groupby("vendor_inco_term")["freight_cost_groups"]
.value_counts()
)
# Print the result
print(counts)
# Inspect whether the counts are big enough
print((counts >= 5).all())
Exploring variable independance
props = (
late_shipments
.groupby("vendor_inco_term")["freight_cost_groups"]
.value_counts(normalize = True)
)
wide_props = props.unstack()
wide_props.plot(kind = "bar", stacked = True)
Performing chi squared test
expected, observed, stats = pingouin.chi2_independence(
data = late_shipments,
x = "vendor_inco_term",
y = "freight_cost_groups"
)
stats
Chi squared goodness of fit tests
Example 1
purple_link_counts = (
stack_overflow["purple_link"]
.value_counts()
.rename_axis("purple_link")
.reset_index(name="n")
.sort_values("purple_link")
)
purple_link_counts
hypothesized = pd.DataFrame({
"purple_link" : ["Amused", "Annoyed", "Hello, old friend", "Indifferent"],
"prop" : [1/6, 1/6, 1/2, 1/6]
})
hypothesized
n_total = len(stack_overflow)
hypothesized["n"] = hypothesized["prop"] * n_total
hypothesized
plt.bar(
purple_link_counts["purple_link"],
purple_link_counts["n"],
color = "red", label = "Observed"
)
plt.bar(
hypothesized["purple_link"],
hypothesized["n"],
alpha = 0.5,
color = "blue", label = "Hypothesized"
)
plt.legend()
plt.show()
from scipy.stats import chisquare
chisquare(
f_obs=purple_link_counts["n"],
f_exp=hypothesized["n"]
)
Example 2
# Remove that DDU for convenience (it's only one)
late_shipments = late_shipments[late_shipments["vendor_inco_term"] != "DDU"]
incoterm_counts = (
late_shipments["vendor_inco_term"]
.value_counts()
.rename_axis("vendor_inco_term")
.reset_index(name="n")
.sort_values("vendor_inco_term")
)
incoterm_counts
# Hypothesized ditribution
hypothesized = pd.DataFrame({
"vendor_inco_term" : ["CIP", "DDP", "EXW", "FCA"],
"prop" : [0.05, 0.1, 0.75, 0.1]
})
hypothesized
# Counts for hypothesized
hypothesized["n"] = hypothesized["prop"] * n_total
# Find the number of rows in late_shipments
n_total = len(late_shipments)
# Create n column that is prop column * n_total
hypothesized["n"] = hypothesized["prop"] * n_total
# Plot a red bar graph of n vs. vendor_inco_term for incoterm_counts
plt.bar(
incoterm_counts["vendor_inco_term"],
incoterm_counts["n"],
color = "red",
label="Observed")
plt.bar(
hypothesized['vendor_inco_term'],
hypothesized['n'],
alpha = 0.5,
color="blue",
label="Observed")
plt.legend()
plt.show()
gof_test = chisquare(
f_obs=incoterm_counts["n"],
f_exp=hypothesized["n"]
)
gof_test
NON Parametric tests
Wilcoxon-signed rank test (Paired data)
repub_votes = pd.read_feather("repub_votes_potus_08_12.feather")
repub_votes_small = repub_votes.sample(5)
repub_votes_small
alpha = 0.01
pingouin.ttest(
x = repub_votes_small["repub_percent_08"],
y = repub_votes_small["repub_percent_12"],
paired = True,
alternative = "less"
)
Wicoxon test process
from scipy.stats import rankdata
# Creating a list to see it's ranks
x = [1,15,3,10,6]
rankdata(x)
repub_votes_small["diff"] = repub_votes_small["repub_percent_08"] - repub_votes_small["repub_percent_12"]
repub_votes_small[["repub_percent_08", "repub_percent_12", "diff"]]
repub_votes_small["abs_diff"] = repub_votes_small["diff"].abs()
repub_votes_small[["repub_percent_08", "repub_percent_12", "diff", "abs_diff"]]
repub_votes_small["rank_abs_diff"] = rankdata(repub_votes_small["abs_diff"])
repub_votes_small[["repub_percent_08", "repub_percent_12", "diff", "abs_diff","rank_abs_diff"]]
print(repub_votes_small[repub_votes_small["diff"] < 0]["rank_abs_diff"])
T_minus = sum (repub_votes_small[repub_votes_small["diff"] < 0]["rank_abs_diff"])
print(T_minus)
print(repub_votes_small[repub_votes_small["diff"] > 0]["rank_abs_diff"])
T_plus = sum (repub_votes_small[repub_votes_small["diff"] > 0]["rank_abs_diff"])
print(T_plus)
W = np.min([T_minus, T_plus])
W
pingouin.wilcoxon
alpha = 0.01
pingouin.wilcoxon(
x = repub_votes_small["repub_percent_08"],
y = repub_votes_small["repub_percent_12"],
alternative="less"
)
Wilcoxon-Mann-Whitney test (Independent data)
sns.boxplot(
x = stack_overflow["converted_comp"],
y = stack_overflow["age_first_code_cut"],
data = stack_overflow,
showfliers = False # removing the outliers just for the chart to look better
)
plt.show()
age_vs_comp = stack_overflow[["converted_comp", "age_first_code_cut"]]
age_vs_comp_wide = age_vs_comp.pivot(columns="age_first_code_cut", values="converted_comp")
age_vs_comp_wide.head()
alpha = 0.01
pingouin.mwu(
x = age_vs_comp_wide["child"],
y = age_vs_comp_wide["adult"],
alternative = "greater"
)
Kruskal-Wallis test
sns.boxplot(
x = stack_overflow["converted_comp"],
y = stack_overflow["job_sat"],
data = stack_overflow,
showfliers = False # removing the outliers just for the chart to look better
)
plt.show()
alpha = 0.01
pingouin.kruskal(
data = stack_overflow,
dv = "converted_comp",
between = "job_sat"
)