Sampling

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns import random import io import math from scipy.stats import norm from scipy.stats import t import scipy.stats as stats import pingouin

econdata = pd.read_csv("econdata.csv")

econdata.head()

Types of sampling

Simple random sampling

rand1 = econdata.sample(n=8) rand1

rand2 = econdata.sample(n=8, random_state=18900217) rand2

# Extract a random 25% of your population prop_25 = econdata.sample(frac = .25) prop_25

Systematic sampling

# let's take samples each 3 rows def systematic_sampling(econdata, step): indexes = np.arange(0, len(econdata), step=step) systematic_sample = econdata.iloc[indexes] return systematic_sample

systematic_sample = systematic_sampling(econdata,3) systematic_sample.head(20)

sample_size = 10 interval = int( len(econdata) / sample_size ) econdata.iloc[::interval]#.reset_index()

Stratified sampling

penguins = sns.load_dataset("penguins") penguins["species"].value_counts(normalize=True)

prop_sample = penguins.groupby("species").sample(frac=0.1) prop_sample["species"].value_counts(normalize=True)

equal_sample = penguins.groupby("species").sample(n=15) equal_sample["species"].value_counts(normalize=True)

condition = penguins["species"] == "Adelie" penguins["weight"] = np.where(condition, 2, 1)

penguins_weight = penguins.sample(frac=0.1, weights="weight") penguins_weight["species"].value_counts(normalize=True)

# 1st step: Create your stratification variable econdata["estratificado"] = econdata["delegacion"] + ", " + econdata["tipo"] (econdata["estratificado"].value_counts()/len(econdata)).sort_values(ascending=False)

def stratified_data(econdata, strat_cols, strat_values, strat_prop, random_state=None): df_strat = pd.DataFrame(columns=econdata.columns) pos = -1 for i in range(len(strat_values)): pos += 1 if pos == len(strat_values) - 1: ratio_len = len(econdata) - len(df_strat) else: ratio_len = int(len(econdata) * strat_prop[i]) df_filtered = econdata[econdata[strat_cols] == strat_values[i]] df_temp = df_filtered.sample(replace=True, n=ratio_len, random_state=random_state) df_strat = pd.concat([df_strat, df_temp]) return df_strat

strat_values = list(econdata["estratificado"].unique()) strat_prop = [0.5, 0.2, 0.1, 0.1, 0.1] # Using the function with the given parameters df_strat = stratified_data(econdata, "estratificado", strat_values, strat_prop, random_state=42) df_strat

( (df_strat["estratificado"].value_counts()/len(econdata)) .sort_values(ascending=False) )

Cluster sampling

Bonus: Calculate mean per categories

attrition_pop = pd.read_csv("HR-Employee-Attrition.csv")

attrition_pop.groupby("EducationField")["HourlyRate"].mean()

Relative error

Calculate relative error

# take a 50 rows sample, set the seed to 2022 sample = attrition_pop.sample(n=50, random_state=202) # mean of the population pop_mean = attrition_pop["HourlyRate"].mean() # mean of the sample sample_mean = sample["HourlyRate"].mean() # Relative error (abs(pop_mean-sample_mean)/pop_mean)*100

Relative error vs sample size

population_mean = attrition_pop["HourlyRate"].mean() sample_mean = [] n = [] for i in range(len(attrition_pop["HourlyRate"])): sample_mean.append(attrition_pop.sample(n=i+1)["HourlyRate"].mean()) n.append(i+1) #Turn sample_mean into a numpy array sample_mean = np.array(sample_mean) #Calculate the relative error (%) rel_error_pct = 100 * abs(population_mean-sample_mean)/population_mean plt.plot(n, rel_error_pct) plt.xlabel("Sample size (n)") plt.ylabel("Relative error (%)") plt.show()

Sampling distribution

mean_hourlyrate = [] for i in range(500): mean_hourlyrate.append( attrition_pop.sample(n=60)["HourlyRate"].mean() ) plt.hist(mean_hourlyrate, bins=16) plt.show()

Approximating sampling distributions

n_dice = list(range(1, 101)) n_outcomes = [] for n in n_dice: n_outcomes.append(6**n) outcomes = pd.DataFrame( { "n_dice" : n_dice, "n_outcomes" : n_outcomes } ) outcomes.plot( x = "n_dice", y = "n_outcomes", kind = "scatter" ) plt.show()

Bootstrapping

Concept

coffee_ratings = pd.read_csv( "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-07-07/coffee_ratings.csv" )

coffee_focus = coffee_ratings[["variety", "country_of_origin", "flavor"]].reset_index()

coffee_resamp = coffee_focus.sample(frac=1, replace=True) #frac=1 makes the sample size as big as the dataset

coffee_resamp["index"].value_counts()

len(coffee_ratings) - len(coffee_resamp.drop_duplicates(subset="index"))

np.mean(coffee_resamp["flavor"])

Actually Bootstrapping

means = [] for i in range(1000): means.append( np.mean( coffee_focus.sample(frac=1, replace=True)["flavor"] ) ) plt.hist(means) plt.show()

Bootstrap statistics to estimate population parameters

Mean

Standard Deviation

# Population Standard deviation coffee_ratings["flavor"].std(ddof=0)

# Sample Standard deviation coffee_sample = coffee_ratings["flavor"].sample(n=500, replace=False) coffee_sample.std()

# Estimated population stantard deviation # 1. Bootstrapping the sample bootstrapped_means = [] for i in range(5000): bootstrapped_means.append( np.mean(coffee_sample.sample(frac=1, replace=True)) ) # 2. Calculating standard error standard_error = np.std(bootstrapped_means, ddof=1) # ddof=1 because it's a sample std # 3. Calculating n n = len(coffee_sample) # 4, population stantard deviation = to std_error * sqrt(n) standard_error * np.sqrt(n)

Bootstrapping vs sampling distribution to estimate the mean

spotify_population = pd.read_feather("spotify_2000_2020.feather") spotify_sample = spotify_population.sample(n=10000)

mean_popularity_2000_samp = [] # Generate a sampling distribution of 2000 replicates for i in range(2000): mean_popularity_2000_samp.append( # Sample 500 rows and calculate the mean popularity spotify_population["popularity"].sample(n=500).mean() ) # Print the sampling distribution results plt.hist(mean_popularity_2000_samp) plt.show()

mean_popularity_2000_boot = [] # Generate a bootstrap distribution of 2000 replicates for i in range(2000): mean_popularity_2000_boot.append( # Resample 500 rows and calculate the mean popularity spotify_sample["popularity"].sample(frac=1, replace=True).mean() ) # Print the bootstrap distribution results plt.hist(mean_popularity_2000_boot) plt.show()

# Calculate the population mean popularity pop_mean = spotify_population["popularity"].mean() # Calculate the original sample mean popularity samp_mean = spotify_sample["popularity"].mean() # Calculate the sampling dist'n estimate of mean popularity samp_distn_mean = np.mean(mean_popularity_2000_samp) # Calculate the bootstrap dist'n estimate of mean popularity boot_distn_mean = np.mean(mean_popularity_2000_boot) # Print the means print([pop_mean, samp_mean, samp_distn_mean, boot_distn_mean])

dist_media_muestral = [] # Generate a sampling distribution of 2000 replicates for i in range(2000): dist_media_muestral.append( # Sample 500 rows and calculate the mean popularity coffee_ratings["flavor"].sample(n=500).mean() ) np.std(dist_media_muestral, ddof=1) * np.sqrt(500)

# Population Standard deviation coffee_ratings["flavor"].std(ddof=0)

Confidence intervals

Quantile method for confidence intervals

# Generating a standard normal distribution standard_normal = np.random.normal(0,1,1000000)

np.quantile(standard_normal, 0.025)

np.quantile(standard_normal, 0.975)

standard error method for confidence intervals

cumulative_density_function = [] for i in np.linspace(-5, 5, 1001): cumulative_density_function.append(norm.cdf(i, loc=0, scale=1)) plt.plot(np.linspace(-5, 5, 1001), cumulative_density_function) plt.show()

inverse_cumulative_density_function = [] for i in np.linspace(-5, 5, 1001): inverse_cumulative_density_function.append(norm.ppf(i, loc=0, scale=1)) plt.plot(np.linspace(-5, 5, 1001), inverse_cumulative_density_function) plt.show()

norm.ppf(0.025, loc= 0, scale=1)

Hypotesis testing

Concept

Assumptions

Sanity check

Hypothesis test (mean)

Steps to perform hypothesis testing

1. Define the null hypothesis and alternative hypothesis

2. Calculate the Z-score

3. Calculate the P-value

Example1. TWO TAILED TEST

context

stack_overflow = pd.read_feather("stack_overflow.feather") stack_overflow.head()

stack_overflow["converted_comp"].mean()

so_boot_distn = [] for i in range(5000): so_boot_distn.append( np.mean( stack_overflow.sample(frac=1, replace=True)["converted_comp"] ) ) plt.hist(so_boot_distn, bins=50) plt.show()

np.std(so_boot_distn, ddof=1)

Calculating Z-score

z_score = ( stack_overflow["converted_comp"].mean() - 110000 ) / np.std(so_boot_distn, ddof=1) z_score

Calculating P-value

p_value = 2 * ( 1 - norm.cdf(z_score, loc=0, scale=1) ) p_value

Example2. RIGHT TAILED TEST

context

stack_overflow["age_first_code_cut"].head()

Calculating Z score

# Point estimate prop_child_samp = (stack_overflow["age_first_code_cut"] == "child").mean() prop_child_samp

# hypothesized statistic prop_child_hyp = 0.35

# std_error first_code_boot_distn = [] # bootstrap distribution for i in range (5000): first_code_boot_distn.append( (stack_overflow.sample(frac=1, replace=True)["age_first_code_cut"] == "child").mean() ) std_error = np.std(first_code_boot_distn, ddof = 1) std_error

# Z-score z_score = (prop_child_samp - prop_child_hyp) / std_error z_score

Calculating P-value

norm.cdf(z_score, loc=0, scale=1)

p_value = 1 - norm.cdf(z_score, loc=0, scale=1) p_value

Example 3: RIGHT TAILED TEST

context

late_shipments = pd.read_feather("late_shipments.feather") late_shipments.head()

prop = (late_shipments["late"] == "Yes").mean() prop

Calculating Z score

late_shipments_boot_distn = [] # bootstrap distribution for i in range (5000): late_shipments_boot_distn.append( (late_shipments.sample(frac=1, replace=True)["late"] == "Yes").mean() ) # standard error std_error = np.std(late_shipments_boot_distn, ddof=1)

# remember: # z_score = (sample statistc - hypothesized statistic) / standard error z_score = (prop - 0.06) / std_error z_score

Calculating P-value

# Remember # For right tailed test, p_value = 1 - norm.cdf(...) p_value = 1 - norm.cdf(z_score, loc = 0, scale = 1) p_value

Confirming the results with a confidence interval

# Calculate 95% confidence interval using quantile method lower = np.quantile(late_shipments_boot_distn, 0.025) upper = np.quantile(late_shipments_boot_distn, 0.975) # Print the confidence interval print((lower, upper))

Hypothesis test (difference of means)

Assumptions for sample size

Example 1. Difference of means (right tailed)

Context

stack_overflow = pd.read_feather("stack_overflow.feather") stack_overflow.head()

stack_overflow.groupby("age_first_code_cut")["converted_comp"].mean()

Calculating T statistic

xbar = stack_overflow.groupby("age_first_code_cut")["converted_comp"].mean() xbar_child = xbar[1] xbar_adult = xbar[0] s = stack_overflow.groupby("age_first_code_cut")["converted_comp"].std() s_child = s[1] s_adult = s[0] n = stack_overflow.groupby("age_first_code_cut")["converted_comp"].count() n_child = n[1] n_adult = n[0]

numerator = xbar_child - xbar_adult denominator = np.sqrt( ((s_child**2)/n_child) + ((s_adult**2)/n_adult ) ) t_stat = numerator/denominator t_stat

Calculating P value

# dof = n - 2 degrees_of_freedom = n_child + n_adult - 2 # since it is a right tailed test P_value = 1 - t.cdf(t_stat, df=degrees_of_freedom) P_value

Solving with pengouin

# Child compensation child_compensation = stack_overflow[ stack_overflow["age_first_code_cut"] == "child"]["converted_comp"] # Adult compensation adult_compensation = stack_overflow[ stack_overflow["age_first_code_cut"] == "adult"]["converted_comp"] pingouin.ttest( x = child_compensation, y = adult_compensation, alternative="greater" )

Example 2. Difference of means (left tailed)

Context

late_shipments = pd.read_feather("late_shipments.feather") late_shipments.head()

Calculating T statistic

xbar = late_shipments.groupby("late")["weight_kilograms"].mean() xbar_no = xbar[0] xbar_yes = xbar[1] s = late_shipments.groupby("late")["weight_kilograms"].std() s_no = s[0] s_yes = s[1] n = late_shipments.groupby("late")["weight_kilograms"].count() n_no = n[0] n_yes = n[1]

# Calculate the numerator of the test statistic numerator = xbar_no - xbar_yes # Calculate the denominator of the test statistic denominator = np.sqrt( ((s_no**2)/n_no) + ((s_yes**2)/n_yes) ) # Calculate the test statistic t_stat = numerator/denominator t_stat

Calculating P-value

# Calculate the degrees of freedom degrees_of_freedom = n_no + n_yes - 2 # Calculate the p-value from the test stat p_value = t.cdf(t_stat, df = degrees_of_freedom) p_value

Hypothesis testing (paired data)

Assumptions for sample size

Example 1. Difference of means (left tailed)

Context

republicans = pd.read_feather("repub_votes_potus_08_12.feather") republicans

sample_data = republicans sample_data["diff"] = sample_data["repub_percent_08"] - sample_data["repub_percent_12"] sample_data["diff"].hist(bins=20)

xbar_diff = sample_data["diff"].mean() xbar_diff

Calculating T statistic

# calculating t n_diff = len(sample_data) s_diff = sample_data["diff"].std() t_stat = (xbar_diff - 0)/np.sqrt((s_diff**2)/n_diff) t_stat

Calculating P value

p_value = t.cdf(t_stat, df = n_diff-1) p_value

solving with pingouin

pingouin.ttest( x = sample_data["diff"], y = 0, alternative = "less" )

pingouin variation of t-test for paired data

pingouin.ttest( x = sample_data["repub_percent_08"], y = sample_data["repub_percent_12"], paired = True, #this argument is very important for paired data alternative="less" )

Example 2. Difference of means (two-tailed)

Context

democrats = pd.read_feather("dem_votes_potus_12_16.feather") democrats.head()

Solving with pingouin

pingouin.ttest( x = democrats['dem_percent_12'], y = democrats['dem_percent_16'], paired = True, alternative = "two-sided" )

Hypothesis test (ANOVA)

Assumptions for sample size

When to use ANOVA

stack_overflow = pd.read_feather("stack_overflow.feather") stack_overflow[["job_sat", "converted_comp"]].head()

sns.boxplot( x = stack_overflow["converted_comp"], y = stack_overflow["job_sat"], data = stack_overflow ) plt.show()

sns.boxplot( x = stack_overflow["converted_comp"], y = stack_overflow["job_sat"], data = stack_overflow, showfliers = False ) plt.show()

pingouin.anova() method

pingouin.anova( data = stack_overflow, dv = "converted_comp", #Dependant Variable between= "job_sat" )

pairwise tests

pingouin.pairwise_tests( data = stack_overflow, dv = "converted_comp", between= "job_sat", padjust= "none" )

false positive danger

p -value adjustment (bonferroni)

pingouin.pairwise_tests( data = stack_overflow, dv = "converted_comp", between= "job_sat", padjust= "bonf" )

Example 2 ANOVA (three categories)

late_shipments = pd.read_feather("late_shipments.feather") late_shipments[["shipment_mode", "pack_price"]].head()

Checking sample size assumptions

# Count the shipment_mode values counts = late_shipments["shipment_mode"].value_counts() # Print the result print(counts) # Inspect whether the counts are big enough print((counts >= 30).all())

Exploratory Data Analysis per Group

late_shipments.groupby("shipment_mode")["pack_price"].mean()

late_shipments.groupby("shipment_mode")['pack_price'].std()

sns.boxplot( x = late_shipments["pack_price"], y = late_shipments["shipment_mode"], data = late_shipments ) plt.show()

Conducting ANOVA

pingouin.anova( data = late_shipments, dv = "pack_price", #Dependant Variable between= "shipment_mode" )

Pairwise tests

pingouin.pairwise_tests( data = late_shipments, dv = "pack_price", between = "shipment_mode" )

P-value adjustment (bonferroni)

pingouin.pairwise_tests( data = late_shipments, dv = "pack_price", between = "shipment_mode", padjust = "bonf" )

Hypothesis test (tests for proportions)

Assumptions for sample size

Example 1 (two-tailed test)

Context

stack_overflow["age_cat"].value_counts(normalize = True)

Calculate z score

p_hat = (stack_overflow["age_cat"] == "Under 30").mean() p_0 = 0.5 n = len(stack_overflow)

z_score = (p_hat - p_0) / np.sqrt( (p_0 * (1 - p_0))/ n ) z_score

Calculating p-value

#substract from 1 since p-value is positive p_value = 2 * (1 - norm.cdf(z_score)) p_value

Example 2 (right tailed test)

late_shipments = pd.read_feather("late_shipments.feather")

Checking for sample size assumptions

# Count the late values counts = late_shipments["late"].value_counts() # Print the result print(counts) # Inspect whether the counts are big enough print((counts >= 10).all())

Calculating z-score

# Hypothesize that the proportion of late shipments is 6% p_0 = 0.06 # Calculate the sample proportion of late shipments p_hat = (late_shipments['late'] == "Yes").mean() # Calculate the sample size n = len(late_shipments) # Calculate the numerator and denominator of the test statistic numerator = p_hat - p_0 denominator = np.sqrt(p_0 * (1 - p_0) / n) # Calculate the test statistic z_score = numerator / denominator

Calculating p-value

# Calculate the p-value from the z-score p_value = 1 - norm.cdf(z_score) # Print the p-value print(p_value)

Hypothesis test (tests for differences of proportions)

Assumptions for sample size

Example 1 (two-tailed)

Context

stack_overflow.groupby("age_cat")["hobbyist"].value_counts(normalize = True)

Calculating the Z score

p_hats = stack_overflow.groupby("age_cat")["hobbyist"].value_counts(normalize = True) p_hats

p_hat_at_least_30 = p_hats[("At least 30", "Yes")] p_hat_under_30 = p_hats[("Under 30", "Yes")] print(p_hat_at_least_30, p_hat_under_30)

n = stack_overflow.groupby("age_cat")["hobbyist"].count() n

n_at_least_30 = n["At least 30"] n_under_30 = n["Under 30"] print(n_at_least_30, n_under_30)

# calculating p_hat p_hat = ( (n_at_least_30 * p_hat_at_least_30 + n_under_30 * p_hat_under_30)/ (n_at_least_30 + n_under_30) ) # calculating standard error std_error = np.sqrt(p_hat * (1 - p_hat) / n_at_least_30 + p_hat * (1 - p_hat) / n_under_30) # calculating z-score z_score = (p_hat_at_least_30 - p_hat_under_30) / std_error z_score

Calculating p-value

# p-value for two sided test 2 * norm.cdf(z_score)

Calculating with statsmodels

stack_overflow.groupby("age_cat")["hobbyist"].value_counts()

n_hobbyists = np.array([812, 1021]) n_hobbyists

n_rows = np.array([812 + 238, 1021 + 190]) n_rows

from statsmodels.stats.proportion import proportions_ztest z_score, p_value = proportions_ztest(count=n_hobbyists, nobs=n_rows, alternative="two-sided") print(z_score, p_value)

Example 2 (right tailed)

Context

p_hats = ( late_shipments. groupby("freight_cost_groups")["late"].value_counts(normalize=True) ).filter(like='Yes', axis=0) p_hats

# sample size ns = late_shipments.groupby("freight_cost_groups")["late"].count() ns

p_hat = ( (p_hats["reasonable"] * ns["reasonable"] + p_hats["expensive"] * ns["expensive"]) / (ns["reasonable"] + ns["expensive"]) ) p_hat

# Calculate the standard error std_error = np.sqrt( ( ( p_hat * (1 - p_hat) ) / ns["expensive"] )+ ( ( p_hat * (1 - p_hat) ) / ns["reasonable"] ) ) std_error

Calculating Z score

z_score = (p_hats["expensive"] - p_hats["reasonable"])/std_error z_score

Calculating p value

# Right tailed test p_value = 1 - norm.cdf(z_score) p_value

Calculating with statmodels

# Count the late column values for each freight_cost_group late_by_freight_cost_group = ( late_shipments.groupby("freight_cost_groups")["late"].value_counts() ) success_counts

success_counts = np.array( [ late_by_freight_cost_group[("expensive","Yes")], late_by_freight_cost_group[("reasonable","Yes")] ] ) success_counts

n = np.array([ late_by_freight_cost_group["expensive"].sum(), late_by_freight_cost_group["reasonable"].sum() ]) n

stat, p_value = proportions_ztest( count = success_counts, nobs = n, alternative = "larger" ) print(stat, p_value)

Chi Squared Tests of independance

Assumptions for sample size

Example 1

props = ( stack_overflow .groupby("hobbyist")["age_cat"] .value_counts(normalize = True) ) wide_props = props.unstack() wide_props.plot(kind = "bar", stacked = True)

expected, observed, stats = pingouin.chi2_independence( data = stack_overflow, x = "hobbyist", y = "age_cat", correction= False ) stats

Example 2

stack_overflow["age_cat"].value_counts()

stack_overflow["job_sat"].value_counts()

props = ( stack_overflow .groupby("job_sat")["age_cat"] .value_counts(normalize = True) ) wide_props = props.unstack() wide_props.plot(kind = "bar", stacked = True)

expected, observed, stats = pingouin.chi2_independence( data = stack_overflow, x = "job_sat", y = "age_cat", correction= False ) stats

Example 3

Checking sample size assumption

# Remove that DDU for convenience (it's only one) late_shipments = late_shipments[late_shipments["vendor_inco_term"] != "DDU"] # Count the values of freight_cost_group grouped by vendor_inco_term counts = ( late_shipments .groupby("vendor_inco_term")["freight_cost_groups"] .value_counts() ) # Print the result print(counts) # Inspect whether the counts are big enough print((counts >= 5).all())

Exploring variable independance

props = ( late_shipments .groupby("vendor_inco_term")["freight_cost_groups"] .value_counts(normalize = True) ) wide_props = props.unstack() wide_props.plot(kind = "bar", stacked = True)

Performing chi squared test

expected, observed, stats = pingouin.chi2_independence( data = late_shipments, x = "vendor_inco_term", y = "freight_cost_groups" ) stats

Chi squared goodness of fit tests

Example 1

purple_link_counts = ( stack_overflow["purple_link"] .value_counts() .rename_axis("purple_link") .reset_index(name="n") .sort_values("purple_link") ) purple_link_counts

hypothesized = pd.DataFrame({ "purple_link" : ["Amused", "Annoyed", "Hello, old friend", "Indifferent"], "prop" : [1/6, 1/6, 1/2, 1/6] }) hypothesized

n_total = len(stack_overflow) hypothesized["n"] = hypothesized["prop"] * n_total hypothesized

plt.bar( purple_link_counts["purple_link"], purple_link_counts["n"], color = "red", label = "Observed" ) plt.bar( hypothesized["purple_link"], hypothesized["n"], alpha = 0.5, color = "blue", label = "Hypothesized" ) plt.legend() plt.show()

from scipy.stats import chisquare

chisquare( f_obs=purple_link_counts["n"], f_exp=hypothesized["n"] )

Example 2

# Remove that DDU for convenience (it's only one) late_shipments = late_shipments[late_shipments["vendor_inco_term"] != "DDU"] incoterm_counts = ( late_shipments["vendor_inco_term"] .value_counts() .rename_axis("vendor_inco_term") .reset_index(name="n") .sort_values("vendor_inco_term") ) incoterm_counts

# Hypothesized ditribution hypothesized = pd.DataFrame({ "vendor_inco_term" : ["CIP", "DDP", "EXW", "FCA"], "prop" : [0.05, 0.1, 0.75, 0.1] }) hypothesized

# Counts for hypothesized hypothesized["n"] = hypothesized["prop"] * n_total

# Find the number of rows in late_shipments n_total = len(late_shipments) # Create n column that is prop column * n_total hypothesized["n"] = hypothesized["prop"] * n_total # Plot a red bar graph of n vs. vendor_inco_term for incoterm_counts plt.bar( incoterm_counts["vendor_inco_term"], incoterm_counts["n"], color = "red", label="Observed") plt.bar( hypothesized['vendor_inco_term'], hypothesized['n'], alpha = 0.5, color="blue", label="Observed") plt.legend() plt.show()

gof_test = chisquare( f_obs=incoterm_counts["n"], f_exp=hypothesized["n"] ) gof_test

NON Parametric tests

Wilcoxon-signed rank test (Paired data)

repub_votes = pd.read_feather("repub_votes_potus_08_12.feather")

repub_votes_small = repub_votes.sample(5) repub_votes_small

alpha = 0.01 pingouin.ttest( x = repub_votes_small["repub_percent_08"], y = repub_votes_small["repub_percent_12"], paired = True, alternative = "less" )

Wicoxon test process

from scipy.stats import rankdata # Creating a list to see it's ranks x = [1,15,3,10,6]

rankdata(x)

repub_votes_small["diff"] = repub_votes_small["repub_percent_08"] - repub_votes_small["repub_percent_12"] repub_votes_small[["repub_percent_08", "repub_percent_12", "diff"]]

repub_votes_small["abs_diff"] = repub_votes_small["diff"].abs() repub_votes_small[["repub_percent_08", "repub_percent_12", "diff", "abs_diff"]]

repub_votes_small["rank_abs_diff"] = rankdata(repub_votes_small["abs_diff"]) repub_votes_small[["repub_percent_08", "repub_percent_12", "diff", "abs_diff","rank_abs_diff"]]

print(repub_votes_small[repub_votes_small["diff"] < 0]["rank_abs_diff"]) T_minus = sum (repub_votes_small[repub_votes_small["diff"] < 0]["rank_abs_diff"]) print(T_minus)

print(repub_votes_small[repub_votes_small["diff"] > 0]["rank_abs_diff"]) T_plus = sum (repub_votes_small[repub_votes_small["diff"] > 0]["rank_abs_diff"]) print(T_plus)

W = np.min([T_minus, T_plus]) W

pingouin.wilcoxon

alpha = 0.01 pingouin.wilcoxon( x = repub_votes_small["repub_percent_08"], y = repub_votes_small["repub_percent_12"], alternative="less" )

Wilcoxon-Mann-Whitney test (Independent data)

sns.boxplot( x = stack_overflow["converted_comp"], y = stack_overflow["age_first_code_cut"], data = stack_overflow, showfliers = False # removing the outliers just for the chart to look better ) plt.show()

age_vs_comp = stack_overflow[["converted_comp", "age_first_code_cut"]]

age_vs_comp_wide = age_vs_comp.pivot(columns="age_first_code_cut", values="converted_comp") age_vs_comp_wide.head()

alpha = 0.01 pingouin.mwu( x = age_vs_comp_wide["child"], y = age_vs_comp_wide["adult"], alternative = "greater" )

Kruskal-Wallis test

sns.boxplot( x = stack_overflow["converted_comp"], y = stack_overflow["job_sat"], data = stack_overflow, showfliers = False # removing the outliers just for the chart to look better ) plt.show()

alpha = 0.01 pingouin.kruskal( data = stack_overflow, dv = "converted_comp", between = "job_sat" )

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Sampling

Types of sampling

Simple random sampling

Systematic sampling

Stratified sampling

Cluster sampling

Bonus: Calculate mean per categories

Relative error

Calculate relative error

Relative error vs sample size

Sampling distribution

Approximating sampling distributions

Bootstrapping

Concept

Actually Bootstrapping

Bootstrap statistics to estimate population parameters

Mean

Standard Deviation

Bootstrapping vs sampling distribution to estimate the mean

Confidence intervals

Quantile method for confidence intervals

standard error method for confidence intervals

Hypotesis testing

Concept

Assumptions

Sanity check

Hypothesis test (mean)

Steps to perform hypothesis testing

1. Define the null hypothesis and alternative hypothesis

2. Calculate the Z-score

3. Calculate the P-value

Example1. TWO TAILED TEST

context

Calculating Z-score

Calculating P-value

Example2. RIGHT TAILED TEST

context

Calculating Z score

Calculating P-value

Example 3: RIGHT TAILED TEST

context

Calculating Z score

Calculating P-value

Confirming the results with a confidence interval

Hypothesis test (difference of means)

Assumptions for sample size

Example 1. Difference of means (right tailed)

Context

Calculating T statistic

Calculating P value

Solving with pengouin

Example 2. Difference of means (left tailed)

Context

Calculating T statistic

Calculating P-value

Hypothesis testing (paired data)

Assumptions for sample size

Example 1. Difference of means (left tailed)

Context

Calculating T statistic

Calculating P value

solving with pingouin

pingouin variation of t-test for paired data

Example 2. Difference of means (two-tailed)

Context

Solving with pingouin

Hypothesis test (ANOVA)

Assumptions for sample size

When to use ANOVA

pingouin.anova() method

pairwise tests

false positive danger

p -value adjustment (bonferroni)

Example 2 ANOVA (three categories)

Checking sample size assumptions

Exploratory Data Analysis per Group

Conducting ANOVA

Pairwise tests

P-value adjustment (bonferroni)

Hypothesis test (tests for proportions)

Sampling