import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
data = pd.read_csv('data.csv')
data.head()
# count the participants in each experimental group
target_agg = data['new feature'].value_counts().rename(index={False: 'WITHOUT\nNew Feature', True: 'WITH\nNew Feature'})
target_bar = plt.bar(x=target_agg.index, height=target_agg.values)
plt.bar_label(target_bar, padding=1)
plt.yticks(ticks=np.arange(0,141,20))
plt.title('The Number of Participants')
plt.show()
with_hours = data[data['new feature'] == True]['hours']
without_hours = data[data['new feature'] == False]['hours']
print("The Average Hours Spent on Gaming (With New Feature): ", np.mean(with_hours))
print("The Average Hours Spent on Gaming (Without New Feature): ", np.mean(without_hours))
print("The Difference in the Average Hours Spent on Gaming: ", np.mean(with_hours) - np.mean(without_hours))
# function for calculating the t-test for two independent samples
def independent_ttest(data1, data2, alpha=0.05, alternative = 'two-sided'):
"""
Calculate the t-test on TWO INDEPENDENT samples of scores, data1 and data2.
This is a test for the null hypothesis that 2 independent samples
have identical average (expected) values. This test assumes that the
populations do not have identical variances.
Parameters
----------
data1, data2 : array_like
The arrays must have the same shape.
alpha : int or float, default: 0.05
The significance level
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis.
The following options are available (default is 'two-sided'):
* 'two-sided': the means of the distributions underlying the samples
are unequal.
* 'less': the mean of the distribution underlying the first sample
is less than the mean of the distribution underlying the second
sample.
* 'greater': the mean of the distribution underlying the first
sample is greater than the mean of the distribution underlying
the second sample.
Returns
-------
t_stat : The t-statistic.
p : The p-value associated with the given alternative.
df : The number of degrees of freedom used in calculation of the
t-statistic; this is one less than the size of the sample
cv: critical value
"""
    # sample means and (sample) standard deviations
    mean1, mean2 = np.average(data1), np.average(data2)
    std1, std2 = np.std(data1, ddof=1), np.std(data2, ddof=1)
    # sample sizes
    n1 = len(data1)
    n2 = len(data2)
    # squared standard errors of each sample mean
    sqrd_se1 = std1**2/n1
    sqrd_se2 = std2**2/n2
    # standard error of the difference between the sample means
    sed = np.sqrt(sqrd_se1 + sqrd_se2)
    # calculate the t statistic
    t_stat = (mean1 - mean2) / sed
    # Welch-Satterthwaite degrees of freedom
    df_n = (sqrd_se1 + sqrd_se2)**2
    df_d = (sqrd_se1**2)/(n1-1) + (sqrd_se2**2)/(n2-1)
    df = df_n/df_d
    # lower and upper one-tailed critical values at the given alpha
    cv = (stats.t.ppf(alpha, df), stats.t.ppf(1.0 - alpha, df))
    # calculate the p-value for the requested alternative
    if alternative == 'two-sided':
        p = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df))
    elif alternative == 'greater':
        p = 1.0 - stats.t.cdf(t_stat, df)
    elif alternative == 'less':
        p = stats.t.cdf(t_stat, df)
    else:
        raise ValueError("alternative must be 'two-sided', 'less' or 'greater'")
    # return everything
    return t_stat, df, cv, p
t_stat, df, cv, p_value = independent_ttest(with_hours, without_hours, alpha=0.01, alternative='greater')
alpha = 0.01
print("The result of the two sample t-test")
print("t statistic: %.8f" % t_stat)
print("p value: %.8f" % p_value)
if p_value < alpha:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")
ttest, p_value = stats.ttest_ind(with_hours, without_hours, equal_var=False, alternative='greater')
print("The result of the two sample t-test for each feature")
print("t statistic:%.8f" % ttest)
print("p value:%.8f" % p_value)
if p_value < 0.01:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")
hours_after = data[data['new feature'] == True]['hours']
hours_before = data[data['new feature'] == True]['monthly hours (before experiment)']
diff = hours_after-hours_before
print("The average difference in monthly hours of gaming (before the experiemt vs during the experiment):", np.mean(diff))
def paired_ttest(data1, data2, alpha=0.05, alternative='two-sided'):
"""
Calculate the t-test on TWO RELATED samples of scores, data1 and data2.
This is a test for the null hypothesis that two related or
repeated samples have identical average (expected) values.
Parameters
----------
data1, data2 : array_like
The arrays must have the same shape.
alpha : int or float, default: 0.05
The significance level
alternative : {'two-sided', 'less', 'greater'}, optional
Defines the alternative hypothesis.
The following options are available (default is 'two-sided'):
* 'two-sided': the means of the distributions underlying the samples
are unequal.
* 'less': the mean of the distribution underlying the first sample
is less than the mean of the distribution underlying the second
sample.
* 'greater': the mean of the distribution underlying the first
sample is greater than the mean of the distribution underlying
the second sample.
Returns
-------
t_stat : The t-statistic.
p : The p-value associated with the given alternative.
df : The number of degrees of freedom used in calculation of the
t-statistic; this is one less than the size of the sample
cv: critical value
"""
    if len(data1) != len(data2):
        raise ValueError("data1 and data2 must have the same length")
    data1 = np.array(data1)
    data2 = np.array(data2)
    # number of pairs
    n = len(data1)
    # degrees of freedom
    df = n - 1
    # paired differences and their mean
    diff = data1 - data2
    mean_diff = np.average(diff)
    # standard deviation of the differences
    s_diff = np.sqrt((1/(n-1)) * np.sum((diff - mean_diff)**2))
    # t-test statistic
    t_stat = mean_diff / (s_diff/np.sqrt(n))
    # calculate the p-value for the requested alternative
    if alternative == 'two-sided':
        p = 2.0 * (1.0 - stats.t.cdf(abs(t_stat), df))
    elif alternative == 'greater':
        p = 1.0 - stats.t.cdf(t_stat, df)
    elif alternative == 'less':
        p = stats.t.cdf(t_stat, df)
    else:
        raise ValueError("alternative must be 'two-sided', 'less' or 'greater'")
    # lower and upper one-tailed critical values at the given alpha
    cv = (stats.t.ppf(alpha, df), stats.t.ppf(1.0 - alpha, df))
    # return everything
    return t_stat, df, cv, p
t_stat, df, cv, p_value = paired_ttest(hours_after, hours_before, alpha=0.01, alternative='greater')
alpha = 0.01
print("The result of the paired t-test")
print("t statistic: %.8f" % t_stat)
print("p value: %.8f" % p_value)
if p_value < alpha:
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")
ttest, p_value = stats.ttest_rel(hours_after, hours_before, alternative='greater')
print("The result of the paired t-test")
print("The test statistic: ", ttest)
print("The p value:%.8f" % p_value)
if p_value < alpha:
    print("=> Reject the null hypothesis")
else:
    print("=> Fail to reject the null hypothesis")
from statsmodels.stats.power import TTestIndPower
# size of samples (e.g. in a previous study)
n1, n2 = len(with_hours), len(without_hours)
# sample variances of the two groups (e.g. from a previous study)
v1, v2 = np.var(with_hours, ddof=1), np.var(without_hours, ddof=1)
# pooled standard deviation
pooled_s = np.sqrt(((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2))
# calculate the effect size (Cohen's d) for a minimum detectable
# difference of 2.5 hours in mean monthly gaming time
d = 2.5 / pooled_s
print(f'Effect size: {d}')
# factors for power analysis
alpha = 0.01
power = 0.8
# power analysis to find sample size for given effect
ttest_ind = TTestIndPower()
n = ttest_ind.solve_power(effect_size=d, alpha=alpha, power=power,
                          ratio=1, alternative='larger')
print('Sample size needed in each group: {:.0f}'.format(np.round(n)))
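# For context, the power achieved with the current group sizes can be checked
# with the same TTestIndPower object (a sketch; it reuses d, alpha, n1 and n2
# computed above).
achieved_power = ttest_ind.power(effect_size=d, nobs1=n1, alpha=alpha,
                                 ratio=n2/n1, alternative='larger')
print('Power with the current sample sizes: {:.3f}'.format(achieved_power))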
from statsmodels.stats.power import TTestPower
# size of samples (e.g. in a previous study)
n1, n2 = len(hours_after), len(hours_before)
# sample variances of the before/during measurements (e.g. from a previous study)
v1, v2 = np.var(hours_after, ddof=1), np.var(hours_before, ddof=1)
# pooled standard deviation of the two measurements
# (note: for a paired design the effect size is often defined relative to the
# standard deviation of the differences; the pooled SD is used here instead)
pooled_s = np.sqrt(((n1 - 1) * v1 + (n2 - 1) * v2) / (n1 + n2 - 2))
# calculate the effect size (Cohen's d) for a minimum detectable
# difference of 2.5 hours
d = 2.5 / pooled_s
print(f'Effect size: {d}')
# factors for power analysis
alpha = 0.01
power = 0.8
# power analysis to find sample size for given effect
ttest_paired = TTestPower()
n = ttest_paired.solve_power(effect_size=d, alpha=alpha, power=power, alternative='larger')
print('Sample size needed : {:.0f}'.format(np.round(n)))
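# Similarly, the power achieved by the paired test with the current number of
# participants can be checked (a sketch; it reuses d, alpha and the n1 pairs
# computed above).
achieved_power = ttest_paired.power(effect_size=d, nobs=n1, alpha=alpha,
                                    alternative='larger')
print('Power with the current number of pairs: {:.3f}'.format(achieved_power))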