import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import io
# Hypothesis Testing
from scipy.stats import shapiro
import scipy.stats as stats
# Configuration
# -----------------------------------
import warnings
# Silence all warnings (notably pandas/seaborn FutureWarnings) so the
# printed analysis output stays readable.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
# Show every DataFrame column and render floats with 4 decimal places
# in all printed tables below.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format
# Path if running through Colab
# path = '/content/drive/MyDrive/cookie_cats.csv'
# Path if through Deepnote
path = 'cookie_cats.csv'
# Load the A/B-test data; later code reads columns: userid, version
# ('gate_30'/'gate_40'), sum_gamerounds, retention_1, retention_7.
df = pd.read_csv(path)
# FIX: a bare `df.head()` expression is a no-op outside a notebook cell;
# print it so the preview is visible when this runs as a plain script.
print(df.head())
# --- Data sanity checks: completeness, duplicates, group balance --------
user_ids = df['userid']
# Check for missing user IDs
print("Missing user IDs:", user_ids.isnull().sum())
# Check for duplicate user IDs
print("Duplicate user IDs:", user_ids.duplicated().sum())
versions = df['version']
# Check the number of users in each version
print("\nNumber of users in each version:")
print(versions.value_counts())
# Check the proportion of users in each version
print("\nProportion of users in each version:")
print(versions.value_counts(normalize=True))
# Per-group summary of the primary engagement metric.
print("Descriptive statistics for sum_gamerounds by version:")
print(df.groupby('version')['sum_gamerounds'].describe())
# Visualize the per-group distribution of game rounds (reveals outliers).
plt.figure(figsize=(10, 6))
sns.boxplot(x='version', y='sum_gamerounds', data=df)
plt.title('Distribution of Sum of Gamerounds by Version')
plt.ylabel('Sum of Gamerounds')
plt.xlabel('Version')
plt.show()
# FIX: the bare expression below was a no-op outside a notebook; print the
# ten heaviest players so the extreme outlier is actually visible.
print(df['sum_gamerounds'].sort_values(ascending=False).head(10))
# Remove outlier
# FIX: name the magic number. 49854 is the round count of the single
# extreme player surfaced by the top-10 listing above; the strict `<`
# drops exactly that one row.
OUTLIER_THRESHOLD = 49854
df = df[df['sum_gamerounds'] < OUTLIER_THRESHOLD]
# View updated distribution
plt.figure(figsize=(10, 6))
sns.boxplot(x='version', y='sum_gamerounds', data=df)
plt.title('Distribution of Sum of Gamerounds by Version (Outlier Removed)')
plt.ylabel('Sum of Gamerounds')
plt.xlabel('Version')
plt.show()
# Check for homogeneity of variances using Levene's test
gate_30_rounds = df.loc[df['version'] == 'gate_30', 'sum_gamerounds']
gate_40_rounds = df.loc[df['version'] == 'gate_40', 'sum_gamerounds']
stat, p = stats.levene(gate_30_rounds, gate_40_rounds)
print(f"Levene's test for homogeneity of variances:")
print(f"Statistic: {stat:.4f}, p-value: {p:.4f}")
# Significant result => unequal variances across groups.
if p >= 0.05:
    print("Assumption of homogeneity of variances is met.")
else:
    print("Assumption of homogeneity of variances is violated.")
# Check for normality using Shapiro-Wilk test
# Due to large sample size, Shapiro-Wilk may reject normality even for small deviations.
# Visual inspection (histograms, QQ plots) might be more informative.
shapiro_results = {
    version: shapiro(df[df['version'] == version]['sum_gamerounds'])
    for version in ('gate_30', 'gate_40')
}
shapiro_gate_30_stat, shapiro_gate_30_p = shapiro_results['gate_30']
shapiro_gate_40_stat, shapiro_gate_40_p = shapiro_results['gate_40']
print(f"\nShapiro-Wilk test for normality (gate_30):")
print(f"Statistic: {shapiro_gate_30_stat:.4f}, p-value: {shapiro_gate_30_p:.4f}")
print(f"\nShapiro-Wilk test for normality (gate_40):")
print(f"Statistic: {shapiro_gate_40_stat:.4f}, p-value: {shapiro_gate_40_p:.4f}")
# Either group failing the test is enough to drop the normality assumption.
if shapiro_gate_30_p < 0.05 or shapiro_gate_40_p < 0.05:
    print("\nAssumption of normality is likely violated in at least one group.")
else:
    print("\nAssumption of normality is met in both groups.")
# Visual check for normality (histograms), one panel per version.
plt.figure(figsize=(12, 6))
for panel, version in enumerate(('gate_30', 'gate_40'), start=1):
    plt.subplot(1, 2, panel)
    sns.histplot(df[df['version'] == version]['sum_gamerounds'], kde=True, bins=50)
    plt.title(f'Distribution of Sum of Gamerounds ({version})')
    plt.xlabel('Sum of Gamerounds')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Check the expected-frequency assumption for the Chi-Squared tests:
# every expected cell count should be >= 5 for the test to be valid.
def _report_expected_frequencies(retention_col):
    """Print expected cell counts for version x `retention_col` and state
    whether the >= 5 rule of thumb holds."""
    table = pd.crosstab(df['version'], df[retention_col])
    _, _, _, expected = stats.chi2_contingency(table)
    # FIX: use the crosstab's own labels instead of hard-coded
    # ['gate_30', 'gate_40'] / [False, True], which silently assumed
    # the row/column ordering.
    print(pd.DataFrame(expected, index=table.index, columns=table.columns))
    # FIX: the original only printed the table; actually verify the rule.
    if (expected >= 5).all():
        print("All expected frequencies are >= 5: assumption met.")
    else:
        print("Some expected frequencies are < 5: assumption violated.")

print("\nExpected frequencies for Chi-Squared test (retention_1):")
_report_expected_frequencies('retention_1')
print("\nExpected frequencies for Chi-Squared test (retention_7):")
_report_expected_frequencies('retention_7')
# Perform Mann-Whitney U test (non-parametric; normality was violated).
# Using two-sided test as we don't have a specific direction hypothesis
rounds_gate_30 = df.loc[df['version'] == 'gate_30', 'sum_gamerounds']
rounds_gate_40 = df.loc[df['version'] == 'gate_40', 'sum_gamerounds']
stat, p = stats.mannwhitneyu(rounds_gate_30, rounds_gate_40,
                             alternative='two-sided')
print(f"Mann-Whitney U Test results:")
print(f"Statistic: {stat:.4f}, p-value: {p:.4f}")
# Interpret the results
alpha = 0.05
if p >= alpha:
    print("\nFail to reject the null hypothesis: There is no statistically significant difference in the sum of gamerounds between the two versions.")
else:
    print("\nReject the null hypothesis: There is a statistically significant difference in the sum of gamerounds between the two versions.")
# Perform Mann-Whitney U test (one-tailed).
# Testing if gate_30 is greater than gate_40
sample_gate_30 = df.loc[df['version'] == 'gate_30', 'sum_gamerounds']
sample_gate_40 = df.loc[df['version'] == 'gate_40', 'sum_gamerounds']
stat_one_tailed, p_one_tailed = stats.mannwhitneyu(sample_gate_30,
                                                   sample_gate_40,
                                                   alternative='greater')
print(f"Mann-Whitney U Test results (one-tailed):")
print(f"Statistic: {stat_one_tailed:.4f}, p-value: {p_one_tailed:.4f}")
# Interpret the results
alpha = 0.05
is_significant = p_one_tailed < alpha
if is_significant:
    print("\nReject the null hypothesis: There is statistically significant evidence that the sum of gamerounds is lower in the gate_40 version compared to gate_30.")
else:
    print("\nFail to reject the null hypothesis: There is no statistically significant evidence that the sum of gamerounds is lower in the gate_40 version compared to gate_30.")
# Perform Chi-Squared test of independence: version vs. 1-day retention.
ret1_table = pd.crosstab(df['version'], df['retention_1'])
chi2_stat_ret1, p_value_ret1, dof_ret1, expected_ret1 = \
    stats.chi2_contingency(ret1_table)
print(f"Chi-Squared Test results for retention_1:")
print(f"Chi-Squared Statistic: {chi2_stat_ret1:.4f}")
print(f"P-value: {p_value_ret1:.4f}")
print(f"Degrees of Freedom: {dof_ret1}")
# Interpret the results for retention_1
alpha = 0.05
if p_value_ret1 >= alpha:
    print("\nFail to reject the null hypothesis: There is no statistically significant difference in 1-day retention between the two versions.")
else:
    print("\nReject the null hypothesis: There is a statistically significant difference in 1-day retention between the two versions.")
# Perform Chi-Squared test of independence: version vs. 7-day retention.
ret7_table = pd.crosstab(df['version'], df['retention_7'])
chi2_stat_ret7, p_value_ret7, dof_ret7, expected_ret7 = \
    stats.chi2_contingency(ret7_table)
print(f"Chi-Squared Test results for retention_7:")
print(f"Chi-Squared Statistic: {chi2_stat_ret7:.4f}")
print(f"P-value: {p_value_ret7:.4f}")
print(f"Degrees of Freedom: {dof_ret7}")
# Interpret the results for retention_7
alpha = 0.05
if p_value_ret7 >= alpha:
    print("\nFail to reject the null hypothesis: There is no statistically significant difference in 7-day retention between the two versions.")
else:
    print("\nReject the null hypothesis: There is a statistically significant difference in 7-day retention between the two versions.")
# Calculate confidence interval for the difference in means for sum_gamerounds
# Using bootstrap method since normality assumption was violated
# FIX: the original bootstrap was unseeded, so the reported interval changed
# on every run; seed a Generator and thread it through pandas' sampler so
# the CI is reproducible.
rng = np.random.default_rng(42)
n_bootstraps = 1000
bootstrapped_diffs = []
gate_30_rounds = df[df['version'] == 'gate_30']['sum_gamerounds']
gate_40_rounds = df[df['version'] == 'gate_40']['sum_gamerounds']
for _ in range(n_bootstraps):
    # Resample each group with replacement at its original size (frac=1)
    # and record the difference of bootstrap means.
    bootstrap_gate_30 = gate_30_rounds.sample(frac=1, replace=True, random_state=rng)
    bootstrap_gate_40 = gate_40_rounds.sample(frac=1, replace=True, random_state=rng)
    bootstrapped_diffs.append(bootstrap_gate_30.mean() - bootstrap_gate_40.mean())
# Calculate the 95% confidence interval (percentile method)
confidence_interval_sum_gamerounds = np.percentile(bootstrapped_diffs, [2.5, 97.5])
print(f"95% Confidence Interval for the difference in mean sum_gamerounds (gate_30 - gate_40):")
print(f"{confidence_interval_sum_gamerounds}")
# Calculate confidence interval for the difference in proportions for retention_1
# Use statsmodels for confidence interval for difference in proportions
from statsmodels.stats.proportion import confint_proportions_2indep

is_gate_30 = df['version'] == 'gate_30'
is_gate_40 = df['version'] == 'gate_40'
# Successes (retained users) and trials (group sizes) per version.
count_ret1_gate_30 = df.loc[is_gate_30, 'retention_1'].sum()
nobs_ret1_gate_30 = int(is_gate_30.sum())
count_ret1_gate_40 = df.loc[is_gate_40, 'retention_1'].sum()
nobs_ret1_gate_40 = int(is_gate_40.sum())
# Agresti-Coull is a good general method
ci_ret1_diff = confint_proportions_2indep(
    count_ret1_gate_30, nobs_ret1_gate_30,
    count_ret1_gate_40, nobs_ret1_gate_40,
    method='agresti-coull')
print(f"95% Confidence Interval for the difference in retention_1 proportions (gate_30 - gate_40):")
print(f"{ci_ret1_diff}")
# Calculate confidence interval for the difference in proportions for retention_7
# Use statsmodels for confidence interval for difference in proportions
from statsmodels.stats.proportion import confint_proportions_2indep

mask_30 = df['version'] == 'gate_30'
mask_40 = df['version'] == 'gate_40'
# Successes (retained users) and trials (group sizes) per version.
count_ret7_gate_30 = df.loc[mask_30, 'retention_7'].sum()
nobs_ret7_gate_30 = int(mask_30.sum())
count_ret7_gate_40 = df.loc[mask_40, 'retention_7'].sum()
nobs_ret7_gate_40 = int(mask_40.sum())
# Agresti-Coull is a good general method
ci_ret7_diff = confint_proportions_2indep(
    count_ret7_gate_30, nobs_ret7_gate_30,
    count_ret7_gate_40, nobs_ret7_gate_40,
    method='agresti-coull')
print(f"95% Confidence Interval for the difference in retention_7 proportions (gate_30 - gate_40):")
print(f"{ci_ret7_diff}")