import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import io
# Hypothesis Testing
from scipy.stats import shapiro
import scipy.stats as stats
# Configuration
# -----------------------------------
import warnings
# Silence all warnings (notably pandas/seaborn FutureWarnings) so the
# printed analysis output stays readable.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
# Show every DataFrame column and render floats with 4 decimal places
# in all printed tables below.
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.4f}'.format
# Path if running through Colab
# path = '/content/drive/MyDrive/cookie_cats.csv'
# Path if through Deepnote
path = 'cookie_cats.csv'
# Load the A/B-test data; later code reads columns: userid, version
# ('gate_30'/'gate_40'), sum_gamerounds, retention_1, retention_7.
df = pd.read_csv(path)
# FIX: a bare `df.head()` expression is a no-op outside a notebook cell;
# print it so the preview is visible when this runs as a plain script.
print(df.head())
# --- Data sanity checks: completeness, duplicates, group balance --------
user_ids = df['userid']
# Check for missing user IDs
print("Missing user IDs:", user_ids.isnull().sum())
# Check for duplicate user IDs
print("Duplicate user IDs:", user_ids.duplicated().sum())
versions = df['version']
# Check the number of users in each version
print("\nNumber of users in each version:")
print(versions.value_counts())
# Check the proportion of users in each version
print("\nProportion of users in each version:")
print(versions.value_counts(normalize=True))
# Per-group summary of the primary engagement metric.
print("Descriptive statistics for sum_gamerounds by version:")
print(df.groupby('version')['sum_gamerounds'].describe())
# Visualize the per-group distribution of game rounds (reveals outliers).
plt.figure(figsize=(10, 6))
sns.boxplot(x='version', y='sum_gamerounds', data=df)
plt.title('Distribution of Sum of Gamerounds by Version')
plt.ylabel('Sum of Gamerounds')
plt.xlabel('Version')
plt.show()
# FIX: the bare expression below was a no-op outside a notebook; print the
# ten heaviest players so the extreme outlier is actually visible.
print(df['sum_gamerounds'].sort_values(ascending=False).head(10))
# Remove outlier
# FIX: name the magic number. 49854 is the round count of the single
# extreme player surfaced by the top-10 listing above; the strict `<`
# drops exactly that one row.
OUTLIER_THRESHOLD = 49854
df = df[df['sum_gamerounds'] < OUTLIER_THRESHOLD]
# View updated distribution
plt.figure(figsize=(10, 6))
sns.boxplot(x='version', y='sum_gamerounds', data=df)
plt.title('Distribution of Sum of Gamerounds by Version (Outlier Removed)')
plt.ylabel('Sum of Gamerounds')
plt.xlabel('Version')
plt.show()
# Check for homogeneity of variances using Levene's test
gate_30_rounds = df.loc[df['version'] == 'gate_30', 'sum_gamerounds']
gate_40_rounds = df.loc[df['version'] == 'gate_40', 'sum_gamerounds']
stat, p = stats.levene(gate_30_rounds, gate_40_rounds)
print(f"Levene's test for homogeneity of variances:")
print(f"Statistic: {stat:.4f}, p-value: {p:.4f}")
# Significant result => unequal variances across groups.
if p >= 0.05:
    print("Assumption of homogeneity of variances is met.")
else:
    print("Assumption of homogeneity of variances is violated.")
# Check for normality using Shapiro-Wilk test
# Due to large sample size, Shapiro-Wilk may reject normality even for small deviations.
# Visual inspection (histograms, QQ plots) might be more informative.
shapiro_results = {
    version: shapiro(df[df['version'] == version]['sum_gamerounds'])
    for version in ('gate_30', 'gate_40')
}
shapiro_gate_30_stat, shapiro_gate_30_p = shapiro_results['gate_30']
shapiro_gate_40_stat, shapiro_gate_40_p = shapiro_results['gate_40']
print(f"\nShapiro-Wilk test for normality (gate_30):")
print(f"Statistic: {shapiro_gate_30_stat:.4f}, p-value: {shapiro_gate_30_p:.4f}")
print(f"\nShapiro-Wilk test for normality (gate_40):")
print(f"Statistic: {shapiro_gate_40_stat:.4f}, p-value: {shapiro_gate_40_p:.4f}")
# Either group failing the test is enough to drop the normality assumption.
if shapiro_gate_30_p < 0.05 or shapiro_gate_40_p < 0.05:
    print("\nAssumption of normality is likely violated in at least one group.")
else:
    print("\nAssumption of normality is met in both groups.")
# Visual check for normality (histograms), one panel per version.
plt.figure(figsize=(12, 6))
for panel, version in enumerate(('gate_30', 'gate_40'), start=1):
    plt.subplot(1, 2, panel)
    sns.histplot(df[df['version'] == version]['sum_gamerounds'], kde=True, bins=50)
    plt.title(f'Distribution of Sum of Gamerounds ({version})')
    plt.xlabel('Sum of Gamerounds')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Check the expected-frequency assumption for the Chi-Squared tests:
# every expected cell count should be >= 5 for the test to be valid.
def _report_expected_frequencies(retention_col):
    """Print expected cell counts for version x `retention_col` and state
    whether the >= 5 rule of thumb holds."""
    table = pd.crosstab(df['version'], df[retention_col])
    _, _, _, expected = stats.chi2_contingency(table)
    # FIX: use the crosstab's own labels instead of hard-coded
    # ['gate_30', 'gate_40'] / [False, True], which silently assumed
    # the row/column ordering.
    print(pd.DataFrame(expected, index=table.index, columns=table.columns))
    # FIX: the original only printed the table; actually verify the rule.
    if (expected >= 5).all():
        print("All expected frequencies are >= 5: assumption met.")
    else:
        print("Some expected frequencies are < 5: assumption violated.")

print("\nExpected frequencies for Chi-Squared test (retention_1):")
_report_expected_frequencies('retention_1')
print("\nExpected frequencies for Chi-Squared test (retention_7):")
_report_expected_frequencies('retention_7')
# Perform Mann-Whitney U test (non-parametric; normality was violated).
# Using two-sided test as we don't have a specific direction hypothesis
rounds_gate_30 = df.loc[df['version'] == 'gate_30', 'sum_gamerounds']
rounds_gate_40 = df.loc[df['version'] == 'gate_40', 'sum_gamerounds']
stat, p = stats.mannwhitneyu(rounds_gate_30, rounds_gate_40,
                             alternative='two-sided')
print(f"Mann-Whitney U Test results:")
print(f"Statistic: {stat:.4f}, p-value: {p:.4f}")
# Interpret the results
alpha = 0.05
if p >= alpha:
    print("\nFail to reject the null hypothesis: There is no statistically significant difference in the sum of gamerounds between the two versions.")
else:
    print("\nReject the null hypothesis: There is a statistically significant difference in the sum of gamerounds between the two versions.")
# Perform Mann-Whitney U test (one-tailed).
# Testing if gate_30 is greater than gate_40
sample_gate_30 = df.loc[df['version'] == 'gate_30', 'sum_gamerounds']
sample_gate_40 = df.loc[df['version'] == 'gate_40', 'sum_gamerounds']
stat_one_tailed, p_one_tailed = stats.mannwhitneyu(sample_gate_30,
                                                   sample_gate_40,
                                                   alternative='greater')
print(f"Mann-Whitney U Test results (one-tailed):")
print(f"Statistic: {stat_one_tailed:.4f}, p-value: {p_one_tailed:.4f}")
# Interpret the results
alpha = 0.05
is_significant = p_one_tailed < alpha
if is_significant:
    print("\nReject the null hypothesis: There is statistically significant evidence that the sum of gamerounds is lower in the gate_40 version compared to gate_30.")
else:
    print("\nFail to reject the null hypothesis: There is no statistically significant evidence that the sum of gamerounds is lower in the gate_40 version compared to gate_30.")
# Perform Chi-Squared test of independence: version vs. 1-day retention.
ret1_table = pd.crosstab(df['version'], df['retention_1'])
chi2_stat_ret1, p_value_ret1, dof_ret1, expected_ret1 = \
    stats.chi2_contingency(ret1_table)
print(f"Chi-Squared Test results for retention_1:")
print(f"Chi-Squared Statistic: {chi2_stat_ret1:.4f}")
print(f"P-value: {p_value_ret1:.4f}")
print(f"Degrees of Freedom: {dof_ret1}")
# Interpret the results for retention_1
alpha = 0.05
if p_value_ret1 >= alpha:
    print("\nFail to reject the null hypothesis: There is no statistically significant difference in 1-day retention between the two versions.")
else:
    print("\nReject the null hypothesis: There is a statistically significant difference in 1-day retention between the two versions.")
# Perform Chi-Squared test of independence: version vs. 7-day retention.
ret7_table = pd.crosstab(df['version'], df['retention_7'])
chi2_stat_ret7, p_value_ret7, dof_ret7, expected_ret7 = \
    stats.chi2_contingency(ret7_table)
print(f"Chi-Squared Test results for retention_7:")
print(f"Chi-Squared Statistic: {chi2_stat_ret7:.4f}")
print(f"P-value: {p_value_ret7:.4f}")
print(f"Degrees of Freedom: {dof_ret7}")
# Interpret the results for retention_7
alpha = 0.05
if p_value_ret7 >= alpha:
    print("\nFail to reject the null hypothesis: There is no statistically significant difference in 7-day retention between the two versions.")
else:
    print("\nReject the null hypothesis: There is a statistically significant difference in 7-day retention between the two versions.")
# Calculate confidence interval for the difference in means for sum_gamerounds
# Using bootstrap method since normality assumption was violated
# FIX: the original bootstrap was unseeded, so the reported interval changed
# on every run; seed a Generator and thread it through pandas' sampler so
# the CI is reproducible.
rng = np.random.default_rng(42)
n_bootstraps = 1000
bootstrapped_diffs = []
gate_30_rounds = df[df['version'] == 'gate_30']['sum_gamerounds']
gate_40_rounds = df[df['version'] == 'gate_40']['sum_gamerounds']
for _ in range(n_bootstraps):
    # Resample each group with replacement at its original size (frac=1)
    # and record the difference of bootstrap means.
    bootstrap_gate_30 = gate_30_rounds.sample(frac=1, replace=True, random_state=rng)
    bootstrap_gate_40 = gate_40_rounds.sample(frac=1, replace=True, random_state=rng)
    bootstrapped_diffs.append(bootstrap_gate_30.mean() - bootstrap_gate_40.mean())
# Calculate the 95% confidence interval (percentile method)
confidence_interval_sum_gamerounds = np.percentile(bootstrapped_diffs, [2.5, 97.5])
print(f"95% Confidence Interval for the difference in mean sum_gamerounds (gate_30 - gate_40):")
print(f"{confidence_interval_sum_gamerounds}")
# Calculate confidence interval for the difference in proportions for retention_1
# Use statsmodels for confidence interval for difference in proportions
from statsmodels.stats.proportion import confint_proportions_2indep

is_gate_30 = df['version'] == 'gate_30'
is_gate_40 = df['version'] == 'gate_40'
# Successes (retained users) and trials (group sizes) per version.
count_ret1_gate_30 = df.loc[is_gate_30, 'retention_1'].sum()
nobs_ret1_gate_30 = int(is_gate_30.sum())
count_ret1_gate_40 = df.loc[is_gate_40, 'retention_1'].sum()
nobs_ret1_gate_40 = int(is_gate_40.sum())
# Agresti-Coull is a good general method
ci_ret1_diff = confint_proportions_2indep(
    count_ret1_gate_30, nobs_ret1_gate_30,
    count_ret1_gate_40, nobs_ret1_gate_40,
    method='agresti-coull')
print(f"95% Confidence Interval for the difference in retention_1 proportions (gate_30 - gate_40):")
print(f"{ci_ret1_diff}")
# Calculate confidence interval for the difference in proportions for retention_7
# Use statsmodels for confidence interval for difference in proportions
from statsmodels.stats.proportion import confint_proportions_2indep

mask_30 = df['version'] == 'gate_30'
mask_40 = df['version'] == 'gate_40'
# Successes (retained users) and trials (group sizes) per version.
count_ret7_gate_30 = df.loc[mask_30, 'retention_7'].sum()
nobs_ret7_gate_30 = int(mask_30.sum())
count_ret7_gate_40 = df.loc[mask_40, 'retention_7'].sum()
nobs_ret7_gate_40 = int(mask_40.sum())
# Agresti-Coull is a good general method
ci_ret7_diff = confint_proportions_2indep(
    count_ret7_gate_30, nobs_ret7_gate_30,
    count_ret7_gate_40, nobs_ret7_gate_40,
    method='agresti-coull')
print(f"95% Confidence Interval for the difference in retention_7 proportions (gate_30 - gate_40):")
print(f"{ci_ret7_diff}")