Exec Summary¶
This notebook presents both Null Hypothesis Significance Testing (NHST) and Bayesian methodologies, and goes into detail about the math and the methodological considerations that make a Bayesian approach a better fit for web/mobile A/B tests. If you want to skip the math and just get the benefits, go to the end of the notebook and use the Python functions:
- test_non_inferiority(): verify new features don't degrade the experience
- select_best_variant(): choose the winning variant with a probability
which should be sufficient to support a CX analysis with 3 variants and a control.
Problem Statement¶
When launching new web or mobile features, engineering teams face a common dilemma:
- Limited traffic allocation: At launch, new features get only 2-5% of traffic to minimize risk
- Multiple variants: Design teams often propose 3-5 different implementations
- Small sample sizes: Because of controlled releases, at launch each variant may only see hundreds or low thousands of users
- Need for speed: We need fast decisions on which variants are best to iterate or scale
- Imperfect launch logistics: Because of bugs or misconfiguration, the allocation between variants and control may not be what was planned, i.e., some samples may be too large, too small, or even missing
Traditional NHST fails here: with small, unbalanced samples (e.g., 7,000 control vs. 150 per variant), statistical tests:
- Fail to reach significance (underpowered, β > 0.8, meaning power < 20%)
- Require weeks of data collection
- Are unwieldy when comparing more than 2 variants at a time
The Bayesian Solution¶
Bayesian methods excel precisely where NHST struggles:
1. Works with Small Samples¶
- Incorporates prior knowledge (historical conversion rates)
- Updates beliefs incrementally as data arrives
- Provides meaningful conclusions even with n=150 per variant
- No arbitrary "minimum sample size" requirement to reach statistical significance.
2. Handles Unbalanced Allocation Naturally¶
- 90% control, 10% variants? No problem.
- Each variant can have different sample sizes
- No need for equal allocation or "balanced designs"
- Protects existing user experience while testing
3. Scales to Many Variants Effortlessly¶
- Compare 3, 5, 10, or 100 variants simultaneously
- Single coherent analysis—no multiple comparison penalties
- Direct answer: P(A is best) = 31%, P(B is best) = 47%, P(C is best) = 22%
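Such "probability of being best" numbers fall out of Monte Carlo draws from each variant's posterior. A minimal sketch with made-up counts (the visitor/conversion numbers and the flat Beta(1, 1) priors are illustrative assumptions, not this notebook's data):

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical counts per variant: (visitors, conversions) -- illustrative only
data = {"A": (1200, 260), "B": (1150, 270), "C": (1180, 255)}

# One Beta posterior per variant, using a flat Beta(1, 1) prior
samples = np.vstack([
    rng.beta(1 + x, 1 + n - x, size=100_000) for n, x in data.values()
])

# For each simulated draw, which variant had the highest conversion rate?
wins = np.bincount(samples.argmax(axis=0), minlength=len(data))
p_best = wins / wins.sum()

for name, p in zip(data, p_best):
    print(f"P({name} is best) = {p:.1%}")
```

With conjugate Beta posteriors this takes a few lines, and the same posterior draws can feed expected-loss calculations.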
4. Provides Actionable Probabilities¶
- NHST says: "Cannot reject H₀", which is not directly actionable: even if one knows the cost of a bad decision, there is no real way to compute an expected value for what amounts to a bet on being right or wrong.
- Bayesian says: "47% chance B is best, 22% it's worse than control" (actionable, expected value can be computed, can directly feed into a decision algorithm)
- Direct business decision: Deploy B with quantified risk
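As a sketch of the "directly feed into a decision algorithm" claim: with assumed payoffs (say a bad deploy costs $100k and a good one gains $50k), the expected value of deploying B is one line of arithmetic. All numbers below are illustrative:

```python
# Illustrative posterior probabilities and payoffs (all assumed, not real data)
p_b_best = 0.47           # P(B is best)
p_b_worse = 0.22          # P(B is worse than control)
gain_if_best = 50_000     # $ gained if B really is best
loss_if_worse = -100_000  # $ lost if B degrades the experience

# Remaining probability mass (B neither best nor worse) is treated as a $0 payoff
ev = p_b_best * gain_if_best + p_b_worse * loss_if_worse
print(f"Expected value of deploying B: ${ev:,.0f}")
```

No such computation is possible from a bare "reject/fail to reject" verdict.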
5. Allows Continuous Monitoring¶
- Can check results anytime without "p-hacking" concerns
- Smoothly and incrementally update posterior probability as new data arrives
- Stop early if a clear winner emerges, or rebalance incrementally (see multi-armed bandit strategies)
- Continue if more certainty needed—mathematically rigorous
Key Benefits Summary¶
| Aspect | Traditional NHST | Bayesian Approach |
|---|---|---|
| Small samples | Underpowered, inconclusive | Works well with prior knowledge |
| Unbalanced allocation | Loses efficiency | No problem |
| Multiple variants | Complex corrections needed | Natural single analysis |
| Interpretation | p-value (hard to explain) | Probability (intuitive) |
| Decision making | Binary reject/fail | Quantified risk/confidence |
| Continuous monitoring | Forbidden (p-hacking) | Allowed and rigorous |
| Time to decision | Weeks (need larger n) | Days (works with small n) |
Bottom Line¶
For modern product development with:
- Rapid iteration cycles
- Bugs or misconfiguration in the traffic splitters
- Risk-averse traffic allocation
- Multiple design options
- Small initial samples
Bayesian methods provide:
- Faster decisions (days vs. weeks)
- Better use of limited data
- Clear, business-friendly outputs
- Quantified confidence for risk management
This enables product teams to launch confidently, iterate quickly, and scale successful features—all while protecting the existing user experience.
Implementation¶
The two standalone Python functions in this notebook provide:
- test_non_inferiority(): verify new features don't degrade the experience
- select_best_variant(): choose the winning variant with a probability
Both work with any sample sizes and scale to any number of variants.
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from scipy.stats import norm
from statsmodels.stats.proportion import confint_proportions_2indep
from statsmodels.stats.proportion import proportions_ztest
# import the beta function from scipy.special
from scipy.special import beta as beta_function
from scipy.stats import beta as beta_dist
from plotting_utils import plot_gaussian_hypothesis_test
from plotting_utils import plot_type_ii_error_analysis, plot_beta_prior_comparison, plot_prior_vs_posterior
from plotting_utils import plot_informative_prior_posterior_comparison, plot_weakly_informative_prior_with_variants
from plotting_utils import plot_multiple_posteriors_comparison
from nhst import compute_sample_size_non_inferiority
from bayesian import test_non_inferiority, select_best_variant, test_non_inferiority_weakly_informative
In-Depth Analysis of the Two Methodologies¶
When evaluating new user experiences (UX), such as launching passkeys and measuring their impact on abandonment rates, our CX team has provided us with 3 variants $A_1$, $A_2$ and $A_3$ for the passkey creation experience. After launch, we need to make sure that adding passkey creation does not significantly degrade the success rate of our current CX, by comparing each variant to a control group that will remain on our legacy pages; then we have to decide which of the variants performs best.
This notebook compares two major approaches:
- Null Hypothesis Significance Testing (NHST) — the long-standing statistical framework, widely used but often conceptually tricky and difficult to interpret in practical business decision-making.
- Bayesian methods — increasingly popular because they offer more flexibility, work better on small samples, and produce results that are often easier to interpret directly when deciding on actions (e.g., when to shift more traffic from a control to a new variant).
Test Setup: Control Group vs. Variants¶
For the sake of the discussion let's assume an existing digital identity+credentials creation flow with a completion rate of ~20% (meaning ~80% of users abandon).
Our test design:
- Keep x% of traffic on the current experience as the control group C.
- Send the remaining traffic to one or more variants $A_1$, $A_2$, $A_3$.
The test will be conducted in two steps. First, we need to determine that each new experience is no worse than the current one, accepting small degradation as unavoidable since we are adding more pages and clicks. Then, after establishing that we haven't degraded the experience in an unacceptable fashion, we shift more traffic to the variants and decide which variant performs best.
This type of A/B test, where the first goal is to ensure a new design does not degrade the experience, is called a non-inferiority test (explained below).
Null Hypothesis Significance Testing (NHST)¶
At a high level, the NHST workflow is:
- Assume what you don’t want to see — this is the null hypothesis.
- Example in medicine: “the drug has no effect.”
- Example here: “the new experience significantly increases abandonment.”
- Run the experiment and compute a test statistic, which in this case is a proportion: the number of successes over total attempts.
- Ask: If the null hypothesis were true, how likely is it that we would observe a result at least this extreme?
- If that probability (the p-value) is very low — e.g., below a conventional threshold such as 5% — we reject the null.
Two immediate caveats:
- Rejecting the null does not prove the opposite is true; it only says the data would be unlikely if the null were correct. The p-value is quantified as P(data | H₀), but the resulting decision (reject or fail to reject) comes with no probability of being correct. Without P(H₀ | data), we cannot compute expected values for decision-making: if deploying a bad variant costs $100k and a good one gains $50k, NHST provides no framework to quantify the expected value of the decision.
- “Unlikely enough” is also completely arbitrary — thresholds like 5% are conventions, not laws of nature.
A key point: NHST computes $P(\text{data} \mid \text{hypothesis})$.
Later we’ll see that the Bayesian approach instead computes $P(\text{hypothesis} \mid \text{data})$ — a fundamentally different quantity.
Modeling Conversion as Random Variables¶
The abandonment or conversion of a UX flow can be modeled with Bernoulli random variables:
- $X_C$ for the control experience
- $X_A$ for a new variant $A$
A Bernoulli variable takes only two values: success/failure, convert/abandon, etc.
Each user who sees a page gives one draw from one of these variables.
We assume both have the same codomain:
$$ \mathcal{X}_C = \mathcal{X}_A = \{0,1\} $$
where 1 = convert (user finishes the intended action, e.g., creating a passkey) and 0 = abandon.
Technical failures are treated as success here because the user attempted the action.
Sample Proportions¶
NHST usually works with sample proportions, the average of $n$ Bernoulli draws:
$$ \hat{p}_C = \frac{1}{n}\sum_{i=1}^n X_{C_i}, \quad \hat{p}_A = \frac{1}{n}\sum_{i=1}^n X_{A_i}. $$
Each $\hat{p}$:
- Is a random variable taking values $\{0,\tfrac1n,\tfrac2n,\ldots,1\}$.
- Is also an estimator of the true expected value $p = E[X]$.
By the law of large numbers, $\hat{p} \to p$ as $n$ grows.
(Statisticians use a “hat” to denote an estimator.)
Formally, an estimator maps $n$ realizations of $X$ ($\mathcal{X}^n$) to a real number:
$$ \hat{p}: \mathcal{X}^n \to [0,1]. $$
Because $n\hat{p}$, the number of successes, follows a binomial distribution, $\hat{p}$ is a scaled binomial that becomes approximately Gaussian when $n$ is large.
Variance and Standard Deviation of a Sample Proportion¶
For a single Bernoulli $X$:
$$
\mathrm{Var}(X) = p(1-p).
$$
For the sample proportion: $$ \mathrm{Var}\!\left(\tfrac1n \sum_{i=1}^n X_i\right) = \tfrac1{n^2} n p(1-p) = \tfrac{p(1-p)}{n}. $$
$$ \boxed{\mathrm{Var}(\hat{p}) = \frac{p(1-p)}{n}} $$
The square root of this variance is the standard error — a quantity we’ll use later.
$$ \boxed{SE = SD(\hat{p}) = \sqrt{\frac{p(1-p)}{n}}} $$
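We can sanity-check the boxed formula by simulation: the empirical standard deviation of many simulated $\hat{p}$ values should match $\sqrt{p(1-p)/n}$. A quick sketch with assumed values $p = 0.2$ and $n = 2000$ (chosen arbitrarily for illustration):

```python
import numpy as np

rng = np.random.default_rng(42)
p, n = 0.20, 2_000  # assumed true conversion rate and sample size

# Simulate 100,000 experiments of n Bernoulli draws each; each p_hat is the
# number of successes divided by n
p_hats = rng.binomial(n, p, size=100_000) / n

analytic_se = np.sqrt(p * (1 - p) / n)
print(f"Analytic SE:  {analytic_se:.5f}")
print(f"Empirical SD: {p_hats.std():.5f}")
```

The two numbers agree to several decimal places, and a histogram of `p_hats` would show the approximately Gaussian shape used throughout the NHST derivations below.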
Difference in Proportions¶
For our current problem of deciding "non-inferiority" or "superiority", we will use as our metric the difference between variant and control proportions:
$$ \hat{\Delta} = \hat{p}_A - \hat{p}_C $$
This estimates the true difference
$$ \Delta = p_A - p_C. $$
Hypotheses¶
Null Hypothesis $H_0$ — the “bad” scenario we want to reject:
the new UX degrades conversion by at least some small amount $\epsilon$ we consider unacceptable (e.g., $3\%$):$$ H_0: E[\Delta] \le -\epsilon $$
Alternative Hypothesis $H_1$ — the new UX is not worse than control (possibly better):
$$ H_1: E[\Delta] > -\epsilon $$
Boundary Hypothesis — used in test construction:
assume the difference is exactly at the acceptable degradation limit:$$ E[\Delta] = -\epsilon $$
Numerical Example¶
For a concrete example, we define the following counts and quantities for each experience:
$n_C$ : number of visitors in the control group
$x_C$ : number of conversions observed in the control group
$n_A$ : number of visitors in the variant A group
$x_A$ : number of conversions observed in the variant A group
$\hat{\Delta}_{\mathrm{obs}}$ : the observed difference in conversion proportions between variant and control; it is negative when the difference goes in the "wrong" direction, i.e., a degradation
$-\epsilon$ : the acceptable degradation margin, i.e., the largest decrease in conversion we are willing to tolerate for the new variant
# Actual experiment data
nC = 32106
xC_observed = 22772
control_group_conversion_rate = xC_observed / nC
# Three variants with actual experiment data
variants = {
'A': {'n': 4625, 'x': 3244},
'B': {'n': 2100, 'x': 1433},
'C': {'n': 2022, 'x': 1396}
}
# Variant under test (variant C here; variable names keep the generic A/X convention)
nX = variants['C']['n']
xX_observed = variants['C']['x']
# Test parameters
epsilon = 0.02 # 2% non-inferiority margin
nhst_alpha = 0.05 # 5% significance level
# Derived values
hatpC_observed = xC_observed / nC
hatpA_observed = xX_observed / nX
hatDelta_observed = hatpA_observed - hatpC_observed
print(f"Control group conversion rate: {hatpC_observed:.4f}")
print(f"Treatment group A conversion rate: {hatpA_observed:.4f}")
print(f"Observed difference in conversion rate: {hatDelta_observed:.4f}")
Control group conversion rate: 0.7093 Treatment group A conversion rate: 0.6904 Observed difference in conversion rate: -0.0189
Standard Deviation of the Estimator $\hat{\Delta}$ (a.k.a. Standard Error in Frequentist Statistics)¶
In NHST, the first step is to estimate the standard deviation of the estimator $\hat{\Delta}$ (often called the standard error, SE). Note that this is already a relatively convoluted concept: we must imagine a hypothetical world in which we could repeat the experiment many times and ask, assuming the null hypothesis, what variance we would observe in the results.
We then compare the observed difference in proportions from the experiment to this estimated variability to decide whether the observed effect is "far enough" from what we would expect, still under the null hypothesis $H_0$.
This is a key pain point for NHST:
- We do not know the true standard deviation — it depends on the unknown and unknowable underlying conversion probabilities.
- Frequentist methods therefore use the plug-in principle: estimate the unknown variance by “plugging in” the sample estimates (the data you just observed).
But note the circularity:
- We want to know if the data are unusual under $H_0$.
- To measure “unusual,” we need the standard error assuming $H_0$.
- SE depends on the unknown true rates, so we plug in $\hat{p}$ (from the data!).
- We then use this data-derived SE to judge whether the data are unusual.
It’s like saying: “Use my one measurement to tell me how variable my measurements are, then use that to decide if my measurement is surprising.”
Frequentists accept this because:
- Long-run frequency view: if we repeated the procedure many times in a hypothetical world and everything remained constant, it would have correct average properties over the long run.
- Pragmatism: it’s computable from one experiment.
- Simulation evidence: works “reasonably well” for not-too-small $n$, though “reasonably well” is debatable (see the many articles and books about the replication crisis in medicine and the "soft" sciences).
- Framework limitation: in classical stats, parameters are fixed unknowns, so no natural way to treat them as random.
Common Plug-In Approaches (a.k.a. “Standard Error Hacks”)¶
1. Wald Pooled Standard Error (for a “No Effect” Hypothesis)¶
If the hypothesis is no effect ($p_A = p_C$), we can pool data from control and variant, because under this hypothesis they are assumed to come from the same distribution.
Realizations of sample proportions are:
$$ \hat{p}_A = \frac{x_A}{n_A}, \qquad \hat{p}_C = \frac{x_C}{n_C}. $$
Assuming "no effect" is true (no difference), a pooled estimator is:
$$ \hat{p}_{\text{pool}} = \frac{x_A + x_C}{n_A + n_C}. $$
Variance of the difference between two independent proportions is the sum of their variances:
$$ \mathrm{Var}(\hat{p}_A - \hat{p}_C) = \mathrm{Var}(\hat{p}_A) + \mathrm{Var}(\hat{p}_C). $$
Plugging in the pooled estimate:
$$ \mathrm{Var}(\hat{p}_A - \hat{p}_C) = \hat{p}_{\text{pool}}(1-\hat{p}_{\text{pool}}) \left(\frac{1}{n_A}+\frac{1}{n_C}\right). $$
So the pooled standard error is:
$$ \boxed{\text{WaldPooled SE} = \sqrt{\hat{p}_{\text{pool}}(1-\hat{p}_{\text{pool}}) \left(\frac{1}{n_A}+\frac{1}{n_C}\right)} } $$
This is the classic standard error of the difference between two independent proportions.
Once SE is computed, we calculate a z-score (number of SEs the observed difference is away from the null value 0):
$$ z = \frac{\hat{p}_A - \hat{p}_C}{\text{SE}}. $$
2. Wald Unpooled Standard Error (for Non-Inferiority)¶
For a non-inferiority test, the null sits at the margin $-\epsilon$, so we cannot assume $p_A = p_C$ and we don't pool.
Instead we sum the individual variances (using the plug-in trick separately for each group):
$$ \widehat{\text{WaldUnpooled SE}} = \sqrt{\frac{\hat{p}_A(1-\hat{p}_A)}{n_A} + \frac{\hat{p}_C(1-\hat{p}_C)}{n_C}}. $$
Ideally, the true $p_A$ and $p_C$ should be used here, but we don’t know them — so we substitute $\hat{p}_A$ and $\hat{p}_C$.
This works but can be inaccurate if sample sizes are too small or the underlying rates are toward extremes.
3. Newcombe / Score-Based (Wilson)¶
- Better coverage than Wald, especially with imbalanced sample sizes or very high/low $p$.
- Still closed-form and relatively easy to compute.
- Still uses plug-in estimates and suffers from the same “single-sample” limitation.
(Not detailed here — but recommended over Wald when possible.)
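For the curious, Newcombe's hybrid interval is simple enough to sketch: build a Wilson score interval for each proportion, then combine them into an interval for the difference. The code below is our own minimal version (not the statsmodels `confint_proportions_2indep` imported earlier), using a one-sided $z$ suitable for non-inferiority use; reject $H_0$ when the lower bound exceeds $-\epsilon$:

```python
import math
from scipy.stats import norm

def wilson_interval(x, n, z):
    """Wilson score interval for a single proportion."""
    p_hat = x / n
    denom = 1 + z**2 / n
    center = (p_hat + z**2 / (2 * n)) / denom
    half = z * math.sqrt(p_hat * (1 - p_hat) / n + z**2 / (4 * n**2)) / denom
    return center - half, center + half

def newcombe_diff_interval(x_a, n_a, x_c, n_c, alpha=0.05):
    """Newcombe hybrid interval for p_A - p_C (one-sided z at level alpha)."""
    z = norm.ppf(1 - alpha)
    p_a, p_c = x_a / n_a, x_c / n_c
    l_a, u_a = wilson_interval(x_a, n_a, z)
    l_c, u_c = wilson_interval(x_c, n_c, z)
    d = p_a - p_c
    lower = d - math.sqrt((p_a - l_a)**2 + (u_c - p_c)**2)
    upper = d + math.sqrt((u_a - p_a)**2 + (p_c - l_c)**2)
    return lower, upper

# Variant C vs. control, with the counts from this notebook's numerical example
lower, upper = newcombe_diff_interval(1396, 2022, 22772, 32106)
print(f"Newcombe interval for the difference: [{lower:.4f}, {upper:.4f}]")
print(f"Non-inferior at a 2% margin? {lower > -0.02}")
```

On this data the lower bound sits well below $-0.02$, consistent with the fail-to-reject outcome the Wald-based test produces on the same numbers.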
4. Miettinen–Nurminen¶
- Widely used in clinical trials and regulated industries (e.g., FDA guidance).
- Provides improved accuracy for non-inferiority tests.
- However: mathematically complex, still plug-in based, and still fundamentally relies on one sample.
Summary¶
Frequentist methods must estimate variance from the data itself — leading to circularity and potential miscalibration, especially for small samples or edge cases.
Later we’ll see how Bayesian methods avoid this by modeling uncertainty about the true conversion rates directly.
Numerical examples
pooled_proportion = (xC_observed + xX_observed) / (nC + nX)
wald_pooled_SE = (pooled_proportion * (1 - pooled_proportion) * (1/nC + 1/nX))**0.5
print(f"Wald Pooled Standard Error: {wald_pooled_SE:.4f}")
wald_unpooled_SE = ((hatpC_observed * (1 - hatpC_observed) / nC) + (hatpA_observed * (1 - hatpA_observed) / nX))**0.5
print(f"Wald Unpooled Standard Error: {wald_unpooled_SE:.4f}")
Wald Pooled Standard Error: 0.0104 Wald Unpooled Standard Error: 0.0106
Probability of False Positive, p-value, Significance Level $\alpha$ (sometimes called “confidence”), and Critical Value¶
Once we have an estimate of the standard error $SE$ (the standard deviation of $\hat{\Delta}$), NHST assumes a sampling distribution for the estimator under the null hypothesis $H_0$.
The idea is:
- If we know (or assume) the mean and standard deviation of $\hat{\Delta}$ under $H_0$,
- we can model it with a known probability distribution and calculate how likely any observed result is.
Although the true process is discrete (binomial), in practice we often approximate it by a normal (Gaussian) distribution. This is mathematically simpler and is a good approximation for moderate or large sample sizes.
Using the “boundary” value as the mean of the distribution we integrate¶
To decide whether the result we are seeing is “unlikely”, we integrate the right tail of the probability distribution, with lower bound $\hat{\Delta}_{\mathrm{obs}}$ and upper bound $+\infty$; this gives the “probability of the conversion difference being at $\hat{\Delta}_{\mathrm{obs}}$ or better”.
The null hypothesis for "non inferiority" is technically an inequality
$$ H_0: E[\Delta] \le -\epsilon. $$
So we could imagine many probability distributions centered at any value below $-\epsilon$, and all of them would be covered (true) under $H_0$. We don't want to do the computation for all of them, so to get a single distribution to work with, we use the boundary value as the mean of the distribution:
$$ \mu = E[\Delta] = -\epsilon. $$
Why?
- Whatever decision we make with $\mu = E[\Delta] = -\epsilon$ as the mean of the distribution and $\hat{\Delta}_{\mathrm{obs}}$ as the lower bound of the integral, we would make the exact same decision for any distribution centered further left than $-\epsilon$: the lower bound $\hat{\Delta}_{\mathrm{obs}}$ does not move, and moving the mean to the left only makes the right tail smaller/thinner (less area, less probable). The opposite is not true: if we picked a distribution centered above $-\epsilon$ but still integrated from $\hat{\Delta}_{\mathrm{obs}}$, we could fail to reject the null because the tail probability gets higher (say > 5%), while we would still reject it with the mean back at $-\epsilon$. So the boundary is the most conservative choice and the only correct test we can make.
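This monotonicity is easy to check numerically. A quick sketch, with the observed difference and unpooled SE from this notebook's example hardcoded for self-containment:

```python
from scipy.stats import norm

obs = -0.0189   # observed difference (from the example above)
se = 0.0106     # unpooled standard error (from the example above)
eps = 0.02      # non-inferiority margin

# The right-tail probability P(Delta_hat >= obs) shrinks as the assumed
# mean moves further left of the boundary -eps
for mu in (-eps, -0.03, -0.05):
    print(f"mean {mu:+.3f}: tail probability {norm.sf(obs, loc=mu, scale=se):.4f}")
```

Any mean below $-\epsilon$ produces a smaller tail probability than the boundary does, so rejecting at the boundary implies rejecting under every distribution allowed by $H_0$.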
So under $H_0$, we model $\hat{\Delta}$ as:
$$ \hat{\Delta} \sim N(\mu, \sigma) $$
with
$$ \mu = -\epsilon, \qquad \sigma = SE. $$
# Plot the Gaussian N(mu, sigma) and shade the right-tail area beyond x
# Use previously defined values
SE_H0 = wald_unpooled_SE
mu_H0 = -epsilon
sigma_H0 = SE_H0
x0 = hatDelta_observed
# Create the plot using the helper function
fig, ax = plot_gaussian_hypothesis_test(
mu_H0=mu_H0,
sigma_H0=sigma_H0,
observed_value=x0,
alpha=nhst_alpha,
epsilon=epsilon
)
Computing the p-Value¶
The p-value is the probability (under $H_0$) of observing a result as extreme or more extreme than what we got, in the direction of the alternative $H_1$.
In this one-sided non-inferiority test, that means the right tail probability:
$$ p\text{-value} = P_{H_0}\big[\hat{\Delta} \ge \hat{\Delta}_{\text{obs}}\big] = \int_{\hat{\Delta}_{\text{obs}}}^{+\infty} \frac{1}{\sqrt{2\pi}\,\sigma} \exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)\,dx. $$
This tail integral is the survival function of the normal distribution.
Using the standard normal CDF $\Phi$:
$$ p\text{-value} = 1 - \Phi\!\left(\frac{\hat{\Delta}_{\text{obs}}-\mu}{\sigma}\right). $$
Significance Level $\alpha$ and Critical Value¶
- We choose a significance level $\alpha$ (often $0.05$).
- If $p\text{-value} \le \alpha$, we reject $H_0$ — our result is unlikely under the null.
The critical value $c$ is the smallest observed difference that would lead to rejection at level $\alpha$. It is obtained by inverting the right-tail probability:
$$ c = \mu + \sigma \,\Phi^{-1}(1 - \alpha). $$
Any observed $\hat{\Delta}_{\text{obs}} \ge c$ yields $p\text{-value} \le \alpha$ and thus rejects $H_0$.
Below we’ll compute the p-value and critical value explicitly and visualize how the right tail behaves.
SE_H0 = wald_unpooled_SE
from scipy.stats import norm
mu_H0 = -epsilon # mean
sigma_H0 = SE_H0 # standard deviation
x = hatDelta_observed # value to evaluate
# Survival function P(X > x)
p_value = norm.sf(x, loc=mu_H0, scale=sigma_H0)
print(f'p-value (one-sided) for an observed difference in proportions at {hatDelta_observed:.4f}: {p_value:.4f}')
# inverse survival function to find critical value for given p-value
critical_value = norm.isf(nhst_alpha, loc=mu_H0, scale=sigma_H0)
print(f"Critical value for our alpha cutoff value at {nhst_alpha:.4f}: {critical_value:.4f}")
p-value (one-sided) for an observed difference in proportions at -0.0189: 0.4575 Critical value for our alpha cutoff value at 0.0500: -0.0026
So with the acceptable degradation value we chose for this example, that is a 2% maximum acceptable degradation, the p-value is a bit more than 45%: very far from being enough if we pick the customary 5% as the alpha cutoff for significance. So we would "fail to reject", meaning we can't say much; we don't know whether we can claim that the new CX does not cause some unwanted degradation.
Note that this is because the sample is relatively small. Small samples are a realistic situation when launching a new feature; for various reasons including unresolved bugs you usually just put a tiny bit of traffic on the new feature. But those small samples are often not sufficient to make a decision based on NHST. This is due to the way the standard error is computed; it needs more data to deliver some insights.
Traditional Presentation Using z-Scores¶
Another common way to compute the p-value in NHST is to standardize the observed statistic rather than work directly with $\hat{\Delta}_{\mathrm{obs}}$.
The idea:
- Convert our Gaussian with mean $\mu$ and standard deviation $\sigma$ into a standard normal $N(0,1)$.
- This is done by the familiar $z$-score transformation:
$$ Z = \frac{X - \mu}{\sigma}. $$
For our non-inferiority test:
$$ Z_{\mathrm{NI}} = \frac{\hat{\Delta} - E[\Delta]_{H_{\text{boundary}}}}{SE} = \frac{\hat{\Delta} - (-\epsilon)}{SE} = \frac{\hat{\Delta} + \epsilon}{SE}. $$
Thus $Z_{\mathrm{NI}}$ follows approximately a standard normal $N(0,1)$ under $H_0$.
Tail Probability in Standard Normal Form¶
We want the probability (under $H_0$) of observing something at least as extreme as our sample:
$$ P\big[\hat{\Delta} \ge \hat{\Delta}_{\mathrm{obs}}\big] = P\!\left[\frac{\hat{\Delta} + \epsilon}{SE} \ge \frac{\hat{\Delta}_{\mathrm{obs}}+\epsilon}{SE}\right]. $$
This is simply the right-tail probability of a standard normal:
$$ \int_{Z_{\mathrm{NI}}}^{+\infty} \frac{1}{\sqrt{2\pi}}\,e^{-z^2/2}\,dz. $$
Using the standard normal survival function gives the same p-value as before — this is just the “classical” z-score framing that many NHST tutorials use.
zni = (hatDelta_observed + epsilon) / SE_H0
p_zni = norm.sf(zni)
print(f"z_NI: {zni:.4f}, p-value: {p_zni:.4f}")
z_NI: 0.1067, p-value: 0.4575
This is the same p-value, just a different way to compute the integral.
False Positive (a.k.a. Type I Error)¶
In this NHST setup, $\alpha$ is the cutoff for the decision rule (we reject $H_0$ when the p-value falls at or below it), and it equals the conditional probability of rejecting $H_0$ when $H_0$ is actually true. In this "non-inferiority" setup a rejection is considered a "positive", since $H_0$ is "what we don't want to see"; a false positive therefore means concluding “no unacceptable degradation” when there actually is a nasty one.
Note the distinction: the p-value is P(data as extreme or more extreme | H₀ is true), while $\alpha$ is the threshold chosen so that P(Reject H₀ | H₀ is true) = $\alpha$; we reject H₀ when p-value ≤ $\alpha$.
By setting the significance level $\alpha = 0.05$, we accept a 5% risk of making this wrong decision when there is a degradation. But note that this is a frequentist definition: if we ran the experiment many times, on average we would incorrectly reject 5% of the time. However, it does not assign any actual probability to our current decision/experiment (Bayesian methods do, see below).
It also says nothing about the "effect size", that is, about how much non-degradation/improvement we may have. For non-inferiority this does not matter too much, but for superiority we would want to know; this too is something the Bayesian approach can do more directly.
False Negative (Type II Error), Power, and Sample Size¶
A false negative is failing to reject $H_0$ when the alternative $H_1$ is actually true.
In the context of a non-inferiority test, a false negative means:
We fail the test (do not reject $H_0$) even though the new UX is truly non-inferior (as good as or better than the old one). Usually this means we would need to keep gathering data until the test has more power to detect the effect (see below).
Choosing an Effect Size Under $H_1$¶
Just as with the Type I error calculation, we need to pick an expected value for the difference $\Delta$ — but this time under $H_1$.
- In practice, we must choose a single reference value to center the alternative distribution.
- A common (and pragmatic) choice is the minimum effect size we care to detect — often set to $E[\Delta] = 0$ (meaning no difference between variant and control).
- If the variant is truly “no worse” (Δ = 0), the test should reject $H_0$ most of the time.
This choice is somewhat arbitrary and reflects a business decision: “How small of a difference do we consider acceptable to detect?”
Modeling Under $H_1$¶
If we assume the variant is truly no worse (Δ = 0), we can pool samples to estimate the standard error (since under $H_1$ we’re treating them as coming from the same distribution):
$$ SE_{H_1} = \text{WaldPooled SE} = \sqrt{\hat{p}_{\mathrm{pool}} (1-\hat{p}_{\mathrm{pool}}) \left(\tfrac{1}{n_C}+\tfrac{1}{n_A}\right)}. $$
We then compare this alternative distribution (mean = 0, std = $SE_{H_1}$) to the critical value $c$ that was already set by the significance level $\alpha$.
Beta and Power¶
$\beta$ (Type II error) = the false negative rate = the probability that the observed statistic falls below the critical value established under $H_0$, computed with a different distribution: its mean is the value we chose under $H_1$ (e.g., Δ = 0) and its standard error is the one established under $H_1$. "Negative" here means "fail to reject" while non-inferiority truly held.
Power = $1-\beta$ = probability of correctly rejecting $H_0$ when the variant is truly non-inferior. Another way to say this: knowing the property we care about is really there (non-inferiority), what is our probability of detecting it? In machine learning and search-query analysis, power is also called "recall": if the property we care about is really there, how often do we find it.
Graphically:
- The null distribution is centered at $-\epsilon$ (our boundary).
- The alternative distribution is centered at $0$ (no degradation).
- $\beta$ is the area of the alternative distribution to the left of the critical value.
SE_H1 = wald_pooled_SE
mu_H1 = 0
sigma_H1 = SE_H1
x = critical_value
beta = norm.cdf(x, loc=mu_H1, scale=sigma_H1)
print(f"Observed probability of false negative a.k.a β a.k.a type 2 errors, at critical value : {beta:.4f}")
power = 1 - beta
print(f"Observed Power (1 - β): {power:.4f}")
Observed probability of false negative a.k.a β a.k.a type 2 errors, at critical value : 0.4022 Observed Power (1 - β): 0.5978
# Plot Type II error analysis
# Create the plot using the helper function
fig, ax = plot_type_ii_error_analysis(
mu_H1=mu_H1,
sigma_H1=sigma_H1,
critical_value=critical_value,
hatDelta_observed=hatDelta_observed,
epsilon=epsilon,
beta=beta,
power=power
)
Designing for Target Power¶
If we want to achieve a target power — commonly 80% (so $\beta = 0.2$) —
we can solve for the required sample size (embedded in $SE$).
- Larger $n$ → smaller $SE$ → distributions separate more clearly → higher power.
- This is the usual sample size calculation step when planning an A/B test.
In practice, one:
- Fixes $\alpha$ (e.g., 0.05).
- Chooses the minimum effect size of interest (e.g., $\Delta=0$ for non-inferiority).
- Sets desired power (e.g., 80%).
- Solves for $n_C$ and $n_A$ to achieve that power given the pooled variance.
The code to solve for a target power is straightforward to develop; since we will favor Bayesian approaches, which can work with small samples, we rely on the compute_sample_size_non_inferiority utility imported above rather than deriving it again here.
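For the equal-allocation case with Δ = 0 under $H_1$, the solve has a closed form: requiring $\Phi(\epsilon/SE - z_{1-\alpha}) = 1 - \beta$ with $SE^2 = 2p(1-p)/n$ gives $n = 2p(1-p)\,(z_{1-\alpha} + z_{1-\beta})^2/\epsilon^2$ per group. A minimal sketch (the function name is ours, not part of the notebook's modules):

```python
import math
from scipy.stats import norm

def n_per_group_equal(p, epsilon, alpha=0.05, power=0.80):
    """Per-group sample size for a one-sided non-inferiority z-test,
    assuming equal allocation and a true difference of 0 under H1."""
    z_total = norm.ppf(1 - alpha) + norm.ppf(power)
    return math.ceil(2 * p * (1 - p) * z_total**2 / epsilon**2)

print(n_per_group_equal(0.7093, 0.02))  # 6375 per group at 80% power
```

Plugging in the control rate and 2% margin from this notebook's example reproduces the 6,375-per-group figure computed by the utility function below.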
# Example: Compute sample size for our passkey example using the utility function
print("="*80)
print("SAMPLE SIZE CALCULATION FOR NON-INFERIORITY TEST")
print("="*80)
# Parameters from our example
p_control = control_group_conversion_rate
epsilon_val = epsilon
alpha_val = nhst_alpha
target_power = 0.80
print(f"\nParameters:")
print(f" Control conversion rate: {p_control:.2%}")
print(f" Non-inferiority margin (ε): {epsilon_val:.2%}")
print(f" Significance level (α): {alpha_val:.2%}")
print(f" Target power: {target_power:.2%}")
print(f" Assumed true difference under H1: 0 (no difference)")
# Equal allocation (1:1)
result_equal = compute_sample_size_non_inferiority(
p_control=p_control,
epsilon=epsilon_val,
alpha=alpha_val,
target_power=target_power,
h1_effect_size=0.0,
allocation_ratio=1.0
)
print(f"\n{'='*80}")
print("EQUAL ALLOCATION (1:1 - Control:Variant)")
print(f"{'='*80}")
print(f"Required sample size per group: {result_equal['n_variant']:,}")
print(f" Control: {result_equal['n_control']:,}")
print(f" Variant: {result_equal['n_variant']:,}")
print(f" Total: {result_equal['n_total']:,}")
print(f"\nAchieved power: {result_equal['power_achieved']:.4f} ({result_equal['power_achieved']*100:.1f}%)")
# Unequal allocation (10:1 - more traffic to control)
result_unequal = compute_sample_size_non_inferiority(
p_control=p_control,
epsilon=epsilon_val,
alpha=alpha_val,
target_power=target_power,
h1_effect_size=0.0,
allocation_ratio=0.1 # Variant gets 10% of control's sample size
)
print(f"\n{'='*80}")
print("UNEQUAL ALLOCATION (10:1 - Control gets 10x more traffic)")
print(f"{'='*80}")
print(f" Control: {result_unequal['n_control']:,}")
print(f" Variant: {result_unequal['n_variant']:,}")
print(f" Total: {result_unequal['n_total']:,}")
print(f"\nAchieved power: {result_unequal['power_achieved']:.4f} ({result_unequal['power_achieved']*100:.1f}%)")
print(f"\n{'='*80}")
print("COMPARISON WITH CURRENT EXAMPLE")
print(f"{'='*80}")
print(f"Current sample sizes:")
print(f" Control: {nC:,}")
print(f" Variant: {nX:,}")
print(f" Observed power: {power:.4f} ({power*100:.1f}%)")
print(f"\nTo achieve 80% power, you would need:")
print(f" Equal allocation: {result_equal['n_variant']:,} per group")
print(f" Increase factor: {result_equal['n_variant'] / nX:.1f}x more samples per group")
print(f"\n💡 KEY INSIGHT:")
print(f" With current n={nX}, power is only {power*100:.1f}%")
print(f" Need n≈{result_equal['n_variant']:,} per group for 80% power")
print(f" This is why NHST struggles with small samples!")
================================================================================
SAMPLE SIZE CALCULATION FOR NON-INFERIORITY TEST
================================================================================

Parameters:
  Control conversion rate: 70.93%
  Non-inferiority margin (ε): 2.00%
  Significance level (α): 5.00%
  Target power: 80.00%
  Assumed true difference under H1: 0 (no difference)

================================================================================
EQUAL ALLOCATION (1:1 - Control:Variant)
================================================================================
Required sample size per group: 6,375
  Control: 6,375
  Variant: 6,375
  Total: 12,750

Achieved power: 0.8000 (80.0%)

================================================================================
UNEQUAL ALLOCATION (10:1 - Control gets 10x more traffic)
================================================================================
  Control: 35,059
  Variant: 3,506
  Total: 38,565

Achieved power: 0.8000 (80.0%)

================================================================================
COMPARISON WITH CURRENT EXAMPLE
================================================================================
Current sample sizes:
  Control: 32,106
  Variant: 2,022
  Observed power: 0.5978 (59.8%)

To achieve 80% power, you would need:
  Equal allocation: 6,375 per group
  Increase factor: 3.2x more samples per group

💡 KEY INSIGHT:
  With current n=2022, power is only 59.8%
  Need n≈6,375 per group for 80% power
  This is why NHST struggles with small samples!
NHST Confidence Interval (CI)¶
NHST also has a notion of a Confidence Interval (CI), but it does not mean what most people think. It says nothing about the hypothesis, about whether the observed value is close to the truth, or about whether rejecting H0 is correct. It is computed without reference to either hypothesis: using only the "plugged-in" standard error SE (derived from the observation, with all the caveats discussed above), it states that if we repeated the experiment many times and constructed an interval each time with this procedure, 95% of those intervals would contain the true parameter value. The parameter is fixed; the interval is random across repeated experiments. This is not directly useful for making decisions. Some people use it indirectly by checking whether the interval overlaps some key value, but that is just the p-value analysis above in another guise (or the "picking the best variant" analysis below): the underlying computation is the same, and it is the mainstream way of doing it for NHST practitioners.
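As a concrete sketch of that procedure, using the control and variant C counts that appear in the notebook's outputs (22,772/32,106 and 1,396/2,022; the control count is inferred from the printed rate, so treat these as illustrative):

```python
import numpy as np

# Counts inferred from the notebook's printed rates
n_c, x_c = 32_106, 22_772   # control
n_x, x_v = 2_022, 1_396     # variant C
p_c, p_v = x_c / n_c, x_v / n_x
delta_hat = p_v - p_c

# "Plugged-in" (Wald) standard error of the difference
se = np.sqrt(p_c * (1 - p_c) / n_c + p_v * (1 - p_v) / n_x)
ci = (delta_hat - 1.96 * se, delta_hat + 1.96 * se)
print(f"Δ̂ = {delta_hat:.4f}, 95% CI = [{ci[0]:.4f}, {ci[1]:.4f}]")
```

Here the interval straddles the −2% margin, so the overlap heuristic is exactly as inconclusive as the p-value analysis it mirrors.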
Bayesian Approach¶
In contrast to NHST, the Bayesian approach is conceptually simpler:
- Instead of fixing two specific expected values for $\Delta$ (one under $H_0$ and one under $H_1$),
- We treat the true conversion difference $E[\Delta]$ itself as an unknown random variable and reason about its entire probability distribution.
This lets us quantify directly how likely any value of $\Delta$ is, given both prior knowledge and the data we observe.
The typical workflow is
- Pick a prior belief, express it as a Beta distribution
- Run the experiment
- Use Bayes theorem to update the prior belief into a posterior belief
- Rinse and repeat: this is one of the strengths of the method. The posterior belief can be used as the new prior before running more experiments that will firm it up, if needed, with full mathematical rigor, unlike NHST, whose sampling framework forbids peeking or stopping early.
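The "rinse and repeat" step is exact for the Beta-Bernoulli pair: feeding the batch-1 posterior in as the batch-2 prior gives the same posterior as a single update on the pooled data. A minimal sketch with made-up counts:

```python
from scipy.stats import beta as beta_dist

a, b = 1, 1  # uninformative Beta(1,1) prior

# Sequential: batch 1 (40/60 conversions), then batch 2 (35/50),
# using the batch-1 posterior as the batch-2 prior
a1, b1 = a + 40, b + (60 - 40)
a2, b2 = a1 + 35, b1 + (50 - 35)

# All-at-once: one update with the pooled counts (75/110)
a_all, b_all = a + 75, b + (110 - 75)

assert (a2, b2) == (a_all, b_all)  # identical posteriors: Beta(76, 36)
print(beta_dist.mean(a2, b2))      # posterior mean 76/112 ≈ 0.679
```

This is why incremental analysis is "free" here: the order in which the data arrives does not matter.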
Using the Beta Distribution for Our Prior Belief¶
For experiments based on Bernoulli trials (success/failure, convert/abandon, etc.), the most convenient way to model our prior belief about a conversion rate is the Beta distribution.
⚠️ Don’t confuse this Beta with the “$\beta$” from NHST (Type II error).
Here, Beta is the name of a probability distribution.
The Beta distribution:
- Is defined on the interval $[0,1]$, making it perfect for modeling a probability.
- Has two shape parameters, $\alpha$ and $\beta$, which control how strongly it reflects our prior knowledge.
Some examples of possible priors:
- Uninformative prior: $ \mathrm{Beta}(1,1) $ — essentially a uniform distribution, expressing “we know nothing.”
- Weakly informative prior: centered roughly around 17–20% but without being very sure
Here are a few graphical examples of the Beta distribution for various values of $\alpha$ and $\beta$:
# Plot comparison of different Beta prior distributions
fig, axes = plot_beta_prior_comparison()
The formal definition of the Beta distribution looks a bit intimidating. The numerator is the same kernel $x^{\alpha-1}(1-x)^{\beta-1}$ that appears in the binomial probability of observing $m$ successes out of $n$ trials when the per-trial probability is $x$; the denominator is just the normalizing constant. Intuition: $\alpha - 1$ and $\beta - 1$ act like prior pseudo-counts of successes ($m$) and failures ($n-m$). After observing data, you add the real counts. Special case (uniform prior): $\mathrm{Beta}(1,1)$ ⇒ posterior $\mathrm{Beta}(m + 1, n - m + 1)$.
$$ f(x, \alpha, \beta) = \frac{x^{\alpha-1}(1-x)^{\beta-1}}{B(\alpha, \beta)} $$
with B being the Beta function B defined as normalizing constant:
$$ B(\alpha, \beta) = \int_0^1 t^{\alpha-1} (1-t)^{\beta-1} dt $$
There is no easy way to work with this formula "by hand", and that is one of the reasons Bayesian approaches were historically impractical: unlike a Gaussian, it cannot be easily tabulated or manipulated on paper. With modern stats packages in Python, however, it is trivial to use.
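A quick check of that claim with scipy: the identity $B(\alpha,\beta) = \Gamma(\alpha)\Gamma(\beta)/\Gamma(\alpha+\beta)$ gives the normalizing constant in closed form, and `scipy.stats.beta` evaluates the pdf for us (the values 20 and 8.2 are just sample shape parameters):

```python
from math import gamma
from scipy.special import beta as beta_fn
from scipy.stats import beta as beta_dist

a, b = 20, 8.2
# Normalizing constant via the Gamma-function identity
print(beta_fn(a, b), gamma(a) * gamma(b) / gamma(a + b))

# pdf at x = 0.7, computed by hand from the formula above
x = 0.7
manual = x**(a - 1) * (1 - x)**(b - 1) / beta_fn(a, b)
print(manual, beta_dist.pdf(x, a, b))  # the two values agree
```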
Conservative Approach: Assuming We Know Nothing (Non-Informative Prior)¶
For the variant A, suppose we start with a non-informative prior — meaning we have no knowledge about the conversion rate $p_A$.
We assume $p_A$ could be anywhere between $0$ and $1$ with equal probability.
This is modeled by the Beta distribution:
$$ \mathrm{Beta}(1,1) $$
— which is just a uniform prior (flat line) on $[0,1]$.
Posterior After Observing Data¶
After running the experiment with:
- $n_A$ = number of trials (users shown variant A),
- $x_A$ = number of successes (conversions),
the posterior distribution for $p_A$ — thanks to the conjugacy of the Beta with the Bernoulli likelihood is:
$$ \mathrm{Beta}(x_A+1,\; n_A - x_A + 1). $$
This follows directly from Bayes’ theorem and the properties of the Beta distribution.
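We can sanity-check the conjugacy claim numerically: brute-force Bayes on a grid (flat prior times binomial likelihood, renormalized) lands on exactly the same curve as the Beta posterior. The counts below are variant C's, taken from the notebook's outputs:

```python
import numpy as np
from scipy.stats import beta as beta_dist, binom

n_a, x_a = 2022, 1396  # variant C's counts from the notebook's outputs

# Brute-force Bayes on a grid: uniform prior × binomial likelihood, renormalized
p = np.linspace(0.0, 1.0, 20_001)
posterior = np.ones_like(p) * binom.pmf(x_a, n_a, p)  # Beta(1,1) prior is flat
posterior /= posterior.sum() * (p[1] - p[0])          # numerical normalization

# Conjugacy says this is exactly Beta(x_a + 1, n_a - x_a + 1)
analytic = beta_dist.pdf(p, x_a + 1, n_a - x_a + 1)
print(np.max(np.abs(posterior - analytic)))  # ~0, up to grid error
```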
Expected Value of the Posterior¶
For any $\mathrm{Beta}(\alpha,\beta)$ distribution the expected value is $\alpha/(\alpha+\beta)$; for our posterior $\mathrm{Beta}(x_A+1,\, n_A-x_A+1)$ this gives $(x_A+1)/(n_A+2)$:
expected_value_posterior = (xX_observed + 1) / (nX + 2)
print(f"Expected value of posterior distribution for p_A: {expected_value_posterior:.4f}")
Expected value of posterior distribution for p_A: 0.6902
Credible Intervals and Visualizing Prior vs. Posterior¶
A credible interval answers: “Given the data and our prior, what range of parameter values has (say) 95% posterior probability?”
Note that this is completely different from the confidence interval of NHST. Although it sounds similar, it is not: a credible interval gives us a direct probability statement about the parameter, so we can say "the true conversion rate is between a and b with x% probability." We can pick whatever x we want and obtain the corresponding interval.
So let's say we use equal-tailed credible intervals (quantiles at 2.5% and 97.5%).
- Prior (uninformative): $\mathrm{Beta}(1,1)$
- Posterior after observing $x_A$ conversions out of $n_A$:
$\alpha_A = x_A+1,\; \beta_A = n_A - x_A + 1$
We can compute these intervals and plot how the posterior updates our belief compared with the prior.
# Posterior parameters (non-informative Beta(1,1) prior)
# Note: use a dedicated name so we don't shadow the NHST significance level
alpha_param = xX_observed + 1
beta_param = nX - xX_observed + 1
# Compute 95% credible interval (2.5th and 97.5th percentiles)
p_L = beta_dist.ppf(0.025, alpha_param, beta_param)
p_U = beta_dist.ppf(0.975, alpha_param, beta_param)
# Output the result
print(f"95% Credible Interval for p: [{p_L:.4f}, {p_U:.4f}]")
95% Credible Interval for p: [0.6699, 0.7102]
So we know, with 95% probability, that the true conversion rate is between 66.99% and 71.02%. Here is a visualization:
# Visualize non-informative prior vs posterior
fig, ax = plot_prior_vs_posterior(
    alpha=alpha_param,
    beta_param=beta_param,
    control_group_conversion_rate=control_group_conversion_rate,
    epsilon=epsilon,
    p_L=p_L,
    p_U=p_U
)
plt.show()
In this first version we assumed that "we know nothing" prior to the experiment, which is not realistic. We know we added a page with an extra click, so unless there is a serious bug the conversion should be "around" the control rate (about 71% in our data), but we want to stay open to small deviations around that value. We can do this with what is known as a weakly informative prior.
Weakly Informative Prior Using the Control Group as the base for the prior¶
Instead of using a completely non-informative prior $\mathrm{Beta}(1,1)$, we can use the control group, since we can see that the variants operate in the same range. That is a sound methodology: in a web/mobile environment with frequent releases, seasonal shifts in usage patterns, marketing campaigns, etc., a historical prior is usually not a good definition of the "legacy" we want to compare the variants with. Only the control group gives us a clean comparable.
Suppose we know the control group conversion rate is in the variable control_group_conversion_rate , and we want to test for non-inferiority with a margin $\epsilon$.
For the variant, we choose a prior centered on what we expect the new mean to be (control_group_conversion_rate) but make it very "flat", meaning we are really not sure where the conversion rate will end up. This is called a weakly informative prior: we want it to have high entropy (wide uncertainty) so it does not dominate the data.
A Beta distribution with a modest $\alpha$ and $\beta$ can be informative about the center while still uncertain.
For a $\mathrm{Beta}(\alpha,\beta)$ distribution:
$$ \mu = \frac{\alpha}{\alpha+\beta} $$
- Smaller values of $\alpha$ and $\beta$ give a wider and flatter (more uncertain) prior.
Let's pick $\alpha = 20$ and solve for $\beta$ so the mean equals control_group_conversion_rate, then recompute the posterior distribution with this prior.
# Informative prior parameters
expected_degradation = 0.0  # This is sound Bayesian prior construction as long as the prior is weakly informative
target_prior_mean = control_group_conversion_rate - expected_degradation
alpha_prior = 20 # Small value for high entropy
beta_prior = (alpha_prior / target_prior_mean) - alpha_prior # Solve for beta given mean
print(f"Prior: Beta({alpha_prior:.2f}, {beta_prior:.2f})")
print(f"Prior mean: {alpha_prior / (alpha_prior + beta_prior):.4f}")
print(f"Prior variance: {(alpha_prior * beta_prior) / ((alpha_prior + beta_prior)**2 * (alpha_prior + beta_prior + 1)):.6f}")
# Posterior parameters after observing data
alpha_posterior = xX_observed + alpha_prior
beta_posterior = (nX - xX_observed) + beta_prior
print(f"\nPosterior: Beta({alpha_posterior:.2f}, {beta_posterior:.2f})")
posterior_mean = alpha_posterior / (alpha_posterior + beta_posterior)
print(f"Posterior mean: {posterior_mean:.4f}")
# Compute probability that variant is non-inferior (p_A > p_C - epsilon)
# This is P(p_A > 0.69) under the posterior
non_inferiority_threshold = control_group_conversion_rate - epsilon
prob_non_inferior = 1 - beta_dist.cdf(non_inferiority_threshold, alpha_posterior, beta_posterior)
print(f"\nProbability that variant is non-inferior: {prob_non_inferior:.4f}")
print(f"This means there's a {prob_non_inferior*100:.2f}% probability that the variant conversion rate is above {non_inferiority_threshold:.2f}")
Prior: Beta(20.00, 8.20)
Prior mean: 0.7093
Prior variance: 0.007062

Posterior: Beta(1416.00, 634.20)
Posterior mean: 0.6907

Probability that variant is non-inferior: 0.5565
This means there's a 55.65% probability that the variant conversion rate is above 0.69
So here, unlike with NHST, thanks to a reasonable prior that uses our understanding of how the passkey CX is built, we end up with something directly actionable: a posterior probability of being above the cutoff. For this variant it is 55.65%, short of a 95% bar, so this particular variant is not demonstrated non-inferior; we will see below that variant A does clear the bar.
Here is a diagram of this prior and the posterior after observing data $(x_A, n_A)$ for the variant, and various credible intervals. For non-inferiority we don't even need the credible interval (see next).
# Probability of beating the unshifted control rate (a stricter check:
# p_A > p_C - expected_degradation, with expected_degradation = 0)
strict_threshold = control_group_conversion_rate - expected_degradation
prob_above_control = 1 - beta_dist.cdf(strict_threshold,
                                       alpha_posterior, beta_posterior)
print(f"Probability that variant beats the control rate: {prob_above_control:.4f}")
print(f"This means there's a {prob_above_control*100:.2f}% probability that the variant conversion rate is above {strict_threshold:.2f}")
Probability that variant beats the control rate: 0.0330
This means there's a 3.30% probability that the variant conversion rate is above 0.71
posterior_data = test_non_inferiority_weakly_informative(
n_control=nC,
x_control=xC_observed,
variants_data=variants,
epsilon=epsilon, # Business: can tolerate x% degradation
expected_degradation=expected_degradation, # 0 because we pick a weakly informative prior centered on control rate
alpha_prior_strength=20, # Weak prior (high entropy)
threshold=0.95 # 95% probability required
)
print(f'probability that variant A is non-inferior: {posterior_data["A"]["probability"]:.4f} ')
posterior_data
probability that variant A is non-inferior: 0.9645
{'A': {'is_non_inferior': np.True_,
'probability': np.float64(0.9645213339830435),
'control_rate': 0.7092755248240205,
'variant_rate': 0.7014530973280955,
'posterior_params': (3264, 1389.1977867556648),
'prior_params': (20, 8.19778675566485),
'prior_mean': 0.7092755248240205,
'threshold': 0.6892755248240204,
'epsilon': 0.02,
'n': 4625,
'x': 3244},
'B': {'is_non_inferior': np.False_,
'probability': np.float64(0.2595423969709174),
'control_rate': 0.7092755248240205,
'variant_rate': 0.6827372949273801,
'posterior_params': (1453, 675.1977867556649),
'prior_params': (20, 8.19778675566485),
'prior_mean': 0.7092755248240205,
'threshold': 0.6892755248240204,
'epsilon': 0.02,
'n': 2100,
'x': 1433},
'C': {'is_non_inferior': np.False_,
'probability': np.float64(0.5564853661754626),
'control_rate': 0.7092755248240205,
'variant_rate': 0.6906650710226104,
'posterior_params': (1416, 634.1977867556649),
'prior_params': (20, 8.19778675566485),
'prior_mean': 0.7092755248240205,
'threshold': 0.6892755248240204,
'epsilon': 0.02,
'n': 2022,
'x': 1396}}
# Prior vs Posterior with non-inferiority tail area (P(p_A > p_C - ε))
threshold = control_group_conversion_rate - epsilon
# Create the plot using the helper function
fig, ax, prob_non_inferior_post, prob_non_inferior_prior = plot_informative_prior_posterior_comparison(
alpha_prior=alpha_prior,
beta_prior=beta_prior,
alpha_posterior=alpha_posterior,
beta_posterior=beta_posterior,
threshold=threshold
)
plt.show()
print(f"Posterior P(p_A > {threshold:.2f}) = {prob_non_inferior_post:.4f} "
f"({prob_non_inferior_post*100:.2f}%)")
print(f"Prior P(p_A > {threshold:.2f}) = {prob_non_inferior_prior:.4f} "
f"({prob_non_inferior_prior*100:.2f}%)")
Posterior P(p_A > 0.69) = 0.5565 (55.65%)
Prior P(p_A > 0.69) = 0.6129 (61.29%)
fig, ax = plot_weakly_informative_prior_with_variants(variants_results=posterior_data)
Picking the Best Variant¶
This is where the difference between NHST and a Bayesian approach becomes dramatic.
Let’s compare the main options.
🧪 NHST Approaches¶
1. Winner-Takes-All¶
- Pick the variant with the highest observed conversion rate.
Problems:
- Ignores uncertainty and sampling noise.
- Easily picks the wrong variant when samples are small.
2. Pairwise t-Tests with Bonferroni Correction¶
- Run one test for every pair (A vs B, A vs C, B vs C).
- Adjust the significance threshold to control false positives:
$$\alpha_\text{corrected} = \frac{0.05}{3} \approx 0.0167.$$
Problems:
- Multiple comparisons inflate Type I error; Bonferroni is very conservative (higher Type II error).
- Only gives “significant / not significant” — no direct probability of being best.
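To make those problems concrete, here is a sketch of the pairwise approach on the notebook's variant counts, using two-proportion z-tests (a common stand-in for t-tests on conversion data) with the Bonferroni-corrected threshold:

```python
import numpy as np
from itertools import combinations
from scipy.stats import norm

# Variant counts (successes, trials) from the notebook's outputs
data = {'A': (3244, 4625), 'B': (1433, 2100), 'C': (1396, 2022)}
alpha_corrected = 0.05 / 3  # Bonferroni over the 3 pairwise comparisons

p_values = {}
for v1, v2 in combinations(data, 2):
    x1, n1 = data[v1]; x2, n2 = data[v2]
    p1, p2 = x1 / n1, x2 / n2
    p_pool = (x1 + x2) / (n1 + n2)                         # pooled rate under H0
    se = np.sqrt(p_pool * (1 - p_pool) * (1 / n1 + 1 / n2))
    z = (p1 - p2) / se
    p_values[(v1, v2)] = 2 * norm.sf(abs(z))               # two-sided p-value
    print(f"{v1} vs {v2}: z = {z:+.2f}, p = {p_values[(v1, v2)]:.3f}, "
          f"significant = {p_values[(v1, v2)] < alpha_corrected}")
```

With these counts, none of the three pairs clears the corrected threshold, even though the posterior analysis later gives variant A a ~78% probability of being best: the correction buys Type I control at the cost of inconclusiveness.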
3. ANOVA + Post-Hoc Tests¶
- One-way ANOVA checks if any difference exists, then post-hoc tests (Tukey, Dunnett, etc.) try to find which.
Problems:
- Still needs multiple-comparison corrections.
- ANOVA only says “something differs” — not which is best or by how much.
4. Confidence Interval Overlap¶
- Compute 95% CIs for each variant and check for overlap.
Problems:
- Overlapping CIs don’t mean “no difference.”
- Often inconclusive and gives no probability a variant is best.
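A small numeric illustration of the trap, with made-up counts: two 95% CIs that overlap even though the difference between the groups is statistically significant at the 5% level.

```python
import numpy as np

# Hypothetical counts chosen to expose the fallacy
n1, x1 = 1000, 500   # p̂₁ = 0.50
n2, x2 = 1000, 550   # p̂₂ = 0.55
p1, p2 = x1 / n1, x2 / n2

se1 = np.sqrt(p1 * (1 - p1) / n1)
se2 = np.sqrt(p2 * (1 - p2) / n2)
ci1 = (p1 - 1.96 * se1, p1 + 1.96 * se1)
ci2 = (p2 - 1.96 * se2, p2 + 1.96 * se2)
print("CIs overlap:", ci1[1] > ci2[0])  # True

# The z-test on the difference uses sqrt(se1² + se2²), which is smaller
# than se1 + se2, so the difference can be significant despite the overlap
z = (p2 - p1) / np.sqrt(se1**2 + se2**2)
print("z =", round(z, 2))  # ≈ 2.24 > 1.96 → significant at 5%
```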
Key Takeaway¶
NHST was designed for asymmetric questions:
- "Is this drug better than placebo?" (directional)
- "Does this treatment have an effect?" (vs. no effect)
NHST struggles with symmetric questions:
- "Which variant is better, A or B?" (symmetric)
- Forces arbitrary directionality or multiple-testing corrections
- Cannot directly compute P(A > B | data)
Bayesian methods naturally handle symmetric comparisons:
- Compute posterior for each variant
- Directly calculate P(A > B), P(B > A), P(C is best), etc.
- No need for multiple-testing corrections
- Scales to any number of variants
- Provides actionable probabilities for decision-making
This is why Bayesian methods are superior for A/B testing where the goal is to pick the best variant, not just detect if one exists.
Summary: Why NHST Struggles with Symmetric A vs B Comparisons¶
All NHST approaches share fundamental limitations:
| Limitation | Impact |
|---|---|
| Computes P(data \| hypothesis) | Not what we want: P(hypothesis \| data) |
| Binary decisions | Reject/fail-to-reject; no probability of being better |
| Asymmetric framework | Must pick a direction or waste α budget |
| No direct answer | Cannot directly answer "Which is better?" |
| No expected value | Cannot compute expected value for decision-making |
What we actually want:
- P(A > B | data) — direct probability A is better
- P(B > A | data) — direct probability B is better
- Symmetric treatment of both variants
- Actionable metric for decision-making
The Bayesian approach provides exactly this. Let's demonstrate:
🌟 Bayesian Approach — Probability of Being Best¶
The Bayesian framework answers the question we actually care about:
Which variant is most likely the best?
Method:
- Compute the posterior Beta distribution for each variant using its prior and observed data.
- Draw a large number of samples (e.g., 100k) from each posterior.
- For each simulated draw, identify which variant has the highest conversion rate.
- Report the probabilities:
$$P(A \text{ is best}),\; P(B \text{ is best}),\; P(C \text{ is best}), \ldots$$
Advantages:
- Direct answer: “Variant B is best with 88.8% probability.”
- Single coherent analysis: no need for multiple-comparison corrections.
- Scales naturally: works the same way for 3, 5, 10, or 100 variants.
- Quantifies uncertainty: not just yes/no; can report $P(B>A)$, $P(B>A \;\&\; B>C)$, etc.
- Flexible: easily integrates prior knowledge and business context.
- Business-friendly: simple to factor in risk, cost, and implementation difficulty.
Key point:
Bayesian analysis gives a probability each variant is best — a direct, interpretable metric that scales cleanly and supports real-world decision making.
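Before running the full simulation, note that for just two variants no sampling is needed: conditioning on $p_A = x$ gives the exact one-dimensional integral $P(p_A > p_B) = \int_0^1 f_A(x)\,F_B(x)\,dx$. A quick sketch with the notebook's A and B counts under a Beta(1,1) prior (the grid-integration approach here is our own choice):

```python
import numpy as np
from scipy.stats import beta as beta_dist

# Posteriors under a Beta(1,1) prior (counts from the notebook's outputs)
aA, bA = 3244 + 1, 4625 - 3244 + 1   # variant A → Beta(3245, 1382)
aB, bB = 1433 + 1, 2100 - 1433 + 1   # variant B → Beta(1434, 668)

# P(p_A > p_B) = ∫ f_A(x) · F_B(x) dx: condition on p_A = x, then F_B(x)
# is the probability that p_B falls below it
x = np.linspace(0.0, 1.0, 200_001)
integrand = beta_dist.pdf(x, aA, bA) * beta_dist.cdf(x, aB, bB)
prob_a_beats_b = integrand.mean()  # Riemann average over an interval of length 1
print(f"P(A > B | data) ≈ {prob_a_beats_b:.3f}")
```

The Monte Carlo approach below generalizes this to "probability of being best" across any number of variants, where the exact integral becomes unwieldy.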
print("\nVariant conversion rates:")
for name, data in variants.items():
rate = data['x'] / data['n']
print(f" {name}: {rate:.4f} ({data['x']}/{data['n']})")
Variant conversion rates:
  A: 0.7014 (3244/4625)
  B: 0.6824 (1433/2100)
  C: 0.6904 (1396/2022)
All variants are above the non-inferiority boundary of 0.69. But which one should we choose? Simply comparing the observed means ignores the fact that the samples have different sizes (4,625 vs 2,100 vs 2,022) and therefore different uncertainty. The "clean" way is to run a Monte Carlo simulation over the posteriors to see which variant would win "in all possible universes".
# Compute posterior distributions (using non-informative prior Beta(1,1))
posteriors = {}
print("\nPosterior Distributions:")
print("-" * 80)
for name, data in variants.items():
# Posterior parameters (with non-informative prior Beta(1,1))
alpha_post = data['x'] + 1
beta_post = data['n'] - data['x'] + 1
# Posterior statistics
posterior_mean = alpha_post / (alpha_post + beta_post)
posterior_var = (alpha_post * beta_post) / \
((alpha_post + beta_post)**2 * (alpha_post + beta_post + 1))
posterior_std = np.sqrt(posterior_var)
# Credible intervals
ci_95_lower = beta_dist.ppf(0.025, alpha_post, beta_post)
ci_95_upper = beta_dist.ppf(0.975, alpha_post, beta_post)
posteriors[name] = {
'alpha': alpha_post,
'beta': beta_post,
'mean': posterior_mean,
'std': posterior_std,
'ci_95': (ci_95_lower, ci_95_upper)
}
print(f"\nVariant {name}:")
print(f" Posterior: Beta(α={alpha_post}, β={beta_post})")
print(f" Posterior mean: {posterior_mean:.4f}")
print(f" Posterior std: {posterior_std:.4f}")
print(f" 95% Credible Interval: [{ci_95_lower:.4f}, {ci_95_upper:.4f}]")
Posterior Distributions:
--------------------------------------------------------------------------------

Variant A:
  Posterior: Beta(α=3245, β=1382)
  Posterior mean: 0.7013
  Posterior std: 0.0067
  95% Credible Interval: [0.6881, 0.7144]

Variant B:
  Posterior: Beta(α=1434, β=668)
  Posterior mean: 0.6822
  Posterior std: 0.0102
  95% Credible Interval: [0.6621, 0.7019]

Variant C:
  Posterior: Beta(α=1397, β=627)
  Posterior mean: 0.6902
  Posterior std: 0.0103
  95% Credible Interval: [0.6699, 0.7102]
# Create visualization of the three posterior distributions
fig, ax = plot_multiple_posteriors_comparison(
posteriors=posteriors,
control_group_conversion_rate=control_group_conversion_rate,
epsilon=epsilon
)
plt.show()
print("\n✓ All three posterior distributions overlap significantly")
print(" This shows there's uncertainty about which is truly best")
✓ All three posterior distributions overlap significantly
  This shows there's uncertainty about which is truly best
# Run Monte Carlo simulation
n_simulations = 100000
print(f"Running {n_simulations:,} simulations...\n")
# Draw samples from each posterior
samples = {}
for name in ['A', 'B', 'C']:
alpha_p = posteriors[name]['alpha']
beta_p = posteriors[name]['beta']
samples[name] = beta_dist.rvs(alpha_p, beta_p, size=n_simulations)
# For each simulation, determine which variant is best (vectorized)
names = ['A', 'B', 'C']
stacked = np.vstack([samples[name] for name in names])  # shape: (3, n_simulations)
winner_idx = np.argmax(stacked, axis=0)                 # index of the best variant per draw
best_counts = {name: int(np.sum(winner_idx == i)) for i, name in enumerate(names)}
# Calculate probabilities
probabilities = {name: count / n_simulations for name, count in best_counts.items()}
print("RESULTS: Probability Each Variant is Best")
print("-" * 80)
for name in ['A', 'B', 'C']:
prob = probabilities[name]
bar = '█' * int(prob * 60)
print(f"P({name} is best) = {prob:.4f} ({prob*100:5.2f}%) {bar}")
# Determine the winner
winner = max(probabilities, key=probabilities.get)
winner_prob = probabilities[winner]
print("\n" + "="*80)
print("BAYESIAN CONCLUSION:")
print("="*80)
print(f"✓ Variant {winner} is most likely the best")
print(f" Probability: {winner_prob:.4f} ({winner_prob*100:.1f}%)")
print(f"\nInterpretation:")
print(f" - There's a {winner_prob*100:.1f}% chance that {winner} has the highest true conversion rate")
print(f" - This accounts for uncertainty in all three estimates")
print(f" - Clear, actionable decision with quantified confidence")
Running 100,000 simulations...
RESULTS: Probability Each Variant is Best
--------------------------------------------------------------------------------
P(A is best) = 0.7815 (78.15%) ██████████████████████████████████████████████
P(B is best) = 0.0444 ( 4.44%) ██
P(C is best) = 0.1741 (17.41%) ██████████

================================================================================
BAYESIAN CONCLUSION:
================================================================================
✓ Variant A is most likely the best
  Probability: 0.7815 (78.2%)

Interpretation:
  - There's a 78.2% chance that A has the highest true conversion rate
  - This accounts for uncertainty in all three estimates
  - Clear, actionable decision with quantified confidence