NHST Non-Inferiority Testing with Real Experiment Data¶
Executive Summary: Why NHST Fails with Small Samples¶
This notebook demonstrates Null Hypothesis Significance Testing (NHST) for non-inferiority testing using real experiment data from the passkey creation feature launch.
The ProblemΒΆ
When launching new web/mobile features:
- Limited traffic allocation: New features get only 2-5% of traffic to minimize risk
- Small sample sizes: Each variant may only see hundreds or low thousands of users
- Need for speed: We need fast decisions to iterate or scale
Real Experiment DataΒΆ
Our passkey creation experiment:
- Control group: 32,106 users, 70.9% conversion rate
- Variant A: 4,625 users, 70.2% conversion rate
- Variant B: 2,100 users, 68.2% conversion rate
- Variant C: 2,022 users, 69.0% conversion rate
NHST Results with Real Data¶
Testing Variant C for non-inferiority (margin ε = 2%):
| Metric | Value | Interpretation |
|---|---|---|
| p-value | 45.8% | >> 5% threshold → Cannot reject null |
| Power | 59.8% | Underpowered (below the 80% target) |
| Conclusion | Inconclusive | Cannot determine if variant is non-inferior |
Required Sample Sizes for 80% Power¶
- Current sample: ~2,000 per variant
- Required sample: ~6,375 per variant (3.2× more)
- Result: NHST cannot provide actionable guidance
Bottom LineΒΆ
NHST fails for early-stage product launches:
- β Requires impractically large samples (weeks of data collection)
- β Provides no actionable insights with small samples
- β Binary reject/fail-to-reject offers no guidance
- β Cannot quantify probability of being non-inferior
This notebook demonstrates the mathematical foundations of NHST and why it's unsuitable for modern product development with small, controlled traffic allocations.
Problem StatementΒΆ
When launching new web or mobile features, engineering teams face a common dilemma:
- Limited traffic allocation: At launch, new features get only 2-5% of traffic to minimize risk
- Multiple variants: Design teams often propose 3-5 different implementations
- Small sample sizes: Each variant may only see hundreds or low thousands of users
- Need for speed: We need fast decisions on which variants are best to iterate or scale
- Imperfect logistics: Bugs or misconfiguration may cause unbalanced allocation
Traditional NHST fails here: With small samples, statistical tests either:
- Fail to reach significance (underpowered, β > 0.8, meaning power < 20%)
- Require weeks of data collection
- Provide no actionable guidance
Test Setup: Control Group vs. Variants¶
For our passkey creation feature:
- Existing flow has completion rate of ~71%
- Keep most traffic on the current experience as the control group C
- Send limited traffic to variants A, B, C
Goal: Determine that each new experience is no worse than the current one.
This type of test, where the goal is to ensure a new design does not degrade the experience, is called a non-inferiority test.
# Setup
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import beta as beta_dist
from plotting_utils import plot_gaussian_hypothesis_test
from plotting_utils import plot_type_ii_error_analysis
from nhst import compute_sample_size_non_inferiority
Null Hypothesis Significance Testing (NHST)¶
At a high level, the NHST workflow is:
- Assume what you don't want to see: this is the null hypothesis.
- Example in medicine: "the drug has no effect."
- Example here: "the new experience significantly increases abandonment."
- Run the experiment and compute a test statistic (proportion = successes / total attempts)
- Ask: If the null hypothesis were true, how likely is it that we would observe a result at least this extreme?
- If that probability (the p-value) is very low, e.g., below 5%, we reject the null.
Two Important Caveats¶
- Rejecting the null does not prove the opposite is true; it only says the data would be unlikely if the null were correct
- The p-value is P(data | H₀), but provides no probability of the hypothesis being correct
- Without P(H₀ | data), we cannot compute expected values for decision-making
- "Unlikely enough" (e.g., 5%) is completely arbitrary: a convention, not a law of nature
Key point: NHST computes P(data | hypothesis).
A Bayesian approach instead computes P(hypothesis | data), a fundamentally different quantity.
Modeling Conversion as Random Variables¶
The conversion of a UX flow can be modeled with Bernoulli random variables:
- $X_C$ for the control experience
- $X_A$ for a new variant $A$
A Bernoulli variable takes only two values: success/failure, convert/abandon, etc.
Each user who sees a page gives one draw from one of these variables.
We assume both have the same codomain:
$$ \mathcal{X}_C = \mathcal{X}_A = \{0,1\} $$
where 1 = convert (user finishes the intended action) and 0 = abandon.
Sample ProportionsΒΆ
NHST works with sample proportions, the average of the Bernoulli draws in each group (note the groups have different sizes $n_C$ and $n_A$):
$$ \hat{p}_C = \frac{1}{n_C}\sum_{i=1}^{n_C} X_{C_i}, \quad \hat{p}_A = \frac{1}{n_A}\sum_{i=1}^{n_A} X_{A_i} $$
Each $\hat{p}$:
- Is a random variable taking values $\{0,\tfrac{1}{n},\tfrac{2}{n},\ldots,1\}$
- Is an estimator of the true expected value $p = E[X]$
- By the law of large numbers, $\hat{p} \to p$ as $n$ grows
Because it is the mean of $n$ Bernoulli variables, $n\hat{p}$ follows a binomial distribution, so $\hat{p}$ becomes approximately Gaussian when $n$ is large.
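A quick simulation illustrates this (p and n here are round numbers chosen to mirror variant C's scale, not exact experiment values): the spread of simulated sample proportions matches $\sqrt{p(1-p)/n}$, the standard error derived in the next section.

```python
import numpy as np

rng = np.random.default_rng(42)
p, n = 0.709, 2022            # conversion rate and sample size roughly matching variant C
n_sims = 20_000               # number of simulated experiments

# Each simulated experiment yields one sample proportion hat_p
hat_p = rng.binomial(n, p, size=n_sims) / n

theoretical_se = np.sqrt(p * (1 - p) / n)   # SD predicted by the formula
empirical_se = hat_p.std()

print(f"Theoretical SE: {theoretical_se:.5f}")
print(f"Empirical SE:   {empirical_se:.5f}")
```

The two values agree to within Monte Carlo noise, confirming that the Gaussian approximation with this standard error is reasonable at our sample sizes.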
Variance and Standard Deviation of a Sample Proportion¶
For a single Bernoulli $X$:
$$
\mathrm{Var}(X) = p(1-p)
$$
For the sample proportion: $$ \mathrm{Var}\!\left(\tfrac{1}{n} \sum_{i=1}^n X_i\right) = \tfrac{1}{n^2} n p(1-p) = \tfrac{p(1-p)}{n} $$
$$ \boxed{\mathrm{Var}(\hat{p}) = \frac{p(1-p)}{n}} $$
The square root of this variance is the standard error:
$$ \boxed{SE = SD(\hat{p}) = \sqrt{\frac{p(1-p)}{n}}} $$
Difference in ProportionsΒΆ
For deciding "non-inferiority" we use the difference between variant and control proportions:
$$ \hat{\Delta} = \hat{p}_A - \hat{p}_C $$
This estimates the true difference:
$$ \Delta = p_A - p_C $$
HypothesesΒΆ
Null Hypothesis $H_0$ β the "bad" scenario we want to reject:
the new UX degrades conversion by at least $\epsilon$ (e.g., 2%):$$ H_0: E[\Delta] \le -\epsilon $$
Alternative Hypothesis $H_1$ β the new UX is not worse than control:
$$ H_1: E[\Delta] > -\epsilon $$
Boundary Hypothesis β used in test construction:
assume the difference is exactly at the acceptable degradation limit:$$ E[\Delta] = -\epsilon $$
Real Experiment DataΒΆ
Our actual passkey creation experiment data:
$n_C$ : number of visitors in the control group
$x_C$ : number of conversions in the control group
$n_A$ : number of visitors in the variant under test (here, variant C)
$x_A$ : number of conversions in the variant under test (here, variant C)
$\hat{\Delta}_{\mathrm{obs}}$ : observed difference in conversion proportions
$-\epsilon$ : acceptable degradation margin (e.g., -2%)
# Real experiment data from passkey creation launch
nC = 32106
xC_observed = 22772
control_group_conversion_rate = xC_observed / nC
# Three variants with actual experiment data
variants = {
    'A': {'n': 4625, 'x': 3244},
    'B': {'n': 2100, 'x': 1433},
    'C': {'n': 2022, 'x': 1396}
}
# Focus on Variant C for detailed NHST analysis
nX = variants['C']['n']
xX_observed = variants['C']['x']
# Test parameters
epsilon = 0.02 # 2% non-inferiority margin
alpha = 0.05 # 5% significance level
# Derived values
hatpC_observed = xC_observed / nC
hatpA_observed = xX_observed / nX
hatDelta_observed = hatpA_observed - hatpC_observed
print("="*80)
print("REAL EXPERIMENT DATA")
print("="*80)
print(f"\nControl group:")
print(f" Sample size: {nC:,}")
print(f" Conversions: {xC_observed:,}")
print(f" Conversion rate: {hatpC_observed:.4f} ({hatpC_observed*100:.2f}%)")
print(f"\nVariant C:")
print(f" Sample size: {nX:,}")
print(f" Conversions: {xX_observed:,}")
print(f" Conversion rate: {hatpA_observed:.4f} ({hatpA_observed*100:.2f}%)")
print(f"\nObserved difference: {hatDelta_observed:.4f} ({hatDelta_observed*100:.2f}%)")
print(f"Non-inferiority margin (ε): {epsilon:.4f} ({epsilon*100:.2f}%)")
print(f"Non-inferiority threshold: {-epsilon:.4f} ({-epsilon*100:.2f}%)")
print(f"\n{'='*80}")
================================================================================
REAL EXPERIMENT DATA
================================================================================

Control group:
  Sample size: 32,106
  Conversions: 22,772
  Conversion rate: 0.7093 (70.93%)

Variant C:
  Sample size: 2,022
  Conversions: 1,396
  Conversion rate: 0.6904 (69.04%)

Observed difference: -0.0189 (-1.89%)
Non-inferiority margin (ε): 0.0200 (2.00%)
Non-inferiority threshold: -0.0200 (-2.00%)

================================================================================
Standard Error Estimation: The Plug-In Principle Problem¶
In NHST, we must estimate the standard deviation of the estimator $\hat{\Delta}$ (the standard error, SE).
This is a key pain point:
- We do not know the true standard deviation β it depends on unknown conversion probabilities
- Frequentist methods use the plug-in principle: estimate the variance by "plugging in" sample estimates
The circularity problem:
- We want to know if the data are unusual under $H_0$
- To measure "unusual," we need the standard error assuming $H_0$
- SE depends on unknown true rates, so we plug in $\hat{p}$ (from the data!)
- We then use this data-derived SE to judge whether the data are unusual
It's like saying: "Use my one measurement to tell me how variable my measurements are, then use that to decide if my measurement is surprising."
Wald Unpooled Standard Error (for Non-Inferiority)¶
For non-inferiority (allowing a margin $-\epsilon$), we cannot assume $p_A = p_C$, so we don't pool.
We sum the individual variances (using plug-in estimates for each group):
$$ \widehat{\text{SE}} = \sqrt{\frac{\hat{p}_A(1-\hat{p}_A)}{n_A} + \frac{\hat{p}_C(1-\hat{p}_C)}{n_C}} $$
Ideally, the true $p_A$ and $p_C$ should be used, but we don't know them, so we substitute $\hat{p}_A$ and $\hat{p}_C$.
This works but can be inaccurate if sample sizes are small or rates are at extremes.
# Compute standard errors
pooled_proportion = (xC_observed + xX_observed) / (nC + nX)
wald_pooled_SE = (pooled_proportion * (1 - pooled_proportion) * (1/nC + 1/nX))**0.5
wald_unpooled_SE = ((hatpC_observed * (1 - hatpC_observed) / nC) +
                    (hatpA_observed * (1 - hatpA_observed) / nX))**0.5
print("Standard Error Estimates:")
print(f" Wald Pooled SE: {wald_pooled_SE:.4f}")
print(f" Wald Unpooled SE: {wald_unpooled_SE:.4f}")
print(f"\n  → Using Unpooled SE for non-inferiority test")
Standard Error Estimates:
  Wald Pooled SE: 0.0104
  Wald Unpooled SE: 0.0106

  → Using Unpooled SE for non-inferiority test
Computing the p-ValueΒΆ
Using the "Boundary" as the MeanΒΆ
The null hypothesis for non-inferiority is technically an inequality:
$$ H_0: E[\Delta] \le -\epsilon $$
To get a single distribution to work with, we use the boundary value as the mean:
$$ \mu = E[\Delta] = -\epsilon $$
Why?
- This is the most conservative test
- Any distribution centered lower (more in favor of $H_0$) would give an even smaller right-tail probability
- Any distribution centered higher would be outside $H_0$
Under $H_0$, we model $\hat{\Delta}$ as:
$$ \hat{\Delta} \sim N(\mu, \sigma) $$
with
$$ \mu = -\epsilon, \qquad \sigma = SE $$
The p-ValueΒΆ
The p-value is the probability (under $H_0$) of observing a result as extreme or more extreme than what we got:
$$ p\text{-value} = P_{H_0}\big[\hat{\Delta} \ge \hat{\Delta}_{\text{obs}}\big] = \int_{\hat{\Delta}_{\text{obs}}}^{+\infty} \frac{1}{\sqrt{2\pi}\,\sigma} \exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)\,dx $$
Using the standard normal CDF $\Phi$:
$$ p\text{-value} = 1 - \Phi\!\left(\frac{\hat{\Delta}_{\text{obs}}-\mu}{\sigma}\right) $$
Critical ValueΒΆ
The critical value $c$ is the smallest observed difference that would lead to rejection at level $\alpha$:
$$ c = \mu + \sigma \,\Phi^{-1}(1 - \alpha) $$
Any observed $\hat{\Delta}_{\text{obs}} \ge c$ yields $p\text{-value} \le \alpha$ and thus rejects $H_0$.
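To see this duality concretely, here is a minimal numeric check: an observed difference sitting exactly at the critical value $c$ yields a p-value of exactly $\alpha$. The SE is hard-coded to the rounded unpooled estimate computed above.

```python
from scipy.stats import norm

epsilon, alpha = 0.02, 0.05
SE = 0.0106                              # unpooled SE, rounded from the estimate above

# Critical value from the formula c = mu + sigma * Phi^{-1}(1 - alpha)
c = -epsilon + SE * norm.ppf(1 - alpha)

# An observation exactly at c has a right-tail probability of exactly alpha
p_at_c = norm.sf(c, loc=-epsilon, scale=SE)
print(f"c = {c:.4f}, p-value at c = {p_at_c:.4f}")
```

This reproduces the critical value of about -0.0026 that the next cell computes with `norm.isf`.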
# Compute p-value and critical value
SE_H0 = wald_unpooled_SE
mu_H0 = -epsilon # mean under boundary hypothesis
sigma_H0 = SE_H0 # standard deviation
# p-value: P(Delta >= Delta_obs | H0)
p_value = norm.sf(hatDelta_observed, loc=mu_H0, scale=sigma_H0)
# Critical value for alpha = 0.05
critical_value = norm.isf(alpha, loc=mu_H0, scale=sigma_H0)
print("="*80)
print("NHST RESULTS")
print("="*80)
print(f"\np-value: {p_value:.4f} ({p_value*100:.2f}%)")
print(f"Significance level (α): {alpha:.4f} ({alpha*100:.2f}%)")
print(f"Critical value: {critical_value:.4f}")
print(f"Observed difference: {hatDelta_observed:.4f}")
if p_value <= alpha:
    print(f"\n✅ REJECT H₀: p-value ({p_value:.4f}) ≤ α ({alpha})")
    print(f"   Conclusion: Variant is non-inferior (at the {(1-alpha)*100:.0f}% confidence level)")
else:
    print(f"\n❌ FAIL TO REJECT H₀: p-value ({p_value:.4f}) > α ({alpha})")
    print(f"   Conclusion: Cannot determine if variant is non-inferior")
    print(f"   → Result is INCONCLUSIVE with current sample size")
    print(f"\n   The p-value of {p_value*100:.1f}% is much larger than the 5% threshold.")
    print(f"   This means the observed data is quite likely under H₀.")
    print(f"   NHST provides no actionable guidance in this situation.")
print(f"\n{'='*80}")
================================================================================
NHST RESULTS
================================================================================

p-value: 0.4575 (45.75%)
Significance level (α): 0.0500 (5.00%)
Critical value: -0.0026
Observed difference: -0.0189

❌ FAIL TO REJECT H₀: p-value (0.4575) > α (0.05)
   Conclusion: Cannot determine if variant is non-inferior
   → Result is INCONCLUSIVE with current sample size

   The p-value of 45.8% is much larger than the 5% threshold.
   This means the observed data is quite likely under H₀.
   NHST provides no actionable guidance in this situation.

================================================================================
# Visualize the hypothesis test
fig, ax = plot_gaussian_hypothesis_test(
    mu_H0=mu_H0,
    sigma_H0=sigma_H0,
    observed_value=hatDelta_observed,
    alpha=alpha,
    epsilon=epsilon
)
plt.show()
print(f"\n📊 The plot shows:")
print(f"   • Null distribution centered at -ε = {mu_H0:.4f}")
print(f"   • Critical value (red line) at {critical_value:.4f}")
print(f"   • Observed difference (blue line) at {hatDelta_observed:.4f}")
print(f"   • Right-tail area (p-value) = {p_value:.4f} ({p_value*100:.1f}%)")
print(f"\n   Since p-value ({p_value*100:.1f}%) >> α ({alpha*100:.0f}%), we cannot reject H₀")
print(f"   The observed difference is not far enough to the right to be convincing.")
📊 The plot shows:
   • Null distribution centered at -ε = -0.0200
   • Critical value (red line) at -0.0026
   • Observed difference (blue line) at -0.0189
   • Right-tail area (p-value) = 0.4575 (45.8%)

   Since p-value (45.8%) >> α (5%), we cannot reject H₀
   The observed difference is not far enough to the right to be convincing.
Alternative z-Score Formulation¶
Another common way to compute the p-value is to standardize the observed statistic:
$$ Z_{\mathrm{NI}} = \frac{\hat{\Delta} - E[\Delta]_{H_{\text{boundary}}}}{SE} = \frac{\hat{\Delta} - (-\epsilon)}{SE} = \frac{\hat{\Delta} + \epsilon}{SE} $$
Under $H_0$, $Z_{\mathrm{NI}}$ follows approximately a standard normal $N(0,1)$.
The p-value is the right-tail probability:
$$ p\text{-value} = P[Z \ge Z_{\mathrm{NI}}] = \int_{Z_{\mathrm{NI}}}^{+\infty} \frac{1}{\sqrt{2\pi}}\,e^{-z^2/2}\,dz $$
This gives the same p-value, just a different mathematical framing.
# z-score formulation
z_ni = (hatDelta_observed + epsilon) / SE_H0
p_zni = norm.sf(z_ni)
print(f"z-score formulation:")
print(f"  z_NI = (Δ_obs + ε) / SE = {z_ni:.4f}")
print(f"  p-value = {p_zni:.4f}")
print(f"\n  → Same result as before (as expected)")
z-score formulation:
  z_NI = (Δ_obs + ε) / SE = 0.1067
  p-value = 0.4575

  → Same result as before (as expected)
Type I Error (False Positive)¶
In this NHST setup, α represents the false positive rate:
- Type I Error: Rejecting $H_0$ when it is actually true
- In non-inferiority testing: concluding "no unacceptable degradation" when there is degradation
This conditional probability is:
$$ P(\text{Reject } H_0 \mid H_0 \text{ is true}) = \alpha $$
By setting $\alpha = 0.05$, we accept a 5% risk of incorrectly claiming non-inferiority.
Important: This is a frequentist definition:
- If we ran the experiment many times, we would incorrectly reject ~5% of the time
- It does not assign any probability to the current decision
- It says nothing about the "effect size" or how much better/worse the variant is
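This frequentist guarantee can be checked by simulation: generate many experiments whose true difference sits exactly at the boundary $-\epsilon$ and count how often the z-test rejects. A sketch, with sample sizes mirroring this experiment (round numbers, not the exact observed counts):

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(0)
alpha, epsilon = 0.05, 0.02
pC, nC, nA = 0.709, 32_106, 2_022
pA = pC - epsilon                 # boundary of H0: true degradation exactly epsilon

n_sims = 20_000
hat_pC = rng.binomial(nC, pC, n_sims) / nC
hat_pA = rng.binomial(nA, pA, n_sims) / nA

# Unpooled plug-in SE per simulated experiment, then the one-sided z-test decision
se = np.sqrt(hat_pC * (1 - hat_pC) / nC + hat_pA * (1 - hat_pA) / nA)
z = (hat_pA - hat_pC + epsilon) / se
reject_rate = np.mean(z >= norm.ppf(1 - alpha))

print(f"Empirical false positive rate: {reject_rate:.3f}")  # close to alpha = 0.05
```

The empirical rejection rate lands near 5%, as the frequentist definition of α promises, but note again that this is a long-run frequency, not a statement about any single decision.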
Type II Error (False Negative), Power, and Sample Size¶
The false negative (Type II error, β) is failing to reject $H_0$ when $H_1$ is actually true.
In non-inferiority testing:
- We fail the test even though the new UX is truly non-inferior
- This typically means we need more data to detect the effect
Choosing an Effect Size Under $H_1$¶
To compute Type II error, we must choose an expected value for $\Delta$ under $H_1$.
Common choice: the minimum effect size we care to detect, often $E[\Delta] = 0$ (no difference):
- If the variant is truly "no worse" (Δ = 0), the test should reject $H_0$ most of the time
- This is a business decision: "How small of a difference do we need to detect?"
Modeling Under $H_1$¶
If we assume the variant is truly no worse (Δ = 0), we can pool samples:
$$ SE_{H_1} = \sqrt{\hat{p}_{\mathrm{pool}} (1-\hat{p}_{\mathrm{pool}}) \left(\tfrac{1}{n_C}+\tfrac{1}{n_A}\right)} $$
We compare this alternative distribution (mean = 0, std = $SE_{H_1}$) to the critical value set by α.
Beta and Power¶
β (Type II error) = probability of failing to reject $H_0$ when $H_1$ is true
- Area of $H_1$ distribution to the left of the critical value
Power = $1-\beta$ = probability of correctly rejecting $H_0$ when variant is truly non-inferior
- "If the property we care about is really there, how often can we detect it?"
- In ML terms: recall or sensitivity
Typical target: Power = 80% (so β = 20%)
# Compute power under H1 (assuming true difference = 0)
SE_H1 = wald_pooled_SE
mu_H1 = 0 # Assume no true difference
sigma_H1 = SE_H1
# Beta = P(Delta < critical_value | H1 is true)
beta = norm.cdf(critical_value, loc=mu_H1, scale=sigma_H1)
power = 1 - beta
print("="*80)
print("POWER ANALYSIS")
print("="*80)
print(f"\nAssumption under H₁: True difference = 0 (no degradation)")
print(f"\nType II Error (β): {beta:.4f} ({beta*100:.2f}%)")
print(f"Power (1 - β): {power:.4f} ({power*100:.2f}%)")
print(f"\nInterpretation:")
if power >= 0.80:
    print(f"  ✅ Power ≥ 80%: Test is adequately powered")
else:
    print(f"  ❌ Power < 80%: Test is SEVERELY UNDERPOWERED")
    print(f"  → Only {power*100:.1f}% chance of detecting non-inferiority")
    print(f"  → {beta*100:.1f}% chance of false negative (missing a truly non-inferior variant)")
    print(f"  → Need MUCH larger sample size for reliable conclusions")
print(f"\n{'='*80}")
================================================================================
POWER ANALYSIS
================================================================================

Assumption under H₁: True difference = 0 (no degradation)

Type II Error (β): 0.4022 (40.22%)
Power (1 - β): 0.5978 (59.78%)

Interpretation:
  ❌ Power < 80%: Test is SEVERELY UNDERPOWERED
  → Only 59.8% chance of detecting non-inferiority
  → 40.2% chance of false negative (missing a truly non-inferior variant)
  → Need MUCH larger sample size for reliable conclusions

================================================================================
# Visualize Type II error analysis
fig, ax = plot_type_ii_error_analysis(
    mu_H1=mu_H1,
    sigma_H1=sigma_H1,
    critical_value=critical_value,
    hatDelta_observed=hatDelta_observed,
    epsilon=epsilon,
    beta=beta,
    power=power
)
plt.show()
print(f"\n📊 The plot shows:")
print(f"   • H₀ distribution (red) centered at -ε = {mu_H0:.4f}")
print(f"   • H₁ distribution (green) centered at 0 (no difference)")
print(f"   • Critical value at {critical_value:.4f}")
print(f"   • β (orange area) = {beta:.4f} = probability of missing a non-inferior variant")
print(f"   • Power (green area) = {power:.4f} = probability of correctly detecting non-inferiority")
print(f"\n   The two distributions overlap substantially, showing why the test is underpowered.")
📊 The plot shows:
   • H₀ distribution (red) centered at -ε = -0.0200
   • H₁ distribution (green) centered at 0 (no difference)
   • Critical value at -0.0026
   • β (orange area) = 0.4022 = probability of missing a non-inferior variant
   • Power (green area) = 0.5978 = probability of correctly detecting non-inferiority

   The two distributions overlap substantially, showing why the test is underpowered.
Required Sample Size for Target Power¶
If we want to achieve a target power (commonly 80%, so β = 0.2), we can solve for the required sample size.
The relationship:
- Larger $n$ → smaller $SE$ → distributions separate more → higher power
This is the standard sample size calculation for planning an A/B test:
- Fix α (e.g., 0.05)
- Choose the minimum effect size of interest (e.g., Δ = 0 for non-inferiority)
- Set desired power (e.g., 80%)
- Solve for $n_C$ and $n_A$ to achieve that power
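The helper `compute_sample_size_non_inferiority` used below comes from a local module whose source is not shown here. As a cross-check, the standard closed-form approximation for a one-sided non-inferiority test with true difference 0 and equal allocation can be written directly (this is an assumption about what the helper computes, not its verified source):

```python
import math
from scipy.stats import norm

p = 22772 / 32106              # control conversion rate; also assumed for the variant (Delta = 0)
epsilon, alpha, power = 0.02, 0.05, 0.80

z_alpha = norm.ppf(1 - alpha)  # one-sided critical z
z_beta = norm.ppf(power)

# Per-group n so that the H0 (mean -epsilon) and H1 (mean 0) sampling
# distributions are separated by z_alpha + z_beta standard errors
n_per_group = math.ceil((z_alpha + z_beta) ** 2 * 2 * p * (1 - p) / epsilon ** 2)
print(f"Required per group: {n_per_group:,}")  # → 6,375
```

This closed form reproduces the 6,375-per-group figure printed by the cell below.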
# Compute required sample size for 80% power
print("="*80)
print("SAMPLE SIZE CALCULATION FOR NON-INFERIORITY TEST")
print("="*80)
# Parameters
p_control = control_group_conversion_rate
epsilon_val = epsilon
alpha_val = alpha
target_power = 0.80
print(f"\nParameters:")
print(f" Control conversion rate: {p_control:.2%}")
print(f"  Non-inferiority margin (ε): {epsilon_val:.2%}")
print(f"  Significance level (α): {alpha_val:.2%}")
print(f"  Target power: {target_power:.2%}")
print(f"  Assumed true difference under H₁: 0 (no difference)")
# Equal allocation (1:1)
result_equal = compute_sample_size_non_inferiority(
    p_control=p_control,
    epsilon=epsilon_val,
    alpha=alpha_val,
    target_power=target_power,
    h1_effect_size=0.0,
    allocation_ratio=1.0
)
print(f"\n{'='*80}")
print("EQUAL ALLOCATION (1:1 - Control:Variant)")
print(f"{'='*80}")
print(f"Required sample size per group: {result_equal['n_variant']:,}")
print(f" Control: {result_equal['n_control']:,}")
print(f" Variant: {result_equal['n_variant']:,}")
print(f" Total: {result_equal['n_total']:,}")
print(f"\nAchieved power: {result_equal['power_achieved']:.4f} ({result_equal['power_achieved']*100:.1f}%)")
print(f"\n{'='*80}")
print("COMPARISON WITH CURRENT EXPERIMENT")
print(f"{'='*80}")
print(f"\nCurrent sample sizes:")
print(f" Control: {nC:,}")
print(f" Variant C: {nX:,}")
print(f" Observed power: {power:.4f} ({power*100:.1f}%)")
print(f"\nTo achieve 80% power:")
print(f" Required: {result_equal['n_variant']:,} per group")
print(f" Current: {nX:,} per group")
increase_factor = result_equal['n_variant'] / nX
print(f" Increase needed: {increase_factor:.1f}x more samples")
print(f"\n{'='*80}")
print("💡 KEY INSIGHT: WHY NHST FAILS WITH SMALL SAMPLES")
print(f"{'='*80}")
print(f"\nWith current sample (n={nX:,}):")
print(f"  • Power is only {power*100:.1f}% (severely underpowered)")
print(f"  • p-value = {p_value:.4f} >> α = {alpha} (cannot reject H₀)")
print(f"  • Result: INCONCLUSIVE - no actionable guidance")
print(f"\nNeed n≈{result_equal['n_variant']:,} per group for reliable conclusions:")
print(f"  • That's {increase_factor:.1f}x more data")
print(f"  • Could take weeks or months to collect")
print(f"  • Impractical for rapid product iteration")
print(f"\n👉 This is why NHST is unsuitable for:")
print(f"  ❌ Early-stage feature launches with limited traffic")
print(f"  ❌ Risk-averse traffic allocation (2-5% to variants)")
print(f"  ❌ Fast decision-making in product development")
print(f"\n{'='*80}")
================================================================================
SAMPLE SIZE CALCULATION FOR NON-INFERIORITY TEST
================================================================================

Parameters:
  Control conversion rate: 70.93%
  Non-inferiority margin (ε): 2.00%
  Significance level (α): 5.00%
  Target power: 80.00%
  Assumed true difference under H₁: 0 (no difference)

================================================================================
EQUAL ALLOCATION (1:1 - Control:Variant)
================================================================================
Required sample size per group: 6,375
  Control: 6,375
  Variant: 6,375
  Total: 12,750

Achieved power: 0.8000 (80.0%)

================================================================================
COMPARISON WITH CURRENT EXPERIMENT
================================================================================

Current sample sizes:
  Control: 32,106
  Variant C: 2,022
  Observed power: 0.5978 (59.8%)

To achieve 80% power:
  Required: 6,375 per group
  Current: 2,022 per group
  Increase needed: 3.2x more samples

================================================================================
💡 KEY INSIGHT: WHY NHST FAILS WITH SMALL SAMPLES
================================================================================

With current sample (n=2,022):
  • Power is only 59.8% (severely underpowered)
  • p-value = 0.4575 >> α = 0.05 (cannot reject H₀)
  • Result: INCONCLUSIVE - no actionable guidance

Need n≈6,375 per group for reliable conclusions:
  • That's 3.2x more data
  • Could take weeks or months to collect
  • Impractical for rapid product iteration

👉 This is why NHST is unsuitable for:
  ❌ Early-stage feature launches with limited traffic
  ❌ Risk-averse traffic allocation (2-5% to variants)
  ❌ Fast decision-making in product development

================================================================================
Summary: NHST Limitations with Real Data¶
What NHST Gave Us¶
With our real experiment data (n=2,022 for Variant C):
| Metric | Value | Meaning |
|---|---|---|
| p-value | 45.8% | >> 5% threshold |
| Decision | Fail to reject H₀ | INCONCLUSIVE |
| Power | 59.8% | Severely underpowered |
| Sample size needed | ~6,375 per group | Current 2,022 insufficient |
| Actionable guidance | NONE | Cannot make decision |
What NHST Cannot Tell Us¶
❌ Probability variant is non-inferior: NHST gives P(data | H₀), not P(H₀ | data)
❌ Actionable guidance: "Cannot reject" provides no direction
❌ Quantified confidence: No probability the variant is acceptable
❌ Expected value for decisions: Cannot compute risk-adjusted value
❌ Continuous monitoring: Must wait for predetermined sample size
Why NHST Fails for Modern Product Development¶
The fundamental mismatch:
| Product Reality | NHST Requirement |
|---|---|
| Small samples (2-5% traffic) | Large samples (many multiples more) |
| Fast decisions (days) | Long wait (weeks/months) |
| Multiple variants (3-5) | Complex corrections needed |
| Unbalanced allocation | Loses efficiency |
| Continuous monitoring | Forbidden (p-hacking) |
| Actionable probabilities | Binary reject/fail |
The Core ProblemΒΆ
NHST was designed for:
- Large, controlled experiments (clinical trials with thousands of patients)
- Fixed sample sizes (planned in advance, no peeking)
- Single primary comparison (treatment vs. placebo)
- Asymmetric questions ("Is drug better than nothing?")
Modern product development needs:
- Small, iterative experiments (limited traffic to minimize risk)
- Flexible monitoring (check anytime, stop early if clear)
- Multiple comparisons (3-5 variants simultaneously)
- Symmetric questions ("Which variant is best?")
What We Actually NeedΒΆ
For the question "Is Variant C non-inferior?" we want:
β P(variant is non-inferior | data) β direct probability
β Works with small samples β uses prior knowledge
β Actionable output β quantified confidence for decision-making
β Expected value computation β risk-adjusted decisions
β Continuous monitoring β check anytime without penalties
β Bayesian methods provide exactly this.
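As a preview of that Bayesian alternative, here is a sketch using this experiment's counts with uniform Beta(1, 1) priors. The uniform prior is an illustrative choice (the Bayesian treatment proper would justify the prior), and the non-inferiority probability is estimated by Monte Carlo:

```python
import numpy as np

rng = np.random.default_rng(7)
epsilon = 0.02

# Beta-Binomial posterior for each conversion rate with a uniform Beta(1, 1) prior:
# Beta(1 + conversions, 1 + failures)
post_C = rng.beta(1 + 22772, 1 + 32106 - 22772, size=100_000)
post_A = rng.beta(1 + 1396, 1 + 2022 - 1396, size=100_000)

# Direct, decision-ready probability of non-inferiority
prob = np.mean(post_A - post_C > -epsilon)
print(f"P(variant C is non-inferior | data) ≈ {prob:.3f}")
```

Even with the same small sample, this yields an actual probability (roughly a coin flip here) that can feed an expected-value calculation, rather than a binary "fail to reject".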
ConclusionΒΆ
With our real experiment data:
- NHST conclusion: "Cannot determine if variant is non-inferior. p-value is 45%, far too high. Need much more data. Come back in a few weeks."
- Business impact: Product team blocked, cannot iterate, cannot scale successful features
- Root cause: NHST's mathematical framework requires large samples to overcome uncertainty
The math in this notebook is correct: NHST faithfully implements its framework.
The framework itself is the problem: it's mismatched to modern product development constraints.
This is why Bayesian methods, which incorporate prior knowledge and provide direct probabilistic answers, are superior for A/B testing in web/mobile applications.