From Manual Bayesian A/B Tests to Automated Thompson Sampling¶
A practitioner's guide to modern A/B testing for web and mobile product teams
Presentation Overview¶
- Classical vs. Bayesian Statistics
- Why Bayesian is Better for CX / pricing A/B tests
- Non-Inferiority Test — NHST fails, Bayesian succeeds
- Select Best Variant — direct probability
- Corporate Iteration Speed — the deployment bottleneck
- Thompson Sampling — the fully automated solution
Part 1: Classical vs. Bayesian Statistics¶
The Fundamental Question¶
| Framework | Question | Notation |
|---|---|---|
| NHST | "How likely is this data, assuming $H_0$?" | $P(\text{data} \mid \theta)$ |
| Bayesian | "How likely is the hypothesis, given the data?" | $P(\theta \mid \text{data})$ |
NHST = smoke detector (binary alarm); Bayesian = thermometer (continuous reading)
NHST in a Nutshell¶
- Assume $H_0$ (e.g., "the new experience degrades conversion")
- Compute a test statistic from the data
- If result is "unlikely enough" (p-value < 0.05), reject $H_0$
Key Caveats¶
- Rejecting $H_0$ does not prove the alternative
- The 5% threshold is a convention, not a law of nature
- No probability of being correct — just "unlikely" or "not unlikely"
- Cannot compute expected values for business decisions
Bayesian in a Nutshell¶
- Pick a prior distribution
- Observe data
- Apply Bayes' theorem → get posterior
- Posterior becomes the prior for the next update
$$\boxed{P(\theta \mid \text{data}) = \frac{P(\text{data} \mid \theta) \cdot P(\theta)}{P(\text{data})}}$$
Full probability distribution — compute anything you need: point estimates, credible intervals, P(better than threshold), expected loss
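As a minimal sketch of that workflow (the Beta(2, 2) prior, the 105/150 data, and the 68% threshold here are illustrative numbers, not values from the experiment later in this deck):
# --- Minimal Beta-Binomial sketch (illustrative numbers only) ---
from scipy.stats import beta as beta_dist
alpha0, beta0 = 2, 2                               # weakly informative prior
k, n = 105, 150                                    # observed conversions / trials
posterior = beta_dist(alpha0 + k, beta0 + n - k)   # conjugate update
print(f"Posterior mean: {posterior.mean():.2%}")
print(f"95% credible interval: [{posterior.ppf(0.025):.2%}, {posterior.ppf(0.975):.2%}]")
print(f"P(rate > 68%): {posterior.sf(0.68):.2%}")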
Part 2: Why Bayesian is Better¶
NHST Problems for Product A/B Tests¶
| Problem | Impact |
|---|---|
| Small samples | At launch, n=150 per variant. NHST: "inconclusive." |
| p-value hacking | Can't peek at results without inflating false positives |
| Unbalanced samples | 7,000 control vs. 150 variant — NHST loses power |
| Binary output | "Reject" or "fail to reject" — no probabilities |
Where Bayesian Excels¶
| Advantage | Detail |
|---|---|
| Small samples | Prior knowledge + data = meaningful conclusions |
| Unbalanced allocation | Each variant analyzed independently |
| Many variants | No multiple comparison penalties |
| Actionable output | "47% chance B is best" — not just reject/fail |
| Continuous monitoring | Check anytime, no p-hacking concerns |
FDA Embraces Bayesian (January 2026)¶
The FDA issued draft guidance: "Use of Bayesian Methodology in Clinical Trials of Drugs and Biological Products"
If the most conservative regulator in the world is adopting Bayesian methods, the case for using them in product A/B testing is even stronger.
When does it not matter? Once you have millions of data points and can wait weeks, both approaches converge. The advantage is in the early, high-uncertainty phase.
Part 3: Non-Inferiority Test¶
The Setup¶
- Existing flow: ~71% completion rate
- Adding passkey creation (extra pages/clicks)
- Question: Does the new experience cause unacceptable degradation?
- Non-inferiority margin: $\epsilon$ = 2%
| Group | Sample size | Conversion rate |
|---|---|---|
| Control | 32,106 | 70.9% |
| Variant A | 4,625 | 70.1% |
| Variant B | 2,100 | 68.2% |
| Variant C | 2,022 | 69.0% |
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, beta as beta_dist
# --- Experiment Data ---
nC = 32106
xC = 22772
control_rate = xC / nC
variants = {
'A': {'n': 4625, 'x': 3244},
'B': {'n': 2100, 'x': 1433},
'C': {'n': 2022, 'x': 1396}
}
epsilon = 0.02 # 2% non-inferiority margin
print(f"Control: n={nC:,}, rate={control_rate:.2%}")
for name, d in variants.items():
r = d['x'] / d['n']
print(f"Variant {name}: n={d['n']:,}, rate={r:.2%}")
print(f"\nNon-inferiority margin (epsilon): {epsilon:.0%}")
print(f"Non-inferiority threshold: {control_rate - epsilon:.2%}")
Control: n=32,106, rate=70.93%
Variant A: n=4,625, rate=70.14%
Variant B: n=2,100, rate=68.24%
Variant C: n=2,022, rate=69.04%

Non-inferiority margin (epsilon): 2%
Non-inferiority threshold: 68.93%
NHST Result: Inconclusive¶
p-value ~ 0.45 — far from the 5% threshold
NHST says: "We can't say anything."
With only n=2,022 in this arm, the test has ~60% power, well below the conventional 80% target.
# NHST non-inferiority test on Variant C
nX = variants['C']['n']
xX = variants['C']['x']
hatpC = xC / nC
hatpX = xX / nX          # observed rate for the variant under test (C)
hatDelta = hatpX - hatpC
# Unpooled SE (appropriate for non-inferiority)
SE = np.sqrt(hatpC * (1 - hatpC) / nC + hatpX * (1 - hatpX) / nX)
mu_H0 = -epsilon
# p-value: right tail P(Delta >= observed | H0)
p_value = norm.sf(hatDelta, loc=mu_H0, scale=SE)
# Power analysis
pooled_p = (xC + xX) / (nC + nX)
SE_H1 = np.sqrt(pooled_p * (1 - pooled_p) * (1/nC + 1/nX))
critical_value = norm.isf(0.05, loc=mu_H0, scale=SE)        # smallest observed Delta that rejects H0
power = 1 - norm.cdf(critical_value, loc=0, scale=SE_H1)    # P(reject H0 | true Delta = 0)
print("NHST Non-Inferiority Test (Variant C)")
print("=" * 50)
print(f"Observed difference: {hatDelta:.4f}")
print(f"Standard error: {SE:.4f}")
print(f"p-value: {p_value:.4f}")
print(f"Power: {power:.1%}")
print()
if p_value <= 0.05:
print("Result: REJECT H0 — non-inferiority established")
else:
print("Result: FAIL TO REJECT — inconclusive")
print(f" (Power is only {power:.1%} — test is severely underpowered)")
print(f" NHST cannot help us with n={nX} for this variant.")
NHST Non-Inferiority Test (Variant C)
==================================================
Observed difference: -0.0189
Standard error: 0.0106
p-value: 0.4575
Power: 59.8%

Result: FAIL TO REJECT — inconclusive
  (Power is only 59.8% — test is severely underpowered)
  NHST cannot help us with n=2022 for this variant.
Bayesian: Actionable Answer¶
Weakly informative prior centered on expected performance:
$$\text{Beta}(\alpha_0, \beta_0) + \text{Data} \Rightarrow \text{Beta}(\alpha_0 + k, \; \beta_0 + n - k)$$
Then directly compute: $P(\text{variant} > \text{control} - \epsilon)$
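The `test_non_inferiority_weakly_informative` helper used below comes from this repo's `bayesian` module. As a rough sketch of what such a test boils down to for a single variant (this assumes the data cell above has run; treating `alpha_prior_strength` as a total pseudo-observation count is an assumption, not the module's documented behavior):
# --- Sketch: Bayesian non-inferiority check for one variant (assumes the data cell above) ---
prior_strength = 20                             # total pseudo-observations (assumption)
prior_mean = control_rate - 0.01                # centered on ~1% expected degradation
a0, b0 = prior_mean * prior_strength, (1 - prior_mean) * prior_strength
nA, xA = variants['A']['n'], variants['A']['x']
posterior = beta_dist(a0 + xA, b0 + nA - xA)    # conjugate Beta-Binomial update
threshold = control_rate - epsilon              # non-inferiority threshold
print(f"P(Variant A > {threshold:.2%}) = {posterior.sf(threshold):.2%}")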
from bayesian import test_non_inferiority_weakly_informative
from plotting_utils import plot_weakly_informative_prior_with_variants
# Run Bayesian non-inferiority test on all variants
expected_degradation = 0.01 # Domain knowledge: adding clicks may degrade by ~1%
results_ni = test_non_inferiority_weakly_informative(
n_control=nC,
x_control=xC,
variants_data=variants,
epsilon=epsilon,
expected_degradation=expected_degradation,
alpha_prior_strength=20, # Weak prior (high entropy)
threshold=0.95
)
print("Bayesian Non-Inferiority Test Results")
print("=" * 60)
print(f"Prior centered at: {control_rate - expected_degradation:.2%}")
print(f"Test threshold: {control_rate - epsilon:.2%}")
print()
for name, res in results_ni.items():
status = "NON-INFERIOR" if res['is_non_inferior'] else "NOT NON-INFERIOR"
observed_rate = variants[name]['x'] / variants[name]['n']
print(f"Variant {name}: {status}")
print(f" Observed rate: {observed_rate:.2%}, Posterior mean: {res['variant_rate']:.2%}")
print(f" P(variant > threshold): {res['probability']:.2%}")
print()
# Visualize
fig, ax = plot_weakly_informative_prior_with_variants(results_ni)
plt.show()
Bayesian Non-Inferiority Test Results
============================================================
Prior centered at: 69.93%
Test threshold: 68.93%

Variant A: NON-INFERIOR
  Observed rate: 70.14%, Posterior mean: 70.14%
  P(variant > threshold): 96.38%

Variant B: NOT NON-INFERIOR
  Observed rate: 68.24%, Posterior mean: 68.26%
  P(variant > threshold): 25.54%

Variant C: NOT NON-INFERIOR
  Observed rate: 69.04%, Posterior mean: 69.05%
  P(variant > threshold): 55.12%
Takeaway: Same Data, Different Answers¶
| Method | Variant C Result | Actionable? |
|---|---|---|
| NHST | p ~ 0.45, "fail to reject" | No |
| Bayesian | P(non-inferior) = 55%, a concrete probability to weigh | Yes |
Selecting the Best Variant: Direct Probability¶
- Compute posterior Beta for each variant
- Monte Carlo: draw 100k samples from each
- Count how often each variant wins
Result: $P(A \text{ is best}) \approx 78\%$, $P(B) \approx 4\%$, $P(C) \approx 18\%$
No corrections needed. Scales to any number of variants.
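Underneath, `select_best_variant` (from this repo's `bayesian` module) amounts to the Monte Carlo count sketched here; this sketch assumes the data cell above has run, and the variable names are illustrative:
# --- Sketch: Monte Carlo estimate of P(variant is best) (assumes the data cell above) ---
rng = np.random.default_rng(0)
n_sims = 100_000
names = list(variants.keys())
# One column of posterior draws per variant, Beta(1 + x, 1 + n - x)
draws = np.column_stack([
    rng.beta(1 + variants[v]['x'], 1 + variants[v]['n'] - variants[v]['x'], size=n_sims)
    for v in names
])
wins = np.bincount(draws.argmax(axis=1), minlength=len(names))
for v, w in zip(names, wins):
    print(f"P({v} is best) = {w / n_sims:.2%}")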
from bayesian import select_best_variant
from plotting_utils import plot_multiple_posteriors_comparison
# Select the best variant using Monte Carlo simulation
selection = select_best_variant(
variants_data=variants,
alpha_prior=1, # Non-informative prior for fair comparison
beta_prior=1,
credible_level=0.95,
n_simulations=100000
)
# Display results
print("Probability Each Variant is Best")
print("=" * 50)
for name in ['A', 'B', 'C']:
prob = selection['probabilities'][name]
bar = '#' * int(prob * 50)
print(f" P({name} is best) = {prob:.2%} {bar}")
winner = selection['best_variant']
print(f"\nWinner: Variant {winner}")
print(f" Probability of being best: {selection['probabilities'][winner]:.2%}")
print(f" Posterior mean: {selection['posterior_means'][winner]:.2%}")
ci = selection['credible_intervals'][winner]
print(f" 95% Credible interval: [{ci[0]:.2%}, {ci[1]:.2%}]")
print(f" Expected loss: {selection['expected_loss'][winner]:.4f}")
# Visualize posterior distributions
posteriors = {}
for name, data in variants.items():
alpha_post = data['x'] + 1
beta_post = data['n'] - data['x'] + 1
posteriors[name] = {
'alpha': alpha_post,
'beta': beta_post,
'mean': alpha_post / (alpha_post + beta_post),
'ci_95': (beta_dist.ppf(0.025, alpha_post, beta_post),
beta_dist.ppf(0.975, alpha_post, beta_post))
}
fig, ax = plot_multiple_posteriors_comparison(
posteriors=posteriors,
control_group_conversion_rate=control_rate,
epsilon=epsilon
)
plt.show()
Probability Each Variant is Best
==================================================
  P(A is best) = 78.11% #######################################
  P(B is best) = 4.36% ##
  P(C is best) = 17.53% ########

Winner: Variant A
  Probability of being best: 78.11%
  Posterior mean: 70.13%
  95% Credible interval: [68.81%, 71.44%]
  Expected loss: 0.0014
The Solution: Automate the Decision Loop¶
Cost of delay: 2% better conversion x 10,000 users/day x 60 days = ~12,000 lost conversions
Instead of: Experiment → Analyze → Decide → Wait → Deploy
We get: Deploy all variants → Algorithm optimizes automatically
This is Thompson Sampling.
The Algorithm (5 Lines of Logic)¶
For each user:
- Sample from each posterior: $\theta_i \sim \text{Beta}(\alpha_i, \beta_i)$
- Choose variant with highest sample
- Show it to the user
- Observe conversion (0 or 1)
- Update posterior: $\alpha \mathrel{+}= r$, $\beta \mathrel{+}= (1-r)$
Early: wide posteriors → exploration. Later: narrow posteriors → exploitation.
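Stripped of all bookkeeping, the decision loop really is just a handful of lines; `observe_conversion` below is a hypothetical hook standing in for whatever records the user's outcome. The full simulation in the next cell wraps this same loop with traffic accounting and a periodic estimate of P(best).
# --- Core Thompson Sampling loop (sketch; observe_conversion is a hypothetical hook) ---
alpha = {'A': 1, 'B': 1, 'C': 1}   # Beta(1, 1) priors
beta = {'A': 1, 'B': 1, 'C': 1}

def choose_and_update(observe_conversion):
    samples = {v: np.random.beta(alpha[v], beta[v]) for v in alpha}  # 1. sample each posterior
    chosen = max(samples, key=samples.get)                           # 2. highest draw wins
    r = observe_conversion(chosen)                                   # 3-4. show variant, observe 0/1
    alpha[chosen] += r                                               # 5. conjugate update
    beta[chosen] += 1 - r
    return chosen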
# --- Thompson Sampling Simulation ---
np.random.seed(42)
true_rates = {
'A': 3244 / 4625, # ~70.1%
'B': 1433 / 2100, # ~68.2%
'C': 1396 / 2022, # ~69.0%
}
def run_thompson_sampling(true_rates, n_users):
"""Simulate Thompson sampling and return results."""
variants_list = list(true_rates.keys())
alpha = {v: 1 for v in variants_list}
beta = {v: 1 for v in variants_list}
n_shown = {v: 0 for v in variants_list}
n_conv = {v: 0 for v in variants_list}
total_conv = 0
history = {'user': [], 'prob_A': [], 'prob_B': [], 'prob_C': []}
for uid in range(n_users):
samples = {v: np.random.beta(alpha[v], beta[v]) for v in variants_list}
chosen = max(samples, key=samples.get)
converted = int(np.random.random() < true_rates[chosen])
alpha[chosen] += converted
beta[chosen] += (1 - converted)
n_shown[chosen] += 1
n_conv[chosen] += converted
total_conv += converted
if uid % 50 == 0:
mc = 10000
counts = {v: 0 for v in variants_list}
for _ in range(mc):
s = {v: np.random.beta(alpha[v], beta[v]) for v in variants_list}
counts[max(s, key=s.get)] += 1
history['user'].append(uid)
for v in variants_list:
history[f'prob_{v}'].append(counts[v] / mc)
return n_shown, n_conv, total_conv, history
def run_fixed_allocation(true_rates, n_users):
"""Simulate fixed equal allocation."""
variants_list = list(true_rates.keys())
n_shown = {v: 0 for v in variants_list}
n_conv = {v: 0 for v in variants_list}
total_conv = 0
for uid in range(n_users):
chosen = variants_list[uid % len(variants_list)]
converted = int(np.random.random() < true_rates[chosen])
n_shown[chosen] += 1
n_conv[chosen] += converted
total_conv += converted
return n_shown, n_conv, total_conv
n_users = 5000
# Run both strategies
ts_shown, ts_conv, ts_total, ts_history = run_thompson_sampling(true_rates, n_users)
fx_shown, fx_conv, fx_total = run_fixed_allocation(true_rates, n_users)
best = max(true_rates, key=true_rates.get)
optimal = n_users * true_rates[best]
print("THOMPSON SAMPLING vs FIXED ALLOCATION")
print("=" * 60)
print(f"{'':20s} {'Thompson':>12s} {'Fixed':>12s}")
print("-" * 60)
for v in ['A', 'B', 'C']:
ts_pct = 100 * ts_shown[v] / n_users
fx_pct = 100 * fx_shown[v] / n_users
print(f"Variant {v} traffic: {ts_pct:10.1f}% {fx_pct:10.1f}%")
print("-" * 60)
print(f"Total conversions: {ts_total:10d} {fx_total:10d}")
print(f"Conversion rate: {100*ts_total/n_users:10.2f}% {100*fx_total/n_users:10.2f}%")
print(f"Regret: {optimal - ts_total:10.0f} {optimal - fx_total:10.0f}")
print(f"\nThompson Sampling gained {ts_total - fx_total:.0f} extra conversions")
THOMPSON SAMPLING vs FIXED ALLOCATION
============================================================
Thompson Fixed
------------------------------------------------------------
Variant A traffic: 77.8% 33.3%
Variant B traffic: 14.4% 33.3%
Variant C traffic: 7.8% 33.3%
------------------------------------------------------------
Total conversions: 3469 3472
Conversion rate: 69.38% 69.44%
Regret: 38 35
Conversion difference (Thompson - fixed): -3
(With true rates this close, 5,000 users is within simulation noise; note the traffic already shifted to A.)
# Visualize: P(variant is best) over time
fig, ax = plt.subplots(figsize=(12, 5))
ax.plot(ts_history['user'], ts_history['prob_A'], label='P(A is best)', lw=2, color='#2ecc71')
ax.plot(ts_history['user'], ts_history['prob_B'], label='P(B is best)', lw=2, color='#e74c3c')
ax.plot(ts_history['user'], ts_history['prob_C'], label='P(C is best)', lw=2, color='#3498db')
ax.axhline(y=0.95, color='gray', ls='--', lw=1, alpha=0.5, label='95% threshold')
ax.set_xlabel('Number of Users', fontsize=12)
ax.set_ylabel('Probability of Being Best', fontsize=12)
ax.set_title('Thompson Sampling: Learning Which Variant is Best Over Time', fontsize=14, fontweight='bold')
ax.legend(loc='right', fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()
# Find when 95% confidence reached
for i, p in enumerate(ts_history['prob_A']):
if p >= 0.95:
print(f"Reached 95% confidence that A is best after ~{ts_history['user'][i]:,} users")
break
else:
print(f"Did not reach 95% confidence within {n_users:,} users (but traffic was already optimized)")
Reached 95% confidence that A is best after ~3,300 users
Key Benefits¶
- Dynamic traffic allocation — better variants get more traffic automatically
- Add variants anytime — new variants enter with Beta(1,1), compete immediately
- No stopping rules — algorithm keeps improving
- Minimizes regret — fewer users see inferior variants
Production Considerations¶
| Issue | Solution |
|---|---|
| Delayed feedback | Batch updates every 10-60 min |
| Rate drift | Exponential decay or sliding window |
| Scalability | $(\alpha, \beta)$ in distributed cache |
| Cold start | Weakly informative priors |
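A minimal sketch of how the first two rows of the table might look in code, assuming conversion and impression counts are accumulated per batch window; the 0.99 decay factor and the batch layout are illustrative choices, not a prescribed design:
# --- Sketch: batched posterior updates with exponential decay (illustrative) ---
DECAY = 0.99   # per-batch forgetting factor for drifting rates (illustrative)

def apply_batch(alpha, beta, batch):
    """Apply one batch of results. batch maps variant -> (conversions, impressions)
    accumulated over the batching window (e.g. 10-60 min)."""
    for v in alpha:
        # Shrink old pseudo-counts toward the Beta(1, 1) prior so stale evidence fades
        alpha[v] = 1 + DECAY * (alpha[v] - 1)
        beta[v] = 1 + DECAY * (beta[v] - 1)
        conv, shown = batch.get(v, (0, 0))
        alpha[v] += conv
        beta[v] += shown - conv
    return alpha, beta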
Summary¶
| Step | NHST | Bayesian / Automated |
|---|---|---|
| Non-inferiority | "Inconclusive" | "95%+ probability" |
| Best variant | Complex corrections | "A is best at ~78%" |
| Deploy winner | Weeks/months | Automatic |
| Add variant | Restart test | Add anytime |
Bottom Line¶
- Bayesian methods — work with small samples, actionable probabilities
- Thompson Sampling — automates the entire loop, minimizes regret
- Start simple — Beta-Binomial conjugacy, 5 lines of code
References¶
Jaynes, Probability Theory: The Logic of Science | Gelman et al., Bayesian Data Analysis | Russo et al. (2018), A Tutorial on Thompson Sampling | Ioannidis (2005), Why Most Published Research Findings Are False | FDA Draft Guidance (2026), Use of Bayesian Methodology in Clinical Trials of Drugs and Biological Products | Davidson-Pilon, Bayesian Methods for Hackers