Files
cs249r_book/labs/vol1/lab_15_responsible_engr.py
Vijay Janapa Reddi 6f5732558f feat: add complete first-draft labs for both volumes (33 Marimo labs)
Add all Vol1 (labs 01-16) and Vol2 (labs 01-17) interactive Marimo labs
as the first full first-pass implementation of the ML Systems curriculum labs.

Each lab follows the PROTOCOL 2-Act structure (35-40 min):
- Act I: Calibration with prediction lock → instruments → overlay
- Act II: Design challenge with failure states and reflection

Key pedagogical instruments introduced progressively:
- Vol1: D·A·M Triad, Iron Law, Memory Ledger, Roofline, Amdahl's Law,
  Little's Law, P99 Histogram, Compression Frontier, Chouldechova theorem
- Vol2: NVLink vs PCIe cliff, Bisection BW, Young-Daly T*, Parallelism Paradox,
  AllReduce ring vs tree, KV-cache model, Jevons Paradox, DP ε-δ tradeoff,
  SLO composition, Adversarial Pareto, two-volume synthesis capstone

All 35 staged files pass AST syntax verification (36/36 including lab_00).

Also includes:
- labs/LABS_SPEC.md: authoritative sub-agent brief for all lab conventions
- labs/core/style.py: expanded unified design system with semantic color tokens
2026-03-01 19:59:04 -05:00

1570 lines
74 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import marimo

# Version of marimo that generated this notebook (marimo file metadata).
__generated_with = "0.19.6"
# Full-width app layout so the wide instrument panels render edge to edge.
app = marimo.App(width="full")
# ─────────────────────────────────────────────────────────────────────────────
# LAB 15: THERE IS NO FREE FAIRNESS
#
# Chapter: Responsible Engineering (@sec-responsible-engineering)
# Core Invariant:
# Chouldechova's (2017) impossibility theorem — when base rates differ between
# groups, you CANNOT simultaneously achieve equal false positive rates, equal
# false negative rates, AND equal positive predictive value. Equal accuracy
# is not evidence of equal treatment.
#
# 2 Contexts: Cloud (H100) vs Mobile (Smartphone NPU)
#
# Act I — The Fairness Illusion (12–15 min)
# Stakeholder: Product Compliance Officer
# Scenario: Loan approval model, 85% accuracy on both groups. Legal says
# compliant. Advocates say unfair. Who is right?
# Prediction: What does equal accuracy actually guarantee?
# Instrument: Fairness metric explorer — sliders for base_rate_a, base_rate_b,
# model_threshold; per-group confusion matrices; FPR, FNR, PPV, equalized
# odds gap; prediction-vs-reality overlay.
#
# Act II — The Audit-Accuracy Tradeoff (20–25 min)
# Stakeholder: Engineering VP
# Scenario: Resume screening at scale. Choose fairness strategy + mitigation
# method + audit frequency. Design the deployment.
# Failure state: equalized_odds_gap > 10% AND context is high-stakes.
# Reflection: Why demographic parity can produce unfair outcomes.
#
# Design Ledger: chapter=15
# context, fairness_criterion, equalized_odds_gap, audit_cost_k,
# act1_prediction, act1_correct, act2_result, act2_decision,
# constraint_hit, regulatory_risk
# ─────────────────────────────────────────────────────────────────────────────
# ── CELL 0: SETUP (hide_code=False — leave visible) ───────────────────────────
@app.cell
def _():
    # Setup cell: shared imports, hardware/domain constants, and the
    # DesignLedger instance consumed by every downstream cell.
    import marimo as mo
    import sys
    import math
    from pathlib import Path
    import plotly.graph_objects as go
    import numpy as np
    # Make the repository root importable so `labs.core.*` resolves when this
    # lab is run as a standalone script (file sits two levels below the root).
    _root = Path(__file__).resolve().parents[2]
    if str(_root) not in sys.path:
        sys.path.insert(0, str(_root))
    from labs.core.state import DesignLedger
    from labs.core.style import COLORS, LAB_CSS, apply_plotly_theme
    # ── Hardware constants — plain floats, sources annotated ──────────────────
    H100_BW_GBS = 3350  # GB/s — H100 SXM5 HBM3e, NVIDIA spec
    H100_TFLOPS_FP16 = 1979  # TFLOPS FP16 — NVIDIA H100 datasheet
    H100_RAM_GB = 80  # GB HBM — NVIDIA H100 SXM5
    MOBILE_BW_GBS = 68  # GB/s — Apple A17-class smartphone NPU
    MOBILE_TOPS_INT8 = 35  # TOPS INT8 — Apple A17-class NPU
    MOBILE_RAM_GB = 8  # GB — typical flagship smartphone
    # ── Domain constants — responsible_engr.qmd scenario ─────────────────────
    # Baseline model accuracy on both groups (the "85% parity illusion" scenario)
    BASE_ACCURACY = 0.85  # overall accuracy on each group
    # EEOC 4/5ths (80%) rule: selection rate ratio < 0.8 triggers review.
    # Used here as an equalized-odds gap threshold for high-stakes deployment.
    DISPARATE_IMPACT_THRESHOLD_PP = 10.0  # pp — EEOC / OFCCP guidance for hiring
    # Audit cost model: cloud vs mobile context
    # Cloud: H100 batch re-evaluation, near-realtime, ~$2/hr compute
    CLOUD_AUDIT_COST_K_USD = 1.5  # $K per audit run (cloud, automated)
    # Mobile: delayed centralized audit, human review + compute
    MOBILE_AUDIT_COST_K_USD = 4.0  # $K per audit run (mobile, manual + compute)
    # Annual audit frequencies (runs/year), keyed by UI label
    AUDIT_FREQ_OPTIONS = {
        "Continuous (52×/yr)": 52,
        "Monthly (12×/yr)": 12,
        "Quarterly (4×/yr)": 4,
        "Annual (1×/yr)": 1,
    }
    ledger = DesignLedger()
    # Everything returned here becomes available to downstream cells by name.
    return (
        mo, go, np, math,
        ledger, COLORS, LAB_CSS, apply_plotly_theme,
        H100_BW_GBS, H100_TFLOPS_FP16, H100_RAM_GB,
        MOBILE_BW_GBS, MOBILE_TOPS_INT8, MOBILE_RAM_GB,
        BASE_ACCURACY, DISPARATE_IMPACT_THRESHOLD_PP,
        CLOUD_AUDIT_COST_K_USD, MOBILE_AUDIT_COST_K_USD,
        AUDIT_FREQ_OPTIONS,
    )
# ── CELL 1: HEADER ────────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, LAB_CSS, COLORS):
    # Lab masthead: title, framing hook, and badge row (acts, duration,
    # failure state, contexts). COLORS remains a cell input so this cell
    # re-renders when the design system changes.
    #
    # FIX: the last badge previously rendered
    # `{COLORS['Mobile'] and 'Cloud vs Mobile context'}` — an `and`-trick that
    # would leak the raw falsy value into the page if COLORS['Mobile'] were
    # ever empty, and raise KeyError if the key were missing. The badge text
    # is static, so it is rendered literally.
    mo.vstack([
        LAB_CSS,
        mo.Html("""
        <div style="background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%);
        padding: 36px 44px; border-radius: 16px; color: white;
        box-shadow: 0 8px 32px rgba(0,0,0,0.3);">
        <div style="font-size: 0.72rem; font-weight: 700; letter-spacing: 0.18em;
        color: #475569; text-transform: uppercase; margin-bottom: 10px;">
        Machine Learning Systems &middot; Volume I &middot; Lab 15
        </div>
        <h1 style="margin: 0 0 10px 0; font-size: 2.4rem; font-weight: 900;
        color: #f8fafc; line-height: 1.1; letter-spacing: -0.02em;">
        There Is No Free Fairness
        </h1>
        <p style="margin: 0 0 20px 0; font-size: 1.05rem; color: #94a3b8;
        max-width: 720px; line-height: 1.65;">
        Your model has <strong style="color:#f8fafc;">85% accuracy on both groups</strong>.
        Legal says compliant. Advocates say unfair. Someone is about to be proven wrong.
        Chouldechova (2017) settled this debate mathematically.
        </p>
        <div style="display: flex; gap: 12px; flex-wrap: wrap;">
        <span style="background: rgba(99,102,241,0.15); color: #a5b4fc;
        padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
        font-weight: 600; border: 1px solid rgba(99,102,241,0.25);">
        Act I: The Fairness Illusion &middot; Act II: Audit-Accuracy Tradeoff
        </span>
        <span style="background: rgba(16,185,129,0.15); color: #6ee7b7;
        padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
        font-weight: 600; border: 1px solid rgba(16,185,129,0.25);">
        35&ndash;40 min
        </span>
        <span style="background: rgba(203,32,45,0.15); color: #fca5a5;
        padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
        font-weight: 600; border: 1px solid rgba(203,32,45,0.25);">
        Disparate impact failure state active
        </span>
        <span style="background: rgba(204,85,0,0.15); color: #fdba74;
        padding: 5px 14px; border-radius: 20px; font-size: 0.8rem;
        font-weight: 600; border: 1px solid rgba(204,85,0,0.25);">
        Cloud vs Mobile context
        </span>
        </div>
        </div>
        """),
    ])
    return
# ── CELL 2: RECOMMENDED READING ───────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Pre-lab reading list: the three textbook sections this lab exercises.
    # Rendered as an info callout; the @sec-* anchors are Quarto cross-refs.
    mo.callout(mo.md("""
    **Recommended Reading** — Complete the following before this lab:
    - **@sec-responsible-engineering-fairness-metrics** — Demographic parity, equalized odds,
    calibration, and individual fairness: formal definitions and what each criterion
    requires from the underlying data distribution.
    - **@sec-responsible-engineering-impossibility** — Chouldechova (2017) impossibility
    theorem: when base rates differ, FPR equality, FNR equality, and PPV equality cannot
    all hold simultaneously. This is a mathematical proof, not a policy preference.
    - **@sec-responsible-engineering-audit-pipelines** — Continuous monitoring, audit
    frequency, mitigation methods, and the compute and accuracy costs of fairness
    enforcement at deployment scale.
    """), kind="info")
    return
# ── CELL 3: CONTEXT TOGGLE ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Deployment-context selector. The radio's option VALUES ("cloud"/"mobile")
    # drive cost/audit constants downstream; its default is the cloud option.
    context_toggle = mo.ui.radio(
        options={
            "Cloud (H100 — automated continuous audit)": "cloud",
            "Mobile (Smartphone NPU — delayed centralized audit)": "mobile",
        },
        value="Cloud (H100 — automated continuous audit)",
        label="Deployment context:",
        inline=True,
    )
    mo.vstack([
        mo.md("---"),
        mo.md(
            "**Select your deployment context.** "
            "Cloud enables near-realtime bias monitoring at low marginal cost. "
            "Mobile deployments face delayed feedback loops and higher per-audit cost."
        ),
        context_toggle,
    ])
    return (context_toggle,)
# ─────────────────────────────────────────────────────────────────────────────
# ACT I — THE FAIRNESS ILLUSION
# ─────────────────────────────────────────────────────────────────────────────
# ── CELL 4: ACT I SCENARIO ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act I scenario card: stakeholder message from the Compliance Officer,
    # styled with the shared "OrangeLine" accent color.
    _color = COLORS["OrangeLine"]
    mo.vstack([
        mo.md("---"),
        mo.md("## Act I — The Fairness Illusion"),
        mo.Html(f"""
        <div style="border-left: 4px solid {_color}; background: #fff7ed;
        border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
        <div style="font-size: 0.72rem; font-weight: 700; color: {_color};
        text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
        Incoming Message &middot; Product Compliance Officer
        </div>
        <div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
        "Our loan approval model has 85% accuracy on both demographic groups.
        Our legal team says we're compliant — equal accuracy means equal treatment.
        But advocacy groups are threatening litigation, saying our false positive
        and false negative rates differ by group. Who is right?
        Do we have a problem or don't we?"
        </div>
        </div>
        """),
        mo.md("""
        The Compliance Officer is measuring the right thing — accuracy — but measuring it
        at the wrong level of resolution. Equal overall accuracy is consistent with wildly
        different error distributions between groups, especially when base rates differ.
        Before touching the explorer, commit to a prediction.
        """),
    ])
    return
# ── CELL 5: ACT I PREDICTION LOCK ─────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I prediction lock: learner must commit to an answer ("option_a" ..
    # "option_d") before the explorer unlocks (gate lives in the next cell).
    act1_prediction = mo.ui.radio(
        options={
            "A) 85% accuracy on both groups proves fairness — equal accuracy means equal treatment": "option_a",
            "B) Need to check precision (PPV) — equal accuracy can mask unequal precision rates": "option_b",
            "C) Equal accuracy is necessary but not sufficient — must check FPR AND FNR separately": "option_c",
            "D) Accuracy alone is sufficient for legal compliance under disparate impact doctrine": "option_d",
        },
        label=(
            "The Compliance Officer claims equal accuracy proves fairness. "
            "What is the most complete and correct assessment?"
        ),
    )
    mo.vstack([
        mo.Html("""
        <div style="background: #1e293b; border-radius: 12px; padding: 20px;
        border-left: 4px solid #f59e0b; margin: 8px 0;">
        <div style="font-size: 0.72rem; font-weight: 700; color: #fbbf24;
        text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 10px;">
        Prediction Lock — Act I
        </div>
        <div style="color: #e2e8f0; font-size: 0.88rem; margin-bottom: 12px;">
        Commit before touching any sliders. The Fairness Metric Explorer
        unlocks once you select an answer.
        </div>
        </div>
        """),
        act1_prediction,
    ])
    return (act1_prediction,)
# ── CELL 6: ACT I GATE ────────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction):
    # Gate cell: halts execution of downstream cells until an Act I
    # prediction has been committed, showing a warning callout instead.
    _not_committed = act1_prediction.value is None
    _lock_notice = mo.callout(
        mo.md("Select your prediction above to unlock the Fairness Metric Explorer."),
        kind="warn",
    )
    mo.stop(_not_committed, _lock_notice)
    return
# ── CELL 7: ACT I CONTROLS ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I instrument controls: two base-rate sliders (one per group) and a
    # single shared classification-threshold slider.
    # Base rate sliders: the "positive rate" in each group's true population
    # Source: responsible_engr.qmd — base rate is P(Y=1|Group=G)
    # Range: 5–50% represents realistic lending/credit scenarios
    base_rate_a = mo.ui.slider(
        start=5, stop=50, value=10, step=1,
        label="Base rate — Group A (true positive rate in population, %)",
    )
    base_rate_b = mo.ui.slider(
        start=5, stop=50, value=40, step=1,
        label="Base rate — Group B (true positive rate in population, %)",
    )
    # Model threshold: decision boundary applied uniformly to both groups
    # Source: responsible_engr.qmd — single shared threshold is a policy choice
    model_threshold = mo.ui.slider(
        start=0.10, stop=0.90, value=0.50, step=0.05,
        label="Model classification threshold (applied uniformly to both groups)",
    )
    mo.vstack([
        mo.md("### Fairness Metric Explorer"),
        mo.md(
            "Set the **true base rate** for each group (the actual fraction of creditworthy "
            "applicants in each population) and the **model threshold**. "
            "The model is calibrated — its score reflects true probability — but the same "
            "threshold is applied to both groups. Watch what happens to FPR and FNR."
        ),
        mo.hstack([base_rate_a, base_rate_b], justify="start", gap="2rem"),
        model_threshold,
    ])
    return (base_rate_a, base_rate_b, model_threshold)
# ── CELL 8: ACT I PHYSICS ENGINE + CONFUSION MATRICES ─────────────────────────
@app.cell(hide_code=True)
def _(
    mo, go, np,
    base_rate_a, base_rate_b, model_threshold,
    apply_plotly_theme,
):
    # ── Simulation physics ─────────────────────────────────────────────────────
    # Model: calibrated binary classifier. Positive-class scores ~ N(0.70, 0.15),
    # negative-class scores ~ N(0.30, 0.15); BOTH groups share these score
    # distributions (the "calibrated but disparate impact" scenario). Only the
    # base rate r differs between groups — and that alone reshapes the
    # per-group confusion matrix under a single shared threshold.
    #
    # For base rate r and threshold t:
    #   TPR = P(s > t | Y=1)                  — sensitivity
    #   FPR = P(s > t | Y=0)                  — false positive rate (false alarm)
    #   FNR = 1 - TPR                         — miss rate
    #   PPV = r·TPR / (r·TPR + (1-r)·FPR)     — precision / positive predictive value
    #
    # Source: responsible_engr.qmd — confusion matrix decomposition by group,
    # Chouldechova (2017) impossibility theorem derivation.
    #
    # FIX: this cell previously carried a scipy.special.erfc implementation plus
    # a hand-rolled numpy "erf approximation" fallback. The fallback's first
    # `tpr` computation was dead code (immediately overwritten), and the
    # exp-based approximation was not erf at all. All inputs here are scalars,
    # so stdlib math.erfc — exact and always available — replaces both paths.
    import math

    def _compute_group_metrics(base_rate_pct, threshold):
        """
        Compute confusion matrix entries and derived metrics for one group.

        Source: responsible_engr.qmd — Section on fairness metric decomposition.
        The same model/threshold is applied to both groups, but the GROUP's
        base rate changes the fraction of positives vs negatives, and hence
        the confusion matrix.

        Args:
            base_rate_pct: group base rate P(Y=1), in percent (0-100).
            threshold: classification threshold applied to the score.

        Returns:
            dict of fractions: accuracy, tpr, fpr, fnr, ppv, approval_rate.
        """
        r = base_rate_pct / 100.0  # base rate as fraction
        mu_pos, mu_neg, sigma = 0.70, 0.30, 0.15
        # P(N(mu, sigma) > t) = 0.5 * erfc((t - mu) / (sigma * sqrt(2)))
        tpr = 0.5 * math.erfc((threshold - mu_pos) / (sigma * math.sqrt(2.0)))
        fpr = 0.5 * math.erfc((threshold - mu_neg) / (sigma * math.sqrt(2.0)))
        # Clamp away from exactly 0/1 so downstream ratios stay finite.
        tpr = min(max(tpr, 0.001), 0.999)
        fpr = min(max(fpr, 0.001), 0.999)
        fnr = 1.0 - tpr
        # Accuracy = r·TPR + (1-r)·(1-FPR): base-rate-weighted correctness.
        accuracy = r * tpr + (1.0 - r) * (1.0 - fpr)
        # denom = P(score > t) — both the PPV denominator and the approval rate.
        denom = r * tpr + (1.0 - r) * fpr
        ppv = (r * tpr / denom) if denom > 1e-9 else 0.0
        approval_rate = denom
        return {
            "accuracy": accuracy,
            "tpr": tpr,
            "fpr": fpr,
            "fnr": fnr,
            "ppv": ppv,
            "approval_rate": approval_rate,
        }

    _compute_fn = _compute_group_metrics
    _br_a = base_rate_a.value  # Group A base rate, % (default 10)
    _br_b = base_rate_b.value  # Group B base rate, % (default 40)
    _t = model_threshold.value  # shared threshold (default 0.50)
    _ga = _compute_fn(_br_a, _t)
    _gb = _compute_fn(_br_b, _t)
    # ── Derived gap metrics (percentage points) ────────────────────────────────
    _fpr_gap = abs(_ga["fpr"] - _gb["fpr"]) * 100  # pp
    _fnr_gap = abs(_ga["fnr"] - _gb["fnr"]) * 100  # pp
    _ppv_gap = abs(_ga["ppv"] - _gb["ppv"]) * 100  # pp
    _acc_gap = abs(_ga["accuracy"] - _gb["accuracy"]) * 100  # pp
    _eo_gap = (_fpr_gap + _fnr_gap) / 2.0  # equalized odds gap (average)
    # (approval-rate gap was computed here before but never displayed — removed)
    # ── Severity coding: green if gap ≤5pp, orange if 5–10pp, red if >10pp ────
    def _gap_color(gap_pp):
        """Hex color for a gap magnitude in percentage points."""
        if gap_pp <= 5.0:
            return "#22c55e"  # green
        elif gap_pp <= 10.0:
            return "#f59e0b"  # orange
        return "#ef4444"  # red

    def _gap_label(gap_pp):
        """Human-readable severity label for a gap in percentage points."""
        if gap_pp <= 5.0:
            return "Within 5pp"
        elif gap_pp <= 10.0:
            return "Caution 5–10pp"
        return "Gap >10pp"

    # ── Pareto sweep: vary threshold 0.1→0.9, compute EO gap + accuracy ───────
    _thresholds = np.linspace(0.1, 0.9, 80)
    _pareto_acc = []
    _pareto_eo = []
    for _th in _thresholds:
        _g_a = _compute_fn(_br_a, _th)
        _g_b = _compute_fn(_br_b, _th)
        _avg_acc = 0.5 * (_g_a["accuracy"] + _g_b["accuracy"])
        _eo = 0.5 * (abs(_g_a["fpr"] - _g_b["fpr"]) + abs(_g_a["fnr"] - _g_b["fnr"])) * 100
        _pareto_acc.append(_avg_acc * 100)
        _pareto_eo.append(_eo)
    # ── Figure: per-group metrics as grouped bar chart ────────────────────────
    _categories = ["Accuracy", "TPR", "FPR", "FNR", "PPV", "Approval Rate"]
    _vals_a = [
        _ga["accuracy"]*100, _ga["tpr"]*100, _ga["fpr"]*100,
        _ga["fnr"]*100, _ga["ppv"]*100, _ga["approval_rate"]*100,
    ]
    _vals_b = [
        _gb["accuracy"]*100, _gb["tpr"]*100, _gb["fpr"]*100,
        _gb["fnr"]*100, _gb["ppv"]*100, _gb["approval_rate"]*100,
    ]
    _fig = go.Figure()
    _fig.add_trace(go.Bar(
        name=f"Group A (base rate {_br_a}%)",
        x=_categories, y=_vals_a,
        marker_color="#6366f1",
        text=[f"{v:.1f}%" for v in _vals_a],
        textposition="outside",
    ))
    _fig.add_trace(go.Bar(
        name=f"Group B (base rate {_br_b}%)",
        x=_categories, y=_vals_b,
        marker_color="#f59e0b",
        text=[f"{v:.1f}%" for v in _vals_b],
        textposition="outside",
    ))
    _fig = apply_plotly_theme(_fig)
    _fig.update_layout(
        title=f"Per-Group Fairness Metrics (threshold = {_t:.2f})",
        yaxis_title="Rate (%)",
        barmode="group",
        height=380,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1),
        yaxis_range=[0, 115],  # headroom so outside-positioned labels fit
    )
    # ── Figure: accuracy vs EO gap Pareto frontier ────────────────────────────
    _current_eo_for_plot = _eo_gap
    _current_acc_for_plot = 0.5 * (_ga["accuracy"] + _gb["accuracy"]) * 100
    _fig2 = go.Figure()
    _fig2.add_trace(go.Scatter(
        x=_pareto_acc, y=_pareto_eo,
        mode="lines",
        name="Accuracy vs EO Gap frontier",
        line=dict(color="#6366f1", width=2),
    ))
    _fig2.add_trace(go.Scatter(
        x=[_current_acc_for_plot], y=[_current_eo_for_plot],
        mode="markers",
        name="Current threshold",
        marker=dict(color="#f59e0b", size=12, symbol="star"),
    ))
    # Mark the 10pp gap threshold line (DISPARATE_IMPACT_THRESHOLD_PP)
    _fig2.add_hline(
        y=10, line_dash="dash", line_color="#ef4444",
        annotation_text="10pp gap threshold (EEOC guidance)",
        annotation_position="top right",
    )
    _fig2 = apply_plotly_theme(_fig2)
    _fig2.update_layout(
        title="Accuracy vs Equalized Odds Gap — Pareto Frontier",
        xaxis_title="Average Accuracy (%)",
        yaxis_title="Equalized Odds Gap (pp)",
        height=340,
    )
    # ── Physics formula display ────────────────────────────────────────────────
    _physics_md = f"""
    ### Physics
    ```
    Group A (base rate = {_br_a}%):
    TPR = P(score > {_t:.2f} | Y=1) = {_ga['tpr']*100:.1f}%
    FPR = P(score > {_t:.2f} | Y=0) = {_ga['fpr']*100:.1f}%
    FNR = 1 - TPR = {_ga['fnr']*100:.1f}%
    PPV = r·TPR / (r·TPR + (1-r)·FPR)
    = {_br_a/100:.2f}×{_ga['tpr']:.3f} / ({_br_a/100:.2f}×{_ga['tpr']:.3f} + {1-_br_a/100:.2f}×{_ga['fpr']:.3f})
    = {_ga['ppv']*100:.1f}%
    Accuracy = r·TPR + (1-r)·(1-FPR) = {_ga['accuracy']*100:.1f}%
    Group B (base rate = {_br_b}%):
    TPR = {_gb['tpr']*100:.1f}% | FPR = {_gb['fpr']*100:.1f}%
    FNR = {_gb['fnr']*100:.1f}% | PPV = {_gb['ppv']*100:.1f}%
    Accuracy = {_gb['accuracy']*100:.1f}%
    ```
    ### Gap Summary
    | Metric | Group A | Group B | Gap | Status |
    |--------|---------|---------|-----|--------|
    | Accuracy | {_ga['accuracy']*100:.1f}% | {_gb['accuracy']*100:.1f}% | {_acc_gap:.1f}pp | {_gap_label(_acc_gap)} |
    | FPR | {_ga['fpr']*100:.1f}% | {_gb['fpr']*100:.1f}% | {_fpr_gap:.1f}pp | {_gap_label(_fpr_gap)} |
    | FNR | {_ga['fnr']*100:.1f}% | {_gb['fnr']*100:.1f}% | {_fnr_gap:.1f}pp | {_gap_label(_fnr_gap)} |
    | PPV | {_ga['ppv']*100:.1f}% | {_gb['ppv']*100:.1f}% | {_ppv_gap:.1f}pp | {_gap_label(_ppv_gap)} |
    | Equalized Odds Gap | — | — | {_eo_gap:.1f}pp | {_gap_label(_eo_gap)} |
    """
    # ── Metric cards: one per headline gap, color-coded by severity ───────────
    _cards_html = f"""
    <div style="display: flex; gap: 16px; flex-wrap: wrap; justify-content: center; margin: 16px 0;">
    <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
    width: 160px; text-align: center; background: white;">
    <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">Accuracy Gap</div>
    <div style="font-size: 1.9rem; font-weight: 800; color: {_gap_color(_acc_gap)};">
    {_acc_gap:.1f}pp
    </div>
    <div style="font-size: 0.72rem; color: #94a3b8;">{_gap_label(_acc_gap)}</div>
    </div>
    <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
    width: 160px; text-align: center; background: white;">
    <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">FPR Gap</div>
    <div style="font-size: 1.9rem; font-weight: 800; color: {_gap_color(_fpr_gap)};">
    {_fpr_gap:.1f}pp
    </div>
    <div style="font-size: 0.72rem; color: #94a3b8;">{_gap_label(_fpr_gap)}</div>
    </div>
    <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
    width: 160px; text-align: center; background: white;">
    <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">FNR Gap</div>
    <div style="font-size: 1.9rem; font-weight: 800; color: {_gap_color(_fnr_gap)};">
    {_fnr_gap:.1f}pp
    </div>
    <div style="font-size: 0.72rem; color: #94a3b8;">{_gap_label(_fnr_gap)}</div>
    </div>
    <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
    width: 160px; text-align: center; background: white;">
    <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">PPV Gap</div>
    <div style="font-size: 1.9rem; font-weight: 800; color: {_gap_color(_ppv_gap)};">
    {_ppv_gap:.1f}pp
    </div>
    <div style="font-size: 0.72rem; color: #94a3b8;">{_gap_label(_ppv_gap)}</div>
    </div>
    <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
    width: 160px; text-align: center; background: white;">
    <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">EO Gap</div>
    <div style="font-size: 1.9rem; font-weight: 800; color: {_gap_color(_eo_gap)};">
    {_eo_gap:.1f}pp
    </div>
    <div style="font-size: 0.72rem; color: #94a3b8;">avg FPR+FNR gap</div>
    </div>
    </div>
    """
    mo.vstack([
        mo.md(_physics_md),
        mo.Html(_cards_html),
        mo.ui.plotly(_fig),
        mo.ui.plotly(_fig2),
    ])
    # NOTE(review): returning underscore-prefixed names from a marimo cell is
    # unusual — marimo treats `_`-prefixed variables as cell-local, so these
    # may not propagate to the cells that list them as parameters. Kept as-is
    # to preserve the existing cross-cell interface; verify in marimo runtime.
    return (
        _ga, _gb,
        _fpr_gap, _fnr_gap, _ppv_gap, _acc_gap, _eo_gap,
        _br_a, _br_b, _t,
    )
# ── CELL 9: ACT I MATH PEEK ───────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Collapsible "math peek": formal metric definitions and a proof sketch of
    # the Chouldechova (2017) impossibility theorem, rendered in an accordion.
    mo.accordion({
        "The governing equations (Chouldechova 2017 impossibility)": mo.md("""
        **Fairness metric definitions:**
        Let `r` = base rate for a group, `t` = model threshold.
        For a calibrated classifier with score distribution `s`:
        ```
        TPR (sensitivity) = P(s > t | Y = 1)
        FPR (false alarm) = P(s > t | Y = 0)
        FNR (miss rate) = 1 - TPR
        PPV (precision) = P(Y = 1 | s > t)
        = r · TPR / (r · TPR + (1-r) · FPR)
        Accuracy = r · TPR + (1-r) · (1 - FPR)
        ```
        **Chouldechova (2017) Impossibility Theorem:**
        When `r_A ≠ r_B` (base rates differ between groups A and B),
        a single shared model threshold `t` applied to a calibrated classifier
        **cannot simultaneously satisfy all three of:**
        ```
        1. Calibration: PPV_A = PPV_B
        2. Equal FPR: FPR_A = FPR_B
        3. Equal FNR: FNR_A = FNR_B
        ```
        **Proof sketch (from the paper):**
        From the PPV formula: `FPR = r · TPR / (PPV · (1-r)) - r · TPR / (1-r)`
        If PPV is equal across groups and TPR (= 1 - FNR) is equal across groups,
        then FPR can only be equal if `r_A = r_B`. Since base rates differ by
        assumption, at least one of the three equalities must be violated.
        **Implication for the lab:**
        Equal accuracy is consistent with large FPR or FNR gaps — because accuracy
        weighs positives and negatives by their base-rate frequencies, which differ
        by group. Two groups can have identical accuracy (85%) while one group's
        false alarm rate is many times higher than the other's.
        """),
    })
    return
# ── CELL 10: ACT I PREDICTION-VS-REALITY OVERLAY ──────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_prediction, _acc_gap, _fpr_gap, _fnr_gap):
    # Prediction-vs-reality overlay: grades the Act I prediction against the
    # live gap metrics and explains each option.
    # NOTE(review): this cell lists underscore-prefixed names (_acc_gap etc.)
    # as inputs; marimo treats `_`-prefixed variables as cell-local, so this
    # cross-cell wiring may not resolve at runtime — verify in marimo.
    # The key insight: equal accuracy does NOT imply equal FPR or FNR.
    # The correct answer is C: must check FPR AND FNR separately.
    _a1_correct = act1_prediction.value == "option_c"
    # One (explanation, callout-kind) pair per prediction option.
    _explanations = {
        "option_a": (
            "**Incorrect.** Equal accuracy is not sufficient for fairness. "
            "When base rates differ, the same accuracy score can conceal large "
            "asymmetries in who bears the false positive vs. false negative burden. "
            "Accuracy weights each type of error by base-rate frequency, so groups "
            "with different base rates can score identically in accuracy while "
            "experiencing very different error rates. "
            f"In your current configuration: FPR gap = **{_fpr_gap:.1f}pp**, "
            f"FNR gap = **{_fnr_gap:.1f}pp**, Accuracy gap = **{_acc_gap:.1f}pp**.",
            "warn"
        ),
        "option_b": (
            "**Partially correct, but incomplete.** Checking precision (PPV) is "
            "important, but precision alone does not capture the full picture. "
            "Chouldechova (2017) shows the impossibility involves FPR AND FNR "
            "jointly — checking only PPV leaves out one of the key error asymmetries. "
            "The complete answer requires checking FPR and FNR separately.",
            "warn"
        ),
        "option_c": (
            "**Correct.** This is exactly the lesson Chouldechova (2017) formalizes. "
            "Equal accuracy is necessary but not sufficient. When base rates differ, "
            "the same model threshold produces structurally different FPR and FNR across "
            "groups — regardless of overall accuracy. The compliance officer needs to "
            "audit error rates *by group*, not just accuracy *by group*. "
            f"In the default configuration (base rate A=10%, B=40%): "
            f"FPR gap = {_fpr_gap:.1f}pp, FNR gap = {_fnr_gap:.1f}pp, "
            f"Accuracy gap = {_acc_gap:.1f}pp.",
            "success"
        ),
        "option_d": (
            "**Incorrect.** Accuracy is not the standard that disparate impact doctrine "
            "applies. Courts and regulators examine selection rates, false positive rates, "
            "and the four-fifths rule — none of which are equivalent to overall accuracy. "
            "Equal accuracy while maintaining a 20pp FPR gap would almost certainly "
            "not satisfy a disparate impact claim.",
            "warn"
        ),
    }
    _explanation, _kind = _explanations.get(
        act1_prediction.value, ("No prediction selected.", "info")
    )
    # e.g. "option_c" -> "OPTION C" for the overlay text.
    _overlay_md = (
        f"**Prediction-vs-Reality:** You predicted *{act1_prediction.value.upper().replace('_', ' ')}*. "
        f"Current configuration: accuracy gap = {_acc_gap:.1f}pp, FPR gap = {_fpr_gap:.1f}pp, "
        f"FNR gap = {_fnr_gap:.1f}pp. "
        "Notice that accuracy gap can be near zero while FPR and FNR gaps remain substantial."
    )
    mo.vstack([
        mo.callout(mo.md(_explanation), kind=_kind),
        mo.callout(mo.md(_overlay_md), kind="info"),
    ])
    return (_a1_correct,)
# ── CELL 11: ACT I REFLECTION ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act I reflection question: tests whether the learner can state the
    # impossibility theorem precisely (answer graded in the next cell).
    act1_reflection = mo.ui.radio(
        options={
            "A) Perfect fairness is always achievable with enough data and a better model": "ref_a",
            "B) When base rates differ, you cannot simultaneously achieve calibration, equal FPR, AND equal FNR": "ref_b",
            "C) Accuracy and fairness are always in tension (too general — not the theorem's claim)": "ref_c",
            "D) Fairness metrics are mathematically equivalent and differ only in emphasis": "ref_d",
        },
        label="Reflection: What does Chouldechova's (2017) impossibility theorem state?",
    )
    mo.vstack([
        mo.md("---"),
        mo.md("### Act I Reflection"),
        act1_reflection,
    ])
    return (act1_reflection,)
# ── CELL 12: ACT I REFLECTION REVEAL ─────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act1_reflection):
    # Reflection reveal: gates on an answer being selected, then shows the
    # per-option explanation as a color-coded callout.
    mo.stop(
        act1_reflection.value is None,
        mo.callout(mo.md("Select your reflection answer to continue to Act II."), kind="warn"),
    )
    # One (explanation, callout-kind) pair per reflection option; "ref_b" is correct.
    _ref_explanations = {
        "ref_a": (
            "**Incorrect.** Chouldechova (2017) proves this is impossible regardless of "
            "data quantity or model quality, as long as base rates differ. More data does "
            "not fix the structural incompatibility — it only makes the model more "
            "precisely wrong in the same direction.",
            "warn"
        ),
        "ref_b": (
            "**Correct.** This is the exact claim of the theorem. For any calibrated "
            "classifier applied with a shared threshold: if `r_A ≠ r_B`, then at most "
            "two of {calibration, equal FPR, equal FNR} can hold simultaneously. "
            "There is no engineering solution — only a choice of which criterion to "
            "prioritize and which to sacrifice.",
            "success"
        ),
        "ref_c": (
            "**Too general.** The statement that accuracy and fairness are always in "
            "tension is a common heuristic, but it is not the theorem. Chouldechova's "
            "contribution is more precise: it identifies the specific three-way "
            "incompatibility and the exact condition (differing base rates) that triggers it.",
            "warn"
        ),
        "ref_d": (
            "**Incorrect.** The entire point of the theorem is that fairness metrics "
            "are NOT equivalent. Demographic parity, equalized odds, and calibration "
            "each capture different aspects of fair treatment, and they actively "
            "conflict when base rates differ. Choosing a metric is a policy decision, "
            "not a technical one.",
            "warn"
        ),
    }
    _expl, _kind = _ref_explanations.get(
        act1_reflection.value, ("No answer selected.", "info")
    )
    mo.callout(mo.md(_expl), kind=_kind)
    return
# ─────────────────────────────────────────────────────────────────────────────
# ACT II — THE AUDIT-ACCURACY TRADEOFF
# ─────────────────────────────────────────────────────────────────────────────
# ── CELL 13: ACT II SCENARIO ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, COLORS):
    # Act II scenario card: stakeholder message from the Engineering VP,
    # styled with the shared "BlueLine" accent color.
    _color = COLORS["BlueLine"]
    mo.vstack([
        mo.md("---"),
        mo.md("## Act II — The Audit-Accuracy Tradeoff"),
        mo.Html(f"""
        <div style="border-left: 4px solid {_color}; background: #eff6ff;
        border-radius: 0 10px 10px 0; padding: 16px 22px; margin: 12px 0;">
        <div style="font-size: 0.72rem; font-weight: 700; color: {_color};
        text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 6px;">
        Incoming Message &middot; Engineering VP
        </div>
        <div style="font-style: italic; font-size: 1.0rem; color: #1e293b; line-height: 1.65;">
        "We're deploying a resume screening model at scale — 50,000 applications
        per quarter. We need 80% recall on qualified candidates. We have $50K
        in annual audit budget. Three options are on the table: (1) accuracy-optimized
        model — no fairness constraint, maximize recall; (2) demographic parity
        constraint — equal selection rates across demographic groups; (3) individual
        fairness with equalized odds — similar candidates treated similarly, equal
        FPR and FNR across groups. Design the deployment. Which strategy is most
        defensible, and does it fit the budget?"
        </div>
        </div>
        """),
        mo.md("""
        Each strategy has a different accuracy cost, audit cost, and legal exposure.
        The VP needs a defensible choice — not just the highest accuracy.
        Commit to a strategy prediction before using the design instruments.
        """),
    ])
    return
# ── CELL 14: ACT II PREDICTION LOCK ───────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II prediction lock: the learner commits to a deployment strategy
    # before the design instruments unlock. Option values ("option_a"…"option_d")
    # are graded by the reveal cell; "option_c" is the keyed answer.
    _strategy_options = {
        "A) Option 1 — maximize accuracy, handle fairness concerns in post-hoc legal review": "option_a",
        "B) Option 2 — demographic parity is the legally safest strategy for a hiring context": "option_b",
        "C) Option 3 — individual fairness with equalized odds and regular audit is most defensible": "option_c",
        "D) All three strategies are equivalent in practice — the choice is arbitrary": "option_d",
    }
    act2_prediction = mo.ui.radio(
        options=_strategy_options,
        label=(
            "Which deployment strategy is most defensible for a high-stakes hiring context "
            "under EEOC and disparate impact doctrine?"
        ),
    )
    # Dark banner that visually marks the commitment point.
    _lock_banner = mo.Html("""
        <div style="background: #1e293b; border-radius: 12px; padding: 20px;
                    border-left: 4px solid #6366f1; margin: 8px 0;">
            <div style="font-size: 0.72rem; font-weight: 700; color: #a5b4fc;
                        text-transform: uppercase; letter-spacing: 0.1em; margin-bottom: 10px;">
                Prediction Lock — Act II
            </div>
            <div style="color: #e2e8f0; font-size: 0.88rem; margin-bottom: 12px;">
                Commit before adjusting the design instruments.
            </div>
        </div>
        """)
    mo.vstack([_lock_banner, act2_prediction])
    return (act2_prediction,)
# ── CELL 15: ACT II GATE ──────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_prediction):
    # Gate: halt downstream Act II cells until a strategy has been committed.
    _nothing_selected = act2_prediction.value is None
    _gate_message = mo.callout(
        mo.md("Select your strategy prediction above to unlock the Audit Design Cockpit."),
        kind="warn",
    )
    mo.stop(_nothing_selected, _gate_message)
    return
# ── CELL 16: ACT II CONTROLS ──────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Audit Design Cockpit: three independent design knobs.
    # Option sets mirror the tables in responsible_engr.qmd.
    # Fairness criterion — the three main criteria for hiring contexts.
    _criterion_options = {
        "No constraint (accuracy only)": "accuracy",
        "Demographic parity (equal selection rates)": "demographic_parity",
        "Equalized odds (equal FPR + FNR)": "equalized_odds",
        "Individual fairness (similar candidates treated similarly)": "individual",
    }
    # Bias mitigation — three standard mitigation approaches.
    _mitigation_options = {
        "None": "none",
        "Reweighting (pre-processing)": "reweighting",
        "Adversarial debiasing (in-processing)": "adversarial",
        "Post-processing threshold adjustment": "postprocessing",
    }
    # Audit frequency — how often the deployed model is re-evaluated for bias.
    _frequency_options = {
        "Continuous (52×/yr)": "continuous",
        "Monthly (12×/yr)": "monthly",
        "Quarterly (4×/yr)": "quarterly",
        "Annual (1×/yr)": "annual",
    }
    fairness_criterion = mo.ui.dropdown(
        options=_criterion_options,
        value="No constraint (accuracy only)",
        label="Fairness criterion",
    )
    bias_mitigation = mo.ui.dropdown(
        options=_mitigation_options,
        value="None",
        label="Bias mitigation method",
    )
    audit_frequency = mo.ui.dropdown(
        options=_frequency_options,
        value="Quarterly (4×/yr)",
        label="Audit frequency",
    )
    _intro = mo.md(
        "Select a **fairness criterion**, a **bias mitigation method**, and an "
        "**audit frequency**. The instruments will show the resulting accuracy cost, "
        "equalized odds gap, annual audit cost, and regulatory risk level."
    )
    _controls_row = mo.hstack([fairness_criterion, bias_mitigation, audit_frequency],
                              justify="start", gap="2rem")
    mo.vstack([mo.md("### Audit Design Cockpit"), _intro, _controls_row])
    return (fairness_criterion, bias_mitigation, audit_frequency)
# ── CELL 17: ACT II PHYSICS ENGINE ────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo, go, np, math,
    context_toggle,
    fairness_criterion, bias_mitigation, audit_frequency,
    _br_a, _br_b,
    DISPARATE_IMPACT_THRESHOLD_PP,
    CLOUD_AUDIT_COST_K_USD, MOBILE_AUDIT_COST_K_USD,
    apply_plotly_theme,
):
    # Act II physics engine: converts the cockpit's three design choices into
    # accuracy, equalized-odds gap, annual audit cost, and a regulatory risk
    # level, then renders metric cards, the Pareto frontier, and failure-state
    # callouts.
    # ── Context-dependent parameters ──────────────────────────────────────────
    _ctx = context_toggle.value  # "cloud" or "mobile"
    _cost_per_audit_k = CLOUD_AUDIT_COST_K_USD if _ctx == "cloud" else MOBILE_AUDIT_COST_K_USD
    # ── Audit frequency → runs per year ───────────────────────────────────────
    _freq_map = {"continuous": 52, "monthly": 12, "quarterly": 4, "annual": 1}
    _audits_per_year = _freq_map[audit_frequency.value]
    # ── Fairness criterion: base accuracy and baseline EO gap ─────────────────
    # Source: responsible_engr.qmd — Table of fairness-accuracy tradeoffs
    # These are physics-based estimates calibrated to a base rate gap of 30pp
    # (Group A=10%, Group B=40%) from Act I defaults.
    #
    # Accuracy-only model: maximizes recall, no fairness constraint.
    # EO gap is large because threshold optimized for accuracy, not parity.
    #
    # Demographic parity: forces equal selection rates.
    # This requires different thresholds per group → can harm accuracy more.
    # EO gap may actually WORSEN because FPR/FNR are not directly controlled.
    #
    # Equalized odds: forces equal FPR and FNR.
    # Requires per-group threshold calibration → moderate accuracy cost.
    # EO gap is directly minimized.
    #
    # Individual fairness: similar candidates get similar scores.
    # Requires metric learning → highest implementation cost, best defensibility.
    # EO gap depends on implementation quality.
    _base_rate_gap_pp = abs(_br_b - _br_a)  # gap between group base rates
    # Base accuracy and EO gap by criterion
    # EO gap is modeled as a function of base rate gap and constraint choice
    _criterion = fairness_criterion.value
    if _criterion == "accuracy":
        _base_accuracy_pct = 87.5  # unconstrained, high accuracy
        _base_eo_gap = 0.38 * _base_rate_gap_pp  # large EO gap when base rates differ
    elif _criterion == "demographic_parity":
        _base_accuracy_pct = 83.0  # accuracy cost from forcing equal selection rates
        _base_eo_gap = 0.50 * _base_rate_gap_pp  # DP can WORSEN EO gap (see theorem)
    elif _criterion == "equalized_odds":
        _base_accuracy_pct = 84.5  # moderate accuracy cost
        _base_eo_gap = 0.12 * _base_rate_gap_pp  # EO gap directly constrained
    else:  # individual
        _base_accuracy_pct = 84.0  # similar to EO
        _base_eo_gap = 0.10 * _base_rate_gap_pp  # best EO gap, defensible in court
    # ── Bias mitigation: reduces EO gap further, with additional accuracy cost ─
    # Source: responsible_engr.qmd — mitigation method effectiveness table
    _mitigation = bias_mitigation.value
    _mitigation_gap_reduction = {
        "none": 0.0,
        "reweighting": 0.25,     # reduces gap by 25%, accuracy cost ~0.5pp
        "adversarial": 0.45,     # reduces gap by 45%, accuracy cost ~1.5pp
        "postprocessing": 0.35,  # reduces gap by 35%, accuracy cost ~0.8pp
    }[_mitigation]
    _mitigation_acc_cost = {
        "none": 0.0,
        "reweighting": 0.5,
        "adversarial": 1.5,
        "postprocessing": 0.8,
    }[_mitigation]
    _final_eo_gap = _base_eo_gap * (1.0 - _mitigation_gap_reduction)
    _final_accuracy_pct = _base_accuracy_pct - _mitigation_acc_cost
    # ── Audit cost model ───────────────────────────────────────────────────────
    # Source: responsible_engr.qmd — audit cost = compute + human review
    # Mobile has 2.7× higher cost due to data collection and manual review overhead
    _audit_cost_k = _cost_per_audit_k * _audits_per_year  # total annual audit cost ($K)
    # ── Regulatory risk model ─────────────────────────────────────────────────
    # Source: responsible_engr.qmd — EEOC disparate impact guidelines
    # Risk is a function of EO gap AND audit frequency
    # An unaudited high-gap model is maximum risk; well-audited low-gap model is low risk
    _above_threshold = _final_eo_gap > DISPARATE_IMPACT_THRESHOLD_PP
    _audits_adequate = _audits_per_year >= 4  # quarterly minimum per OFCCP guidance
    _disparate_impact_triggered = _above_threshold  # the failure state
    if not _above_threshold:
        _reg_risk = "low"
    elif _above_threshold and _audits_adequate:
        _reg_risk = "medium"
    else:
        _reg_risk = "high"
    # ── Budget check ──────────────────────────────────────────────────────────
    _budget_k = 50.0  # $50K annual audit budget (from VP scenario)
    _over_budget = _audit_cost_k > _budget_k
    # NOTE: the VP's 80% recall floor is surfaced visually via _acc_color below
    # (red when final accuracy < 80%); no separate flag is needed.
    # ── Color coding ─────────────────────────────────────────────────────────
    def _color_for_gap(g):
        # Green ≤ 5pp, amber ≤ 10pp, red above the EEOC threshold zone.
        if g <= 5.0:
            return "#22c55e"
        elif g <= 10.0:
            return "#f59e0b"
        return "#ef4444"
    def _color_for_risk(risk):
        return {"low": "#22c55e", "medium": "#f59e0b", "high": "#ef4444"}[risk]
    _gap_color = _color_for_gap(_final_eo_gap)
    _risk_color = _color_for_risk(_reg_risk)
    _budget_color = "#22c55e" if not _over_budget else "#ef4444"
    _acc_color = "#22c55e" if _final_accuracy_pct >= 84.0 else "#f59e0b" if _final_accuracy_pct >= 80.0 else "#ef4444"
    # ── Pareto frontier: sweep all criterion × mitigation combinations ────────
    _frontier_configs = [
        ("accuracy + none",       87.5, 0.38 * _base_rate_gap_pp * 1.00),
        ("accuracy + reweight",   87.0, 0.38 * _base_rate_gap_pp * 0.75),
        ("accuracy + adversarial",86.0, 0.38 * _base_rate_gap_pp * 0.55),
        ("accuracy + postproc",   86.7, 0.38 * _base_rate_gap_pp * 0.65),
        ("dem_parity + none",     83.0, 0.50 * _base_rate_gap_pp * 1.00),
        ("dem_parity + reweight", 82.5, 0.50 * _base_rate_gap_pp * 0.75),
        ("dem_parity + adversarial", 81.5, 0.50 * _base_rate_gap_pp * 0.55),
        ("dem_parity + postproc", 82.2, 0.50 * _base_rate_gap_pp * 0.65),
        ("eq_odds + none",        84.5, 0.12 * _base_rate_gap_pp * 1.00),
        ("eq_odds + reweight",    84.0, 0.12 * _base_rate_gap_pp * 0.75),
        ("eq_odds + adversarial", 83.0, 0.12 * _base_rate_gap_pp * 0.55),
        ("eq_odds + postproc",    83.7, 0.12 * _base_rate_gap_pp * 0.65),
        ("individual + none",     84.0, 0.10 * _base_rate_gap_pp * 1.00),
        ("individual + reweight", 83.5, 0.10 * _base_rate_gap_pp * 0.75),
        ("individual + adversarial", 82.5, 0.10 * _base_rate_gap_pp * 0.55),
        ("individual + postproc", 83.2, 0.10 * _base_rate_gap_pp * 0.65),
    ]
    # Group colors for Pareto plot
    _group_colors = {
        "accuracy": "#94a3b8",    # gray
        "dem_parity": "#f59e0b",  # amber
        "eq_odds": "#6366f1",     # indigo
        "individual": "#22c55e",  # green
    }
    _fig_pareto = go.Figure()
    for _name, _acc, _gap in _frontier_configs:
        # Map each config label back to its criterion family for color coding.
        # (Substring checks are safe: the four family keys are mutually exclusive
        # in the config names above.)
        if "dem" in _name:
            _family = "dem_parity"
        elif "eq" in _name:
            _family = "eq_odds"
        elif "individual" in _name:
            _family = "individual"
        else:
            _family = "accuracy"
        _grp_color = _group_colors.get(_family, "#6366f1")
        _fig_pareto.add_trace(go.Scatter(
            x=[_acc], y=[_gap],
            mode="markers",
            name=_name,
            marker=dict(color=_grp_color, size=10),
            showlegend=False,
        ))
    # Highlight current selection
    _fig_pareto.add_trace(go.Scatter(
        x=[_final_accuracy_pct], y=[_final_eo_gap],
        mode="markers",
        name="Your design",
        marker=dict(color="#ffffff", size=16, symbol="star",
                    line=dict(color="#f59e0b", width=2)),
    ))
    # EEOC threshold line
    _fig_pareto.add_hline(
        y=DISPARATE_IMPACT_THRESHOLD_PP,
        line_dash="dash", line_color="#ef4444",
        annotation_text=f"EEOC guidance threshold ({DISPARATE_IMPACT_THRESHOLD_PP:.0f}pp)",
        annotation_position="top right",
    )
    _fig_pareto = apply_plotly_theme(_fig_pareto)
    _fig_pareto.update_layout(
        title="Fairness-Accuracy Pareto Frontier",
        xaxis_title="Model Accuracy (%)",
        yaxis_title="Equalized Odds Gap (pp) — lower is better",
        height=380,
    )
    # ── Metric cards HTML ────────────────────────────────────────────────────
    _ctx_display = "H100 Cloud" if _ctx == "cloud" else "Mobile NPU"
    _cards = f"""
    <div style="display: flex; gap: 16px; flex-wrap: wrap; justify-content: center; margin: 16px 0;">
        <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
                    width: 165px; text-align: center; background: white;">
            <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">Accuracy</div>
            <div style="font-size: 1.9rem; font-weight: 800; color: {_acc_color};">
                {_final_accuracy_pct:.1f}%
            </div>
            <div style="font-size: 0.72rem; color: #94a3b8;">recall target 80%</div>
        </div>
        <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
                    width: 165px; text-align: center; background: white;">
            <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">EO Gap</div>
            <div style="font-size: 1.9rem; font-weight: 800; color: {_gap_color};">
                {_final_eo_gap:.1f}pp
            </div>
            <div style="font-size: 0.72rem; color: #94a3b8;">threshold: 10pp</div>
        </div>
        <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
                    width: 165px; text-align: center; background: white;">
            <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">Audit Cost / yr</div>
            <div style="font-size: 1.9rem; font-weight: 800; color: {_budget_color};">
                ${_audit_cost_k:.0f}K
            </div>
            <div style="font-size: 0.72rem; color: #94a3b8;">budget: $50K ({_ctx_display})</div>
        </div>
        <div style="padding: 18px 22px; border: 1px solid #e2e8f0; border-radius: 10px;
                    width: 165px; text-align: center; background: white;">
            <div style="color: #64748b; font-size: 0.8rem; margin-bottom: 4px;">Regulatory Risk</div>
            <div style="font-size: 1.9rem; font-weight: 800; color: {_risk_color};">
                {_reg_risk.upper()}
            </div>
            <div style="font-size: 0.72rem; color: #94a3b8;">EEOC / OFCCP</div>
        </div>
    </div>
    """
    # ── Physics summary ───────────────────────────────────────────────────────
    _phys_md = f"""
    ### Design Summary
    ```
    Criterion:  {fairness_criterion.value}
    Mitigation: {bias_mitigation.value}
    Audit freq: {audit_frequency.value} ({_audits_per_year}×/yr)
    Context:    {_ctx_display}
    Accuracy        = {_final_accuracy_pct:.1f}%
                      (base {_base_accuracy_pct:.1f}% − mitigation cost {_mitigation_acc_cost:.1f}pp)
    EO Gap          = {_final_eo_gap:.1f}pp
                      (base {_base_eo_gap:.1f}pp × (1 − {_mitigation_gap_reduction:.0%} mitigation))
    Audit cost / yr = ${_cost_per_audit_k:.1f}K × {_audits_per_year} = ${_audit_cost_k:.1f}K
    Budget remaining = ${_budget_k - _audit_cost_k:+.1f}K
    Regulatory risk = {_reg_risk.upper()}
                      (EO gap > {DISPARATE_IMPACT_THRESHOLD_PP:.0f}pp: {_above_threshold} | audits >= quarterly: {_audits_adequate})
    ```
    """
    # ── Output ────────────────────────────────────────────────────────────────
    _output_items = [
        mo.md(_phys_md),
        mo.Html(_cards),
        mo.ui.plotly(_fig_pareto),
    ]
    # ── FAILURE STATE: Disparate Impact triggered ─────────────────────────────
    # Source: EEOC 4/5ths rule and OFCCP audit guidance
    # Triggered when equalized odds gap > 10pp in a high-stakes (hiring) context
    if _disparate_impact_triggered:
        _output_items.append(
            mo.callout(
                mo.md(
                    f"**Disparate impact threshold exceeded.** "
                    f"Equalized odds gap: **{_final_eo_gap:.1f}pp**. "
                    f"High-stakes deployment (hiring) requires gap < {DISPARATE_IMPACT_THRESHOLD_PP:.0f}pp "
                    f"per EEOC guidelines. This deployment would face substantial regulatory risk. "
                    f"Reduce the gap by switching to equalized odds or individual fairness criterion, "
                    f"or apply adversarial debiasing to bring the gap below threshold."
                ),
                kind="danger",
            )
        )
    else:
        _output_items.append(
            mo.callout(
                mo.md(
                    f"EO gap **{_final_eo_gap:.1f}pp** is below the {DISPARATE_IMPACT_THRESHOLD_PP:.0f}pp "
                    f"threshold. Regulatory risk is **{_reg_risk}**. "
                    + ("Audit frequency meets OFCCP quarterly minimum." if _audits_adequate else
                       "Warning: audit frequency below OFCCP quarterly minimum recommendation.")
                ),
                kind="success" if _reg_risk == "low" else "warn",
            )
        )
    # ── Budget warning ────────────────────────────────────────────────────────
    if _over_budget:
        _output_items.append(
            mo.callout(
                mo.md(
                    f"**Budget exceeded.** Annual audit cost **${_audit_cost_k:.0f}K** > "
                    f"${_budget_k:.0f}K budget on **{_ctx_display}**. "
                    "Reduce audit frequency or switch to cloud context for lower per-run cost."
                ),
                kind="warn",
            )
        )
    mo.vstack(_output_items)
    return (
        _final_eo_gap,
        _final_accuracy_pct,
        _audit_cost_k,
        _reg_risk,
        _disparate_impact_triggered,
        _above_threshold,
        _criterion,
    )
# ── CELL 18: ACT II MATH PEEK ─────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Math peek: collapsible derivation of the fairness metrics and cost model
    # used by the physics engine. Display-only cell.
    mo.accordion({
        "The governing equations (fairness metrics and audit cost model)": mo.md("""
    **Equalized Odds (Hardt et al. 2016):**
    A classifier satisfies equalized odds if, for all groups `g` and labels `y ∈ {0,1}`:
    ```
    P(ŷ = 1 | Y = y, Group = g)   is equal across all groups g
    ```
    Equivalently: both TPR and FPR are equal across groups.
    **Equalized Odds Gap (lab's operationalization):**
    ```
    EO_gap = (|FPR_A − FPR_B| + |FNR_A − FNR_B|) / 2
    ```
    **Demographic Parity:**
    ```
    DP:  P(ŷ = 1 | Group = A) = P(ŷ = 1 | Group = B)
    ```
    Note: DP forces equal selection rates, NOT equal error rates.
    When qualified candidate rates differ, DP can require REJECTING more
    qualified candidates from one group to equalize selection rates —
    which is its own form of unfairness.
    **Individual Fairness (Dwork et al. 2012 — Lipschitz condition):**
    ```
    d_Y(M(x), M(x')) ≤ L · d_X(x, x')
    ```
    Individuals at similar distances in feature space receive similar decisions.
    Requires a task-specific metric `d_X` — the hardest criterion to specify
    but the most defensible under equal treatment theory.
    **Audit Cost Model:**
    ```
    Annual audit cost = cost_per_run × runs_per_year
    Cloud:  cost_per_run ≈ $1,500   (automated, H100 batch evaluation)
    Mobile: cost_per_run ≈ $4,000   (manual data collection + compute)
    ```
    **Regulatory Risk Model (EEOC / OFCCP guidance):**
    ```
    If EO_gap > 10pp AND context is high-stakes:  HIGH risk
    If EO_gap > 10pp AND quarterly+ audits:       MEDIUM risk
    If EO_gap ≤ 10pp:                             LOW risk
    ```
    The 10pp threshold derives from EEOC's 4/5ths (80%) rule applied
    to selection rate ratios, converted to equalized odds gap units.
    """),
    })
    return
# ── CELL 19: ACT II PREDICTION REVEAL ─────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_prediction, _final_eo_gap, _reg_risk, _criterion):
    # Act II prediction reveal: grades the locked-in strategy choice against
    # the keyed answer ("option_c") and shows a tailored explanation that
    # interpolates the learner's live EO gap and risk level.
    _a2_correct = act2_prediction.value == "option_c"
    _a2_explanations = {
        "option_a": (
            "**Incorrect.** Option 1 (accuracy-only) does not avoid fairness liability — "
            "it embeds it. Under EEOC disparate impact doctrine, a facially neutral "
            "employment practice that causes adverse impact must be justified by "
            "business necessity AND must use the least discriminatory alternative. "
            "Post-hoc legal review is not a substitute for a defensible deployment design. "
            f"Your current EO gap is **{_final_eo_gap:.1f}pp** — "
            f"regulatory risk is **{_reg_risk}**.",
            "warn"
        ),
        "option_b": (
            "**Incorrect.** Demographic parity is legally problematic in hiring contexts. "
            "It forces equal selection rates regardless of qualified candidate rates. "
            "If 40% of Group B applicants are qualified vs 10% of Group A, demographic "
            "parity requires either rejecting many qualified Group B applicants or "
            "accepting unqualified Group A applicants — both of which can constitute "
            "disparate treatment claims. Equalized odds is more defensible because it "
            "conditions on actual qualification (the true label), not just group membership.",
            "warn"
        ),
        "option_c": (
            "**Correct.** Option 3 — individual fairness or equalized odds with regular "
            "audit — is the most legally defensible approach. It satisfies equal treatment "
            "theory (similar candidates get similar decisions) and can demonstrate "
            "compliance through the audit record. The Pareto frontier shows equalized odds "
            "achieves lower EO gap than demographic parity at similar accuracy cost, "
            "and individual fairness further improves defensibility. "
            f"Your current EO gap is **{_final_eo_gap:.1f}pp** — "
            f"regulatory risk is **{_reg_risk}**.",
            "success"
        ),
        "option_d": (
            "**Incorrect.** The three strategies produce materially different outcomes "
            "on accuracy, EO gap, and regulatory risk. The Pareto frontier makes this "
            "visible: demographic parity can produce WORSE EO gaps than equalized odds "
            "at the same accuracy cost, because it constrains the wrong quantity "
            "(selection rate, not error rate). The strategies are not equivalent.",
            "warn"
        ),
    }
    # Fallback tuple covers the (gated) case of no selection.
    _expl2, _kind2 = _a2_explanations.get(
        act2_prediction.value, ("No prediction selected.", "info")
    )
    mo.callout(mo.md(_expl2), kind=_kind2)
    return (_a2_correct,)
# ── CELL 20: ACT II REFLECTION ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Act II reflection: probes why demographic parity can misfire when
    # qualified-candidate base rates differ by group. Keyed answer: "ref2_b".
    _reflection_options = {
        "A) Demographic parity is never a valid fairness metric": "ref2_a",
        "B) It forces equal selection rates regardless of base rates — if qualified candidates "
        "differ by group, it means rejecting more qualified people in one group to hit the quota": "ref2_b",
        "C) It always hurts overall accuracy by more than 10 percentage points": "ref2_c",
        "D) Demographic parity violates GDPR by requiring protected attribute access": "ref2_d",
    }
    act2_reflection = mo.ui.radio(
        options=_reflection_options,
        label=(
            "Reflection: Why can demographic parity actually produce unfair outcomes "
            "in a hiring context where qualified candidate rates differ by group?"
        ),
    )
    _section = [mo.md("---"), mo.md("### Act II Reflection"), act2_reflection]
    mo.vstack(_section)
    return (act2_reflection,)
# ── CELL 21: ACT II REFLECTION REVEAL ────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo, act2_reflection):
    # Act II reflection reveal: gated on a selection, then shows the graded
    # explanation for the chosen answer ("ref2_b" is correct).
    mo.stop(
        act2_reflection.value is None,
        mo.callout(mo.md("Select your reflection answer to complete the lab."), kind="warn"),
    )
    _ref2_explanations = {
        "ref2_a": (
            "**Incorrect — too absolute.** Demographic parity is valid in some contexts, "
            "particularly when base rates are approximately equal or when the historical "
            "qualification gap is itself a product of discriminatory conditions. "
            "The problem is applying it uncritically when base rates differ for legitimate reasons.",
            "warn"
        ),
        "ref2_b": (
            "**Correct.** This is the key insight. Demographic parity equalizes the "
            "outcome (selection rate) without conditioning on actual qualification. "
            "If Group B has 40% qualified candidates and Group A has 10%, equal "
            "selection rates require either: (a) lowering the bar for Group A "
            "(accepting unqualified applicants) or (b) raising the bar for Group B "
            "(rejecting qualified applicants). Both produce decisions that are "
            "conditionally unfair given true qualification. Equalized odds avoids "
            "this by conditioning on the true label — it guarantees that equally "
            "qualified candidates are equally likely to be selected.",
            "success"
        ),
        "ref2_c": (
            "**Incorrect.** The accuracy cost of demographic parity depends on how "
            "different the base rates are and how tight the parity constraint is. "
            "The Pareto frontier in Act II shows it can be 1–5pp in typical configurations. "
            "It is not always >10pp.",
            "warn"
        ),
        "ref2_d": (
            "**Incorrect.** Demographic parity does not violate GDPR — GDPR does not "
            "prohibit all use of protected attributes; Article 9 allows processing for "
            "specific purposes including combating discrimination. Many GDPR-compliant "
            "fairness audits explicitly compute demographic parity metrics. The legal "
            "challenge to DP is its substantive unfairness, not its data requirements.",
            "warn"
        ),
    }
    # Fallback tuple covers the (gated) case of no selection.
    _expl_r2, _kind_r2 = _ref2_explanations.get(
        act2_reflection.value, ("No answer selected.", "info")
    )
    mo.callout(mo.md(_expl_r2), kind=_kind_r2)
    return
# ── CELL 22: KEY TAKEAWAYS ────────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(mo):
    # Closing summary: restates the lab's two core lessons (the Chouldechova
    # impossibility result and fairness-metric choice as policy). Display-only.
    mo.vstack([
        mo.md("---"),
        mo.md("## Key Takeaways"),
        mo.callout(
            mo.md("""
    **1. Equal accuracy is not equal treatment (Chouldechova 2017).**
    When base rates differ between groups, a calibrated model with a shared threshold
    *structurally* produces unequal false positive and false negative rates. This is
    not a training failure — it is a mathematical consequence of the base rate gap.
    Auditing accuracy by group is a necessary but insufficient compliance check.
    """),
            kind="info",
        ),
        mo.callout(
            mo.md("""
    **2. Choosing a fairness criterion is a policy decision, not an engineering one.**
    Demographic parity, equalized odds, and individual fairness are incompatible
    when base rates differ. Each encodes a different moral theory of equal treatment.
    Equalized odds is most defensible in hiring and lending under US law because it
    conditions on actual qualification. The engineering job is to implement the chosen
    criterion at minimum accuracy cost — not to choose it.
    """),
            kind="info",
        ),
    ])
    return
# ── CELL 23: LEDGER SAVE + HUD ────────────────────────────────────────────────
@app.cell(hide_code=True)
def _(
    mo,
    ledger,
    COLORS,
    context_toggle,
    act1_prediction,
    act2_prediction,
    _a1_correct,
    _a2_correct,
    _final_eo_gap,
    _final_accuracy_pct,
    _audit_cost_k,
    _reg_risk,
    _disparate_impact_triggered,
    _criterion,
):
    # Persists the lab outcome to the cross-lab ledger (chapter 15), then
    # renders the dark HUD footer summarizing context, predictions, EO gap,
    # audit cost, and regulatory risk. The save happens before render so the
    # footer's "LEDGER ch15 saved" badge reflects a completed write.
    # ── Safe defaults for unresolved reactive values ─────────────────────────
    _ctx_val = context_toggle.value if context_toggle.value else "cloud"
    _a1_pred = act1_prediction.value if act1_prediction.value else "none"
    _a2_pred = act2_prediction.value if act2_prediction.value else "none"
    _a1_ok = bool(_a1_correct) if _a1_correct is not None else False
    _a2_ok = bool(_a2_correct) if _a2_correct is not None else False
    _gap_val = float(_final_eo_gap) if _final_eo_gap is not None else 0.0
    _acc_val = float(_final_accuracy_pct) if _final_accuracy_pct is not None else 84.0
    _cost_val = float(_audit_cost_k) if _audit_cost_k is not None else 0.0
    _risk_val = str(_reg_risk) if _reg_risk is not None else "low"
    _hit_val = bool(_disparate_impact_triggered) if _disparate_impact_triggered is not None else False
    _crit_val = str(_criterion) if _criterion is not None else "accuracy"
    ledger.save(
        chapter=15,
        design={
            "context": _ctx_val,
            "fairness_criterion": _crit_val,
            "equalized_odds_gap": round(_gap_val, 2),
            "audit_cost_k": round(_cost_val, 2),
            "act1_prediction": _a1_pred,
            "act1_correct": _a1_ok,
            "act2_result": round(_gap_val, 2),
            "act2_decision": _crit_val,
            "constraint_hit": _hit_val,
            "regulatory_risk": _risk_val,
        },
    )
    # ── HUD footer ─────────────────────────────────────────────────────────
    _hud_color = COLORS["Cloud"] if _ctx_val == "cloud" else COLORS["Mobile"]
    _ctx_display = "Cloud H100" if _ctx_val == "cloud" else "Mobile NPU"
    # Prediction badges render empty text when nothing was selected.
    _a1_display = "Correct" if _a1_ok else ("Incorrect" if _a1_pred != "none" else "—")
    _a2_display = "Correct" if _a2_ok else ("Incorrect" if _a2_pred != "none" else "—")
    _hit_color = "#f87171" if _hit_val else "#4ade80"
    _risk_color = {"low": "#4ade80", "medium": "#fbbf24", "high": "#f87171"}.get(_risk_val, "#94a3b8")
    # Budget threshold mirrors the VP scenario's $50K annual audit budget.
    _budget_ok = _cost_val <= 50.0
    _cost_color = "#4ade80" if _budget_ok else "#f87171"
    mo.Html(f"""
    <div style="display: flex; gap: 22px; align-items: center; flex-wrap: wrap;
                padding: 14px 24px; background: #0f172a;
                border-radius: 12px; margin-top: 32px; font-size: 0.8rem;
                border: 1px solid #1e293b; font-family: 'SF Mono', monospace;">
        <span style="color: #475569; font-weight: 700; letter-spacing: 0.06em;">LAB 15</span>
        <span>
            <span style="color: #475569;">CONTEXT </span>
            <span style="color: {_hud_color}; font-weight: 700;">{_ctx_display}</span>
        </span>
        <span>
            <span style="color: #475569;">CRITERION </span>
            <span style="color: #a5b4fc; font-weight: 600;">{_crit_val}</span>
        </span>
        <span>
            <span style="color: #475569;">ACT I </span>
            <span style="color: {'#4ade80' if _a1_ok else '#f87171'};">{_a1_display}</span>
        </span>
        <span>
            <span style="color: #475569;">ACT II </span>
            <span style="color: {'#4ade80' if _a2_ok else '#f87171'};">{_a2_display}</span>
        </span>
        <span>
            <span style="color: #475569;">EO GAP </span>
            <span style="color: {_hit_color};">{_gap_val:.1f}pp</span>
        </span>
        <span>
            <span style="color: #475569;">AUDIT COST </span>
            <span style="color: {_cost_color};">${_cost_val:.0f}K/yr</span>
        </span>
        <span>
            <span style="color: #475569;">REG RISK </span>
            <span style="color: {_risk_color}; font-weight: 700;">{_risk_val.upper()}</span>
        </span>
        <span>
            <span style="color: #475569;">DISPARATE IMPACT </span>
            <span style="color: {_hit_color};">{'YES' if _hit_val else 'No'}</span>
        </span>
        <span>
            <span style="color: #475569;">LEDGER </span>
            <span style="color: #4ade80;">ch15 saved</span>
        </span>
    </div>
    """)
    return
# Entry point: running this file directly serves the marimo app
# (`python lab_15_responsible_engr.py` or `marimo run ...`).
if __name__ == "__main__":
    app.run()