# Mirror of https://github.com/harvard-edge/cs249r_book.git
# (synced 2026-05-07 18:18:42 -05:00; 215 lines, 8.4 KiB, Python).
#
# NOTE: the gemini CLI silently overrides --yolo to the "default" approval
# mode when its cwd is not in the trusted-folders list (e.g. a
# tempfile.gettempdir() scratch dir). The override is logged to stderr as
# 'Approval mode overridden to "default" because the current folder is not
# trusted' and the call exits 55. --skip-trust opts out of that gate.
# Verified 2026-05-04 in /tmp/gemini-trust-test.
"""Shared infrastructure for Gemini-judge gates across vault-cli scripts.
|
||
|
||
Extracted to keep the gate constants and the Gemini-call wrapper in one
|
||
place rather than duplicated across:
|
||
|
||
- validate_drafts.py (single-draft gate flow)
|
||
- audit_chains_with_gemini.py (chain audit)
|
||
- audit_math.py (math spot-check)
|
||
- audit_corpus_batched.py (full-corpus batched audit; CORPUS_HARDENING_PLAN.md Phase 3)
|
||
|
||
What's exported:
|
||
|
||
- GEMINI_MODEL — pinned model id ("gemini-3.1-pro-preview")
|
||
- COMMON_MISTAKE_MARKERS — bold-marker tuple for the Pitfall/Rationale/Consequence convention
|
||
- NAPKIN_MATH_MARKERS — bold-marker tuple for the Assumptions/Calculations/Conclusion convention
|
||
- FAILURE_MODE_TAXONOMY — prose block enumerating the 4 coherence-failure modes; embed in any prompt that asks Gemini to judge coherence
|
||
- call_gemini_judge() — subprocess wrapper around the gemini CLI, with strict-JSON parsing and lock-guarded stderr
|
||
- strip_fences() — small helper for response cleanup
|
||
- gate_format() — the regex-only format-compliance gate (no LLM call)
|
||
|
||
Single-question judge functions (gate_level_fit, gate_coherence,
|
||
gate_bridge, gate_math) live in their owning scripts because their
|
||
prompts are coupled to the script's flow (single vs. batched). The
|
||
COMMON shape — marker constants, Gemini call, format regex, failure
|
||
taxonomy text — is what's centralized here.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import threading
|
||
from pathlib import Path
|
||
|
||
GEMINI_MODEL = "gemini-3.1-pro-preview"
|
||
|
||
# The gemini CLI in --yolo mode occasionally writes scratch files to its
|
||
# CWD (prompt dumps, partial JSON outputs). When invoked from the repo
|
||
# root those land alongside the worktree and pollute `git status`.
|
||
# Use a process-wide temp dir as the subprocess CWD so scratch files
|
||
# stay isolated from the working tree.
|
||
_GEMINI_SCRATCH_DIR = Path(tempfile.gettempdir()) / "vault_audit_gemini_scratch"
|
||
_GEMINI_SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Markup-convention markers required by the format-compliance gate.
# Mirrored in vault-cli/src/vault_cli/commands/authoring.py's COMMON_MISTAKE_TEMPLATE
# and NAPKIN_MATH_TEMPLATE — and tested by tests/test_authoring_scaffold.py
# so a marker rename in one place breaks the test loudly. See
# interviews/vault/AUTHORING.md "Markup conventions" for the rationale.
COMMON_MISTAKE_MARKERS: tuple[str, ...] = (
    "**The Pitfall:**",
    "**The Rationale:**",
    "**The Consequence:**",
)
NAPKIN_MATH_MARKERS: tuple[str, ...] = (
    # Prefix-match: "Assumptions" accepts both "Assumptions:" and
    # "Assumptions & Constraints:". Same for "Conclusion".
    "**Assumptions",
    "**Calculations:**",
    "**Conclusion",
)


# Failure-mode taxonomy used by every coherence-judging prompt. Pasted
# verbatim into the prompt so the judge applies the same rubric whether
# called from validate_drafts (per-draft) or audit_corpus_batched
# (per-batch). Updates here propagate to every judge.
FAILURE_MODE_TAXONOMY = """FAILURE MODES (REJECT verdict=no on any of these — patterns from
the 2026-05-02 audit that previous coherence judges let through):

1. PHYSICAL ABSURDITY: numbers in the scenario violate real-world
   hardware/software bounds. Examples that should be REJECTED:
   - Mobile/edge NPU wake-up time > ~50ms (real NPUs wake in
     single-digit ms; 0.5s wake-up is fiction)
   - Power figures inconsistent with the device class (e.g., 50W
     for a "smartphone NPU"; 0.05W for a "datacenter accelerator")
   - Latency or throughput figures off by >5× from realistic for
     the named hardware
   - Memory or model-size claims that don't fit the device's
     capacity envelope
   - Duty-cycling patterns that defeat the use-case

2. VENDOR-NAME FABRICATION: hardware, accelerators, frameworks, or
   benchmarks named in the scenario that don't actually exist or are
   misattributed (e.g., "Coral Edge TPU XL" — there's no XL variant).
   If unsure, treat ambiguous-but-plausible as ok; only flag clearly
   invented names.

3. SCENARIO/QUESTION/SOLUTION MISMATCH:
   - Question doesn't logically follow from the scenario
   - realistic_solution doesn't actually answer the question (e.g.,
     restates the question, gives generic advice, or answers a
     related-but-different question)
   - Numbers contradict across the three fields

4. ARITHMETIC ERRORS in napkin_math: the calculations don't add up,
   unit conversions are wrong, or the conclusion doesn't follow from
   the calculations.
"""

# Lock to keep concurrent stderr from interleaving across worker threads.
_print_lock = threading.Lock()


def strip_fences(text: str) -> str:
    """Trim leading/trailing whitespace and strip ```...``` or ```json``` fences.

    The gemini CLI sometimes wraps JSON in fences despite "no fences"
    instruction; this helper makes downstream JSON parsing robust.

    Fix: after removing the backtick fence, the newline padding the fence
    leaves behind is trimmed too — previously the result could still begin
    or end with a newline, contradicting the "trim" promise above. The
    "json" language tag is only stripped when a fence was actually present,
    so plain text that happens to start with "json" is left untouched.
    """
    out = text.strip()
    if out.startswith("```"):
        # Drop the fence backticks, then the newlines they leave behind.
        out = out.strip("`").strip()
        # A ```json fence leaves its language tag as a prefix; drop it.
        if out.startswith("json"):
            out = out[4:].lstrip()
    return out


def call_gemini_judge(prompt: str, *, timeout: int = 600) -> dict | None:
    """Invoke the gemini CLI and parse the strict-JSON response.

    Default timeout is 600s. The 2026-05-03 canary run on the global
    track measured average call latency ~167s for 30-question batches
    (52-72K char prompts), with the 72K-char calls occasionally hitting
    the previous 240s ceiling and timing out. 600s gives ~3× the typical
    call time and still triggers fast on a genuinely-stuck call.

    Returns a dict on success, or None on:
    - subprocess timeout (logged to stderr, like the other failure paths)
    - non-zero exit with no parseable JSON
    - JSONDecodeError on the extracted brace-delimited substring

    The parser is lenient: it strips fences, then extracts the substring
    between the first '{' and the last '}'. This handles common
    prose-leakage patterns where the model emits "Here is the JSON:"
    before the actual object.
    """
    try:
        result = subprocess.run(
            # --skip-trust required: cwd is a temp scratch dir Gemini doesn't
            # treat as a trusted workspace; without --skip-trust the CLI
            # silently overrides --yolo to "default" and exits 55 (2026-05-04).
            ["gemini", "-m", GEMINI_MODEL, "-p", prompt, "--yolo", "--skip-trust"],
            capture_output=True, text=True, timeout=timeout,
            cwd=str(_GEMINI_SCRATCH_DIR),
        )
    except subprocess.TimeoutExpired:
        # Previously silent: log so a hung batch is distinguishable from a
        # parse failure in the worker output.
        with _print_lock:
            print(f"  gemini timed out after {timeout}s", file=sys.stderr)
        return None
    out = strip_fences(result.stdout or "")
    i = out.find("{")
    j = out.rfind("}")
    # j < i also rejects the malformed "}...{" ordering, not just absent
    # braces (j == -1 implies j < i whenever i >= 0).
    if i == -1 or j < i:
        if result.returncode != 0:
            with _print_lock:
                print(f"  gemini exit {result.returncode}: "
                      f"{(result.stderr or '')[:200]}", file=sys.stderr)
        return None
    try:
        return json.loads(out[i:j + 1])
    except json.JSONDecodeError as e:
        with _print_lock:
            print(f"  JSON parse failed: {e}", file=sys.stderr)
        return None


def gate_format(question: dict) -> dict:
    """Free regex-only format-compliance gate.

    Returns a result dict with shape:
    {
        "verdict": "pass" | "fail",
        "issues": [...],
        "common_mistake_present": bool,
        "napkin_math_present": bool,
    }

    Both common_mistake and napkin_math are technically optional in the
    schema; this gate only flags PRESENT-AND-MALFORMED, never absent.
    CORPUS_HARDENING_PLAN.md Phase 6 lifts this into vault check
    --strict's structural tier.
    """
    details = question.get("details") or {}
    issues: list[str] = []
    present: dict[str, bool] = {}

    # Identical check for both optional fields: when the field has
    # content, every convention marker must appear somewhere in it.
    for field, markers in (
        ("common_mistake", COMMON_MISTAKE_MARKERS),
        ("napkin_math", NAPKIN_MATH_MARKERS),
    ):
        text = (details.get(field) or "").strip()
        present[field] = bool(text)
        if not text:
            continue  # absent is fine — only present-and-malformed fails
        absent = [m for m in markers if m not in text]
        if absent:
            issues.append(f"{field} missing {absent!r}")

    return {
        "verdict": "fail" if issues else "pass",
        "issues": issues,
        "common_mistake_present": present["common_mistake"],
        "napkin_math_present": present["napkin_math"],
    }


# Public API of this module — judge scripts import exactly these names.
__all__ = [
    "GEMINI_MODEL",
    "COMMON_MISTAKE_MARKERS",
    "NAPKIN_MATH_MARKERS",
    "FAILURE_MODE_TAXONOMY",
    "call_gemini_judge",
    "strip_fences",
    "gate_format",
]