# Mirror of https://github.com/harvard-edge/cs249r_book.git
# (synced 2026-05-07 18:18:42 -05:00; 215 lines, 8.4 KiB, Python).
#
# NOTE: the gemini CLI silently overrides --yolo to the "default" approval
# mode when its cwd is not in the trusted-folders list (e.g. a
# tempfile.gettempdir() scratch dir). The override is logged to stderr as
# 'Approval mode overridden to "default" because the current folder is not
# trusted' and the call exits 55. --skip-trust opts out of that gate.
# Verified 2026-05-04 in /tmp/gemini-trust-test.
"""Shared infrastructure for Gemini-judge gates across vault-cli scripts.
|
||
|
||
Extracted to keep the gate constants and the Gemini-call wrapper in one
|
||
place rather than duplicated across:
|
||
|
||
- validate_drafts.py (single-draft gate flow)
|
||
- audit_chains_with_gemini.py (chain audit)
|
||
- audit_math.py (math spot-check)
|
||
- audit_corpus_batched.py (full-corpus batched audit; CORPUS_HARDENING_PLAN.md Phase 3)
|
||
|
||
What's exported:
|
||
|
||
- GEMINI_MODEL — pinned model id ("gemini-3.1-pro-preview")
|
||
- COMMON_MISTAKE_MARKERS — bold-marker tuple for the Pitfall/Rationale/Consequence convention
|
||
- NAPKIN_MATH_MARKERS — bold-marker tuple for the Assumptions/Calculations/Conclusion convention
|
||
- FAILURE_MODE_TAXONOMY — prose block enumerating the 4 coherence-failure modes; embed in any prompt that asks Gemini to judge coherence
|
||
- call_gemini_judge() — subprocess wrapper around the gemini CLI, with strict-JSON parsing and lock-guarded stderr
|
||
- strip_fences() — small helper for response cleanup
|
||
- gate_format() — the regex-only format-compliance gate (no LLM call)
|
||
|
||
Single-question judge functions (gate_level_fit, gate_coherence,
|
||
gate_bridge, gate_math) live in their owning scripts because their
|
||
prompts are coupled to the script's flow (single vs. batched). The
|
||
COMMON shape — marker constants, Gemini call, format regex, failure
|
||
taxonomy text — is what's centralized here.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import json
|
||
import subprocess
|
||
import sys
|
||
import tempfile
|
||
import threading
|
||
from pathlib import Path
|
||
|
||
GEMINI_MODEL = "gemini-3.1-pro-preview"
|
||
|
||
# The gemini CLI in --yolo mode occasionally writes scratch files to its
|
||
# CWD (prompt dumps, partial JSON outputs). When invoked from the repo
|
||
# root those land alongside the worktree and pollute `git status`.
|
||
# Use a process-wide temp dir as the subprocess CWD so scratch files
|
||
# stay isolated from the working tree.
|
||
_GEMINI_SCRATCH_DIR = Path(tempfile.gettempdir()) / "vault_audit_gemini_scratch"
|
||
_GEMINI_SCRATCH_DIR.mkdir(parents=True, exist_ok=True)
|
||
|
||
# Markup-convention markers required by the format-compliance gate.
# Mirrored in vault-cli/src/vault_cli/commands/authoring.py's COMMON_MISTAKE_TEMPLATE
# and NAPKIN_MATH_TEMPLATE — and tested by tests/test_authoring_scaffold.py
# so a marker rename in one place breaks the test loudly. See
# interviews/vault/AUTHORING.md "Markup conventions" for the rationale.
COMMON_MISTAKE_MARKERS: tuple[str, ...] = (
    "**The Pitfall:**",
    "**The Rationale:**",
    "**The Consequence:**",
)
NAPKIN_MATH_MARKERS: tuple[str, ...] = (
    # Prefix-match: "Assumptions" accepts both "Assumptions:" and
    # "Assumptions & Constraints:". Same for "Conclusion".
    "**Assumptions",
    "**Calculations:**",
    "**Conclusion",
)


# Failure-mode taxonomy used by every coherence-judging prompt. Pasted
# verbatim into the prompt so the judge applies the same rubric whether
# called from validate_drafts (per-draft) or audit_corpus_batched
# (per-batch). Updates here propagate to every judge.
FAILURE_MODE_TAXONOMY = """FAILURE MODES (REJECT verdict=no on any of these — patterns from
the 2026-05-02 audit that previous coherence judges let through):

1. PHYSICAL ABSURDITY: numbers in the scenario violate real-world
   hardware/software bounds. Examples that should be REJECTED:
   - Mobile/edge NPU wake-up time > ~50ms (real NPUs wake in
     single-digit ms; 0.5s wake-up is fiction)
   - Power figures inconsistent with the device class (e.g., 50W
     for a "smartphone NPU"; 0.05W for a "datacenter accelerator")
   - Latency or throughput figures off by >5× from realistic for
     the named hardware
   - Memory or model-size claims that don't fit the device's
     capacity envelope
   - Duty-cycling patterns that defeat the use-case

2. VENDOR-NAME FABRICATION: hardware, accelerators, frameworks, or
   benchmarks named in the scenario that don't actually exist or are
   misattributed (e.g., "Coral Edge TPU XL" — there's no XL variant).
   If unsure, treat ambiguous-but-plausible as ok; only flag clearly
   invented names.

3. SCENARIO/QUESTION/SOLUTION MISMATCH:
   - Question doesn't logically follow from the scenario
   - realistic_solution doesn't actually answer the question (e.g.,
     restates the question, gives generic advice, or answers a
     related-but-different question)
   - Numbers contradict across the three fields

4. ARITHMETIC ERRORS in napkin_math: the calculations don't add up,
   unit conversions are wrong, or the conclusion doesn't follow from
   the calculations.
"""

# Lock to keep concurrent stderr from interleaving across worker threads.
_print_lock = threading.Lock()


def strip_fences(text: str) -> str:
    """Trim leading/trailing whitespace and strip ```...``` or ```json``` fences.

    The gemini CLI sometimes wraps JSON in fences despite "no fences"
    instruction; this helper makes downstream JSON parsing robust.

    Fix: after removing the backtick fence, the newline padding the fence
    leaves behind is trimmed too — previously the result could still begin
    or end with a newline, contradicting the "trim" promise above. The
    "json" language tag is only stripped when a fence was actually present,
    so plain text that happens to start with "json" is left untouched.
    """
    out = text.strip()
    if out.startswith("```"):
        # Drop the fence backticks, then the newlines they leave behind.
        out = out.strip("`").strip()
        # A ```json fence leaves its language tag as a prefix; drop it.
        if out.startswith("json"):
            out = out[4:].lstrip()
    return out


def call_gemini_judge(prompt: str, *, timeout: int = 600) -> dict | None:
    """Invoke the gemini CLI and parse the strict-JSON response.

    Default timeout is 600s. The 2026-05-03 canary run on the global
    track measured average call latency ~167s for 30-question batches
    (52-72K char prompts), with the 72K-char calls occasionally hitting
    the previous 240s ceiling and timing out. 600s gives ~3× the typical
    call time and still triggers fast on a genuinely-stuck call.

    Returns a dict on success, or None on:
    - subprocess timeout (logged to stderr, like the other failure paths)
    - non-zero exit with no parseable JSON
    - JSONDecodeError on the extracted brace-delimited substring

    The parser is lenient: it strips fences, then extracts the substring
    between the first '{' and the last '}'. This handles common
    prose-leakage patterns where the model emits "Here is the JSON:"
    before the actual object.
    """
    try:
        result = subprocess.run(
            # --skip-trust required: cwd is a temp scratch dir Gemini doesn't
            # treat as a trusted workspace; without --skip-trust the CLI
            # silently overrides --yolo to "default" and exits 55 (2026-05-04).
            ["gemini", "-m", GEMINI_MODEL, "-p", prompt, "--yolo", "--skip-trust"],
            capture_output=True, text=True, timeout=timeout,
            cwd=str(_GEMINI_SCRATCH_DIR),
        )
    except subprocess.TimeoutExpired:
        # Previously silent: log so a hung batch is distinguishable from a
        # parse failure in the worker output.
        with _print_lock:
            print(f"  gemini timed out after {timeout}s", file=sys.stderr)
        return None
    out = strip_fences(result.stdout or "")
    i = out.find("{")
    j = out.rfind("}")
    # j < i also rejects the malformed "}...{" ordering, not just absent
    # braces (j == -1 implies j < i whenever i >= 0).
    if i == -1 or j < i:
        if result.returncode != 0:
            with _print_lock:
                print(f"  gemini exit {result.returncode}: "
                      f"{(result.stderr or '')[:200]}", file=sys.stderr)
        return None
    try:
        return json.loads(out[i:j + 1])
    except json.JSONDecodeError as e:
        with _print_lock:
            print(f"  JSON parse failed: {e}", file=sys.stderr)
        return None


def gate_format(question: dict) -> dict:
    """Free regex-only format-compliance gate.

    Returns a result dict with shape:
    {
        "verdict": "pass" | "fail",
        "issues": [...],
        "common_mistake_present": bool,
        "napkin_math_present": bool,
    }

    Both common_mistake and napkin_math are technically optional in the
    schema; this gate only flags PRESENT-AND-MALFORMED, never absent.
    CORPUS_HARDENING_PLAN.md Phase 6 lifts this into vault check
    --strict's structural tier.
    """
    details = question.get("details") or {}
    issues: list[str] = []
    present: dict[str, bool] = {}

    # Identical check for both optional fields: when the field has
    # content, every convention marker must appear somewhere in it.
    for field, markers in (
        ("common_mistake", COMMON_MISTAKE_MARKERS),
        ("napkin_math", NAPKIN_MATH_MARKERS),
    ):
        text = (details.get(field) or "").strip()
        present[field] = bool(text)
        if not text:
            continue  # absent is fine — only present-and-malformed fails
        absent = [m for m in markers if m not in text]
        if absent:
            issues.append(f"{field} missing {absent!r}")

    return {
        "verdict": "fail" if issues else "pass",
        "issues": issues,
        "common_mistake_present": present["common_mistake"],
        "napkin_math_present": present["napkin_math"],
    }


# Public API of this module — judge scripts import exactly these names.
__all__ = [
    "GEMINI_MODEL",
    "COMMON_MISTAKE_MARKERS",
    "NAPKIN_MATH_MARKERS",
    "FAILURE_MODE_TAXONOMY",
    "call_gemini_judge",
    "strip_fences",
    "gate_format",
]