mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 17:49:07 -05:00
Sync the yaml-audit branch with the latest dev work since the previous sync (5c5af75ed). Brings in 73 commits including: - CI security fixes: postcss XSS bump, uuid bounds bump, codeql paths-ignore for vendored bundles, read-only token on staffml-validate-vault workflow - kits/ dark mode polish: code-block readability, dropdown contrast - vault-cli/: pre-commit ruff hook + 20 ruff fixes, all-contributors auto-credit workflow change to pull_request_target - dev's earlier merge of yaml-audit (836d481b5) carrying the pre-trailer-strip Phase 1/2/3 history; this merge harmonises that with the current trailer-clean yaml-audit tip - misc bug fixes (tinytorch perceptron seed, infra workflows, socratiq vite dev injector) Conflicts resolved (if any) preserve the yaml-audit-side authoritative state for vault/* files (we own those) and the dev-side authoritative state for .github/workflows/* and other shared infrastructure. # Conflicts: # .github/workflows/all-contributors-auto-credit.yml # .github/workflows/staffml-preview-dev.yml # interviews/staffml/src/data/corpus-summary.json # interviews/staffml/src/data/vault-manifest.json # interviews/staffml/tests/chain-and-vault-smoke.mjs # interviews/vault-cli/README.md # interviews/vault-cli/docs/CHAIN_ROADMAP.md # interviews/vault-cli/scripts/build_chains_with_gemini.py # interviews/vault-cli/scripts/generate_question_for_gap.py # interviews/vault-cli/scripts/merge_chain_passes.py # interviews/vault-cli/scripts/validate_drafts.py # interviews/vault-cli/src/vault_cli/legacy_export.py # interviews/vault-cli/tests/test_chain_validation.py # interviews/vault/.gitignore # interviews/vault/ARCHITECTURE.md # interviews/vault/chains.json # interviews/vault/id-registry.yaml # interviews/vault/questions/edge/optimization/edge-2536.yaml # interviews/vault/questions/mobile/deployment/mobile-2147.yaml # tinytorch/src/03_layers/03_layers.py
317 lines
11 KiB
Python
Executable File
317 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""Independent math verifier for question napkin_math blocks.
|
|
|
|
Standalone tool — runs ONE focused Gemini call per question to re-derive
|
|
the napkin_math arithmetic from scratch, then compares against what's
|
|
written. Catches calculation errors, unit-conversion mistakes, and
|
|
conclusions that don't follow from the calculations.
|
|
|
|
Use cases:
|
|
1. Final gate on Phase 3-authored drafts before promotion
|
|
(validate_drafts.py's coherence gate covers this generally; this
|
|
gate is focused and stricter on the math specifically).
|
|
2. Retroactive audit of any subset of the published corpus.
|
|
|
|
Usage:
|
|
# Verify all .yaml.draft files (post-generation, pre-promotion):
|
|
python3 verify_math.py --drafts-only
|
|
|
|
# Verify specific files:
|
|
python3 verify_math.py --files interviews/vault/questions/edge/latency/edge-2537.yaml ...
|
|
|
|
# Verify a sample of published questions in a track:
|
|
python3 verify_math.py --sample-track edge --sample-size 50
|
|
|
|
Parallelism is real: --workers N runs N concurrent Gemini calls. Default
|
|
is 4 (gentle on RPM). Cap at 8 to stay under typical rate limits.
|
|
|
|
Output:
|
|
interviews/vault/_pipeline/math-verification.json — per-question rows
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import random
|
|
import subprocess
|
|
import sys
|
|
import threading
|
|
import time
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import yaml
|
|
|
|
# Repository root: three directory levels above the directory that holds this
# script (parents[0] is the script's own directory).
REPO_ROOT = Path(__file__).resolve().parents[3]
# Vault tree inside the repository.
VAULT_DIR = REPO_ROOT / "interviews" / "vault"
# Searched recursively for question files by discover_targets().
QUESTIONS_DIR = VAULT_DIR / "questions"
# Directory that receives generated pipeline artefacts.
PIPELINE_DIR = VAULT_DIR / "_pipeline"
# Default value of --output (the scorecard JSON).
DEFAULT_OUTPUT = PIPELINE_DIR / "math-verification.json"

# Model name passed to the gemini CLI via -m; also recorded in the scorecard.
GEMINI_MODEL = "gemini-3.1-pro-preview"
# Default value of --workers.
DEFAULT_WORKERS = 4
|
|
|
|
|
|
PROMPT_TEMPLATE = """You are independently verifying the napkin_math block of an
|
|
ML systems interview question. Re-derive every calculation from the stated
|
|
assumptions; compare against what the question actually wrote.
|
|
|
|
Return STRICT JSON, no prose, no fences:
|
|
|
|
{{
|
|
"arithmetic_correct": "yes" | "no" | "no_math",
|
|
"unit_conversions_correct": "yes" | "no" | "no_conversions",
|
|
"conclusion_follows": "yes" | "no",
|
|
"errors": ["<specific issue>", ...],
|
|
"rationale": "<one or two sentences>"
|
|
}}
|
|
|
|
GROUND RULES:
|
|
- "arithmetic_correct=no_math" only if napkin_math is empty.
|
|
- Be concrete in errors[]: "claims X = Y but X = Z" — quote the
|
|
specific line and the correct value.
|
|
- Tolerate small rounding (≤ 5%); flag anything bigger.
|
|
- Don't penalize the question for being hard; only flag actual
|
|
arithmetic / unit / logic errors.
|
|
|
|
QUESTION:
|
|
id: {qid}
|
|
level: {level}
|
|
track: {track}
|
|
topic: {topic}
|
|
|
|
scenario:
|
|
{scenario}
|
|
|
|
question:
|
|
{question}
|
|
|
|
realistic_solution:
|
|
{solution}
|
|
|
|
napkin_math:
|
|
{napkin}
|
|
"""
|
|
|
|
|
|
# ─── i/o helpers ──────────────────────────────────────────────────────────
|
|
|
|
|
|
def load_yaml(path: Path) -> dict | None:
|
|
try:
|
|
with path.open(encoding="utf-8") as f:
|
|
d = yaml.safe_load(f)
|
|
except Exception:
|
|
return None
|
|
return d if isinstance(d, dict) else None
|
|
|
|
|
|
def discover_targets(args: argparse.Namespace) -> list[Path]:
    """Turn the parsed CLI selection flags into the list of files to verify.

    Args:
        args: Parsed arguments carrying ``files``, ``drafts_only``,
            ``sample_track``, ``sample_size`` and ``seed``.

    Returns:
        * ``files`` set: the given paths that exist, in the order given.
          Missing paths are reported on stderr and dropped.
        * ``drafts_only`` set: every ``*.yaml.draft`` under ``QUESTIONS_DIR``,
          sorted.
        * ``sample_track`` set: at most ``sample_size`` ``*.yaml`` files that
          sit in a directory named after the track somewhere below
          ``QUESTIONS_DIR``, picked by a shuffle seeded with ``seed``.
        * otherwise: an empty list.
    """
    if args.files:
        present: list[Path] = []
        for candidate in args.files:
            if candidate.exists():
                present.append(candidate)
            else:
                # A mistyped path would otherwise vanish without a trace.
                print(f"warning: skipping missing file {candidate}", file=sys.stderr)
        return present
    if args.drafts_only:
        return sorted(QUESTIONS_DIR.rglob("*.yaml.draft"))
    if args.sample_track:
        # Compare against directory components *below* QUESTIONS_DIR rather
        # than substring-matching "/track/" on the whole path: that also
        # matched directories above the questions tree and never matched
        # with Windows path separators.
        # Sorting first makes the seeded shuffle reproducible, because rglob()
        # yields entries in filesystem order.
        pool = sorted(
            p for p in QUESTIONS_DIR.rglob("*.yaml")
            if args.sample_track in p.relative_to(QUESTIONS_DIR).parts[:-1]
        )
        random.Random(args.seed).shuffle(pool)
        # Clamp so a negative size yields nothing instead of slicing from the end.
        return pool[: max(args.sample_size, 0)]
    return []
|
|
|
|
|
|
def has_napkin_math(body: dict) -> bool:
    """Report whether ``body["details"]["napkin_math"]`` holds non-blank text.

    A missing or falsy ``details`` section, or a missing, falsy or
    whitespace-only ``napkin_math`` value, all count as "no math".
    """
    section = body.get("details")
    if not section:
        return False
    text = section.get("napkin_math")
    if not text:
        return False
    return bool(text.strip())
|
|
|
|
|
|
def indent(text: str | None, level: int = 4) -> str:
|
|
if not text:
|
|
return " " * level + "(empty)"
|
|
pad = " " * level
|
|
return "\n".join(pad + line for line in text.splitlines())
|
|
|
|
|
|
# ─── Gemini call ──────────────────────────────────────────────────────────
|
|
|
|
|
|
# Serialises console output across threads: call_gemini() holds it while
# reporting failures on stderr from worker threads, and main() holds it around
# each progress line on stdout, so concurrent messages are not interleaved.
_print_lock = threading.Lock()
|
|
|
|
|
|
def call_gemini(prompt: str, timeout: int = 240) -> dict | None:
    """Run the ``gemini`` CLI on *prompt* and return the JSON object it prints.

    Args:
        prompt: Full prompt text, handed to the CLI through ``-p``.
        timeout: Seconds to wait for the subprocess before giving up.

    Returns:
        The decoded JSON object, or ``None`` when the CLI cannot be started,
        times out, prints no ``{...}`` span, or prints malformed JSON. Every
        such failure is reported on stderr; none of them raises.
    """
    # NOTE(review): --yolo presumably auto-approves tool actions in the gemini
    # CLI, and the prompt embeds text read from question files — confirm that
    # this is acceptable for the inputs this script is run on.
    try:
        result = subprocess.run(
            ["gemini", "-m", GEMINI_MODEL, "-p", prompt, "--yolo"],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        with _print_lock:
            print(f" gemini timed out after {timeout}s", file=sys.stderr)
        return None
    except OSError as e:
        # Missing or non-executable binary: report it instead of letting the
        # exception escape from a worker thread.
        with _print_lock:
            print(f" could not run gemini: {e}", file=sys.stderr)
        return None

    out = (result.stdout or "").strip()
    # Take the span from the first "{" to the last "}". That already skips any
    # Markdown code fence or "json" tag around the object, so no separate
    # fence-stripping step is needed.
    start = out.find("{")
    end = out.rfind("}")
    if start == -1 or end == -1:
        with _print_lock:
            if result.returncode != 0:
                print(f" gemini exit {result.returncode}: "
                      f"{(result.stderr or '')[:200]}", file=sys.stderr)
            else:
                print(" gemini output contained no JSON object", file=sys.stderr)
        return None
    try:
        return json.loads(out[start:end + 1])
    except json.JSONDecodeError as e:
        with _print_lock:
            print(f" JSON parse failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
# ─── core verification ────────────────────────────────────────────────────
|
|
|
|
|
|
def _as_label(value: Any) -> str:
    """Normalise one judge verdict field to a lowercase text label.

    The judge is asked for strings, but JSON booleans or other scalars must
    not crash the run: ``True``/``False`` map to ``"yes"``/``"no"``, ``None``
    to ``""``, and anything else to its stripped, lowercased text form.
    """
    if isinstance(value, bool):
        return "yes" if value else "no"
    if value is None:
        return ""
    return str(value).strip().lower()


def verify_one(path: Path) -> dict[str, Any]:
    """Verify the napkin math of the question stored at *path*.

    Args:
        path: Question YAML file (published or draft).

    Returns:
        One scorecard row. ``verdict`` is one of:

        * ``"skip"``  - the file could not be loaded or has no napkin_math;
        * ``"error"`` - the judge produced no usable response;
        * ``"pass"`` / ``"fail"`` - judged; the row then also carries the
          question's level/track/topic, the three normalised judge labels,
          the ``errors`` list and the ``rationale``.
    """
    body = load_yaml(path)
    if not body:
        return {"path": str(path), "verdict": "skip", "reason": "could not load"}
    # Always a string, so rows can be formatted and sorted uniformly.
    qid = str(body.get("id") or "?")

    if not has_napkin_math(body):
        return {"path": str(path), "qid": qid, "verdict": "skip",
                "reason": "no napkin_math present"}

    details = body.get("details") or {}
    prompt = PROMPT_TEMPLATE.format(
        qid=qid,
        level=body.get("level"),
        track=body.get("track"),
        topic=body.get("topic"),
        scenario=indent(body.get("scenario")),
        question=indent(body.get("question")),
        solution=indent(details.get("realistic_solution")),
        napkin=indent(details.get("napkin_math")),
    )
    resp = call_gemini(prompt)
    if resp is None:
        return {"path": str(path), "qid": qid, "verdict": "error",
                "reason": "no judge response"}

    arith = _as_label(resp.get("arithmetic_correct"))
    units = _as_label(resp.get("unit_conversions_correct"))
    concl = _as_label(resp.get("conclusion_follows"))

    # Accept a lone string or arbitrary items where a list of strings was
    # requested; consumers of the row slice and print these values.
    raw_errors = resp.get("errors") or []
    if not isinstance(raw_errors, list):
        raw_errors = [raw_errors]
    errors = [str(item) for item in raw_errors]

    # Pass iff arithmetic is correct AND (no unit conversions OR conversions
    # correct) AND conclusion follows. Empty-math drafts were already scored
    # "skip" above. Any label outside the expected vocabulary counts as an issue.
    has_arith_issue = arith not in ("yes", "no_math")
    has_unit_issue = units not in ("yes", "no_conversions")
    has_concl_issue = concl != "yes"
    verdict = "fail" if (has_arith_issue or has_unit_issue or has_concl_issue) else "pass"

    return {
        "path": str(path),
        "qid": qid,
        "level": body.get("level"),
        "track": body.get("track"),
        "topic": body.get("topic"),
        "verdict": verdict,
        "arithmetic_correct": arith,
        "unit_conversions_correct": units,
        "conclusion_follows": concl,
        "errors": errors,
        "rationale": resp.get("rationale", ""),
    }
|
|
|
|
|
|
# ─── runner ───────────────────────────────────────────────────────────────
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI parser: one required target selector plus tuning options."""
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Raw formatting keeps the docstring's usage examples on their own
        # lines instead of re-wrapping them into one paragraph in --help.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    src = ap.add_mutually_exclusive_group(required=True)
    src.add_argument("--drafts-only", action="store_true",
                     help="verify all *.yaml.draft files in the questions tree")
    src.add_argument("--files", nargs="+", type=Path, default=None,
                     help="explicit YAML paths")
    src.add_argument("--sample-track", choices=["cloud", "edge", "mobile", "tinyml", "global"],
                     help="random sample from a track (use --sample-size)")
    ap.add_argument("--sample-size", type=int, default=30,
                    help="sample size for --sample-track (default 30)")
    ap.add_argument("--seed", type=int, default=42,
                    help="RNG seed for sampling (default 42)")
    ap.add_argument("--workers", type=int, default=DEFAULT_WORKERS,
                    help=f"concurrent Gemini calls (default {DEFAULT_WORKERS}, "
                         f"cap 8 to stay under typical RPM limits)")
    ap.add_argument("--output", type=Path, default=DEFAULT_OUTPUT,
                    help=f"scorecard JSON (default {DEFAULT_OUTPUT})")
    return ap


def _progress_suffix(row: dict[str, Any]) -> str:
    """Return the trailing detail shown after a row's verdict on the progress line."""
    verdict = row.get("verdict")
    if verdict == "fail":
        errs = row.get("errors") or []
        first = str(errs[0]) if errs else ""
        return f" [{len(errs)} error(s)] {first[:80]}"
    if verdict in ("skip", "error"):
        return f" ({row.get('reason')})"
    return ""


def main() -> int:
    """Verify the selected questions concurrently and write the scorecard JSON.

    Parses the command line, clamps ``--workers`` to 1..8, runs ``verify_one``
    on every target in a thread pool while printing one progress line per
    finished question, then writes the scorecard to ``--output`` and prints a
    summary.

    Returns:
        ``0`` in every case that reaches the end, including runs with failed
        or errored questions; outcomes are reported through the scorecard and
        the summary line, not through the exit status.
    """
    args = _build_parser().parse_args()

    if args.workers < 1:
        args.workers = 1
    if args.workers > 8:
        print("warning: workers > 8 may hit Gemini RPM limits; capping at 8",
              file=sys.stderr)
        args.workers = 8

    targets = discover_targets(args)
    if not targets:
        print("no targets found")
        return 0

    print(f"verifying {len(targets)} question(s) with {args.workers} concurrent workers")

    results: list[dict[str, Any]] = []
    started = time.monotonic()
    with ThreadPoolExecutor(max_workers=args.workers) as pool:
        futures = {pool.submit(verify_one, p): p for p in targets}
        for i, fut in enumerate(as_completed(futures), start=1):
            try:
                row = fut.result()
            except Exception as exc:
                # One malformed question must not abort the batch and discard
                # every result collected so far; record it as an error row.
                row = {
                    "path": str(futures[fut]),
                    "verdict": "error",
                    "reason": f"verifier crashed: {type(exc).__name__}: {exc}",
                }
            results.append(row)
            # str() guards the "s" format specs against non-string values.
            v = str(row.get("verdict", "?"))
            qid = str(row.get("qid", "?"))
            with _print_lock:
                print(f" [{i:3d}/{len(targets)}] {qid:14s} {v:6s}{_progress_suffix(row)}")

    elapsed = time.monotonic() - started

    # Tally once; the scorecard and the console summary share these numbers.
    tally = {
        name: sum(1 for r in results if r.get("verdict") == name)
        for name in ("pass", "fail", "error", "skip")
    }

    args.output.parent.mkdir(parents=True, exist_ok=True)
    args.output.write_text(json.dumps({
        "generated_at": datetime.now(UTC).isoformat(timespec="seconds"),
        "model": GEMINI_MODEL,
        "workers": args.workers,
        "elapsed_seconds": round(elapsed, 1),
        "total": len(results),
        "passes": tally["pass"],
        "fails": tally["fail"],
        "errors": tally["error"],
        "skips": tally["skip"],
        # Rows without a qid sort first; str() keeps mixed id types comparable.
        "rows": sorted(results, key=lambda r: str(r.get("qid") or "")),
    }, indent=2) + "\n", encoding="utf-8")

    print(f"\nelapsed: {elapsed:.1f}s pass={tally['pass']} fail={tally['fail']} "
          f"error={tally['error']} skip={tally['skip']}")
    try:
        out_display = args.output.relative_to(REPO_ROOT)
    except ValueError:
        # --output points outside the repository; show it as given.
        out_display = args.output
    print(f"wrote {out_display}")
    return 0
|
|
|
|
|
|
# Script entry point: the process exit status is whatever main() returns.
if __name__ == "__main__":
    sys.exit(main())
|