diff --git a/interviews/staffml/src/app/practice/page.tsx b/interviews/staffml/src/app/practice/page.tsx
index 6ea8409cd..ed7668760 100644
--- a/interviews/staffml/src/app/practice/page.tsx
+++ b/interviews/staffml/src/app/practice/page.tsx
@@ -161,6 +161,7 @@ function PracticePage() {
   const [selectedArea, setSelectedArea] = useState(null);
   const [selectedZone, setSelectedZone] = useState(null);
   const [napkinOnly, setNapkinOnly] = useState(false);
+  const [visualOnly, setVisualOnly] = useState(false);
   // "Chains only" restricts the pool to questions that are part of a
   // deepening chain (L1→L6+ on one topic). 890 chains cover ~30% of
   // the corpus. This toggle is the minimum-viable discoverability
@@ -345,13 +346,14 @@ function PracticePage() {
       skipFilterCount.current--;
       return;
     }
-    const filters: { track?: string; level?: string; competency_area?: string; zone?: string; chainsOnly?: boolean } = {
+    const filters: { track?: string; level?: string; competency_area?: string; zone?: string; chainsOnly?: boolean; visualOnly?: boolean } = {
       track: selectedTrack,
       level: selectedLevel,
     };
     if (selectedArea) filters.competency_area = selectedArea;
     if (selectedZone) filters.zone = selectedZone;
     if (chainsOnly) filters.chainsOnly = true;
+    if (visualOnly) filters.visualOnly = true;
     let q = getQuestionsByFilter(filters);
     if (napkinOnly) q = q.filter(question => !!question.details.napkin_math);
     setPool(q);
@@ -363,7 +365,7 @@ function PracticePage() {
     setShowAnswer(false);
     setUserAnswer("");
     setNapkinResult(null);
-  }, [mounted, selectedTrack, selectedLevel, selectedArea, selectedZone, napkinOnly, chainsOnly]);
+  }, [mounted, selectedTrack, selectedLevel, selectedArea, selectedZone, napkinOnly, chainsOnly, visualOnly]);

   // Keyboard shortcuts: Enter to reveal, 1-4 for scoring, N to skip
   useEffect(() => {
@@ -858,6 +860,17 @@ function PracticePage() {
             Napkin math only
+          {/* Visual-only toggle — useful for diagram/topology/timing practice. */}
+
+
       {/* Chains-only toggle — discoverability affordance for the 890 curated
           chain sequences (L1→L6+ on one topic). Separate from the gated
           `/chains` browse page — this is the minimum
@@ -876,6 +889,7 @@ function PracticePage() {
           {pool.length} questions in pool
           {chainsOnly && · chains}
+          {visualOnly && · visuals}
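The two halves of this change compose as: cheap field predicates live in `getQuestionsByFilter` (see the corpus.ts diff below), while napkin-only stays a page-local post-filter because it inspects `details`. A self-contained TypeScript sketch of that semantics, for reviewers skimming both files together — the `Question` shape is abbreviated to the fields this PR touches, `visual` is typed loosely because the diff does not show its schema, and the `"edge"` track id is made up:

```ts
// Sketch only: restates the pool-building semantics of this PR.
type Question = {
  track: string;
  level: string;
  chain_ids?: string[];
  visual?: unknown; // whatever visual payload the corpus attaches
  details: { napkin_math?: string };
};

type Filters = { track?: string; level?: string; chainsOnly?: boolean; visualOnly?: boolean };

// Mirrors the predicate order in corpus.ts: each set filter must match.
function matches(q: Question, f: Filters): boolean {
  if (f.track && q.track !== f.track) return false;
  if (f.level && q.level !== f.level) return false;
  if (f.chainsOnly && (!q.chain_ids || q.chain_ids.length === 0)) return false;
  if (f.visualOnly && !q.visual) return false; // new in this PR
  return true;
}

// Usage mirroring page.tsx: field predicates first, napkin math as a post-filter.
const questions: Question[] = []; // stand-in for the loaded corpus
const pool = questions.filter((q) => matches(q, { track: "edge", visualOnly: true }));
const napkinPool = pool.filter((q) => !!q.details.napkin_math);
```

Putting `visualOnly` in the shared helper (rather than as another page-local `.filter`, like napkin-only) means any other corpus consumer gets the same predicate for free, and the pool count badge above already reflects it.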
diff --git a/interviews/staffml/src/lib/corpus.ts b/interviews/staffml/src/lib/corpus.ts
index 6e18f3f83..ff61a7758 100644
--- a/interviews/staffml/src/lib/corpus.ts
+++ b/interviews/staffml/src/lib/corpus.ts
@@ -153,6 +153,8 @@ export function getQuestionsByFilter(filters: {
   zone?: string;
   /** When true, restrict results to questions that are part of a chain. */
   chainsOnly?: boolean;
+  /** When true, restrict results to questions with an attached visual. */
+  visualOnly?: boolean;
 }): Question[] {
   return questions.filter((q) => {
     if (filters.track && q.track !== filters.track) return false;
@@ -161,6 +163,7 @@ export function getQuestionsByFilter(filters: {
     if (filters.topic && q.topic !== filters.topic) return false;
     if (filters.zone && q.zone !== filters.zone) return false;
     if (filters.chainsOnly && (!q.chain_ids || q.chain_ids.length === 0)) return false;
+    if (filters.visualOnly && !q.visual) return false;
     return true;
   });
 }
diff --git a/interviews/vault/scripts/gemini_cli_math_review.py b/interviews/vault/scripts/gemini_cli_math_review.py
new file mode 100644
index 000000000..5b7c6c9e2
--- /dev/null
+++ b/interviews/vault/scripts/gemini_cli_math_review.py
@@ -0,0 +1,312 @@
+#!/usr/bin/env python3
+"""Chunked Gemini CLI math review for StaffML YAML questions.
+
+This runner is deliberately review-first: it batches many questions into each
+Gemini CLI call, asks for strict JSON, and writes reports/correction proposals.
+It does not mutate YAML. Human or maintainer review should apply accepted fixes.
+
+Example:
+
+    python3 interviews/vault/scripts/gemini_cli_math_review.py \
+        --model gemini-3.1-pro-preview --unverified-only --chunk-size 35 --max-calls 250
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import subprocess
+import time
+from collections import Counter, defaultdict
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+VAULT_DIR = Path(__file__).resolve().parent.parent
+ROOT_DIR = VAULT_DIR.parents[1]
+QUESTIONS_DIR = VAULT_DIR / "questions"
+DEFAULT_MODEL = "gemini-3.1-pro-preview"
+DEFAULT_OUTPUT_DIR = VAULT_DIR / "_validation_results" / "gemini_math_review"
+
+HARDWARE_REFERENCE = """
+Reference constants to use unless the question explicitly states otherwise:
+- H100 SXM: 80 GB HBM3, 3.35 TB/s HBM bandwidth, 989 TFLOP/s FP16 tensor, 700 W.
+- A100 80GB SXM: ~2.0 TB/s HBM2e bandwidth, 312 TFLOP/s FP16 tensor, 400 W.
+- MI300X: 192 GB HBM3, 5.3 TB/s bandwidth, ~1307 TFLOP/s FP16 dense peak.
+- Jetson AGX Orin: up to 275 TOPS INT8, ~204.8 GB/s LPDDR5, 15-60 W modes.
+- Hailo-8: 26 TOPS INT8, ~2.5 W accelerator power.
+- Apple A17 Pro Neural Engine: roughly 35 TOPS.
+- Snapdragon 8 Gen 3 Hexagon NPU: roughly 45 TOPS.
+- Cortex-M4 examples commonly use 80-240 MHz, KB-scale SRAM.
+- 1 byte = 8 bits. 1 GB/s = 1000 MB/s for napkin math unless the question says GiB.
+- FP16/BF16 weights: 2 bytes/parameter. INT8: 1 byte/parameter. INT4: 0.5 byte/parameter before metadata.
+- KV cache bytes: 2 x layers x KV heads x head dim x sequence length x batch x bytes/element.
+- Ring AllReduce lower bound: time = 2(N-1)/N x payload bytes / bandwidth.
+- Energy: Wh = W x hours; cost = kWh x price per kWh.
+""".strip()
+
+PROMPT_TEMPLATE = """You are an expert ML systems math reviewer.
+
+Review the StaffML questions below for arithmetic, unit conversions, hardware
+specs, and whether the solution's conclusion follows from its math.
+
+{hardware_reference}
+
+Return STRICT JSON only with this schema:
+[
+  {{
+    "id": "question-id",
+    "status": "CORRECT" | "WARN" | "ERROR",
+    "issues": ["short issue text"],
+    "corrections": ["specific corrected value or wording"],
+    "confidence": "low" | "medium" | "high"
+  }}
+]
+
+Rules:
+- Use ERROR for wrong arithmetic, wrong units, wrong hardware specs, or a conclusion contradicted by math.
+- Use WARN for plausible but imprecise assumptions, missing caveats, or ambiguous unit conventions.
+- Use CORRECT when math is sound enough for interview/practice use.
+- Do not complain merely because napkin math is approximate.
+- Check every question; do not skip.
+
+Questions:
+{questions_json}
+"""
+
+
+def load_questions(args: argparse.Namespace) -> list[dict[str, Any]]:
+    paths: list[Path]
+    if args.files_from:
+        paths = [
+            ROOT_DIR / line.strip()
+            for line in Path(args.files_from).read_text(encoding="utf-8").splitlines()
+            if line.strip()
+        ]
+    else:
+        paths = sorted(QUESTIONS_DIR.glob("*/*.yaml"))
+
+    questions: list[dict[str, Any]] = []
+    for path in paths:
+        data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
+        details = data.get("details") or {}
+        if args.published_only and data.get("status") != "published":
+            continue
+        if args.draft_only and data.get("status") != "draft":
+            continue
+        if args.unverified_only and data.get("math_verified") is True:
+            continue
+        if args.require_napkin and not (details.get("napkin_math") or "").strip():
+            continue
+        questions.append(
+            {
+                "path": str(path.relative_to(ROOT_DIR)),
+                "id": data.get("id", path.stem),
+                "track": data.get("track"),
+                "level": data.get("level"),
+                "zone": data.get("zone"),
+                "topic": data.get("topic"),
+                "title": data.get("title"),
+                "scenario": data.get("scenario"),
+                "question": data.get("question"),
+                "realistic_solution": details.get("realistic_solution"),
+                "napkin_math": details.get("napkin_math"),
+                "common_mistake": details.get("common_mistake"),
+                "options": details.get("options"),
+                "correct_index": details.get("correct_index"),
+            }
+        )
+    return questions[: args.limit] if args.limit else questions
+
+
+def chunk_questions(questions: list[dict[str, Any]], chunk_size: int) -> list[list[dict[str, Any]]]:
+    # Group by (track, topic) so each chunk shares context, then pack greedily.
+    grouped: dict[tuple[str, str], list[dict[str, Any]]] = defaultdict(list)
+    for question in questions:
+        grouped[(question.get("track") or "", question.get("topic") or "")].append(question)
+
+    chunks: list[list[dict[str, Any]]] = []
+    current: list[dict[str, Any]] = []
+    for _, group in sorted(grouped.items()):
+        for question in group:
+            if len(current) >= chunk_size:
+                chunks.append(current)
+                current = []
+            current.append(question)
+    if current:
+        chunks.append(current)
+    return chunks
+
+
+def slim(question: dict[str, Any]) -> dict[str, Any]:
+    # Cap long free-text fields so batched prompts stay a manageable size.
+    def cap(value: Any, n: int) -> Any:
+        if not isinstance(value, str):
+            return value
+        return value if len(value) <= n else value[: n - 1] + "…"
+
+    return {
+        "id": question["id"],
+        "track": question["track"],
+        "level": question["level"],
+        "topic": question["topic"],
+        "title": cap(question["title"], 160),
+        "scenario": cap(question["scenario"], 1200),
+        "question": cap(question["question"], 300),
+        "realistic_solution": cap(question["realistic_solution"], 1200),
+        "napkin_math": cap(question["napkin_math"], 1000),
+        "common_mistake": cap(question["common_mistake"], 500),
+        "options": question.get("options"),
+        "correct_index": question.get("correct_index"),
+    }
+
+
+def build_prompt(chunk: list[dict[str, Any]]) -> str:
+    return PROMPT_TEMPLATE.format(
+        hardware_reference=HARDWARE_REFERENCE,
+        questions_json=json.dumps([slim(q) for q in chunk], ensure_ascii=False, indent=2),
+    )
+
+
+def parse_json_response(text: str) -> list[dict[str, Any]]:
+    # Tolerate a Markdown code fence around the JSON array, nothing else.
+    text = text.strip()
+    if text.startswith("```"):
+        text = re.sub(r"^```(?:json)?\s*", "", text)
+        text = re.sub(r"\s*```$", "", text)
+    return json.loads(text)
+
+
+def call_gemini(prompt: str, model: str, timeout: int) -> tuple[list[dict[str, Any]] | None, str | None]:
+    try:
+        proc = subprocess.run(
+            ["gemini", "-m", model, "-o", "text"],
+            input=prompt,
+            text=True,
+            capture_output=True,
+            timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        return None, "timeout"
+    if proc.returncode != 0:
+        return None, proc.stderr.strip() or proc.stdout.strip() or f"exit {proc.returncode}"
+    try:
+        return parse_json_response(proc.stdout), None
+    except Exception as exc:  # noqa: BLE001 - preserve raw response
+        return None, f"json_parse_error: {exc}; raw={proc.stdout[:500]}"
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default=DEFAULT_MODEL)
+    parser.add_argument(
+        "--allow-model-override",
+        action="store_true",
+        help=(
+            "Allow a model other than gemini-3.1-pro-preview. By default this "
+            "runner enforces the Pro preview model used for release math review."
+        ),
+    )
+    parser.add_argument("--chunk-size", type=int, default=35)
+    parser.add_argument("--max-calls", type=int, default=250)
+    parser.add_argument("--limit", type=int, default=0)
+    parser.add_argument("--timeout", type=int, default=600)
+    parser.add_argument("--sleep", type=float, default=1.0)
+    parser.add_argument("--output-dir", default=str(DEFAULT_OUTPUT_DIR))
+    parser.add_argument("--files-from")
+    parser.add_argument("--published-only", action="store_true")
+    parser.add_argument("--draft-only", action="store_true")
+    parser.add_argument("--unverified-only", action="store_true")
+    parser.add_argument(
+        "--require-napkin",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Only review questions with napkin_math; disable with --no-require-napkin.",
+    )
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    if args.model != DEFAULT_MODEL and not args.allow_model_override:
+        parser.error(
+            f"--model must be {DEFAULT_MODEL!r} for release math review "
+            "(pass --allow-model-override for experiments only)"
+        )
+
+    questions = load_questions(args)
+    total_candidates = len(questions)
+    chunks = chunk_questions(questions, args.chunk_size)
+    chunks_before_cap = len(chunks)
+    if len(chunks) > args.max_calls:
+        chunks = chunks[: args.max_calls]
+    reviewed_question_count = sum(len(chunk) for chunk in chunks)
+
+    output_dir = Path(args.output_dir) / datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir.mkdir(parents=True, exist_ok=True)
+    manifest = {
+        "model": args.model,
+        "candidate_questions": total_candidates,
+        "questions_planned_for_review": reviewed_question_count,
+        "questions_deferred_by_call_cap": max(0, total_candidates - reviewed_question_count),
+        "chunks_planned": len(chunks),
+        "chunks_before_call_cap": chunks_before_cap,
+        "chunk_size": args.chunk_size,
+        "max_calls": args.max_calls,
+        "dry_run": args.dry_run,
+        "grouping": "track/topic",
+    }
+    (output_dir / "manifest.json").write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf-8")
+
+    print(json.dumps(manifest, indent=2))
+    if args.dry_run:
+        for idx, chunk in enumerate(chunks):
+            prompt = build_prompt(chunk)
+            (output_dir / f"prompt_{idx:03d}.txt").write_text(prompt, encoding="utf-8")
+        print(f"Dry run prompts written to {output_dir}")
+        return 0
+
+    all_results: list[dict[str, Any]] = []
+    failures: list[dict[str, Any]] = []
+    for idx, chunk in enumerate(chunks):
+        prompt = build_prompt(chunk)
+        (output_dir / f"prompt_{idx:03d}.txt").write_text(prompt, encoding="utf-8")
+        print(f"[{idx + 1}/{len(chunks)}] reviewing {len(chunk)} questions...")
+        results, error = call_gemini(prompt, args.model, args.timeout)
+        if error:
+            failures.append({"chunk": idx, "error": error, "ids": [q["id"] for q in chunk]})
+            (output_dir / f"error_{idx:03d}.txt").write_text(error, encoding="utf-8")
+            print(f"  ERROR {error[:160]}")
+        else:
+            assert results is not None
+            all_results.extend(results)
+            (output_dir / f"result_{idx:03d}.json").write_text(
+                json.dumps(results, indent=2, ensure_ascii=False) + "\n",
+                encoding="utf-8",
+            )
+            counts = defaultdict(int)
+            for result in results:
+                counts[result.get("status", "UNKNOWN")] += 1
+            print("  " + ", ".join(f"{k}={v}" for k, v in sorted(counts.items())))
+        time.sleep(args.sleep)
+
+    (output_dir / "all_results.json").write_text(
+        json.dumps(all_results, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    (output_dir / "failures.json").write_text(
+        json.dumps(failures, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    summary = Counter(result.get("status", "UNKNOWN") for result in all_results)
+    summary_payload = {
+        "model": args.model,
+        "reviewed": len(all_results),
+        "status_counts": dict(summary),
+        "failed_chunks": len(failures),
+        "failed_question_ids": [qid for failure in failures for qid in failure.get("ids", [])],
+    }
+    (output_dir / "summary.json").write_text(
+        json.dumps(summary_payload, indent=2, ensure_ascii=False) + "\n",
+        encoding="utf-8",
+    )
+    print("Summary:", summary_payload, "output:", output_dir)
+    return 1 if failures else 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/interviews/vault/scripts/plan_gap_improvements.py b/interviews/vault/scripts/plan_gap_improvements.py
index 32823b96e..b1c221a80 100644
--- a/interviews/vault/scripts/plan_gap_improvements.py
+++ b/interviews/vault/scripts/plan_gap_improvements.py
@@ -245,6 +245,8 @@ def build_coverage(questions: list[Question], chains: list[dict[str, Any]]) -> d
         "track_level": counter_to_dict(count_by(questions, "track", "level")),
         "track_zone": counter_to_dict(count_by(questions, "track", "zone")),
         "track_phase": counter_to_dict(count_by(questions, "track", "phase")),
+        "track_area_level": counter_to_dict(count_by(questions, "track", "competency_area", "level")),
+        "track_area_zone": counter_to_dict(count_by(questions, "track", "competency_area", "zone")),
         "track_area": {f"{k[0]}:{k[1]}": v for k, v in sorted(track_area.items())},
         "topic_track": topic_track,
         "topic_track_zone": {
@@ -729,6 +731,14 @@ def write_markdown_reports(
     phase_counts = coverage["phase_counts"]
     thin_pairs = [r for r in coverage["topic_track"] if r["classification"] == "thin"]
     suspect_pairs = [r for r in coverage["topic_track"] if r["classification"] == "suspect"]
+    low_track_area_level = [
+        {"cell": k, "count": v}
+        for k, v in sorted(coverage["track_area_level"].items(), key=lambda item: item[1])[:80]
+    ]
+    low_track_area_zone = [
+        {"cell": k, "count": v}
+        for k, v in sorted(coverage["track_area_zone"].items(), key=lambda item: item[1])[:80]
+    ]

     (OUTPUT_DIR / "coverage_report.md").write_text(
         "\n".join(
@@ -760,6 +770,14 @@ def write_markdown_reports(
             "",
             table_rows(thin_pairs[:80], ["topic", "track", "competency_area", "count", "classification"]),
             "",
+            "## Lowest Track-Area-Level Cells",
+            "",
+            table_rows(low_track_area_level, ["cell", "count"]),
+            "",
+            "## Lowest Track-Area-Zone Cells",
+            "",
+            table_rows(low_track_area_zone, ["cell", "count"]),
+            "",
             "## Suspect Topic-Track Pairs",
             "",
             table_rows(suspect_pairs[:80], ["topic", "track", "competency_area", "count", "classification"]),