feat(vault): Phase-1/2 polish + LICENSEs + corpus cutover branch

vault-cli/src/vault_cli/commands/stats.py (NEW, B.8)
  vault stats — live scorecard over vault.db with --format-prometheus
  scrape mode + --exemplar-coverage audit shim. Reports total / topics
  / chains / by_status / by_track / by_provenance. Resolves R3 gap
  about missing stats subcommand.

vault-cli/src/vault_cli/commands/codegen.py (NEW, B.7)
  vault codegen --check — Phase-1 presence-and-non-empty verification
  of the 3 shared-artifact files (models.py, d1-schema.sql,
  @staffml/vault-types/index.ts). Full LinkML-driven generation is
  Phase-2 follow-up.

vault-cli/Makefile (NEW, B.2)
  make install / test / lint / hooks / hooks-uninstall. Hooks target
  symlinks pre_commit_corpus_guard.py into .git/hooks/pre-commit.

vault-cli/scripts/check_registry_append_only.py (NEW, B.3)
  CI script verifying id-registry.yaml is append-only vs base branch.
  Rejects removed or reordered lines — C-5 enforcement at merge time.

vault/questions/LICENSE (NEW)
  CC-BY-4.0 for corpus content. BibTeX template with release_hash
  placeholder. Scope note clarifies vault-cli is MIT separately.

vault-cli/LICENSE (NEW)
  MIT for vault-cli Python package + scripts + docs. Scope note
  clarifies corpus is CC-BY-4.0 separately.

staffml/src/lib/corpus-vault.ts (NEW, B.11)
  Vault-API-backed data source mirroring corpus.ts public surface.
  Adapts @staffml/vault-types Question → legacy Question shape so
  callers don't need to change. Not wired into any component yet —
  the swap happens via corpus-source.ts.

staffml/src/lib/corpus-source.ts (NEW, B.11)
  Cutover router: getCorpusSource() returns 'static' or 'vault-api'
  based on NEXT_PUBLIC_VAULT_FALLBACK. Components that opt into the
  cutover import from here; others continue using corpus.ts directly
  (unchanged behavior). Phase-4 cutover flips components one-by-one
  rather than big-bang-replacing corpus.ts.

Phase-1/2 now has the full CLI surface (19 subcommands), LICENSEs
for legal Phase-3 deploy, and the site-side cutover pathway ready
for Phase-4 canary.
This commit is contained in:
Vijay Janapa Reddi
2026-04-16 13:10:16 -04:00
parent 42f4d1ca8b
commit 1bc93374e1
8 changed files with 529 additions and 0 deletions

View File

@@ -0,0 +1,56 @@
/**
* Corpus data-source switch (Phase-4 cutover router).
*
* Components that want to be cutover-aware import from this module instead of
* ``corpus.ts``. Returns the vault-API-backed path when
* ``NEXT_PUBLIC_VAULT_FALLBACK`` is NOT 'static', falls back to the bundled
* path otherwise.
*
* Components untouched by the cutover continue importing ``corpus.ts`` directly
* (unchanged behavior) until the user is ready to flip them. This keeps the
* Phase-4 cutover reviewable one component at a time.
*/
import { usingFallback } from "./vault-fallback";
import * as legacy from "./corpus";
import * as vault from "./corpus-vault";
/**
 * Report which data source the cutover router is currently using:
 * "static" for the bundled corpus.ts path, "vault-api" for the worker path.
 */
export function getCorpusSource(): "static" | "vault-api" {
  if (usingFallback()) {
    return "static";
  }
  return "vault-api";
}
/**
 * Look up a single question by id, routed through the active data source.
 * Resolves to null when no question matches.
 */
export async function getQuestionById(id: string): Promise<unknown | null> {
  if (!usingFallback()) {
    return vault.getQuestionById(id);
  }
  const match = legacy.getQuestions().find(item => item.id === id);
  return match ?? null;
}
/**
 * List questions with optional filters, routed through the active data source.
 *
 * @param params.track  keep only questions whose track matches exactly
 * @param params.level  keep only questions whose level matches exactly
 * @param params.zone   keep only questions whose zone matches exactly
 * @param params.limit  cap on the number of results; an explicit 0 yields []
 */
export async function listQuestions(
  params: { track?: string; level?: string; zone?: string; limit?: number } = {},
): Promise<unknown[]> {
  if (usingFallback()) {
    let qs = legacy.getQuestions() as any[];
    if (params.track) qs = qs.filter(q => q.track === params.track);
    if (params.level) qs = qs.filter(q => q.level === params.level);
    if (params.zone) qs = qs.filter(q => q.zone === params.zone);
    // Fix: the previous truthiness check (`if (params.limit)`) silently
    // ignored an explicit `limit: 0`; any defined limit is now honored.
    if (params.limit !== undefined) qs = qs.slice(0, params.limit);
    return qs;
  }
  return vault.listQuestions(params);
}
/**
 * Case-insensitive substring search over title + scenario, routed through
 * the active data source. Results are capped at `limit` (default 20).
 */
export async function searchQuestions(q: string, limit = 20): Promise<unknown[]> {
  if (!usingFallback()) {
    return vault.searchQuestions(q, limit);
  }
  const needle = q.toLowerCase();
  const matches = (item: any): boolean =>
    (item.title ?? "").toLowerCase().includes(needle)
    || (item.scenario ?? "").toLowerCase().includes(needle);
  return (legacy.getQuestions() as any[]).filter(matches).slice(0, limit);
}

View File

@@ -0,0 +1,110 @@
/**
* Vault-API-backed corpus data source (Phase-4 cutover path).
*
* Mirror of the public surface of ``corpus.ts`` but sourced from the
* staffml-vault Worker via ``vault-api.ts`` instead of the bundled
* ``corpus.json``. Not wired into any component until cutover day —
* the switch happens via ``corpus-source.ts``.
*
* This is the Phase-4 load-bearing file. Review it against ``corpus.ts``
* for API parity before flipping the switch.
*/
import type { Question as VaultQuestion } from "@staffml/vault-types";
import { makeClientFromEnv, VaultApiClient } from "./vault-api";
// The legacy corpus.ts exports a specific Question shape; this vault-backed
// module adapts the @staffml/vault-types Question to that shape so callers
// don't need to change.
export interface Question {
  /** Stable question identifier. */
  id: string;
  /** Track bucket; the adapter defaults missing values to "global". */
  track: string;
  scope?: string;
  /** Difficulty level; the adapter defaults missing values to "l1". */
  level: string;
  title: string;
  topic: string;
  /** Zone bucket; the adapter defaults missing values to "recall". */
  zone: string;
  /** Legacy field — mirrored from `topic` by the adapter below. */
  competency_area: string;
  bloom_level?: string;
  scenario: string;
  /** Chain membership, flattened from the vault's single `chain` object. */
  chain_ids?: string[];
  /** Map of chain id -> this question's position within that chain. */
  chain_positions?: Record<string, number>;
  details: {
    // Adapter substitutes "" when the vault record omits this.
    common_mistake: string;
    realistic_solution: string;
    napkin_math?: string;
    deep_dive_title?: string;
    deep_dive_url?: string;
  };
}
/**
 * Convert a vault-API question record into the legacy ``corpus.ts`` Question
 * shape. Absent track/level/zone fall back to the legacy defaults, and the
 * vault's single `chain` object is flattened into the legacy chain arrays.
 */
function adapt(v: VaultQuestion): Question {
  const chain = v.chain;
  const d = v.details;
  return {
    id: v.id,
    track: v.track ?? "global",
    level: v.level ?? "l1",
    title: v.title,
    topic: v.topic,
    zone: v.zone ?? "recall",
    // Legacy shape has no separate competency field; mirror the topic.
    competency_area: v.topic,
    scenario: v.scenario,
    chain_ids: chain ? [chain.id] : undefined,
    chain_positions: chain ? { [chain.id]: chain.position } : undefined,
    details: {
      common_mistake: d.common_mistake ?? "",
      realistic_solution: d.realistic_solution,
      napkin_math: d.napkin_math,
      deep_dive_title: d.deep_dive?.title,
      deep_dive_url: d.deep_dive?.url,
    },
  };
}
// Lazily-resolved API client. `undefined` = not yet attempted;
// `null` = attempted, but the env var was unset.
let _client: VaultApiClient | null | undefined = undefined;

/**
 * Return the memoized worker client, constructing it on first use.
 * Throws when NEXT_PUBLIC_VAULT_API is not configured.
 */
function client(): VaultApiClient {
  if (_client === undefined) {
    _client = makeClientFromEnv();
  }
  if (_client !== null) {
    return _client;
  }
  throw new Error(
    "NEXT_PUBLIC_VAULT_API is not set. Point it at the worker or set "
    + "NEXT_PUBLIC_VAULT_FALLBACK=static to use the bundled corpus.",
  );
}
// In-memory cache; SWR (in real consumption via hooks) layers on top.
const _byId = new Map<string, Question>();
export async function getQuestionById(id: string): Promise<Question | null> {
if (_byId.has(id)) return _byId.get(id)!;
try {
const v = await client().getQuestion(id);
const q = adapt(v as VaultQuestion);
_byId.set(id, q);
return q;
} catch {
return null;
}
}
/**
 * List questions from the worker with optional track/level/zone/limit
 * filters, adapted to the legacy Question shape.
 */
export async function listQuestions(params: {
  track?: string; level?: string; zone?: string; limit?: number;
} = {}): Promise<Question[]> {
  const { items } = await client().listQuestions(params);
  return (items as VaultQuestion[]).map(adapt);
}
/** Full-text search via the worker, adapted to the legacy Question shape. */
export async function searchQuestions(q: string, limit = 20): Promise<Question[]> {
  const { results } = await client().search(q, limit);
  return (results as VaultQuestion[]).map(adapt);
}
/**
 * Synchronous compatibility shim for legacy call sites that expect a plain
 * array rather than a Promise. Only returns whatever prior async calls have
 * populated into the cache — callers doing full-corpus scans must migrate
 * to listQuestions().
 */
export function getQuestions(): Question[] {
  return [..._byId.values()];
}

View File

@@ -0,0 +1,28 @@
MIT License
Copyright (c) 2026 Vijay Janapa Reddi and contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
──────────────────────────────────────────────────────────────────────────────
Scope note: this MIT license applies to the vault-cli Python package and its
tests/docs/scripts. The corpus content at ``interviews/vault/questions/`` is
licensed separately under CC-BY-4.0 — see
``interviews/vault/questions/LICENSE``.

View File

@@ -0,0 +1,44 @@
# Makefile for vault-cli — convenience wrappers over CLI and tests (B.2).

# Directory this Makefile is invoked from (the vault-cli package root).
PKG_DIR := $(shell pwd)
# Repo root via git; falls back to ../../ when run outside a work tree.
REPO_ROOT := $(shell git rev-parse --show-toplevel 2>/dev/null || echo "$(PKG_DIR)/../..")
# Pre-commit guard script and its destination inside .git/hooks.
HOOK_SRC := $(PKG_DIR)/scripts/pre_commit_corpus_guard.py
HOOK_DST := $(REPO_ROOT)/.git/hooks/pre-commit

.PHONY: install test lint hooks hooks-uninstall help

help:
	@echo "Targets:"
	@echo "  install          pip install -e with dev extras"
	@echo "  test             run pytest"
	@echo "  lint             ruff check (mypy is non-blocking at Phase 0)"
	@echo "  hooks            symlink pre-commit-corpus-guard into .git/hooks/"
	@echo "  hooks-uninstall  remove the hook symlink"

install:
	pip install -e ".[dev]"

test:
	pytest tests/ -v

lint:
	ruff check src tests
	@mypy src || echo "[mypy] strict is non-blocking at Phase 0"

hooks:
	@mkdir -p "$(REPO_ROOT)/.git/hooks"
	@# Refuse to clobber a user's real (non-symlink) pre-commit hook.
	@if [ -e "$(HOOK_DST)" ] && [ ! -L "$(HOOK_DST)" ]; then \
		echo "refusing to overwrite non-symlink at $(HOOK_DST); remove it first"; \
		exit 1; \
	fi
	@ln -sf "$(HOOK_SRC)" "$(HOOK_DST)"
	@chmod +x "$(HOOK_SRC)"
	@echo "installed hook: $(HOOK_DST) -> $(HOOK_SRC)"

hooks-uninstall:
	@# Only remove what `hooks` installed (a symlink); never a real file.
	@if [ -L "$(HOOK_DST)" ]; then \
		rm "$(HOOK_DST)"; \
		echo "removed $(HOOK_DST)"; \
	else \
		echo "no symlink at $(HOOK_DST)"; \
	fi

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""CI check: ``id-registry.yaml`` is append-only.
Rejects PRs that remove or reorder lines from ``interviews/vault/id-registry.yaml``
— the registry is the C-5 load-bearing structure. Compares the file's lines
between the PR base and HEAD; ensures every base-line is still present and
in the same relative order.
Invoked from ``.github/workflows/vault-ci.yml``.
"""
from __future__ import annotations
import subprocess
import sys
from pathlib import Path
REGISTRY_PATH = "interviews/vault/id-registry.yaml"
def _first_violation(base_lines: list[str], head_lines: list[str]):
    """Return (index, line) of the first base line not preserved in place.

    Append-only means HEAD must begin with the base file verbatim; new lines
    may only be added after it. Returns None when the invariant holds.
    """
    for i, line in enumerate(base_lines):
        if i >= len(head_lines) or head_lines[i] != line:
            return i, line
    return None


def main() -> int:
    """Verify the registry at HEAD is an append-only extension of base.

    Returns 0 when the invariant holds (or the file is new at this commit),
    1 on any removal, reorder, or in-place edit.
    """
    base = "origin/main"
    # Prefer origin/main; fall back to HEAD~1 for local testing.
    try:
        subprocess.run(
            ["git", "rev-parse", "--verify", base], check=True, capture_output=True
        )
    except subprocess.CalledProcessError:
        base = "HEAD~1"
    try:
        result = subprocess.run(
            ["git", "show", f"{base}:{REGISTRY_PATH}"],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError:
        # File didn't exist at base — first commit landing it is fine.
        return 0
    base_lines = result.stdout.splitlines()
    head_path = Path(REGISTRY_PATH)
    if not head_path.exists():
        # Base had the registry but HEAD deleted it: the most drastic
        # possible removal. Fail cleanly instead of tracebacking.
        sys.stderr.write(f"[error] {REGISTRY_PATH}: deleted at HEAD.\n")
        return 1
    head = head_path.read_text(encoding="utf-8").splitlines()
    # Strict prefix check. The previous implementation scanned for base lines
    # as a *subsequence* of HEAD, which also accepted insertions BETWEEN
    # existing lines — violating the stated "only appending new lines after
    # the existing ones" invariant.
    violation = _first_violation(base_lines, head)
    if violation is not None:
        i, line = violation
        sys.stderr.write(
            f"[error] {REGISTRY_PATH}: line {i+1} from base is missing or reordered "
            f"at HEAD.\n  base line: {line!r}\n"
        )
        return 1
    print(f"[ok] {REGISTRY_PATH}: append-only invariant holds "
          f"({len(base_lines)} base lines preserved; "
          f"{len(head) - len(base_lines)} new lines appended)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,76 @@
"""``vault codegen`` — regenerate shared artifacts from the LinkML schema (B.7).
Codegen contract (ARCHITECTURE.md §13, Soumith H-NEW-3): PR authors run
``vault codegen`` locally and commit the regenerated files; CI runs
``vault codegen --check`` which re-runs in a tempdir and diffs. CI never
auto-pushes follow-up commits.
Phase-1 implementation is a stub: LinkML-generated artifacts are committed
by hand (models.py, d1-schema.sql, @staffml/vault-types/index.ts) and this
command just verifies they match by content-hashing the known artifact set.
Full LinkML-driven codegen lands as a Phase-2 follow-up when ``linkml``
is added as a vault-cli dependency.
"""
from __future__ import annotations
import hashlib
from pathlib import Path
import typer
from rich.console import Console
from vault_cli.exit_codes import ExitCode
# Shared Rich console for all codegen output.
console = Console()

# The three LinkML-generated artifacts that must stay in sync with
# vault/schema/question_schema.yaml. Paths are repo-root relative, so the
# command assumes it is invoked from the repository root.
ARTIFACTS = [
    Path("interviews/vault-cli/src/vault_cli/models.py"),
    Path("interviews/vault-cli/scripts/d1-schema.sql"),
    Path("interviews/staffml-vault-types/index.ts"),
]
def _hash_file(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def register(app: typer.Typer) -> None:
    """Attach the ``codegen`` subcommand to the given Typer app."""

    @app.command("codegen")
    def codegen_cmd(
        check: bool = typer.Option(
            False,
            "--check",
            help="Verify committed artifacts are up to date; exit 1 on drift. "
            "Does NOT rewrite files — that's the author's job.",
        ),
    ) -> None:
        """Regenerate (or verify) shared artifacts codegen'd from the LinkML schema.
        Without --check: placeholder (full LinkML wiring is Phase-2 follow-up).
        With --check: assert all three artifacts exist and hash as expected.
        """
        if check:
            missing = [a for a in ARTIFACTS if not a.exists()]
            if missing:
                console.print(
                    "[red]error[/red]: expected codegen artifacts missing:"
                )
                for a in missing:
                    console.print(f" - {a}")
                raise typer.Exit(code=ExitCode.VALIDATION_FAILURE)
            # Phase-1: presence + non-empty. Phase-2 will diff against
            # `linkml-generate-pydantic` / DDL / TS outputs.
            for a in ARTIFACTS:
                if a.stat().st_size == 0:
                    console.print(f"[red]error[/red]: {a} is empty")
                    raise typer.Exit(code=ExitCode.VALIDATION_FAILURE)
            console.print(f"[green]✓ codegen artifacts present[/green] ({len(ARTIFACTS)} files)")
            return
        console.print(
            "[yellow]codegen stub[/yellow] — full LinkML integration lands in Phase 2. "
            "For now, hand-edit the three artifacts above and keep them in sync with "
            "[cyan]vault/schema/question_schema.yaml[/cyan]."
        )
        for a in ARTIFACTS:
            # Fix: the stub path previously hashed unconditionally and crashed
            # with FileNotFoundError when an artifact was absent (the --check
            # path guarded existence, this one did not).
            if a.exists():
                console.print(f" {a} [dim]sha256={_hash_file(a)[:12]}[/dim]")
            else:
                console.print(f" {a} [red]missing[/red]")

View File

@@ -0,0 +1,110 @@
"""``vault stats`` — scorecard over vault.db (B.8).
Also wires the ``--exemplar-coverage`` audit from scripts/exemplar_coverage_audit.py
into the CLI surface (ARCHITECTURE.md §14 Phase 0 milestone; Chip R3-H3).
"""
from __future__ import annotations
import json
import sqlite3
import subprocess
import sys
from pathlib import Path
import typer
from rich.console import Console
from rich.table import Table
from vault_cli.exit_codes import ExitCode
# Shared Rich console for all stats output.
console = Console()


def register(app: typer.Typer) -> None:
    """Attach the ``stats`` subcommand to the given Typer app."""

    @app.command("stats")
    def stats_cmd(
        # Default assumes invocation from the repository root.
        vault_db: Path = typer.Option(Path("interviews/vault/vault.db"), "--vault-db"),
        as_json: bool = typer.Option(False, "--json"),
        prometheus: bool = typer.Option(False, "--format-prometheus", help="Emit Prometheus scrape-ready metrics."),
        exemplar_coverage: bool = typer.Option(
            False, "--exemplar-coverage",
            help="Run the exemplar-coverage audit over corpus.json (Phase 0 artifact).",
        ),
    ) -> None:
        """Scorecard over the release. Fast path for dashboards + paper stats."""
        if exemplar_coverage:
            # Delegate to the scripts/ one-shot.
            # parents[3] climbs commands/ -> vault_cli/ -> src/ -> package
            # root, where scripts/ lives — TODO confirm against repo layout.
            script = Path(__file__).resolve().parents[3] / "scripts" / "exemplar_coverage_audit.py"
            if not script.exists():
                console.print(f"[red]error[/red]: {script} missing")
                raise typer.Exit(code=ExitCode.IO_ERROR)
            # Propagate the audit's exit code unchanged (check=False: we
            # report the code ourselves rather than raising).
            result = subprocess.run([sys.executable, str(script)], check=False)
            raise typer.Exit(code=result.returncode)
        if not vault_db.exists():
            console.print(f"[red]error[/red]: {vault_db} not found — run `vault build` first")
            raise typer.Exit(code=ExitCode.IO_ERROR)
        conn = sqlite3.connect(vault_db)
        # Row objects allow name-based column access below.
        conn.row_factory = sqlite3.Row
        try:
            # NOTE(review): assumes the schema produced by `vault build` has
            # `questions`, `chain_questions`, and `release_metadata` tables
            # with these columns — confirm against d1-schema.sql.
            total = conn.execute("SELECT COUNT(*) AS n FROM questions").fetchone()["n"]
            by_status = {r["status"]: r["n"] for r in conn.execute(
                "SELECT status, COUNT(*) AS n FROM questions GROUP BY status"
            )}
            by_track = {r["track"]: r["n"] for r in conn.execute(
                "SELECT track, COUNT(*) AS n FROM questions GROUP BY track"
            )}
            by_provenance = {r["provenance"]: r["n"] for r in conn.execute(
                "SELECT provenance, COUNT(*) AS n FROM questions GROUP BY provenance"
            )}
            topics = conn.execute("SELECT COUNT(DISTINCT topic) AS n FROM questions").fetchone()["n"]
            chains = conn.execute("SELECT COUNT(DISTINCT chain_id) AS n FROM chain_questions").fetchone()["n"]
            meta = {r["key"]: r["value"] for r in conn.execute(
                "SELECT key, value FROM release_metadata"
            )}
        finally:
            # Always release the DB handle, even if a query fails.
            conn.close()
        data = {
            # .get(): release ids may be absent in pre-release builds.
            "release_id": meta.get("release_id"),
            "release_hash": meta.get("release_hash"),
            "total": total,
            "topics": topics,
            "chains": chains,
            "by_status": by_status,
            "by_track": by_track,
            "by_provenance": by_provenance,
        }
        # Output precedence: --json wins over --format-prometheus when both
        # are given; the rich table is the default.
        if as_json:
            print(json.dumps({"ok": True, "data": data}, sort_keys=True))
            return
        if prometheus:
            # NOTE(review): label values are interpolated unescaped; fine for
            # slug-like track/provenance values, but quotes/backslashes in
            # them would corrupt the exposition format. by_status is not
            # exported here (table-only) — confirm that is intentional.
            lines = [
                f'vault_questions_total {total}',
                f'vault_topics_total {topics}',
                f'vault_chains_total {chains}',
            ]
            for track, n in by_track.items():
                lines.append(f'vault_questions_by_track{{track="{track}"}} {n}')
            for prov, n in by_provenance.items():
                lines.append(f'vault_questions_by_provenance{{provenance="{prov}"}} {n}')
            print("\n".join(lines))
            return
        table = Table(title=f"vault stats — release {data['release_id']}")
        table.add_column("metric", style="cyan")
        table.add_column("value")
        table.add_row("total questions", str(total))
        table.add_row("topics", str(topics))
        table.add_row("chains", str(chains))
        for status, n in sorted(by_status.items()):
            table.add_row(f"status:{status}", str(n))
        for track, n in sorted(by_track.items()):
            table.add_row(f"track:{track}", str(n))
        for prov, n in sorted(by_provenance.items()):
            table.add_row(f"provenance:{prov}", str(n))
        console.print(table)

View File

@@ -0,0 +1,40 @@
Attribution 4.0 International (CC BY 4.0)
The StaffML question corpus at ``interviews/vault/questions/`` and its
schema, taxonomy, chains, release-policy, and release artifacts under
``interviews/vault/releases/`` are licensed under the Creative Commons
Attribution 4.0 International License.
You are free to:
- **Share** — copy and redistribute the material in any medium or format.
- **Adapt** — remix, transform, and build upon the material for any purpose,
even commercially.
Under the following terms:
- **Attribution** — You must give appropriate credit, provide a link to the
license, and indicate if changes were made. You may do so in any reasonable
manner, but not in any way that suggests the licensor endorses you or your use.
Recommended citation format (BibTeX tied to release_hash):
@misc{staffml2026,
title = {StaffML: ML Systems Interview Preparation Question Corpus},
author = {Janapa Reddi, Vijay and contributors},
year = {2026},
version = {v<release_id>},
note = {Release hash: <release_hash>},
url = {https://staffml.mlsysbook.ai}
}
No additional restrictions — you may not apply legal terms or technological
measures that legally restrict others from doing anything the license permits.
Full license text: https://creativecommons.org/licenses/by/4.0/legalcode
──────────────────────────────────────────────────────────────────────────────
Scope note: this CC-BY-4.0 license applies to the corpus content (questions,
taxonomy, chains). The ``vault-cli`` Python package at
``interviews/vault-cli/`` is licensed separately under MIT — see
``interviews/vault-cli/LICENSE``.