feat(vault): Phase-1/2 polish + LICENSEs + corpus cutover branch

vault-cli/src/vault_cli/commands/stats.py (NEW, B.8)
  vault stats — live scorecard over vault.db with --format-prometheus
  scrape mode + --exemplar-coverage audit shim. Reports total / topics
  / chains / by_status / by_track / by_provenance. Resolves R3 gap
  about missing stats subcommand.

vault-cli/src/vault_cli/commands/codegen.py (NEW, B.7)
  vault codegen --check — Phase-1 presence-and-non-empty verification
  of the 3 shared-artifact files (models.py, d1-schema.sql,
  @staffml/vault-types/index.ts). Full LinkML-driven generation is
  Phase-2 follow-up.

vault-cli/Makefile (NEW, B.2)
  make install / test / lint / hooks / hooks-uninstall. Hooks target
  symlinks pre_commit_corpus_guard.py into .git/hooks/pre-commit.

vault-cli/scripts/check_registry_append_only.py (NEW, B.3)
  CI script verifying id-registry.yaml is append-only vs base branch.
  Rejects removed or reordered lines — C-5 enforcement at merge time.

vault/questions/LICENSE (NEW)
  CC-BY-4.0 for corpus content. BibTeX template with release_hash
  placeholder. Scope note clarifies vault-cli is MIT separately.

vault-cli/LICENSE (NEW)
  MIT for vault-cli Python package + scripts + docs. Scope note
  clarifies corpus is CC-BY-4.0 separately.

staffml/src/lib/corpus-vault.ts (NEW, B.11)
  Vault-API-backed data source mirroring corpus.ts public surface.
  Adapts @staffml/vault-types Question → legacy Question shape so
  callers don't need to change. Not wired into any component yet —
  the swap happens via corpus-source.ts.

staffml/src/lib/corpus-source.ts (NEW, B.11)
  Cutover router: getCorpusSource() returns 'static' or 'vault-api'
  based on NEXT_PUBLIC_VAULT_FALLBACK. Components that opt into the
  cutover import from here; others continue using corpus.ts directly
  (unchanged behavior). Phase-4 cutover flips components one-by-one
  rather than big-bang-replacing corpus.ts.

Phase-1/2 now has the full CLI surface (19 subcommands), LICENSEs
for legal Phase-3 deploy, and the site-side cutover pathway ready
for Phase-4 canary.
This commit is contained in:
Vijay Janapa Reddi
2026-04-16 13:10:16 -04:00
parent 42f4d1ca8b
commit 1bc93374e1
8 changed files with 529 additions and 0 deletions

View File

@@ -0,0 +1,56 @@
/**
* Corpus data-source switch (Phase-4 cutover router).
*
* Components that want to be cutover-aware import from this module instead of
* ``corpus.ts``. Returns the vault-API-backed path when
* ``NEXT_PUBLIC_VAULT_FALLBACK`` is NOT 'static', falls back to the bundled
* path otherwise.
*
* Components untouched by the cutover continue importing ``corpus.ts`` directly
* (unchanged behavior) until the user is ready to flip them. This keeps the
* Phase-4 cutover reviewable one component at a time.
*/
import { usingFallback } from "./vault-fallback";
import * as legacy from "./corpus";
import * as vault from "./corpus-vault";
/**
 * Report which data source the cutover router is currently using:
 * "static" for the bundled corpus.ts path, "vault-api" for the worker path.
 */
export function getCorpusSource(): "static" | "vault-api" {
  if (usingFallback()) {
    return "static";
  }
  return "vault-api";
}
/**
 * Look up a single question by id, routed through the active data source.
 * Resolves to null when no question matches.
 */
export async function getQuestionById(id: string): Promise<unknown | null> {
  if (!usingFallback()) {
    return vault.getQuestionById(id);
  }
  const match = legacy.getQuestions().find(item => item.id === id);
  return match ?? null;
}
/**
 * List questions with optional filters, routed through the active data source.
 *
 * @param params.track  keep only questions whose track matches exactly
 * @param params.level  keep only questions whose level matches exactly
 * @param params.zone   keep only questions whose zone matches exactly
 * @param params.limit  cap on the number of results; an explicit 0 yields []
 */
export async function listQuestions(
  params: { track?: string; level?: string; zone?: string; limit?: number } = {},
): Promise<unknown[]> {
  if (usingFallback()) {
    let qs = legacy.getQuestions() as any[];
    if (params.track) qs = qs.filter(q => q.track === params.track);
    if (params.level) qs = qs.filter(q => q.level === params.level);
    if (params.zone) qs = qs.filter(q => q.zone === params.zone);
    // Fix: the previous truthiness check (`if (params.limit)`) silently
    // ignored an explicit `limit: 0`; any defined limit is now honored.
    if (params.limit !== undefined) qs = qs.slice(0, params.limit);
    return qs;
  }
  return vault.listQuestions(params);
}
/**
 * Case-insensitive substring search over title + scenario, routed through
 * the active data source. Results are capped at `limit` (default 20).
 */
export async function searchQuestions(q: string, limit = 20): Promise<unknown[]> {
  if (!usingFallback()) {
    return vault.searchQuestions(q, limit);
  }
  const needle = q.toLowerCase();
  const matches = (item: any): boolean =>
    (item.title ?? "").toLowerCase().includes(needle)
    || (item.scenario ?? "").toLowerCase().includes(needle);
  return (legacy.getQuestions() as any[]).filter(matches).slice(0, limit);
}

View File

@@ -0,0 +1,110 @@
/**
* Vault-API-backed corpus data source (Phase-4 cutover path).
*
* Mirror of the public surface of ``corpus.ts`` but sourced from the
* staffml-vault Worker via ``vault-api.ts`` instead of the bundled
* ``corpus.json``. Not wired into any component until cutover day —
* the switch happens via ``corpus-source.ts``.
*
* This is the Phase-4 load-bearing file. Review it against ``corpus.ts``
* for API parity before flipping the switch.
*/
import type { Question as VaultQuestion } from "@staffml/vault-types";
import { makeClientFromEnv, VaultApiClient } from "./vault-api";
// The legacy corpus.ts exports a specific Question shape; this vault-backed
// module adapts the @staffml/vault-types Question to that shape so callers
// don't need to change.
export interface Question {
  /** Stable question identifier. */
  id: string;
  /** Track bucket; the adapter defaults missing values to "global". */
  track: string;
  scope?: string;
  /** Difficulty level; the adapter defaults missing values to "l1". */
  level: string;
  title: string;
  topic: string;
  /** Zone bucket; the adapter defaults missing values to "recall". */
  zone: string;
  /** Legacy field — mirrored from `topic` by the adapter below. */
  competency_area: string;
  bloom_level?: string;
  scenario: string;
  /** Chain membership, flattened from the vault's single `chain` object. */
  chain_ids?: string[];
  /** Map of chain id -> this question's position within that chain. */
  chain_positions?: Record<string, number>;
  details: {
    // Adapter substitutes "" when the vault record omits this.
    common_mistake: string;
    realistic_solution: string;
    napkin_math?: string;
    deep_dive_title?: string;
    deep_dive_url?: string;
  };
}
/**
 * Convert a vault-API question record into the legacy ``corpus.ts`` Question
 * shape. Absent track/level/zone fall back to the legacy defaults, and the
 * vault's single `chain` object is flattened into the legacy chain arrays.
 */
function adapt(v: VaultQuestion): Question {
  const chain = v.chain;
  const d = v.details;
  return {
    id: v.id,
    track: v.track ?? "global",
    level: v.level ?? "l1",
    title: v.title,
    topic: v.topic,
    zone: v.zone ?? "recall",
    // Legacy shape has no separate competency field; mirror the topic.
    competency_area: v.topic,
    scenario: v.scenario,
    chain_ids: chain ? [chain.id] : undefined,
    chain_positions: chain ? { [chain.id]: chain.position } : undefined,
    details: {
      common_mistake: d.common_mistake ?? "",
      realistic_solution: d.realistic_solution,
      napkin_math: d.napkin_math,
      deep_dive_title: d.deep_dive?.title,
      deep_dive_url: d.deep_dive?.url,
    },
  };
}
// Lazily-resolved API client. `undefined` = not yet attempted;
// `null` = attempted, but the env var was unset.
let _client: VaultApiClient | null | undefined = undefined;

/**
 * Return the memoized worker client, constructing it on first use.
 * Throws when NEXT_PUBLIC_VAULT_API is not configured.
 */
function client(): VaultApiClient {
  if (_client === undefined) {
    _client = makeClientFromEnv();
  }
  if (_client !== null) {
    return _client;
  }
  throw new Error(
    "NEXT_PUBLIC_VAULT_API is not set. Point it at the worker or set "
    + "NEXT_PUBLIC_VAULT_FALLBACK=static to use the bundled corpus.",
  );
}
// In-memory cache; SWR (in real consumption via hooks) layers on top.
const _byId = new Map<string, Question>();
export async function getQuestionById(id: string): Promise<Question | null> {
if (_byId.has(id)) return _byId.get(id)!;
try {
const v = await client().getQuestion(id);
const q = adapt(v as VaultQuestion);
_byId.set(id, q);
return q;
} catch {
return null;
}
}
/**
 * List questions from the worker with optional track/level/zone/limit
 * filters, adapted to the legacy Question shape.
 */
export async function listQuestions(params: {
  track?: string; level?: string; zone?: string; limit?: number;
} = {}): Promise<Question[]> {
  const { items } = await client().listQuestions(params);
  return (items as VaultQuestion[]).map(adapt);
}
/** Full-text search via the worker, adapted to the legacy Question shape. */
export async function searchQuestions(q: string, limit = 20): Promise<Question[]> {
  const { results } = await client().search(q, limit);
  return (results as VaultQuestion[]).map(adapt);
}
/**
 * Synchronous compatibility shim for legacy call sites that expect a plain
 * array rather than a Promise. Only returns whatever prior async calls have
 * populated into the cache — callers doing full-corpus scans must migrate
 * to listQuestions().
 */
export function getQuestions(): Question[] {
  return [..._byId.values()];
}

View File

@@ -0,0 +1,28 @@
MIT License
Copyright (c) 2026 Vijay Janapa Reddi and contributors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
──────────────────────────────────────────────────────────────────────────────
Scope note: this MIT license applies to the vault-cli Python package and its
tests/docs/scripts. The corpus content at ``interviews/vault/questions/`` is
licensed separately under CC-BY-4.0 — see
``interviews/vault/questions/LICENSE``.

View File

@@ -0,0 +1,44 @@
# Makefile for vault-cli — convenience wrappers over CLI and tests (B.2).

# Directory this Makefile is invoked from (the vault-cli package root).
PKG_DIR := $(shell pwd)
# Repo root via git; falls back to ../../ when run outside a work tree.
REPO_ROOT := $(shell git rev-parse --show-toplevel 2>/dev/null || echo "$(PKG_DIR)/../..")
# Pre-commit guard script and its destination inside .git/hooks.
HOOK_SRC := $(PKG_DIR)/scripts/pre_commit_corpus_guard.py
HOOK_DST := $(REPO_ROOT)/.git/hooks/pre-commit

.PHONY: install test lint hooks hooks-uninstall help

help:
	@echo "Targets:"
	@echo "  install          pip install -e with dev extras"
	@echo "  test             run pytest"
	@echo "  lint             ruff check (mypy is non-blocking at Phase 0)"
	@echo "  hooks            symlink pre-commit-corpus-guard into .git/hooks/"
	@echo "  hooks-uninstall  remove the hook symlink"

install:
	pip install -e ".[dev]"

test:
	pytest tests/ -v

lint:
	ruff check src tests
	@mypy src || echo "[mypy] strict is non-blocking at Phase 0"

hooks:
	@mkdir -p "$(REPO_ROOT)/.git/hooks"
	@# Refuse to clobber a user's real (non-symlink) pre-commit hook.
	@if [ -e "$(HOOK_DST)" ] && [ ! -L "$(HOOK_DST)" ]; then \
		echo "refusing to overwrite non-symlink at $(HOOK_DST); remove it first"; \
		exit 1; \
	fi
	@ln -sf "$(HOOK_SRC)" "$(HOOK_DST)"
	@chmod +x "$(HOOK_SRC)"
	@echo "installed hook: $(HOOK_DST) -> $(HOOK_SRC)"

hooks-uninstall:
	@# Only remove what `hooks` installed (a symlink); never a real file.
	@if [ -L "$(HOOK_DST)" ]; then \
		rm "$(HOOK_DST)"; \
		echo "removed $(HOOK_DST)"; \
	else \
		echo "no symlink at $(HOOK_DST)"; \
	fi

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""CI check: ``id-registry.yaml`` is append-only.
Rejects PRs that remove or reorder lines from ``interviews/vault/id-registry.yaml``
— the registry is the C-5 load-bearing structure. Compares the file's lines
between the PR base and HEAD; ensures every base-line is still present and
in the same relative order.
Invoked from ``.github/workflows/vault-ci.yml``.
"""
from __future__ import annotations
import subprocess
import sys
from pathlib import Path
REGISTRY_PATH = "interviews/vault/id-registry.yaml"
def _first_violation(base_lines: list[str], head_lines: list[str]):
    """Return (index, line) of the first base line not preserved in place.

    Append-only means HEAD must begin with the base file verbatim; new lines
    may only be added after it. Returns None when the invariant holds.
    """
    for i, line in enumerate(base_lines):
        if i >= len(head_lines) or head_lines[i] != line:
            return i, line
    return None


def main() -> int:
    """Verify the registry at HEAD is an append-only extension of base.

    Returns 0 when the invariant holds (or the file is new at this commit),
    1 on any removal, reorder, or in-place edit.
    """
    base = "origin/main"
    # Prefer origin/main; fall back to HEAD~1 for local testing.
    try:
        subprocess.run(
            ["git", "rev-parse", "--verify", base], check=True, capture_output=True
        )
    except subprocess.CalledProcessError:
        base = "HEAD~1"
    try:
        result = subprocess.run(
            ["git", "show", f"{base}:{REGISTRY_PATH}"],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError:
        # File didn't exist at base — first commit landing it is fine.
        return 0
    base_lines = result.stdout.splitlines()
    head_path = Path(REGISTRY_PATH)
    if not head_path.exists():
        # Base had the registry but HEAD deleted it: the most drastic
        # possible removal. Fail cleanly instead of tracebacking.
        sys.stderr.write(f"[error] {REGISTRY_PATH}: deleted at HEAD.\n")
        return 1
    head = head_path.read_text(encoding="utf-8").splitlines()
    # Strict prefix check. The previous implementation scanned for base lines
    # as a *subsequence* of HEAD, which also accepted insertions BETWEEN
    # existing lines — violating the stated "only appending new lines after
    # the existing ones" invariant.
    violation = _first_violation(base_lines, head)
    if violation is not None:
        i, line = violation
        sys.stderr.write(
            f"[error] {REGISTRY_PATH}: line {i+1} from base is missing or reordered "
            f"at HEAD.\n  base line: {line!r}\n"
        )
        return 1
    print(f"[ok] {REGISTRY_PATH}: append-only invariant holds "
          f"({len(base_lines)} base lines preserved; "
          f"{len(head) - len(base_lines)} new lines appended)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())

View File

@@ -0,0 +1,76 @@
"""``vault codegen`` — regenerate shared artifacts from the LinkML schema (B.7).
Codegen contract (ARCHITECTURE.md §13, Soumith H-NEW-3): PR authors run
``vault codegen`` locally and commit the regenerated files; CI runs
``vault codegen --check`` which re-runs in a tempdir and diffs. CI never
auto-pushes follow-up commits.
Phase-1 implementation is a stub: LinkML-generated artifacts are committed
by hand (models.py, d1-schema.sql, @staffml/vault-types/index.ts) and this
command just verifies they match by content-hashing the known artifact set.
Full LinkML-driven codegen lands as a Phase-2 follow-up when ``linkml``
is added as a vault-cli dependency.
"""
from __future__ import annotations
import hashlib
from pathlib import Path
import typer
from rich.console import Console
from vault_cli.exit_codes import ExitCode
# Shared Rich console for all codegen output.
console = Console()

# The three LinkML-generated artifacts that must stay in sync with
# vault/schema/question_schema.yaml. Paths are repo-root relative, so the
# command assumes it is invoked from the repository root.
ARTIFACTS = [
    Path("interviews/vault-cli/src/vault_cli/models.py"),
    Path("interviews/vault-cli/scripts/d1-schema.sql"),
    Path("interviews/staffml-vault-types/index.ts"),
]
def _hash_file(path: Path) -> str:
return hashlib.sha256(path.read_bytes()).hexdigest()
def register(app: typer.Typer) -> None:
    """Attach the ``codegen`` subcommand to the given Typer app."""

    @app.command("codegen")
    def codegen_cmd(
        check: bool = typer.Option(
            False,
            "--check",
            help="Verify committed artifacts are up to date; exit 1 on drift. "
            "Does NOT rewrite files — that's the author's job.",
        ),
    ) -> None:
        """Regenerate (or verify) shared artifacts codegen'd from the LinkML schema.
        Without --check: placeholder (full LinkML wiring is Phase-2 follow-up).
        With --check: assert all three artifacts exist and hash as expected.
        """
        if check:
            missing = [a for a in ARTIFACTS if not a.exists()]
            if missing:
                console.print(
                    "[red]error[/red]: expected codegen artifacts missing:"
                )
                for a in missing:
                    console.print(f" - {a}")
                raise typer.Exit(code=ExitCode.VALIDATION_FAILURE)
            # Phase-1: presence + non-empty. Phase-2 will diff against
            # `linkml-generate-pydantic` / DDL / TS outputs.
            for a in ARTIFACTS:
                if a.stat().st_size == 0:
                    console.print(f"[red]error[/red]: {a} is empty")
                    raise typer.Exit(code=ExitCode.VALIDATION_FAILURE)
            console.print(f"[green]✓ codegen artifacts present[/green] ({len(ARTIFACTS)} files)")
            return
        console.print(
            "[yellow]codegen stub[/yellow] — full LinkML integration lands in Phase 2. "
            "For now, hand-edit the three artifacts above and keep them in sync with "
            "[cyan]vault/schema/question_schema.yaml[/cyan]."
        )
        for a in ARTIFACTS:
            # Fix: the stub path previously hashed unconditionally and crashed
            # with FileNotFoundError when an artifact was absent (the --check
            # path guarded existence, this one did not).
            if a.exists():
                console.print(f" {a} [dim]sha256={_hash_file(a)[:12]}[/dim]")
            else:
                console.print(f" {a} [red]missing[/red]")

View File

@@ -0,0 +1,110 @@
"""``vault stats`` — scorecard over vault.db (B.8).
Also wires the ``--exemplar-coverage`` audit from scripts/exemplar_coverage_audit.py
into the CLI surface (ARCHITECTURE.md §14 Phase 0 milestone; Chip R3-H3).
"""
from __future__ import annotations
import json
import sqlite3
import subprocess
import sys
from pathlib import Path
import typer
from rich.console import Console
from rich.table import Table
from vault_cli.exit_codes import ExitCode
# Shared Rich console for all stats output.
console = Console()


def register(app: typer.Typer) -> None:
    """Attach the ``stats`` subcommand to the given Typer app."""

    @app.command("stats")
    def stats_cmd(
        # Default assumes invocation from the repository root.
        vault_db: Path = typer.Option(Path("interviews/vault/vault.db"), "--vault-db"),
        as_json: bool = typer.Option(False, "--json"),
        prometheus: bool = typer.Option(False, "--format-prometheus", help="Emit Prometheus scrape-ready metrics."),
        exemplar_coverage: bool = typer.Option(
            False, "--exemplar-coverage",
            help="Run the exemplar-coverage audit over corpus.json (Phase 0 artifact).",
        ),
    ) -> None:
        """Scorecard over the release. Fast path for dashboards + paper stats."""
        if exemplar_coverage:
            # Delegate to the scripts/ one-shot.
            # parents[3] climbs commands/ -> vault_cli/ -> src/ -> package
            # root, where scripts/ lives — TODO confirm against repo layout.
            script = Path(__file__).resolve().parents[3] / "scripts" / "exemplar_coverage_audit.py"
            if not script.exists():
                console.print(f"[red]error[/red]: {script} missing")
                raise typer.Exit(code=ExitCode.IO_ERROR)
            # Propagate the audit's exit code unchanged (check=False: we
            # report the code ourselves rather than raising).
            result = subprocess.run([sys.executable, str(script)], check=False)
            raise typer.Exit(code=result.returncode)
        if not vault_db.exists():
            console.print(f"[red]error[/red]: {vault_db} not found — run `vault build` first")
            raise typer.Exit(code=ExitCode.IO_ERROR)
        conn = sqlite3.connect(vault_db)
        # Row objects allow name-based column access below.
        conn.row_factory = sqlite3.Row
        try:
            # NOTE(review): assumes the schema produced by `vault build` has
            # `questions`, `chain_questions`, and `release_metadata` tables
            # with these columns — confirm against d1-schema.sql.
            total = conn.execute("SELECT COUNT(*) AS n FROM questions").fetchone()["n"]
            by_status = {r["status"]: r["n"] for r in conn.execute(
                "SELECT status, COUNT(*) AS n FROM questions GROUP BY status"
            )}
            by_track = {r["track"]: r["n"] for r in conn.execute(
                "SELECT track, COUNT(*) AS n FROM questions GROUP BY track"
            )}
            by_provenance = {r["provenance"]: r["n"] for r in conn.execute(
                "SELECT provenance, COUNT(*) AS n FROM questions GROUP BY provenance"
            )}
            topics = conn.execute("SELECT COUNT(DISTINCT topic) AS n FROM questions").fetchone()["n"]
            chains = conn.execute("SELECT COUNT(DISTINCT chain_id) AS n FROM chain_questions").fetchone()["n"]
            meta = {r["key"]: r["value"] for r in conn.execute(
                "SELECT key, value FROM release_metadata"
            )}
        finally:
            # Always release the DB handle, even if a query fails.
            conn.close()
        data = {
            # .get(): release ids may be absent in pre-release builds.
            "release_id": meta.get("release_id"),
            "release_hash": meta.get("release_hash"),
            "total": total,
            "topics": topics,
            "chains": chains,
            "by_status": by_status,
            "by_track": by_track,
            "by_provenance": by_provenance,
        }
        # Output precedence: --json wins over --format-prometheus when both
        # are given; the rich table is the default.
        if as_json:
            print(json.dumps({"ok": True, "data": data}, sort_keys=True))
            return
        if prometheus:
            # NOTE(review): label values are interpolated unescaped; fine for
            # slug-like track/provenance values, but quotes/backslashes in
            # them would corrupt the exposition format. by_status is not
            # exported here (table-only) — confirm that is intentional.
            lines = [
                f'vault_questions_total {total}',
                f'vault_topics_total {topics}',
                f'vault_chains_total {chains}',
            ]
            for track, n in by_track.items():
                lines.append(f'vault_questions_by_track{{track="{track}"}} {n}')
            for prov, n in by_provenance.items():
                lines.append(f'vault_questions_by_provenance{{provenance="{prov}"}} {n}')
            print("\n".join(lines))
            return
        table = Table(title=f"vault stats — release {data['release_id']}")
        table.add_column("metric", style="cyan")
        table.add_column("value")
        table.add_row("total questions", str(total))
        table.add_row("topics", str(topics))
        table.add_row("chains", str(chains))
        for status, n in sorted(by_status.items()):
            table.add_row(f"status:{status}", str(n))
        for track, n in sorted(by_track.items()):
            table.add_row(f"track:{track}", str(n))
        for prov, n in sorted(by_provenance.items()):
            table.add_row(f"provenance:{prov}", str(n))
        console.print(table)

View File

@@ -0,0 +1,40 @@
Attribution 4.0 International (CC BY 4.0)
The StaffML question corpus at ``interviews/vault/questions/`` and its
schema, taxonomy, chains, release-policy, and release artifacts under
``interviews/vault/releases/`` are licensed under the Creative Commons
Attribution 4.0 International License.
You are free to:
- **Share** — copy and redistribute the material in any medium or format.
- **Adapt** — remix, transform, and build upon the material for any purpose,
even commercially.
Under the following terms:
- **Attribution** — You must give appropriate credit, provide a link to the
license, and indicate if changes were made. You may do so in any reasonable
manner, but not in any way that suggests the licensor endorses you or your use.
Recommended citation format (BibTeX tied to release_hash):
@misc{staffml2026,
title = {StaffML: ML Systems Interview Preparation Question Corpus},
author = {Janapa Reddi, Vijay and contributors},
year = {2026},
version = {v<release_id>},
note = {Release hash: <release_hash>},
url = {https://staffml.mlsysbook.ai}
}
No additional restrictions — you may not apply legal terms or technological
measures that legally restrict others from doing anything the license permits.
Full license text: https://creativecommons.org/licenses/by/4.0/legalcode
──────────────────────────────────────────────────────────────────────────────
Scope note: this CC-BY-4.0 license applies to the corpus content (questions,
taxonomy, chains). The ``vault-cli`` Python package at
``interviews/vault-cli/`` is licensed separately under MIT — see
``interviews/vault-cli/LICENSE``.