mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 17:49:07 -05:00
Sync the yaml-audit branch with the latest dev work since the previous sync (5c5af75ed).

Brings in 73 commits including:
- CI security fixes: postcss XSS bump, uuid bounds bump, codeql paths-ignore for vendored bundles, read-only token on staffml-validate-vault workflow
- kits/ dark mode polish: code-block readability, dropdown contrast
- vault-cli/: pre-commit ruff hook + 20 ruff fixes, all-contributors auto-credit workflow change to pull_request_target
- dev's earlier merge of yaml-audit (836d481b5) carrying the pre-trailer-strip Phase 1/2/3 history; this merge harmonises that with the current trailer-clean yaml-audit tip
- misc bug fixes (tinytorch perceptron seed, infra workflows, socratiq vite dev injector)

Conflicts resolved (if any) preserve the yaml-audit-side authoritative state for vault/* files (we own those) and the dev-side authoritative state for .github/workflows/* and other shared infrastructure.

# Conflicts:
#   .github/workflows/all-contributors-auto-credit.yml
#   .github/workflows/staffml-preview-dev.yml
#   interviews/staffml/src/data/corpus-summary.json
#   interviews/staffml/src/data/vault-manifest.json
#   interviews/staffml/tests/chain-and-vault-smoke.mjs
#   interviews/vault-cli/README.md
#   interviews/vault-cli/docs/CHAIN_ROADMAP.md
#   interviews/vault-cli/scripts/build_chains_with_gemini.py
#   interviews/vault-cli/scripts/generate_question_for_gap.py
#   interviews/vault-cli/scripts/merge_chain_passes.py
#   interviews/vault-cli/scripts/validate_drafts.py
#   interviews/vault-cli/src/vault_cli/legacy_export.py
#   interviews/vault-cli/tests/test_chain_validation.py
#   interviews/vault/.gitignore
#   interviews/vault/ARCHITECTURE.md
#   interviews/vault/chains.json
#   interviews/vault/id-registry.yaml
#   interviews/vault/questions/edge/optimization/edge-2536.yaml
#   interviews/vault/questions/mobile/deployment/mobile-2147.yaml
#   tinytorch/src/03_layers/03_layers.py
166 lines
5.6 KiB
Python
166 lines
5.6 KiB
Python
#!/usr/bin/env python3
|
|
"""Apply a Gemini-proposed chains.json to replace the live registry.
|
|
|
|
Reads `interviews/vault/chains.proposed.json` (output of
|
|
build_chains_with_gemini.py), validates it against the YAML corpus and
|
|
chain invariants, and on success replaces `interviews/vault/chains.json`.
|
|
|
|
Validation:
|
|
- Every member id exists in the YAML corpus and is published
|
|
- Levels in array order are non-decreasing (Bloom-monotonic) — Δ=0 IS
|
|
allowed at this layer; the strict Δ ∈ {1,2} rule is enforced upstream
|
|
in build_chains_with_gemini.py based on its --mode setting
|
|
- 2 ≤ chain size ≤ 6
|
|
- Single-topic
|
|
- No qid in more than 2 chains; membership in 2 chains is allowed only for L1/L2 anchors
|
|
- chain_id unique
|
|
|
|
The optional ``tier`` field on a chain entry (``primary``/``secondary``,
|
|
added in Phase 1.3 of CHAIN_ROADMAP.md) is intentionally not validated
|
|
here — it's a UI-routing hint, not a structural invariant.
|
|
|
|
Always run `vault check --strict` after this script — that's the final gate.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import shutil
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
|
|
# The ``vault`` directory that sits two levels above this script's directory.
VAULT_DIR = Path(__file__).resolve().parents[2].joinpath("vault")
# AI-pipeline staging artifacts live under _pipeline/ (gitignored).
# See interviews/CLAUDE.md.
PIPELINE_DIR = VAULT_DIR.joinpath("_pipeline")
# Default input: the proposed registry awaiting validation.
PROPOSED = PIPELINE_DIR.joinpath("chains.proposed.json")
# The live registry this script overwrites, and the copy taken before doing so.
LIVE = VAULT_DIR.joinpath("chains.json")
LIVE_BACKUP = VAULT_DIR.joinpath("chains.json.bak")

# Ordinal rank of each level label, used to compare levels within a chain.
LEVEL_RANK = {
    label: rank
    for rank, label in enumerate(("L1", "L2", "L3", "L4", "L5", "L6+"), start=1)
}
|
|
|
|
|
|
def load_yaml_corpus() -> dict[str, dict]:
    """Load every published question document under ``VAULT_DIR / "questions"``.

    Recursively visits ``*.yaml`` files and parses each with
    ``yaml.safe_load``. A document is kept when its ``status`` is
    ``"published"`` or absent; any other status is skipped silently.

    Loading is best-effort: a file that cannot be read or parsed, whose top
    level is not a mapping, or that has no usable ``id`` is skipped. Each such
    skip is reported on stderr so a broken file is visible here rather than
    surfacing later as a "not in published corpus" validation error.

    Returns:
        Mapping of question id to its parsed YAML document. If two files
        declare the same id, the one visited last wins.
    """
    corpus: dict[str, dict] = {}
    for path in (VAULT_DIR / "questions").rglob("*.yaml"):
        try:
            # Explicit UTF-8 so parsing does not depend on the platform locale.
            with open(path, encoding="utf-8") as fh:
                doc = yaml.safe_load(fh)
        except (OSError, ValueError, yaml.YAMLError) as exc:
            # ValueError also covers UnicodeDecodeError from the text decoder.
            print(f"warning: skipping {path}: {exc}", file=sys.stderr)
            continue

        if not isinstance(doc, dict):
            # Empty files parse to None; scalars/lists are not question docs.
            print(
                f"warning: skipping {path}: top level is not a mapping",
                file=sys.stderr,
            )
            continue
        if doc.get("status") not in ("published", None):
            continue

        try:
            corpus[doc["id"]] = doc
        except (KeyError, TypeError):
            # Missing id, or an id that cannot serve as a dict key.
            print(
                f"warning: skipping {path}: missing or unusable 'id'",
                file=sys.stderr,
            )
    return corpus
|
|
|
|
|
|
def validate(proposed: list[dict], corpus: dict[str, dict]) -> list[str]:
    """Check proposed chains against the corpus and the chain invariants.

    Per chain (entries failing an early check are reported and skipped):
      * the entry is a JSON object with a truthy ``chain_id``;
      * ``questions`` is a list of 2 to 6 members;
      * every member is an object whose ``id`` is a key of ``corpus``;
      * no member id is listed twice within the chain;
      * each member's ``topic`` and ``track`` equal the chain's;
      * member levels, ranked through ``LEVEL_RANK`` (unknown or missing
        levels rank as 0), are non-decreasing in array order.

    Across chains:
      * every ``chain_id`` is unique;
      * a question appears in at most 2 chains, and in 2 only when its level
        is ``L1`` or ``L2``.

    Args:
        proposed: Chain entries as decoded from the proposed JSON file.
        corpus: Published questions keyed by id.

    Returns:
        Human-readable problem descriptions; an empty list means clean.
    """
    errors: list[str] = []
    qid_to_chains: dict[str, list[str]] = {}
    seen_chain_ids: Counter = Counter()

    for ch in proposed:
        if not isinstance(ch, dict):
            errors.append(f"chain entry is not an object: {ch!r}")
            continue
        cid = ch.get("chain_id")
        if not cid:
            errors.append(f"chain missing chain_id: {ch}")
            continue
        seen_chain_ids[cid] += 1

        topic = ch.get("topic")
        track = ch.get("track")
        members = ch.get("questions")
        if members is None:
            # Treat an explicit null like a missing key: an empty chain.
            members = []
        if not isinstance(members, list):
            errors.append(
                f"{cid}: questions must be a list, got {type(members).__name__}"
            )
            continue
        if not (2 <= len(members) <= 6):
            errors.append(f"{cid}: size {len(members)} not in [2,6]")
            continue

        levels = []
        seen_in_chain: set[str] = set()
        for m in members:
            if not isinstance(m, dict):
                errors.append(f"{cid}: member {m!r} is not an object with an 'id'")
                continue
            qid = m.get("id")
            if qid not in corpus:
                errors.append(f"{cid}: member {qid!r} not in published corpus")
                continue
            if qid in seen_in_chain:
                # Report the repeat once and keep it out of the per-question
                # chain tally, so one chain is never counted as two.
                errors.append(f"{cid}: member {qid} listed more than once")
                continue
            seen_in_chain.add(qid)

            d = corpus[qid]
            levels.append(LEVEL_RANK.get(d.get("level"), 0))
            if d.get("topic") != topic:
                errors.append(
                    f"{cid}: member {qid} topic={d.get('topic')!r} != chain topic {topic!r}"
                )
            if d.get("track") != track:
                errors.append(
                    f"{cid}: member {qid} track={d.get('track')!r} != chain track {track!r}"
                )
            qid_to_chains.setdefault(qid, []).append(cid)

        if levels != sorted(levels):
            errors.append(f"{cid}: levels not monotonic: {levels}")

    for cid, n in seen_chain_ids.items():
        if n > 1:
            errors.append(f"chain_id {cid!r} appears {n} times")

    # Multi-chain membership cap: a question can be in at most 2 chains, and
    # only if it's L1 or L2 (foundational anchor pattern). Anything beyond
    # that is over-stuffing — likely a generic question reused too widely.
    for qid, chain_list in qid_to_chains.items():
        if len(chain_list) > 2:
            errors.append(f"{qid} appears in {len(chain_list)} chains; cap is 2")
        elif len(chain_list) == 2:
            level = corpus.get(qid, {}).get("level")
            if level not in ("L1", "L2"):
                errors.append(
                    f"{qid} (level={level}) is in 2 chains but multi-membership "
                    f"is only allowed for L1/L2 anchors"
                )

    return errors
|
|
|
|
|
|
def main() -> int:
    """Validate a proposed chains file and, if allowed, install it as live.

    Flow: parse arguments, read the proposed JSON, load the published corpus,
    run ``validate``, print up to 30 issues, then either stop or overwrite
    ``LIVE`` after copying the existing file to ``LIVE_BACKUP``.

    Flags:
        --proposed: path of the proposed chains JSON (defaults to ``PROPOSED``).
        --dry-run: validate and report only; never writes.
        --force: apply even when validation reported issues.

    Returns:
        Process exit code. 0 on success or a dry run; 1 when the proposed file
        is missing, cannot be read as a JSON list, or validation reported
        issues and ``--force`` was not given.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--proposed", default=str(PROPOSED))
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument(
        "--force",
        action="store_true",
        # validate() reports a single severity and --force bypasses all of it.
        help="Apply even if validation reports issues (NOT recommended).",
    )
    args = ap.parse_args()

    proposed_path = Path(args.proposed)
    if not proposed_path.exists():
        print(f"missing: {proposed_path}")
        return 1

    try:
        proposed = json.loads(proposed_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"cannot read {proposed_path}: {exc}")
        return 1
    if not isinstance(proposed, list):
        print(
            f"{proposed_path}: expected a JSON list of chains, "
            f"got {type(proposed).__name__}"
        )
        return 1
    print(f"proposed chains: {len(proposed)}")

    corpus = load_yaml_corpus()
    print(f"corpus: {len(corpus)} published questions")

    errors = validate(proposed, corpus)
    if errors:
        print(f"\n{len(errors)} validation issue(s):")
        for e in errors[:30]:
            print(f"  - {e}")
        if len(errors) > 30:
            print(f"  ... and {len(errors)-30} more")
        if not args.force:
            print("\nBlocking. Re-run with --force to apply anyway (NOT recommended).")
            return 1
    else:
        print("validation: clean")

    if args.dry_run:
        print("\n--dry-run set; not applying.")
        return 0

    # Back up live, then write proposed in canonical chains.json shape
    if LIVE.exists():
        shutil.copy2(LIVE, LIVE_BACKUP)
        print(f"backed up {LIVE} -> {LIVE_BACKUP}")
    LIVE.write_text(json.dumps(proposed, indent=2) + "\n", encoding="utf-8")
    print(f"wrote {LIVE} with {len(proposed)} chains")
    print("\nNow run: vault check --strict")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|