Files
cs249r_book/scripts/version/release.py
Vijay Janapa Reddi b8183404b8 chore(release): shared versioning infrastructure
Lays foundation for unified release versioning across MLSysBook
publishable artifacts. Pure additions — no existing builds, configs,
or sources are touched.

scripts/version/release.py
  Python CLI with helpers:
  - compute-id: semver bump from previous tag (patch/minor/major/none/explicit)
  - compute-hash: deterministic SHA-256 over input directories with per-file index
  - emit-release: writes releases/<project>-<id>/release.json (canonical artifact)
  - emit-manifest: writes the build-time manifest the deployable bundles
  Tier A (citable) emits per-file Merkle index; Tier B (lite) is flat.

scripts/version/schema.json
  JSON Schema for release.json. Validates project/tier/release_id/release_hash
  + Tier A's files[] index. Used by validators in CI.

shared/release/release-pill.html
  Footer snippet — fetches deployable manifest at runtime, renders
  "v0.1.0 · Apr 26, 2026" pill. Configured per-project via
  <meta name="release-manifest"> tag. Silent on any fetch failure.

shared/release/release-card.html
  About-page snippet — fuller release-identity card with
  click-to-copy hash. Same fetch + meta-tag conventions.

shared/release/README.md
  Operator-facing contract documentation.

.github/workflows/_release-prepare.yml
  Reusable workflow_call. Validates confirm == "PUBLISH", computes
  new_release_id from previous tag + bump (delegates to release.py
  for canonical math). Outputs new_release_id/new_tag/previous_*
  for caller's downstream build and finalize steps. Refuses to
  re-tag existing releases (citation integrity).

Caller workflows still own their build commands and tag/release
creation; this only standardizes the input shape and version math.
2026-04-28 18:06:07 -04:00

406 lines
15 KiB
Python

#!/usr/bin/env python3
"""Shared release-versioning helpers for MLSysBook artifacts.

Single source of truth for "what release is this artifact?" across every
publishable project in the repo (StaffML, TinyTorch, Book Vol I/II,
MLSYSIM, Kits, Labs, Instructors). Designed to be additive: every helper
either emits a NEW file (release.json, manifest.json) or computes a
value to be passed to a publish workflow. Nothing here mutates an
existing build artifact or rewrites pyproject.toml-style canonical
sources.

Two adoption tiers (see docs/VERSIONING.md):

  Tier A (citable: StaffML, TinyTorch, Book, MLSYSIM)
    - Full release.json with input_paths and per-file hashes.
    - Merkle-style root hash bound into release_hash.
    - CHANGELOG.md per project, GitHub Release with notes.

  Tier B (rapidly-iterating: Kits, Labs, Instructors)
    - Flat SHA-256 over content directory (no Merkle).
    - Single release.json at project root, history in CHANGELOG.md.

The CLI is entry-pointed via ``__main__`` for use from any GitHub
Actions step regardless of project language.

Usage from a workflow step:

    python3 scripts/version/release.py compute-id \\
        --previous staffml-v0.1.0 --bump patch
    # → 0.1.1 on stdout

    python3 scripts/version/release.py compute-hash \\
        --paths interviews/vault/questions interviews/vault/release-policy.yaml
    # → 64-char hex hash on stdout

    python3 scripts/version/release.py emit-release \\
        --project staffml --release-id 0.1.1 --tier A \\
        --release-hash <hash> --git-sha <sha> \\
        --output releases/staffml-0.1.1/release.json

    python3 scripts/version/release.py emit-manifest \\
        --project staffml --release-id 0.1.1 --tier A \\
        --release-hash <hash> \\
        --output interviews/staffml/src/data/vault-manifest.json

Note: ``compute-id`` accepts only ``--previous``, ``--bump`` and
``--explicit``; any project prefix on ``--previous`` is stripped
automatically. ``emit-manifest`` requires ``--tier``.
"""
from __future__ import annotations
import argparse
import hashlib
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
# ── Schema constants ──────────────────────────────────────────────────────────
#: Bumped when the shape of release.json itself changes.
RELEASE_SCHEMA_VERSION = "1"

#: Adoption tiers a release artifact may declare.
VALID_TIERS = frozenset(("A", "B"))

#: Bump keywords understood by the release-id computation.
VALID_BUMPS = frozenset(("patch", "minor", "major", "none"))

#: ``MAJOR.MINOR.PATCH`` with an optional ``-prerelease`` suffix (group 4).
SEMVER_RE = re.compile(
    r"^(\d+)\.(\d+)\.(\d+)(?:-([0-9A-Za-z.-]+))?$"
)
# ── Hashing ──────────────────────────────────────────────────────────────────
def _iter_files(paths: list[Path], exclude: list[str] | None = None) -> list[Path]:
    """Collect every file under ``paths`` as a sorted list, stable for hashing.

    Each entry of ``paths`` is either a file (taken as-is, excludes are NOT
    applied to it) or a directory (walked recursively with ``os.walk``).

    Exclude patterns are simple ``fnmatch`` globs and are tested against:

    - each directory's bare name (a match prunes the whole subtree),
    - each file's bare name, and
    - each file's POSIX-style path (``Path.as_posix()``).

    Matching is case-sensitive and always uses ``/`` separators, so the
    same patterns select the same files on every platform. We intentionally
    do NOT consult ``.gitignore`` here: inputs to a release must be
    deterministic and not depend on transient ignore rules. Callers pass
    explicit excludes.

    Args:
        paths: Files and/or directories to collect from.
        exclude: Optional glob patterns; ``None`` means no excludes.

    Returns:
        All collected file paths, sorted.

    Raises:
        FileNotFoundError: If an entry of ``paths`` is neither a file nor
            a directory.
    """
    # Function-local import (as in the original) so the module's top-level
    # import block does not need to change.
    import fnmatch

    patterns = list(exclude or [])

    def _excluded(*candidates: str) -> bool:
        # fnmatchcase (not fnmatch) skips os.path.normcase, which on Windows
        # would make matching case-insensitive and rewrite separators.
        return any(
            fnmatch.fnmatchcase(candidate, pattern)
            for candidate in candidates
            for pattern in patterns
        )

    found: list[Path] = []
    for p in paths:
        if p.is_file():
            found.append(p)
            continue
        if not p.is_dir():
            raise FileNotFoundError(f"input path does not exist: {p}")
        for root, dirs, files in os.walk(p):
            # Filter dirs in-place so os.walk skips excluded subtrees entirely;
            # sorting keeps the traversal order independent of the filesystem.
            dirs[:] = sorted(d for d in dirs if not _excluded(d))
            for name in sorted(files):
                candidate = Path(root) / name
                if _excluded(candidate.as_posix(), name):
                    continue
                found.append(candidate)
    return sorted(found)
def compute_dir_hash(
    paths: list[Path],
    exclude: list[str] | None = None,
) -> tuple[str, list[dict[str, str]]]:
    """Hash every file under ``paths`` into one SHA-256 root digest.

    Files are enumerated (sorted) by ``_iter_files``. For each file a
    per-file SHA-256 is taken over ``<posix path> NUL <file bytes>``; the
    root digest is then the SHA-256 of those per-file hex digests, one per
    line, in enumeration order. Because the path is part of each per-file
    digest, renaming a file changes the root even when its bytes do not.

    Files are read in binary mode, so content is hashed byte-for-byte with
    no newline translation, and path strings are hashed exactly as spelled.

    Args:
        paths: Files and/or directories to hash.
        exclude: Optional glob patterns forwarded to ``_iter_files``.

    Returns:
        ``(hex_hash, file_index)`` where ``file_index`` is a list of
        ``{"path": ..., "hash": ...}`` entries in hashing order, suitable
        for a Tier A release.json ``files`` array. Tier B callers can
        discard the index.
    """
    root_digest = hashlib.sha256()
    file_index: list[dict[str, str]] = []

    for path in _iter_files([Path(p) for p in paths], exclude):
        posix_name = path.as_posix()
        # Seed the per-file digest with the path and a NUL separator so the
        # path/content boundary is unambiguous.
        file_digest = hashlib.sha256(posix_name.encode("utf-8") + b"\x00")
        with open(path, "rb") as stream:
            # Stream in 64 KiB blocks; iter() stops at the empty read (EOF).
            for block in iter(lambda: stream.read(65536), b""):
                file_digest.update(block)
        hex_digest = file_digest.hexdigest()
        file_index.append({"path": posix_name, "hash": hex_digest})
        root_digest.update(hex_digest.encode("ascii") + b"\n")

    return root_digest.hexdigest(), file_index
# ── Semver ───────────────────────────────────────────────────────────────────
def parse_semver(s: str) -> tuple[int, int, int, str | None]:
    """Split a version or tag string into ``(major, minor, patch, prerelease)``.

    Accepts a bare ``X.Y.Z``, a ``vX.Y.Z``, or a project-prefixed tag such
    as ``staffml-v0.1.0`` / ``vol1-v0.6.0``; surrounding whitespace is
    ignored. ``prerelease`` is the text after the first ``-`` following
    the patch number, or ``None`` when there is none.

    Raises:
        ValueError: If what remains after prefix stripping does not match
            ``SEMVER_RE``.
    """
    text = s.strip()
    # Drop a leading "<project>-v" / "<project>v" style prefix, then any
    # remaining leading "v" characters.
    text = re.sub(r"^[A-Za-z][A-Za-z0-9-]*-?v", "", text)
    text = text.lstrip("v")

    match = SEMVER_RE.match(text)
    if match is None:
        raise ValueError(f"not a semver: {text!r}")

    major, minor, patch, prerelease = match.groups()
    return int(major), int(minor), int(patch), prerelease
def compute_release_id(
    previous: str | None,
    bump: str,
    *,
    explicit: str | None = None,
) -> str:
    """Return the new release_id given a previous tag and a bump type.

    - ``explicit`` overrides everything: it is returned without bump math
      or semver validation, after trimming surrounding whitespace and a
      leading ``v`` (so the caller can pass either form). A missing,
      empty, or whitespace-only ``explicit`` counts as "not provided".
    - ``bump=none`` returns the previous version unchanged. Caller is
      responsible for treating this as "site-only redeploy, no tag".
    - ``previous`` may be a bare ``X.Y.Z``, a ``vX.Y.Z``, or any
      project-prefixed tag like ``staffml-v0.1.0`` or ``vol1-v0.6.0``.
      Any pre-release suffix on ``previous`` is dropped from the result.
    - With no previous (first release ever), returns ``1.0.0`` for
      ``major`` and ``0.1.0`` for ``patch``/``minor``.

    Raises:
        ValueError: If ``bump`` is not in ``VALID_BUMPS``, if ``bump`` is
            ``none`` without a previous version, or if ``previous`` is not
            a parseable semver.
    """
    # Trim whitespace BEFORE removing the leading "v": values coming from
    # workflow inputs often carry stray spaces/newlines, and stripping in
    # the other order would leave the "v" in place (" v1.2.3" -> "v1.2.3").
    requested = explicit.strip() if explicit else ""
    if requested:
        return requested.lstrip("v").strip()

    if bump not in VALID_BUMPS:
        raise ValueError(f"bump must be one of {sorted(VALID_BUMPS)}: got {bump!r}")

    if not previous:
        if bump == "none":
            raise ValueError("bump=none requires a previous version")
        return "1.0.0" if bump == "major" else "0.1.0"

    major, minor, patch, _ = parse_semver(previous)
    if bump == "none":
        return f"{major}.{minor}.{patch}"
    if bump == "major":
        return f"{major + 1}.0.0"
    if bump == "minor":
        return f"{major}.{minor + 1}.0"
    return f"{major}.{minor}.{patch + 1}"
# ── Release artifact + manifest emitters ─────────────────────────────────────
def _utc_now_iso() -> str:
    """Return the current UTC time as ``YYYY-MM-DDTHH:MM:SSZ`` (whole seconds)."""
    now = datetime.now(tz=timezone.utc)
    return now.strftime("%Y-%m-%dT%H:%M:%SZ")
def emit_release_json(
    *,
    output: Path,
    project: str,
    tier: str,
    release_id: str,
    release_hash: str,
    schema_version: str,
    previous_release_id: str | None,
    git_sha: str,
    input_paths: list[str],
    file_index: list[dict[str, str]] | None = None,
    metadata: dict[str, Any] | None = None,
    description: str | None = None,
) -> dict[str, Any]:
    """Build the canonical release.json record and write it to ``output``.

    The record always carries the release identity fields, a ``created_at``
    UTC timestamp, the input paths and a ``metadata`` object (``{}`` when
    none is given). ``description`` is added only when non-empty, and the
    per-file ``files`` index only for Tier A when ``file_index`` is given.
    Parent directories of ``output`` are created as needed; the file is
    UTF-8 JSON with sorted keys, 2-space indent and a trailing newline.

    Returns:
        The record that was written.

    Raises:
        ValueError: If ``tier`` is not in ``VALID_TIERS`` or if
            ``release_hash`` is empty or shorter than 16 characters.
    """
    if tier not in VALID_TIERS:
        raise ValueError(f"tier must be one of {sorted(VALID_TIERS)}: got {tier!r}")
    if not (release_hash and len(release_hash) >= 16):
        raise ValueError("release_hash must be a full hex digest (>= 16 chars)")

    record: dict[str, Any] = {
        "release_schema_version": RELEASE_SCHEMA_VERSION,
        "project": project,
        "tier": tier,
        "release_id": release_id,
        "release_hash": release_hash,
        "schema_version": schema_version,
        "previous_release_id": previous_release_id,
        "git_sha": git_sha,
        "created_at": _utc_now_iso(),
        "input_paths": list(input_paths),
        "metadata": metadata or {},
    }
    if description:
        record["description"] = description
    # Only Tier A artifacts carry the per-file index.
    wants_index = tier == "A" and file_index is not None
    if wants_index:
        record["files"] = file_index

    serialized = json.dumps(record, indent=2, sort_keys=True) + "\n"
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(serialized, encoding="utf-8")
    return record
def emit_manifest(
    *,
    output: Path,
    project: str,
    tier: str,
    release_id: str,
    release_hash: str,
    schema_version: str,
    metadata: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the build-time manifest record and write it to ``output``.

    The manifest holds the minimum set of keys every project agrees on:
    ``releaseId``, ``releaseHash``, ``schemaVersion``, ``tier``,
    ``project`` and a ``buildDate`` UTC timestamp. A ``metadata`` key is
    added only when ``metadata`` is non-empty. Per the original design
    note, projects that need extra project-specific keys should write a
    wrapper rather than tack them on through ``metadata``.

    Parent directories of ``output`` are created as needed; the file is
    UTF-8 JSON with sorted keys, 2-space indent and a trailing newline.

    Returns:
        The record that was written.

    Raises:
        ValueError: If ``tier`` is not in ``VALID_TIERS`` or if
            ``release_hash`` is empty or shorter than 16 characters.
    """
    if tier not in VALID_TIERS:
        raise ValueError(f"tier must be one of {sorted(VALID_TIERS)}: got {tier!r}")
    if not (release_hash and len(release_hash) >= 16):
        raise ValueError("release_hash must be a full hex digest (>= 16 chars)")

    manifest: dict[str, Any] = {
        "releaseId": release_id,
        "releaseHash": release_hash,
        "schemaVersion": schema_version,
        "tier": tier,
        "project": project,
        "buildDate": _utc_now_iso(),
    }
    if metadata:
        manifest["metadata"] = metadata

    serialized = json.dumps(manifest, indent=2, sort_keys=True) + "\n"
    output.parent.mkdir(parents=True, exist_ok=True)
    output.write_text(serialized, encoding="utf-8")
    return manifest
# ── CLI ───────────────────────────────────────────────────────────────────────
def _cmd_compute_id(args: argparse.Namespace) -> int:
    """Handle ``compute-id``: print the next release id and return exit code 0.

    Empty-string CLI defaults for ``--previous`` / ``--explicit`` are
    converted to ``None`` before delegating to ``compute_release_id``.
    """
    previous = args.previous if args.previous else None
    explicit = args.explicit if args.explicit else None
    release_id = compute_release_id(
        previous=previous,
        bump=args.bump,
        explicit=explicit,
    )
    print(release_id)
    return 0
def _cmd_compute_hash(args: argparse.Namespace) -> int:
    """Handle ``compute-hash``: print the root hex digest and return exit code 0.

    Only the root hash is printed; the per-file index returned by
    ``compute_dir_hash`` is discarded.
    """
    input_paths = [Path(entry) for entry in args.paths]
    patterns = args.exclude or []
    root_hash, _file_index = compute_dir_hash(input_paths, exclude=patterns)
    print(root_hash)
    return 0
def _cmd_emit_release(args: argparse.Namespace) -> int:
    """Handle ``emit-release``: write release.json and print a JSON summary.

    ``--metadata`` is parsed and validated first, so malformed input fails
    before any files are hashed. For Tier A with ``--input-paths`` given,
    the per-file index is computed with ``compute_dir_hash`` and embedded
    in the artifact; the release hash itself is taken from
    ``--release-hash`` as supplied and is not recomputed here.

    Returns:
        Process exit code ``0``.

    Raises:
        ValueError: If ``--metadata`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or does not
            decode to a JSON object.
    """
    metadata = json.loads(args.metadata) if args.metadata else {}
    # --metadata is documented as a JSON *object*; reject lists, strings,
    # numbers and null instead of writing them into the release artifact.
    if not isinstance(metadata, dict):
        raise ValueError(
            f"--metadata must be a JSON object: got {type(metadata).__name__}"
        )

    file_index = None
    if args.tier == "A" and args.input_paths:
        _, file_index = compute_dir_hash(
            [Path(p) for p in args.input_paths],
            exclude=args.exclude or [],
        )

    payload = emit_release_json(
        output=Path(args.output),
        project=args.project,
        tier=args.tier,
        release_id=args.release_id,
        release_hash=args.release_hash,
        schema_version=args.schema_version,
        previous_release_id=args.previous or None,
        git_sha=args.git_sha,
        input_paths=args.input_paths or [],
        file_index=file_index,
        metadata=metadata,
        description=args.description or None,
    )
    print(json.dumps({"output": str(args.output), "release_id": payload["release_id"]}))
    return 0
def _cmd_emit_manifest(args: argparse.Namespace) -> int:
    """Handle ``emit-manifest``: write the manifest and print a JSON summary.

    Returns:
        Process exit code ``0``.

    Raises:
        ValueError: If ``--metadata`` is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``) or does not
            decode to a JSON object.
    """
    metadata = json.loads(args.metadata) if args.metadata else {}
    # The manifest's metadata slot is an object; reject lists, strings,
    # numbers and null instead of writing them into the manifest.
    if not isinstance(metadata, dict):
        raise ValueError(
            f"--metadata must be a JSON object: got {type(metadata).__name__}"
        )

    payload = emit_manifest(
        output=Path(args.output),
        project=args.project,
        tier=args.tier,
        release_id=args.release_id,
        release_hash=args.release_hash,
        schema_version=args.schema_version,
        metadata=metadata,
    )
    print(json.dumps({"output": str(args.output), "releaseId": payload["releaseId"]}))
    return 0
def main(argv: list[str] | None = None) -> int:
    """Parse ``argv`` and dispatch to the selected sub-command handler.

    Sub-commands: ``compute-id``, ``compute-hash``, ``emit-release`` and
    ``emit-manifest``. Each sub-parser stores its handler under ``func``.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The integer returned by the chosen handler.
    """
    parser = argparse.ArgumentParser(
        prog="release",
        description="Shared release-versioning helpers for MLSysBook.",
    )
    commands = parser.add_subparsers(dest="cmd", required=True)

    def _add_identity_options(target: argparse.ArgumentParser) -> None:
        # Options that emit-release and emit-manifest share, in help order.
        target.add_argument("--output", required=True)
        target.add_argument("--project", required=True)
        target.add_argument("--tier", required=True, choices=sorted(VALID_TIERS))
        target.add_argument("--release-id", required=True)
        target.add_argument("--release-hash", required=True)
        target.add_argument("--schema-version", default="1")

    # compute-id
    id_parser = commands.add_parser(
        "compute-id",
        help="Compute next release_id from previous + bump.",
    )
    id_parser.add_argument(
        "--previous",
        default="",
        help="Previous tag/version. Empty = first release.",
    )
    id_parser.add_argument("--bump", default="patch", choices=sorted(VALID_BUMPS))
    id_parser.add_argument(
        "--explicit",
        default="",
        help="Override: explicit X.Y.Z to use, bypasses bump math.",
    )
    id_parser.set_defaults(func=_cmd_compute_id)

    # compute-hash
    hash_parser = commands.add_parser(
        "compute-hash",
        help="Compute SHA-256 dir hash over input paths.",
    )
    hash_parser.add_argument("--paths", nargs="+", required=True)
    hash_parser.add_argument("--exclude", nargs="*", default=[])
    hash_parser.set_defaults(func=_cmd_compute_hash)

    # emit-release
    release_parser = commands.add_parser(
        "emit-release",
        help="Write releases/<id>/release.json.",
    )
    _add_identity_options(release_parser)
    release_parser.add_argument("--previous", default="")
    release_parser.add_argument("--git-sha", required=True)
    release_parser.add_argument("--input-paths", nargs="*", default=[])
    release_parser.add_argument("--exclude", nargs="*", default=[])
    release_parser.add_argument(
        "--metadata",
        default="",
        help="Extra JSON object to merge into metadata.",
    )
    release_parser.add_argument("--description", default="")
    release_parser.set_defaults(func=_cmd_emit_release)

    # emit-manifest
    manifest_parser = commands.add_parser(
        "emit-manifest",
        help="Write the build-time manifest the deployable reads.",
    )
    _add_identity_options(manifest_parser)
    manifest_parser.add_argument("--metadata", default="")
    manifest_parser.set_defaults(func=_cmd_emit_manifest)

    parsed = parser.parse_args(argv)
    return parsed.func(parsed)
# Script entry point: main()'s integer result becomes the process exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)