fix(ci): gate staffml dev preview deploy on validate workflows passing

Closes a race where the parallel staffml preview and validate workflows
(both push-triggered on dev) could finish independently — a deploy could
ship even when Validate (Dev) or Validate (Vault) failed on the same SHA.

Restructures preview-dev into four jobs: validate-dev (uses), validate-vault
(uses), build, and a deploy job that `needs:` the other three. Build runs in
parallel with the validates and uploads its artifact; deploy downloads it and
SSH-pushes.
Validates gain a workflow_call trigger and rekey concurrency to
`head_ref || run_id` so the standalone push run and the Preview-uses call
don't collide on a shared group.
This commit is contained in:
Vijay Janapa Reddi
2026-05-01 13:25:14 -04:00
parent 99249d00b3
commit 6ddb82a71b
3 changed files with 121 additions and 102 deletions

View File

@@ -4,26 +4,37 @@ name: '🎯 StaffML · 👁️ Preview (Dev)'
# StaffML — Dev Preview Deploy
# =============================================================================
#
# Builds the StaffML Next.js interview prep app and deploys to the dev
# preview site via SSH. Includes vault validation and smoke tests.
# Builds the StaffML Next.js interview-prep app and deploys to the dev
# preview site via SSH. Validation is delegated to the reusable workflows
# staffml-validate-dev.yml + staffml-validate-vault.yml — they run as
# `uses:` jobs in this workflow, and the deploy job lists both in `needs:`,
# so it runs only after both have passed. This closes the race window where
# Preview could deploy on a SHA the parallel Validate workflows had already
# failed on.
#
# Flow:
# 1. BUILD — npm ci + Next.js static export
# 2. VALIDATE — Build output + vault integrity + smoke tests
# 3. DEPLOY — Push to dev preview repo via SSH
# Job graph (parallel where possible):
# ┌── validate-dev (uses: staffml-validate-dev.yml)
# ├── validate-vault (uses: staffml-validate-vault.yml)
# ├── build (Preview-specific Next.js static export)
# └── deploy (needs: validate-dev + validate-vault + build)
#
# The validate jobs and build run in parallel — wall-clock for the typical
# push is max(validate, build) + ~1 min for SSH deploy, basically
# unchanged from the previous single-job design.
#
# Triggers:
# - push: dev branch, interviews/staffml/** paths
# - push: dev branch, interviews/staffml/** + vault questions/chains/schema
# - workflow_dispatch: manual
#
# Deploys to: harvard-edge.github.io/{DEV_REPO}/staffml/
# Secrets: SSH_DEPLOY_KEY
# Secrets: SSH_DEPLOY_KEY (deploy job only)
# Vars: DEV_REPO_URL
#
# Related:
# - staffml-publish-live.yml — Production deploy to mlsysbook.ai/staffml/
# - staffml-auto-pr.yml — Auto-PR from community question issues
# - staffml-welcome.yml — Welcome comment on contributor PRs
# - staffml-validate-dev.yml — Reusable site validate (called above)
# - staffml-validate-vault.yml — Reusable vault validate (called above)
# - staffml-publish-live.yml — Production deploy to mlsysbook.ai/staffml/
# - staffml-auto-pr.yml — Auto-PR from community question issues
# - staffml-welcome.yml — Welcome comment on contributor PRs
#
# =============================================================================
@@ -33,7 +44,7 @@ on:
branches: [dev]
paths:
- 'interviews/staffml/**'
# Also redeploy when YAMLs or chains change — the workflow
# Also redeploy when YAMLs or chains change — the build job
# regenerates corpus.json + corpus-summary.json from YAMLs before
# building, so the dev site always reflects current vault state.
- 'interviews/vault/questions/**'
@@ -48,23 +59,46 @@ concurrency:
group: staffml-dev-deploy
cancel-in-progress: true
jobs:
build-and-deploy:
name: '🎯 Build & Deploy StaffML (Dev)'
runs-on: ubuntu-latest
env:
# Single source for paths and versions — see docs/CI-VARIABLES.md.
# The `paths:` trigger filter at the top of this file is intentionally
# NOT vars-ified (GitHub Actions evaluates triggers at workflow-load
# time, before vars are resolved). If STAFFML_ROOT moves, update the
# paths filter manually.
STAFFML_ROOT: ${{ vars.STAFFML_ROOT || 'interviews/staffml' }}
VAULT_DIR: ${{ vars.VAULT_DIR || 'interviews/vault' }}
VAULT_CLI_DIR: ${{ vars.VAULT_CLI_DIR || 'interviews/vault-cli' }}
DEV_STAFFML_PATH: ${{ vars.DEV_STAFFML_PATH || 'staffml' }}
NODE_VERSION: ${{ vars.NODE_VERSION || '20' }}
PYTHON_VERSION: ${{ vars.PYTHON_VERSION || '3.12' }}
env:
# Single source for paths and versions — see docs/CI-VARIABLES.md.
# The `paths:` trigger filter at the top of this file is intentionally
# NOT vars-ified (GitHub Actions evaluates triggers at workflow-load
# time, before vars are resolved). If STAFFML_ROOT moves, update the
# paths filter manually.
STAFFML_ROOT: ${{ vars.STAFFML_ROOT || 'interviews/staffml' }}
VAULT_DIR: ${{ vars.VAULT_DIR || 'interviews/vault' }}
VAULT_CLI_DIR: ${{ vars.VAULT_CLI_DIR || 'interviews/vault-cli' }}
DEV_STAFFML_PATH: ${{ vars.DEV_STAFFML_PATH || 'staffml' }}
NODE_VERSION: ${{ vars.NODE_VERSION || '20' }}
PYTHON_VERSION: ${{ vars.PYTHON_VERSION || '3.12' }}
jobs:
# ===========================================================================
# Validate — reusable workflows, run in parallel with build.
# ===========================================================================
# Site validate: tsc + tests + build + Playwright E2E + vault smoke + links.
validate-dev:
name: '✅ Validate (Dev)'
uses: ./.github/workflows/staffml-validate-dev.yml
# Vault validate: vault-cli ruff/mypy/pytest + vault check --strict +
# codegen drift + registry append-only + exemplar coverage + worker vitest.
validate-vault:
name: '✅ Validate (Vault)'
uses: ./.github/workflows/staffml-validate-vault.yml
# ===========================================================================
# Build — Preview-specific Next.js static export, runs in parallel.
# ===========================================================================
# Uses Preview-specific env (BASE_PATH for the dev subdirectory, hosted
# interviewer endpoint, hosted vault API) so the artifact is exactly
# what gets shipped. The validate-dev job builds with its own validation
# env vars and throws away the artifact — that build proves the code
# compiles, this build produces the deployable bytes.
build:
name: '🔨 Build StaffML'
runs-on: ubuntu-latest
timeout-minutes: 25
steps:
- name: 📥 Checkout
uses: actions/checkout@v6
@@ -95,14 +129,6 @@ jobs:
# the committed JSON artifacts drift from YAMLs.
run: vault build --vault-dir "$VAULT_DIR" --release-id preview-dev --local-json
- name: 🔍 Type check
working-directory: ${{ env.STAFFML_ROOT }}
run: npx tsc --noEmit
- name: 🧪 Run tests
working-directory: ${{ env.STAFFML_ROOT }}
run: npm test
- name: 🔨 Build StaffML
working-directory: ${{ env.STAFFML_ROOT }}
env:
@@ -139,7 +165,12 @@ jobs:
- name: 🔗 Rewrite URLs for dev site
run: bash .github/scripts/rewrite-dev-urls.sh "$DEV_STAFFML_PATH" "$STAFFML_ROOT/out"
- name: 🔍 Validate build
- name: 🔍 Validate build output
# Light sanity check on the artifact about to be uploaded. Heavier
# checks (corpus invariants, vault integrity, E2E smoke, link
# check, schema drift) live in the called validate workflows;
# this just confirms the critical pages were emitted by the
# build before we hand it to the deploy job.
run: |
if [ ! -f "$STAFFML_ROOT/out/index.html" ]; then
echo "❌ CRITICAL: index.html missing. Aborting deployment."
@@ -177,69 +208,33 @@ jobs:
echo "📋 Vault release: v$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['releaseId'])" "$MANIFEST")"
fi
- name: 🔐 Validate vault integrity
run: python3 "$STAFFML_ROOT/scripts/validate-vault.py"
- name: 📤 Upload build artifact
uses: actions/upload-artifact@v4
with:
name: staffml-dev-build
path: ${{ env.STAFFML_ROOT }}/out
retention-days: 1
if-no-files-found: error
- name: 🧪 Smoke tests
run: |
STAFFML_ROOT="$STAFFML_ROOT" python3 - <<'PY'
import json, os, sys
root = os.environ["STAFFML_ROOT"]
# Test 1: Corpus integrity
with open(f"{root}/src/data/corpus.json") as f:
corpus = json.load(f)
assert len(corpus) >= 4000, f"Corpus too small: {len(corpus)} questions"
print(f"✅ Corpus: {len(corpus)} questions")
# Test 2: Every question has required fields
required = ["id", "title", "level", "track", "scenario", "competency_area", "topic", "zone", "details"]
missing = []
for q in corpus:
for field in required:
if not q.get(field):
missing.append(f"{q.get('id', '???')} missing {field}")
if missing:
print(f"⚠️ {len(missing)} questions with missing fields")
for m in missing[:5]:
print(f" {m}")
if len(missing) > 5:
print(f" ... and {len(missing) - 5} more")
else:
print("✅ All questions have required fields")
# Test 3: Every question has a valid level
valid_levels = {"L1", "L2", "L3", "L4", "L5", "L6", "L6+"}
bad_levels = [q["id"] for q in corpus if q.get("level") not in valid_levels]
assert len(bad_levels) == 0, f"{len(bad_levels)} questions with invalid levels: {bad_levels[:3]}"
print("✅ All levels valid (L1-L6+)")
# Test 4: Taxonomy loads and has concepts
with open(f"{root}/src/data/taxonomy.json") as f:
taxonomy = json.load(f)
concepts = taxonomy.get("concepts", [])
assert len(concepts) >= 70, f"Taxonomy too small: {len(concepts)} topics"
print(f"✅ Taxonomy: {len(concepts)} topics")
# Test 5: Manifest exists and is valid (single source of truth)
with open(f"{root}/src/data/vault-manifest.json") as f:
manifest = json.load(f)
assert "releaseId" in manifest, "Manifest missing releaseId"
assert "releaseHash" in manifest and len(manifest["releaseHash"]) >= 16, \
"Manifest missing or truncated releaseHash"
assert manifest["questionCount"] == len(corpus), \
f"Manifest count mismatch: {manifest['questionCount']} vs {len(corpus)}"
print(f"✅ Manifest: v{manifest['releaseId']} ({manifest['questionCount']} Qs, hash {manifest['releaseHash'][:7]})")
# Test 6: Build output has static assets
js_files = [f for f in os.listdir(f"{root}/out/_next/static") if not f.startswith(".")]
assert len(js_files) > 0, "No static JS chunks found"
print(f"✅ Static assets: {len(js_files)} chunks")
print()
print("🎯 All smoke tests passed")
PY
# ===========================================================================
# Deploy — gated on validates + build all succeeding.
# ===========================================================================
# The job graph means this never starts unless every validate workflow
# and the Preview-specific build all reached `success`. The failure mode
# this replaces — "Preview ✅ but Validate ❌ on the same SHA" — can no
# longer occur: if either validate fails, this job is skipped and Preview
# as a whole reports failure.
deploy:
name: '🚀 Deploy StaffML to Dev Site'
runs-on: ubuntu-latest
needs: [validate-dev, validate-vault, build]
timeout-minutes: 10
steps:
- name: 📥 Download build artifact
uses: actions/download-artifact@v4
with:
name: staffml-dev-build
path: staffml-out
- name: 🚀 Deploy to Dev Site via SSH
env:
@@ -264,7 +259,7 @@ jobs:
mkdir -p staffml
echo "🚚 Copying StaffML build..."
cp -r "${{ github.workspace }}/interviews/staffml/out/." staffml/
cp -r "${{ github.workspace }}/staffml-out/." staffml/
if [ ! -f "staffml/index.html" ]; then
echo "❌ CRITICAL: staffml/index.html missing. Aborting."

View File

@@ -31,6 +31,11 @@ name: '🎯 StaffML · ✅ Validate (Dev)'
on:
workflow_dispatch:
# Reusable: staffml-preview-dev.yml calls this via `uses:` so the deploy
# job can `needs:` a green validate. Standalone push/PR triggers below
# stay so the publish guard (infra-publish-guard.yml) and README badge
# still see direct runs on dev.
workflow_call:
pull_request:
paths:
- 'interviews/staffml/**'
@@ -47,7 +52,15 @@ permissions:
contents: read
concurrency:
group: staffml-validate-${{ github.ref }}
# The group prefix is a literal rather than `github.workflow`: under
# `workflow_call` the `github` context belongs to the caller, so
# `github.workflow` would be the Preview workflow's name in every reusable
# workflow Preview calls, and any two of them keyed on it would share one
# group within a Preview run and cancel each other.
#
# `head_ref || run_id` preserves PR cancel-on-amend (head_ref is the PR
# source branch and is stable across PR commits) while making push and
# workflow_call runs unique per-run (head_ref is empty for non-PR events,
# so the group falls back to run_id). Without the per-run fallback, a
# push to dev would trigger BOTH this workflow standalone AND Preview's
# `uses:` call into it; the two would share the same group and one would
# cancel the other — same class of badge-flicker bug the CLAUDE.md note
# describes for manual dispatch.
group: staffml-validate-dev-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env:

View File

@@ -22,6 +22,12 @@ name: '🎯 StaffML · ✅ Validate (Vault)'
on:
workflow_dispatch:
# Reusable: staffml-preview-dev.yml calls this via `uses:` so its deploy
# job can `needs:` a green vault validate. Standalone push/PR triggers
# below stay so the publish guard and README badge still see direct
# runs on dev (and so PRs that touch vault-cli/worker without staffml
# still get validated independent of Preview).
workflow_call:
pull_request:
paths:
- 'interviews/vault/**'
@@ -37,7 +43,12 @@ on:
- 'interviews/staffml-vault-worker/**'
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
# The group prefix is a literal rather than `github.workflow`: under
# `workflow_call` the `github` context belongs to the caller, so
# `github.workflow` would be the Preview workflow's name in every reusable
# workflow Preview calls, and any two of them keyed on it would share one
# group within a Preview run and cancel each other.
#
# `head_ref || run_id` keeps PR cancel-on-amend behavior while making
# push and workflow_call runs unique per-run, so a push to dev that
# triggers both this workflow standalone AND Preview's `uses:` call
# doesn't collide on a shared group. See staffml-validate-dev.yml for
# the long-form rationale.
group: staffml-validate-vault-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
env: