mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 01:28:35 -05:00
fix(ci): gate staffml dev preview deploy on validate workflows passing
Closes a race where the parallel staffml preview and validate workflows (both push-triggered on dev) could finish independently — a deploy could ship even when Validate (Dev) or Validate (Vault) failed on the same SHA. Restructures preview-dev into validate-dev (uses) + validate-vault (uses) + build + deploy with `needs:` on all three. Build runs in parallel with the validates and uploads its artifact; deploy downloads it and SSH-pushes. Validates gain a workflow_call trigger and rekey concurrency to `head_ref || run_id` so the standalone push run and the Preview-uses call don't collide on a shared group.
This commit is contained in:
195
.github/workflows/staffml-preview-dev.yml
vendored
195
.github/workflows/staffml-preview-dev.yml
vendored
@@ -4,26 +4,37 @@ name: '🎯 StaffML · 👁️ Preview (Dev)'
|
||||
# StaffML — Dev Preview Deploy
|
||||
# =============================================================================
|
||||
#
|
||||
# Builds the StaffML Next.js interview prep app and deploys to the dev
|
||||
# preview site via SSH. Includes vault validation and smoke tests.
|
||||
# Builds the StaffML Next.js interview-prep app and deploys to the dev
|
||||
# preview site via SSH. Validation is delegated to the reusable workflows
|
||||
# staffml-validate-dev.yml + staffml-validate-vault.yml — they run as
|
||||
# `uses:` jobs in this workflow, and the deploy job lists them in `needs:`,
|
||||
# so it waits for both to pass. This closes the race window where Preview could deploy
|
||||
# on a SHA the parallel Validate workflows had already failed on.
|
||||
#
|
||||
# Flow:
|
||||
# 1. BUILD — npm ci + Next.js static export
|
||||
# 2. VALIDATE — Build output + vault integrity + smoke tests
|
||||
# 3. DEPLOY — Push to dev preview repo via SSH
|
||||
# Job graph (parallel where possible):
|
||||
# ┌── validate-dev (uses: staffml-validate-dev.yml)
|
||||
# ├── validate-vault (uses: staffml-validate-vault.yml)
|
||||
# ├── build (Preview-specific Next.js static export)
|
||||
# └── deploy (needs: validate-dev + validate-vault + build)
|
||||
#
|
||||
# The validate jobs and build run in parallel — wall-clock for the typical
|
||||
# push is max(validate, build) + ~1 min for SSH deploy, basically
|
||||
# unchanged from the previous single-job design.
|
||||
#
|
||||
# Triggers:
|
||||
# - push: dev branch, interviews/staffml/** paths
|
||||
# - push: dev branch, interviews/staffml/** + vault questions/chains/schema
|
||||
# - workflow_dispatch: manual
|
||||
#
|
||||
# Deploys to: harvard-edge.github.io/{DEV_REPO}/staffml/
|
||||
# Secrets: SSH_DEPLOY_KEY
|
||||
# Secrets: SSH_DEPLOY_KEY (deploy job only)
|
||||
# Vars: DEV_REPO_URL
|
||||
#
|
||||
# Related:
|
||||
# - staffml-publish-live.yml — Production deploy to mlsysbook.ai/staffml/
|
||||
# - staffml-auto-pr.yml — Auto-PR from community question issues
|
||||
# - staffml-welcome.yml — Welcome comment on contributor PRs
|
||||
# - staffml-validate-dev.yml — Reusable site validate (called above)
|
||||
# - staffml-validate-vault.yml — Reusable vault validate (called above)
|
||||
# - staffml-publish-live.yml — Production deploy to mlsysbook.ai/staffml/
|
||||
# - staffml-auto-pr.yml — Auto-PR from community question issues
|
||||
# - staffml-welcome.yml — Welcome comment on contributor PRs
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
@@ -33,7 +44,7 @@ on:
|
||||
branches: [dev]
|
||||
paths:
|
||||
- 'interviews/staffml/**'
|
||||
# Also redeploy when YAMLs or chains change — the workflow
|
||||
# Also redeploy when YAMLs or chains change — the build job
|
||||
# regenerates corpus.json + corpus-summary.json from YAMLs before
|
||||
# building, so the dev site always reflects current vault state.
|
||||
- 'interviews/vault/questions/**'
|
||||
@@ -48,23 +59,46 @@ concurrency:
|
||||
group: staffml-dev-deploy
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
build-and-deploy:
|
||||
name: '🎯 Build & Deploy StaffML (Dev)'
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
# Single source for paths and versions — see docs/CI-VARIABLES.md.
|
||||
# The `paths:` trigger filter at the top of this file is intentionally
|
||||
# NOT vars-ified (GitHub Actions evaluates triggers at workflow-load
|
||||
# time, before vars are resolved). If STAFFML_ROOT moves, update the
|
||||
# paths filter manually.
|
||||
STAFFML_ROOT: ${{ vars.STAFFML_ROOT || 'interviews/staffml' }}
|
||||
VAULT_DIR: ${{ vars.VAULT_DIR || 'interviews/vault' }}
|
||||
VAULT_CLI_DIR: ${{ vars.VAULT_CLI_DIR || 'interviews/vault-cli' }}
|
||||
DEV_STAFFML_PATH: ${{ vars.DEV_STAFFML_PATH || 'staffml' }}
|
||||
NODE_VERSION: ${{ vars.NODE_VERSION || '20' }}
|
||||
PYTHON_VERSION: ${{ vars.PYTHON_VERSION || '3.12' }}
|
||||
env:
|
||||
# Single source for paths and versions — see docs/CI-VARIABLES.md.
|
||||
# The `paths:` trigger filter at the top of this file is intentionally
|
||||
# NOT vars-ified (GitHub Actions evaluates triggers at workflow-load
|
||||
# time, before vars are resolved). If STAFFML_ROOT moves, update the
|
||||
# paths filter manually.
|
||||
STAFFML_ROOT: ${{ vars.STAFFML_ROOT || 'interviews/staffml' }}
|
||||
VAULT_DIR: ${{ vars.VAULT_DIR || 'interviews/vault' }}
|
||||
VAULT_CLI_DIR: ${{ vars.VAULT_CLI_DIR || 'interviews/vault-cli' }}
|
||||
DEV_STAFFML_PATH: ${{ vars.DEV_STAFFML_PATH || 'staffml' }}
|
||||
NODE_VERSION: ${{ vars.NODE_VERSION || '20' }}
|
||||
PYTHON_VERSION: ${{ vars.PYTHON_VERSION || '3.12' }}
|
||||
|
||||
jobs:
|
||||
# ===========================================================================
|
||||
# Validate — reusable workflows, run in parallel with build.
|
||||
# ===========================================================================
|
||||
# Site validate: tsc + tests + build + Playwright E2E + vault smoke + links.
|
||||
validate-dev:
|
||||
name: '✅ Validate (Dev)'
|
||||
uses: ./.github/workflows/staffml-validate-dev.yml
|
||||
|
||||
# Vault validate: vault-cli ruff/mypy/pytest + vault check --strict +
|
||||
# codegen drift + registry append-only + exemplar coverage + worker vitest.
|
||||
validate-vault:
|
||||
name: '✅ Validate (Vault)'
|
||||
uses: ./.github/workflows/staffml-validate-vault.yml
|
||||
|
||||
# ===========================================================================
|
||||
# Build — Preview-specific Next.js static export, runs in parallel.
|
||||
# ===========================================================================
|
||||
# Uses Preview-specific env (BASE_PATH for the dev subdirectory, hosted
|
||||
# interviewer endpoint, hosted vault API) so the artifact is exactly
|
||||
# what gets shipped. The validate-dev job builds with its own validation
|
||||
# env vars and throws away the artifact — that build proves the code
|
||||
# compiles, this build produces the deployable bytes.
|
||||
build:
|
||||
name: '🔨 Build StaffML'
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 25
|
||||
steps:
|
||||
- name: 📥 Checkout
|
||||
uses: actions/checkout@v6
|
||||
@@ -95,14 +129,6 @@ jobs:
|
||||
# the committed JSON artifacts drift from YAMLs.
|
||||
run: vault build --vault-dir "$VAULT_DIR" --release-id preview-dev --local-json
|
||||
|
||||
- name: 🔍 Type check
|
||||
working-directory: ${{ env.STAFFML_ROOT }}
|
||||
run: npx tsc --noEmit
|
||||
|
||||
- name: 🧪 Run tests
|
||||
working-directory: ${{ env.STAFFML_ROOT }}
|
||||
run: npm test
|
||||
|
||||
- name: 🔨 Build StaffML
|
||||
working-directory: ${{ env.STAFFML_ROOT }}
|
||||
env:
|
||||
@@ -139,7 +165,12 @@ jobs:
|
||||
- name: 🔗 Rewrite URLs for dev site
|
||||
run: bash .github/scripts/rewrite-dev-urls.sh "$DEV_STAFFML_PATH" "$STAFFML_ROOT/out"
|
||||
|
||||
- name: 🔍 Validate build
|
||||
- name: 🔍 Validate build output
|
||||
# Light sanity check on the artifact about to be uploaded. Heavier
|
||||
# checks (corpus invariants, vault integrity, E2E smoke, link
|
||||
# check, schema drift) live in the called validate workflows;
|
||||
# this just confirms the critical pages were emitted by the
|
||||
# build before we hand it to the deploy job.
|
||||
run: |
|
||||
if [ ! -f "$STAFFML_ROOT/out/index.html" ]; then
|
||||
echo "❌ CRITICAL: index.html missing. Aborting deployment."
|
||||
@@ -177,69 +208,33 @@ jobs:
|
||||
echo "📋 Vault release: v$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['releaseId'])" "$MANIFEST")"
|
||||
fi
|
||||
|
||||
- name: 🔐 Validate vault integrity
|
||||
run: python3 "$STAFFML_ROOT/scripts/validate-vault.py"
|
||||
- name: 📤 Upload build artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: staffml-dev-build
|
||||
path: ${{ env.STAFFML_ROOT }}/out
|
||||
retention-days: 1
|
||||
if-no-files-found: error
|
||||
|
||||
- name: 🧪 Smoke tests
|
||||
run: |
|
||||
STAFFML_ROOT="$STAFFML_ROOT" python3 - <<'PY'
|
||||
import json, os, sys
|
||||
|
||||
root = os.environ["STAFFML_ROOT"]
|
||||
|
||||
# Test 1: Corpus integrity
|
||||
with open(f"{root}/src/data/corpus.json") as f:
|
||||
corpus = json.load(f)
|
||||
assert len(corpus) >= 4000, f"Corpus too small: {len(corpus)} questions"
|
||||
print(f"✅ Corpus: {len(corpus)} questions")
|
||||
|
||||
# Test 2: Every question has required fields
|
||||
required = ["id", "title", "level", "track", "scenario", "competency_area", "topic", "zone", "details"]
|
||||
missing = []
|
||||
for q in corpus:
|
||||
for field in required:
|
||||
if not q.get(field):
|
||||
missing.append(f"{q.get('id', '???')} missing {field}")
|
||||
if missing:
|
||||
print(f"⚠️ {len(missing)} questions with missing fields")
|
||||
for m in missing[:5]:
|
||||
print(f" {m}")
|
||||
if len(missing) > 5:
|
||||
print(f" ... and {len(missing) - 5} more")
|
||||
else:
|
||||
print("✅ All questions have required fields")
|
||||
|
||||
# Test 3: Every question has a valid level
|
||||
valid_levels = {"L1", "L2", "L3", "L4", "L5", "L6", "L6+"}
|
||||
bad_levels = [q["id"] for q in corpus if q.get("level") not in valid_levels]
|
||||
assert len(bad_levels) == 0, f"{len(bad_levels)} questions with invalid levels: {bad_levels[:3]}"
|
||||
print("✅ All levels valid (L1-L6+)")
|
||||
|
||||
# Test 4: Taxonomy loads and has concepts
|
||||
with open(f"{root}/src/data/taxonomy.json") as f:
|
||||
taxonomy = json.load(f)
|
||||
concepts = taxonomy.get("concepts", [])
|
||||
assert len(concepts) >= 70, f"Taxonomy too small: {len(concepts)} topics"
|
||||
print(f"✅ Taxonomy: {len(concepts)} topics")
|
||||
|
||||
# Test 5: Manifest exists and is valid (single source of truth)
|
||||
with open(f"{root}/src/data/vault-manifest.json") as f:
|
||||
manifest = json.load(f)
|
||||
assert "releaseId" in manifest, "Manifest missing releaseId"
|
||||
assert "releaseHash" in manifest and len(manifest["releaseHash"]) >= 16, \
|
||||
"Manifest missing or truncated releaseHash"
|
||||
assert manifest["questionCount"] == len(corpus), \
|
||||
f"Manifest count mismatch: {manifest['questionCount']} vs {len(corpus)}"
|
||||
print(f"✅ Manifest: v{manifest['releaseId']} ({manifest['questionCount']} Qs, hash {manifest['releaseHash'][:7]})")
|
||||
|
||||
# Test 6: Build output has static assets
|
||||
js_files = [f for f in os.listdir(f"{root}/out/_next/static") if not f.startswith(".")]
|
||||
assert len(js_files) > 0, "No static JS chunks found"
|
||||
print(f"✅ Static assets: {len(js_files)} chunks")
|
||||
|
||||
print()
|
||||
print("🎯 All smoke tests passed")
|
||||
PY
|
||||
# ===========================================================================
|
||||
# Deploy — gated on validates + build all succeeding.
|
||||
# ===========================================================================
|
||||
# The job graph means this never starts unless every validate workflow
|
||||
# and the Preview-specific build all reached `success`. The previously observed
|
||||
# case "Preview ✅ but Validate ❌ on the same SHA" can no longer occur:
|
||||
# if either validate fails, this job is skipped and Preview as a whole
|
||||
# reports failure.
|
||||
deploy:
|
||||
name: '🚀 Deploy StaffML to Dev Site'
|
||||
runs-on: ubuntu-latest
|
||||
needs: [validate-dev, validate-vault, build]
|
||||
timeout-minutes: 10
|
||||
steps:
|
||||
- name: 📥 Download build artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
name: staffml-dev-build
|
||||
path: staffml-out
|
||||
|
||||
- name: 🚀 Deploy to Dev Site via SSH
|
||||
env:
|
||||
@@ -264,7 +259,7 @@ jobs:
|
||||
mkdir -p staffml
|
||||
|
||||
echo "🚚 Copying StaffML build..."
|
||||
cp -r "${{ github.workspace }}/interviews/staffml/out/." staffml/
|
||||
cp -r "${{ github.workspace }}/staffml-out/." staffml/
|
||||
|
||||
if [ ! -f "staffml/index.html" ]; then
|
||||
echo "❌ CRITICAL: staffml/index.html missing. Aborting."
|
||||
|
||||
15
.github/workflows/staffml-validate-dev.yml
vendored
15
.github/workflows/staffml-validate-dev.yml
vendored
@@ -31,6 +31,11 @@ name: '🎯 StaffML · ✅ Validate (Dev)'
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
# Reusable: staffml-preview-dev.yml calls this via `uses:` so the deploy
|
||||
# job can `needs:` a green validate. Standalone push/PR triggers below
|
||||
# stay so the publish guard (infra-publish-guard.yml) and README badge
|
||||
# still see direct runs on dev.
|
||||
workflow_call:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'interviews/staffml/**'
|
||||
@@ -47,7 +52,15 @@ permissions:
|
||||
contents: read
|
||||
|
||||
concurrency:
|
||||
group: staffml-validate-${{ github.ref }}
|
||||
# `head_ref || run_id` preserves PR cancel-on-amend (head_ref is the PR
|
||||
# source branch and is stable across PR commits) while making push and
|
||||
# workflow_call runs unique per-run (head_ref is empty for non-PR events,
|
||||
# so the group falls back to run_id). Without the per-run fallback, a
|
||||
# push to dev would trigger BOTH this workflow standalone AND Preview's
|
||||
# `uses:` call into it; the two would share the same group and one would
|
||||
# cancel the other — same class of badge-flicker bug the CLAUDE.md note
|
||||
# describes for manual dispatch.
|
||||
# The group prefix is a literal rather than `github.workflow`: under
|
||||
# workflow_call the `github` context belongs to the caller, so every
|
||||
# validate workflow called from one Preview run would otherwise resolve
|
||||
# to the same group (same workflow name, same run_id) and cancel its sibling.
|
||||
group: staffml-validate-dev-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
|
||||
13
.github/workflows/staffml-validate-vault.yml
vendored
13
.github/workflows/staffml-validate-vault.yml
vendored
@@ -22,6 +22,12 @@ name: '🎯 StaffML · ✅ Validate (Vault)'
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
# Reusable: staffml-preview-dev.yml calls this via `uses:` so its deploy
|
||||
# job can `needs:` a green vault validate. Standalone push/PR triggers
|
||||
# below stay so the publish guard and README badge still see direct
|
||||
# runs on dev (and so PRs that touch vault-cli/worker without staffml
|
||||
# still get validated independent of Preview).
|
||||
workflow_call:
|
||||
pull_request:
|
||||
paths:
|
||||
- 'interviews/vault/**'
|
||||
@@ -37,7 +43,12 @@ on:
|
||||
- 'interviews/staffml-vault-worker/**'
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.ref }}
|
||||
# `head_ref || run_id` keeps PR cancel-on-amend behavior while making
|
||||
# push and workflow_call runs unique per-run, so a push to dev that
|
||||
# triggers both this workflow standalone AND Preview's `uses:` call
|
||||
# doesn't collide on a shared group. See staffml-validate-dev.yml for
|
||||
# the long-form rationale.
|
||||
# The group prefix is a literal rather than `github.workflow`: under
|
||||
# workflow_call the `github` context belongs to the caller, so every
|
||||
# validate workflow called from one Preview run would otherwise resolve
|
||||
# to the same group (same workflow name, same run_id) and cancel its sibling.
|
||||
group: staffml-validate-vault-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
|
||||
Reference in New Issue
Block a user