diff --git a/.github/workflows/staffml-preview-dev.yml b/.github/workflows/staffml-preview-dev.yml index 82b16f4c4..7a1d4d24f 100644 --- a/.github/workflows/staffml-preview-dev.yml +++ b/.github/workflows/staffml-preview-dev.yml @@ -4,26 +4,37 @@ name: '๐ŸŽฏ StaffML ยท ๐Ÿ‘๏ธ Preview (Dev)' # StaffML โ€” Dev Preview Deploy # ============================================================================= # -# Builds the StaffML Next.js interview prep app and deploys to the dev -# preview site via SSH. Includes vault validation and smoke tests. +# Builds the StaffML Next.js interview-prep app and deploys to the dev +# preview site via SSH. Validation is delegated to the reusable workflows +# staffml-validate-dev.yml + staffml-validate-vault.yml โ€” they run as +# `uses:` jobs in this workflow, and the deploy job blocks on `needs:` +# both passing. This closes the race window where Preview could deploy +# on a SHA the parallel Validate workflows had already failed on. # -# Flow: -# 1. BUILD โ€” npm ci + Next.js static export -# 2. VALIDATE โ€” Build output + vault integrity + smoke tests -# 3. DEPLOY โ€” Push to dev preview repo via SSH +# Job graph (parallel where possible): +# โ”Œโ”€โ”€ validate-dev (uses: staffml-validate-dev.yml) +# โ”œโ”€โ”€ validate-vault (uses: staffml-validate-vault.yml) +# โ”œโ”€โ”€ build (Preview-specific Next.js static export) +# โ””โ”€โ”€ deploy (needs: validate-dev + validate-vault + build) +# +# The validate jobs and build run in parallel โ€” wall-clock for the typical +# push is max(validate, build) + ~1 min for SSH deploy, basically +# unchanged from the previous single-job design. # # Triggers: -# - push: dev branch, interviews/staffml/** paths +# - push: dev branch, interviews/staffml/** + vault questions/chains/schema # - workflow_dispatch: manual # # Deploys to: harvard-edge.github.io/{DEV_REPO}/staffml/ -# Secrets: SSH_DEPLOY_KEY +# Secrets: SSH_DEPLOY_KEY (deploy job only) # Vars: DEV_REPO_URL # # Related: -# - staffml-publish-live.yml โ€” Production deploy to mlsysbook.ai/staffml/ -# - staffml-auto-pr.yml โ€” Auto-PR from community question issues -# - staffml-welcome.yml โ€” Welcome comment on contributor PRs +# - staffml-validate-dev.yml โ€” Reusable site validate (called above) +# - staffml-validate-vault.yml โ€” Reusable vault validate (called above) +# - staffml-publish-live.yml โ€” Production deploy to mlsysbook.ai/staffml/ +# - staffml-auto-pr.yml โ€” Auto-PR from community question issues +# - staffml-welcome.yml โ€” Welcome comment on contributor PRs # # ============================================================================= @@ -33,7 +44,7 @@ on: branches: [dev] paths: - 'interviews/staffml/**' - # Also redeploy when YAMLs or chains change โ€” the workflow + # Also redeploy when YAMLs or chains change โ€” the build job # regenerates corpus.json + corpus-summary.json from YAMLs before # building, so the dev site always reflects current vault state. - 'interviews/vault/questions/**' @@ -48,23 +59,46 @@ concurrency: group: staffml-dev-deploy cancel-in-progress: true -jobs: - build-and-deploy: - name: '๐ŸŽฏ Build & Deploy StaffML (Dev)' - runs-on: ubuntu-latest - env: - # Single source for paths and versions โ€” see docs/CI-VARIABLES.md. - # The `paths:` trigger filter at the top of this file is intentionally - # NOT vars-ified (GitHub Actions evaluates triggers at workflow-load - # time, before vars are resolved). If STAFFML_ROOT moves, update the - # paths filter manually. - STAFFML_ROOT: ${{ vars.STAFFML_ROOT || 'interviews/staffml' }} - VAULT_DIR: ${{ vars.VAULT_DIR || 'interviews/vault' }} - VAULT_CLI_DIR: ${{ vars.VAULT_CLI_DIR || 'interviews/vault-cli' }} - DEV_STAFFML_PATH: ${{ vars.DEV_STAFFML_PATH || 'staffml' }} - NODE_VERSION: ${{ vars.NODE_VERSION || '20' }} - PYTHON_VERSION: ${{ vars.PYTHON_VERSION || '3.12' }} +env: + # Single source for paths and versions โ€” see docs/CI-VARIABLES.md. + # The `paths:` trigger filter at the top of this file is intentionally + # NOT vars-ified (GitHub Actions evaluates triggers at workflow-load + # time, before vars are resolved). If STAFFML_ROOT moves, update the + # paths filter manually. + STAFFML_ROOT: ${{ vars.STAFFML_ROOT || 'interviews/staffml' }} + VAULT_DIR: ${{ vars.VAULT_DIR || 'interviews/vault' }} + VAULT_CLI_DIR: ${{ vars.VAULT_CLI_DIR || 'interviews/vault-cli' }} + DEV_STAFFML_PATH: ${{ vars.DEV_STAFFML_PATH || 'staffml' }} + NODE_VERSION: ${{ vars.NODE_VERSION || '20' }} + PYTHON_VERSION: ${{ vars.PYTHON_VERSION || '3.12' }} +jobs: + # =========================================================================== + # Validate โ€” reusable workflows, run in parallel with build. + # =========================================================================== + # Site validate: tsc + tests + build + Playwright E2E + vault smoke + links. + validate-dev: + name: 'โœ… Validate (Dev)' + uses: ./.github/workflows/staffml-validate-dev.yml + + # Vault validate: vault-cli ruff/mypy/pytest + vault check --strict + + # codegen drift + registry append-only + exemplar coverage + worker vitest. + validate-vault: + name: 'โœ… Validate (Vault)' + uses: ./.github/workflows/staffml-validate-vault.yml + + # =========================================================================== + # Build โ€” Preview-specific Next.js static export, runs in parallel. + # =========================================================================== + # Uses Preview-specific env (BASE_PATH for the dev subdirectory, hosted + # interviewer endpoint, hosted vault API) so the artifact is exactly + # what gets shipped. The validate-dev job builds with its own validation + # env vars and throws away the artifact โ€” that build proves the code + # compiles, this build produces the deployable bytes. + build: + name: '๐Ÿ”จ Build StaffML' + runs-on: ubuntu-latest + timeout-minutes: 25 steps: - name: ๐Ÿ“ฅ Checkout uses: actions/checkout@v6 @@ -95,14 +129,6 @@ jobs: # the committed JSON artifacts drift from YAMLs. run: vault build --vault-dir "$VAULT_DIR" --release-id preview-dev --local-json - - name: ๐Ÿ” Type check - working-directory: ${{ env.STAFFML_ROOT }} - run: npx tsc --noEmit - - - name: ๐Ÿงช Run tests - working-directory: ${{ env.STAFFML_ROOT }} - run: npm test - - name: ๐Ÿ”จ Build StaffML working-directory: ${{ env.STAFFML_ROOT }} env: @@ -139,7 +165,12 @@ jobs: - name: ๐Ÿ”— Rewrite URLs for dev site run: bash .github/scripts/rewrite-dev-urls.sh "$DEV_STAFFML_PATH" "$STAFFML_ROOT/out" - - name: ๐Ÿ” Validate build + - name: ๐Ÿ” Validate build output + # Light sanity check on the artifact about to be uploaded. Heavier + # checks (corpus invariants, vault integrity, E2E smoke, link + # check, schema drift) live in the called validate workflows; + # this just confirms the critical pages were emitted by the + # build before we hand it to the deploy job. run: | if [ ! -f "$STAFFML_ROOT/out/index.html" ]; then echo "โŒ CRITICAL: index.html missing. Aborting deployment." @@ -177,69 +208,33 @@ jobs: echo "๐Ÿ“‹ Vault release: v$(python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['releaseId'])" "$MANIFEST")" fi - - name: ๐Ÿ” Validate vault integrity - run: python3 "$STAFFML_ROOT/scripts/validate-vault.py" + - name: ๐Ÿ“ค Upload build artifact + uses: actions/upload-artifact@v4 + with: + name: staffml-dev-build + path: ${{ env.STAFFML_ROOT }}/out + retention-days: 1 + if-no-files-found: error - - name: ๐Ÿงช Smoke tests - run: | - STAFFML_ROOT="$STAFFML_ROOT" python3 - <<'PY' - import json, os, sys - - root = os.environ["STAFFML_ROOT"] - - # Test 1: Corpus integrity - with open(f"{root}/src/data/corpus.json") as f: - corpus = json.load(f) - assert len(corpus) >= 4000, f"Corpus too small: {len(corpus)} questions" - print(f"โœ… Corpus: {len(corpus)} questions") - - # Test 2: Every question has required fields - required = ["id", "title", "level", "track", "scenario", "competency_area", "topic", "zone", "details"] - missing = [] - for q in corpus: - for field in required: - if not q.get(field): - missing.append(f"{q.get('id', '???')} missing {field}") - if missing: - print(f"โš ๏ธ {len(missing)} questions with missing fields") - for m in missing[:5]: - print(f" {m}") - if len(missing) > 5: - print(f" ... and {len(missing) - 5} more") - else: - print("โœ… All questions have required fields") - - # Test 3: Every question has a valid level - valid_levels = {"L1", "L2", "L3", "L4", "L5", "L6", "L6+"} - bad_levels = [q["id"] for q in corpus if q.get("level") not in valid_levels] - assert len(bad_levels) == 0, f"{len(bad_levels)} questions with invalid levels: {bad_levels[:3]}" - print("โœ… All levels valid (L1-L6+)") - - # Test 4: Taxonomy loads and has concepts - with open(f"{root}/src/data/taxonomy.json") as f: - taxonomy = json.load(f) - concepts = taxonomy.get("concepts", []) - assert len(concepts) >= 70, f"Taxonomy too small: {len(concepts)} topics" - print(f"โœ… Taxonomy: {len(concepts)} topics") - - # Test 5: Manifest exists and is valid (single source of truth) - with open(f"{root}/src/data/vault-manifest.json") as f: - manifest = json.load(f) - assert "releaseId" in manifest, "Manifest missing releaseId" - assert "releaseHash" in manifest and len(manifest["releaseHash"]) >= 16, \ - "Manifest missing or truncated releaseHash" - assert manifest["questionCount"] == len(corpus), \ - f"Manifest count mismatch: {manifest['questionCount']} vs {len(corpus)}" - print(f"โœ… Manifest: v{manifest['releaseId']} ({manifest['questionCount']} Qs, hash {manifest['releaseHash'][:7]})") - - # Test 6: Build output has static assets - js_files = [f for f in os.listdir(f"{root}/out/_next/static") if not f.startswith(".")] - assert len(js_files) > 0, "No static JS chunks found" - print(f"โœ… Static assets: {len(js_files)} chunks") - - print() - print("๐ŸŽฏ All smoke tests passed") - PY + # =========================================================================== + # Deploy โ€” gated on validates + build all succeeding. + # =========================================================================== + # The job graph means this never starts unless every validate workflow + # and the Preview-specific build all reached `success`. The screenshot + # case "Preview โœ… but Validate โŒ on the same SHA" can no longer occur: + # if either validate fails, this job is skipped and Preview as a whole + # reports failure. + deploy: + name: '๐Ÿš€ Deploy StaffML to Dev Site' + runs-on: ubuntu-latest + needs: [validate-dev, validate-vault, build] + timeout-minutes: 10 + steps: + - name: ๐Ÿ“ฅ Download build artifact + uses: actions/download-artifact@v4 + with: + name: staffml-dev-build + path: staffml-out - name: ๐Ÿš€ Deploy to Dev Site via SSH env: @@ -264,7 +259,7 @@ jobs: mkdir -p staffml echo "๐Ÿšš Copying StaffML build..." - cp -r "${{ github.workspace }}/interviews/staffml/out/." staffml/ + cp -r "${{ github.workspace }}/staffml-out/." staffml/ if [ ! -f "staffml/index.html" ]; then echo "โŒ CRITICAL: staffml/index.html missing. Aborting." diff --git a/.github/workflows/staffml-validate-dev.yml b/.github/workflows/staffml-validate-dev.yml index 37e0b7f1e..f0a027fef 100644 --- a/.github/workflows/staffml-validate-dev.yml +++ b/.github/workflows/staffml-validate-dev.yml @@ -31,6 +31,11 @@ name: '๐ŸŽฏ StaffML ยท โœ… Validate (Dev)' on: workflow_dispatch: + # Reusable: staffml-preview-dev.yml calls this via `uses:` so the deploy + # job can `needs:` a green validate. Standalone push/PR triggers below + # stay so the publish guard (infra-publish-guard.yml) and README badge + # still see direct runs on dev. + workflow_call: pull_request: paths: - 'interviews/staffml/**' @@ -47,7 +52,15 @@ permissions: contents: read concurrency: - group: staffml-validate-${{ github.ref }} + # `head_ref || run_id` preserves PR cancel-on-amend (head_ref is the PR + # source branch and is stable across PR commits) while making push and + # workflow_call runs unique per-run (head_ref is empty for non-PR events, + # so the group falls back to run_id). Without the per-run fallback, a + # push to dev would trigger BOTH this workflow standalone AND Preview's + # `uses:` call into it; the two would share the same group and one would + # cancel the other โ€” same class of badge-flicker bug the CLAUDE.md note + # describes for manual dispatch. + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: diff --git a/.github/workflows/staffml-validate-vault.yml b/.github/workflows/staffml-validate-vault.yml index 8739c883d..a87bfcc30 100644 --- a/.github/workflows/staffml-validate-vault.yml +++ b/.github/workflows/staffml-validate-vault.yml @@ -22,6 +22,12 @@ name: '๐ŸŽฏ StaffML ยท โœ… Validate (Vault)' on: workflow_dispatch: + # Reusable: staffml-preview-dev.yml calls this via `uses:` so its deploy + # job can `needs:` a green vault validate. Standalone push/PR triggers + # below stay so the publish guard and README badge still see direct + # runs on dev (and so PRs that touch vault-cli/worker without staffml + # still get validated independent of Preview). + workflow_call: pull_request: paths: - 'interviews/vault/**' @@ -37,7 +43,12 @@ on: - 'interviews/staffml-vault-worker/**' concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + # `head_ref || run_id` keeps PR cancel-on-amend behavior while making + # push and workflow_call runs unique per-run, so a push to dev that + # triggers both this workflow standalone AND Preview's `uses:` call + # doesn't collide on a shared group. See staffml-validate-dev.yml for + # the long-form rationale. + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} cancel-in-progress: true env: