Files
cs249r_book/interviews/vault/scripts/run_parallel.sh
Vijay Janapa Reddi 26e0ab3856 restructure interviews/ with vault separation and per-directory licenses
- Move corpus, taxonomy, chains, scripts into interviews/vault/
- Rename the branded app directory to interviews/staffml/
- Add CC BY-NC-SA 4.0 LICENSE to: book, kits, labs, slides, instructors, interviews
- Add AGPL-3.0 LICENSE to interviews/staffml/ (the app)
- Add vault LICENSE for pipeline scripts
- Update all GitHub Actions workflows for new paths
- Update README links and vault.yaml export paths
- Fix regex patterns in site/book deploy workflows

License structure:
  interviews/LICENSE      — CC BY-NC-SA 4.0 (corpus + data)
  interviews/staffml/LICENSE — AGPL-3.0 (app code)
  interviews/vault/LICENSE   — pipeline copyright
  book|kits|labs|slides|instructors/LICENSE — CC BY-NC-SA 4.0
  tinytorch/LICENSE       — Apache 2.0 (unchanged)
2026-03-25 15:18:14 -04:00

139 lines
3.8 KiB
Bash
Executable File

#!/bin/bash
# run_parallel.sh — Orchestrate parallel question generation across (level, track) combos.
#
# Architecture:
#   1. vault.py plan-all       → creates _plans/plan-{level}-{track}.json files
#   2. This script runs generate-batch on each plan in waves (to respect rate limits)
#   3. vault.py merge          → combines all _generated/ batches into corpus
#   4. vault.py release        → rebuild chains, stats, export
#
# Usage:
#   ./run_parallel.sh                  # Full run (all gaps)
#   ./run_parallel.sh --wave-size 2    # Conservative (2 concurrent batches)
#   ./run_parallel.sh --workers 2      # Fewer Gemini calls per batch
#   ./run_parallel.sh --dry-run        # Plan only, don't generate
#   ./run_parallel.sh [WAVE] [WORKERS] [DRY]   # Legacy positional form
set -euo pipefail
cd "$(dirname "$0")"

# Defaults; overridable via the flags above or legacy positional args.
WAVE_SIZE=3   # Concurrent batches per wave
WORKERS=4     # Gemini workers per batch
DRY_RUN=""    # Non-empty → plan only, skip generation

# The header documents long flags, but the original only read positional
# $1..$3. Parse the documented flags when the first arg looks like one;
# otherwise fall back to the legacy positional form for compatibility.
if [[ "${1:-}" == --* ]]; then
  while [ $# -gt 0 ]; do
    case "$1" in
      --wave-size) WAVE_SIZE=$2; shift 2 ;;
      --workers)   WORKERS=$2; shift 2 ;;
      --dry-run)   DRY_RUN=1; shift ;;
      *) echo "Unknown option: $1" >&2; exit 2 ;;
    esac
  done
else
  WAVE_SIZE=${1:-3}
  WORKERS=${2:-4}
  DRY_RUN=${3:-""}
fi

LOG_DIR="_logs"
mkdir -p "$LOG_DIR" "_generated"
TIMESTAMP=$(date +%Y%m%d-%H%M)
MASTER_LOG="$LOG_DIR/parallel-$TIMESTAMP.log"

# log MSG — timestamped line to stdout AND the master log.
log() { echo "[$(date +%H:%M:%S)] $1" | tee -a "$MASTER_LOG"; }
log "═══ StaffML Parallel Generation ═══"
log " Wave size: $WAVE_SIZE concurrent batches"
log " Workers: $WORKERS per batch"
log " Timestamp: $TIMESTAMP"

# Step 1: Generate all plans
log ""
log "── Step 1: Generating plans ──"
python3 vault.py plan-all 2>&1 | tee -a "$MASTER_LOG"

# Step 2: Run generation in waves
log ""
log "── Step 2: Generating questions in waves ──"
if [ -n "$DRY_RUN" ]; then
  log "DRY RUN — skipping generation"
  exit 0
fi

PLAN_DIR="_plans"
# Collect plan files via a glob instead of parsing `ls` output (which
# word-splits on whitespace, SC2012/SC2207). Globs already expand in
# sorted order; nullglob makes a no-match case an empty array.
shopt -s nullglob
PLAN_FILES=("$PLAN_DIR"/plan-*.json)
shopt -u nullglob
TOTAL_PLANS=${#PLAN_FILES[@]}
if [ "$TOTAL_PLANS" -eq 0 ]; then
  log "No plan files found"
  exit 0
fi
log " Total plan files: $TOTAL_PLANS"
# Process plans in waves of WAVE_SIZE concurrent background batches.
WAVE=1
IDX=0
TOTAL_GENERATED=0
TOTAL_FAILED=0
while [ "$IDX" -lt "$TOTAL_PLANS" ]; do
  WAVE_END=$((IDX + WAVE_SIZE))
  if [ "$WAVE_END" -gt "$TOTAL_PLANS" ]; then
    WAVE_END=$TOTAL_PLANS
  fi
  # Array slice clamps automatically at the end of the array.
  WAVE_FILES=("${PLAN_FILES[@]:$IDX:$WAVE_SIZE}")
  log ""
  log "── Wave $WAVE: batches $((IDX+1))-$WAVE_END of $TOTAL_PLANS ──"

  # Launch every batch in this wave as a background job, one log per batch.
  PIDS=()
  WAVE_LOGS=()
  for PLAN in "${WAVE_FILES[@]}"; do
    PLAN_NAME=$(basename "$PLAN" .json)
    BATCH_LOG="$LOG_DIR/$PLAN_NAME-$TIMESTAMP.log"
    log " Starting: $PLAN_NAME (log: $BATCH_LOG)"
    python3 -u vault.py generate-batch "$PLAN" --workers "$WORKERS" > "$BATCH_LOG" 2>&1 &
    PIDS+=($!)
    WAVE_LOGS+=("$BATCH_LOG")
  done

  # Barrier: wait for every job in the wave. `wait` inside `if` does not
  # trip `set -e`, so a failed batch is counted rather than aborting the run.
  WAVE_OK=0
  WAVE_FAIL=0
  for i in "${!PIDS[@]}"; do
    PID=${PIDS[$i]}
    BLOG=${WAVE_LOGS[$i]}
    if wait "$PID"; then
      WAVE_OK=$((WAVE_OK + 1))
      # Extract the last "Generated: N" count from the batch log. Portable
      # sed replaces the original GNU-only `grep -oP`, which is unavailable
      # on BSD/macOS grep; the explicit default also keeps an empty match
      # from reaching the arithmetic expansion below.
      GEN=$(sed -n 's/.*Generated: \([0-9][0-9]*\).*/\1/p' "$BLOG" 2>/dev/null | tail -1)
      GEN=${GEN:-0}
      TOTAL_GENERATED=$((TOTAL_GENERATED + GEN))
      log " ✓ PID $PID done (+$GEN)"
    else
      WAVE_FAIL=$((WAVE_FAIL + 1))
      TOTAL_FAILED=$((TOTAL_FAILED + 1))
      log " ✗ PID $PID failed (see $BLOG)"
    fi
  done
  log " Wave $WAVE complete: $WAVE_OK ok, $WAVE_FAIL failed"

  IDX=$WAVE_END
  WAVE=$((WAVE + 1))

  # Brief pause between waves to let rate limits reset
  if [ "$IDX" -lt "$TOTAL_PLANS" ]; then
    log " Cooling down 10s..."
    sleep 10
  fi
done
# Summary of the generation phase. The total is "~" because it is scraped
# from batch logs, not reported transactionally by vault.py.
log ""
log "═══ Generation Complete ═══"
log " Total generated: ~$TOTAL_GENERATED"
log " Failed batches: $TOTAL_FAILED"
# Step 3: Merge — fold all _generated/ batches into the corpus. Runs even
# when some batches failed, so successful batches are never discarded.
log ""
log "── Step 3: Merging into corpus ──"
python3 vault.py merge 2>&1 | tee -a "$MASTER_LOG"
# Step 4: Release — rebuild chains, stats, export (per the header comment).
# NOTE(review): "--skip figures" is passed as two argv words; confirm
# vault.py expects a value-taking --skip flag and not "--skip-figures".
log ""
log "── Step 4: Release ──"
python3 vault.py release --skip figures 2>&1 | tee -a "$MASTER_LOG"
log ""
log "═══ All Done ═══"
log " Log: $MASTER_LOG"