The Phase 0 cleanup removed 18 scripts as deprecated, but 6 of them
contain unique-capability patterns not yet covered by the modern tooling.
This change restores them as reference patterns, not active scripts.
What's restored and why:

- gemini_backfill_question.py
  Idempotent corpus-walk + Gemini batch + thread-pool + JSON/YAML
  round-trip: the "fix one field across thousands of YAMLs" pattern
  (sketched after the adaptation checklist below). To be mined in
  CORPUS_HARDENING_PLAN.md Phase 5.

- gpt_backfill_question.py
  OpenAI variant of the above; cross-provider template.

- gemini_cli_generate_questions.py (35K)
  BATCHED generation: 12 cells per call with balanced track × area ×
  zone × level round-robin (see the sketch after this list). `vault
  generate` does NOT batch; it calls once per question. This script's
  batching pattern is what we want when generating >100 questions in bulk.

- generate.py (30K)
  Coverage-survey-driven generation engine: surveys the corpus, finds
  empty cells, generates to fill the emptiest first, and stops when
  saturated. `vault generate` lacks this auto-balance loop.

- gemini_fix_errors.py
  Batch error-fixer with hardware-reference grounding (V100 / A100 /
  H100 / B200 / T4 specs as ground-truth context). To be mined for
  audit_corpus_batched.py --propose-fixes in Phase 5.

- deep_verify.py
  Claude Opus + extended thinking; SHOWS ITS WORK on every napkin-math
  claim. Useful as a tiebreaker on borderline math findings from the
  lightweight audit.
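
For reference, the batching-plus-balance idea in
gemini_cli_generate_questions.py and generate.py boils down to something
like this sketch. The cell key shape, `survey`, `call_model`,
`write_questions`, and `BATCH_SIZE` are illustrative assumptions, not the
scripts' actual API:

    # Hypothetical sketch: emptiest-first selection of corpus cells, batched
    # BATCH_SIZE at a time so one model call covers many cells.
    from itertools import islice

    BATCH_SIZE = 12  # cells per call, matching the script's batching pattern

    def next_batch(cell_counts: dict[tuple, int], target: int) -> list[tuple]:
        """Pick up to BATCH_SIZE (track, area, zone, level) cells, emptiest first."""
        under = [c for c, n in cell_counts.items() if n < target]
        under.sort(key=lambda c: cell_counts[c])  # fill the emptiest cells first
        return list(islice(under, BATCH_SIZE))

    # Auto-balance loop: generate for the selected cells, re-survey, repeat
    # until every cell reaches the target (saturation).
    # while (batch := next_batch(counts, target=5)):
    #     write_questions(call_model(batch))  # one batched prompt for 12 cells
    #     counts = survey(corpus_dir)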
Each restored file has a 5-line STATUS comment block at the top
documenting what to adapt before running. DEPRECATED.md is restructured
to make the three categories explicit (removed / preserved-for-adaptation
/ active-migration), and adds an adaptation checklist that applies to
all preserved scripts (replace corpus.json loading, verify SDK pins,
update output paths, re-validate prompts, sample first).
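
The backfill pattern shared by gemini_backfill_question.py and
gpt_backfill_question.py, as a minimal sketch under assumptions: the field
name, `ask_model`, and the single-threaded walk are illustrative, and the
real scripts batch model calls through a thread pool.

    # Hypothetical sketch: idempotent corpus walk with a YAML round-trip.
    from pathlib import Path

    import yaml

    def ask_model(doc: dict) -> str:
        """Placeholder for the Gemini/OpenAI call the real scripts make."""
        raise NotImplementedError

    def backfill_field(corpus_dir: str, field: str = "question") -> int:
        """Fill `field` in every YAML that lacks it; safe to re-run."""
        filled = 0
        for path in sorted(Path(corpus_dir).rglob("*.yaml")):
            doc = yaml.safe_load(path.read_text(encoding="utf-8"))
            if doc.get(field):  # idempotent: already-filled files are skipped
                continue
            doc[field] = ask_model(doc)
            path.write_text(
                yaml.safe_dump(doc, allow_unicode=True, sort_keys=False),
                encoding="utf-8",
            )
            filled += 1
        return filled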
Validation:
- vault check --strict: 10,711 loaded, 0 invariant failures
- pytest: 74/74
- ruff: clean
gemini_fix_errors.py (236 lines, 8.4 KiB, Python)
#!/usr/bin/env python3
# STATUS (2026-05-03): preserved as a reference pattern — see vault/scripts/DEPRECATED.md
# §"Preserved for adaptation". The HARDWARE_REFERENCE constant (V100/A100/H100/B200/T4
# specs as ground-truth context for the judge) is exactly what
# audit_corpus_batched.py needs in Phase 5 (CORPUS_HARDENING_PLAN.md). The
# error-input format targets a legacy validation-results JSON shape; adapt to
# whatever audit_corpus_batched.py emits.
"""
Gemini 3.1 Pro error fixer for StaffML corpus.

Takes batches of questions flagged with errors and asks Gemini to fix them.
Outputs corrected question JSON for each batch.

Usage:
    source ~/.zshrc_secrets
    PYTHONUNBUFFERED=1 python3 staffml/vault/scripts/gemini_fix_errors.py --workers 8
"""

import json
import os
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

HARDWARE_REFERENCE = """
## Hardware Reference (mlsysim/core/constants.py — single source of truth)

| GPU | Memory | Type | Bandwidth | FP16 Tensor | TDP |
|-----|--------|------|-----------|-------------|-----|
| V100 | 32 GB | HBM2 | 900 GB/s | 125 TFLOPS | 300W |
| A100 SXM | 80 GB | HBM2e | 2039 GB/s (~2.0 TB/s) | 312 TFLOPS | 400W |
| H100 SXM | 80 GB | HBM3 | 3.35 TB/s | 989 TFLOPS | 700W |
| H200 | 141 GB | HBM3e | 4.8 TB/s | 989 TFLOPS | 700W |
| B200 | 192 GB | HBM3e | 8.0 TB/s | 2250 TFLOPS | 1000W |
| T4 | 16 GB | GDDR6 | 320 GB/s | 65 TFLOPS | 70W |

| Interconnect | Bandwidth |
|---|---|
| NVLink A100 | 600 GB/s |
| NVLink H100 | 900 GB/s |
| PCIe Gen4 x16 | 32 GB/s (bidirectional) |
| PCIe Gen5 x16 | 64 GB/s (bidirectional) |
| IB HDR | 200 Gbps = 25 GB/s |
| IB NDR | 400 Gbps = 50 GB/s |

Key formulas:
- 1B params = 2 GB FP16, 4 GB FP32, 1 GB INT8, 0.5 GB INT4
- Training memory (Adam): 16 bytes/param
- KV cache: 2 × layers × kv_heads × head_dim × seq_len × bytes
- Ridge point: peak_FLOPS / peak_bandwidth
- AllReduce ring: 2(N-1)/N × data_size / bandwidth
- Conv2D FLOPs: 2 × K² × Cin × Cout × Hout × Wout

Edge/Mobile: Jetson Orin 275 TOPS INT8, 204.8 GB/s | Apple A17 Pro ~35 TOPS | Snapdragon 8 Gen 3 ~45 TOPS
TinyML: Cortex-M4 ~240 MHz | ESP32-S3 240 MHz, 512 KB SRAM | STM32H7 480 MHz, 1 MB SRAM
""".strip()

FIX_PROMPT = """You are an expert ML Systems engineer fixing errors in interview questions.

{hardware_reference}

## Instructions

Below are {num_questions} interview questions that were flagged with specific errors during review.
For each question:

1. Read the error description carefully
2. Determine if the error is REAL or a FALSE POSITIVE
3. If REAL: fix the question by correcting the math, specs, or logic in ALL affected fields (scenario, napkin_math, realistic_solution, common_mistake, options if MCQ)
4. If FALSE POSITIVE: leave the question unchanged

## CRITICAL RULES
- When you fix math, update ALL downstream values that depend on the corrected number
- If an MCQ correct_index needs to change, update it
- Preserve the question's pedagogical intent — fix the numbers, not the teaching goal
- Use hardware specs from the reference sheet above as ground truth

## Output Format

Return a JSON array. For each question, output:
```json
{{
  "corpus_index": <original index>,
  "id": "<question id>",
  "action": "FIXED" or "FALSE_POSITIVE",
  "fix_summary": "<what was fixed>" or "<why it's a false positive>",
  "corrected_fields": {{
    "scenario": "<new scenario if changed>",
    "details": {{
      "napkin_math": "<new napkin_math if changed>",
      "realistic_solution": "<new solution if changed>",
      "common_mistake": "<new common_mistake if changed>",
      "options": ["<new options if MCQ changed>"],
      "correct_index": <new index if changed>
    }}
  }}
}}
```

Only include fields in `corrected_fields` that actually changed. If FALSE_POSITIVE, omit `corrected_fields`.

## Questions to Fix

{questions_json}
"""


def fix_batch(batch_idx: int, batch_path: str, output_dir: Path, model: str) -> dict:
    """Send a batch to Gemini for fixing."""
    from google import genai

    with open(batch_path) as f:
        batch = json.load(f)

    # Build a slim version of each question for the prompt
    slim = []
    for item in batch:
        q = item["question"]
        entry = {
            "corpus_index": item["corpus_index"],
            "id": q.get("id", ""),
            "track": q.get("track", ""),
            "level": q.get("level", ""),
            "scenario": q.get("scenario", ""),
            "details": {
                "napkin_math": q.get("details", {}).get("napkin_math", ""),
                "realistic_solution": q.get("details", {}).get("realistic_solution", ""),
                "common_mistake": q.get("details", {}).get("common_mistake", ""),
            },
            "errors_found": item["gemini_errors"],
        }
        opts = q.get("details", {}).get("options")
        if opts:
            entry["details"]["options"] = opts
            entry["details"]["correct_index"] = q.get("details", {}).get("correct_index")
        slim.append(entry)

    prompt = FIX_PROMPT.format(
        hardware_reference=HARDWARE_REFERENCE,
        num_questions=len(slim),
        questions_json=json.dumps(slim, indent=2, ensure_ascii=False),
    )

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    client = genai.Client(api_key=api_key)

    try:
        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config={"response_mime_type": "application/json"},
        )
        raw = response.text

        # Save the raw response before parsing, so nothing is lost on failure
        (output_dir / f"raw_{batch_idx:03d}.txt").write_text(raw, encoding="utf-8")

        # Parse JSON
        fixes = json.loads(raw)
        (output_dir / f"fixes_{batch_idx:03d}.json").write_text(
            json.dumps(fixes, indent=2, ensure_ascii=False), encoding="utf-8"
        )

        fixed = sum(1 for f in fixes if f.get("action") == "FIXED")
        fp = sum(1 for f in fixes if f.get("action") == "FALSE_POSITIVE")

        return {
            "batch": batch_idx,
            "total": len(batch),
            "fixed": fixed,
            "false_positive": fp,
            "parse_ok": True,
        }
    except json.JSONDecodeError as e:
        # Raw text was already saved above; just report the parse failure
        return {"batch": batch_idx, "total": len(batch), "error": f"JSON parse: {e}", "parse_ok": False}
    except Exception as e:
        return {"batch": batch_idx, "total": len(batch), "error": str(e), "parse_ok": False}


def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--workers", type=int, default=8)
    parser.add_argument("--model", default="gemini-3.1-pro-preview")
    parser.add_argument("--batch-dir", default="/tmp/vault_gemini_fix")
    args = parser.parse_args()

    batch_files = sorted(Path(args.batch_dir).glob("batch_*.json"))
    print(f"Found {len(batch_files)} batches to fix")

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path(f"_validation_results/gemini_fixes_{timestamp}")
    output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Output: {output_dir}")
    print(f"Model: {args.model}")
    print(f"Workers: {args.workers}")
    print()

    results = []
    with ThreadPoolExecutor(max_workers=args.workers) as executor:
        futures = {
            executor.submit(fix_batch, i, str(bf), output_dir, args.model): i
            for i, bf in enumerate(batch_files)
        }
        for future in as_completed(futures):
            r = future.result()
            results.append(r)
            if r.get("parse_ok"):
                print(f"  Batch {r['batch']:3d}: {r['fixed']} fixed, {r['false_positive']} false positives")
            else:
                print(f"  Batch {r['batch']:3d}: ERROR — {r.get('error', 'unknown')}")

    # Summary
    total_fixed = sum(r.get("fixed", 0) for r in results)
    total_fp = sum(r.get("false_positive", 0) for r in results)
    total_err = sum(1 for r in results if not r.get("parse_ok"))

    print(f"\n{'='*60}")
    print("FIX COMPLETE")
    print(f"{'='*60}")
    print(f"  Batches: {len(results)}")
    print(f"  Fixed: {total_fixed}")
    print(f"  False positives: {total_fp}")
    print(f"  Parse errors: {total_err}")
    print(f"  Output: {output_dir}")

    # Save summary
    (output_dir / "summary.json").write_text(json.dumps(results, indent=2), encoding="utf-8")


if __name__ == "__main__":
    main()
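
# ---------------------------------------------------------------------------
# Illustrative only, never called: one way a downstream step could apply
# fixes_*.json back onto an in-memory corpus, following the output format
# documented in FIX_PROMPT. Corpus loading/saving is elided; wire this to the
# real vault loaders before use.
#
# def apply_fixes(fixes_path: Path, corpus: list) -> int:
#     applied = 0
#     for fix in json.loads(fixes_path.read_text(encoding="utf-8")):
#         if fix.get("action") != "FIXED":
#             continue
#         q = corpus[fix["corpus_index"]]
#         changed = fix.get("corrected_fields", {})
#         if "scenario" in changed:
#             q["scenario"] = changed["scenario"]
#         q.setdefault("details", {}).update(changed.get("details", {}))
#         applied += 1
#     return applied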