Files
cs249r_book/interviews/vault/scripts/verify_math.py
Vijay Janapa Reddi 482fe71375 feat(staffml): Gemini 3.1 Pro verification complete — 8,419 Qs verified
Full independent cross-model verification by gemini-3.1-pro-preview:
- 8,419/9,226 questions verified (91.2%)
- 7,376 CORRECT (87.6%), 697 ERROR (8.3%), 346 WARN (4.1%)
- 7 chunks had JSON parse failures (9,226 - 8,419 = 807 unverified)

Systematic fixes applied:
- MI300X 1300→1307 TFLOPS (127 occurrences)

All questions stamped with math_verified, math_status, math_issues,
math_model fields. Error list at scripts/_verification_results/.
19/19 invariants pass. Paper figures rebuilt.
2026-04-03 10:55:41 -04:00

231 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""Gemini-powered math verification pass for StaffML corpus.
Sends chunks of questions to gemini-3.1-pro-preview for independent
math verification. Each call checks ~25 questions. With 250 calls/day
quota, this covers ~6,250 questions per day.
Usage:
python3 scripts/verify_math.py # Verify all unverified
python3 scripts/verify_math.py --chunk-size 20 # Smaller chunks
python3 scripts/verify_math.py --limit 500 # Only first 500
python3 scripts/verify_math.py --dry-run # Show plan without calling
"""
import argparse
import json
import re
import subprocess
import sys
import time
from collections import Counter
from datetime import datetime
from pathlib import Path
# Vault root: this script lives in <vault>/scripts/, so go up two levels.
BASE = Path(__file__).parent.parent
# Canonical question corpus — a JSON array of question dicts.
CORPUS_PATH = BASE / "corpus.json"
# Per-run verification result files are written here.
RESULTS_DIR = BASE / "scripts" / "_verification_results"
# Gemini model used for the independent verification pass.
MODEL = "gemini-3.1-pro-preview"
def build_verification_prompt(questions):
    """Render one batch of corpus questions into a Gemini verification prompt.

    Each question becomes a numbered entry carrying its id, title, scenario,
    napkin math, proposed solution, and topic/track context. The entries are
    appended below a fixed instruction header that pins the response format
    to a bare JSON array of per-question verdicts.
    """
    entries = []
    for idx, question in enumerate(questions, start=1):
        detail = question.get("details", {})
        entries.append(
            f"Q{idx} [id={question['id']}]: {question.get('title', 'untitled')}\n"
            f" Scenario: {question.get('scenario', 'N/A')}\n"
            f" Napkin Math: {detail.get('napkin_math', 'N/A')}\n"
            f" Solution: {detail.get('realistic_solution', 'N/A')}\n"
            f" Hardware: topic={question.get('topic')}, track={question.get('track')}"
        )
    joined = "\n\n".join(entries)
    return f"""You are a meticulous ML systems math verifier. For each question below,
check the napkin math and hardware specs for correctness. Output ONLY a JSON array
where each element is:
{{"id": "question-id", "status": "CORRECT|ERROR|WARN", "issues": ["issue1", ...], "corrections": ["fix1", ...]}}
Rules:
- CORRECT: math is right, specs are accurate
- ERROR: math produces wrong answer, or hardware spec is factually wrong
- WARN: minor issue (rounding, approximation is reasonable but imprecise)
Check specifically:
1. Are hardware specs accurate? (H100: 80GB HBM3, 3.35TB/s, 989 TFLOPS FP16; A100: 80GB HBM2e, 2TB/s, 312 TFLOPS; MI300X: 192GB HBM3, 5.3TB/s; Jetson Orin: 32GB LPDDR5, 275 TOPS)
2. Is the arithmetic correct? (multiplication, division, unit conversions)
3. Are the formulas correct? (roofline, KV-cache sizing, model memory, AllReduce time)
4. Are the conclusions consistent with the math?
Output ONLY the JSON array, no markdown, no explanation.
--- QUESTIONS TO VERIFY ---
{joined}"""
def call_gemini(prompt, timeout=300):
    """Send *prompt* to the `gemini` CLI and parse its stdout as JSON.

    Returns a ``(data, error)`` pair: ``(parsed, None)`` on success, or
    ``(None, message)`` on nonzero exit, timeout, JSON parse failure, or
    any other exception (e.g. the CLI binary being absent).
    """
    try:
        proc = subprocess.run(
            ["gemini", "-m", MODEL, "-o", "text"],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return None, "Timeout"
    except Exception as exc:
        return None, str(exc)
    if proc.returncode != 0:
        return None, proc.stderr[:200]
    payload = proc.stdout.strip()
    # The model sometimes wraps its answer in markdown code fences; peel them.
    if payload.startswith("```"):
        payload = re.sub(r"^```\w*\n?", "", payload)
        payload = re.sub(r"\n?```$", "", payload)
    try:
        return json.loads(payload.strip()), None
    except json.JSONDecodeError as exc:
        return None, f"JSON parse error: {exc}"
def main():
    """Run the verification pass end to end.

    Loads the corpus, selects the questions to verify, batches them into
    chunks, sends each chunk to Gemini, stamps the verdicts back onto the
    matching corpus entries, and writes both a timestamped results file
    and the updated corpus. Stops early if the API quota is exhausted.
    """
    parser = argparse.ArgumentParser(description="Verify math in StaffML corpus")
    parser.add_argument(
        "--chunk-size", type=int, default=25, help="Questions per API call"
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Max questions to verify (0=all)"
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="Show plan without calling API"
    )
    parser.add_argument(
        "--unverified-only",
        action="store_true",
        help="Only verify questions not yet math-verified",
    )
    args = parser.parse_args()

    # Fix: read via a context manager with explicit UTF-8 (the original
    # leaked the handle with json.load(open(...)) and relied on the
    # platform default encoding, which breaks on non-UTF-8 locales since
    # the corpus is written with ensure_ascii=False).
    with open(CORPUS_PATH, encoding="utf-8") as f:
        corpus = json.load(f)

    # Only published questions are eligible for verification.
    published = [q for q in corpus if q.get("status") == "published"]
    if args.unverified_only:
        to_verify = [q for q in published if not q.get("math_verified")]
    else:
        to_verify = published
    if args.limit:
        to_verify = to_verify[: args.limit]

    # Ceiling division: number of API calls required.
    n_chunks = (len(to_verify) + args.chunk_size - 1) // args.chunk_size
    print("Math Verification Pass")
    print(f" Model: {MODEL}")
    print(f" Questions to verify: {len(to_verify)}")
    print(f" Chunk size: {args.chunk_size}")
    print(f" API calls needed: {n_chunks}")
    print(f" Estimated time: {n_chunks * 30 // 60} min")
    print()
    if n_chunks > 250:
        print(f" WARNING: {n_chunks} calls exceeds daily quota of ~250")
        print(f" Will need {(n_chunks + 249) // 250} days to complete")
        print()
    if args.dry_run:
        print("DRY RUN — no API calls made")
        return

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = RESULTS_DIR / f"math-verify-{timestamp}.json"

    all_results = []
    errors = 0
    warnings = 0
    correct = 0
    api_errors = 0
    start = time.time()

    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * args.chunk_size
        chunk_end = min(chunk_start + args.chunk_size, len(to_verify))
        chunk = to_verify[chunk_start:chunk_end]
        prompt = build_verification_prompt(chunk)
        results, error = call_gemini(prompt)
        if error:
            api_errors += 1
            print(f" [{chunk_idx + 1}/{n_chunks}] API ERROR: {error}")
            # Quota exhaustion is terminal for the day; anything else is
            # worth retrying on the next chunk.
            if "QuotaError" in str(error) or "exhausted" in str(error):
                print(" QUOTA EXHAUSTED — stopping. Resume tomorrow.")
                break
            continue
        if results:
            # Tally verdicts and surface ERROR details immediately.
            for r in results:
                all_results.append(r)
                status = r.get("status", "UNKNOWN")
                if status == "CORRECT":
                    correct += 1
                elif status == "ERROR":
                    errors += 1
                    qid = r.get("id", "?")
                    issues = r.get("issues", [])
                    print(f" ERROR {qid}: {'; '.join(issues[:2])}")
                elif status == "WARN":
                    warnings += 1
            # Stamp verification metadata back onto matching corpus entries.
            results_by_id = {r["id"]: r for r in results if "id" in r}
            for q in corpus:
                if q.get("id") in results_by_id:
                    r = results_by_id[q["id"]]
                    q["math_verified"] = True
                    q["math_status"] = r.get("status", "UNKNOWN")
                    q["math_issues"] = r.get("issues", [])
                    q["math_corrections"] = r.get("corrections", [])
                    q["math_model"] = MODEL
                    q["math_date"] = datetime.now().strftime("%Y-%m-%d")
        done = chunk_end
        elapsed = time.time() - start
        rate = done / elapsed if elapsed > 0 else 0
        print(
            f" [{chunk_idx + 1}/{n_chunks}] "
            f"verified={done} correct={correct} "
            f"errors={errors} warnings={warnings} "
            f"({rate:.1f} q/s)"
        )
        # Rate limiting — don't hammer the API. Fix: skip the pointless
        # pause after the final chunk.
        if chunk_idx + 1 < n_chunks:
            time.sleep(2)

    # Fix: write via context managers with explicit UTF-8 so the handles
    # are flushed/closed deterministically (the original used
    # json.dump(..., open(..., "w")) and leaked both handles).
    with open(results_file, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2)
    with open(CORPUS_PATH, "w", encoding="utf-8") as f:
        json.dump(corpus, f, indent=2, ensure_ascii=False)

    elapsed = time.time() - start
    print(f"\nDone: {correct} correct, {errors} errors, {warnings} warnings")
    print(f"API errors: {api_errors}")
    print(f"Time: {elapsed:.0f}s")
    print(f"Results: {results_file}")
    print("Corpus updated with math_verified stamps")
    if errors > 0:
        print(f"\n{errors} questions need math corrections.")
        print("Run: python3 scripts/verify_math.py --unverified-only")
# Entry point guard: allows importing this module without side effects.
if __name__ == "__main__":
    main()