mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 10:08:50 -05:00
Full independent cross-model verification by gemini-3.1-pro-preview: - 8,419/9,226 questions verified (91.2%) - 7,376 CORRECT (87.6%), 697 ERROR (8.3%), 346 WARN (4.1%) - 7 chunks had JSON parse failures (9,226 - 8,419 = 807 unverified) Systematic fixes applied: - MI300X 1300→1307 TFLOPS (127 occurrences) All questions stamped with math_verified, math_status, math_issues, math_model fields. Error list at scripts/_verification_results/. 19/19 invariants pass. Paper figures rebuilt.
231 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Gemini-powered math verification pass for StaffML corpus.
|
|
|
|
Sends chunks of questions to gemini-3.1-pro-preview for independent
|
|
math verification. Each call checks ~25 questions. With 250 calls/day
|
|
quota, this covers ~6,250 questions per day.
|
|
|
|
Usage:
|
|
python3 scripts/verify_math.py # Verify all unverified
|
|
python3 scripts/verify_math.py --chunk-size 20 # Smaller chunks
|
|
python3 scripts/verify_math.py --limit 500 # Only first 500
|
|
python3 scripts/verify_math.py --dry-run # Show plan without calling
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Repo root: this script lives in <repo>/scripts/, so parent.parent is the root.
BASE = Path(__file__).parent.parent
# Question corpus: read, stamped with math_* fields, and rewritten in place by main().
CORPUS_PATH = BASE / "corpus.json"
# Per-run verification output files (math-verify-<timestamp>.json) land here.
RESULTS_DIR = BASE / "scripts" / "_verification_results"

# Model name passed to the `gemini` CLI via -m.
MODEL = "gemini-3.1-pro-preview"
|
def build_verification_prompt(questions):
    """Build a prompt asking Gemini to verify math in a batch of questions.

    Args:
        questions: List of corpus question dicts. Each must have an "id";
            "title", "scenario", "topic", "track" and a "details" dict with
            "napkin_math" / "realistic_solution" are optional (defaulted).

    Returns:
        A single prompt string instructing the model to emit ONLY a JSON
        array with one {id, status, issues, corrections} object per question.
    """
    q_texts = []
    for i, q in enumerate(questions):
        details = q.get("details", {})
        q_texts.append(
            f"Q{i + 1} [id={q['id']}]: {q.get('title', 'untitled')}\n"
            f" Scenario: {q.get('scenario', 'N/A')}\n"
            f" Napkin Math: {details.get('napkin_math', 'N/A')}\n"
            f" Solution: {details.get('realistic_solution', 'N/A')}\n"
            f" Hardware: topic={q.get('topic')}, track={q.get('track')}"
        )

    batch_text = "\n\n".join(q_texts)

    # FIX: the spec sheet listed TFLOPS for H100/A100 but not MI300X, yet the
    # sync notes record MI300X TFLOPS (1300 -> 1307) as the single largest
    # error class. Give the verifier the reference number (dense FP16 peak).
    return f"""You are a meticulous ML systems math verifier. For each question below,
check the napkin math and hardware specs for correctness. Output ONLY a JSON array
where each element is:
{{"id": "question-id", "status": "CORRECT|ERROR|WARN", "issues": ["issue1", ...], "corrections": ["fix1", ...]}}

Rules:
- CORRECT: math is right, specs are accurate
- ERROR: math produces wrong answer, or hardware spec is factually wrong
- WARN: minor issue (rounding, approximation is reasonable but imprecise)

Check specifically:
1. Are hardware specs accurate? (H100: 80GB HBM3, 3.35TB/s, 989 TFLOPS FP16; A100: 80GB HBM2e, 2TB/s, 312 TFLOPS; MI300X: 192GB HBM3, 5.3TB/s, 1307 TFLOPS FP16; Jetson Orin: 32GB LPDDR5, 275 TOPS)
2. Is the arithmetic correct? (multiplication, division, unit conversions)
3. Are the formulas correct? (roofline, KV-cache sizing, model memory, AllReduce time)
4. Are the conclusions consistent with the math?

Output ONLY the JSON array, no markdown, no explanation.

--- QUESTIONS TO VERIFY ---

{batch_text}"""
|
|
|
|
|
|
def call_gemini(prompt, timeout=300):
    """Run the `gemini` CLI on *prompt* and return ``(parsed, error)``.

    Exactly one element of the pair is non-None: on success the decoded
    JSON payload and ``None``; on any failure ``None`` plus a short
    human-readable error string (never raises).
    """
    try:
        proc = subprocess.run(
            ["gemini", "-m", MODEL, "-o", "text"],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if proc.returncode != 0:
            # Surface only a prefix of stderr to keep log lines short.
            return None, proc.stderr[:200]

        payload = proc.stdout.strip()
        # Models sometimes wrap the JSON in markdown fences; peel them off.
        if payload.startswith("```"):
            payload = re.sub(r"^```\w*\n?", "", payload)
            payload = re.sub(r"\n?```$", "", payload)

        return json.loads(payload.strip()), None
    except json.JSONDecodeError as exc:
        return None, f"JSON parse error: {exc}"
    except subprocess.TimeoutExpired:
        return None, "Timeout"
    except Exception as exc:
        return None, str(exc)
|
|
|
|
|
|
def main():
    """Drive one verification pass.

    Selects published (optionally still-unverified) questions, sends them to
    Gemini in fixed-size chunks, stamps math_* fields back into the corpus,
    and writes a per-run results file under RESULTS_DIR.
    """
    parser = argparse.ArgumentParser(description="Verify math in StaffML corpus")
    parser.add_argument(
        "--chunk-size", type=int, default=25, help="Questions per API call"
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Max questions to verify (0=all)"
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="Show plan without calling API"
    )
    parser.add_argument(
        "--unverified-only",
        action="store_true",
        help="Only verify questions not yet math-verified",
    )
    args = parser.parse_args()

    # FIX: json.load(open(...)) leaked the file handle; explicit encoding
    # because the corpus is written with ensure_ascii=False (non-ASCII bytes).
    with open(CORPUS_PATH, encoding="utf-8") as fh:
        corpus = json.load(fh)
    published = [q for q in corpus if q.get("status") == "published"]

    if args.unverified_only:
        to_verify = [q for q in published if not q.get("math_verified")]
    else:
        to_verify = published

    if args.limit:
        to_verify = to_verify[: args.limit]

    # Ceiling division: number of API calls needed.
    n_chunks = (len(to_verify) + args.chunk_size - 1) // args.chunk_size

    print("Math Verification Pass")
    print(f" Model: {MODEL}")
    print(f" Questions to verify: {len(to_verify)}")
    print(f" Chunk size: {args.chunk_size}")
    print(f" API calls needed: {n_chunks}")
    print(f" Estimated time: {n_chunks * 30 // 60} min")  # assumes ~30s/call
    print()

    if n_chunks > 250:
        print(f" WARNING: {n_chunks} calls exceeds daily quota of ~250")
        print(f" Will need {(n_chunks + 249) // 250} days to complete")
        print()

    if args.dry_run:
        print("DRY RUN — no API calls made")
        return

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = RESULTS_DIR / f"math-verify-{timestamp}.json"

    all_results = []
    errors = 0
    warnings = 0
    correct = 0
    api_errors = 0

    start = time.time()

    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * args.chunk_size
        chunk_end = min(chunk_start + args.chunk_size, len(to_verify))
        chunk = to_verify[chunk_start:chunk_end]

        prompt = build_verification_prompt(chunk)
        results, error = call_gemini(prompt)

        if error:
            api_errors += 1
            print(f" [{chunk_idx + 1}/{n_chunks}] API ERROR: {error}")
            # Quota exhaustion is terminal for today; any other API error
            # just skips this chunk and moves on.
            if "QuotaError" in str(error) or "exhausted" in str(error):
                print(" QUOTA EXHAUSTED — stopping. Resume tomorrow.")
                break
            continue

        if results:
            for r in results:
                all_results.append(r)
                status = r.get("status", "UNKNOWN")
                if status == "CORRECT":
                    correct += 1
                elif status == "ERROR":
                    errors += 1
                    qid = r.get("id", "?")
                    issues = r.get("issues", [])
                    print(f" ERROR {qid}: {'; '.join(issues[:2])}")
                elif status == "WARN":
                    warnings += 1

            # Stamp results back into corpus
            stamp_date = datetime.now().strftime("%Y-%m-%d")  # hoisted out of the loop
            results_by_id = {r["id"]: r for r in results if "id" in r}
            for q in corpus:
                if q.get("id") in results_by_id:
                    r = results_by_id[q["id"]]
                    q["math_verified"] = True
                    q["math_status"] = r.get("status", "UNKNOWN")
                    q["math_issues"] = r.get("issues", [])
                    q["math_corrections"] = r.get("corrections", [])
                    q["math_model"] = MODEL
                    q["math_date"] = stamp_date

        done = chunk_end
        elapsed = time.time() - start
        rate = done / elapsed if elapsed > 0 else 0
        print(
            f" [{chunk_idx + 1}/{n_chunks}] "
            f"verified={done} correct={correct} "
            f"errors={errors} warnings={warnings} "
            f"({rate:.1f} q/s)"
        )

        # Rate limiting — don't hammer the API
        time.sleep(2)

    # Save results. FIX: json.dump(..., open(..., "w")) never closed the
    # handles; context managers guarantee the data is flushed to disk.
    with open(results_file, "w", encoding="utf-8") as fh:
        json.dump(all_results, fh, indent=2)
    with open(CORPUS_PATH, "w", encoding="utf-8") as fh:
        json.dump(corpus, fh, indent=2, ensure_ascii=False)

    elapsed = time.time() - start
    print(f"\nDone: {correct} correct, {errors} errors, {warnings} warnings")
    print(f"API errors: {api_errors}")
    print(f"Time: {elapsed:.0f}s")
    print(f"Results: {results_file}")
    print("Corpus updated with math_verified stamps")

    if errors > 0:
        # NOTE(review): errored questions were just stamped math_verified=True,
        # so the suggested --unverified-only rerun will NOT revisit them —
        # confirm this is the intended correction workflow.
        print(f"\n{errors} questions need math corrections.")
        print("Run: python3 scripts/verify_math.py --unverified-only")
|
|
|
|
|
|
# Script entry point — all argument parsing and side effects live in main().
if __name__ == "__main__":
    main()
|