mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 10:08:50 -05:00
Full independent cross-model verification by gemini-3.1-pro-preview: - 8,419/9,226 questions verified (91.2%) - 7,376 CORRECT (87.6%), 697 ERROR (8.3%), 346 WARN (4.1%) - 7 chunks had JSON parse failures (9,226 - 8,419 = 807 unverified) Systematic fixes applied: - MI300X 1300→1307 TFLOPS (127 occurrences) All questions stamped with math_verified, math_status, math_issues, math_model fields. Error list at scripts/_verification_results/. 19/19 invariants pass. Paper figures rebuilt.
231 lines
7.8 KiB
Python
#!/usr/bin/env python3
|
|
"""Gemini-powered math verification pass for StaffML corpus.
|
|
|
|
Sends chunks of questions to gemini-3.1-pro-preview for independent
|
|
math verification. Each call checks ~25 questions. With 250 calls/day
|
|
quota, this covers ~6,250 questions per day.
|
|
|
|
Usage:
|
|
python3 scripts/verify_math.py # Verify all unverified
|
|
python3 scripts/verify_math.py --chunk-size 20 # Smaller chunks
|
|
python3 scripts/verify_math.py --limit 500 # Only first 500
|
|
python3 scripts/verify_math.py --dry-run # Show plan without calling
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import time
|
|
from collections import Counter
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# Repo root: this script lives in <repo>/scripts/, so parent.parent is the root.
BASE = Path(__file__).parent.parent
# Question corpus: read, stamped with math_* fields, and rewritten in place by main().
CORPUS_PATH = BASE / "corpus.json"
# Per-run verification output files (math-verify-<timestamp>.json) land here.
RESULTS_DIR = BASE / "scripts" / "_verification_results"

# Model name passed to the `gemini` CLI via -m.
MODEL = "gemini-3.1-pro-preview"
|
def build_verification_prompt(questions):
    """Build a prompt asking Gemini to verify math in a batch of questions.

    Args:
        questions: List of corpus question dicts. Each must have an "id";
            "title", "scenario", "topic", "track" and a "details" dict with
            "napkin_math" / "realistic_solution" are optional (defaulted).

    Returns:
        A single prompt string instructing the model to emit ONLY a JSON
        array with one {id, status, issues, corrections} object per question.
    """
    q_texts = []
    for i, q in enumerate(questions):
        details = q.get("details", {})
        q_texts.append(
            f"Q{i + 1} [id={q['id']}]: {q.get('title', 'untitled')}\n"
            f" Scenario: {q.get('scenario', 'N/A')}\n"
            f" Napkin Math: {details.get('napkin_math', 'N/A')}\n"
            f" Solution: {details.get('realistic_solution', 'N/A')}\n"
            f" Hardware: topic={q.get('topic')}, track={q.get('track')}"
        )

    batch_text = "\n\n".join(q_texts)

    # FIX: the spec sheet listed TFLOPS for H100/A100 but not MI300X, yet the
    # sync notes record MI300X TFLOPS (1300 -> 1307) as the single largest
    # error class. Give the verifier the reference number (dense FP16 peak).
    return f"""You are a meticulous ML systems math verifier. For each question below,
check the napkin math and hardware specs for correctness. Output ONLY a JSON array
where each element is:
{{"id": "question-id", "status": "CORRECT|ERROR|WARN", "issues": ["issue1", ...], "corrections": ["fix1", ...]}}

Rules:
- CORRECT: math is right, specs are accurate
- ERROR: math produces wrong answer, or hardware spec is factually wrong
- WARN: minor issue (rounding, approximation is reasonable but imprecise)

Check specifically:
1. Are hardware specs accurate? (H100: 80GB HBM3, 3.35TB/s, 989 TFLOPS FP16; A100: 80GB HBM2e, 2TB/s, 312 TFLOPS; MI300X: 192GB HBM3, 5.3TB/s, 1307 TFLOPS FP16; Jetson Orin: 32GB LPDDR5, 275 TOPS)
2. Is the arithmetic correct? (multiplication, division, unit conversions)
3. Are the formulas correct? (roofline, KV-cache sizing, model memory, AllReduce time)
4. Are the conclusions consistent with the math?

Output ONLY the JSON array, no markdown, no explanation.

--- QUESTIONS TO VERIFY ---

{batch_text}"""
|
|
|
|
|
|
def call_gemini(prompt, timeout=300):
    """Run the `gemini` CLI on *prompt* and return ``(parsed, error)``.

    Exactly one element of the pair is non-None: on success the decoded
    JSON payload and ``None``; on any failure ``None`` plus a short
    human-readable error string (never raises).
    """
    try:
        proc = subprocess.run(
            ["gemini", "-m", MODEL, "-o", "text"],
            input=prompt,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        if proc.returncode != 0:
            # Surface only a prefix of stderr to keep log lines short.
            return None, proc.stderr[:200]

        payload = proc.stdout.strip()
        # Models sometimes wrap the JSON in markdown fences; peel them off.
        if payload.startswith("```"):
            payload = re.sub(r"^```\w*\n?", "", payload)
            payload = re.sub(r"\n?```$", "", payload)

        return json.loads(payload.strip()), None
    except json.JSONDecodeError as exc:
        return None, f"JSON parse error: {exc}"
    except subprocess.TimeoutExpired:
        return None, "Timeout"
    except Exception as exc:
        return None, str(exc)
|
|
|
|
|
|
def main():
    """Drive one verification pass.

    Selects published (optionally still-unverified) questions, sends them to
    Gemini in fixed-size chunks, stamps math_* fields back into the corpus,
    and writes a per-run results file under RESULTS_DIR.
    """
    parser = argparse.ArgumentParser(description="Verify math in StaffML corpus")
    parser.add_argument(
        "--chunk-size", type=int, default=25, help="Questions per API call"
    )
    parser.add_argument(
        "--limit", type=int, default=0, help="Max questions to verify (0=all)"
    )
    parser.add_argument(
        "--dry-run", action="store_true", help="Show plan without calling API"
    )
    parser.add_argument(
        "--unverified-only",
        action="store_true",
        help="Only verify questions not yet math-verified",
    )
    args = parser.parse_args()

    # FIX: json.load(open(...)) leaked the file handle; explicit encoding
    # because the corpus is written with ensure_ascii=False (non-ASCII bytes).
    with open(CORPUS_PATH, encoding="utf-8") as fh:
        corpus = json.load(fh)
    published = [q for q in corpus if q.get("status") == "published"]

    if args.unverified_only:
        to_verify = [q for q in published if not q.get("math_verified")]
    else:
        to_verify = published

    if args.limit:
        to_verify = to_verify[: args.limit]

    # Ceiling division: number of API calls needed.
    n_chunks = (len(to_verify) + args.chunk_size - 1) // args.chunk_size

    print("Math Verification Pass")
    print(f" Model: {MODEL}")
    print(f" Questions to verify: {len(to_verify)}")
    print(f" Chunk size: {args.chunk_size}")
    print(f" API calls needed: {n_chunks}")
    print(f" Estimated time: {n_chunks * 30 // 60} min")  # assumes ~30s/call
    print()

    if n_chunks > 250:
        print(f" WARNING: {n_chunks} calls exceeds daily quota of ~250")
        print(f" Will need {(n_chunks + 249) // 250} days to complete")
        print()

    if args.dry_run:
        print("DRY RUN — no API calls made")
        return

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = RESULTS_DIR / f"math-verify-{timestamp}.json"

    all_results = []
    errors = 0
    warnings = 0
    correct = 0
    api_errors = 0

    start = time.time()

    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * args.chunk_size
        chunk_end = min(chunk_start + args.chunk_size, len(to_verify))
        chunk = to_verify[chunk_start:chunk_end]

        prompt = build_verification_prompt(chunk)
        results, error = call_gemini(prompt)

        if error:
            api_errors += 1
            print(f" [{chunk_idx + 1}/{n_chunks}] API ERROR: {error}")
            # Quota exhaustion is terminal for today; any other API error
            # just skips this chunk and moves on.
            if "QuotaError" in str(error) or "exhausted" in str(error):
                print(" QUOTA EXHAUSTED — stopping. Resume tomorrow.")
                break
            continue

        if results:
            for r in results:
                all_results.append(r)
                status = r.get("status", "UNKNOWN")
                if status == "CORRECT":
                    correct += 1
                elif status == "ERROR":
                    errors += 1
                    qid = r.get("id", "?")
                    issues = r.get("issues", [])
                    print(f" ERROR {qid}: {'; '.join(issues[:2])}")
                elif status == "WARN":
                    warnings += 1

            # Stamp results back into corpus
            stamp_date = datetime.now().strftime("%Y-%m-%d")  # hoisted out of the loop
            results_by_id = {r["id"]: r for r in results if "id" in r}
            for q in corpus:
                if q.get("id") in results_by_id:
                    r = results_by_id[q["id"]]
                    q["math_verified"] = True
                    q["math_status"] = r.get("status", "UNKNOWN")
                    q["math_issues"] = r.get("issues", [])
                    q["math_corrections"] = r.get("corrections", [])
                    q["math_model"] = MODEL
                    q["math_date"] = stamp_date

        done = chunk_end
        elapsed = time.time() - start
        rate = done / elapsed if elapsed > 0 else 0
        print(
            f" [{chunk_idx + 1}/{n_chunks}] "
            f"verified={done} correct={correct} "
            f"errors={errors} warnings={warnings} "
            f"({rate:.1f} q/s)"
        )

        # Rate limiting — don't hammer the API
        time.sleep(2)

    # Save results. FIX: json.dump(..., open(..., "w")) never closed the
    # handles; context managers guarantee the data is flushed to disk.
    with open(results_file, "w", encoding="utf-8") as fh:
        json.dump(all_results, fh, indent=2)
    with open(CORPUS_PATH, "w", encoding="utf-8") as fh:
        json.dump(corpus, fh, indent=2, ensure_ascii=False)

    elapsed = time.time() - start
    print(f"\nDone: {correct} correct, {errors} errors, {warnings} warnings")
    print(f"API errors: {api_errors}")
    print(f"Time: {elapsed:.0f}s")
    print(f"Results: {results_file}")
    print("Corpus updated with math_verified stamps")

    if errors > 0:
        # NOTE(review): errored questions were just stamped math_verified=True,
        # so the suggested --unverified-only rerun will NOT revisit them —
        # confirm this is the intended correction workflow.
        print(f"\n{errors} questions need math corrections.")
        print("Run: python3 scripts/verify_math.py --unverified-only")
|
|
|
|
|
|
# Script entry point — all argument parsing and side effects live in main().
if __name__ == "__main__":
    main()
|