The Phase 0 cleanup removed 18 scripts as deprecated, but 6 of them
capture unique-capability patterns not yet covered by the modern tooling.
This change restores them as reference patterns, not active scripts.
What's restored and why:
gemini_backfill_question.py
Idempotent corpus-walk + Gemini batching + thread pool + JSON↔YAML
round-trip. The "fix one field across thousands of YAMLs" pattern
(sketched below). To be mined in CORPUS_HARDENING_PLAN.md Phase 5.
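A minimal sketch of that pattern (assumes the vault/questions/**/*.yaml
layout and a hypothetical ask_model() helper; the real script adds Gemini
batching and a thread pool):

    from pathlib import Path

    import yaml

    def backfill_field(root="vault/questions", field="question"):
        """Fill `field` in every YAML that lacks it; safe to re-run."""
        for path in sorted(Path(root).rglob("*.yaml")):
            doc = yaml.safe_load(path.read_text())
            if doc.get(field):  # idempotent: already-filled files are skipped
                continue
            doc[field] = ask_model(doc)  # hypothetical model call
            path.write_text(yaml.safe_dump(doc, sort_keys=False, allow_unicode=True))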
gpt_backfill_question.py
OpenAI variant of the above. Cross-provider template.
gemini_cli_generate_questions.py (35K)
BATCHED generation: 12 cells per call with balanced track × area ×
zone × level round-robin. `vault generate` does NOT batch — it calls
once per question. This script's batching pattern (sketched below) is
what we want when generating >100 questions in bulk.
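A sketch of the round-robin batcher (cells_by_group is a hypothetical
mapping from each track/area/zone/level combination to its pending cells):

    from itertools import islice

    def round_robin(*iterables):
        """Interleave items: A1, B1, C1, A2, B2, ... (uneven lengths OK)."""
        iterators = [iter(it) for it in iterables]
        while iterators:
            alive = []
            for it in iterators:
                try:
                    yield next(it)
                    alive.append(it)
                except StopIteration:
                    pass
            iterators = alive

    def balanced_batches(cells_by_group, batch_size=12):
        mixed = round_robin(*cells_by_group.values())
        while batch := list(islice(mixed, batch_size)):
            yield batch  # each batch goes out as ONE generation call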
generate.py (30K)
Coverage-survey-driven generation engine: surveys the corpus, finds
empty cells, generates to fill the emptiest first, and stops when
saturated (loop sketched below). `vault generate` lacks this
auto-balance loop.
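In outline (survey_corpus() and generate_for_cell() are hypothetical
stand-ins for the script's internals):

    def fill_until_saturated(target_per_cell=5, max_rounds=50):
        for _ in range(max_rounds):
            counts = survey_corpus()  # hypothetical: {cell: question_count}
            todo = [c for c in sorted(counts, key=counts.get)  # emptiest first
                    if counts[c] < target_per_cell]
            if not todo:  # every cell at target: saturated, stop
                return
            for cell in todo[:10]:  # fill the worst cells this round
                generate_for_cell(cell)  # hypothetical generation call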
gemini_fix_errors.py
Batch error-fixer with hardware-reference grounding (V100 / A100 /
H100 / B200 / T4 specs as ground-truth context). To be mined for
audit_corpus_batched.py --propose-fixes in Phase 5.
deep_verify.py
Claude Opus + extended thinking; SHOWS ITS WORK on every napkin-math
claim. Useful as a tiebreaker on borderline math findings from the
lightweight audit. Full script below.
Each restored file has a 5-line STATUS comment block at the top
documenting what to adapt before running. DEPRECATED.md is restructured
to make the three categories explicit (removed / preserved-for-adaptation
/ active-migration) and gains an adaptation checklist that applies to
all preserved scripts: replace corpus.json loading, verify SDK pins,
update output paths, re-validate prompts, sample first. A sketch of the
first item follows.
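A minimal loader sketch (assumes one question per YAML file under
vault/questions):

    from pathlib import Path

    import yaml

    def load_questions(root="vault/questions"):
        # Drop-in replacement for json.load(open(CORPUS_PATH))
        return [yaml.safe_load(p.read_text())
                for p in sorted(Path(root).rglob("*.yaml"))]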
Validation:
vault check --strict — 10,711 loaded, 0 invariant failures
pytest — 74/74
ruff — clean

deep_verify.py (306 lines, 12 KiB, Python):

#!/usr/bin/env python3
# STATUS (2026-05-03): preserved as a reference pattern — see vault/scripts/DEPRECATED.md
# §"Preserved for adaptation". Unique among the audit scripts: Claude Opus +
# extended thinking, asks the model to SHOW ITS WORK on every napkin-math claim.
# Useful as a tiebreaker when audit_corpus_batched.py flags borderline math.
# Reads the legacy corpus.json — adapt to walk vault/questions/**/*.yaml.
"""Deep math verification using Claude Opus with extended thinking.

Unlike the lightweight verify_math.py (which checks ~25 Qs per call
with brief responses), this script sends smaller batches and asks the
model to SHOW ITS WORK — step-by-step recalculation of every napkin-math
claim, with explicit arithmetic checks.

Usage:
    python3 scripts/deep_verify.py                          # Verify all unverified
    python3 scripts/deep_verify.py --sample 200             # Random sample
    python3 scripts/deep_verify.py --topic flash-attention  # Single topic
    python3 scripts/deep_verify.py --errors-only            # Re-check previous errors
    python3 scripts/deep_verify.py --dry-run                # Plan only
"""

import argparse
import json
import random
import re
import subprocess
import time
from collections import Counter, defaultdict
from datetime import datetime
from pathlib import Path

BASE = Path(__file__).parent.parent
CORPUS_PATH = BASE / "corpus.json"
RESULTS_DIR = BASE / "scripts" / "_verification_results"

# Hardware reference specs for verification prompts
HARDWARE_SPECS = """
AUTHORITATIVE HARDWARE SPECS (use these to check questions):
- H100 SXM: 80 GB HBM3, 3.35 TB/s bandwidth, 989 TFLOPS FP16 dense, 1,979 with sparsity, 700W TDP
- A100 SXM: 80 GB HBM2e, 2.0 TB/s bandwidth, 312 TFLOPS FP16 dense, 624 with sparsity, 400W TDP
- MI300X: 192 GB HBM3, 5.3 TB/s bandwidth, 1,307 TFLOPS FP16, 750W TDP, 8 XCDs
- TPU v5e: 16 GB HBM, 1.6 TB/s bandwidth, 197 TFLOPS BF16
- Jetson Orin: 32 GB LPDDR5, 275 TOPS INT8, 60W TDP
- Hailo-8: 26 TOPS INT8, 2.5W
- Coral Edge TPU: 4 TOPS INT8, 2W
- A17 Pro: 35 TOPS Neural Engine, 8 GB unified
- Snapdragon 8 Gen 3: 45 TOPS INT8 Hexagon NPU, 16 GB LPDDR5X
- Cortex-M4: 168 MHz (NOT 240 MHz), 256 KB SRAM, no FPU
- Cortex-M7: 480 MHz, 512 KB SRAM
- ESP32-S3: 240 MHz dual-core, 512 KB SRAM, 8 MB PSRAM

KEY FORMULAS:
- Model memory (FP16): params × 2 bytes
- Adam optimizer state: params × (2 + 4 + 4 + 4) = params × 14 bytes (FP16 weights + FP32 master + momentum + variance)
- KV cache per token: 2 × n_layers × n_kv_heads × head_dim × bytes_per_element
- Ridge point: peak_FLOPS / memory_bandwidth (FLOPs/Byte)
- Arithmetic intensity: FLOPs / bytes_accessed
- Ring AllReduce time: 2 × (N-1)/N × message_size / bandwidth
"""


def build_deep_verify_prompt(questions):
    """Build a prompt that asks for step-by-step verification."""
    q_texts = []
    for i, q in enumerate(questions):
        details = q.get("details", {})
        q_texts.append(f"""
--- QUESTION {i+1} [id={q['id']}] ---
Title: {q.get('title', 'N/A')}
Topic: {q.get('topic', 'N/A')} | Track: {q.get('track', 'N/A')} | Zone: {q.get('zone', 'N/A')}
Scenario: {q.get('scenario', 'N/A')}
Napkin Math: {details.get('napkin_math', 'N/A')}
Solution: {details.get('realistic_solution', 'N/A')}
Common Mistake: {details.get('common_mistake', 'N/A')}
""")

    batch_text = "\n".join(q_texts)

    return f"""You are a meticulous ML systems verification engine. For each question below,
you MUST independently recalculate every numerical claim from first principles.

{HARDWARE_SPECS}

VERIFICATION PROTOCOL:
For each question:
1. IDENTIFY all numerical claims (hardware specs, computed values, conclusions)
2. CHECK each hardware spec against the authoritative list above
3. RECALCULATE every derived number step by step (show your arithmetic)
4. VERIFY the conclusion follows from the math
5. CHECK units (TFLOPS vs GFLOPS, GB vs MB, seconds vs milliseconds, TB/s vs GB/s)

Output a JSON array. For each question:
{{
  "id": "question-id",
  "status": "CORRECT|ERROR|WARN",
  "hardware_specs_correct": true|false,
  "arithmetic_verified": true|false,
  "units_consistent": true|false,
  "conclusion_follows": true|false,
  "recalculation": "Brief step-by-step of the key calculation",
  "issues": ["specific issue descriptions"],
  "corrections": ["exact fix needed"],
  "severity": "critical|moderate|minor|none"
}}

Rules:
- CORRECT: everything checks out
- ERROR: wrong answer, wrong spec, wrong formula, or wrong conclusion
- WARN: approximation is loose but defensible, or minor inconsistency
- Be STRICT on hardware specs (use the authoritative list)
- Be STRICT on arithmetic (recalculate, don't trust)
- Be LENIENT on reasonable approximations (±10% is fine for napkin math)

Output ONLY the JSON array.

--- QUESTIONS TO VERIFY ---
{batch_text}"""


def run_claude_verify(prompt, timeout=300):
    """Run verification through Claude CLI."""
    try:
        r = subprocess.run(
            ["claude", "-p", prompt, "--model", "opus", "--output-format", "json"],
            capture_output=True, text=True, timeout=timeout,
        )
        if r.returncode != 0:
            return None, f"CLI error: {r.stderr[:200]}"

        # Parse response — claude CLI with --output-format json wraps in a structure
        try:
            response = json.loads(r.stdout)
            # Extract the text content
            if isinstance(response, dict) and "result" in response:
                text = response["result"]
            elif isinstance(response, dict) and "content" in response:
                text = response["content"]
            elif isinstance(response, str):
                text = response
            else:
                text = r.stdout

            # Strip markdown fences
            if isinstance(text, str) and text.strip().startswith("```"):
                text = re.sub(r"^```\w*\n?", "", text.strip())
                text = re.sub(r"\n?```$", "", text)

            return json.loads(text.strip()), None
        except json.JSONDecodeError:
            # Fall back to the first JSON array found anywhere in the output
            match = re.search(r"\[[\s\S]*\]", r.stdout)
            if match:
                return json.loads(match.group()), None
            return None, "JSON parse error in response"

    except subprocess.TimeoutExpired:
        return None, "Timeout"
    except Exception as e:
        return None, str(e)
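

# Adaptation note (sketch, not in the original flow): the CLI call above has
# no retry, so a transient timeout skips the whole chunk. A thin wrapper:
def run_claude_verify_with_retry(prompt, retries=1):
    results, error = run_claude_verify(prompt)
    for attempt in range(retries):
        if results is not None:
            break
        time.sleep(5 * (attempt + 1))  # brief backoff before retrying
        results, error = run_claude_verify(prompt)
    return results, error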


def main():
    parser = argparse.ArgumentParser(description="Deep math verification with Opus")
    parser.add_argument("--chunk-size", type=int, default=10,
                        help="Questions per verification call (smaller = more thorough)")
    parser.add_argument("--sample", type=int, default=0,
                        help="Random sample size (0=all published)")
    parser.add_argument("--topic", type=str, default=None,
                        help="Verify only this topic")
    parser.add_argument("--errors-only", action="store_true",
                        help="Re-verify only previously flagged errors")
    parser.add_argument("--dry-run", action="store_true",
                        help="Show plan without calling API")
    parser.add_argument("--workers", type=int, default=1,
                        help="Parallel verification workers")  # NOTE: parsed but not yet used; calls run sequentially
    parser.add_argument("--stratified", action="store_true",
                        help="Stratified sample (N per topic)")
    parser.add_argument("--per-topic", type=int, default=3,
                        help="Questions per topic for stratified sampling")
    args = parser.parse_args()

    with open(CORPUS_PATH) as f:
        corpus = json.load(f)
    published = [q for q in corpus if q.get("status") == "published"]

    # Select questions to verify
    if args.errors_only:
        to_verify = [q for q in published if q.get("math_status") == "ERROR"]
        print(f"Re-verifying {len(to_verify)} previously flagged errors")
    elif args.topic:
        to_verify = [q for q in published if q.get("topic") == args.topic]
        print(f"Verifying topic '{args.topic}': {len(to_verify)} questions")
    elif args.stratified:
        by_topic = defaultdict(list)
        for q in published:
            by_topic[q.get("topic", "")].append(q)
        to_verify = []
        for topic, qs in sorted(by_topic.items()):
            n = min(args.per_topic, len(qs))
            to_verify.extend(random.sample(qs, n))
        print(f"Stratified sample: {args.per_topic}/topic = {len(to_verify)} questions")
    elif args.sample:
        to_verify = random.sample(published, min(args.sample, len(published)))
        print(f"Random sample: {len(to_verify)} questions")
    else:
        to_verify = published

    # Ceiling division: number of API calls needed
    n_chunks = (len(to_verify) + args.chunk_size - 1) // args.chunk_size

    print("\nDeep Verification Plan:")
    print(f"  Questions: {len(to_verify)}")
    print(f"  Chunk size: {args.chunk_size} (smaller = more thorough)")
    print(f"  API calls: {n_chunks}")
    print("  Model: Claude Opus (extended thinking)")

    if args.dry_run:
        print("\nDRY RUN — no API calls")
        return

    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    results_file = RESULTS_DIR / f"deep-verify-{timestamp}.json"

    all_results = []
    stats = Counter()
    start = time.time()

    for chunk_idx in range(n_chunks):
        chunk_start = chunk_idx * args.chunk_size
        chunk_end = min(chunk_start + args.chunk_size, len(to_verify))
        chunk = to_verify[chunk_start:chunk_end]

        prompt = build_deep_verify_prompt(chunk)
        results, error = run_claude_verify(prompt)

        if error:
            print(f"  [{chunk_idx+1}/{n_chunks}] ERROR: {error}")
            stats["api_error"] += 1
            continue

        if results:
            for r in results:
                all_results.append(r)
                status = r.get("status", "UNKNOWN")
                stats[status] += 1

                if status == "ERROR":
                    qid = r.get("id", "?")
                    severity = r.get("severity", "?")
                    issues = r.get("issues", [])
                    recalc = r.get("recalculation", "")
                    print(f"  ❌ {qid} [{severity}]: {'; '.join(issues[:2])}")
                    if recalc:
                        print(f"     Recalc: {recalc[:100]}")

            # Stamp results into corpus
            results_by_id = {r["id"]: r for r in results if "id" in r}
            for q in corpus:
                if q.get("id") in results_by_id:
                    r = results_by_id[q["id"]]
                    q["math_verified"] = True
                    q["math_status"] = r.get("status", "UNKNOWN")
                    q["math_issues"] = r.get("issues", [])
                    q["math_corrections"] = r.get("corrections", [])
                    q["math_severity"] = r.get("severity", "none")
                    q["math_model"] = "claude-opus-4-6"
                    q["math_date"] = datetime.now().strftime("%Y-%m-%d")

        done = chunk_end
        elapsed = time.time() - start
        rate = done / elapsed if elapsed > 0 else 0
        print(f"  [{chunk_idx+1}/{n_chunks}] "
              f"done={done} correct={stats.get('CORRECT',0)} "
              f"error={stats.get('ERROR',0)} warn={stats.get('WARN',0)} "
              f"({rate:.1f} q/s)")

        time.sleep(1)

    # Save per-run results and write verification stamps back to the corpus
    with open(results_file, "w") as f:
        json.dump(all_results, f, indent=2)
    with open(CORPUS_PATH, "w") as f:
        json.dump(corpus, f, indent=2, ensure_ascii=False)

    elapsed = time.time() - start
    total_verified = sum(stats.values()) - stats.get("api_error", 0)
    print(f"\n{'='*60}")
    print("DEEP VERIFICATION COMPLETE")
    print(f"{'='*60}")
    print(f"Verified: {total_verified}")
    print(f"  CORRECT: {stats.get('CORRECT', 0)} ({100*stats.get('CORRECT',0)/max(total_verified,1):.1f}%)")
    print(f"  ERROR:   {stats.get('ERROR', 0)} ({100*stats.get('ERROR',0)/max(total_verified,1):.1f}%)")
    print(f"  WARN:    {stats.get('WARN', 0)} ({100*stats.get('WARN',0)/max(total_verified,1):.1f}%)")
    print(f"Time: {elapsed:.0f}s")
    print(f"Results: {results_file}")

    if stats.get("ERROR", 0) > 0:
        error_rate = stats["ERROR"] / total_verified
        projected = int(error_rate * len(published))
        print(f"\nProjected errors corpus-wide: ~{projected}")
        print("Run with --errors-only to re-verify after fixing")


if __name__ == "__main__":
    main()