cs249r_book/interviews/vault-cli/scripts/audit_chains_with_gemini.py
Vijay Janapa Reddi a74c98576e Merge origin/dev into yaml-audit
Sync the yaml-audit branch with the latest dev work since the previous
sync (5c5af75ed). Brings in 73 commits including:

  - CI security fixes: postcss XSS bump, uuid bounds bump, codeql
    paths-ignore for vendored bundles, read-only token on
    staffml-validate-vault workflow
  - kits/ dark mode polish: code-block readability, dropdown contrast
  - vault-cli/: pre-commit ruff hook + 20 ruff fixes, all-contributors
    auto-credit workflow change to pull_request_target
  - dev's earlier merge of yaml-audit (836d481b5) carrying the
    pre-trailer-strip Phase 1/2/3 history; this merge harmonises that
    with the current trailer-clean yaml-audit tip
  - misc bug fixes (tinytorch perceptron seed, infra workflows,
    socratiq vite dev injector)

Conflicts resolved (if any) preserve the yaml-audit-side authoritative
state for vault/* files (we own those) and the dev-side authoritative
state for .github/workflows/* and other shared infrastructure.

# Conflicts:
#	.github/workflows/all-contributors-auto-credit.yml
#	.github/workflows/staffml-preview-dev.yml
#	interviews/staffml/src/data/corpus-summary.json
#	interviews/staffml/src/data/vault-manifest.json
#	interviews/staffml/tests/chain-and-vault-smoke.mjs
#	interviews/vault-cli/README.md
#	interviews/vault-cli/docs/CHAIN_ROADMAP.md
#	interviews/vault-cli/scripts/build_chains_with_gemini.py
#	interviews/vault-cli/scripts/generate_question_for_gap.py
#	interviews/vault-cli/scripts/merge_chain_passes.py
#	interviews/vault-cli/scripts/validate_drafts.py
#	interviews/vault-cli/src/vault_cli/legacy_export.py
#	interviews/vault-cli/tests/test_chain_validation.py
#	interviews/vault/.gitignore
#	interviews/vault/ARCHITECTURE.md
#	interviews/vault/chains.json
#	interviews/vault/id-registry.yaml
#	interviews/vault/questions/edge/optimization/edge-2536.yaml
#	interviews/vault/questions/mobile/deployment/mobile-2147.yaml
#	tinytorch/src/03_layers/03_layers.py
2026-05-02 11:06:43 -04:00


#!/usr/bin/env python3
"""Independent audit of the Phase 1-3 chain work via gemini-3.1-pro-preview.

Designed to be a complementary check on the output of the chain-build,
tier-classification, and gap-detection pipeline — running an independent
Gemini pass over the artifacts that human review would otherwise have to
spot-check by eye.

Total call budget: ~50-60 calls (well under the 250/day Pro cap).
Per-call target: ~80K input tokens (roughly 320K chars), the sweet spot
where Gemini's attention stays sharp without burning context on ground
that won't be used.

Categories audited:

1. drafts     — All 4 Phase 3 promoted drafts (independent quality gate
                vs the validate_drafts.py judges; ~2 calls).
2. secondary  — 100-chain sample of tier=secondary chains (pedagogical
                coherence; ~10 calls).
3. delta_zero — All Δ=0 chains (highest-risk lenient additions: verifies
                "shared scenario" claim; ~6 calls).
4. primary    — 100-chain sample of tier=primary chains (regression check
                on strict-pass quality; ~10 calls).
5. gaps       — 50-gap sample with the two between-questions in full
                (real bridge vs hallucination; ~10 calls).
6. synthesis  — 1 wrap-up call that reads category outputs and emits
                AUDIT_REPORT.md.

(NOTE: an originally-planned tier_compare category was dropped — the
lenient sweep was scoped to uncovered buckets, so 0 buckets carry both
primary and secondary chains. Per-tier quality is inferred from
categories 2 and 4 by the synthesis call.)

Outputs:

    interviews/vault/_pipeline/runs/<UTC-timestamp>/
        config.json           — what was run, with what samples
        01_drafts.json        — per-call traces (prompt, response, parsed verdict)
        02_secondary.json
        03_delta_zero.json
        04_primary.json
        05_gaps.json
        06_tier_compare.json
        07_synthesis.json
    AUDIT_REPORT.md           — human-readable rollup

Modes:

    --dry-run               # plan + show batching, don't call Gemini
    --only <category>       # run a single category (debugging)
    --skip <category,...>   # skip listed categories (debugging)

Findings only — this script never edits chains.json or any question
YAML. Issues are surfaced for human review.
"""
from __future__ import annotations
import argparse
import json
import random
import subprocess
import sys
import time
from collections import defaultdict
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
import yaml
REPO_ROOT = Path(__file__).resolve().parents[3]
VAULT_DIR = REPO_ROOT / "interviews" / "vault"
QUESTIONS_DIR = VAULT_DIR / "questions"
CHAINS_PATH = VAULT_DIR / "chains.json"
# AI-pipeline staging artifacts live under _pipeline/ (gitignored).
# See interviews/CLAUDE.md.
PIPELINE_DIR = VAULT_DIR / "_pipeline"
GAPS_PRIMARY_PATH = PIPELINE_DIR / "gaps.proposed.json"
GAPS_LENIENT_PATH = PIPELINE_DIR / "gaps.proposed.lenient.json"
AUDIT_RUNS = PIPELINE_DIR / "runs"
SCORECARD = PIPELINE_DIR / "draft-validation-scorecard.json"
GEMINI_MODEL = "gemini-3.1-pro-preview"
INTER_CALL_DELAY_S = 4
MAX_PROMPT_CHARS = 320_000 # ~80K input tokens, attention-sweet spot
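# (Back-of-envelope from the module docstring: ~50-60 calls at ~80K input
# tokens each keeps a full audit run well under the 250/day Pro cap.)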
SCENARIO_CHAR_BUDGET = 350 # truncate per-question for prompt budget
LEVEL_RANK = {"L1": 1, "L2": 2, "L3": 3, "L4": 4, "L5": 5, "L6+": 6}
CATEGORIES = ["drafts", "secondary", "delta_zero", "primary", "gaps",
"synthesis"]
# RNG seed for reproducible sampling — flip via --seed for a different draw.
RNG_SEED = 42

# ─── corpus + chain helpers ───────────────────────────────────────────────

def load_corpus() -> dict[str, dict]:
    out: dict[str, dict] = {}
    for path in QUESTIONS_DIR.rglob("*.yaml"):
        try:
            with path.open(encoding="utf-8") as f:
                d = yaml.safe_load(f)
        except Exception:
            continue
        if isinstance(d, dict) and d.get("id"):
            out[d["id"]] = d
    return out


def load_chains() -> list[dict]:
    return json.loads(CHAINS_PATH.read_text(encoding="utf-8"))


def question_payload(q: dict, *, terse: bool = False) -> dict[str, Any]:
    """Compact view of a question for prompt context. terse=True for
    cases where we have a lot of questions to fit in one call."""
    out = {
        "id": q.get("id"),
        "level": q.get("level"),
        "title": q.get("title"),
        "scenario": (q.get("scenario") or "")[:SCENARIO_CHAR_BUDGET],
        "question": q.get("question"),
    }
    if not terse:
        details = q.get("details") or {}
        out["realistic_solution"] = details.get("realistic_solution")
    return out


def chain_payload(c: dict, corpus: dict[str, dict], *, terse: bool = False) -> dict[str, Any]:
    qids = [m["id"] for m in c.get("questions", []) if m.get("id")]
    return {
        "chain_id": c["chain_id"],
        "track": c["track"],
        "topic": c["topic"],
        "tier": c.get("tier", "primary"),
        "rationale": c.get("rationale"),
        "members": [question_payload(corpus[q], terse=terse)
                    for q in qids if q in corpus],
    }

# ─── Gemini call ──────────────────────────────────────────────────────────

def call_gemini(prompt: str, *, timeout: int = 600) -> dict | None:
    try:
        result = subprocess.run(
            ["gemini", "-m", GEMINI_MODEL, "-p", prompt, "--yolo"],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return None
    out = (result.stdout or "").strip()
    if out.startswith("```"):
        out = out.strip("`")
        if out.startswith("json"):
            out = out[4:].lstrip()
    i = out.find("{")
    j = out.rfind("}")
    if i == -1 or j == -1:
        if result.returncode != 0:
            print(f" gemini exit {result.returncode}: {(result.stderr or '')[:200]}",
                  file=sys.stderr)
        return None
    try:
        return json.loads(out[i:j+1])
    except json.JSONDecodeError as e:
        print(f" JSON parse failed: {e}", file=sys.stderr)
        return None
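# Minimal usage sketch (assumes the `gemini` CLI is installed and authenticated;
# the prompt and reply here are illustrative, not a recorded run):
#   resp = call_gemini('Reply with the JSON object {"ok": true} and nothing else.')
#   # -> parsed dict on success, None on timeout or unparseable output.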

# ─── batching ─────────────────────────────────────────────────────────────

def batch_chains(items: list[dict], corpus: dict[str, dict],
                 max_chars: int = MAX_PROMPT_CHARS,
                 wrapper_chars: int = 4_000) -> list[list[dict]]:
    """Pack ~80K-token batches of full chain payloads."""
    batches: list[list[dict]] = []
    cur: list[dict] = []
    cur_chars = wrapper_chars
    for c in items:
        payload_chars = len(json.dumps(chain_payload(c, corpus)))
        if cur and cur_chars + payload_chars > max_chars:
            batches.append(cur)
            cur = []
            cur_chars = wrapper_chars
        cur.append(c)
        cur_chars += payload_chars
    if cur:
        batches.append(cur)
    return batches
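# Sizing note: the module docstring's ~10-call estimate for a 100-chain sample
# implies roughly ten full chain payloads per 320K-char batch, i.e. on the order
# of 30K serialised chars per chain (an estimate, not a measured figure).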

# ─── category 1: drafts audit ─────────────────────────────────────────────

def audit_drafts(corpus: dict[str, dict], outdir: Path) -> dict:
    if not SCORECARD.exists():
        return {"skipped": True, "reason": "no scorecard"}
    sc = json.loads(SCORECARD.read_text(encoding="utf-8"))
    drafts_qids = [r["draft_id"] for r in sc.get("rows", [])]
    drafts = [corpus[q] for q in drafts_qids if q in corpus]
    # Pack 2 drafts per call so each gets a substantial context window
    # for the same-bucket exemplars and the full draft body.
    batches = [drafts[i:i+2] for i in range(0, len(drafts), 2)]
    rows: list[dict] = []
    for i, batch in enumerate(batches, start=1):
        payload: list[dict] = []
        for d in batch:
            same_bucket = [
                question_payload(q) for q in corpus.values()
                if q.get("track") == d.get("track")
                and q.get("topic") == d.get("topic")
                and q.get("level") == d.get("level")
                and q.get("id") != d.get("id")
                and q.get("status") == "published"
            ][:5]
            payload.append({
                "draft": question_payload(d),
                "draft_id": d.get("id"),
                "same_level_neighbours": same_bucket,
                "tags": d.get("tags") or [],
                "authors": d.get("authors"),
                "human_reviewed": d.get("human_reviewed"),
            })
        prompt = f"""You are an ML systems interview-question reviewer running an
INDEPENDENT QUALITY CHECK on LLM-authored draft questions. They have already
passed: Pydantic schema, BAAI/bge-small-en-v1.5 cosine vs in-bucket
neighbours (<0.92), and three Gemini-judge gates (level_fit, coherence,
bridge). Your job is to surface failure modes those gates routinely miss:
- vendor-name fabrication (made-up hardware / benchmarks / model names)
- subtle cognitive-load drift (the level field claims L4 but the question
  is actually L2-shaped)
- factually wrong but internally-consistent answers
- low-quality scenarios that read as ML cosplay rather than a real situation
For each candidate, return STRICT JSON (no prose, no fences) of the shape:
{{
  "drafts": [
    {{
      "draft_id": "<id>",
      "verdict": "accept" | "edit" | "reject",
      "fabrication_check": "yes" | "no" | "unclear",  // any made-up vendor/benchmark/model?
      "level_match": "yes" | "no" | "unclear",  // does cognitive load match the level field?
      "answer_correctness": "yes" | "no" | "unclear",  // is realistic_solution correct?
      "scenario_realism": "yes" | "no" | "unclear",
      "rationale": "<one or two sentences with the SPECIFIC issue if not accept>"
    }}
  ]
}}
INPUT:
{json.dumps(payload, indent=2)}
"""
        print(f" [drafts] call {i}/{len(batches)} — {len(prompt)//1000}K char prompt")
        resp = call_gemini(prompt)
        rows.append({"call_idx": i, "draft_ids": [d["id"] for d in batch],
                     "prompt_chars": len(prompt), "response": resp})
        with (outdir / "01_drafts.json").open("w") as f:
            json.dump(rows, f, indent=2)
        if i < len(batches):
            time.sleep(INTER_CALL_DELAY_S)
    return {"calls": len(rows), "rows": rows}

# ─── category 2/3/4: chain sample audits (shared shape) ───────────────────

def audit_chain_sample(
    chains: list[dict],
    corpus: dict[str, dict],
    *,
    label: str,
    outname: str,
    outdir: Path,
    instructions: str,
    extra_fields: str = "",
) -> dict:
    if not chains:
        return {"skipped": True, "reason": f"{label}: no chains"}
    batches = batch_chains(chains, corpus)
    rows: list[dict] = []
    for i, batch in enumerate(batches, start=1):
        payload = [chain_payload(c, corpus) for c in batch]
        prompt = f"""You are an ML systems interview question reviewer auditing
chains (pedagogical sequences of 2-6 questions through Bloom levels) for
quality. {instructions}
For each chain, return STRICT JSON of the shape:
{{
  "chains": [
    {{
      "chain_id": "<id>",
      "verdict": "good" | "weak" | "bad",
      "progression": "yes" | "no" | "unclear",  // is each step a real progression?
      "topic_unity": "yes" | "no" | "unclear",  // does the chain stay on one topic?
      "duplicate_pair": "yes" | "no" | "unclear",  // any pair too similar to be a real chain step?
      {extra_fields}
      "rationale": "<one sentence pointing to the SPECIFIC issue if not good>"
    }}
  ]
}}
Return ONLY the JSON, no prose, no fences.
INPUT:
{json.dumps(payload, indent=2)}
"""
        print(f" [{label}] call {i}/{len(batches)} — {len(payload)} chains, "
              f"{len(prompt)//1000}K char prompt")
        resp = call_gemini(prompt)
        rows.append({"call_idx": i,
                     "chain_ids": [c["chain_id"] for c in batch],
                     "prompt_chars": len(prompt), "response": resp})
        with (outdir / outname).open("w") as f:
            json.dump(rows, f, indent=2)
        if i < len(batches):
            time.sleep(INTER_CALL_DELAY_S)
    return {"calls": len(rows), "rows": rows}

# ─── category 5: gaps audit ───────────────────────────────────────────────

def audit_gaps(corpus: dict[str, dict], outdir: Path,
               limit: int, rng: random.Random) -> dict:
    gaps_all = []
    for p in [GAPS_PRIMARY_PATH, GAPS_LENIENT_PATH]:
        if p.exists():
            gaps_all.extend(json.loads(p.read_text(encoding="utf-8")))
    sampled = rng.sample(gaps_all, min(limit, len(gaps_all)))
    # Pack ~5 gaps per call (each gap brings 2 full anchor questions).
    batch_size = 5
    batches = [sampled[i:i+batch_size] for i in range(0, len(sampled), batch_size)]
    rows: list[dict] = []
    for i, batch in enumerate(batches, start=1):
        payload = []
        for g in batch:
            anchors = [corpus.get(q) for q in (g.get("between") or [])]
            if any(a is None for a in anchors):
                continue
            payload.append({
                "track": g.get("track"),
                "topic": g.get("topic"),
                "missing_level": g.get("missing_level"),
                "between_anchors": [question_payload(a) for a in anchors],
                "rationale": g.get("rationale"),
            })
        if not payload:
            continue
        prompt = f"""You are reviewing GAP DETECTION output: each entry claims
that a chain bucket is missing a question at a specific Bloom level
between two existing anchor questions. Judge whether the gap is REAL
(the two anchors share a scenario thread and a true bridge would chain
them) or HALLUCINATED (the anchors are too unrelated for a bridge to
make sense, or the missing-level is wrong).
For each gap, return STRICT JSON:
{{
  "gaps": [
    {{
      "track": "<>",
      "topic": "<>",
      "missing_level": "<>",
      "verdict": "real" | "hallucinated" | "unclear",
      "anchors_share_scenario": "yes" | "no" | "unclear",
      "level_makes_sense": "yes" | "no",
      "rationale": "<one sentence>"
    }}
  ]
}}
Return ONLY JSON, no prose.
INPUT:
{json.dumps(payload, indent=2)}
"""
        print(f" [gaps] call {i}/{len(batches)} — {len(payload)} gaps, "
              f"{len(prompt)//1000}K char prompt")
        resp = call_gemini(prompt)
        rows.append({"call_idx": i, "gap_count": len(payload),
                     "prompt_chars": len(prompt), "response": resp})
        with (outdir / "05_gaps.json").open("w") as f:
            json.dump(rows, f, indent=2)
        if i < len(batches):
            time.sleep(INTER_CALL_DELAY_S)
    return {"calls": len(rows), "rows": rows}

# ─── category 6: tier comparison ──────────────────────────────────────────

def audit_tier_compare(chains: list[dict], corpus: dict[str, dict],
                       outdir: Path, limit: int, rng: random.Random) -> dict:
    """Find buckets that have BOTH primary and secondary chains, send one
    pair per call to Gemini for side-by-side judgement."""
    by_bucket: dict[tuple[str, str], dict[str, list[dict]]] = defaultdict(
        lambda: {"primary": [], "secondary": []})
    for c in chains:
        tier = c.get("tier", "primary")
        if tier not in ("primary", "secondary"):
            continue
        by_bucket[(c["track"], c["topic"])][tier].append(c)
    candidates = [
        (bucket, lists) for bucket, lists in by_bucket.items()
        if lists["primary"] and lists["secondary"]
    ]
    if not candidates:
        return {"skipped": True, "reason": "no buckets with both tiers"}
    sampled = rng.sample(candidates, min(limit, len(candidates)))
    rows: list[dict] = []
    for i, (bucket, lists) in enumerate(sampled, start=1):
        # one primary + one secondary
        p = rng.choice(lists["primary"])
        s = rng.choice(lists["secondary"])
        payload = {
            "bucket": {"track": bucket[0], "topic": bucket[1]},
            "primary_chain": chain_payload(p, corpus),
            "secondary_chain": chain_payload(s, corpus),
        }
        prompt = f"""You are judging the tier-classification of two chains
in the same (track, topic) bucket. The PRIMARY chain came from a strict
Bloom-progression sweep (Δ ∈ {{1,2}}); the SECONDARY chain came from a
lenient second-pass that allowed Δ ∈ {{0,1,2,3}}. Judge whether the
classification is plausible: the primary should look like a cleaner,
more canonical pedagogical sequence than the secondary.
Return STRICT JSON:
{{
  "primary_genuinely_stronger": "yes" | "no" | "unclear",
  "primary_quality": "good" | "weak" | "bad",
  "secondary_quality": "good" | "weak" | "bad",
  "tier_inversion": "yes" | "no",  // is secondary actually better than primary?
  "rationale": "<one or two sentences>"
}}
Return ONLY JSON.
INPUT:
{json.dumps(payload, indent=2)}
"""
        print(f" [tier_compare] call {i}/{len(sampled)} — bucket={bucket[0]}/{bucket[1]}")
        resp = call_gemini(prompt)
        rows.append({"call_idx": i, "bucket": list(bucket),
                     "primary_chain_id": p["chain_id"],
                     "secondary_chain_id": s["chain_id"],
                     "prompt_chars": len(prompt), "response": resp})
        with (outdir / "06_tier_compare.json").open("w") as f:
            json.dump(rows, f, indent=2)
        if i < len(sampled):
            time.sleep(INTER_CALL_DELAY_S)
    return {"calls": len(rows), "rows": rows}

# ─── category 7: synthesis ────────────────────────────────────────────────

def synthesise(outdir: Path) -> dict:
    """Single call that reads category outputs and emits AUDIT_REPORT.md."""
    summary = {}
    for fname in sorted(outdir.glob("0?_*.json")):
        if fname.name.startswith("07_"):
            continue
        try:
            data = json.loads(fname.read_text())
        except Exception:
            continue
        # Extract per-call response verdicts compactly so the synthesis
        # call doesn't have to re-read full chain payloads.
        flat = []
        for row in (data if isinstance(data, list) else []):
            r = (row.get("response") or {})
            for key in ("drafts", "chains", "gaps"):
                for entry in (r.get(key) or []):
                    flat.append({"category": fname.stem, **entry})
            for key in ("primary_genuinely_stronger", "tier_inversion"):
                if key in r:
                    flat.append({"category": fname.stem, **r})
                    break
        summary[fname.stem] = flat
    summary_chars = sum(len(json.dumps(v)) for v in summary.values())
    prompt = f"""You are writing an AUDIT REPORT for an ML systems interview
question pipeline. The pipeline built 879 chains, 4 LLM-authored drafts,
and detected 407 chain gaps. You have the per-category Gemini judge
results below. Produce STRICT JSON of the shape:
{{
  "summary": "<2-3 sentence overall verdict>",
  "headline_findings": ["<finding 1>", "<finding 2>", ...],  // top 3-5 issues worth a human's attention
  "per_category": {{
    "drafts": {{"pass_rate": <0..1>, "key_issue": "<...>" }},
    "secondary": {{"pass_rate": <0..1>, "key_issue": "<...>" }},
    "delta_zero": {{"pass_rate": <0..1>, "key_issue": "<...>" }},
    "primary": {{"pass_rate": <0..1>, "key_issue": "<...>" }},
    "gaps": {{"pass_rate": <0..1>, "key_issue": "<...>" }}
  }},
  "tier_quality_delta": "<does secondary look systematically weaker than primary? one sentence>",
  "recommendations": ["<actionable recommendation 1>", ...]
}}
Return ONLY JSON, no prose, no fences.
CATEGORY RESULTS (already-distilled per-call verdicts; total {summary_chars} chars):
{json.dumps(summary, indent=2)}
"""
    print(f" [synthesis] 1 call — {len(prompt)//1000}K char prompt")
    resp = call_gemini(prompt)
    out = {"call_idx": 1, "prompt_chars": len(prompt), "response": resp,
           "summary_input": summary}
    with (outdir / "07_synthesis.json").open("w") as f:
        json.dump(out, f, indent=2)
    return out


def write_report(outdir: Path) -> Path:
    """Generate AUDIT_REPORT.md from per-category outputs + synthesis."""
    syn_path = outdir / "07_synthesis.json"
    syn = json.loads(syn_path.read_text()) if syn_path.exists() else {}
    s = (syn.get("response") or {})
    lines: list[str] = [
        "# Vault chain pipeline — independent audit report",
        "",
        f"**Generated:** {datetime.now(UTC).isoformat(timespec='seconds')}",
        f"**Auditor:** {GEMINI_MODEL} (independent of the pipeline's own judges)",
        f"**Audit run dir:** `{outdir.relative_to(REPO_ROOT)}`",
        "",
        "---",
        "",
        "## Summary",
        "",
        s.get("summary", "*(synthesis call failed; see per-category JSON)*"),
        "",
        "## Headline findings",
        "",
    ]
    for f in s.get("headline_findings", []) or []:
        lines.append(f"- {f}")
    if not s.get("headline_findings"):
        lines.append("*(no synthesis findings; see per-category JSON)*")
    lines.extend(["", "## Per-category", ""])
    for cat in ("drafts", "secondary", "delta_zero", "primary", "gaps"):
        cat_data = (s.get("per_category") or {}).get(cat) or {}
        rate = cat_data.get("pass_rate")
        rate_str = f"{rate*100:.0f}%" if isinstance(rate, (int, float)) else "n/a"
        lines.append(f"### {cat}")
        lines.append("")
        lines.append(f"- pass rate: **{rate_str}**")
        lines.append(f"- key issue: {cat_data.get('key_issue', '*(none reported)*')}")
        lines.append("")
    if s.get("tier_quality_delta"):
        lines.append("### Tier quality delta (primary vs secondary)\n")
        lines.append(s["tier_quality_delta"])
        lines.append("")
    lines.append("## Recommendations\n")
    for r in s.get("recommendations", []) or []:
        lines.append(f"- {r}")
    lines.extend([
        "",
        "---",
        "",
        f"Per-call traces are in `{outdir.relative_to(REPO_ROOT)}/`. "
        "Each `0N_*.json` file contains the prompt-char count, the IDs in "
        "scope, and the raw Gemini response. Use these for ground-truth "
        "follow-up — the synthesis above is one model's compression of "
        "the underlying judgements.",
    ])
    report = outdir.parent / "AUDIT_REPORT.md"
    report.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return report

# ─── main ─────────────────────────────────────────────────────────────────

def main() -> int:
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--dry-run", action="store_true",
                    help="show plan and batching, don't call Gemini")
    ap.add_argument("--only", choices=CATEGORIES, default=None,
                    help="run a single category (debugging)")
    ap.add_argument("--skip", default="",
                    help="comma-separated categories to skip")
    ap.add_argument("--secondary-sample", type=int, default=100)
    ap.add_argument("--primary-sample", type=int, default=100)
    ap.add_argument("--gap-sample", type=int, default=50)
    ap.add_argument("--bucket-pairs", type=int, default=15)
    ap.add_argument("--seed", type=int, default=RNG_SEED)
    args = ap.parse_args()
    skipped = set(c.strip() for c in args.skip.split(",") if c.strip())
    rng = random.Random(args.seed)

    print("loading corpus + chains…")
    corpus = load_corpus()
    chains = load_chains()
    print(f" corpus={len(corpus)}, chains={len(chains)} "
          f"({sum(1 for c in chains if c.get('tier') == 'primary')} primary / "
          f"{sum(1 for c in chains if c.get('tier') == 'secondary')} secondary)")

    # Δ=0 chains: any consecutive pair with same level
    delta_zero_chains = [
        c for c in chains
        if any(LEVEL_RANK.get(c["questions"][i+1]["level"], 0)
               - LEVEL_RANK.get(c["questions"][i]["level"], 0) == 0
               for i in range(len(c["questions"])-1))
    ]
    primary_chains = [c for c in chains if c.get("tier", "primary") == "primary"]
    secondary_chains = [c for c in chains if c.get("tier") == "secondary"]
    primary_sample = rng.sample(primary_chains,
                                min(args.primary_sample, len(primary_chains)))
    secondary_sample = rng.sample(secondary_chains,
                                  min(args.secondary_sample, len(secondary_chains)))

    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
    outdir = AUDIT_RUNS / timestamp
    outdir.mkdir(parents=True, exist_ok=True)

    plan = {
        "drafts": 2,
        "secondary": len(batch_chains(secondary_sample, corpus)),
        "delta_zero": len(batch_chains(delta_zero_chains, corpus)),
        "primary": len(batch_chains(primary_sample, corpus)),
        "gaps": (args.gap_sample + 4) // 5,  # 5/call
        "synthesis": 1,
    }
    print(f"\nbatching plan ({sum(plan.values())} total calls):")
    for k, v in plan.items():
        marker = "*" if (args.only and args.only != k) or k in skipped else " "
        print(f" {marker} {k:14s} {v} call(s)")

    config = {
        "timestamp": timestamp,
        "seed": args.seed,
        "samples": {
            "secondary": len(secondary_sample),
            "primary": len(primary_sample),
            "delta_zero": len(delta_zero_chains),
            "gap_sample": args.gap_sample,
            "bucket_pairs": args.bucket_pairs,
        },
        "plan": plan,
    }
    (outdir / "config.json").write_text(json.dumps(config, indent=2))
    if args.dry_run:
        print(f"\n--dry-run set; wrote {outdir / 'config.json'}")
        return 0

    def should(cat: str) -> bool:
        if args.only and args.only != cat:
            return False
        return cat not in skipped

    if should("drafts"):
        print("\n[1] drafts audit")
        audit_drafts(corpus, outdir)
        time.sleep(INTER_CALL_DELAY_S)
    if should("secondary"):
        print("\n[2] secondary chain sample audit")
        audit_chain_sample(
            secondary_sample, corpus,
            label="secondary", outname="02_secondary.json", outdir=outdir,
            instructions="These chains came from a LENIENT second-pass coverage "
                         "build (Δ ∈ {0,1,2,3}). Be especially attentive to "
                         "consecutive-pair quality, since the lenient sweep is "
                         "where weak chains are likeliest to slip through.",
        )
        time.sleep(INTER_CALL_DELAY_S)
    if should("delta_zero"):
        print("\n[3] Δ=0 chain audit")
        audit_chain_sample(
            delta_zero_chains, corpus,
            label="delta_zero", outname="03_delta_zero.json", outdir=outdir,
            instructions="These chains contain at least one same-level (Δ=0) "
                         "consecutive pair. The lenient prompt allowed Δ=0 ONLY "
                         "when both questions share a scenario thread. Verify "
                         "that claim per-pair.",
            extra_fields='"shared_scenario_for_d0_pair": "yes" | "no" | "unclear",',
        )
        time.sleep(INTER_CALL_DELAY_S)
    if should("primary"):
        print("\n[4] primary chain sample audit")
        audit_chain_sample(
            primary_sample, corpus,
            label="primary", outname="04_primary.json", outdir=outdir,
            instructions="These chains came from the STRICT first-pass build "
                         "(Δ ∈ {1,2}). This is a regression check on strict-pass "
                         "quality; failures here suggest the original chain "
                         "rebuild itself has issues, not just the lenient sweep.",
        )
        time.sleep(INTER_CALL_DELAY_S)
    if should("gaps"):
        print("\n[5] gap detection audit")
        audit_gaps(corpus, outdir, args.gap_sample, rng)
        time.sleep(INTER_CALL_DELAY_S)
    if should("synthesis"):
        print("\n[6] synthesis")
        synthesise(outdir)
        report = write_report(outdir)
        print(f"\nwrote {report.relative_to(REPO_ROOT)}")

    print(f"\nDONE. Audit run dir: {outdir.relative_to(REPO_ROOT)}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())