mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 02:03:55 -05:00
Action on the strongest finding from the 2026-05-01 independent audit:
54 of 55 Δ=0 chains had no shared scenario (the "two questions
sharing a scenario thread" constraint the lenient prompt was supposed
to enforce). Two independent audit fields agreed (verdict=bad and
shared_scenario=no), so this isn't a tuning question — the design
choice was wrong.
Why remove Δ=0 entirely rather than tighten the prompt:
- The chain definition is "pedagogical progression through Bloom
levels"; same-level edges contradict the definition.
- The "shared scenario / different angle" carve-out is unenforceable
by an LLM at corpus scale (audit confirmed).
- Same-scenario same-level pairs are more honestly modeled as
siblings of a chain anchor, not as chain members.
Changes:
- chains.json: 879 → 824. Dropped: 55 chains (all tier=secondary,
since Δ=0 was only ever produced by the lenient sweep).
Per-track: edge -19, tinyml -12, mobile -10, cloud -7, global -7.
- build_chains_with_gemini.py:
MODE_CONFIG["lenient"]["allowed_deltas"]: {0,1,2,3} → {1,2,3}
LENIENT_PROMPT_TEMPLATE: Δ=0 paragraph rewritten to explicitly
REJECT same-level pairs (with rationale citing the audit).
docstring + --mode help text updated.
- tests/test_chain_validation.py:
test_lenient_accepts_same_level_pair → test_lenient_rejects_same_level_pair
header docstring updated to reflect the new rule.
- vault-manifest.json: chainCount 879 → 824, releaseHash rolls to
479811040b7a… (a real content delta, not timestamp churn).
Validation:
- vault check --strict: 10,705 loaded, 0 failures
- vault build --local-json: chainCount=824, releaseHash=479811040b…
- pytest: 74/74
- playwright chain-and-vault-smoke: 19/19 (fixtures cloud-0001 +
cloud-0231 are still in their chains post-drop)
Audit findings #2 (gap detection is ~50% noise) and #3 (disposition of
the 4 pilot drafts) remain open — see CHAIN_ROADMAP.md Progress Log.
198 lines
6.0 KiB
Python
198 lines
6.0 KiB
Python
"""Tests for ``validate_chain`` in ``scripts/build_chains_with_gemini.py``.
|
|
|
|

Phase 1.3 of CHAIN_ROADMAP.md added a ``mode`` parameter that toggles the
allowed Bloom-level deltas:

    strict  → Δ ∈ {1, 2}
    lenient → Δ ∈ {1, 2, 3}

These tests pin both directions: that lenient mode accepts a Δ=3
missing-rung jump that strict mode rejects, and that both modes still
reject Δ=0 same-level edges, backward deltas, multi-topic chains, and
out-of-range chain sizes.

(Δ=0 was originally allowed under lenient for "shared scenario,
different angle" pairs. The 2026-05-01 audit found 54/55 such chains
had no shared scenario in practice, so Δ=0 was removed from lenient
on 2026-05-02.)

The script lives outside the importable ``vault_cli`` package, so we load
it via ``importlib.util`` rather than a normal import.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Absolute path of the chain-builder script under test. ``parents[1]`` is the
# parent of the directory holding this test file; the script sits in its
# ``scripts/`` subdirectory.
SCRIPT = (
    Path(__file__).resolve().parents[1]
    / "scripts"
    / "build_chains_with_gemini.py"
)
|
|
|
|
|
|
@pytest.fixture(scope="module")
def chain_module():
    """Load the chain-builder script at ``SCRIPT`` and return it as a module.

    The fixture is module-scoped, so the script is executed once for this
    test file rather than once per test. It is loaded by file path under
    the private name ``_build_chains``.
    """
    spec = importlib.util.spec_from_file_location("_build_chains", SCRIPT)
    # Fail with a readable message instead of an AttributeError further down
    # if no usable spec/loader could be created for the path.
    assert spec and spec.loader, f"cannot build an import spec for {SCRIPT}"
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
|
|
|
|
|
@pytest.fixture
def corpus():
    """Mini corpus: same topic at L1..L6+, plus one Δ=0 pair and one off-topic.

    Maps question id → ``{"level", "topic", "track"}``. ``x-3a``/``x-3b``
    share level L3 (the Δ=0 pair); ``y-2`` is the only entry on topic ``u``.
    """
    return {
        "x-1": {"level": "L1", "topic": "t", "track": "edge"},
        "x-2": {"level": "L2", "topic": "t", "track": "edge"},
        "x-3a": {"level": "L3", "topic": "t", "track": "edge"},
        "x-3b": {"level": "L3", "topic": "t", "track": "edge"},
        "x-4": {"level": "L4", "topic": "t", "track": "edge"},
        "x-5": {"level": "L5", "topic": "t", "track": "edge"},
        "x-6": {"level": "L6+", "topic": "t", "track": "edge"},
        "y-2": {"level": "L2", "topic": "u", "track": "edge"},
    }
|
|
|
|
|
|
@pytest.fixture
def bucket(corpus):
    """Return the set of every question id in ``corpus``.

    Passed to ``validate_chain`` as its bucket argument, so any id absent
    from the corpus is also absent from the bucket.
    """
    return set(corpus.keys())
|
|
|
|
|
|
def _chain(*qids):
|
|
return {"questions": list(qids)}
|
|
|
|
|
|
# --- strict mode --------------------------------------------------------
|
|
|
|
def test_strict_accepts_plus_one_progression(chain_module, bucket, corpus):
    """L1 → L2 → L3 (Δ=1 on every edge) validates under strict."""
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-2", "x-3a"), bucket, corpus, mode="strict"
    )
    # ``why`` doubles as the failure message so a rejection explains itself.
    assert ok, why
|
|
|
|
|
|
def test_strict_accepts_plus_two_jump(chain_module, bucket, corpus):
    """L1 → L3 (Δ=2) validates under strict."""
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-3a"), bucket, corpus, mode="strict"
    )
    assert ok, why
|
|
|
|
|
|
def test_strict_rejects_same_level_pair(chain_module, bucket, corpus):
    """Two L3 questions (Δ=0) are rejected under strict, with a Δ reason."""
    ok, why = chain_module.validate_chain(
        _chain("x-3a", "x-3b"), bucket, corpus, mode="strict"
    )
    assert not ok
    assert "Δ=" in why
|
|
|
|
|
|
def test_strict_rejects_three_step_jump(chain_module, bucket, corpus):
    """L3 → L6+ (Δ=3) is rejected under strict, with a Δ reason."""
    ok, why = chain_module.validate_chain(
        _chain("x-3a", "x-6"), bucket, corpus, mode="strict"
    )
    assert not ok
    assert "Δ=" in why
|
|
|
|
|
|
def test_strict_rejects_backward_step(chain_module, bucket, corpus):
    """A descending edge (L2 → L1) is rejected under strict."""
    ok, why = chain_module.validate_chain(
        _chain("x-2", "x-1"), bucket, corpus, mode="strict"
    )
    # Only the verdict is pinned here; the reason text is not asserted.
    assert not ok
|
|
|
|
|
|
# --- lenient mode -------------------------------------------------------
|
|
|
|
def test_lenient_rejects_same_level_pair(chain_module, bucket, corpus):
    """Δ=0 is rejected under lenient as of 2026-05-02. The previous "shared
    scenario / different angle" carve-out was removed after the audit
    found 54/55 Δ=0 chains in chains.json had no actual shared scenario."""
    ok, why = chain_module.validate_chain(
        _chain("x-3a", "x-3b"), bucket, corpus, mode="lenient"
    )
    assert not ok
    assert "Δ=" in why
|
|
|
|
|
|
def test_lenient_accepts_three_step_jump(chain_module, bucket, corpus):
    """Δ=3 is allowed under lenient when no smaller intermediate exists."""
    ok, why = chain_module.validate_chain(
        _chain("x-3a", "x-6"), bucket, corpus, mode="lenient"
    )
    assert ok, why
|
|
|
|
|
|
def test_lenient_accepts_mixed_long_chain(chain_module, bucket, corpus):
    """A five-question chain L1 → L2 → L3 → L5 → L6+ validates under lenient.

    Edge deltas are 1, 1, 2, 1, so the chain mixes step sizes.
    """
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-2", "x-3a", "x-5", "x-6"),
        bucket, corpus, mode="lenient",
    )
    assert ok, why
|
|
|
|
|
|
def test_lenient_still_rejects_backward(chain_module, bucket, corpus):
    """A descending edge (L2 → L1) is rejected under lenient too."""
    ok, why = chain_module.validate_chain(
        _chain("x-2", "x-1"), bucket, corpus, mode="lenient"
    )
    # Only the verdict is pinned here; the reason text is not asserted.
    assert not ok
|
|
|
|
|
|
def test_lenient_rejects_four_step_jump(chain_module, bucket, corpus):
    """Δ=4 (e.g., L1→L5) stays out of bounds even under lenient."""
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-5"), bucket, corpus, mode="lenient"
    )
    assert not ok
|
|
|
|
|
|
# --- both modes ---------------------------------------------------------
|
|
|
|
@pytest.mark.parametrize("mode", ["strict", "lenient"])
def test_size_below_two_rejected(chain_module, bucket, corpus, mode):
    """A single-question chain is rejected in both modes with a size reason."""
    ok, why = chain_module.validate_chain(
        _chain("x-1"), bucket, corpus, mode=mode
    )
    assert not ok
    assert "size" in why
|
|
|
|
|
|
@pytest.mark.parametrize("mode", ["strict", "lenient"])
def test_size_above_six_rejected(chain_module, bucket, corpus, mode):
    """A seven-question chain is rejected in both modes with a size reason.

    The chain also contains the same-level pair ``x-3a``/``x-3b``, so the
    ``"size"`` assertion additionally pins that size is the reported reason.
    """
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-2", "x-3a", "x-3b", "x-4", "x-5", "x-6"),
        bucket, corpus, mode=mode,
    )
    assert not ok
    assert "size" in why
|
|
|
|
|
|
@pytest.mark.parametrize("mode", ["strict", "lenient"])
def test_multi_topic_rejected(chain_module, bucket, corpus, mode):
    """A chain spanning topics ``t`` and ``u`` is rejected in both modes."""
    ok, why = chain_module.validate_chain(
        _chain("x-1", "y-2"), bucket, corpus, mode=mode
    )
    assert not ok
    assert "multi-topic" in why
|
|
|
|
|
|
@pytest.mark.parametrize("mode", ["strict", "lenient"])
def test_unknown_qid_rejected(chain_module, bucket, corpus, mode):
    """A chain naming ``x-99``, which is not in the bucket, is rejected in both modes."""
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-99"), bucket, corpus, mode=mode
    )
    assert not ok
    assert "not in bucket" in why
|
|
|
|
|
|
def test_unknown_mode_rejected(chain_module, bucket, corpus):
    """A mode other than strict/lenient is rejected, even for an otherwise valid Δ=1 chain."""
    ok, why = chain_module.validate_chain(
        _chain("x-1", "x-2"), bucket, corpus, mode="moderate"
    )
    assert not ok
    assert "unknown mode" in why
|