"""Tests for the canonical hashing layer. Key invariants: - Same semantic content hashes identically regardless of YAML key order. - Whitelist fields drive the hash; metadata doesn't. - Merkle construction stable across re-ordering of leaves. """ from __future__ import annotations from vault_cli.hashing import CANON_VERSION, content_hash, release_hash def _base_question() -> dict: return { "id": "global-0000", "title": "Example", "topic": "kv-cache-management", "chain": {"id": "global-chain-000", "position": 1}, "status": "published", "scenario": "Explain KV-cache.", "details": {"realistic_solution": "Paged attention."}, "tags": ["a", "b"], "provenance": "human", } def test_content_hash_stable_across_key_reorder() -> None: """Soumith M-NEW-4 / R3-F-4: top-level AND nested-dict hashing must be key-order-invariant. The prior version of this test only reordered top-level keys and collapsed ``details`` to a single-key dict, so it didn't actually exercise the nested-dict claim. Extended to also reorder ``details`` and ``chain``. """ q1 = _base_question() # Reorder top-level keys AND nested dicts. q2 = {k: q1[k] for k in reversed(list(q1.keys()))} q2["details"] = { "napkin_math": "X", # add a second nested key so the order matters "common_mistake": "Y", "realistic_solution": q1["details"]["realistic_solution"], } q2["chain"] = {"position": q1["chain"]["position"], "id": q1["chain"]["id"]} # Mirror on q1 to keep the semantic payload identical. q1 = dict(q1) q1["details"] = dict(q2["details"]) # Now q1 and q2 share the same semantic fields; only key insertion order differs. assert content_hash(q1) == content_hash(q2) def test_content_hash_nested_dict_order_invariance() -> None: """Explicit nested-order test with a 3-key ``details`` — must hash identical regardless of ``details`` key insertion order. """ base = _base_question() base["details"] = { "realistic_solution": "ANS", "common_mistake": "WRONG", "napkin_math": "2 + 2 = 4", } reordered = dict(base) reordered["details"] = { "napkin_math": "2 + 2 = 4", "realistic_solution": "ANS", "common_mistake": "WRONG", } assert content_hash(base) == content_hash(reordered) def test_content_hash_excludes_metadata() -> None: """Hash must NOT change when last_modified or file_path changes.""" q1 = _base_question() q2 = dict(q1) q2["last_modified"] = "2050-01-01T00:00:00Z" q2["file_path"] = "/tmp/foo.yaml" q2["authors"] = ["someone"] assert content_hash(q1) == content_hash(q2) def test_content_hash_changes_with_semantic_edit() -> None: """Hash MUST change when scenario changes.""" q1 = _base_question() q2 = dict(q1) q2["scenario"] = "An edited scenario." assert content_hash(q1) != content_hash(q2) def test_release_hash_includes_canon_and_policy_leaves() -> None: """Chip N-H5: release_hash must bind canon version and policy.""" leaves = [("a", "1" * 64), ("b", "2" * 64)] base = release_hash( per_question=leaves, taxonomy_hash="t" * 64, chains_hash="c" * 64, zones_hash="z" * 64, policy_hash="p" * 64, ) # Different policy_hash → different release_hash different_policy = release_hash( per_question=leaves, taxonomy_hash="t" * 64, chains_hash="c" * 64, zones_hash="z" * 64, policy_hash="P" * 64, ) assert base != different_policy # Different canon_version → different release_hash different_canon = release_hash( per_question=leaves, taxonomy_hash="t" * 64, chains_hash="c" * 64, zones_hash="z" * 64, policy_hash="p" * 64, canon_version=999, ) assert base != different_canon def test_canon_version_is_pinned() -> None: assert isinstance(CANON_VERSION, int) assert CANON_VERSION >= 1