cs249r_book/interviews/vault/schema.py

"""Pydantic schema for the StaffML question corpus (schema v1.0).

Enum values are imported from :mod:`vault.schema.enums` — the single source
of truth. Do not redefine them here. See ``schema/question_schema.yaml`` for
the canonical LinkML schema.

This module validates dict records (e.g. loaded from corpus.json or from
YAMLs via ``vault-cli``'s loader). The authoritative on-disk format is the
per-question YAML described in ``schema/question_schema.yaml``.
"""

from __future__ import annotations

import sys
from pathlib import Path
from typing import Optional

from pydantic import BaseModel, field_validator, model_validator

# Import enums from the single source of truth. Because `vault/` is not a
# conventional Python package, we add the schema directory to sys.path.
# The directory is inserted at index 0, so it is searched before every other
# sys.path entry when the bare `enums` module is imported below.
_THIS_DIR = Path(__file__).resolve().parent  # directory containing this file
_SCHEMA_DIR = _THIS_DIR / "schema"  # `schema/` directory next to this file
# Skip the insert when the directory is already listed on sys.path.
if str(_SCHEMA_DIR) not in sys.path:
    sys.path.insert(0, str(_SCHEMA_DIR))

from enums import (  # noqa: E402  type: ignore[import-not-found]
    VALID_BLOOM_LEVELS,
    VALID_COMPETENCY_AREAS,
    VALID_HUMAN_REVIEW_STATUSES,
    VALID_LEVELS,
    VALID_PHASES,
    VALID_PROVENANCES,
    VALID_STATUSES,
    VALID_TOPICS,
    VALID_TRACKS,
    VALID_ZONES,
)


class Resource(BaseModel):
    """An external reference (display name plus link) curated by the author.

    Validation rules:
      * ``name`` must contain non-whitespace text and be at most 200
        characters long (length is measured on the unstripped value).
      * ``url`` must begin with the literal prefix ``https://``.
    """

    name: str
    url: str

    @field_validator("name")
    @classmethod
    def name_non_empty(cls, v: str) -> str:
        """Reject blank or over-long names; return the value unchanged."""
        if v.strip() == "":
            raise ValueError("Resource.name must be non-empty")
        n_chars = len(v)
        if n_chars > 200:
            raise ValueError(f"Resource.name too long ({n_chars} chars, max 200)")
        return v

    @field_validator("url")
    @classmethod
    def url_is_https(cls, v: str) -> str:
        """Require an ``https://`` prefix; return the value unchanged."""
        if v.startswith("https://"):
            return v
        # Only the first 40 characters are echoed back in the error message.
        shown = v[:40]
        raise ValueError(f"Resource.url must start with https:// (got: {shown!r})")


class Visual(BaseModel):
    """Optional diagram/figure attached to a question.

    Visuals live as separate asset files under
    ``interviews/vault/visuals/<track>/<path>`` so the SVG text does
    not contaminate YAML diffs and existing SVG tooling (Inkscape,
    formatters, linters) works unchanged. The bundle-build step copies
    these into the Next.js ``public/question-visuals/`` tree. The
    practice page renders them between the scenario and the
    ``question`` callout — context → diagram → ask, mirroring how an
    interviewer would flow the question in person.

    Validation summary:
      * ``kind`` must be ``svg`` or ``mermaid``.
      * ``path`` must be a non-empty bare filename (no ``/``, ``\\`` or
        ``..``), at most 120 characters, and must end in ``.svg`` when
        ``kind`` is ``svg``.
      * ``alt`` must be non-blank and at most 400 characters.
      * ``caption``, when given, must be at most 120 characters.
    """

    kind: str = "svg"
    """Renderer kind. The schema accepts ``svg`` and ``mermaid``; per the
    original design note the MVP renderer handles `svg` only, with
    `mermaid` (inline text), `roofline` (parameterized React component),
    etc. planned for later. The renderer dispatches on this field."""

    path: str
    """Asset filename relative to ``interviews/vault/visuals/<track>/``.
    Must end in ``.svg`` for ``kind=svg``. No path traversal."""

    alt: str
    """Accessibility description for screen readers and fallback when
    the SVG fails to load. Required — a visual with no alt is an
    accessibility regression, not an optional add-on."""

    caption: Optional[str] = None
    """Author-facing caption rendered below the figure. Short — max
    120 chars. Optional; the alt text handles the semantic payload."""

    @field_validator("kind")
    @classmethod
    def valid_kind(cls, v: str) -> str:
        """Accept only the renderer kinds known to the schema."""
        if v not in {"svg", "mermaid"}:
            raise ValueError(f"Visual.kind must be 'svg' or 'mermaid' (got {v!r})")
        return v

    @field_validator("path")
    @classmethod
    def safe_path(cls, v: str) -> str:
        """Require a non-empty bare filename of at most 120 characters."""
        if not v.strip():
            raise ValueError("Visual.path must be non-empty")
        # Any separator or ".." is rejected outright, so the value can only
        # name a file directly inside the per-track visuals directory.
        if "/" in v or "\\" in v or ".." in v:
            raise ValueError(
                f"Visual.path must be a bare filename, no traversal (got {v!r})"
            )
        if len(v) > 120:
            raise ValueError(f"Visual.path too long ({len(v)} chars, max 120)")
        return v

    @field_validator("alt")
    @classmethod
    def alt_non_empty(cls, v: str) -> str:
        """Require non-blank alt text of at most 400 characters."""
        if not v.strip():
            raise ValueError("Visual.alt must be non-empty (accessibility requirement)")
        if len(v) > 400:
            raise ValueError(f"Visual.alt too long ({len(v)} chars, max 400)")
        return v

    @field_validator("caption")
    @classmethod
    def caption_length(cls, v: Optional[str]) -> Optional[str]:
        """Cap the optional caption at 120 characters; ``None`` passes through."""
        if v is not None and len(v) > 120:
            raise ValueError(f"Visual.caption too long ({len(v)} chars, max 120)")
        return v

    @model_validator(mode="after")
    def svg_path_extension(self) -> "Visual":
        """Enforce the documented ``.svg`` suffix when ``kind`` is ``svg``.

        This is a cross-field rule, so it runs after the per-field
        validators. The comparison is case-insensitive so an asset named
        ``Diagram.SVG`` is not rejected on casing alone.
        """
        if self.kind == "svg" and not self.path.lower().endswith(".svg"):
            raise ValueError(
                f"Visual.path must end in .svg for kind='svg' (got {self.path!r})"
            )
        return self


class ChainRef(BaseModel):
    """Structured chain reference with position (plural chains list item).

    Instances are the element type of ``Question.chains``, so a single
    question may carry several chain references, each with its own position.
    No validators are defined here beyond the declared field types.
    """

    # Identifier of the referenced chain (presumably a chain ID rather than a
    # question ID — confirm against the chain loader/registry).
    id: str
    # Position associated with this chain reference. Neither a valid range nor
    # the base (0- vs 1-indexed) is enforced here — TODO confirm convention.
    position: int


class HumanReview(BaseModel):
    """Record of a human reviewer's verification of a question.

    Kept separate from the LLM validation stamps. ``status`` defaults to
    ``"not-reviewed"`` and, when supplied, must be a member of
    ``VALID_HUMAN_REVIEW_STATUSES``; ``by``, ``date`` and ``notes`` are
    optional strings with no further validation in this class.
    """

    status: str = "not-reviewed"
    by: Optional[str] = None
    date: Optional[str] = None
    notes: Optional[str] = None

    @field_validator("status")
    @classmethod
    def valid_status(cls, v: str) -> str:
        """Return ``v`` when it is an allowed review status, otherwise raise."""
        if v in VALID_HUMAN_REVIEW_STATUSES:
            return v
        allowed = sorted(VALID_HUMAN_REVIEW_STATUSES)
        raise ValueError(
            f"invalid human_reviewed.status {v!r}, must be one of {allowed}"
        )


class QuestionDetails(BaseModel):
    """Answer-side content of a question.

    Holds the reference solution plus optional supporting material
    (common mistake, napkin math, curated resources) and, for
    multiple-choice questions, the option list with the index of the
    correct option.

    Validation summary:
      * ``realistic_solution`` must have at least 5 characters once
        surrounding whitespace is stripped.
      * When ``options`` is present it must contain exactly 4 entries and
        ``correct_index`` must be set to a value in ``0..3``.
    """

    realistic_solution: str
    common_mistake: str = ""
    napkin_math: str = ""
    resources: list[Resource] = []
    options: Optional[list[str]] = None
    correct_index: Optional[int] = None

    @field_validator("realistic_solution")
    @classmethod
    def realistic_solution_min_length(cls, v: str) -> str:
        """Require at least 5 non-padding characters; return ``v`` unchanged."""
        # Report the same length that is tested. Reporting the raw length
        # would produce confusing messages such as "too short (9 chars,
        # min 5)" for whitespace-padded input.
        stripped_len = len(v.strip())
        if stripped_len < 5:
            raise ValueError(f"realistic_solution too short ({stripped_len} chars, min 5)")
        return v

    @model_validator(mode="after")
    def mcq_consistency(self) -> "QuestionDetails":
        """Check that MCQ fields agree with each other when options exist.

        Raises:
            ValueError: if ``options`` is set but does not have exactly 4
                entries, or ``correct_index`` is missing or outside 0-3.
        """
        # NOTE(review): a ``correct_index`` supplied without ``options`` is
        # not rejected here — confirm whether that combination is intended.
        if self.options is not None:
            if len(self.options) != 4:
                raise ValueError(f"MCQ must have exactly 4 options, got {len(self.options)}")
            if self.correct_index is None:
                raise ValueError("MCQ has options but missing correct_index")
            if not (0 <= self.correct_index <= 3):
                raise ValueError(f"correct_index must be 0-3, got {self.correct_index}")
        return self


class Question(BaseModel):
    """A StaffML question (schema v1.0). Every classification axis is a field.

    Validation summary:
      * ``track``, ``level``, ``zone``, ``topic``, ``competency_area``,
        ``status`` and ``provenance`` must each be a member of the matching
        ``VALID_*`` collection imported from ``enums``.
      * ``bloom_level`` may be empty; a non-empty value must be in
        ``VALID_BLOOM_LEVELS``.
      * ``phase`` may be ``None``; any other value must be in ``VALID_PHASES``.
      * ``title`` needs at least 3 and ``scenario`` at least 30 characters,
        both measured after stripping surrounding whitespace.
    """

    # Identity
    schema_version: str = "1.0"
    id: str

    # 4-axis classification
    track: str
    level: str
    zone: str
    topic: str
    competency_area: str
    bloom_level: str = ""
    phase: Optional[str] = None

    # Content
    title: str
    scenario: str
    # Explicit interrogative — the one-sentence ask derived from scenario
    # + details.realistic_solution. The practice page renders this field
    # as the "Your task" callout when it is not already duplicated by the
    # scenario text.
    question: Optional[str] = None
    visual: Optional[Visual] = None
    details: QuestionDetails

    # Workflow
    status: str = "draft"
    provenance: str = "imported"
    requires_explanation: Optional[bool] = None
    expected_time_minutes: Optional[int] = None
    deletion_reason: Optional[str] = None

    # Chain membership (plural)
    chains: list[ChainRef] = []

    # LLM validation
    validated: Optional[bool] = None
    validation_status: Optional[str] = None
    validation_date: Optional[str] = None
    validation_model: Optional[str] = None
    validation_issues: Optional[list[str]] = None
    validation_status_pro: Optional[str] = None
    validation_issues_pro: Optional[list[str]] = None

    # Math validation
    math_verified: Optional[bool] = None
    math_status: Optional[str] = None
    math_date: Optional[str] = None
    math_model: Optional[str] = None
    math_issues: Optional[list[str]] = None

    # Human review (new in v1.0)
    human_reviewed: Optional[HumanReview] = None

    # Pro-model classification review notes
    classification_review: Optional[str] = None

    # Tags + temporal
    tags: list[str] = []
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
    last_modified: Optional[str] = None

    @field_validator("track")
    @classmethod
    def valid_track(cls, v: str) -> str:
        """Require ``track`` to be a member of ``VALID_TRACKS``."""
        if v not in VALID_TRACKS:
            raise ValueError(f"Invalid track {v!r}, must be one of {sorted(VALID_TRACKS)}")
        return v

    @field_validator("level")
    @classmethod
    def valid_level(cls, v: str) -> str:
        """Require ``level`` to be a member of ``VALID_LEVELS``."""
        if v not in VALID_LEVELS:
            raise ValueError(f"Invalid level {v!r}, must be one of {sorted(VALID_LEVELS)}")
        return v

    @field_validator("zone")
    @classmethod
    def valid_zone(cls, v: str) -> str:
        """Require ``zone`` to be a member of ``VALID_ZONES``."""
        if v not in VALID_ZONES:
            raise ValueError(f"Invalid zone {v!r}, must be one of {sorted(VALID_ZONES)}")
        return v

    @field_validator("topic")
    @classmethod
    def valid_topic(cls, v: str) -> str:
        """Require ``topic`` to be a member of ``VALID_TOPICS``."""
        # The message reports only the size of the list, not its contents.
        if v not in VALID_TOPICS:
            raise ValueError(f"Invalid topic {v!r} (not in {len(VALID_TOPICS)}-topic curated list)")
        return v

    @field_validator("competency_area")
    @classmethod
    def valid_area(cls, v: str) -> str:
        """Require ``competency_area`` to be in ``VALID_COMPETENCY_AREAS``."""
        if v not in VALID_COMPETENCY_AREAS:
            raise ValueError(
                f"Invalid competency_area {v!r}, must be one of {sorted(VALID_COMPETENCY_AREAS)}"
            )
        return v

    @field_validator("bloom_level")
    @classmethod
    def valid_bloom(cls, v: str) -> str:
        """Allow an empty ``bloom_level``; otherwise require a known level."""
        if v and v not in VALID_BLOOM_LEVELS:
            raise ValueError(
                f"Invalid bloom_level {v!r}, must be one of {sorted(VALID_BLOOM_LEVELS)}"
            )
        return v

    @field_validator("phase")
    @classmethod
    def valid_phase(cls, v: Optional[str]) -> Optional[str]:
        """Allow ``None``; otherwise require a member of ``VALID_PHASES``."""
        if v is not None and v not in VALID_PHASES:
            raise ValueError(f"Invalid phase {v!r}, must be one of {sorted(VALID_PHASES)}")
        return v

    @field_validator("status")
    @classmethod
    def valid_status(cls, v: str) -> str:
        """Require ``status`` to be a member of ``VALID_STATUSES``."""
        if v not in VALID_STATUSES:
            raise ValueError(f"Invalid status {v!r}, must be one of {sorted(VALID_STATUSES)}")
        return v

    @field_validator("provenance")
    @classmethod
    def valid_provenance(cls, v: str) -> str:
        """Require ``provenance`` to be a member of ``VALID_PROVENANCES``."""
        if v not in VALID_PROVENANCES:
            raise ValueError(
                f"Invalid provenance {v!r}, must be one of {sorted(VALID_PROVENANCES)}"
            )
        return v

    @field_validator("title")
    @classmethod
    def title_min_length(cls, v: str) -> str:
        """Require at least 3 non-padding characters; return ``v`` unchanged."""
        # Report the stripped length, which is the quantity actually tested,
        # so whitespace-padded titles do not yield contradictory messages.
        stripped_len = len(v.strip())
        if stripped_len < 3:
            raise ValueError(f"title too short ({stripped_len} chars, min 3)")
        return v

    @field_validator("scenario")
    @classmethod
    def scenario_quality(cls, v: str) -> str:
        """Require at least 30 non-padding characters; return ``v`` unchanged."""
        # Same reasoning as for ``title``: report the length that was tested.
        stripped_len = len(v.strip())
        if stripped_len < 30:
            raise ValueError(f"scenario too short ({stripped_len} chars, min 30)")
        return v


def validate_corpus(questions: list[dict]) -> tuple[list["Question"], list[str], list[str]]:
    """Validate a list of question dicts against the schema.

    Each record is parsed into a :class:`Question`. A record that fails to
    parse is reported in ``errors`` and skipped; it never aborts the run,
    even when the record is not a mapping at all (e.g. ``None`` or a string).

    Args:
        questions: Raw question records, normally dicts.

    Returns:
        ``(valid_questions, errors, warnings)`` where

        * ``valid_questions`` holds every record that parsed, in input
          order. Records sharing an ID are all kept; the clash is reported
          in ``errors``.
        * ``errors`` holds one ``"[<id>] <exception>"`` line per failed
          record (``index-<i>`` stands in when no ID is available),
          followed by one line per ID that occurs more than once among the
          valid questions.
        * ``warnings`` holds one line for each valid question whose
          ``(track, level, title)`` repeats that of an earlier one.
    """
    valid: list[Question] = []
    errors: list[str] = []
    warnings: list[str] = []

    for i, q_dict in enumerate(questions):
        try:
            q = Question(**q_dict)
        except Exception as e:
            # A non-mapping record has no ``.get``. Calling it blindly would
            # raise inside this handler and abort the whole validation run.
            fallback_id = f"index-{i}"
            lookup = getattr(q_dict, "get", None)
            qid = lookup("id", fallback_id) if callable(lookup) else fallback_id
            errors.append(f"[{qid}] {e}")
        else:
            valid.append(q)

    # Duplicate IDs are hard errors, counted over the valid questions only.
    id_counts: dict[str, int] = {}
    for q in valid:
        id_counts[q.id] = id_counts.get(q.id, 0) + 1
    for qid, count in id_counts.items():
        if count > 1:
            errors.append(f"Duplicate ID: {qid!r} appears {count} times")

    # Duplicate titles within the same track/level are only warnings. Each
    # repeat is reported against the first question seen with that key.
    seen_titles: dict[tuple[str, str, str], str] = {}
    for q in valid:
        key = (q.track, q.level, q.title)
        if key in seen_titles:
            warnings.append(
                f"Duplicate title: {q.title!r} in {q.track}/{q.level} "
                f"(IDs: {seen_titles[key]}, {q.id})"
            )
        else:
            seen_titles[key] = q.id

    return valid, errors, warnings