Files
cs249r_book/interviews/vault/schema.py
2026-04-24 17:46:17 -04:00

370 lines
12 KiB
Python

"""Pydantic schema for the StaffML question corpus (schema v1.0).
Enum values are imported from :mod:`vault.schema.enums` — the single source
of truth. Do not redefine them here. See ``schema/question_schema.yaml`` for
the canonical LinkML schema.
This module validates dict records (e.g. loaded from corpus.json or from
YAMLs via ``vault-cli``'s loader). The authoritative on-disk format is the
per-question YAML described in ``schema/question_schema.yaml``.
"""
from __future__ import annotations
import sys
from pathlib import Path
from typing import Optional
from pydantic import BaseModel, field_validator, model_validator
# Import enums from the single source of truth. Because `vault/` is not a
# conventional Python package, we add the schema directory to sys.path.
_THIS_DIR = Path(__file__).resolve().parent
_SCHEMA_DIR = _THIS_DIR / "schema"
if str(_SCHEMA_DIR) not in sys.path:
sys.path.insert(0, str(_SCHEMA_DIR))
from enums import ( # noqa: E402 type: ignore[import-not-found]
VALID_BLOOM_LEVELS,
VALID_COMPETENCY_AREAS,
VALID_HUMAN_REVIEW_STATUSES,
VALID_LEVELS,
VALID_PHASES,
VALID_PROVENANCES,
VALID_STATUSES,
VALID_TOPICS,
VALID_TRACKS,
VALID_ZONES,
)
class Resource(BaseModel):
"""Author-curated external reference attached to a question."""
name: str
url: str
@field_validator("name")
@classmethod
def name_non_empty(cls, v: str) -> str:
if not v.strip():
raise ValueError("Resource.name must be non-empty")
if len(v) > 200:
raise ValueError(f"Resource.name too long ({len(v)} chars, max 200)")
return v
@field_validator("url")
@classmethod
def url_is_https(cls, v: str) -> str:
if not v.startswith("https://"):
raise ValueError(f"Resource.url must start with https:// (got: {v[:40]!r})")
return v
class Visual(BaseModel):
"""Optional diagram/figure attached to a question.
Visuals live as separate asset files under
``interviews/vault/visuals/<track>/<path>`` so the SVG text does
not contaminate YAML diffs and existing SVG tooling (Inkscape,
formatters, linters) works unchanged. The bundle-build step copies
these into the Next.js ``public/question-visuals/`` tree. The
practice page renders them between the scenario and the
``question`` callout — context → diagram → ask, mirroring how an
interviewer would flow the question in person.
"""
kind: str = "svg"
"""Renderer kind. MVP supports `svg` only. Future: `mermaid`
(inline text), `roofline` (parameterized React component), etc.
The renderer dispatches on this field."""
path: str
"""Asset filename relative to ``interviews/vault/visuals/<track>/``.
Must end in ``.svg`` for ``kind=svg``. No path traversal."""
alt: str
"""Accessibility description for screen readers and fallback when
the SVG fails to load. Required — a visual with no alt is an
accessibility regression, not an optional add-on."""
caption: Optional[str] = None
"""Author-facing caption rendered below the figure. Short — max
120 chars. Optional; the alt text handles the semantic payload."""
@field_validator("kind")
@classmethod
def valid_kind(cls, v: str) -> str:
if v not in {"svg", "mermaid"}:
raise ValueError(f"Visual.kind must be 'svg' or 'mermaid' (got {v!r})")
return v
@field_validator("path")
@classmethod
def safe_path(cls, v: str) -> str:
if not v.strip():
raise ValueError("Visual.path must be non-empty")
if "/" in v or "\\" in v or ".." in v:
raise ValueError(
f"Visual.path must be a bare filename, no traversal (got {v!r})"
)
if len(v) > 120:
raise ValueError(f"Visual.path too long ({len(v)} chars, max 120)")
return v
@field_validator("alt")
@classmethod
def alt_non_empty(cls, v: str) -> str:
if not v.strip():
raise ValueError("Visual.alt must be non-empty (accessibility requirement)")
if len(v) > 400:
raise ValueError(f"Visual.alt too long ({len(v)} chars, max 400)")
return v
@field_validator("caption")
@classmethod
def caption_length(cls, v: Optional[str]) -> Optional[str]:
if v is not None and len(v) > 120:
raise ValueError(f"Visual.caption too long ({len(v)} chars, max 120)")
return v
class ChainRef(BaseModel):
"""Structured chain reference with position (plural chains list item)."""
id: str
position: int
class HumanReview(BaseModel):
"""Human verification lineage. Distinct from LLM validation stamps."""
status: str = "not-reviewed"
by: Optional[str] = None
date: Optional[str] = None
notes: Optional[str] = None
@field_validator("status")
@classmethod
def valid_status(cls, v: str) -> str:
if v not in VALID_HUMAN_REVIEW_STATUSES:
raise ValueError(
f"invalid human_reviewed.status {v!r}, must be one of "
f"{sorted(VALID_HUMAN_REVIEW_STATUSES)}"
)
return v
class QuestionDetails(BaseModel):
realistic_solution: str
common_mistake: str = ""
napkin_math: str = ""
resources: list[Resource] = []
options: Optional[list[str]] = None
correct_index: Optional[int] = None
@field_validator("realistic_solution")
@classmethod
def realistic_solution_min_length(cls, v: str) -> str:
if len(v.strip()) < 5:
raise ValueError(f"realistic_solution too short ({len(v)} chars, min 5)")
return v
@model_validator(mode="after")
def mcq_consistency(self) -> "QuestionDetails":
if self.options is not None:
if len(self.options) != 4:
raise ValueError(f"MCQ must have exactly 4 options, got {len(self.options)}")
if self.correct_index is None:
raise ValueError("MCQ has options but missing correct_index")
if not (0 <= self.correct_index <= 3):
raise ValueError(f"correct_index must be 0-3, got {self.correct_index}")
return self
class Question(BaseModel):
"""A StaffML question (schema v1.0). Every classification axis is a field."""
# Identity
schema_version: str = "1.0"
id: str
# 4-axis classification
track: str
level: str
zone: str
topic: str
competency_area: str
bloom_level: str = ""
phase: Optional[str] = None
# Content
title: str
scenario: str
# Explicit interrogative — the one-sentence ask derived from scenario
# + details.realistic_solution. The practice page renders this field
# as the "Your task" callout when it is not already duplicated by the
# scenario text.
question: Optional[str] = None
visual: Optional[Visual] = None
details: QuestionDetails
# Workflow
status: str = "draft"
provenance: str = "imported"
requires_explanation: Optional[bool] = None
expected_time_minutes: Optional[int] = None
deletion_reason: Optional[str] = None
# Chain membership (plural)
chains: list[ChainRef] = []
# LLM validation
validated: Optional[bool] = None
validation_status: Optional[str] = None
validation_date: Optional[str] = None
validation_model: Optional[str] = None
validation_issues: Optional[list[str]] = None
validation_status_pro: Optional[str] = None
validation_issues_pro: Optional[list[str]] = None
# Math validation
math_verified: Optional[bool] = None
math_status: Optional[str] = None
math_date: Optional[str] = None
math_model: Optional[str] = None
math_issues: Optional[list[str]] = None
# Human review (new in v1.0)
human_reviewed: Optional[HumanReview] = None
# Pro-model classification review notes
classification_review: Optional[str] = None
# Tags + temporal
tags: list[str] = []
created_at: Optional[str] = None
updated_at: Optional[str] = None
last_modified: Optional[str] = None
@field_validator("track")
@classmethod
def valid_track(cls, v: str) -> str:
if v not in VALID_TRACKS:
raise ValueError(f"Invalid track {v!r}, must be one of {sorted(VALID_TRACKS)}")
return v
@field_validator("level")
@classmethod
def valid_level(cls, v: str) -> str:
if v not in VALID_LEVELS:
raise ValueError(f"Invalid level {v!r}, must be one of {sorted(VALID_LEVELS)}")
return v
@field_validator("zone")
@classmethod
def valid_zone(cls, v: str) -> str:
if v not in VALID_ZONES:
raise ValueError(f"Invalid zone {v!r}, must be one of {sorted(VALID_ZONES)}")
return v
@field_validator("topic")
@classmethod
def valid_topic(cls, v: str) -> str:
if v not in VALID_TOPICS:
raise ValueError(f"Invalid topic {v!r} (not in {len(VALID_TOPICS)}-topic curated list)")
return v
@field_validator("competency_area")
@classmethod
def valid_area(cls, v: str) -> str:
if v not in VALID_COMPETENCY_AREAS:
raise ValueError(
f"Invalid competency_area {v!r}, must be one of {sorted(VALID_COMPETENCY_AREAS)}"
)
return v
@field_validator("bloom_level")
@classmethod
def valid_bloom(cls, v: str) -> str:
if v and v not in VALID_BLOOM_LEVELS:
raise ValueError(
f"Invalid bloom_level {v!r}, must be one of {sorted(VALID_BLOOM_LEVELS)}"
)
return v
@field_validator("phase")
@classmethod
def valid_phase(cls, v: Optional[str]) -> Optional[str]:
if v is not None and v not in VALID_PHASES:
raise ValueError(f"Invalid phase {v!r}, must be one of {sorted(VALID_PHASES)}")
return v
@field_validator("status")
@classmethod
def valid_status(cls, v: str) -> str:
if v not in VALID_STATUSES:
raise ValueError(f"Invalid status {v!r}, must be one of {sorted(VALID_STATUSES)}")
return v
@field_validator("provenance")
@classmethod
def valid_provenance(cls, v: str) -> str:
if v not in VALID_PROVENANCES:
raise ValueError(
f"Invalid provenance {v!r}, must be one of {sorted(VALID_PROVENANCES)}"
)
return v
@field_validator("title")
@classmethod
def title_min_length(cls, v: str) -> str:
if len(v.strip()) < 3:
raise ValueError(f"title too short ({len(v)} chars, min 3)")
return v
@field_validator("scenario")
@classmethod
def scenario_quality(cls, v: str) -> str:
if len(v.strip()) < 30:
raise ValueError(f"scenario too short ({len(v)} chars, min 30)")
return v
def validate_corpus(questions: list[dict]) -> tuple[list["Question"], list[str], list[str]]:
"""Validate a list of question dicts against the schema.
Returns (valid_questions, errors, warnings).
"""
valid: list[Question] = []
errors: list[str] = []
for i, q_dict in enumerate(questions):
try:
q = Question(**q_dict)
valid.append(q)
except Exception as e:
qid = q_dict.get("id", f"index-{i}")
errors.append(f"[{qid}] {e}")
id_counts: dict[str, int] = {}
for q in valid:
id_counts[q.id] = id_counts.get(q.id, 0) + 1
for qid, count in id_counts.items():
if count > 1:
errors.append(f"Duplicate ID: {qid!r} appears {count} times")
seen_titles: dict[tuple[str, str, str], str] = {}
warnings: list[str] = []
for q in valid:
key = (q.track, q.level, q.title)
if key in seen_titles:
warnings.append(
f"Duplicate title: {q.title!r} in {q.track}/{q.level} "
f"(IDs: {seen_titles[key]}, {q.id})"
)
else:
seen_titles[key] = q.id
return valid, errors, warnings