mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 17:49:07 -05:00
Three gap-fixes a corpus audit on 2026-05-04 surfaced: 1. 55 cloud YAMLs were missing the status field entirely; Pydantic silently defaulted them to 'draft', so audit_corpus_batched skipped them. fix_missing_metadata.py adds explicit status: draft + provenance: imported. 2. 59 deleted YAMLs lacked the deletion_reason that the soft-delete pairing rule requires. Added placeholder text noting the original reason was not preserved on import. 3. The 55 newly-explicit drafts went through a focused vault audit (gates: format/level_fit/coherence/math/title). 41 passed all five gates and were promoted to status: published. The remaining 14 had real issues (13 level_fit / 2 coherence / 1 math) and stay drafts for authoring follow-up. audit_corpus_batched.py now accepts non-published YAMLs when --qids is explicit (the operator opted in). Default behavior (full-corpus audit) is unchanged: published-only. On-disk corpus now: 9,487 published (was 9,446, +41) · 423 drafts · 386 flagged · 390 deleted · 25 archived · 0 missing-status. vault check --strict and pytest both clean.
102 lines
3.1 KiB
Python
102 lines
3.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Add explicit status / provenance / deletion_reason where the YAML is
|
|
silently relying on Pydantic defaults or violating soft-delete pairing.
|
|
|
|
Three classes of fix:
|
|
|
|
A. status field missing entirely
|
|
Pydantic defaults to 'draft', but the YAML on disk lacks the field.
|
|
Add explicit `status: draft` + `provenance: imported` so the file
|
|
no longer relies on a silent default.
|
|
|
|
B. deleted but no deletion_reason
|
|
Soft-delete pairing rule (Question.status='deleted' must carry
|
|
deletion_reason). Add placeholder text.
|
|
|
|
C. flagged with no human_reviewed
|
|
Not auto-fixed, and not currently detected or counted by this script; needs human disposition.
|
|
|
|
Usage:
|
|
|
|
python3 interviews/vault-cli/scripts/fix_missing_metadata.py [--dry-run]
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
# Fourth ancestor of this file. With the script at
# interviews/vault-cli/scripts/ (see Usage in the module docstring) that is
# the repository root.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Put the in-repo vault-cli sources first on sys.path so the vault_cli
# imports below resolve; this must run before those imports (hence E402).
sys.path.insert(0, str(REPO_ROOT / "interviews" / "vault-cli" / "src"))

from vault_cli.models import Question  # noqa: E402
from vault_cli.yaml_io import dump_str, load_file  # noqa: E402

# Directory scanned recursively for question ``*.yaml`` files.
QUESTIONS_DIR = REPO_ROOT / "interviews" / "vault" / "questions"
# Text stored as ``deletion_reason`` for deleted questions that have none.
PLACEHOLDER_REASON = (
    "imported as deleted; original deletion reason was not preserved on import"
)
|
|
|
|
|
|
def write_yaml(path: Path, body: dict) -> None:
    """Serialise *body* with ``dump_str`` and write it to *path*.

    The text is first written to a sibling ``<name><suffix>.tmp`` file and
    then moved over *path* with ``os.replace``, so the target is only ever
    swapped for a fully written file.

    Args:
        path: Destination YAML file; replaced if it already exists.
        body: Mapping to serialise.

    Raises:
        OSError: If writing the temporary file or replacing *path* fails.
            The temporary file is removed before the error propagates.
    """
    text = dump_str(body)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        tmp.write_text(text, encoding="utf-8")
        os.replace(tmp, path)
    except BaseException:
        # Don't leave a half-written *.tmp next to the question files when
        # the write or the rename fails (or the run is interrupted).
        try:
            tmp.unlink()
        except FileNotFoundError:
            pass
        raise
|
|
|
|
|
|
def _apply_fixes(body: dict) -> list[str]:
    """Apply the Class A / Class B metadata fixes to *body* in place.

    Returns:
        The counter keys of the fixes that were applied, in order; an empty
        list means *body* needed no change.
    """
    applied: list[str] = []

    # Class A: status field missing -> explicit draft, plus imported
    # provenance unless the file already states one.
    if "status" not in body:
        body["status"] = "draft"
        body.setdefault("provenance", "imported")
        applied.append("status-added")

    # Class B: deleted without a (non-empty) deletion_reason.
    if body.get("status") == "deleted" and not body.get("deletion_reason"):
        body["deletion_reason"] = PLACEHOLDER_REASON
        applied.append("deletion-reason-added")

    return applied


def main() -> int:
    """Scan every question YAML and add the missing metadata fields.

    Each ``*.yaml`` under ``QUESTIONS_DIR`` is loaded; files whose top-level
    value is not a mapping are skipped. A changed mapping is validated with
    ``Question.model_validate`` before anything is written: on failure the
    file is left untouched, a message goes to stderr, and ``pydantic-fail``
    is incremented. With ``--dry-run`` the fixes are only printed.

    The fix counters count files whose fix passed validation, i.e. exactly
    the files that were written (or would be, under ``--dry-run``).

    Returns:
        Always 0, including when some files failed validation.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        "--dry-run",
        action="store_true",
        help="print the fixes that would be made without writing any file",
    )
    args = ap.parse_args()

    counters: Counter[str] = Counter()

    # Glob order depends on the filesystem; sort so that the log output and
    # the order of writes are reproducible between runs and machines.
    for yp in sorted(QUESTIONS_DIR.rglob("*.yaml")):
        body = load_file(yp)
        if not isinstance(body, dict):
            continue

        applied = _apply_fixes(body)
        if not applied:
            continue

        # A file without an id would otherwise be logged as "None"; fall
        # back to its path so the operator can still locate it.
        label = body.get("id") or yp

        try:
            Question.model_validate(body)
        except Exception as e:
            # Broad on purpose: any validation error means "do not write".
            counters["pydantic-fail"] += 1
            print(f" pydantic-fail {label}: {str(e)[:200]}", file=sys.stderr)
            continue

        # Count only after validation so the summary matches what is written.
        counters.update(applied)

        if args.dry_run:
            print(f" [dry] {label}: status={body.get('status')!r}, "
                  f"provenance={body.get('provenance')!r}, "
                  f"deletion_reason={'yes' if body.get('deletion_reason') else 'no'}")
        else:
            write_yaml(yp, body)

    print(f"\ncounters: {dict(counters)}")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    sys.exit(main())
|