mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-06 17:49:07 -05:00
Walks vault/questions/**/*.yaml, finds published YAMLs with no top-level provenance line, and inserts `provenance: imported` on the line immediately after `status: published`. Idempotent — re-running is a no-op once the field is present. Scope is limited to `status: published`; the mechanical pass should not overwrite the semantics of draft / flagged / deleted / archived questions. CLI flags: `--dry-run` reports what would change without writing; `--limit N` caps the number of modifications (for smoke tests). See CORPUS_HARDENING_PLAN.md, Phase 1.
149 lines
5.1 KiB
Python
149 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""Backfill the explicit ``provenance: imported`` line on YAMLs that lack it.
|
|
|
|
Pydantic was already filling ``provenance="imported"`` as a default at
|
|
load time, so this is a clarity-only fix: 407 published YAMLs in the
|
|
corpus have no explicit ``provenance:`` line, and we want every YAML to
|
|
carry the field on disk so it shows up in diffs, in the corpus
|
|
manifest, and in `vault edit` output.
|
|
|
|
CORPUS_HARDENING_PLAN.md Phase 1.
|
|
|
|
Idempotent — re-running is a no-op once the field is present.
|
|
|
|
Algorithm:
|
|
|
|
1. Walk ``interviews/vault/questions/**/*.yaml``.
|
|
2. Skip files where any top-level line starts with ``provenance:``.
|
|
3. Locate the top-level ``status:`` line. Skip if not exactly
|
|
``status: published`` — drafts default to ``llm-draft`` and other
|
|
statuses (flagged / deleted / archived) carry their own semantics
|
|
that this mechanical pass should not overwrite.
|
|
4. Insert ``provenance: imported`` on the line immediately below the
|
|
``status: published`` line, preserving the file's existing
|
|
indentation discipline (top-level keys are at column 0).
|
|
|
|
Usage:
|
|
|
|
# Dry run — print what would change, don't write:
|
|
python3 interviews/vault-cli/scripts/backfill_provenance.py --dry-run
|
|
|
|
# Apply:
|
|
python3 interviews/vault-cli/scripts/backfill_provenance.py
|
|
|
|
# Apply but cap to N files (smoke testing):
|
|
python3 interviews/vault-cli/scripts/backfill_provenance.py --limit 5
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
# Fourth ancestor of this file's resolved path. With the script located at
# interviews/vault-cli/scripts/backfill_provenance.py (see the usage examples
# in the module docstring), that ancestor is the repository root.
REPO_ROOT = Path(__file__).resolve().parents[3]
|
|
# Directory that is searched recursively for question ``*.yaml`` files.
QUESTIONS_DIR = REPO_ROOT / "interviews" / "vault" / "questions"
|
|
|
|
|
|
def has_top_level_provenance(lines: list[str]) -> bool:
    """Report whether the file already declares ``provenance`` at top level.

    A key counts as top-level only when it begins at column 0, so an
    indented ``provenance:`` line is ignored.
    """
    for candidate in lines:
        if candidate.startswith("provenance:"):
            return True
    return False
|
|
|
|
|
|
def find_status_published_line(lines: list[str]) -> int | None:
    """Locate the top-level ``status: published`` line.

    Only the first line that begins with ``status:`` at column 0 is
    consulted. Its index is returned when the value, ignoring surrounding
    whitespace, is exactly ``published``. Any other value (draft / flagged /
    deleted / archived, ...) or the absence of a top-level status line
    yields ``None``, because this mechanical pass is restricted to
    published questions.
    """
    key = "status:"
    first_status = next(
        ((pos, text) for pos, text in enumerate(lines) if text.startswith(key)),
        None,
    )
    if first_status is None:
        # No top-level status key anywhere in the file.
        return None

    pos, text = first_status
    value = text[len(key):].strip()
    # A non-published first status line settles the question; later lines
    # are never examined.
    return pos if value == "published" else None
|
|
|
|
|
|
def backfill_one(path: Path, *, dry_run: bool) -> str:
    """Insert ``provenance: imported`` below ``status: published`` in one file.

    The file is read and written with newline translation disabled, so the
    inserted line is the only change on disk: existing LF / CRLF line
    terminators and the presence or absence of a final newline are kept
    exactly as they were. (Reading in the default text mode would fold CRLF
    to LF, and writing would emit the platform separator, rewriting every
    line of the file.)

    Args:
        path: UTF-8 YAML file to inspect and possibly rewrite.
        dry_run: When true, classify the file but never write to it.

    Returns:
        ``'inserted'`` if the line was added (or would be, under
        ``dry_run``); ``'already-present'`` if a top-level ``provenance:``
        key exists; ``'skipped-not-published'`` if there is no top-level
        ``status: published`` line.
    """
    # newline="" still splits on \n, \r and \r\n but hands each line back
    # with its original terminator attached.
    with path.open("r", encoding="utf-8", newline="") as fh:
        raw_lines = fh.readlines()

    # Terminator-free view of the same lines, used only for key matching.
    lines = [raw.rstrip("\r\n") for raw in raw_lines]

    if has_top_level_provenance(lines):
        return "already-present"

    status_idx = find_status_published_line(lines)
    if status_idx is None:
        return "skipped-not-published"

    if dry_run:
        # Nothing to build or write; the verdict is all the caller needs.
        return "inserted"

    status_raw = raw_lines[status_idx]
    terminator = status_raw[len(lines[status_idx]):]
    if terminator:
        # Common case: the new line reuses the status line's own terminator.
        replacement = [status_raw, "provenance: imported" + terminator]
    else:
        # The status line is the unterminated last line of the file. Close
        # it using the terminator style seen elsewhere in the file (LF if
        # there is none) and leave the new last line unterminated so the
        # "no final newline" property is preserved.
        eol = next(
            (
                raw[len(content):]
                for raw, content in zip(raw_lines, lines)
                if len(raw) > len(content)
            ),
            "\n",
        )
        replacement = [status_raw + eol, "provenance: imported"]

    new_raw = raw_lines[:status_idx] + replacement + raw_lines[status_idx + 1:]
    with path.open("w", encoding="utf-8", newline="") as fh:
        fh.write("".join(new_raw))
    return "inserted"
|
|
|
|
|
|
def main() -> int:
    """Run the backfill over every ``*.yaml`` file under ``QUESTIONS_DIR``.

    Command-line flags:
        --dry-run   classify files and report counts without writing.
        --limit N   stop once N files have been (or would be) modified.
                    ``--limit 0`` modifies nothing; negative values are
                    rejected.

    Prints a summary of inserted / already-present / skipped counts and,
    under ``--dry-run``, up to five candidate paths.

    Returns:
        ``0`` on completion. Invalid arguments or a missing questions
        directory terminate through ``argparse``'s error exit instead.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("--dry-run", action="store_true",
                    help="report what would change without writing")
    ap.add_argument("--limit", type=int, default=None,
                    help="cap the number of files modified this run")
    args = ap.parse_args()

    if args.limit is not None and args.limit < 0:
        ap.error("--limit must be a non-negative integer")
    if not QUESTIONS_DIR.is_dir():
        # rglob() on a missing directory yields nothing, which would look
        # like a successful run that simply found no work.
        ap.error(f"questions directory not found: {QUESTIONS_DIR}")

    targets = sorted(QUESTIONS_DIR.rglob("*.yaml"))
    inserted = already = skipped = 0
    inserted_paths: list[Path] = []

    for path in targets:
        # Check the cap BEFORE touching the next file so that --limit 0
        # really modifies nothing. For N > 0 this stops at the same point
        # as breaking right after the N-th insertion.
        if args.limit is not None and inserted >= args.limit:
            break

        verdict = backfill_one(path, dry_run=args.dry_run)
        if verdict == "inserted":
            inserted += 1
            inserted_paths.append(path)
        elif verdict == "already-present":
            already += 1
        else:  # skipped-not-published
            skipped += 1

    label = "would insert" if args.dry_run else "inserted"
    print(f"\n{label}: {inserted}")
    print(f"already had provenance: {already}")
    print(f"skipped (not status: published): {skipped}")
    print(f"total scanned: {inserted + already + skipped}")

    if args.dry_run and inserted_paths:
        print("\nfirst 5 candidates:")
        for p in inserted_paths[:5]:
            try:
                rel = p.relative_to(REPO_ROOT)
            except ValueError:
                # Path lies outside the repository root; show it as-is.
                rel = p
            print(f" {rel}")

    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the CLI and hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|