Files
cs249r_book/interviews/vault-cli/scripts/backfill_provenance.py
Vijay Janapa Reddi 39d567f267 feat(vault-cli): backfill_provenance.py — Phase 1 helper
Walks vault/questions/**/*.yaml, finds published YAMLs with no top-level
provenance line, and inserts `provenance: imported` on the line
immediately after `status: published`. Idempotent — re-running is a
no-op once the field is present. Limits scope to status: published; the
mechanical pass should not overwrite the semantics of draft / flagged /
deleted / archived questions.

CLI:
  --dry-run     report what would change
  --limit N     cap modifications (smoke test)

CORPUS_HARDENING_PLAN.md Phase 1.
2026-05-03 08:06:12 -04:00

149 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""Backfill the explicit ``provenance: imported`` line on YAMLs that lack it.
Pydantic was already filling ``provenance="imported"`` as a default at
load time, so this is a clarity-only fix: 407 published YAMLs in the
corpus have no explicit ``provenance:`` line, and we want every YAML to
carry the field on disk so it shows up in diffs, in the corpus
manifest, and in `vault edit` output.
CORPUS_HARDENING_PLAN.md Phase 1.
Idempotent — re-running is a no-op once the field is present.
Algorithm:
1. Walk ``interviews/vault/questions/**/*.yaml``.
2. Skip files where any top-level line starts with ``provenance:``.
3. Locate the top-level ``status:`` line. Skip if not exactly
``status: published`` — drafts default to ``llm-draft`` and other
statuses (flagged / deleted / archived) carry their own semantics
that this mechanical pass should not overwrite.
4. Insert ``provenance: imported`` on the line immediately below the
``status: published`` line, preserving the file's existing
indentation discipline (top-level keys are at column 0).
Usage:
# Dry run — print what would change, don't write:
python3 interviews/vault-cli/scripts/backfill_provenance.py --dry-run
# Apply:
python3 interviews/vault-cli/scripts/backfill_provenance.py
# Apply but cap to N files (smoke testing):
python3 interviews/vault-cli/scripts/backfill_provenance.py --limit 5
"""
from __future__ import annotations
import argparse
from pathlib import Path
# Repository root, taken as the fourth ancestor directory of this script's
# resolved path. The usage examples in the module docstring place the script
# at interviews/vault-cli/scripts/, i.e. three directories below the root.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Directory tree that main() scans recursively for ``*.yaml`` question files.
QUESTIONS_DIR = REPO_ROOT / "interviews" / "vault" / "questions"
def has_top_level_provenance(lines: list[str]) -> bool:
    """Report whether any line begins with ``provenance:`` at column 0.

    Indented occurrences (nested keys, block-scalar content) do not count,
    because only the very start of each line is inspected.
    """
    for candidate in lines:
        if candidate.startswith("provenance:"):
            return True
    return False
def find_status_published_line(lines: list[str]) -> int | None:
    """Return the index of the top-level ``status: published`` line, or None.

    Only the first line that starts with ``status:`` at column 0 is
    considered. Its value is compared against ``published`` after removing
    a trailing YAML comment, surrounding whitespace (including a stray
    ``\\r`` from CRLF files) and one pair of matching quotes, so
    ``status: published  # reviewed`` and ``status: "published"`` are
    recognised as published just like the bare form.

    Returns None for any other status (draft / flagged / deleted /
    archived) or if no top-level status line exists. The mechanical
    pass only touches published questions; the others have their own
    semantics that should not be overwritten.
    """
    for i, line in enumerate(lines):
        if not line.startswith("status:"):
            continue

        value = line[len("status:"):]

        # A '#' starts a YAML comment only when preceded by whitespace;
        # "published#x" is a different scalar and must not match.
        for pos, ch in enumerate(value):
            if ch == "#" and pos > 0 and value[pos - 1] in " \t":
                value = value[:pos]
                break

        value = value.strip()

        # Single- or double-quoted scalars carry the same string value.
        if len(value) >= 2 and value[0] == value[-1] and value[0] in "'\"":
            value = value[1:-1]

        # The first top-level status line is authoritative: either it is
        # published, or this file is out of scope.
        return i if value == "published" else None
    return None
def backfill_one(path: Path, *, dry_run: bool) -> str:
    """Insert ``provenance: imported`` below ``status: published`` in one file.

    The file is read and written with newline translation disabled, so
    every existing byte (LF vs. CRLF line endings, presence or absence of
    a final newline) is preserved and the only change on disk is the one
    inserted line. The inserted line uses the same line ending as the
    ``status:`` line it follows. Files that use bare ``\\r`` as their only
    line terminator are not split into lines and are therefore reported as
    'skipped-not-published' unless ``status:`` is the very first line.

    Args:
        path: YAML file to inspect and possibly rewrite.
        dry_run: When true, classify the file but never write to it.

    Returns:
        One of 'inserted', 'already-present', 'skipped-not-published'.
        In dry-run mode 'inserted' means "would be inserted".
    """
    # newline="" turns off universal-newline translation; the default text
    # mode would silently rewrite every line ending in the file on write.
    with path.open("r", encoding="utf-8", newline="") as fh:
        text = fh.read()

    # split("\n") keeps a final element for whatever follows the last "\n"
    # ("" when the file ends with a newline), so "\n".join() round-trips
    # the text exactly. CRLF lines simply retain a trailing "\r".
    lines = text.split("\n")

    if has_top_level_provenance(lines):
        return "already-present"
    status_idx = find_status_published_line(lines)
    if status_idx is None:
        return "skipped-not-published"
    if dry_run:
        return "inserted"

    new_line = "provenance: imported"
    if status_idx < len(lines) - 1:
        # The status line is newline-terminated: mirror its "\r", if any.
        if lines[status_idx].endswith("\r"):
            new_line += "\r"
    elif status_idx > 0 and lines[status_idx - 1].endswith("\r"):
        # The status line is the unterminated last line of a CRLF file.
        # It is about to gain a terminator, so give it the matching "\r";
        # the inserted line becomes the new unterminated last line.
        lines[status_idx] += "\r"

    lines.insert(status_idx + 1, new_line)
    with path.open("w", encoding="utf-8", newline="") as fh:
        fh.write("\n".join(lines))
    return "inserted"
def main() -> int:
    """Command-line entry point: scan the question tree and report a summary.

    Parses ``--dry-run`` and ``--limit``, runs ``backfill_one`` on every
    ``*.yaml`` file under ``QUESTIONS_DIR`` in sorted order, and prints how
    many files were inserted into, already had the field, or were skipped.
    In dry-run mode the first few candidate paths are listed as well.

    Scanning stops as soon as ``--limit`` insertions have been made, so
    ``--limit 0`` touches no file at all. A negative limit or a missing
    questions directory is reported as a usage error (exit status 2)
    instead of producing a misleading all-zero summary.

    Returns:
        0 on completion.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--dry-run", action="store_true",
                    help="report what would change without writing")
    ap.add_argument("--limit", type=int, default=None,
                    help="cap the number of files modified this run")
    args = ap.parse_args()

    if args.limit is not None and args.limit < 0:
        ap.error("--limit must be a non-negative integer")
    if not QUESTIONS_DIR.is_dir():
        # rglob() on a missing directory yields nothing, which would look
        # like a successful run over an empty corpus.
        ap.error(f"questions directory not found: {QUESTIONS_DIR}")

    targets = sorted(QUESTIONS_DIR.rglob("*.yaml"))
    inserted = already = skipped = 0
    inserted_paths: list[Path] = []
    for path in targets:
        # Check the cap BEFORE touching the next file; checking only after
        # an insertion would let --limit 0 modify one file.
        if args.limit is not None and inserted >= args.limit:
            break
        verdict = backfill_one(path, dry_run=args.dry_run)
        if verdict == "inserted":
            inserted += 1
            inserted_paths.append(path)
        elif verdict == "already-present":
            already += 1
        else:  # skipped-not-published
            skipped += 1

    label = "would insert" if args.dry_run else "inserted"
    print(f"\n{label}: {inserted}")
    print(f"already had provenance: {already}")
    print(f"skipped (not status: published): {skipped}")
    print(f"total scanned: {inserted + already + skipped}")
    if args.dry_run and inserted_paths:
        print("\nfirst 5 candidates:")
        for p in inserted_paths[:5]:
            try:
                rel = p.relative_to(REPO_ROOT)
            except ValueError:
                # Path lies outside the repo root; show it unmodified.
                rel = p
            print(f" {rel}")
    return 0
if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)