Files
cs249r_book/book/quarto/mlsys/validate_inline_refs.py

291 lines
11 KiB
Python

#!/usr/bin/env python3
"""
validate_inline_refs.py
Pre-render guardrail for inline Python in QMD files.
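Checks (always run):
1. Every `{python} var_name` resolves to a defined variable
2. Every `{python} var_name` appears AFTER its definition (Locality)
Additional checks (only with --check-patterns):
3. No inline Python inside LaTeX math mode (causes decimal stripping)
4. No non-_str inline Python adjacent to LaTeX symbols like $\\times$
5. No grid tables with inline Python (use pipe tables instead)
6. No inline f-strings or function calls (pre-compute as _str variables)
7. No inline Python in `#|` chunk options (fig-cap, tbl-cap, lst-cap, fig-alt)
Usage:
python3 book/quarto/mlsys/validate_inline_refs.py [path] [--verbose|-v] [--check-patterns|-p]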
Exit codes:
0 = all checks pass
1 = issues found
"""
import re
import sys
from pathlib import Path
# Printed to stderr on every direct invocation of this script.
DEPRECATION_MSG = (
    "DEPRECATION: use Binder instead of direct script invocation:\n"
    " ./book/binder validate inline-refs [--path <file-or-dir>] [--check-patterns]"
)

BOOK_ROOT = Path(__file__).resolve().parent.parent  # book/quarto/
CONTENTS = BOOK_ROOT / "contents"

# Pattern for inline Python references: `{python} var_name`
INLINE_REF = re.compile(r'`\{python\}\s+(\w+)`')

# Pattern for Python compute cell blocks
CELL_START = re.compile(r'^```\{python\}')
CELL_END = re.compile(r'^```\s*$')

# Pattern for variable assignments in compute cells (handles tuple unpacking).
# The trailing (?!=) keeps a comparison such as `a == b` from being read as an
# assignment to `a`.
ASSIGNMENT = re.compile(r'^([a-zA-Z_]\w*(?:\s*,\s*[a-zA-Z_]\w*)*)\s*=(?!=)')

# Pattern for Exports: in header block
EXPORTS_SECTION = re.compile(r'#\s*.\s*[Ee]xports?:\s*(.*)')

# Problematic patterns that cause rendering issues

# Pattern 1: Inline Python directly inside LaTeX math: $`{python}`$ or $..`{python}`$
# Only matches when $ is immediately followed by backtick-python (within short distance)
# This avoids false positives from {python} appearing between two separate $...$ pairs
# EXCLUDES _str variables which are pre-formatted strings (no decimals to strip).
# The exclusion is written as (?!\s*\w+_str\b):
#   - \b makes it apply only to names that END in _str, so `conv_stride` or
#     `mem_stream` are still checked;
#   - the leading \s* stops the preceding \s+ from backtracking past the
#     lookahead when more than one space follows {python}.
LATEX_INLINE_PYTHON = re.compile(
    r'(?<!\\)\$\s*`\{python\}\s+(?!\s*\w+_str\b)[^`]+`'
    r'|`\{python\}\s+(?!\s*\w+_str\b)[^`]+`\s*(?<!\\)\$'
)

# Pattern 2: Inline Python adjacent to LaTeX symbols (decimal stripping risk)
# Only flags NON-_str variables. Using _str variables adjacent to $\times$ etc. is the
# PREFERRED convention — see book-prose.md "Multiplication and Times Notation".
LATEX_ADJACENT = re.compile(
    r'`\{python\}\s+(?!\s*\w+_str\b)[^`]+`\s*\$\\(times|approx|ll|gg|mu)\$'
)

# Pattern 3: Grid table row separator (indicates grid table format)
GRID_TABLE_SEP = re.compile(r'^\+[-:=+]+\+$')

# Pattern 4: Inline f-string formatting (should be pre-computed as _str).
# Accepts both f"..." and f'...'.
INLINE_FSTRING = re.compile(r'`\{python\}\s*f["\'][^`]+`')

# Pattern 5: Inline function calls (should be pre-computed as _str).
# Accepts dotted callables (np.round(...)) and empty argument lists (fmt()).
INLINE_FUNC_CALL = re.compile(r'`\{python\}\s*[\w.]+\([^`]*\)`')

# Pattern 6: Inline Python in YAML chunk options (fig-cap, tbl-cap, fig-alt, lst-cap)
# These NEVER render — Quarto uses the option value as a literal string (verified by
# rendering _test_inline_captions.qmd: body and ": Caption {#tbl-...}" run inline Python;
# #| fig-alt and #| fig-cap do not).
YAML_OPTION_INLINE = re.compile(r'^#\|\s*(fig-cap|tbl-cap|lst-cap|fig-alt):\s*.*`\{python\}')
def check_rendering_patterns(qmd_path, verbose=False):
    """Check one QMD file for patterns that cause rendering issues.

    Every line of the file is scanned, including lines inside compute cells
    (that is where `#|` chunk options live).

    Args:
        qmd_path: Path of the QMD file to scan.
        verbose: When true, also print one line per finding as it is found.

    Returns:
        A list of ``(filepath, line_number, warning_type, message)`` tuples.
        ``filepath`` is relative to BOOK_ROOT when the file lives under it,
        otherwise it is ``str(qmd_path)``. Grid-table findings carry the line
        number where the table starts and are reported once per table.
    """
    text = qmd_path.read_text(encoding="utf-8")
    lines = text.splitlines()
    warnings = []

    # relative_to() raises ValueError for files outside BOOK_ROOT (e.g. an
    # explicit path given on the command line); fall back to the path as given.
    try:
        filepath = str(qmd_path.relative_to(BOOK_ROOT))
    except ValueError:
        filepath = str(qmd_path)

    # Grid-table tracking state
    in_grid_table = False
    grid_table_start = 0
    grid_table_reported = False

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        # Check for inline Python inside LaTeX math
        if LATEX_INLINE_PYTHON.search(line):
            warnings.append((filepath, i, "LATEX_MATH",
                             "Inline Python inside $...$ - will strip decimal points"))
            if verbose:
                print(f"{qmd_path.name}:{i} — Python inside LaTeX math")

        # Check for non-_str inline Python adjacent to LaTeX symbols (decimal stripping risk)
        # NOTE: _str variables adjacent to $\times$ is the PREFERRED convention.
        if LATEX_ADJACENT.search(line):
            warnings.append((filepath, i, "LATEX_ADJACENT",
                             "Non-_str inline Python adjacent to $\\times$ (decimal stripping risk)"))
            if verbose:
                print(f"{qmd_path.name}:{i} — Non-_str Python adjacent to LaTeX symbol")

        # Track grid tables: a separator row opens a table; any line that is
        # neither a separator nor a `|` row — including a blank line — closes
        # it, so a pipe table that follows a grid table is not misreported.
        if GRID_TABLE_SEP.match(stripped):
            if not in_grid_table:
                in_grid_table = True
                grid_table_start = i
                grid_table_reported = False
        elif in_grid_table and not stripped.startswith('|'):
            in_grid_table = False

        # Check for inline Python in grid tables. The warning points at the
        # table's first line, so emit it only once per table rather than once
        # per offending row.
        if in_grid_table and '`{python}' in line:
            if not grid_table_reported:
                warnings.append((filepath, grid_table_start, "GRID_TABLE",
                                 "Grid table with inline Python - convert to pipe table"))
                grid_table_reported = True
            if verbose:
                print(f"{qmd_path.name}:{i} — Python in grid table")

        # Check for inline f-string formatting (should be pre-computed)
        if INLINE_FSTRING.search(line):
            warnings.append((filepath, i, "INLINE_FSTRING",
                             "Inline f-string - pre-compute as _str variable in Python block"))
            if verbose:
                print(f"{qmd_path.name}:{i} — Inline f-string")

        # Check for inline function calls (should be pre-computed)
        if INLINE_FUNC_CALL.search(line):
            warnings.append((filepath, i, "INLINE_FUNC",
                             "Inline function call - pre-compute as _str variable in Python block"))
            if verbose:
                print(f"{qmd_path.name}:{i} — Inline function call")

        # Check for inline Python in YAML chunk options (fig-cap, fig-alt, tbl-cap, lst-cap) — NEVER renders
        if YAML_OPTION_INLINE.search(line):
            warnings.append((filepath, i, "YAML_OPTION",
                             "Inline Python in #| fig-alt/fig-cap/tbl-cap/lst-cap - NEVER renders! Use hardcoded value or set caption in code."))
            if verbose:
                print(f"{qmd_path.name}:{i} — Python in YAML option (will appear literally)")

    return warnings
def validate_file(qmd_path, verbose=False, check_patterns=False):
    """Validate one QMD file. Returns (errors, warnings).

    The file is read top to bottom in a single pass. Inside ```{python} cells,
    names are collected from assignment statements and from `Exports:` header
    comments (including their continuation comment lines). Outside cells, each
    `{python} name` reference must already be in that collected set, which
    enforces both "defined" and "defined before use".

    Args:
        qmd_path: Path of the QMD file to validate.
        verbose: When true, print each violation as it is found.
        check_patterns: When true, also run check_rendering_patterns().

    Returns:
        ``(errors, warnings)`` where ``errors`` is a list of
        ``(filepath, line_number, var_name)`` tuples and ``warnings`` is the
        result of check_rendering_patterns() (empty unless ``check_patterns``).
        ``filepath`` is relative to BOOK_ROOT when the file lives under it,
        otherwise it is ``str(qmd_path)``.
    """
    text = qmd_path.read_text(encoding="utf-8")
    lines = text.splitlines()

    # relative_to() raises ValueError for files outside BOOK_ROOT (e.g. an
    # explicit path given on the command line); fall back to the path as given
    # instead of crashing while reporting an error.
    try:
        filepath = str(qmd_path.relative_to(BOOK_ROOT))
    except ValueError:
        filepath = str(qmd_path)

    errors = []
    defined_vars = set()
    in_cell = False
    in_exports = False

    for i, line in enumerate(lines, 1):
        # 1. Track variable definitions in cells
        if CELL_START.match(line):
            in_cell = True
            continue
        if in_cell and CELL_END.match(line):
            in_cell = False
            in_exports = False
            continue

        if in_cell:
            stripped = line.strip()

            # Check for assignments: var = ... or var1, var2 = ...
            m = ASSIGNMENT.match(stripped)
            if m:
                vars_part = m.group(1)
                for v in re.split(r'[,\s]+', vars_part):
                    if v.strip():
                        defined_vars.add(v.strip())

            # Check for Exports: in header
            m = EXPORTS_SECTION.match(stripped)
            if m:
                in_exports = True
                vars_raw = m.group(1)
                # Remove unit parentheticals like (MB, GB)
                vars_raw = re.sub(r'\(.*?\)', '', vars_raw)
                for v in re.split(r'[,\s]+', vars_raw):
                    v = v.strip().rstrip(',')
                    if v:
                        defined_vars.add(v)
            elif in_exports:
                # Continuation of exports: further comment lines list more names
                m = re.match(r'#\s*.\s*(.*)', stripped)
                if m:
                    content = m.group(1).strip()
                    # If content starts with a section like 'Goal:', stop
                    if re.match(r'^[A-Z][a-z]+:', content):
                        in_exports = False
                    elif content == "" or "──" in content:
                        # Empty comment line or a box-drawing rule ends the list
                        in_exports = False
                    else:
                        vars_raw = re.sub(r'\(.*?\)', '', content)
                        for v in re.split(r'[,\s]+', vars_raw):
                            v = v.strip().rstrip(',')
                            if v:
                                defined_vars.add(v)
                else:
                    # A non-comment line ends the exports list
                    in_exports = False
            continue  # Don't check for refs inside compute cells

        # 2. Check inline references for Locality
        for m in INLINE_REF.finditer(line):
            var = m.group(1)
            if var not in defined_vars:
                errors.append((filepath, i, var))
                if verbose:
                    print(f"{qmd_path.name}:{i} — `{{python}} {var}` used before definition (Locality Violation)")

    warnings = []
    if check_patterns:
        warnings = check_rendering_patterns(qmd_path, verbose)
    return errors, warnings
def main():
    """Command-line entry point: validate QMD files and print a report.

    Recognised arguments (read from ``sys.argv``):
        --verbose / -v          print each finding as it is discovered
        --check-patterns / -p   also run the rendering-pattern checks
        first non-flag argument a file or directory to validate; without it,
                                every ``*.qmd`` under CONTENTS is validated

    Returns:
        0 when no errors or warnings were found; 1 when any were found or when
        the given path does not exist.
    """
    print(DEPRECATION_MSG, file=sys.stderr)
    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    check_patterns = "--check-patterns" in sys.argv or "-p" in sys.argv

    # Check for path argument
    args = [a for a in sys.argv[1:] if not a.startswith("-")]
    if args:
        target_path = Path(args[0]).resolve()
        if not target_path.exists():
            # Without this, a mistyped path yields zero files and a misleading
            # "All checks passed" with exit code 0.
            print(f"ERROR: path not found: {target_path}", file=sys.stderr)
            return 1
        if target_path.is_file():
            qmd_files = [target_path]
        else:
            qmd_files = sorted(target_path.rglob("*.qmd"))
    else:
        qmd_files = sorted(CONTENTS.rglob("*.qmd"))

    total_files = 0
    total_refs = 0
    all_errors = []
    all_warnings = []
    for qmd in qmd_files:
        errors, warnings = validate_file(qmd, verbose=verbose, check_patterns=check_patterns)
        # Count references separately for the summary; validate_file only
        # returns the violations.
        text = qmd.read_text(encoding="utf-8")
        refs = INLINE_REF.findall(text)
        if refs:
            total_files += 1
            total_refs += len(refs)
        all_errors.extend(errors)
        all_warnings.extend(warnings)

    heavy_rule = '=' * 60
    # Lighter rule between report sections (the original expression, ''*60,
    # evaluated to an empty string and printed nothing).
    light_rule = '─' * 60

    print(f"\n{heavy_rule}")
    print("Inline Python Validation Report")
    print(f"{heavy_rule}")
    print(f"Files with inline refs: {total_files}")
    print(f"Total inline references: {total_refs}")
    print(f"Unresolved references: {len(all_errors)}")
    if check_patterns:
        print(f"Rendering warnings: {len(all_warnings)}")

    exit_code = 0
    if all_errors:
        print(f"\n{light_rule}")
        print("ERRORS (will break render):")
        for filepath, lineno, var in all_errors:
            print(f" {filepath}:{lineno} — `{{python}} {var}` undefined/locality violation")
        exit_code = 1

    if all_warnings:
        print(f"\n{light_rule}")
        print("WARNINGS (may cause incorrect rendering):")
        # Group by type
        by_type = {}
        for filepath, lineno, wtype, msg in all_warnings:
            by_type.setdefault(wtype, []).append((filepath, lineno, msg))
        for wtype, items in sorted(by_type.items()):
            print(f"\n [{wtype}] ({len(items)} issues)")
            for filepath, lineno, msg in items[:5]:  # Show first 5
                # Separator added: line number and message used to run together.
                print(f" {filepath}:{lineno} — {msg}")
            if len(items) > 5:
                print(f" ... and {len(items) - 5} more")
        exit_code = 1

    if exit_code == 0:
        print("\n✓ All checks passed!")
    else:
        print(f"\n{light_rule}")
        print("Fix issues before rendering to ensure correct output.")
    return exit_code
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    raise SystemExit(main())