mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-29 17:20:21 -05:00
291 lines
11 KiB
Python
291 lines
11 KiB
Python
#!/usr/bin/env python3
"""
validate_inline_refs.py
Pre-render guardrail for inline Python in QMD files.

Checks (always run):
  1. Every `{python} var_name` resolves to a defined variable
  2. Every `{python} var_name` appears AFTER its definition (Locality)

Checks (only with --check-patterns):
  3. No inline Python inside LaTeX math mode (causes decimal stripping)
  4. No non-_str inline Python adjacent to LaTeX symbols like $\\times$
  5. No grid tables with inline Python (use pipe tables instead)
  6. No inline f-strings or function calls (pre-compute a _str variable)
  7. No inline Python in `#|` chunk options (fig-cap, tbl-cap, lst-cap,
     fig-alt) -- these are emitted literally and never rendered

Usage:
  python3 book/quarto/mlsys/validate_inline_refs.py [path] [--verbose] [--check-patterns]

  path is an optional .qmd file or directory; when omitted, every .qmd file
  under the book's contents/ directory is checked.

Exit codes:
  0 = all checks pass
  1 = issues found
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Printed to stderr on every direct invocation to steer users to the Binder CLI.
DEPRECATION_MSG = (
    "DEPRECATION: use Binder instead of direct script invocation:\n"
    " ./book/binder validate inline-refs [--path <file-or-dir>] [--check-patterns]"
)

# Two directory levels above this script (book/quarto/); reported paths are
# expressed relative to it.
BOOK_ROOT = Path(__file__).resolve().parent.parent
CONTENTS = BOOK_ROOT / "contents"

# ---------------------------------------------------------------------------
# Reference / definition tracking
# ---------------------------------------------------------------------------

# An inline reference: `{python} var_name` (captures the bare identifier).
INLINE_REF = re.compile(r'`\{python\}\s+(\w+)`')

# Opening and closing fences of an executable Python compute cell.
CELL_START = re.compile(r'^```\{python\}')
CELL_END = re.compile(r'^```\s*$')

# Left-hand side of an assignment inside a compute cell; a comma-separated
# target list (tuple unpacking) is captured as a single group.
ASSIGNMENT = re.compile(r'^([a-zA-Z_]\w*(?:\s*,\s*[a-zA-Z_]\w*)*)\s*=')

# "Exports:" entry in a cell's comment header; captures the text after the colon.
EXPORTS_SECTION = re.compile(r'#\s*.\s*[Ee]xports?:\s*(.*)')

# ---------------------------------------------------------------------------
# Rendering-hazard patterns (used by the --check-patterns pass)
# ---------------------------------------------------------------------------

# Hazard 1: inline Python directly inside LaTeX math, i.e. an unescaped `$`
# immediately before or immediately after the backtick expression. Requiring
# adjacency avoids false positives when a reference merely sits between two
# separate $...$ spans. Names matching \w+_str are exempt: they are
# pre-formatted strings with no decimals to strip.
LATEX_INLINE_PYTHON = re.compile(
    r'(?<!\\)\$\s*`\{python\}\s+(?!\w+_str)[^`]+`'
    r'|`\{python\}\s+(?!\w+_str)[^`]+`\s*(?<!\\)\$'
)

# Hazard 2: a non-_str inline value immediately followed by a LaTeX symbol
# (decimal stripping risk). A _str variable next to $\times$ etc. is the
# PREFERRED convention -- see book-prose.md "Multiplication and Times Notation".
LATEX_ADJACENT = re.compile(r'`\{python\}\s+(?!\w+_str)[^`]+`\s*\$\\(times|approx|ll|gg|mu)\$')

# Hazard 3: a grid-table border row such as +-----+=====+, which marks the
# table as grid format.
GRID_TABLE_SEP = re.compile(r'^\+[-:=+]+\+$')

# Hazard 4: an f-string evaluated inline (should be pre-computed as _str).
INLINE_FSTRING = re.compile(r'`\{python\}\s*f"[^`]+`')

# Hazard 5: a function call evaluated inline (should be pre-computed as _str).
INLINE_FUNC_CALL = re.compile(r'`\{python\}\s*\w+\([^`]+\)`')

# Hazard 6: inline Python inside a `#|` YAML chunk option (fig-cap, tbl-cap,
# lst-cap, fig-alt). These NEVER render -- Quarto uses the option value as a
# literal string (verified by rendering _test_inline_captions.qmd: body text
# and ": Caption {#tbl-...}" run inline Python; #| fig-alt and #| fig-cap do not).
YAML_OPTION_INLINE = re.compile(r'^#\|\s*(fig-cap|tbl-cap|lst-cap|fig-alt):\s*.*`\{python\}')
|
|
|
|
|
|
def check_rendering_patterns(qmd_path, verbose=False):
    """Scan one QMD file for inline-Python patterns that render incorrectly.

    Every line is tested against the module's rendering-hazard regexes
    (LaTeX math, LaTeX-adjacent symbols, grid tables, inline f-strings,
    inline function calls, and ``#|`` YAML chunk options).

    Args:
        qmd_path: ``pathlib.Path`` of the ``.qmd`` file to scan.
        verbose: When true, print one line per finding as it is detected.

    Returns:
        A list of ``(filepath, line_number, warning_type, message)`` tuples.
        ``filepath`` is relative to ``BOOK_ROOT`` when the file lives under
        it, otherwise the path as given. ``GRID_TABLE`` warnings carry the
        line number of the table's first border row, not the offending row.
    """
    lines = qmd_path.read_text(encoding="utf-8").splitlines()
    warnings = []

    # The caller may pass a file outside BOOK_ROOT (explicit path argument);
    # relative_to() raises ValueError there, so fall back to the given path.
    try:
        filepath = str(qmd_path.relative_to(BOOK_ROOT))
    except ValueError:
        filepath = str(qmd_path)

    def report(warn_line, seen_line, wtype, message, note, marker="⚠"):
        # warn_line goes into the result; seen_line is where the match was
        # found (they differ only for grid tables).
        warnings.append((filepath, warn_line, wtype, message))
        if verbose:
            print(f" {marker} {qmd_path.name}:{seen_line} — {note}")

    in_grid_table = False
    grid_table_start = 0

    for i, line in enumerate(lines, 1):
        stripped = line.strip()

        # Inline Python inside LaTeX math
        if LATEX_INLINE_PYTHON.search(line):
            report(i, i, "LATEX_MATH",
                   "Inline Python inside $...$ - will strip decimal points",
                   "Python inside LaTeX math")

        # Non-_str inline Python adjacent to LaTeX symbols.
        # NOTE: _str variables adjacent to $\times$ is the PREFERRED convention.
        if LATEX_ADJACENT.search(line):
            report(i, i, "LATEX_ADJACENT",
                   "Non-_str inline Python adjacent to $\\\\times$ (decimal stripping risk)",
                   "Non-_str Python adjacent to LaTeX symbol")

        # Grid-table tracking: a border row opens (or continues) a table; any
        # line that is neither a border nor a `|` row closes it. That includes
        # blank lines, so a pipe table that follows a grid table is not
        # mistaken for a continuation of it.
        if GRID_TABLE_SEP.match(stripped):
            if not in_grid_table:
                in_grid_table = True
                grid_table_start = i
        elif in_grid_table and not stripped.startswith('|'):
            in_grid_table = False

        # Inline Python in grid tables
        if in_grid_table and '`{python}' in line:
            report(grid_table_start, i, "GRID_TABLE",
                   "Grid table with inline Python - convert to pipe table",
                   "Python in grid table")

        # Inline f-string formatting (should be pre-computed)
        if INLINE_FSTRING.search(line):
            report(i, i, "INLINE_FSTRING",
                   "Inline f-string - pre-compute as _str variable in Python block",
                   "Inline f-string")

        # Inline function calls (should be pre-computed)
        if INLINE_FUNC_CALL.search(line):
            report(i, i, "INLINE_FUNC",
                   "Inline function call - pre-compute as _str variable in Python block",
                   "Inline function call")

        # Inline Python in YAML chunk options — NEVER renders
        if YAML_OPTION_INLINE.search(line):
            report(i, i, "YAML_OPTION",
                   "Inline Python in #| fig-alt/fig-cap/tbl-cap/lst-cap - NEVER renders! Use hardcoded value or set caption in code.",
                   "Python in YAML option (will appear literally)",
                   marker="✗")

    return warnings
|
|
|
|
|
|
def validate_file(qmd_path, verbose=False, check_patterns=False):
    """Validate the inline Python references of one QMD file.

    The file is read top to bottom. Inside ```` ```{python} ```` cells, names
    are collected from lines that look like assignments (after stripping
    indentation) and from ``Exports:`` entries in the cell's comment header,
    including continuation lines. Outside cells, every `{python} name`
    reference must name something already collected; otherwise it is
    reported as an error.

    Args:
        qmd_path: ``pathlib.Path`` of the ``.qmd`` file to validate.
        verbose: When true, print each problem as it is found.
        check_patterns: When true, also run ``check_rendering_patterns``.

    Returns:
        ``(errors, warnings)``. ``errors`` is a list of
        ``(filepath, line_number, var_name)`` tuples; ``warnings`` is the
        list returned by ``check_rendering_patterns`` (empty unless
        ``check_patterns`` is set). ``filepath`` is relative to ``BOOK_ROOT``
        when the file lives under it, otherwise the path as given.
    """
    lines = qmd_path.read_text(encoding="utf-8").splitlines()

    # The caller may pass a file outside BOOK_ROOT (explicit path argument);
    # relative_to() raises ValueError there, so fall back to the given path.
    try:
        rel_path = str(qmd_path.relative_to(BOOK_ROOT))
    except ValueError:
        rel_path = str(qmd_path)

    def export_names(raw):
        # Drop unit parentheticals like (MB, GB), then split on commas/spaces.
        without_units = re.sub(r'\(.*?\)', '', raw)
        return [name for name in re.split(r'[,\s]+', without_units) if name]

    errors = []
    defined_vars = set()
    in_cell = False
    in_exports = False

    for i, line in enumerate(lines, 1):
        # 1. Track variable definitions in cells
        if CELL_START.match(line):
            in_cell = True
            continue
        if in_cell and CELL_END.match(line):
            in_cell = False
            in_exports = False
            continue

        if in_cell:
            stripped = line.strip()

            # Assignments: var = ... or var1, var2 = ...
            assigned = ASSIGNMENT.match(stripped)
            if assigned:
                defined_vars.update(
                    name for name in re.split(r'[,\s]+', assigned.group(1)) if name
                )

            # "Exports:" entry in the header comment
            exported = EXPORTS_SECTION.match(stripped)
            if exported:
                in_exports = True
                defined_vars.update(export_names(exported.group(1)))
            elif in_exports:
                # Possible continuation line of the Exports entry
                cont = re.match(r'#\s*.\s*(.*)', stripped)
                if cont is None:
                    in_exports = False
                else:
                    content = cont.group(1).strip()
                    if re.match(r'^[A-Z][a-z]+:', content):
                        # A new header section such as 'Goal:' ends the list
                        in_exports = False
                    elif content == "" or "──" in content:
                        # An empty comment or a box-drawing rule ends the list
                        in_exports = False
                    else:
                        defined_vars.update(export_names(content))
            continue  # Don't check for refs inside compute cells

        # 2. Check inline references for Locality
        for ref in INLINE_REF.finditer(line):
            var = ref.group(1)
            if var not in defined_vars:
                errors.append((rel_path, i, var))
                if verbose:
                    print(f" ✗ {qmd_path.name}:{i} — `{{python}} {var}` used before definition (Locality Violation)")

    warnings = []
    if check_patterns:
        warnings = check_rendering_patterns(qmd_path, verbose)

    return errors, warnings
|
|
|
|
|
|
def main():
    """Command-line entry point: validate QMD files and print a report.

    Recognised arguments (read from ``sys.argv``):
        ``--verbose`` / ``-v``         print each finding as it is detected
        ``--check-patterns`` / ``-p``  also run the rendering-pattern checks
        first non-flag argument        a ``.qmd`` file or a directory to scan
                                       recursively; defaults to ``CONTENTS``

    Returns:
        ``0`` when no errors or warnings were found; ``1`` otherwise,
        including when the given path does not exist.
    """
    print(DEPRECATION_MSG, file=sys.stderr)

    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    check_patterns = "--check-patterns" in sys.argv or "-p" in sys.argv

    # Optional path argument: the first non-flag token
    args = [a for a in sys.argv[1:] if not a.startswith("-")]
    if args:
        target_path = Path(args[0]).resolve()
        if not target_path.exists():
            # A mistyped path would otherwise scan zero files and report
            # success, defeating the purpose of a pre-render guardrail.
            print(f"ERROR: path not found: {target_path}", file=sys.stderr)
            return 1
        if target_path.is_file():
            qmd_files = [target_path]
        else:
            qmd_files = sorted(target_path.rglob("*.qmd"))
    else:
        qmd_files = sorted(CONTENTS.rglob("*.qmd"))

    total_files = 0
    total_refs = 0
    all_errors = []
    all_warnings = []

    for qmd in qmd_files:
        errors, warnings = validate_file(qmd, verbose=verbose, check_patterns=check_patterns)
        refs = INLINE_REF.findall(qmd.read_text(encoding="utf-8"))
        if refs:
            total_files += 1
            total_refs += len(refs)
        all_errors.extend(errors)
        all_warnings.extend(warnings)

    print(f"\n{'='*60}")
    print("Inline Python Validation Report")
    print(f"{'='*60}")
    print(f"Files with inline refs: {total_files}")
    print(f"Total inline references: {total_refs}")
    print(f"Unresolved references: {len(all_errors)}")
    if check_patterns:
        print(f"Rendering warnings: {len(all_warnings)}")

    exit_code = 0

    if all_errors:
        print(f"\n{'─'*60}")
        print("ERRORS (will break render):")
        for filepath, lineno, var in all_errors:
            print(f" {filepath}:{lineno} — `{{python}} {var}` undefined/locality violation")
        exit_code = 1

    if all_warnings:
        print(f"\n{'─'*60}")
        print("WARNINGS (may cause incorrect rendering):")

        # Group by warning type so each category is summarised together
        by_type = {}
        for filepath, lineno, wtype, msg in all_warnings:
            by_type.setdefault(wtype, []).append((filepath, lineno, msg))

        for wtype, items in sorted(by_type.items()):
            print(f"\n [{wtype}] ({len(items)} issues)")
            for filepath, lineno, msg in items[:5]:  # Show first 5
                print(f" {filepath}:{lineno} — {msg}")
            if len(items) > 5:
                print(f" ... and {len(items) - 5} more")
        exit_code = 1

    if exit_code == 0:
        print("\n✓ All checks passed!")
    else:
        print(f"\n{'─'*60}")
        print("Fix issues before rendering to ensure correct output.")

    return exit_code


if __name__ == "__main__":
    sys.exit(main())
|