mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-11 17:49:25 -05:00
Moves the mlsysim package from book/quarto/mlsysim/ to the repo root so it is importable as a proper top-level package across the codebase. Key changes: - mlsysim/fmt.py: new top-level module for all formatting helpers (fmt, sci, check, md_math, fmt_full, fmt_split, etc.), moved out of viz/ - mlsysim/viz/__init__.py: now exports only plot utilities; dashboard.py (marimo-only) is no longer wildcard-exported and must be imported explicitly by marimo labs - mlsysim/__init__.py: added `from . import fmt` and `from .core import constants`; removed broken `from .viz import plots as viz` alias - execute-env.yml: fixed PYTHONPATH from "../../.." to "../.." so chapters resolve to repo root, not parent of repo - 51 QMD files: updated `from mlsysim.viz import <fmt-fns>` to `from mlsysim.fmt import <fmt-fns>` - book/quarto/mlsys/: legacy shadow package contents cleaned up; stub __init__.py remains for backward compat - All Vol1 and Vol2 chapters verified to build with `binder build pdf`
882 lines
36 KiB
Python
882 lines
36 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
validate_inline_refs.py
|
|
Pre-render guardrail for inline Python in QMD files.
|
|
|
|
Checks:
|
|
1. Every `{python} ref` resolves to a defined variable or ClassName.attr
|
|
2. Every `{python} ref` appears AFTER its definition (Locality)
|
|
3. No inline Python inside LaTeX math mode (causes decimal stripping)
|
|
4. No inline Python adjacent to LaTeX symbols like $\\times$
|
|
5. No grid tables with inline Python (use pipe tables instead)
|
|
6. (--check-lego) callout-notebook blocks without a preceding LEGO cell
|
|
7. (--check-lego) Hardcoded derived results in display math inside callout-notebooks
|
|
8. (--check-scope) Bare variable references in class bodies that need ClassName.attr
|
|
9. (--check-scope) Python 3 class-scope comprehension issues
|
|
|
|
Inline refs may be simple identifiers (`{python} var_name`) or dotted class
|
|
attribute access (`{python} ClassName.attr`).
|
|
|
|
Usage:
|
|
python3 book/quarto/mlsysim/validate_inline_refs.py [--verbose] [--check-patterns] [--check-lego] [--check-scope]
|
|
|
|
Exit codes:
|
|
0 = all checks pass
|
|
1 = issues found
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
DEPRECATION_MSG = (
|
|
"DEPRECATION: use Binder instead of direct script invocation:\n"
|
|
" ./book/binder validate inline-refs [--path <file-or-dir>] [--check-patterns] [--check-lego]"
|
|
)
|
|
|
|
BOOK_ROOT = Path(__file__).resolve().parent.parent # book/quarto/
|
|
CONTENTS = BOOK_ROOT / "contents"
|
|
|
|
# Pattern for inline Python references: `{python} var_name` or `{python} Name.attr`
|
|
INLINE_REF = re.compile(r'`\{python\}\s+(\w+(?:\.\w+)?)`')
|
|
|
|
# Pattern for Python compute cell blocks
|
|
CELL_START = re.compile(r'^```\{python\}')
|
|
CELL_END = re.compile(r'^```\s*$')
|
|
|
|
# Pattern for variable assignments in compute cells (handles tuple unpacking)
|
|
ASSIGNMENT = re.compile(r'^([a-zA-Z_]\w*(?:\s*,\s*[a-zA-Z_]\w*)*)\s*=')
|
|
|
|
# Pattern for class definitions in compute cells
|
|
CLASS_DEF = re.compile(r'^class\s+(\w+)\s*[:(]')
|
|
|
|
# Pattern for Exports: in header block
|
|
EXPORTS_SECTION = re.compile(r'#\s*.\s*[Ee]xports?:\s*(.*)')
|
|
|
|
# Problematic patterns that cause rendering issues
|
|
# Pattern 1: Inline Python directly inside LaTeX math: $`{python}`$ or $..`{python}`$
|
|
# Only matches when $ is immediately followed by backtick-python (within short distance)
|
|
# This avoids false positives from {python} appearing between two separate $...$ pairs
|
|
# EXCLUDES _str variables which are pre-formatted strings (no decimals to strip)
|
|
# Handles both simple `var_str` and dotted `Name.attr_str`
|
|
LATEX_INLINE_PYTHON = re.compile(r'(?<!\\)\$\s*`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`|`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`\s*(?<!\\)\$')
|
|
|
|
# Pattern 2: Inline Python adjacent to LaTeX symbols (decimal stripping risk)
|
|
# Only flags NON-_str variables. Using _str variables adjacent to $\times$ etc. is the
|
|
# PREFERRED convention — see book-prose.md "Multiplication and Times Notation".
|
|
# Handles both simple `var_str` and dotted `Name.attr_str`
|
|
LATEX_ADJACENT = re.compile(r'`\{python\}\s+(?!\w+(?:\.\w+)?_str)[^`]+`\s*\$\\(times|approx|ll|gg|mu)\$')
|
|
|
|
# Pattern 3: Grid table row separator (indicates grid table format)
|
|
GRID_TABLE_SEP = re.compile(r'^\+[-:=+]+\+$')
|
|
|
|
# Pattern 4: Inline f-string formatting (should be pre-computed as _str)
|
|
INLINE_FSTRING = re.compile(r'`\{python\}\s*f"[^`]+`')
|
|
|
|
# Pattern 5: Inline function calls (should be pre-computed as _str)
|
|
INLINE_FUNC_CALL = re.compile(r'`\{python\}\s*\w+\([^`]+\)`')
|
|
|
|
# Pattern 6: Inline Python in YAML chunk options (fig-cap, tbl-cap, fig-alt, lst-cap)
|
|
# These NEVER render — Quarto uses the option value as a literal string (verified by
|
|
# rendering _test_inline_captions.qmd: body and ": Caption {#tbl-...}" run inline Python;
|
|
# #| fig-alt and #| fig-cap do not).
|
|
YAML_OPTION_INLINE = re.compile(r'^#\|\s*(fig-cap|tbl-cap|lst-cap|fig-alt):\s*.*`\{python\}')
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LEGO compliance patterns (--check-lego)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
CALLOUT_NOTEBOOK_START = re.compile(r'^:{2,}\s*\{\.callout-notebook')
|
|
CALLOUT_DIV_END = re.compile(r'^:{3,}\s*$')
|
|
|
|
# Hardcoded numeric result after = or \approx in LaTeX math.
|
|
# Matches patterns like: = 0.046, = 31.25, \approx 12.4, = 14,400
|
|
# Also handles: = \mathbf{6.9}, = \$398,131, = **0.30**
|
|
HARDCODED_RESULT_RE = re.compile(
|
|
r'(?:=\s*|\\approx\s*)'
|
|
r'(?:\\mathbf\{|\\textbf\{|\*\*)?'
|
|
r'-?\d[\d,]*\.?\d*'
|
|
)
|
|
|
|
# A line with `{python}` reference — used to check if a callout uses computed values
|
|
# Matches both `{python} var` and `{python} Name.attr`
|
|
HAS_PYTHON_REF = re.compile(r'`\{python\}\s+\w+(?:\.\w+)?`')
|
|
|
|
# Suppression comment
|
|
LEGO_OK = re.compile(r'<!--\s*lego-ok\s*-->')
|
|
|
|
# Display math delimiters
|
|
DISPLAY_MATH_LINE = re.compile(r'^\$\$')
|
|
|
|
# Lines that contain numeric content worth flagging (digits with decimals or arithmetic)
|
|
HAS_NUMERIC_CONTENT = re.compile(r'\d+\.\d+|\\times|\\frac|\\approx')
|
|
|
|
|
|
def check_lego_compliance(qmd_path, verbose=False):
|
|
"""Check LEGO principle compliance in callout-notebook blocks.
|
|
|
|
Returns list of (filepath, lineno, check_type, message) tuples.
|
|
"""
|
|
text = qmd_path.read_text(encoding="utf-8")
|
|
lines = text.splitlines()
|
|
warnings = []
|
|
filepath = str(qmd_path.relative_to(BOOK_ROOT))
|
|
|
|
# Pre-scan: record lines that are Python cell starts or ends
|
|
python_cell_end_lines = set()
|
|
in_cell = False
|
|
cell_start_line = 0
|
|
for i, line in enumerate(lines):
|
|
if CELL_START.match(line):
|
|
in_cell = True
|
|
cell_start_line = i
|
|
elif in_cell and CELL_END.match(line):
|
|
in_cell = False
|
|
python_cell_end_lines.add(i) # 0-based
|
|
|
|
# Main scan: find callout-notebook blocks
|
|
i = 0
|
|
while i < len(lines):
|
|
line = lines[i]
|
|
|
|
# Check for lego-ok suppression on this line or the next
|
|
if LEGO_OK.search(line):
|
|
i += 1
|
|
continue
|
|
|
|
if CALLOUT_NOTEBOOK_START.match(line):
|
|
callout_start = i # 0-based
|
|
callout_title = line.strip()
|
|
|
|
# Check suppression on the callout start line itself
|
|
if LEGO_OK.search(line):
|
|
# Skip to end of callout
|
|
i += 1
|
|
depth = 1
|
|
while i < len(lines) and depth > 0:
|
|
if re.match(r'^:{3,}\s*\{', lines[i]):
|
|
depth += 1
|
|
elif CALLOUT_DIV_END.match(lines[i]):
|
|
depth -= 1
|
|
i += 1
|
|
continue
|
|
|
|
# --- Check 1: MISSING_LEGO_CELL ---
|
|
# Look backwards up to 15 lines for a Python cell end (```)
|
|
has_preceding_cell = False
|
|
lookback = min(callout_start, 15)
|
|
for j in range(callout_start - 1, callout_start - lookback - 1, -1):
|
|
if j < 0:
|
|
break
|
|
if j in python_cell_end_lines:
|
|
has_preceding_cell = True
|
|
break
|
|
# Stop looking if we hit another callout or heading
|
|
if re.match(r'^#{1,4}\s', lines[j]) or CALLOUT_NOTEBOOK_START.match(lines[j]):
|
|
break
|
|
|
|
# --- Collect callout body ---
|
|
callout_body_lines = []
|
|
i += 1
|
|
depth = 1
|
|
while i < len(lines) and depth > 0:
|
|
if LEGO_OK.search(lines[i]):
|
|
# Suppression inside callout: skip rest
|
|
while i < len(lines) and depth > 0:
|
|
if re.match(r'^:{3,}\s*\{', lines[i]):
|
|
depth += 1
|
|
elif CALLOUT_DIV_END.match(lines[i]):
|
|
depth -= 1
|
|
i += 1
|
|
break
|
|
if re.match(r'^:{3,}\s*\{', lines[i]):
|
|
depth += 1
|
|
elif CALLOUT_DIV_END.match(lines[i]):
|
|
depth -= 1
|
|
if depth == 0:
|
|
i += 1
|
|
break
|
|
callout_body_lines.append((i, lines[i]))
|
|
i += 1
|
|
|
|
# Check if callout has numeric content (display math, decimals, etc.)
|
|
has_numerics = False
|
|
has_python_refs = False
|
|
for _, bline in callout_body_lines:
|
|
if HAS_NUMERIC_CONTENT.search(bline):
|
|
has_numerics = True
|
|
if HAS_PYTHON_REF.search(bline):
|
|
has_python_refs = True
|
|
|
|
if not has_preceding_cell and has_numerics and not has_python_refs:
|
|
warnings.append((filepath, callout_start + 1, "MISSING_LEGO_CELL",
|
|
f"callout-notebook has no preceding Python LEGO cell"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{callout_start + 1} — "
|
|
f"callout-notebook missing LEGO cell")
|
|
|
|
# --- Check 2: HARDCODED_RESULT ---
|
|
in_display_math = False
|
|
display_math_start = 0
|
|
display_math_buf = []
|
|
|
|
for line_idx, bline in callout_body_lines:
|
|
# Per-line suppression
|
|
if LEGO_OK.search(bline):
|
|
continue
|
|
|
|
# Track display math blocks ($$...$$)
|
|
if DISPLAY_MATH_LINE.match(bline.strip()):
|
|
if not in_display_math:
|
|
in_display_math = True
|
|
display_math_start = line_idx
|
|
display_math_buf = [bline]
|
|
else:
|
|
# End of display math
|
|
display_math_buf.append(bline)
|
|
math_text = ' '.join(display_math_buf)
|
|
if (HARDCODED_RESULT_RE.search(math_text)
|
|
and not HAS_PYTHON_REF.search(math_text)):
|
|
warnings.append((filepath, display_math_start + 1,
|
|
"HARDCODED_RESULT",
|
|
"display math has hardcoded numeric result (no {python} ref)"))
|
|
if verbose:
|
|
snippet = math_text[:80].replace('\n', ' ')
|
|
print(f" ⚠ {qmd_path.name}:{display_math_start + 1} — "
|
|
f"hardcoded result: {snippet}…")
|
|
in_display_math = False
|
|
display_math_buf = []
|
|
continue
|
|
|
|
if in_display_math:
|
|
display_math_buf.append(bline)
|
|
continue
|
|
|
|
# Single-line display math: $$ ... $$
|
|
stripped = bline.strip()
|
|
if stripped.startswith('$$') and stripped.endswith('$$') and len(stripped) > 4:
|
|
if (HARDCODED_RESULT_RE.search(stripped)
|
|
and not HAS_PYTHON_REF.search(stripped)):
|
|
warnings.append((filepath, line_idx + 1, "HARDCODED_RESULT",
|
|
"display math has hardcoded numeric result (no {python} ref)"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{line_idx + 1} — hardcoded result")
|
|
continue
|
|
|
|
# Inline math with results: $...= NUMBER...$
|
|
# Only flag if line has = or \approx followed by a number, no {python}
|
|
if ('$' in bline and not HAS_PYTHON_REF.search(bline)
|
|
and HARDCODED_RESULT_RE.search(bline)):
|
|
# Confirm it's inside $...$ math, not prose
|
|
dollar_count = bline.count('$') - bline.count('\\$')
|
|
if dollar_count >= 2:
|
|
warnings.append((filepath, line_idx + 1, "HARDCODED_RESULT",
|
|
"inline math has hardcoded numeric result (no {python} ref)"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{line_idx + 1} — "
|
|
f"hardcoded inline result")
|
|
|
|
continue # already advanced i in the callout body loop
|
|
|
|
i += 1
|
|
|
|
return warnings
|
|
|
|
|
|
def check_rendering_patterns(qmd_path, verbose=False):
|
|
"""Check for patterns that cause rendering issues. Returns list of warnings."""
|
|
text = qmd_path.read_text(encoding="utf-8")
|
|
lines = text.splitlines()
|
|
warnings = []
|
|
filepath = str(qmd_path.relative_to(BOOK_ROOT))
|
|
|
|
# Track if we're in a grid table
|
|
in_grid_table = False
|
|
grid_table_start = 0
|
|
|
|
for i, line in enumerate(lines, 1):
|
|
# Check for inline Python inside LaTeX math
|
|
if LATEX_INLINE_PYTHON.search(line):
|
|
warnings.append((filepath, i, "LATEX_MATH",
|
|
"Inline Python inside $...$ - will strip decimal points"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{i} — Python inside LaTeX math")
|
|
|
|
# Check for non-_str inline Python adjacent to LaTeX symbols (decimal stripping risk)
|
|
# NOTE: _str variables adjacent to $\times$ is the PREFERRED convention.
|
|
if LATEX_ADJACENT.search(line):
|
|
warnings.append((filepath, i, "LATEX_ADJACENT",
|
|
"Non-_str inline Python adjacent to $\\\\times$ (decimal stripping risk)"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{i} — Non-_str Python adjacent to LaTeX symbol")
|
|
|
|
# Track grid tables
|
|
if GRID_TABLE_SEP.match(line.strip()):
|
|
if not in_grid_table:
|
|
in_grid_table = True
|
|
grid_table_start = i
|
|
elif in_grid_table and not line.strip().startswith('|') and line.strip():
|
|
in_grid_table = False
|
|
|
|
# Check for inline Python in grid tables
|
|
if in_grid_table and '`{python}' in line:
|
|
warnings.append((filepath, grid_table_start, "GRID_TABLE",
|
|
"Grid table with inline Python - convert to pipe table"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{i} — Python in grid table")
|
|
|
|
# Check for inline f-string formatting (should be pre-computed)
|
|
if INLINE_FSTRING.search(line):
|
|
warnings.append((filepath, i, "INLINE_FSTRING",
|
|
"Inline f-string - pre-compute as _str variable in Python block"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{i} — Inline f-string")
|
|
|
|
# Check for inline function calls (should be pre-computed)
|
|
if INLINE_FUNC_CALL.search(line):
|
|
warnings.append((filepath, i, "INLINE_FUNC",
|
|
"Inline function call - pre-compute as _str variable in Python block"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{i} — Inline function call")
|
|
|
|
# Check for inline Python in YAML chunk options (fig-cap, fig-alt, tbl-cap, lst-cap) — NEVER renders
|
|
if YAML_OPTION_INLINE.search(line):
|
|
warnings.append((filepath, i, "YAML_OPTION",
|
|
"Inline Python in #| fig-alt/fig-cap/tbl-cap/lst-cap - NEVER renders! Use hardcoded value or set caption in code."))
|
|
if verbose:
|
|
print(f" ✗ {qmd_path.name}:{i} — Python in YAML option (will appear literally)")
|
|
|
|
return warnings
|
|
|
|
|
|
import ast
|
|
import importlib
|
|
|
|
PYTHON_BUILTINS = set(dir(__builtins__)) if isinstance(__builtins__, dict) else set(dir(__builtins__))
|
|
PYTHON_BUILTINS |= {
|
|
'int', 'float', 'str', 'list', 'dict', 'set', 'tuple', 'bool', 'bytes',
|
|
'range', 'len', 'print', 'type', 'isinstance', 'enumerate', 'zip', 'map',
|
|
'filter', 'sorted', 'reversed', 'min', 'max', 'sum', 'abs', 'round',
|
|
'True', 'False', 'None', 'super', 'property', 'staticmethod', 'classmethod',
|
|
'Exception', 'ValueError', 'TypeError', 'KeyError', 'IndexError',
|
|
'RuntimeError', 'AttributeError', 'NameError', 'ImportError',
|
|
'NotImplementedError', 'StopIteration', 'AssertionError',
|
|
'open', 'getattr', 'setattr', 'hasattr', 'delattr', 'callable',
|
|
'any', 'all', 'id', 'hash', 'hex', 'oct', 'bin', 'chr', 'ord',
|
|
'format', 'repr', 'input', 'breakpoint', 'object', 'complex',
|
|
'frozenset', 'bytearray', 'memoryview', 'vars', 'dir', 'globals', 'locals',
|
|
'exec', 'eval', 'compile', 'iter', 'next',
|
|
'ZeroDivisionError', 'FileNotFoundError', 'OSError', 'IOError',
|
|
'OverflowError', 'ArithmeticError', 'SystemError', 'Warning',
|
|
'DeprecationWarning', 'UserWarning', 'FutureWarning',
|
|
}
|
|
|
|
_star_import_cache: dict = {}
|
|
_sys_path_patched = False
|
|
|
|
|
|
def _ensure_book_on_path():
|
|
"""Add book/quarto to sys.path so mlsysim.* imports resolve for star-import analysis."""
|
|
global _sys_path_patched
|
|
if _sys_path_patched:
|
|
return
|
|
book_quarto = str(BOOK_ROOT)
|
|
if book_quarto not in sys.path:
|
|
sys.path.insert(0, book_quarto)
|
|
_sys_path_patched = True
|
|
|
|
|
|
def _resolve_star_import(module_name):
|
|
"""Try to resolve names exported by a `from X import *` statement.
|
|
Returns a set of names, or empty set on failure."""
|
|
if module_name in _star_import_cache:
|
|
return _star_import_cache[module_name]
|
|
_ensure_book_on_path()
|
|
try:
|
|
mod = importlib.import_module(module_name)
|
|
if hasattr(mod, '__all__'):
|
|
names = set(mod.__all__)
|
|
else:
|
|
names = {n for n in dir(mod) if not n.startswith('_')}
|
|
_star_import_cache[module_name] = names
|
|
return names
|
|
except Exception:
|
|
_star_import_cache[module_name] = set()
|
|
return set()
|
|
|
|
|
|
def _extract_cell_blocks(qmd_path):
|
|
"""Extract Python code cells from a QMD file.
|
|
|
|
Returns list of (start_line_1based, source_code) tuples.
|
|
"""
|
|
text = qmd_path.read_text(encoding="utf-8")
|
|
lines = text.splitlines()
|
|
cells = []
|
|
in_cell = False
|
|
cell_start = 0
|
|
cell_lines = []
|
|
|
|
for i, line in enumerate(lines):
|
|
if CELL_START.match(line):
|
|
in_cell = True
|
|
cell_start = i + 1
|
|
cell_lines = []
|
|
elif in_cell and CELL_END.match(line):
|
|
in_cell = False
|
|
cells.append((cell_start, '\n'.join(cell_lines)))
|
|
elif in_cell:
|
|
cell_lines.append(line)
|
|
|
|
return cells
|
|
|
|
|
|
def _collect_all_assignments(node):
|
|
"""Recursively collect all names assigned anywhere under an AST node."""
|
|
assigned = set()
|
|
for child in ast.walk(node):
|
|
if isinstance(child, ast.Assign):
|
|
for target in child.targets:
|
|
for t in ast.walk(target):
|
|
if isinstance(t, ast.Name) and isinstance(t.ctx, ast.Store):
|
|
assigned.add(t.id)
|
|
elif isinstance(child, ast.AugAssign) and isinstance(child.target, ast.Name):
|
|
assigned.add(child.target.id)
|
|
elif isinstance(child, ast.AnnAssign) and isinstance(child.target, ast.Name):
|
|
assigned.add(child.target.id)
|
|
elif isinstance(child, ast.NamedExpr) and isinstance(child.target, ast.Name):
|
|
assigned.add(child.target.id)
|
|
return assigned
|
|
|
|
|
|
def _names_loaded_in_node(node):
|
|
"""Collect all Name nodes in Load context from an AST node,
|
|
excluding names locally bound by function params/locals, lambda params,
|
|
comprehension vars, for-loop vars, and exception handlers."""
|
|
names = set()
|
|
locally_bound = set()
|
|
|
|
for child in ast.walk(node):
|
|
if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
for arg in child.args.args + child.args.posonlyargs + child.args.kwonlyargs:
|
|
locally_bound.add(arg.arg)
|
|
if child.args.vararg:
|
|
locally_bound.add(child.args.vararg.arg)
|
|
if child.args.kwarg:
|
|
locally_bound.add(child.args.kwarg.arg)
|
|
locally_bound |= _collect_all_assignments(child)
|
|
elif isinstance(child, ast.Lambda):
|
|
for arg in child.args.args:
|
|
locally_bound.add(arg.arg)
|
|
elif isinstance(child, ast.comprehension):
|
|
if isinstance(child.target, ast.Name):
|
|
locally_bound.add(child.target.id)
|
|
elif isinstance(child.target, ast.Tuple):
|
|
for elt in child.target.elts:
|
|
if isinstance(elt, ast.Name):
|
|
locally_bound.add(elt.id)
|
|
elif isinstance(child, ast.For):
|
|
if isinstance(child.target, ast.Name):
|
|
locally_bound.add(child.target.id)
|
|
elif isinstance(child.target, ast.Tuple):
|
|
for elt in child.target.elts:
|
|
if isinstance(elt, ast.Name):
|
|
locally_bound.add(elt.id)
|
|
elif isinstance(child, ast.ExceptHandler) and child.name:
|
|
locally_bound.add(child.name)
|
|
|
|
for child in ast.walk(node):
|
|
if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Load):
|
|
if child.id not in locally_bound:
|
|
names.add(child.id)
|
|
elif isinstance(child, ast.Attribute) and isinstance(child.ctx, ast.Load):
|
|
if isinstance(child.value, ast.Name) and child.value.id not in locally_bound:
|
|
names.add(child.value.id)
|
|
return names
|
|
|
|
|
|
def _names_defined_in_class(class_node):
|
|
"""Collect names defined (assigned, imported, function-defined) in a class body.
|
|
|
|
In Python, assignments inside for-loops, if-blocks, with-blocks, and try-blocks
|
|
at the class level all define class-level attributes (they share the class scope).
|
|
"""
|
|
defined = set()
|
|
|
|
def _collect(stmts):
|
|
for stmt in stmts:
|
|
if isinstance(stmt, ast.Assign):
|
|
for target in stmt.targets:
|
|
for t in ast.walk(target):
|
|
if isinstance(t, ast.Name) and isinstance(t.ctx, ast.Store):
|
|
defined.add(t.id)
|
|
elif isinstance(stmt, ast.AnnAssign) and isinstance(stmt.target, ast.Name):
|
|
defined.add(stmt.target.id)
|
|
elif isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
defined.add(stmt.name)
|
|
elif isinstance(stmt, (ast.Import, ast.ImportFrom)):
|
|
for alias in stmt.names:
|
|
defined.add(alias.asname if alias.asname else alias.name.split('.')[0])
|
|
elif isinstance(stmt, ast.AugAssign) and isinstance(stmt.target, ast.Name):
|
|
defined.add(stmt.target.id)
|
|
elif isinstance(stmt, ast.For):
|
|
if isinstance(stmt.target, ast.Name):
|
|
defined.add(stmt.target.id)
|
|
elif isinstance(stmt.target, ast.Tuple):
|
|
for elt in stmt.target.elts:
|
|
if isinstance(elt, ast.Name):
|
|
defined.add(elt.id)
|
|
_collect(stmt.body)
|
|
_collect(stmt.orelse)
|
|
elif isinstance(stmt, ast.If):
|
|
_collect(stmt.body)
|
|
_collect(stmt.orelse)
|
|
elif isinstance(stmt, ast.With):
|
|
for item in stmt.items:
|
|
if item.optional_vars and isinstance(item.optional_vars, ast.Name):
|
|
defined.add(item.optional_vars.id)
|
|
_collect(stmt.body)
|
|
elif isinstance(stmt, ast.Try):
|
|
_collect(stmt.body)
|
|
for handler in stmt.handlers:
|
|
_collect(handler.body)
|
|
_collect(stmt.orelse)
|
|
_collect(stmt.finalbody)
|
|
|
|
_collect(class_node.body)
|
|
return defined
|
|
|
|
|
|
def _find_comprehension_scope_issues(class_node):
|
|
"""Detect class-scope variables referenced inside list/dict/set/generator comprehensions.
|
|
|
|
In Python 3, comprehensions inside class bodies cannot access class-level names
|
|
(except the iterable expression). This is a well-known scoping gotcha.
|
|
Returns list of (lineno, name) tuples for problematic references.
|
|
"""
|
|
issues = []
|
|
class_local_names = _names_defined_in_class(class_node)
|
|
|
|
for stmt in ast.walk(class_node):
|
|
if isinstance(stmt, (ast.ListComp, ast.SetComp, ast.GeneratorExp, ast.DictComp)):
|
|
comp_iter_names = set()
|
|
for gen in stmt.generators:
|
|
if isinstance(gen.target, ast.Name):
|
|
comp_iter_names.add(gen.target.id)
|
|
elif isinstance(gen.target, ast.Tuple):
|
|
for elt in gen.target.elts:
|
|
if isinstance(elt, ast.Name):
|
|
comp_iter_names.add(elt.id)
|
|
|
|
if isinstance(stmt, ast.ListComp):
|
|
inner_node = stmt.elt
|
|
elif isinstance(stmt, ast.SetComp):
|
|
inner_node = stmt.elt
|
|
elif isinstance(stmt, ast.GeneratorExp):
|
|
inner_node = stmt.elt
|
|
elif isinstance(stmt, ast.DictComp):
|
|
inner_node = stmt # check key and value
|
|
|
|
for child in ast.walk(inner_node):
|
|
if isinstance(child, ast.Name) and isinstance(child.ctx, ast.Load):
|
|
name = child.id
|
|
if (name in class_local_names
|
|
and name not in comp_iter_names
|
|
and name not in PYTHON_BUILTINS):
|
|
if isinstance(child, ast.Name) and hasattr(child, 'lineno'):
|
|
issues.append((child.lineno, name))
|
|
return issues
|
|
|
|
|
|
def check_scope(qmd_path, verbose=False):
|
|
"""Check for bare variable references inside class bodies that need ClassName. prefix.
|
|
|
|
Detects two classes of bugs:
|
|
1. BARE_CLASS_REF: A class body references a name that isn't locally defined,
|
|
imported, or a builtin — likely needs a ClassName.attr prefix from a prior class.
|
|
2. COMPREHENSION_SCOPE: A list comprehension inside a class body references a
|
|
class-local function/variable, which fails in Python 3 due to implicit scope.
|
|
|
|
Returns list of (filepath, lineno, check_type, message) tuples.
|
|
"""
|
|
cells = _extract_cell_blocks(qmd_path)
|
|
try:
|
|
filepath = str(qmd_path.relative_to(BOOK_ROOT))
|
|
except ValueError:
|
|
filepath = str(qmd_path)
|
|
warnings = []
|
|
|
|
module_scope_names = set()
|
|
known_classes = {} # class_name -> set of attribute names
|
|
|
|
for cell_start, source in cells:
|
|
try:
|
|
tree = ast.parse(source, filename=str(qmd_path))
|
|
except SyntaxError:
|
|
continue
|
|
|
|
cell_imports = set()
|
|
cell_top_level_names = set()
|
|
for stmt in tree.body:
|
|
if isinstance(stmt, (ast.Import, ast.ImportFrom)):
|
|
if isinstance(stmt, ast.ImportFrom) and any(
|
|
a.name == '*' for a in stmt.names
|
|
):
|
|
star_names = _resolve_star_import(stmt.module or '')
|
|
cell_imports |= star_names
|
|
cell_top_level_names |= star_names
|
|
else:
|
|
for alias in stmt.names:
|
|
name = alias.asname if alias.asname else alias.name.split('.')[0]
|
|
cell_imports.add(name)
|
|
cell_top_level_names.add(name)
|
|
elif isinstance(stmt, ast.Assign):
|
|
for target in stmt.targets:
|
|
if isinstance(target, ast.Name):
|
|
cell_top_level_names.add(target.id)
|
|
elif isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
cell_top_level_names.add(stmt.name)
|
|
|
|
for stmt in tree.body:
|
|
if not isinstance(stmt, ast.ClassDef):
|
|
continue
|
|
|
|
class_name = stmt.name
|
|
local_defs = _names_defined_in_class(stmt)
|
|
known_classes[class_name] = local_defs
|
|
|
|
all_loaded = set()
|
|
for child_stmt in stmt.body:
|
|
if isinstance(child_stmt, (ast.Expr,)) and isinstance(child_stmt.value, ast.Constant):
|
|
continue
|
|
all_loaded |= _names_loaded_in_node(child_stmt)
|
|
|
|
unresolved = (
|
|
all_loaded
|
|
- local_defs
|
|
- cell_imports
|
|
- cell_top_level_names
|
|
- module_scope_names
|
|
- PYTHON_BUILTINS
|
|
- {class_name}
|
|
)
|
|
|
|
for name in sorted(unresolved):
|
|
candidates = [
|
|
cn for cn, attrs in known_classes.items()
|
|
if name in attrs and cn != class_name
|
|
]
|
|
qmd_line = cell_start + _find_name_line(source, name)
|
|
if candidates:
|
|
suggestion = f"{candidates[0]}.{name}"
|
|
warnings.append((filepath, qmd_line, "BARE_CLASS_REF",
|
|
f"bare `{name}` in class `{class_name}` — did you mean `{suggestion}`?"))
|
|
else:
|
|
warnings.append((filepath, qmd_line, "BARE_CLASS_REF",
|
|
f"bare `{name}` in class `{class_name}` — not defined locally or in prior classes"))
|
|
if verbose:
|
|
if candidates:
|
|
print(f" ⚠ {qmd_path.name}:{qmd_line} — bare `{name}` in `{class_name}` "
|
|
f"(try `{candidates[0]}.{name}`)")
|
|
else:
|
|
print(f" ⚠ {qmd_path.name}:{qmd_line} — bare `{name}` in `{class_name}` (undefined)")
|
|
|
|
comp_issues = _find_comprehension_scope_issues(stmt)
|
|
for lineno, name in comp_issues:
|
|
qmd_line = cell_start + lineno - 1
|
|
warnings.append((filepath, qmd_line, "COMPREHENSION_SCOPE",
|
|
f"`{name}` in comprehension inside class `{class_name}` — "
|
|
f"Python 3 class-scope comprehensions cannot access class-level names"))
|
|
if verbose:
|
|
print(f" ⚠ {qmd_path.name}:{qmd_line} — `{name}` in comprehension "
|
|
f"(Python 3 class scope issue)")
|
|
|
|
module_scope_names |= cell_top_level_names
|
|
for cn in known_classes:
|
|
module_scope_names.add(cn)
|
|
|
|
return warnings
|
|
|
|
|
|
def _find_name_line(source, name):
|
|
"""Find approximate line number of a name reference in source code."""
|
|
for i, line in enumerate(source.splitlines()):
|
|
if re.search(rf'\b{re.escape(name)}\b', line):
|
|
if not line.strip().startswith('#'):
|
|
return i
|
|
return 0
|
|
|
|
|
|
def validate_file(qmd_path, verbose=False, check_patterns=False, check_lego=False,
|
|
check_scope_flag=False):
|
|
"""Validate one QMD file. Returns (errors, warnings)."""
|
|
text = qmd_path.read_text(encoding="utf-8")
|
|
lines = text.splitlines()
|
|
|
|
errors = []
|
|
defined_vars = set()
|
|
defined_classes = set()
|
|
in_cell = False
|
|
in_exports = False
|
|
|
|
for i, line in enumerate(lines, 1):
|
|
# 1. Track variable and class definitions in cells
|
|
if CELL_START.match(line):
|
|
in_cell = True
|
|
continue
|
|
if in_cell and CELL_END.match(line):
|
|
in_cell = False
|
|
in_exports = False
|
|
continue
|
|
|
|
if in_cell:
|
|
# Check for class definitions: class ClassName:
|
|
cm = CLASS_DEF.match(line.strip())
|
|
if cm:
|
|
defined_classes.add(cm.group(1))
|
|
|
|
# Check for assignments: var = ... or var1, var2 = ...
|
|
m = ASSIGNMENT.match(line.strip())
|
|
if m:
|
|
vars_part = m.group(1)
|
|
for v in re.split(r'[,\s]+', vars_part):
|
|
if v.strip():
|
|
defined_vars.add(v.strip())
|
|
|
|
# Check for Exports: in header
|
|
m = EXPORTS_SECTION.match(line.strip())
|
|
if m:
|
|
in_exports = True
|
|
vars_raw = m.group(1)
|
|
# Remove unit parentheticals like (MB, GB)
|
|
vars_raw = re.sub(r'\(.*?\)', '', vars_raw)
|
|
for v in re.split(r'[,\s]+', vars_raw):
|
|
v = v.strip().rstrip(',')
|
|
if v:
|
|
defined_vars.add(v)
|
|
elif in_exports:
|
|
# Continuation of exports
|
|
m = re.match(r'#\s*.\s*(.*)', line.strip())
|
|
if m:
|
|
content = m.group(1).strip()
|
|
# If content starts with a section like 'Goal:', stop
|
|
if re.match(r'^[A-Z][a-z]+:', content):
|
|
in_exports = False
|
|
elif content == "" or "──" in content:
|
|
in_exports = False
|
|
else:
|
|
vars_raw = re.sub(r'\(.*?\)', '', content)
|
|
for v in re.split(r'[,\s]+', vars_raw):
|
|
v = v.strip().rstrip(',')
|
|
if v:
|
|
defined_vars.add(v)
|
|
else:
|
|
in_exports = False
|
|
continue # Don't check for refs inside compute cells
|
|
|
|
# 2. Check inline references for Locality
|
|
for m in INLINE_REF.finditer(line):
|
|
ref = m.group(1)
|
|
if '.' in ref:
|
|
cls_name = ref.split('.', 1)[0]
|
|
resolved = cls_name in defined_classes or cls_name in defined_vars
|
|
else:
|
|
resolved = ref in defined_vars
|
|
if not resolved:
|
|
errors.append((str(qmd_path.relative_to(BOOK_ROOT)), i, ref))
|
|
if verbose:
|
|
print(f" ✗ {qmd_path.name}:{i} — `{{python}} {ref}` used before definition (Locality Violation)")
|
|
|
|
warnings = []
|
|
if check_patterns:
|
|
warnings = check_rendering_patterns(qmd_path, verbose)
|
|
if check_lego:
|
|
warnings.extend(check_lego_compliance(qmd_path, verbose))
|
|
if check_scope_flag:
|
|
warnings.extend(check_scope(qmd_path, verbose))
|
|
|
|
return errors, warnings
|
|
|
|
|
|
def main():
|
|
print(DEPRECATION_MSG, file=sys.stderr)
|
|
|
|
verbose = "--verbose" in sys.argv or "-v" in sys.argv
|
|
check_patterns = "--check-patterns" in sys.argv or "-p" in sys.argv
|
|
check_lego = "--check-lego" in sys.argv or "-l" in sys.argv
|
|
check_scope_flag = "--check-scope" in sys.argv or "-s" in sys.argv
|
|
|
|
# Check for path argument
|
|
args = [a for a in sys.argv[1:] if not a.startswith("-")]
|
|
if args:
|
|
target_path = Path(args[0]).resolve()
|
|
if target_path.is_file():
|
|
qmd_files = [target_path]
|
|
else:
|
|
qmd_files = sorted(target_path.rglob("*.qmd"))
|
|
else:
|
|
qmd_files = sorted(CONTENTS.rglob("*.qmd"))
|
|
total_files = 0
|
|
total_refs = 0
|
|
all_errors = []
|
|
all_warnings = []
|
|
|
|
for qmd in qmd_files:
|
|
errors, warnings = validate_file(
|
|
qmd, verbose=verbose,
|
|
check_patterns=check_patterns, check_lego=check_lego,
|
|
check_scope_flag=check_scope_flag,
|
|
)
|
|
text = qmd.read_text(encoding="utf-8")
|
|
refs = INLINE_REF.findall(text)
|
|
if refs:
|
|
total_files += 1
|
|
total_refs += len(refs)
|
|
all_errors.extend(errors)
|
|
all_warnings.extend(warnings)
|
|
|
|
print(f"\n{'='*60}")
|
|
print(f"Inline Python Validation Report")
|
|
print(f"{'='*60}")
|
|
print(f"Files with inline refs: {total_files}")
|
|
print(f"Total inline references: {total_refs}")
|
|
print(f"Unresolved references: {len(all_errors)}")
|
|
if check_patterns or check_lego:
|
|
print(f"Rendering warnings: {len(all_warnings)}")
|
|
|
|
exit_code = 0
|
|
|
|
if all_errors:
|
|
print(f"\n{'─'*60}")
|
|
print("ERRORS (will break render):")
|
|
for filepath, lineno, var in all_errors:
|
|
print(f" {filepath}:{lineno} — `{{python}} {var}` undefined/locality violation")
|
|
exit_code = 1
|
|
|
|
if all_warnings:
|
|
print(f"\n{'─'*60}")
|
|
print("WARNINGS (may cause incorrect rendering):")
|
|
# Group by type
|
|
by_type = {}
|
|
for filepath, lineno, wtype, msg in all_warnings:
|
|
by_type.setdefault(wtype, []).append((filepath, lineno, msg))
|
|
|
|
for wtype, items in sorted(by_type.items()):
|
|
print(f"\n [{wtype}] ({len(items)} issues)")
|
|
for filepath, lineno, msg in items:
|
|
print(f" {filepath}:{lineno} — {msg}")
|
|
exit_code = 1
|
|
|
|
if exit_code == 0:
|
|
print("\n✓ All checks passed!")
|
|
else:
|
|
print(f"\n{'─'*60}")
|
|
print("Fix issues before rendering to ensure correct output.")
|
|
|
|
return exit_code
|
|
|
|
|
|
if __name__ == "__main__":
|
|
sys.exit(main())
|