mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-05 17:18:48 -05:00
chore(test): add TikZ style linter for undefined style detection

Scans all Quarto .tikz blocks, collects defined styles, and flags uses of undefined custom styles such as Line or Box. The heuristics focus on tokens starting with an uppercase letter to minimize false positives from colors and built-in keys. Provides file and line context; exits nonzero on findings.
This commit is contained in:
232
tools/scripts/testing/tikz_style_linter.py
Normal file
232
tools/scripts/testing/tikz_style_linter.py
Normal file
@@ -0,0 +1,232 @@
|
||||
#!/usr/bin/env python3

import bisect
import re
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
||||
|
||||
|
||||
# Matches one fenced Quarto TikZ code block: ```{.tikz} ... ``` (non-greedy).
TIKZ_BLOCK_PATTERN = re.compile(r"```\{\.tikz\}[\s\S]*?```", re.MULTILINE)
# Captures the body of a \tikzset{...} group. NOTE(review): the non-greedy
# `[\s\S]*?` stops at the FIRST closing brace, so nested braces truncate the
# body — acceptable because style names appear before their `={...}` values.
TIKZSET_BLOCK_PATTERN = re.compile(r"\\tikzset\s*\{([\s\S]*?)\}")
# Captures the style name in definitions like `MyStyle/.style={...}`.
# BUG FIX: the original pattern used `/\\.style`, which (in a raw string)
# requires a literal backslash between the slash and ".style" and therefore
# never matched the standard TikZ `name/.style` syntax — no defined styles
# were ever collected. `/\.style` matches the real syntax.
STYLE_DEF_PATTERN = re.compile(r"(^|[,\s])([A-Za-z][\w-]*)\s*/\.style\b")
|
||||
|
||||
# Commands that take option lists in square brackets. Each entry is spliced
# into a regex right after a literal backslash, so entries containing regex
# metacharacters must be pre-escaped (see the scope entry below).
OPTIONED_COMMANDS = (
    "draw",
    "node",
    "path",
    "filldraw",
    "shade",
    "clip",
    "coordinate",
    "begin\\{scope\}",  # \begin{scope}[...]
)

# One compiled pattern per command, capturing the bracketed option text,
# e.g. `\draw[thick, MyStyle]` captures "thick, MyStyle".
# NOTE(review): `[^\]]+` stops at the first `]`, so options containing nested
# brackets are truncated — a known limitation of this heuristic linter.
OPTION_CAPTURE_PATTERNS = [
    re.compile(rf"\\{cmd}\s*\[([^\]]+)\]") for cmd in OPTIONED_COMMANDS
]
|
||||
|
||||
|
||||
# Heuristic tokens we will ignore when seen as standalone options, to reduce
# false positives. All entries are lowercase built-in TikZ keywords, so they
# cannot collide with the uppercase-start custom-style heuristic below.
KNOWN_TOKENS: Set[str] = {
    # thickness
    "ultra thin",
    "very thin",
    "thin",
    "semithick",
    "thick",
    "very thick",
    "ultra thick",
    # dashing
    "solid",
    "dashed",
    "densely dashed",
    "loosely dashed",
    "dotted",
    "densely dotted",
    "loosely dotted",
    "dashdotted",
    # common shape keywords
    "circle",
    "rectangle",
    # positioning
    "left",
    "right",
    "above",
    "below",
    # path arrows sometimes appear as tokens, but usually expressed via -{...}
}
|
||||
|
||||
|
||||
def read_text(path: Path) -> str:
    """Read *path* as text, preferring UTF-8.

    If the strict UTF-8 decode fails for any reason, fall back to a lossy
    best-effort read that silently drops undecodable bytes.
    """
    try:
        content = path.read_text(encoding="utf-8")
    except Exception:
        # Best-effort fallback for files with broken or unknown encodings.
        content = path.read_text(errors="ignore")
    return content
|
||||
|
||||
|
||||
def find_tikz_blocks(text: str) -> List[Tuple[int, int, str]]:
    """Locate every fenced ```{.tikz} block in *text*.

    Returns a list of (start_offset, end_offset, block_text) tuples, in
    document order.
    """
    return [
        (match.start(), match.end(), match.group(0))
        for match in TIKZ_BLOCK_PATTERN.finditer(text)
    ]
|
||||
|
||||
|
||||
def collect_defined_styles(text: str) -> Set[str]:
    """Collect every style name defined via ``\\tikzset{ name/.style=... }``.

    Scans all \\tikzset groups in *text* and returns the set of defined
    style names.
    """
    return {
        definition.group(2)
        for group in TIKZSET_BLOCK_PATTERN.finditer(text)
        for definition in STYLE_DEF_PATTERN.finditer(group.group(1))
    }
|
||||
|
||||
|
||||
def extract_option_tokens(option_text: str) -> List[str]:
    """Split a TikZ option string on top-level commas.

    Commas nested inside ``{...}`` or ``[...]`` do not split. Each token is
    whitespace-stripped and empty tokens are dropped.
    """
    tokens: List[str] = []
    current: List[str] = []
    curly_depth = 0
    square_depth = 0

    def flush() -> None:
        # Emit the accumulated characters as one token, if non-empty.
        piece = "".join(current).strip()
        if piece:
            tokens.append(piece)
        current.clear()

    for ch in option_text:
        if ch == "{":
            curly_depth += 1
        elif ch == "}":
            curly_depth = max(0, curly_depth - 1)
        elif ch == "[":
            square_depth += 1
        elif ch == "]":
            square_depth = max(0, square_depth - 1)
        if ch == "," and curly_depth == 0 and square_depth == 0:
            flush()
        else:
            current.append(ch)
    flush()
    return tokens
|
||||
|
||||
|
||||
def looks_like_style_token(token: str) -> bool:
    """Heuristically decide whether *token* could be a custom TikZ style name.

    Only single-word, identifier-like tokens starting with an uppercase
    letter are treated as candidate styles; key=value pairs, arrow specs,
    and known built-in keywords are rejected to limit false positives.

    Cleanup vs. original: the ``'shorten <=' in token`` branch was dead code
    (any such token contains '=' and was rejected earlier), and the
    space-plus-lowercase branch was subsumed by the uppercase-start check;
    both removed. Behavior is unchanged.
    """
    # Empty tokens and tokens not starting with an uppercase letter can
    # never be custom styles under this heuristic.
    if not token or not token[0].isupper():
        return False
    # key=value options are never bare style names (this also covers
    # "shorten <=" / "shorten >=").
    if '=' in token:
        return False
    # Arrow tip specifications or path operators are not styles.
    if token.startswith('-') or token.endswith('-') or '->' in token:
        return False
    # Known built-in keywords are never custom styles.
    if token in KNOWN_TOKENS:
        return False
    # Require a single identifier-like word (letters, digits, '_', '-').
    return bool(re.match(r"^[A-Za-z][\w-]*$", token))
|
||||
|
||||
|
||||
def find_undefined_styles_in_block(block_text: str, defined_styles: Set[str]) -> Set[str]:
    """Return candidate style tokens used in *block_text* but absent from *defined_styles*."""
    unknown: Set[str] = set()
    for pattern in OPTION_CAPTURE_PATTERNS:
        for match in pattern.finditer(block_text):
            for raw in extract_option_tokens(match.group(1)):
                candidate = raw.strip()
                # Keep only plausible custom-style tokens that were never defined.
                if looks_like_style_token(candidate) and candidate not in defined_styles:
                    unknown.add(candidate)
    return unknown
|
||||
|
||||
|
||||
def build_line_index(text: str) -> List[int]:
    """Return the character offset at which each line of *text* begins.

    The first line always starts at offset 0; every newline starts a new
    line at the following character.
    """
    starts = [0]
    nl = text.find("\n")
    while nl != -1:
        starts.append(nl + 1)
        nl = text.find("\n", nl + 1)
    return starts
|
||||
|
||||
|
||||
def offset_to_line(line_starts: List[int], offset: int) -> int:
    """Map a character *offset* to a 1-based line number.

    *line_starts* is a sorted list of line start offsets (as produced by
    ``build_line_index``, so ``line_starts[0] == 0``).

    Idiom fix: the original hand-rolled binary search is replaced with
    ``bisect.bisect_right``, which returns the count of starts <= offset —
    exactly the 1-based line number.
    """
    # max(..., 1) preserves the original's behavior of returning line 1
    # for (invalid) offsets smaller than every line start.
    return max(bisect.bisect_right(line_starts, offset), 1)
|
||||
|
||||
|
||||
def scan_quarto_root(root: Path) -> int:
    """Scan every .qmd file under root/quarto for undefined TikZ style usage.

    Styles defined globally in quarto/tex/header-includes.tex (if present)
    and styles defined locally inside each ```{.tikz} block are treated as
    known. For each offending file, prints the unknown tokens with a line
    number and context line where one can be located.

    Returns the total number of undefined-style findings across all files.
    """
    # Collect global styles from header-includes.tex if present
    global_styles: Set[str] = set()
    header_includes = root / "quarto" / "tex" / "header-includes.tex"
    if header_includes.exists():
        global_styles |= collect_defined_styles(read_text(header_includes))

    qmd_files = list((root / "quarto").rglob("*.qmd"))
    total_issues = 0
    for qmd in qmd_files:
        text = read_text(qmd)
        line_index = build_line_index(text)
        blocks = find_tikz_blocks(text)
        if not blocks:
            continue
        for bstart, bend, btext in blocks:
            # Styles defined inside the block itself also count as defined.
            local_defs = collect_defined_styles(btext)
            defined = set(global_styles) | set(local_defs)
            unknown = find_undefined_styles_in_block(btext, defined)
            if not unknown:
                continue
            total_issues += len(unknown)
            # Try to locate first reference lines for each unknown token
            print(f"File: {qmd}")
            for token in sorted(unknown):
                # find usage position within block (simplified matcher for robustness)
                # NOTE(review): re.escape(token) inside [^\]]* has no word
                # boundary, so the token may match as a substring of a longer
                # option here; the set membership above is the real check.
                usage_pattern = (
                    r"\\(draw|node|path|filldraw|shade|clip|coordinate|begin\{scope\})\s*\["
                    + r"[^\]]*" + re.escape(token) + r"[^\]]*\]"
                )
                m = re.search(usage_pattern, btext)
                if m:
                    pos_in_block = m.start()
                    # Map the block-relative offset to a 1-based file line.
                    line_no = offset_to_line(line_index, bstart + pos_in_block)
                    context_line = text.splitlines()[line_no - 1].rstrip()
                    print(f" line {line_no}: uses undefined style '{token}'")
                    print(f" {context_line}")
                else:
                    print(f" uses undefined style '{token}' (exact line not found)")
            print()
    return total_issues
|
||||
|
||||
|
||||
def main(argv: List[str]) -> int:
    """CLI entry point.

    Lints the repository at argv[1] (default: three directory levels above
    this script). Exit codes: 0 clean, 1 findings, 2 missing root path.
    """
    if len(argv) > 1:
        root = Path(argv[1]).resolve()
    else:
        root = Path(__file__).resolve().parents[3]
    if not root.exists():
        print(f"Root path does not exist: {root}", file=sys.stderr)
        return 2
    issue_count = scan_quarto_root(root)
    if not issue_count:
        print("No undefined TikZ style references found.")
        return 0
    print(f"Found {issue_count} undefined style references.")
    return 1
|
||||
|
||||
|
||||
# Script entry point: process exit status mirrors main()'s return code.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user