cs249r_book/book/tools/scripts/testing/tikz_style_linter.py

#!/usr/bin/env python3

import bisect
import re
import sys
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Set, Tuple


# A fenced Quarto code block tagged ``{.tikz}``, matched lazily up to the next
# closing fence (the fences themselves are part of the match).
TIKZ_BLOCK_PATTERN = re.compile(r"```\{\.tikz\}[\s\S]*?```", re.MULTILINE)
# ``\tikzset{...}``: group 1 is the body. The match is lazy and NOT
# brace-balanced, so group 1 ends at the first ``}`` that follows the opener.
TIKZSET_BLOCK_PATTERN = re.compile(r"\\tikzset\s*\{([\s\S]*?)\}")
# ``Name/.style`` definitions: group 2 is the style name. The dot is escaped
# with a single backslash so the pattern matches the literal ``/.style`` key
# handler (a doubled backslash would demand a literal ``\`` after the slash
# and never match real TikZ source).
STYLE_DEF_PATTERN = re.compile(r"(^|[,\s])([A-Za-z][\w-]*)\s*/\.style\b")

# Commands that take option lists in square brackets.
# Each entry is a regex fragment that is spliced in right after a literal
# backslash, so entries containing regex metacharacters must be written as
# raw strings (a plain "\}" is an invalid escape sequence in Python).
OPTIONED_COMMANDS = (
    "draw",
    "node",
    "path",
    "filldraw",
    "shade",
    "clip",
    "coordinate",
    r"begin\{scope\}",  # \begin{scope}[...]
)

# One compiled pattern per command; group 1 is the text between "[" and the
# first "]" that follows it.
OPTION_CAPTURE_PATTERNS = [
    re.compile(rf"\\{cmd}\s*\[([^\]]+)\]") for cmd in OPTIONED_COMMANDS
]


# Built-in TikZ option keywords, grouped by purpose. A standalone option equal
# to one of these is ignored by the style heuristic to reduce false positives.
_LINE_WIDTH_TOKENS = (
    "ultra thin",
    "very thin",
    "thin",
    "semithick",
    "thick",
    "very thick",
    "ultra thick",
)
_DASH_PATTERN_TOKENS = (
    "solid",
    "dashed",
    "densely dashed",
    "loosely dashed",
    "dotted",
    "densely dotted",
    "loosely dotted",
    "dashdotted",
)
_SHAPE_TOKENS = ("circle", "rectangle")
_POSITION_TOKENS = ("left", "right", "above", "below")

# Arrow specifications are deliberately absent: they sometimes appear as
# tokens, but are usually expressed via -{...}.
KNOWN_TOKENS: Set[str] = {
    *_LINE_WIDTH_TOKENS,
    *_DASH_PATTERN_TOKENS,
    *_SHAPE_TOKENS,
    *_POSITION_TOKENS,
}


def read_text(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are dropped instead of aborting
    the scan, so the result is the same on every platform regardless of the
    locale's default encoding. For valid UTF-8 input the result is identical
    to a strict decode.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8", errors="ignore")


def find_tikz_blocks(text: str) -> List[Tuple[int, int, str]]:
    """Locate every match of TIKZ_BLOCK_PATTERN in *text*.

    Returns:
        One ``(start, end, block_text)`` tuple per match, in the order the
        matches occur. ``start``/``end`` are character offsets into *text*
        and ``block_text`` is the full matched span.
    """
    return [
        (match.start(), match.end(), match.group(0))
        for match in TIKZ_BLOCK_PATTERN.finditer(text)
    ]


def _balanced_group_body(text: str, start: int) -> Optional[str]:
    """Return the contents of a brace group whose ``{`` sits just before *start*.

    Scans forward from *start*, tracking nested ``{``/``}`` pairs, and returns
    the text up to (not including) the matching close brace. Returns ``None``
    when the group is never closed.
    """
    depth = 1
    pos = start
    length = len(text)
    while pos < length:
        ch = text[pos]
        if ch == "\\":
            # Skip the escaped character so \{ and \} do not affect nesting.
            pos += 2
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start:pos]
        pos += 1
    return None


def collect_defined_styles(text: str) -> Set[str]:
    """Collect the style names defined via ``Name/.style`` inside ``\\tikzset{...}``.

    TIKZSET_BLOCK_PATTERN is only used to find where each ``\\tikzset`` body
    starts; the body itself is extracted by brace matching, because style
    definitions normally contain nested groups (``Box/.style={draw, ...}``)
    and stopping at the first ``}`` would hide every style after the first.

    Args:
        text: TeX/TikZ source to scan.

    Returns:
        The set of names captured by group 2 of STYLE_DEF_PATTERN.
    """
    styles: Set[str] = set()
    for m in TIKZSET_BLOCK_PATTERN.finditer(text):
        body = _balanced_group_body(text, m.start(1))
        if body is None:
            # Unbalanced braces: fall back to what the regex itself captured.
            body = m.group(1)
        styles.update(s.group(2) for s in STYLE_DEF_PATTERN.finditer(body))
    return styles


def extract_option_tokens(option_text: str) -> List[str]:
    """Split a TikZ option list on its top-level commas.

    Commas nested inside ``{...}`` or ``[...]`` do not split. Each piece is
    whitespace-stripped and empty pieces are discarded. The grouping
    characters themselves stay in the returned tokens, and an unmatched
    closing character never drives a nesting counter below zero.
    """
    pieces: List[str] = []
    current: List[str] = []
    nesting = {"{": 0, "[": 0}
    opener_for = {"}": "{", "]": "["}

    def flush() -> None:
        # Emit the accumulated characters as one token, if non-blank.
        piece = "".join(current).strip()
        if piece:
            pieces.append(piece)
        current.clear()

    for char in option_text:
        if char in nesting:
            nesting[char] += 1
        elif char in opener_for:
            opener = opener_for[char]
            nesting[opener] = max(0, nesting[opener] - 1)

        if char == "," and not any(nesting.values()):
            flush()
            continue
        current.append(char)

    flush()
    return pieces


def looks_like_style_token(token: str) -> bool:
    """Heuristically decide whether *token* could name a custom TikZ style.

    A token qualifies only if it starts with an uppercase letter and consists
    solely of word characters and hyphens. Key=value options, ``shorten``
    settings, arrow/path specifications (leading or trailing ``-``, or any
    ``->``) and entries of KNOWN_TOKENS are rejected.
    """
    is_assignment = "=" in token
    is_shorten = "shorten <=" in token or "shorten >=" in token
    is_arrow_spec = token.startswith("-") or token.endswith("-") or "->" in token
    if is_assignment or is_shorten or is_arrow_spec or token in KNOWN_TOKENS:
        return False

    # Custom styles are capitalised by convention in this heuristic; anything
    # else (including multi-word lowercase keywords) is taken as built-in.
    if not (token and token[0].isupper()):
        return False

    return re.match(r"^[A-Za-z][\w-]*$", token) is not None


def find_undefined_styles_in_block(block_text: str, defined_styles: Set[str]) -> Set[str]:
    """Return style-like option tokens used in *block_text* but not defined.

    Every option list captured by OPTION_CAPTURE_PATTERNS is split into
    tokens; a token is reported when it passes looks_like_style_token() and
    is absent from *defined_styles*.
    """
    candidates = (
        raw.strip()
        for pattern in OPTION_CAPTURE_PATTERNS
        for hit in pattern.finditer(block_text)
        for raw in extract_option_tokens(hit.group(1))
    )
    return {
        name
        for name in candidates
        if looks_like_style_token(name) and name not in defined_styles
    }


def build_line_index(text: str) -> List[int]:
    """Return the character offset at which each line of *text* starts.

    Only ``"\\n"`` counts as a line terminator. The first entry is always 0,
    and a trailing newline contributes a final entry for the empty last line.
    """
    starts = [0]
    newline_at = text.find("\n")
    while newline_at != -1:
        starts.append(newline_at + 1)
        newline_at = text.find("\n", newline_at + 1)
    return starts


def offset_to_line(line_starts: List[int], offset: int) -> int:
    """Map a character offset to its 1-based line number.

    Args:
        line_starts: ascending start offsets of each line, as produced by
            build_line_index().
        offset: character offset into the same text.

    Returns:
        The number of the last line whose start is <= *offset*. Offsets that
        precede every entry (or an empty *line_starts*) map to line 1.
    """
    # bisect_right counts the line starts that are <= offset, which is exactly
    # the 1-based line number; clamp so the result is never below line 1.
    return max(1, bisect.bisect_right(line_starts, offset))


def _first_usage_offset(block_text: str, token: str) -> Optional[int]:
    """Return the offset of the first command in *block_text* whose option list mentions *token*.

    The token must appear as a whole name (not as part of a longer
    identifier). Returns ``None`` when no such command is found.
    """
    usage_pattern = (
        r"\\(?:" + "|".join(OPTIONED_COMMANDS) + r")\s*\["
        + r"[^\]]*(?<![\w-])" + re.escape(token) + r"(?![\w-])[^\]]*\]"
    )
    m = re.search(usage_pattern, block_text)
    return m.start() if m else None


def _line_text(text: str, line_starts: List[int], line_no: int) -> str:
    """Return 1-based line *line_no* of *text*, right-stripped, using *line_starts*."""
    start = line_starts[line_no - 1]
    end = line_starts[line_no] if line_no < len(line_starts) else len(text)
    return text[start:end].rstrip()


def scan_quarto_root(root: Path) -> int:
    """Scan every ``*.qmd`` under ``root/quarto`` and report undefined TikZ styles.

    Styles defined in ``root/quarto/tex/header-includes.tex`` (when that file
    exists) count as defined everywhere; styles defined inside a TikZ block
    count as defined for that block only. Findings are printed to stdout,
    with files visited in sorted order so reports are reproducible.

    Args:
        root: directory that contains the ``quarto`` folder.

    Returns:
        Total number of undefined style names, counted once per block.
    """
    # Collect global styles from header-includes.tex if present
    global_styles: Set[str] = set()
    header_includes = root / "quarto" / "tex" / "header-includes.tex"
    if header_includes.exists():
        global_styles |= collect_defined_styles(read_text(header_includes))

    total_issues = 0
    for qmd in sorted((root / "quarto").rglob("*.qmd")):
        text = read_text(qmd)
        blocks = find_tikz_blocks(text)
        if not blocks:
            continue
        # Built only for files that actually contain TikZ blocks.
        line_index = build_line_index(text)
        for bstart, _bend, btext in blocks:
            defined = global_styles | collect_defined_styles(btext)
            unknown = find_undefined_styles_in_block(btext, defined)
            if not unknown:
                continue
            total_issues += len(unknown)
            print(f"File: {qmd}")
            for token in sorted(unknown):
                pos_in_block = _first_usage_offset(btext, token)
                if pos_in_block is None:
                    print(f"  uses undefined style '{token}' (exact line not found)")
                    continue
                line_no = offset_to_line(line_index, bstart + pos_in_block)
                # Slice via the same "\n"-based index used for line numbers;
                # str.splitlines() splits on additional separators and could
                # return a different line than the one being reported.
                context_line = _line_text(text, line_index, line_no)
                print(f"  line {line_no}: uses undefined style '{token}'")
                print(f"    {context_line}")
            print()
    return total_issues


def main(argv: List[str]) -> int:
    """Run the linter and return a process exit status.

    Args:
        argv: command-line arguments; ``argv[1]``, when given, is the root
            directory to scan. Otherwise the root defaults to
            ``parents[3]`` of this file's resolved path.

    Returns:
        2 if the root does not exist, 1 if undefined style references were
        found, 0 otherwise.
    """
    if len(argv) > 1:
        root = Path(argv[1]).resolve()
    else:
        root = Path(__file__).resolve().parents[3]

    if not root.exists():
        print(f"Root path does not exist: {root}", file=sys.stderr)
        return 2

    issue_count = scan_quarto_root(root)
    if not issue_count:
        print("No undefined TikZ style references found.")
        return 0

    print(f"Found {issue_count} undefined style references.")
    return 1


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main(sys.argv))