mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-30 01:29:07 -05:00
231 lines
7.1 KiB
Python
231 lines
7.1 KiB
Python
#!/usr/bin/env python3
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Dict, Iterable, List, Optional, Set, Tuple
|
|
|
|
|
|
# Fenced Quarto code block tagged as TikZ: ```{.tikz} ... ``` (lazy, so each
# fence pair is matched separately).
TIKZ_BLOCK_PATTERN = re.compile(r"```\{\.tikz\}[\s\S]*?```", re.MULTILINE)

# Brace-delimited groups with a bounded nesting depth. The `re` module has no
# recursive patterns, so balanced braces are approximated by explicit nesting:
# _BRACES_1 is a group without inner braces, _BRACES_n may contain groups of
# depth n-1.
_BRACES_1 = r"\{[^{}]*\}"
_BRACES_2 = r"\{(?:[^{}]|" + _BRACES_1 + r")*\}"
_BRACES_3 = r"\{(?:[^{}]|" + _BRACES_2 + r")*\}"
_BRACES_4 = r"\{(?:[^{}]|" + _BRACES_3 + r")*\}"

# \tikzset{...}; group 1 is the full body up to the *matching* closing brace.
# Stopping at the first '}' would cut the body off after the first
# `Name/.style={...}` and hide every style defined after it. A body nested
# deeper than four levels is not matched at all.
TIKZSET_BLOCK_PATTERN = re.compile(
    r"\\tikzset\s*\{((?:[^{}]|" + _BRACES_4 + r")*)\}"
)

# `Name/.style` at the start of the body or after a comma/whitespace; group 2
# is the style name. The dot is escaped so that it matches the literal
# "/.style" key handler.
STYLE_DEF_PATTERN = re.compile(r"(^|[,\s])([A-Za-z][\w-]*)\s*/\.style\b")
|
|
|
|
# Commands that take option lists in square brackets.
# Each entry is a regex fragment (not a plain name): it is spliced verbatim
# after an escaped backslash when OPTION_CAPTURE_PATTERNS is built, so regex
# metacharacters must already be escaped here.
OPTIONED_COMMANDS = (
    "draw",
    "node",
    "path",
    "filldraw",
    "shade",
    "clip",
    "coordinate",
    # Raw string: "\}" is not a valid escape in a normal string literal.
    r"begin\{scope\}",  # \begin{scope}[...]
)

# One compiled pattern per command; group 1 is the text between '[' and the
# first ']' that follows the command.
OPTION_CAPTURE_PATTERNS = [
    re.compile(rf"\\{cmd}\s*\[([^\]]+)\]") for cmd in OPTIONED_COMMANDS
]
|
|
|
|
|
|
# Built-in TikZ option keywords. A standalone option equal to one of these is
# never treated as a custom style, which keeps false positives down.
KNOWN_TOKENS: Set[str] = {
    # line widths
    "ultra thin", "very thin", "thin", "semithick",
    "thick", "very thick", "ultra thick",
    # dash patterns
    "solid", "dashed", "densely dashed", "loosely dashed",
    "dotted", "densely dotted", "loosely dotted", "dashdotted",
    # common shapes
    "circle", "rectangle",
    # relative placement
    "left", "right", "above", "below",
    # arrow tips are normally written as -{...} and are filtered elsewhere
}
|
|
|
|
|
|
def read_text(path: Path) -> str:
    """Return the contents of *path* decoded as UTF-8.

    Byte sequences that are not valid UTF-8 are dropped instead of aborting
    the read. The encoding is always UTF-8, so the result does not depend on
    the platform's locale, and the file is read only once.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    return path.read_text(encoding="utf-8", errors="ignore")
|
|
|
|
|
|
def find_tikz_blocks(text: str) -> List[Tuple[int, int, str]]:
    """Return ``(start, end, matched_text)`` for every TIKZ_BLOCK_PATTERN match.

    ``start`` and ``end`` are character offsets into *text*; results are in
    the order the matches occur.
    """
    return [
        (*hit.span(), hit.group(0))
        for hit in TIKZ_BLOCK_PATTERN.finditer(text)
    ]
|
|
|
|
|
|
def collect_defined_styles(text: str) -> Set[str]:
    """Return the style names defined in the ``\\tikzset`` blocks of *text*.

    Each TIKZSET_BLOCK_PATTERN match contributes its body (group 1); every
    STYLE_DEF_PATTERN match inside that body contributes its name (group 2).
    """
    return {
        definition.group(2)
        for tikzset in TIKZSET_BLOCK_PATTERN.finditer(text)
        for definition in STYLE_DEF_PATTERN.finditer(tikzset.group(1))
    }
|
|
|
|
|
|
def extract_option_tokens(option_text: str) -> List[str]:
    """Split an option list on its top-level commas.

    Commas inside ``{...}`` or ``[...]`` do not split. Each piece is stripped
    of surrounding whitespace and empty pieces are discarded. Unbalanced
    closing delimiters are tolerated: nesting depth never drops below zero.
    """
    pieces: List[str] = []
    current: List[str] = []
    curly = 0
    square = 0

    def flush() -> None:
        # Emit the pending piece (if non-blank) and start a new one.
        piece = ''.join(current).strip()
        if piece:
            pieces.append(piece)
        current.clear()

    for char in option_text:
        if char == ',' and not (curly or square):
            flush()
            continue
        current.append(char)
        if char == '{':
            curly += 1
        elif char == '}':
            curly = max(curly - 1, 0)
        elif char == '[':
            square += 1
        elif char == ']':
            square = max(square - 1, 0)

    flush()
    return pieces
|
|
|
|
|
|
def looks_like_style_token(token: str) -> bool:
    """Return True if *token* could be the name of a custom TikZ style.

    A candidate must start with an uppercase letter and consist only of word
    characters and hyphens. key=value options, arrow specifications, and the
    built-in keywords in KNOWN_TOKENS are rejected.
    """
    # key=value options are settings, not style names.
    if '=' in token:
        return False
    # shorten <= / shorten >= adjustments.
    if any(phrase in token for phrase in ('shorten <=', 'shorten >=')):
        return False
    # Arrow tips and path operators such as "->", "-latex", "Stealth-".
    if token[:1] == '-' or token[-1:] == '-' or '->' in token:
        return False
    if token in KNOWN_TOKENS:
        return False
    # Empty tokens and anything not starting uppercase (including lowercase
    # multi-word keywords) are taken to be built-ins.
    if not token[:1].isupper():
        return False
    return re.match(r"^[A-Za-z][\w-]*$", token) is not None
|
|
|
|
|
|
def find_undefined_styles_in_block(block_text: str, defined_styles: Set[str]) -> Set[str]:
    """Return style-like option tokens in *block_text* missing from *defined_styles*.

    Option lists are taken from every OPTION_CAPTURE_PATTERNS match, split
    with extract_option_tokens, and filtered with looks_like_style_token.
    """
    candidates = (
        raw.strip()
        for pattern in OPTION_CAPTURE_PATTERNS
        for hit in pattern.finditer(block_text)
        for raw in extract_option_tokens(hit.group(1))
    )
    return {
        name
        for name in candidates
        if looks_like_style_token(name) and name not in defined_styles
    }
|
|
|
|
|
|
def build_line_index(text: str) -> List[int]:
    """Return the character offset at which each line of *text* starts.

    Line 1 always starts at offset 0; each further line starts right after a
    ``"\\n"``. Only ``"\\n"`` counts as a line break.
    """
    return [0] + [pos + 1 for pos, char in enumerate(text) if char == "\n"]
|
|
|
|
|
|
def offset_to_line(line_starts: List[int], offset: int) -> int:
    """Return the 1-based line number that contains character *offset*.

    *line_starts* must be sorted ascending (as produced by build_line_index).
    Offsets before the first line start, and an empty *line_starts*, map to
    line 1.
    """
    # Binary search over the half-open range [low, high): on exit, low is the
    # number of line starts that are <= offset.
    low, high = 0, len(line_starts)
    while low < high:
        middle = (low + high) // 2
        if line_starts[middle] <= offset:
            low = middle + 1
        else:
            high = middle
    return max(low, 1)
|
|
|
|
|
|
def scan_quarto_root(root: Path) -> int:
    """Report undefined TikZ styles in every ``.qmd`` file under ``root/quarto``.

    Styles defined in ``root/quarto/tex/header-includes.tex`` (if that file
    exists) count as defined everywhere; styles defined inside a TikZ block
    count only for that block. One report per offending block is printed to
    stdout.

    Args:
        root: Repository root that contains the ``quarto`` directory.

    Returns:
        The number of undefined style references, i.e. the sum over all
        blocks of the distinct undefined style names in that block.
    """
    # Collect global styles from header-includes.tex if present
    global_styles: Set[str] = set()
    header_includes = root / "quarto" / "tex" / "header-includes.tex"
    if header_includes.exists():
        global_styles |= collect_defined_styles(read_text(header_includes))

    total_issues = 0
    # Sorted so the report order is reproducible across runs and platforms.
    for qmd in sorted((root / "quarto").rglob("*.qmd")):
        text = read_text(qmd)
        blocks = find_tikz_blocks(text)
        if not blocks:
            continue
        line_index = build_line_index(text)
        # Split once per file, and on "\n" only: that is the line break
        # build_line_index counts, so line numbers and context lines always
        # agree (str.splitlines() also breaks on \f, \v, U+2028, ...).
        lines = text.split("\n")
        for bstart, _bend, btext in blocks:
            defined = global_styles | collect_defined_styles(btext)
            unknown = find_undefined_styles_in_block(btext, defined)
            if not unknown:
                continue
            total_issues += len(unknown)
            print(f"File: {qmd}")
            for token in sorted(unknown):
                # Locate the first option list that uses the token as a whole
                # name; the lookarounds stop "Box" from matching "BoxLarge".
                usage_pattern = (
                    r"\\(?:draw|node|path|filldraw|shade|clip|coordinate|begin\{scope\})\s*\["
                    + r"[^\]]*?(?<![\w-])(" + re.escape(token) + r")(?![\w-])[^\]]*\]"
                )
                m = re.search(usage_pattern, btext)
                if m:
                    # Report the line holding the token itself, which may be
                    # below the command in a multi-line option list.
                    line_no = offset_to_line(line_index, bstart + m.start(1))
                    context_line = lines[line_no - 1].rstrip()
                    print(f" line {line_no}: uses undefined style '{token}'")
                    print(f" {context_line}")
                else:
                    print(f" uses undefined style '{token}' (exact line not found)")
            print()
    return total_issues
|
|
|
|
|
|
def main(argv: List[str]) -> int:
    """Run the scan and return a process exit code.

    ``argv[1]``, when given, is the root to scan; otherwise the root is the
    fourth parent of this file's resolved path.

    Returns:
        0 if no undefined styles were found, 1 if some were found, 2 if the
        root path does not exist.
    """
    if len(argv) > 1:
        root = Path(argv[1]).resolve()
    else:
        root = Path(__file__).resolve().parents[3]

    if not root.exists():
        print(f"Root path does not exist: {root}", file=sys.stderr)
        return 2

    issue_count = scan_quarto_root(root)
    if not issue_count:
        print("No undefined TikZ style references found.")
        return 0

    print(f"Found {issue_count} undefined style references.")
    return 1
|
|
|
|
|
|
# Script entry point: the return value of main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main(sys.argv))
|