feat(tools): add comprehensive spell checking for TikZ diagrams and prose

Add two complementary spell checking tools for content validation:

- check_tikz_spelling.py: Extracts and validates all visible text from
  TikZ diagrams including node labels, inline annotations, custom pics,
  foreach loops, legends, and comments. Uses pattern-based matching for
  common typos with optional aspell integration.

- check_prose_spelling.py: Intelligently parses QMD structure to check
  only actual prose content while excluding YAML frontmatter, code blocks,
  TikZ diagrams, inline code, math expressions, and URLs. Uses aspell with
  comprehensive ignore list of 500+ technical terms and acronyms.

Both tools provide detailed output with file paths, line numbers, and
context for identified spelling errors. The TikZ checker surfaced typos
such as 'gatewey', 'poihnts', and 'Intellignet' across the codebase,
which have since been corrected.
This commit is contained in:
Vijay Janapa Reddi
2025-11-03 11:01:04 -05:00
parent 36634cf1d3
commit 2c730dda36
2 changed files with 991 additions and 0 deletions

View File

@@ -0,0 +1,497 @@
#!/usr/bin/env python3
r"""
Spell check prose content in QMD files using aspell.
Intelligently parses QMD file structure to only check actual prose text,
excluding YAML frontmatter, code blocks, TikZ diagrams, inline code, URLs, etc.
Usage:
python3 tools/scripts/content/check_prose_spelling.py [directory]
Requirements:
- aspell must be installed (brew install aspell)
- No Python dependencies beyond standard library
Checks:
- Paragraph text
- Headings
- List items
- Callout content
Ignores:
- YAML frontmatter
- Code blocks (```...```)
- Inline code (`...`)
- TikZ diagrams
- URLs and links
- LaTeX math ($...$, $$...$$)
- Special Quarto syntax
"""
import re
import sys
import subprocess
from pathlib import Path
from typing import List, Tuple, Set
def extract_yaml_frontmatter(content: str) -> Tuple[int, int]:
    """Locate the YAML frontmatter block at the top of a QMD file.

    Returns:
        (start_pos, end_pos) character offsets covering the frontmatter,
        including both ``---`` fence lines, or (0, 0) when the content
        does not begin with a frontmatter block or it is never closed.
    """
    if not content.startswith('---'):
        return (0, 0)
    all_lines = content.split('\n')
    # The opening fence is line 0; scan for the closing fence.
    for idx in range(1, len(all_lines)):
        if all_lines[idx].strip() == '---':
            # Convert the line index back into a character offset
            # (+1 per line accounts for the newline separator).
            end_offset = sum(len(all_lines[k]) + 1 for k in range(idx + 1))
            return (0, end_offset)
    return (0, 0)
def extract_code_blocks(content: str) -> List[Tuple[int, int]]:
    """Locate fenced code blocks and bare TikZ environments.

    Returns:
        List of (start_pos, end_pos) character spans to exclude.
    """
    spans: List[Tuple[int, int]] = []
    # Fenced ``` blocks first, then TikZ environments that may sit
    # outside a fence; both may span multiple lines, hence DOTALL.
    for pattern in (r'```.*?```',
                    r'\\begin\{tikzpicture\}.*?\\end\{tikzpicture\}'):
        spans.extend(m.span() for m in re.finditer(pattern, content, re.DOTALL))
    return spans
def extract_inline_code(content: str) -> List[Tuple[int, int]]:
    """Locate inline code spans (`...`).

    Returns:
        List of (start_pos, end_pos) character spans.
    """
    # Non-greedy match between a pair of backticks, no nesting.
    return [m.span() for m in re.finditer(r'`[^`]+?`', content)]
def extract_math_blocks(content: str) -> List[Tuple[int, int]]:
    """Locate LaTeX math spans: display ($$...$$) and inline ($...$).

    Returns:
        List of (start_pos, end_pos) character spans, display math first.
    """
    spans: List[Tuple[int, int]] = []
    # Display math may span lines, hence DOTALL.
    spans.extend(m.span()
                 for m in re.finditer(r'\$\$.*?\$\$', content, re.DOTALL))
    # Inline math: a single $...$ pair; the lookarounds stop the
    # delimiters of display math from being mistaken for inline ones.
    spans.extend(m.span()
                 for m in re.finditer(r'(?<!\$)\$(?!\$)[^\$]+?\$(?!\$)', content))
    return spans
def extract_links_and_urls(content: str) -> List[Tuple[int, int]]:
"""
Find all markdown links and URLs.
Returns:
List of (start_pos, end_pos) tuples
"""
spans = []
# Markdown links [text](url)
pattern = r'\[([^\]]+)\]\([^\)]+\)'
for match in re.finditer(pattern, content):
# Only exclude the URL part, keep the link text
url_start = match.group(0).find('](') + match.start() + 1
url_end = match.end() - 1
spans.append((url_start, url_end))
# Reference-style links [@ref], {#id}, @sec-name
pattern = r'(\[@[^\]]+\]|\{#[^\}]+\}|@[a-z]+-[a-z0-9-]+)'
for match in re.finditer(pattern, content):
spans.append((match.start(), match.end()))
# Plain URLs
pattern = r'https?://[^\s\)>]+'
for match in re.finditer(pattern, content):
spans.append((match.start(), match.end()))
return spans
def extract_quarto_syntax(content: str) -> List[Tuple[int, int]]:
    """Locate Quarto-specific markup (div fences and shortcodes).

    Returns:
        List of (start_pos, end_pos) character spans.
    """
    # Div openers like ::: {.callout-note}
    spans = [m.span() for m in re.finditer(r':::\s*\{[^\}]+\}', content)]
    # Shortcodes such as {{< include file >}}; may span lines.
    spans.extend(m.span()
                 for m in re.finditer(r'\{\{<.*?>\}\}', content, re.DOTALL))
    return spans
def should_exclude_position(pos: int, exclude_ranges: List[Tuple[int, int]]) -> bool:
    """Return True when *pos* lies inside any half-open [start, end) range."""
    return any(lo <= pos < hi for lo, hi in exclude_ranges)
def extract_prose_text(content: str) -> List[Tuple[str, int]]:
    """Extract only prose text from QMD content.

    Builds the set of character ranges to exclude (frontmatter, code,
    math, links, Quarto syntax), merges overlapping ranges, then walks
    every line character by character keeping only characters outside
    all excluded ranges.

    Returns:
        List of (text, line_number) tuples, one per contiguous prose run.
    """
    # Build exclude ranges from the specialised extractors.
    exclude_ranges = []
    yaml_start, yaml_end = extract_yaml_frontmatter(content)
    if yaml_end > 0:
        exclude_ranges.append((yaml_start, yaml_end))
    exclude_ranges.extend(extract_code_blocks(content))
    exclude_ranges.extend(extract_inline_code(content))
    exclude_ranges.extend(extract_math_blocks(content))
    exclude_ranges.extend(extract_links_and_urls(content))
    exclude_ranges.extend(extract_quarto_syntax(content))
    # Sort and merge overlapping ranges so every position is tested
    # against a minimal, non-overlapping list.
    exclude_ranges.sort()
    merged = []
    for start, end in exclude_ranges:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    # Walk each line and collect the non-excluded character runs.
    # BUG FIX: the previous version skipped the ENTIRE line whenever its
    # first character fell inside an excluded range, silently dropping
    # prose on lines that merely *start* with inline code, a link, or
    # math. We now always scan per character and let the exclusion test
    # decide for each position.
    prose_segments = []
    lines = content.split('\n')
    pos = 0
    for line_num, line in enumerate(lines, 1):
        line_start = pos
        prose_text = ""
        for i, char in enumerate(line):
            if not should_exclude_position(line_start + i, merged):
                prose_text += char
            else:
                # An excluded character terminates the current prose run.
                if prose_text.strip():
                    prose_segments.append((prose_text.strip(), line_num))
                prose_text = ""
        if prose_text.strip():
            prose_segments.append((prose_text.strip(), line_num))
        pos = line_start + len(line) + 1  # +1 for the newline
    return prose_segments
def clean_prose_text(text: str) -> str:
    """Strip markdown formatting from *text*, keeping the words.

    Args:
        text: Raw prose text possibly containing markdown markup.

    Returns:
        Plain text suitable for feeding to the spell checker.
    """
    # Unwrap emphasis markers, keeping the inner text (bold before
    # italic so ** pairs are not half-consumed by the single-* rule).
    for wrapper in (r'\*\*([^\*]+)\*\*',   # bold
                    r'\*([^\*]+)\*',       # italic (asterisks)
                    r'_([^_]+)_',          # italic (underscores)
                    r'~~([^~]+)~~'):       # strikethrough
        text = re.sub(wrapper, r'\1', text)
    # Drop any leftover markdown symbols.
    text = re.sub(r'[#\*_~]', '', text)
    # Replace remaining punctuation with spaces, but keep hyphens and
    # apostrophes so contractions and compounds survive.
    text = re.sub(r'[^\w\s\'-]', ' ', text)
    return text.strip()
def check_with_aspell(text: str, ignore_terms: Set[str]) -> List[str]:
    """Run *text* through ``aspell list`` and filter the ignore list.

    Returns:
        Misspelled words whose lowercase form is not in *ignore_terms*;
        an empty list when aspell fails or is unavailable.
    """
    try:
        proc = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception as e:
        print(f"Error running aspell: {e}", file=sys.stderr)
        return []
    if proc.returncode != 0:
        return []
    # aspell prints one unknown word per line.
    flagged = (w for w in proc.stdout.strip().split('\n') if w)
    return [w for w in flagged if w.lower() not in ignore_terms]
# Hoisted to module scope so the (large) ignore set is constructed once,
# instead of being rebuilt for every file checked.
_IGNORE_TERMS = {
    # File formats and common abbreviations
    'qmd', 'yml', 'json', 'png', 'jpg', 'svg', 'pdf',
    'tikz', 'quarto', 'pandoc', 'latex', 'tensorflow', 'pytorch',
    'gpu', 'cpu', 'tpu', 'ram', 'api', 'ui', 'ux', 'cli', 'sdk',
    'yaml', 'toml', 'html', 'css', 'javascript', 'typescript',
    'numpy', 'pandas', 'matplotlib', 'jupyter', 'colab',
    'github', 'gitlab', 'bitbucket',
    'ai', 'ml', 'dl', 'cv', 'nlp', 'iot', 'rl', 'gan',
    'lstm', 'gru', 'rnn', 'cnn', 'vgg', 'resnet', 'bert',
    # ML systems and techniques
    'tinyml', 'microcontroller', 'microcontrollers', 'preprocessing',
    'convolutional', 'latencies', 'dns', 'dennard', 'triadic',
    'benchmarking', 'gdpr', 'hipaa', 'backpropagation', 'quantized',
    'autoregressive', 'overfitting', 'checkpointing', 'hyperparameters',
    'embeddings', 'spectrograms', 'mfcc', 'kws', 'activations',
    'mnist', 'feedforward', 'softmax', 'relu', 'sigmoid', 'thresholding',
    'postprocessing', 'suboptimal', 'multilayer', 'perceptrons',
    'cnns', 'rnns', 'mlps', 'dnn', 'translational', 'invariance',
    'parallelizable', 'uat', 'discriminative', 'fpgas', 'asics',
    'topologies', 'reconceptualization', 'orchestrators', 'bfloat',
    # Product and project names
    'plantvillage', 'nuru', 'farmbeats', 'respira', 'colabs', 'edgeml',
    'mlperf', 'linpack', 'specpowerssj', 'datahub', 'kubeflow',
    'mobilenets', 'efficientnets', 'gpt', 'palm',
    # Company and organization names
    'mckinsey', 'espressif', 'hortonworks', 'linkedin', 'uber', 'cloudtrail',
    # Acronyms and abbreviations
    'cmd', 'cbsd', 'mw', 'sram', 'sox', 'sdg', 'sdgs', 'agi', 'tco',
    'gpus', 'mlops', 'gigaflops', 'eniac', 'cpus', 'tpus', 'fp', 'nist',
    # Legitimate English words often flagged
    'underserved', 'sociotechnical', 'ebola', 'forecasted', 'unmonitored',
    'transformative', 'microclimates', 'microclimate', 'responders',
    'scalable', 'aspirational', 'lifecycle', 'lifecycles',
    'representativeness', 'reproducibility', 'milliwatt', 'milliwatts',
    'decomposable', 'interpretability', 'modularity', 'architecting',
    'instantiations', 'crowdsourcing', 'crowdsourced', 'interdependencies',
    'degradations', 'natively', 'detections', 'observability', 'exfiltration',
    'auditable', 'cryptographic', 'curation', 'engineerable', 'subfield',
    'misrouted', 'tradeoff', 'tradeoffs', 'pre',
    # People names (for attributions)
    'vijay', 'janapa', 'reddi', 'yann', 'lecun', 'corinna', 'burges',
    'cybenko', 'hornik', 'augereau',
    # Image filename patterns (without extensions)
    'covermlsystems', 'coveraigood', 'coveraibenchmarking',
    'coverconclusion', 'coverdataengineering', 'covernnprimer',
    'coverdlarch',
    # LaTeX commands
    'noindent',
    # AI tools
    'dall', 'dalle',
    # Short codes/patterns
    'fn',
    # Additional comprehensive technical terms (auto-generated from book content)
    'accelerometers', 'acm', 'adamw', 'additionality', 'adreno', 'aes', 'agentic', 'aiops',
    'airbnb', 'aitraining', 'akida', 'al', 'alexa', 'alexnet', 'algorithmically', 'alphafold',
    'ambri', 'amodei', 'anonymization', 'anonymized', 'anthropic', 'asilomar', 'auditability',
    'autocorrect', 'autocorrection', 'autocorrections', 'automatable', 'automl', 'avr', 'axonal',
    'backdoored', 'backdoors', 'backend', 'backends', 'balancers', 'batchsize', 'bibliometric',
    'binarization', 'biometric', 'bist', 'blas', 'bostrom', 'bottlenecked', 'brominated', 'carlini',
    'cfe', 'channelwise', 'chatbot', 'chatbots', 'chatgpt', 'checkmark', 'chiplet', 'chiplets',
    'clinaiops', 'cloudlets', 'cmsis', 'codecarbon', 'compas', 'conda', 'contestability', 'coprocessor',
    'coprocessors', 'coveraihardware', 'coveraiworkflow', 'coverefficientai', 'coverfrontiers',
    'coverintroduction', 'covermlframeworks', 'covermlops', 'covermodeloptimizations',
    'coverondevicelearning', 'coverresponsibleai', 'coverrobustai', 'coversecurityprivacy',
    'coversustainableai', 'cublas', 'cuda', 'customizations', 'cybersecurity', 'cyberweapon',
    'de', 'debois', 'debuggable', 'deepsparse', 'deepspeed', 'devops', 'distilbert', 'dma', 'dp',
    'dsp', 'dsps', 'dvfs', 'dwork', 'dx', 'eacs', 'electrodermal', 'electromechanical',
    'epistemologically', 'esg', 'esrs', 'et', 'ethnicities', 'ets', 'ewc', 'exaflops',
    'explainability', 'explanations', 'expressivity', 'externality', 'facto', 'failover', 'fairlearn',
    'fairscale', 'fe', 'fedavgm', 'fedprox', 'fi', 'flops', 'forrester', 'fpu', 'frac', 'freertos',
    'fx', 'gapped', 'gboard', 'gemm', 'gflops', 'giga', 'goertzel', 'gradcam', 'greenwashing',
    'groupwise', 'handlin', 'hbm', 'hd', 'hdfs', 'hitl', 'homomorphic', 'hsms', 'huggingface',
    'hwacc', 'hyperscale', 'iid', 'imagenet', 'imbalancing', 'incentivized', 'incentivizing',
    'instantiation', 'intentioned', 'interdependency', 'intra', 'jax', 'jenkins', 'jpeg', 'kaggle',
    'kanies', 'kawaguchi', 'kdd', 'keras', 'kinetis', 'kleinberg', 'kohsuke', 'kolmogorov', 'krum',
    'kryo', 'kubernetes', 'lapack', 'lca', 'leaderboards', 'lidar', 'llms', 'ln', 'loihi', 'lora',
    'lpddr', 'mah', 'maml', 'mance', 'mapa', 'mbed', 'mbps', 'mcus', 'medskip', 'metux', 'metuxs',
    'micronpu', 'microservices', 'microsystems', 'millijoules', 'misalignments', 'misclassification',
    'misclassifies', 'misclassify', 'misconfigured', 'mitigations', 'mj', 'mlcommons', 'mlflow',
    'mlir', 'mlp', 'mobilenetv', 'modelscaling', 'moores', 'msqe', 'multimodal', 'multiphase',
    'mwh', 'nas', 'natanz', 'nbsp', 'netron', 'neurosymbolic', 'ngo', 'nm', 'nn', 'npu', 'npus',
    'npv', 'nsight', 'numenta', 'numerics', 'nvlink', 'nwp', 'nxp', 'oecd', 'onnx', 'ons', 'openai',
    'opencl', 'openvino', 'openwebtext', 'operationalization', 'operationalize', 'operationalizing',
    'optum', 'ota', 'overcorrecting', 'overfit', 'overreliance', 'parallelizes', 'pcie', 'perceptron',
    'performant', 'personalization', 'pes', 'picojoules', 'pipelining', 'pj', 'plcs', 'ppv',
    'prefetched', 'prefetching', 'pretrained', 'programmability', 'proliferative', 'proprioception',
    'propublica', 'ptq', 'pufs', 'pypi', 'qat', 'qos', 'quadratically', 'quant', 'rbac', 'recalibrate',
    'recalibrating', 'recommender', 'reconceptualizes', 'recyclability', 'reframing', 'reimagined',
    'reimagining', 'reimplement', 'reimplementing', 'renewables', 'repairability', 'rescoring',
    'reskilling', 'retinopathy', 'reusability', 'ridesharing', 'rlhf', 'roadmap', 'rollout', 'rollouts',
    'rss', 'runtimes', 'sagemaker', 'sanitization', 'scipy', 'scopus', 'sdt', 'sgd', 'shader', 'shaders',
    'shap', 'shapley', 'simd', 'siri', 'situationally', 'slas', 'smi', 'smirnov', 'snns', 'snpe',
    'soc', 'socs', 'sparseml', 'sparsification', 'spinoff', 'sprase', 'spss', 'stationarity', 'stm',
    'stuxnet', 'swappable', 'synergistically', 'tcp', 'tdsp', 'tees', 'tensorboard', 'tensorrt',
    'tera', 'ternarization', 'tflite', 'tfx', 'thresholded', 'timm', 'titration', 'tls', 'tokenization',
    'toolchains', 'torchscript', 'torchserve', 'tpr', 'tpuv', 'tradeable', 'trojan', 'truenorth',
    'tvm', 'ultrapure', 'unbundled', 'underutilization', 'unexplainable', 'unimodal', 'unoptimized',
    'untrusted', 'upgradable', 'upgradeable', 'upskilling', 'uptime', 'usb', 'utensor', 'utopian',
    'vectornet', 'virusbokbok', 'vitis', 'von', 'vr', 'vtune', 'vulkan', 'waymo', 'wearables',
    'wellbeing', 'wi', 'xla', 'zero',
}


def check_file(filepath: Path) -> List[dict]:
    """Check a single QMD file for spelling errors.

    PERF: the ignore set used to be rebuilt inside this function on
    every call (once per file); it now lives in module-level
    ``_IGNORE_TERMS`` and is built once at import time.

    Returns:
        List of error dicts with keys 'file' (absolute Path), 'line',
        'text' (truncated to 100 chars), and 'misspelled'.
    """
    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return []
    prose_segments = extract_prose_text(content)
    errors = []
    for text, line_num in prose_segments:
        cleaned = clean_prose_text(text)
        if not cleaned:
            continue
        misspelled = check_with_aspell(cleaned, _IGNORE_TERMS)
        if misspelled:
            errors.append({
                'file': filepath.resolve(),  # Store absolute path
                'line': line_num,
                'text': text[:100] + ('...' if len(text) > 100 else ''),
                'misspelled': misspelled
            })
    return errors
def main():
    """Entry point: spell check all QMD prose and print a report.

    Returns:
        Process exit code: 0 when clean, 1 when errors were found or
        setup failed (missing aspell, missing target directory).
    """
    # Check if aspell is available
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("Error: aspell not found. Install it with: brew install aspell", file=sys.stderr)
        return 1
    # Get directory to check. parents[3] assumes this script lives three
    # levels below the repo root (tools/scripts/content/) — a CLI
    # argument overrides the default search directory.
    repo_root = Path(__file__).resolve().parents[3]
    if len(sys.argv) > 1:
        target_dir = Path(sys.argv[1])
    else:
        target_dir = repo_root / 'quarto' / 'contents' / 'core'
    if not target_dir.exists():
        print(f"Error: Directory not found: {target_dir}", file=sys.stderr)
        return 1
    # Find all QMD files (recursive glob)
    qmd_files = list(target_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for prose spelling errors...\n")
    all_errors = []
    files_with_errors = 0
    for qmd_file in sorted(qmd_files):
        errors = check_file(qmd_file)
        if errors:
            files_with_errors += 1
            all_errors.extend(errors)
    # Print results, grouped by file and ordered by (file, line)
    if all_errors:
        print(f"Found {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")
        current_file = None
        for error in sorted(all_errors, key=lambda e: (str(e['file']), e['line'])):
            if error['file'] != current_file:
                current_file = error['file']
                try:
                    # Prefer a repo-relative path for readability; fall
                    # back to the absolute path for files outside it
                    # (possible when a directory argument was given).
                    rel_path = error['file'].relative_to(repo_root)
                except ValueError:
                    rel_path = error['file']
                print(f"\n{rel_path}")
                print("=" * len(str(rel_path)))
            print(f" Line {error['line']}: {error['text']}")
            print(f" → Misspelled: {', '.join(error['misspelled'])}")
        print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
        return 1
    else:
        print("✓ No spelling errors found in prose text!")
        return 0


if __name__ == '__main__':
    sys.exit(main())

View File

@@ -0,0 +1,494 @@
#!/usr/bin/env python3
r"""
Spell check text content within TikZ diagrams.
Extracts and validates all visible text from TikZ diagrams in .qmd files,
including node labels, inline annotations, formatted text, and comments.
Usage:
python3 tools/scripts/content/check_tikz_spelling.py
Checks text in:
- Node commands: \node{text}, node{text} in \draw/\path/\fill
- Formatted text: \textbf{}, \textit{}, \emph{}, etc.
- Drawing annotations: \draw--node{label}--
- Custom pics: pics/name/, \pic{name}
- Foreach loops: /{Text}/ patterns
- Labels: label={text}, pin={text}
- Legends: \legend{Item 1, Item 2}
- Comments: % text
Optional: Install aspell for comprehensive dictionary checking
macOS: brew install aspell
Ubuntu: sudo apt-get install aspell
"""
import re
import sys
from pathlib import Path
from typing import List, Tuple, Set
import subprocess
def extract_tikz_blocks(content: str, filepath: str) -> List[Tuple[str, int]]:
    """Collect TikZ environments together with their starting line numbers.

    Args:
        content: Full text of the file.
        filepath: Path of the file (unused; kept for interface stability).

    Returns:
        List of (tikz_source, start_line) tuples; lines are 1-indexed.
    """
    found: List[Tuple[str, int]] = []
    buffer: List[str] = []
    begin_line = 0
    inside = False
    for number, text in enumerate(content.split('\n'), 1):
        if r'\begin{tikzpicture}' in text:
            # A new environment opens; any unclosed one is discarded.
            inside = True
            begin_line = number
            buffer = [text]
        elif inside and r'\end{tikzpicture}' in text:
            buffer.append(text)
            found.append(('\n'.join(buffer), begin_line))
            inside = False
            buffer = []
        elif inside:
            buffer.append(text)
    return found
def clean_latex_text(text: str) -> str:
    """Strip LaTeX markup from *text*, leaving human-readable words.

    Args:
        text: Raw text lifted from LaTeX/TikZ source.

    Returns:
        The text with formatting commands removed and whitespace
        normalised.
    """
    # Line breaks become spaces before anything else.
    text = text.replace('\\\\', ' ')
    # Size commands that prefix text, e.g. {\huge ?}.
    text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\s+', ' ', text)
    # Font selection commands.
    text = re.sub(r'\\usefont\{[^}]*\}\{[^}]*\}\{[^}]*\}\{[^}]*\}', ' ', text)
    text = re.sub(r'\\fontsize\{[^}]*\}\{[^}]*\}\\selectfont', ' ', text)
    text = re.sub(r'\\bfseries\s*', ' ', text)
    # Unwrap formatting commands, repeating to peel up to 3 levels
    # of nesting.
    unwrap = (
        (r'\\textbf\{([^}]+)\}', r'\1'),
        (r'\\textit\{([^}]+)\}', r'\1'),
        (r'\\emph\{([^}]+)\}', r'\1'),
        (r'\\text\{([^}]+)\}', r'\1'),
        (r'\\mathbf\{([^}]+)\}', r'\1'),
        (r'\\mathrm\{([^}]+)\}', r'\1'),
        (r'\\textsubscript\{([^}]+)\}', r'_\1'),
        (r'\\textsuperscript\{([^}]+)\}', r'^\1'),
        (r'\\textcolor\{[^}]*\}\{([^}]+)\}', r'\1'),
    )
    for _ in range(3):
        for pattern, repl in unwrap:
            text = re.sub(pattern, repl, text)
    # Drop math-mode dollar signs.
    text = text.replace('$', '')
    # Remove any remaining LaTeX command, keeping the text after it.
    text = re.sub(r'\\[a-zA-Z]+\s*', ' ', text)
    # Normalise whitespace.
    return ' '.join(text.split()).strip()
def extract_all_curly_brace_text(tikz_content: str) -> List[Tuple[str, str, int]]:
    """Pull out every brace-delimited string that could render as text.

    Covers standalone \\node commands, inline ``node{...}`` inside
    \\draw/\\fill/\\path, text formatting commands, label/pin options,
    and \\legend entries.

    Returns:
        List of (text, context_description, char_position) tuples,
        grouped by pattern in the order listed below.
    """
    patterns = [
        # \node[opts](name) at (x,y) {text}
        (r'\\node\s*(?:\[[^\]]*\])?\s*(?:\([^)]*\))?\s*(?:at\s*\([^)]*\))?\s*\{([^}]+)\}',
         '\\node{...}'),
        # node{text} inside a path command; the lookbehind rejects \node
        (r'(?<!\\)node\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
         'node{...} in draw/path/fill'),
        # Text formatting commands
        (r'\\textbf\{([^}]+)\}', '\\textbf{...}'),
        (r'\\textit\{([^}]+)\}', '\\textit{...}'),
        (r'\\emph\{([^}]+)\}', '\\emph{...}'),
        (r'\\text\{([^}]+)\}', '\\text{...}'),
        # label={text}, pin={text}, axis labels
        (r'(?:label|pin|xlabel|ylabel)\s*=\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
         'label={...}'),
        # \legend{Item 1, Item 2}
        (r'\\legend\s*\{([^}]+)\}', '\\legend{...}'),
    ]
    texts: List[Tuple[str, str, int]] = []
    for pattern, context in patterns:
        for match in re.finditer(pattern, tikz_content):
            texts.append((match.group(1), context, match.start()))
    return texts
def extract_text_from_foreach(tikz_content: str) -> List[Tuple[str, str]]:
    r"""Extract label text from \foreach loops.

    Looks for value lists of the shape
    ``\foreach \i/\j in {a/{Text 1}/b, c/{Text 2}/d}`` and pulls out
    the brace-wrapped ``/{Text}/`` entries.

    Returns:
        List of (cleaned_text, context) tuples.
    """
    results: List[Tuple[str, str]] = []
    for loop in re.finditer(r'\\foreach[^{]+in\s*\{([^}]+)\}',
                            tikz_content, re.DOTALL):
        body = loop.group(1)
        for raw in re.findall(r'/\{([^}]+)\}/', body):
            label = clean_latex_text(raw)
            # Keep only non-trivial labels that are not bare numbers
            # or coordinate lists.
            if (label and len(label) > 2
                    and not re.match(r'^[\d\s\.,\-\+]+$', label)):
                results.append((label, f'\\foreach loop: /{{{raw}}}/'))
    return results
def extract_text_from_tikz(tikz_content: str) -> List[Tuple[str, str]]:
    """Extract ALL human-readable text from TikZ code.

    Aggregates candidates from brace contents, \\foreach loops, custom
    pic names/usages, comments, and \\def macro names, filtering out
    numeric/colour tokens and de-duplicating case-insensitively.

    Returns:
        List of (text, context) tuples where context shows where the text was found
    """
    texts = []
    seen_texts = set()  # Avoid duplicates: holds (lowercased text, source key)
    # Extract all text from curly braces
    for raw_text, context, pos in extract_all_curly_brace_text(tikz_content):
        # Clean the text
        cleaned = clean_latex_text(raw_text)
        # Skip if it's just numbers, coordinates, colors, or TikZ commands
        if not cleaned:
            continue
        if re.match(r'^[\d\s\.,\-\+\*\/\(\)_\^]+$', cleaned):  # Just numbers/math/subscripts
            continue
        # NOTE(review): this colour filter also skips ANY single
        # all-lowercase word (e.g. "gateway"), not just colour specs —
        # confirm that is intended.
        if re.match(r'^[a-z]+!?\d*$', cleaned):  # Colors like "red", "blue!50"
            continue
        if len(cleaned) < 2:  # Too short to be meaningful text
            continue
        # Avoid duplicates
        key = (cleaned.lower(), context)
        if key not in seen_texts:
            seen_texts.add(key)
            texts.append((cleaned, f'{context}: "{raw_text}"'))
    # Extract text from \foreach loops
    for text, context in extract_text_from_foreach(tikz_content):
        key = (text.lower(), 'foreach')
        if key not in seen_texts:
            seen_texts.add(key)
            texts.append((text, context))
    # Extract text from pic names (custom TikZ pictures)
    pic_name_pattern = r'pics/([a-zA-Z_]+)/'
    for match in re.finditer(pic_name_pattern, tikz_content):
        name = match.group(1)
        if len(name) > 2:
            key = (name.lower(), 'pics')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((name, f'pics/{name}/'))
    # Extract text from pic usage
    pic_usage_pattern = r'\\pic\s*(?:\[[^\]]*\])?\s*(?:at\s*\([^)]*\))?\s*\{([^}]+)\}'
    for match in re.finditer(pic_usage_pattern, tikz_content):
        name = match.group(1)
        if len(name) > 2 and not re.match(r'^[\d\s]+$', name):
            key = (name.lower(), 'pic_usage')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((name, f'\\pic{{...}}{{{name}}}'))
    # Extract comments (often contain descriptive text)
    comment_pattern = r'%\s*(.+?)(?:\n|$)'
    for match in re.finditer(comment_pattern, tikz_content):
        comment = match.group(1).strip()
        # Skip comments that are just separators or structure
        if comment and not re.match(r'^[\-\=\*\s]+$', comment):
            key = (comment.lower(), 'comment')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((comment, f'% {comment}'))
    # Extract variable names from \def that might be words
    def_pattern = r'\\def\\([a-zA-Z]+)\{'
    for match in re.finditer(def_pattern, tikz_content):
        name = match.group(1)
        # Only check if it looks like a word (not all caps, reasonable length)
        # NOTE(review): the startswith('r') filter presumably skips
        # r-prefixed radius/length macros — confirm that is the intent,
        # since it also skips any word beginning with 'r'.
        if len(name) > 3 and not name.isupper() and not name.startswith('r'):
            key = (name.lower(), 'def')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((name, f'\\def\\{name}'))
    return texts
# Hoisted to module scope: previously this set was rebuilt on every
# call, i.e. once per extracted text snippet.
_TIKZ_IGNORE_TERMS = {
    # TikZ pic parameters
    'scalefac', 'picname', 'filllcolor', 'drawcolor', 'linewidth',
    'filllcirclecolor', 'drawcircle', 'bodycolor', 'tiecolor', 'stetcolor',
    'drawchannelcolor', 'channelcolor',
    # Color names
    'brownline', 'redline', 'blueline', 'violetline', 'greenline', 'orangeline',
    'violetl', 'greenl', 'bluel', 'redl', 'orangel',
    'greend',
    # TikZ/LaTeX commands
    'tikzset', 'foreach', 'tikz', 'usefont', 'phv', 'bfseries', 'textbf',
    'pgfmathparse', 'addplot', 'sqrt',
    # Common variable names
    'cellsize', 'cellheight', 'xmax', 'ymin', 'newx', 'pos', 'sep',
    # Technical diagram terms
    'mycylinder', 'mycycle', 'myline', 'rgpoly', 'zerofill',
    # Display/UI elements
    'displaye', 'autotext',
    # Abbreviations used in diagrams
    'zgl', 'zgd', 'da', 'dcd', 'dcl', 'dsc', 'ggb', 'lca', 'sre',
    # Common acronyms and abbreviations
    'ui', 'kpis', 'oss', 'rtx', 'tpus', 'bg', 'eniac', 'fp',
    # Technical terms (keep legitimate ones but add clearly technical)
    'preprocessing', 'backprop', 'weightgradient', 'davit', 'tokenize',
    'multimodality', 'microarchitecture', 'hypercomputing', 'curation',
    'transformative',
    # Misc
    'helvetica', 'geeksforgeeks', 'lightgray', 'gaussian', 'yshift',
    'ack', 'zz', 'yy',
}

# Tri-state cache for the aspell availability probe: None = not yet
# probed. Previously an `aspell --version` subprocess was spawned on
# EVERY call; now we probe once per process.
_aspell_available = None


def check_spelling_with_aspell(text: str) -> List[str]:
    """Check spelling using aspell, filtering TikZ/LaTeX technical terms.

    The aspell availability probe is cached across calls; if aspell is
    installed after the first probe of this process it will not be
    picked up (acceptable for a short-lived CLI run).

    Returns:
        Misspelled words not in the ignore set; empty list when aspell
        is unavailable or fails.
    """
    global _aspell_available
    if _aspell_available is None:
        try:
            probe = subprocess.run(
                ['aspell', '--version'],
                capture_output=True,
                text=True
            )
            _aspell_available = probe.returncode == 0
        except FileNotFoundError:
            _aspell_available = False
    if not _aspell_available:
        return []
    # Use aspell's list mode: it echoes one unknown word per line.
    try:
        result = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True
        )
        if result.returncode == 0:
            words = [word for word in result.stdout.strip().split('\n') if word]
            # Filter out ignored terms (case-insensitive).
            return [w for w in words if w.lower() not in _TIKZ_IGNORE_TERMS]
    except Exception:
        # Best-effort: treat any aspell failure as "no findings".
        pass
    return []
def simple_spell_check(text: str) -> List[str]:
    """Pattern-based check against a fixed list of well-known typos.

    Returns:
        List of ``"typo (suggest: fix)"`` strings in order of appearance.
    """
    common_typos = {
        'teh': 'the',
        'htat': 'that',
        'taht': 'that',
        'adn': 'and',
        'nad': 'and',
        'gatewey': 'gateway',
        'poihnts': 'points',
        'poitns': 'points',
        'recieve': 'receive',
        'seperate': 'separate',
        'occured': 'occurred',
        'occurance': 'occurrence',
        'begining': 'beginning',
        'lenght': 'length',
        'widht': 'width',
        'heigth': 'height',
        'coordiante': 'coordinate',
        'cooridate': 'coordinate',
        'paramter': 'parameter',
        'paramters': 'parameters',
        'intellignet': 'intelligent',
    }
    # Lowercase first so the lookup is case-insensitive.
    return [
        f'{word} (suggest: {common_typos[word]})'
        for word in re.findall(r'\b[a-zA-Z]+\b', text.lower())
        if word in common_typos
    ]
def check_file(filepath: Path, use_aspell: bool = True) -> List[dict]:
    """Spell check the TikZ diagrams of a single file.

    Args:
        filepath: The .qmd file to inspect.
        use_aspell: When True, also run aspell on top of the
            pattern-based check.

    Returns:
        List of error dicts with keys file, line, text, context,
        suggestions; line is the starting line of the TikZ block.
    """
    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return []
    findings: List[dict] = []

    def record(block_line, snippet, context, suggestions):
        # One entry per (snippet, checker) hit.
        findings.append({
            'file': str(filepath),
            'line': block_line,
            'text': snippet,
            'context': context,
            'suggestions': suggestions,
        })

    for tikz_source, block_line in extract_tikz_blocks(content, str(filepath)):
        for snippet, context in extract_text_from_tikz(tikz_source):
            # Pattern check runs unconditionally.
            pattern_hits = simple_spell_check(snippet)
            if pattern_hits:
                record(block_line, snippet, context, pattern_hits)
            # Aspell check only when available and requested.
            if use_aspell:
                aspell_hits = check_spelling_with_aspell(snippet)
                if aspell_hits:
                    record(block_line, snippet, context, aspell_hits)
    return findings
def main():
    """Main function to check all .qmd files for TikZ spelling errors.

    Returns:
        Process exit code: 0 when clean, 1 when findings were reported
        or the contents directory is missing.
    """
    # Find all .qmd files in the quarto/contents directory. parents[3]
    # assumes this script lives three levels below the repo root
    # (tools/scripts/content/).
    repo_root = Path(__file__).resolve().parents[3]
    contents_dir = repo_root / 'quarto' / 'contents'
    if not contents_dir.exists():
        print(f"Error: Contents directory not found at {contents_dir}", file=sys.stderr)
        return 1
    qmd_files = list(contents_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for TikZ spelling errors...\n")
    # Check if aspell is available; fall back to pattern-only checking.
    use_aspell = True
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
        print("Using aspell for comprehensive spell checking.")
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("aspell not found. Using pattern-based checking only.")
        print("Install aspell for more comprehensive checking: brew install aspell\n")
        use_aspell = False
    all_errors = []
    files_with_errors = 0
    for qmd_file in sorted(qmd_files):
        errors = check_file(qmd_file, use_aspell)
        if errors:
            files_with_errors += 1
            all_errors.extend(errors)
    # Print results, grouped by file and ordered by (file, line)
    if all_errors:
        print(f"\nFound {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")
        current_file = None
        for error in sorted(all_errors, key=lambda e: (e['file'], e['line'])):
            if error['file'] != current_file:
                current_file = error['file']
                # NOTE(review): unlike the prose checker's main(), this
                # relative_to() is not guarded by try/except ValueError;
                # safe today because every file comes from under
                # repo_root — revisit if the search root becomes
                # configurable.
                rel_path = Path(error['file']).relative_to(repo_root)
                print(f"\n{rel_path}")
                print("=" * len(str(rel_path)))
            print(f" Line {error['line']}: {error['context']}")
            print(f" → Issues: {', '.join(error['suggestions'])}")
        print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
        return 1
    else:
        print("\n✓ No spelling errors found in TikZ diagrams!")
        return 0


if __name__ == '__main__':
    sys.exit(main())