mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-30 01:29:07 -05:00
feat(tools): add comprehensive spell checking for TikZ diagrams and prose
Add two complementary spell checking tools for content validation: - check_tikz_spelling.py: Extracts and validates all visible text from TikZ diagrams including node labels, inline annotations, custom pics, foreach loops, legends, and comments. Uses pattern-based matching for common typos with optional aspell integration. - check_prose_spelling.py: Intelligently parses QMD structure to check only actual prose content while excluding YAML frontmatter, code blocks, TikZ diagrams, inline code, math expressions, and URLs. Uses aspell with comprehensive ignore list of 500+ technical terms and acronyms. Both tools provide detailed output with file paths, line numbers, and context for identified spelling errors. The TikZ checker found and enabled fixing of typos like 'gatewey', 'poihnts', and 'Intellignet' across the codebase.
This commit is contained in:
497
tools/scripts/content/check_prose_spelling.py
Executable file
497
tools/scripts/content/check_prose_spelling.py
Executable file
@@ -0,0 +1,497 @@
|
||||
#!/usr/bin/env python3
|
||||
r"""
|
||||
Spell check prose content in QMD files using aspell.
|
||||
|
||||
Intelligently parses QMD file structure to only check actual prose text,
|
||||
excluding YAML frontmatter, code blocks, TikZ diagrams, inline code, URLs, etc.
|
||||
|
||||
Usage:
|
||||
python3 tools/scripts/content/check_prose_spelling.py [directory]
|
||||
|
||||
Requirements:
|
||||
- aspell must be installed (brew install aspell)
|
||||
- No Python dependencies beyond standard library
|
||||
|
||||
Checks:
|
||||
- Paragraph text
|
||||
- Headings
|
||||
- List items
|
||||
- Callout content
|
||||
|
||||
Ignores:
|
||||
- YAML frontmatter
|
||||
- Code blocks (```...```)
|
||||
- Inline code (`...`)
|
||||
- TikZ diagrams
|
||||
- URLs and links
|
||||
- LaTeX math ($...$, $$...$$)
|
||||
- Special Quarto syntax
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Set
|
||||
|
||||
|
||||
def extract_yaml_frontmatter(content: str) -> Tuple[int, int]:
    """
    Find the start and end character positions of YAML frontmatter.

    The frontmatter must open with a line that is exactly ``---`` at the
    very top of the file and close with another ``---`` line.

    Args:
        content: Full text of a QMD file.

    Returns:
        Tuple of (start_pos, end_pos) character offsets covering the
        frontmatter (including the closing delimiter line and its
        newline), or (0, 0) if no frontmatter is present.
    """
    lines = content.split('\n')

    # The opening delimiter must be a bare '---' line at the top of the
    # file; a plain prefix check would falsely match e.g. '----' or
    # '--- draft'.
    if not lines or lines[0].strip() != '---':
        return (0, 0)

    # Scan for the closing '---' line and convert its line index into a
    # character offset (each line contributes len(line) + 1 for '\n').
    for i, line in enumerate(lines[1:], 1):
        if line.strip() == '---':
            end = sum(len(lines[j]) + 1 for j in range(i + 1))
            return (0, end)

    # Unterminated frontmatter: treat the file as having none.
    return (0, 0)
|
||||
|
||||
|
||||
def extract_code_blocks(content: str) -> List[Tuple[int, int]]:
    """
    Locate fenced code blocks (```...```) and raw TikZ environments.

    Args:
        content: Full text of a QMD file.

    Returns:
        List of (start_pos, end_pos) character-offset tuples; fenced
        blocks first, then TikZ environments not wrapped in fences.
    """
    fenced = r'```.*?```'
    tikz = r'\\begin\{tikzpicture\}.*?\\end\{tikzpicture\}'

    spans: List[Tuple[int, int]] = []
    for pattern in (fenced, tikz):
        spans.extend(
            (m.start(), m.end())
            for m in re.finditer(pattern, content, re.DOTALL)
        )
    return spans
|
||||
|
||||
|
||||
def extract_inline_code(content: str) -> List[Tuple[int, int]]:
    """
    Locate inline code spans delimited by single backticks (`...`).

    Args:
        content: Full text of a QMD file.

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    return [
        (m.start(), m.end())
        for m in re.finditer(r'`[^`]+?`', content)
    ]
|
||||
|
||||
|
||||
def extract_math_blocks(content: str) -> List[Tuple[int, int]]:
    """
    Locate LaTeX math regions: display math ($$...$$) and inline ($...$).

    Args:
        content: Full text of a QMD file.

    Returns:
        List of (start_pos, end_pos) character-offset tuples; display
        math spans are listed before inline spans.
    """
    spans: List[Tuple[int, int]] = []

    # Display math: $$ ... $$ (may cross lines).
    display = re.compile(r'\$\$.*?\$\$', re.DOTALL)
    spans.extend((m.start(), m.end()) for m in display.finditer(content))

    # Inline math: a single $...$ pair whose delimiters are not part of
    # a $$ pair (enforced by the lookarounds).
    inline = re.compile(r'(?<!\$)\$(?!\$)[^\$]+?\$(?!\$)')
    spans.extend((m.start(), m.end()) for m in inline.finditer(content))

    return spans
|
||||
|
||||
|
||||
def extract_links_and_urls(content: str) -> List[Tuple[int, int]]:
    """
    Locate markdown link targets, reference-style links, and bare URLs.

    For ``[text](url)`` links only the ``(url`` portion is excluded so
    that the visible link text is still spell checked.

    Args:
        content: Full text of a QMD file.

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    spans: List[Tuple[int, int]] = []

    # Markdown links: exclude the URL part only, keeping the link text.
    for m in re.finditer(r'\[([^\]]+)\]\([^\)]+\)', content):
        open_paren = m.start() + m.group(0).find('](') + 1
        spans.append((open_paren, m.end() - 1))

    # Citations [@ref], anchors {#id}, and cross-references @sec-name.
    for m in re.finditer(r'(\[@[^\]]+\]|\{#[^\}]+\}|@[a-z]+-[a-z0-9-]+)', content):
        spans.append((m.start(), m.end()))

    # Bare URLs.
    for m in re.finditer(r'https?://[^\s\)>]+', content):
        spans.append((m.start(), m.end()))

    return spans
|
||||
|
||||
|
||||
def extract_quarto_syntax(content: str) -> List[Tuple[int, int]]:
    """
    Locate Quarto-specific markup that should not be spell checked.

    Covers fenced-div openers (``::: {.class}``) and shortcodes
    (``{{< ... >}}``).

    Args:
        content: Full text of a QMD file.

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    matches = list(re.finditer(r':::\s*\{[^\}]+\}', content))
    matches.extend(re.finditer(r'\{\{<.*?>\}\}', content, re.DOTALL))
    return [(m.start(), m.end()) for m in matches]
|
||||
|
||||
|
||||
def should_exclude_position(pos: int, exclude_ranges: List[Tuple[int, int]]) -> bool:
    """Return True if *pos* falls inside any half-open [start, end) range."""
    return any(start <= pos < end for start, end in exclude_ranges)
|
||||
|
||||
|
||||
def extract_prose_text(content: str) -> List[Tuple[str, int]]:
    """
    Extract only prose text from QMD content.

    Builds the union of all non-prose regions (frontmatter, code blocks,
    inline code, math, links, Quarto syntax) and then walks each line,
    keeping only characters outside those regions.

    Args:
        content: Full text of a QMD file.

    Returns:
        List of (text, line_number) tuples, one per contiguous prose
        segment; line numbers are 1-based.
    """
    # Collect every region that must not be spell checked.
    exclude_ranges: List[Tuple[int, int]] = []

    yaml_start, yaml_end = extract_yaml_frontmatter(content)
    if yaml_end > 0:
        exclude_ranges.append((yaml_start, yaml_end))

    exclude_ranges.extend(extract_code_blocks(content))
    exclude_ranges.extend(extract_inline_code(content))
    exclude_ranges.extend(extract_math_blocks(content))
    exclude_ranges.extend(extract_links_and_urls(content))
    exclude_ranges.extend(extract_quarto_syntax(content))

    # Sort and merge overlapping ranges so each position is covered once.
    exclude_ranges.sort()
    merged: List[Tuple[int, int]] = []
    for start, end in exclude_ranges:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    prose_segments: List[Tuple[str, int]] = []
    lines = content.split('\n')
    pos = 0

    for line_num, line in enumerate(lines, 1):
        line_start = pos
        line_end = pos + len(line)

        # Fast path: skip lines that lie entirely inside one excluded
        # region (e.g. inside a fenced code block).  BUG FIX: previously
        # a line was skipped whenever its *first* character was excluded,
        # which silently dropped prose on lines that merely begin with
        # inline code or a link.
        fully_excluded = any(
            start <= line_start and line_end <= end
            for start, end in merged
        )
        if not fully_excluded:
            # Collect the characters of this line that fall outside all
            # excluded regions, splitting into segments at exclusions.
            prose_text = ""
            for i, char in enumerate(line):
                if should_exclude_position(line_start + i, merged):
                    if prose_text.strip():
                        prose_segments.append((prose_text.strip(), line_num))
                    prose_text = ""
                else:
                    prose_text += char

            if prose_text.strip():
                prose_segments.append((prose_text.strip(), line_num))

        pos = line_end + 1  # +1 for the '\n' removed by split()

    return prose_segments
|
||||
|
||||
|
||||
def clean_prose_text(text: str) -> str:
    """
    Strip markdown formatting from prose, leaving bare words.

    Args:
        text: Raw prose text that may contain markdown markup.

    Returns:
        Cleaned text suitable for feeding to a spell checker.
    """
    # Unwrap emphasis markers, keeping the inner text.
    emphasis = [
        (r'\*\*([^\*]+)\*\*', r'\1'),  # bold
        (r'\*([^\*]+)\*', r'\1'),      # italic (asterisks)
        (r'_([^_]+)_', r'\1'),         # italic (underscores)
        (r'~~([^~]+)~~', r'\1'),       # strikethrough
    ]
    for pattern, replacement in emphasis:
        text = re.sub(pattern, replacement, text)

    # Drop any leftover markdown symbols (heading hashes, stray markers).
    text = re.sub(r'[#\*_~]', '', text)

    # Replace remaining punctuation with spaces, preserving in-word
    # apostrophes and hyphens.
    text = re.sub(r'[^\w\s\'-]', ' ', text)

    return text.strip()
|
||||
|
||||
|
||||
def check_with_aspell(text: str, ignore_terms: Set[str]) -> List[str]:
    """
    Run *text* through ``aspell list`` and collect flagged words.

    Args:
        text: Cleaned prose to spell check.
        ignore_terms: Lower-cased words that should never be reported.

    Returns:
        List of misspelled words not covered by *ignore_terms*; empty on
        any aspell failure.
    """
    try:
        proc = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception as e:
        print(f"Error running aspell: {e}", file=sys.stderr)
        return []

    if proc.returncode != 0:
        return []

    # aspell prints one unknown word per line; drop ignored terms.
    flagged = [word for word in proc.stdout.strip().split('\n') if word]
    return [word for word in flagged if word.lower() not in ignore_terms]
|
||||
|
||||
|
||||
def check_file(filepath: Path) -> List[dict]:
    """
    Check a single QMD file for spelling errors.

    Reads the file, extracts prose segments, cleans them of markdown,
    and runs each segment through aspell.

    Args:
        filepath: Path to the .qmd file to check.

    Returns:
        List of error dictionaries with keys 'file' (absolute Path),
        'line' (1-based line number), 'text' (context, truncated to 100
        chars), and 'misspelled' (list of flagged words).  Empty if the
        file cannot be read or no errors are found.
    """
    # Common technical terms to ignore
    # NOTE(review): this set is rebuilt on every call; acceptable for a
    # batch CLI tool, but could be hoisted to a module constant.
    ignore_terms = {
        # File formats and common abbreviations
        'qmd', 'yml', 'json', 'png', 'jpg', 'svg', 'pdf',
        'tikz', 'quarto', 'pandoc', 'latex', 'tensorflow', 'pytorch',
        'gpu', 'cpu', 'tpu', 'ram', 'api', 'ui', 'ux', 'cli', 'sdk',
        'yaml', 'toml', 'html', 'css', 'javascript', 'typescript',
        'numpy', 'pandas', 'matplotlib', 'jupyter', 'colab',
        'github', 'gitlab', 'bitbucket',
        'ai', 'ml', 'dl', 'cv', 'nlp', 'iot', 'rl', 'gan',
        'lstm', 'gru', 'rnn', 'cnn', 'vgg', 'resnet', 'bert',

        # ML systems and techniques
        'tinyml', 'microcontroller', 'microcontrollers', 'preprocessing',
        'convolutional', 'latencies', 'dns', 'dennard', 'triadic',
        'benchmarking', 'gdpr', 'hipaa', 'backpropagation', 'quantized',
        'autoregressive', 'overfitting', 'checkpointing', 'hyperparameters',
        'embeddings', 'spectrograms', 'mfcc', 'kws', 'activations',
        'mnist', 'feedforward', 'softmax', 'relu', 'sigmoid', 'thresholding',
        'postprocessing', 'suboptimal', 'multilayer', 'perceptrons',
        'cnns', 'rnns', 'mlps', 'dnn', 'translational', 'invariance',
        'parallelizable', 'uat', 'discriminative', 'fpgas', 'asics',
        'topologies', 'reconceptualization', 'orchestrators', 'bfloat',

        # Product and project names
        'plantvillage', 'nuru', 'farmbeats', 'respira', 'colabs', 'edgeml',
        'mlperf', 'linpack', 'specpowerssj', 'datahub', 'kubeflow',
        'mobilenets', 'efficientnets', 'gpt', 'palm',

        # Company and organization names
        'mckinsey', 'espressif', 'hortonworks', 'linkedin', 'uber', 'cloudtrail',

        # Acronyms and abbreviations
        'cmd', 'cbsd', 'mw', 'sram', 'sox', 'sdg', 'sdgs', 'agi', 'tco',
        'gpus', 'mlops', 'gigaflops', 'eniac', 'cpus', 'tpus', 'fp', 'nist',

        # Legitimate English words often flagged
        'underserved', 'sociotechnical', 'ebola', 'forecasted', 'unmonitored',
        'transformative', 'microclimates', 'microclimate', 'responders',
        'scalable', 'aspirational', 'lifecycle', 'lifecycles',
        'representativeness', 'reproducibility', 'milliwatt', 'milliwatts',
        'decomposable', 'interpretability', 'modularity', 'architecting',
        'instantiations', 'crowdsourcing', 'crowdsourced', 'interdependencies',
        'degradations', 'natively', 'detections', 'observability', 'exfiltration',
        'auditable', 'cryptographic', 'curation', 'engineerable', 'subfield',
        'misrouted', 'tradeoff', 'tradeoffs', 'pre',

        # People names (for attributions)
        'vijay', 'janapa', 'reddi', 'yann', 'lecun', 'corinna', 'burges',
        'cybenko', 'hornik', 'augereau',

        # Image filename patterns (without extensions)
        'covermlsystems', 'coveraigood', 'coveraibenchmarking',
        'coverconclusion', 'coverdataengineering', 'covernnprimer',
        'coverdlarch',

        # LaTeX commands
        'noindent',

        # AI tools
        'dall', 'dalle',

        # Short codes/patterns
        'fn',

        # Additional comprehensive technical terms (auto-generated from book content)
        'accelerometers', 'acm', 'adamw', 'additionality', 'adreno', 'aes', 'agentic', 'aiops',
        'airbnb', 'aitraining', 'akida', 'al', 'alexa', 'alexnet', 'algorithmically', 'alphafold',
        'ambri', 'amodei', 'anonymization', 'anonymized', 'anthropic', 'asilomar', 'auditability',
        'autocorrect', 'autocorrection', 'autocorrections', 'automatable', 'automl', 'avr', 'axonal',
        'backdoored', 'backdoors', 'backend', 'backends', 'balancers', 'batchsize', 'bibliometric',
        'binarization', 'biometric', 'bist', 'blas', 'bostrom', 'bottlenecked', 'brominated', 'carlini',
        'cfe', 'channelwise', 'chatbot', 'chatbots', 'chatgpt', 'checkmark', 'chiplet', 'chiplets',
        'clinaiops', 'cloudlets', 'cmsis', 'codecarbon', 'compas', 'conda', 'contestability', 'coprocessor',
        'coprocessors', 'coveraihardware', 'coveraiworkflow', 'coverefficientai', 'coverfrontiers',
        'coverintroduction', 'covermlframeworks', 'covermlops', 'covermodeloptimizations',
        'coverondevicelearning', 'coverresponsibleai', 'coverrobustai', 'coversecurityprivacy',
        'coversustainableai', 'cublas', 'cuda', 'customizations', 'cybersecurity', 'cyberweapon',
        'de', 'debois', 'debuggable', 'deepsparse', 'deepspeed', 'devops', 'distilbert', 'dma', 'dp',
        'dsp', 'dsps', 'dvfs', 'dwork', 'dx', 'eacs', 'electrodermal', 'electromechanical',
        'epistemologically', 'esg', 'esrs', 'et', 'ethnicities', 'ets', 'ewc', 'exaflops',
        'explainability', 'explanations', 'expressivity', 'externality', 'facto', 'failover', 'fairlearn',
        'fairscale', 'fe', 'fedavgm', 'fedprox', 'fi', 'flops', 'forrester', 'fpu', 'frac', 'freertos',
        'fx', 'gapped', 'gboard', 'gemm', 'gflops', 'giga', 'goertzel', 'gradcam', 'greenwashing',
        'groupwise', 'handlin', 'hbm', 'hd', 'hdfs', 'hitl', 'homomorphic', 'hsms', 'huggingface',
        'hwacc', 'hyperscale', 'iid', 'imagenet', 'imbalancing', 'incentivized', 'incentivizing',
        'instantiation', 'intentioned', 'interdependency', 'intra', 'jax', 'jenkins', 'jpeg', 'kaggle',
        'kanies', 'kawaguchi', 'kdd', 'keras', 'kinetis', 'kleinberg', 'kohsuke', 'kolmogorov', 'krum',
        'kryo', 'kubernetes', 'lapack', 'lca', 'leaderboards', 'lidar', 'llms', 'ln', 'loihi', 'lora',
        'lpddr', 'mah', 'maml', 'mance', 'mapa', 'mbed', 'mbps', 'mcus', 'medskip', 'metux', 'metuxs',
        'micronpu', 'microservices', 'microsystems', 'millijoules', 'misalignments', 'misclassification',
        'misclassifies', 'misclassify', 'misconfigured', 'mitigations', 'mj', 'mlcommons', 'mlflow',
        'mlir', 'mlp', 'mobilenetv', 'modelscaling', 'moores', 'msqe', 'multimodal', 'multiphase',
        'mwh', 'nas', 'natanz', 'nbsp', 'netron', 'neurosymbolic', 'ngo', 'nm', 'nn', 'npu', 'npus',
        'npv', 'nsight', 'numenta', 'numerics', 'nvlink', 'nwp', 'nxp', 'oecd', 'onnx', 'ons', 'openai',
        'opencl', 'openvino', 'openwebtext', 'operationalization', 'operationalize', 'operationalizing',
        'optum', 'ota', 'overcorrecting', 'overfit', 'overreliance', 'parallelizes', 'pcie', 'perceptron',
        'performant', 'personalization', 'pes', 'picojoules', 'pipelining', 'pj', 'plcs', 'ppv',
        'prefetched', 'prefetching', 'pretrained', 'programmability', 'proliferative', 'proprioception',
        'propublica', 'ptq', 'pufs', 'pypi', 'qat', 'qos', 'quadratically', 'quant', 'rbac', 'recalibrate',
        'recalibrating', 'recommender', 'reconceptualizes', 'recyclability', 'reframing', 'reimagined',
        'reimagining', 'reimplement', 'reimplementing', 'renewables', 'repairability', 'rescoring',
        'reskilling', 'retinopathy', 'reusability', 'ridesharing', 'rlhf', 'roadmap', 'rollout', 'rollouts',
        'rss', 'runtimes', 'sagemaker', 'sanitization', 'scipy', 'scopus', 'sdt', 'sgd', 'shader', 'shaders',
        'shap', 'shapley', 'simd', 'siri', 'situationally', 'slas', 'smi', 'smirnov', 'snns', 'snpe',
        'soc', 'socs', 'sparseml', 'sparsification', 'spinoff', 'sprase', 'spss', 'stationarity', 'stm',
        'stuxnet', 'swappable', 'synergistically', 'tcp', 'tdsp', 'tees', 'tensorboard', 'tensorrt',
        'tera', 'ternarization', 'tflite', 'tfx', 'thresholded', 'timm', 'titration', 'tls', 'tokenization',
        'toolchains', 'torchscript', 'torchserve', 'tpr', 'tpuv', 'tradeable', 'trojan', 'truenorth',
        'tvm', 'ultrapure', 'unbundled', 'underutilization', 'unexplainable', 'unimodal', 'unoptimized',
        'untrusted', 'upgradable', 'upgradeable', 'upskilling', 'uptime', 'usb', 'utensor', 'utopian',
        'vectornet', 'virusbokbok', 'vitis', 'von', 'vr', 'vtune', 'vulkan', 'waymo', 'wearables',
        'wellbeing', 'wi', 'xla', 'zero',
    }

    # Unreadable files are reported to stderr and treated as error-free.
    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return []

    prose_segments = extract_prose_text(content)
    errors = []

    for text, line_num in prose_segments:
        # Strip markdown before spell checking; skip segments that
        # cleaned down to nothing.
        cleaned = clean_prose_text(text)
        if not cleaned:
            continue

        misspelled = check_with_aspell(cleaned, ignore_terms)
        if misspelled:
            errors.append({
                'file': filepath.resolve(),  # Store absolute path
                'line': line_num,
                'text': text[:100] + ('...' if len(text) > 100 else ''),
                'misspelled': misspelled
            })

    return errors
|
||||
|
||||
|
||||
def main():
    """Run the prose spell checker over a directory tree of .qmd files.

    Returns:
        Process exit code: 0 when no errors were found, 1 otherwise.
    """
    # Fail fast if aspell is not installed.
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("Error: aspell not found. Install it with: brew install aspell", file=sys.stderr)
        return 1

    # Directory to scan: CLI argument, or the repo's core content tree.
    repo_root = Path(__file__).resolve().parents[3]
    if len(sys.argv) > 1:
        target_dir = Path(sys.argv[1])
    else:
        target_dir = repo_root / 'quarto' / 'contents' / 'core'

    if not target_dir.exists():
        print(f"Error: Directory not found: {target_dir}", file=sys.stderr)
        return 1

    qmd_files = list(target_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for prose spelling errors...\n")

    all_errors = []
    files_with_errors = 0
    for qmd_file in sorted(qmd_files):
        file_errors = check_file(qmd_file)
        if file_errors:
            files_with_errors += 1
            all_errors.extend(file_errors)

    if not all_errors:
        print("✓ No spelling errors found in prose text!")
        return 0

    # Report errors grouped by file, ordered by (path, line).
    print(f"Found {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")

    current_file = None
    for error in sorted(all_errors, key=lambda e: (str(e['file']), e['line'])):
        if error['file'] != current_file:
            current_file = error['file']
            # Prefer a repo-relative path; fall back to the absolute one
            # for files outside the repo.
            try:
                rel_path = error['file'].relative_to(repo_root)
            except ValueError:
                rel_path = error['file']
            print(f"\n{rel_path}")
            print("=" * len(str(rel_path)))

        print(f" Line {error['line']}: {error['text']}")
        print(f" → Misspelled: {', '.join(error['misspelled'])}")

    print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
    return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Propagate the checker's exit status (0 = clean, 1 = errors found).
    sys.exit(main())
|
||||
|
||||
494
tools/scripts/content/check_tikz_spelling.py
Executable file
494
tools/scripts/content/check_tikz_spelling.py
Executable file
@@ -0,0 +1,494 @@
|
||||
#!/usr/bin/env python3
|
||||
r"""
|
||||
Spell check text content within TikZ diagrams.
|
||||
|
||||
Extracts and validates all visible text from TikZ diagrams in .qmd files,
|
||||
including node labels, inline annotations, formatted text, and comments.
|
||||
|
||||
Usage:
|
||||
python3 tools/scripts/content/check_tikz_spelling.py
|
||||
|
||||
Checks text in:
|
||||
- Node commands: \node{text}, node{text} in \draw/\path/\fill
|
||||
- Formatted text: \textbf{}, \textit{}, \emph{}, etc.
|
||||
- Drawing annotations: \draw--node{label}--
|
||||
- Custom pics: pics/name/, \pic{name}
|
||||
- Foreach loops: /{Text}/ patterns
|
||||
- Labels: label={text}, pin={text}
|
||||
- Legends: \legend{Item 1, Item 2}
|
||||
- Comments: % text
|
||||
|
||||
Optional: Install aspell for comprehensive dictionary checking
|
||||
macOS: brew install aspell
|
||||
Ubuntu: sudo apt-get install aspell
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple, Set
|
||||
import subprocess
|
||||
|
||||
|
||||
def extract_tikz_blocks(content: str, filepath: str) -> List[Tuple[str, int]]:
    """
    Extract TikZ environments with the line number where each begins.

    Args:
        content: Full text of a .qmd file.
        filepath: Path of the file (currently unused; kept for the
            existing call signature).

    Returns:
        List of (tikz_source, start_line) tuples; line numbers 1-based.
    """
    blocks: List[Tuple[str, int]] = []
    current: List[str] = []
    start_line = 0
    inside = False

    for line_no, line in enumerate(content.split('\n'), 1):
        if r'\begin{tikzpicture}' in line:
            # A new environment opens; any unterminated one is discarded.
            inside = True
            start_line = line_no
            current = [line]
        elif r'\end{tikzpicture}' in line and inside:
            current.append(line)
            blocks.append(('\n'.join(current), start_line))
            inside = False
            current = []
        elif inside:
            current.append(line)

    return blocks
|
||||
|
||||
|
||||
def clean_latex_text(text: str) -> str:
    """
    Strip LaTeX/TikZ formatting from a text fragment.

    Args:
        text: Raw text extracted from TikZ source.

    Returns:
        The human-readable content with commands removed and whitespace
        normalised.
    """
    # LaTeX line breaks become plain spaces.
    text = text.replace('\\\\', ' ')

    # Size declarations preceding text, e.g. {\huge ?}.
    text = re.sub(
        r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\s+',
        ' ', text)

    # Font-selection commands.
    text = re.sub(r'\\usefont\{[^}]*\}\{[^}]*\}\{[^}]*\}\{[^}]*\}', ' ', text)
    text = re.sub(r'\\fontsize\{[^}]*\}\{[^}]*\}\\selectfont', ' ', text)
    text = re.sub(r'\\bfseries\s*', ' ', text)

    # Unwrap formatting commands, repeating to handle up to 3 levels of
    # nesting; order matches the original substitution sequence.
    unwrap = [
        (r'\\textbf\{([^}]+)\}', r'\1'),
        (r'\\textit\{([^}]+)\}', r'\1'),
        (r'\\emph\{([^}]+)\}', r'\1'),
        (r'\\text\{([^}]+)\}', r'\1'),
        (r'\\mathbf\{([^}]+)\}', r'\1'),
        (r'\\mathrm\{([^}]+)\}', r'\1'),
        (r'\\textsubscript\{([^}]+)\}', r'_\1'),
        (r'\\textsuperscript\{([^}]+)\}', r'^\1'),
        (r'\\textcolor\{[^}]*\}\{([^}]+)\}', r'\1'),
    ]
    for _ in range(3):
        for pattern, replacement in unwrap:
            text = re.sub(pattern, replacement, text)

    # Drop math-mode delimiters.
    text = text.replace('$', '')

    # Remove any remaining backslash commands, keeping trailing text.
    text = re.sub(r'\\[a-zA-Z]+\s*', ' ', text)

    # Collapse runs of whitespace.
    text = ' '.join(text.split())

    return text.strip()
|
||||
|
||||
|
||||
def extract_all_curly_brace_text(tikz_content: str) -> List[Tuple[str, str, int]]:
    """
    Collect brace-delimited text that could render as visible content.

    Scans node bodies, text-formatting commands, label/pin options and
    \\legend arguments.

    Args:
        tikz_content: Source of a single TikZ environment.

    Returns:
        List of (text, context, char_position) tuples, where context
        names the construct the text came from.
    """
    found: List[Tuple[str, str, int]] = []

    def _collect(pattern: str, context: str) -> None:
        # Record group(1) of each match along with where the match starts.
        for m in re.finditer(pattern, tikz_content):
            found.append((m.group(1), context, m.start()))

    # \node[opts](name) at (x,y) {text}
    _collect(
        r'\\node\s*(?:\[[^\]]*\])?\s*(?:\([^)]*\))?\s*(?:at\s*\([^)]*\))?\s*\{([^}]+)\}',
        '\\node{...}')

    # node[opts]{text} appearing inside \draw / \fill / \path
    _collect(r'(?<!\\)node\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
             'node{...} in draw/path/fill')

    # Text formatting commands
    _collect(r'\\textbf\{([^}]+)\}', '\\textbf{...}')
    _collect(r'\\textit\{([^}]+)\}', '\\textit{...}')
    _collect(r'\\emph\{([^}]+)\}', '\\emph{...}')
    _collect(r'\\text\{([^}]+)\}', '\\text{...}')

    # label={text}, pin={text}, xlabel/ylabel options
    _collect(r'(?:label|pin|xlabel|ylabel)\s*=\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}',
             'label={...}')

    # \legend{Item 1, Item 2}
    _collect(r'\\legend\s*\{([^}]+)\}', '\\legend{...}')

    return found
|
||||
|
||||
|
||||
def extract_text_from_foreach(tikz_content: str) -> List[Tuple[str, str]]:
    r"""
    Pull label text out of \foreach loops.

    Looks inside ``\foreach ... in {...}`` lists for ``/{Text}/`` items,
    which commonly carry human-readable labels.

    Args:
        tikz_content: Source of a single TikZ environment.

    Returns:
        List of (cleaned_text, context) tuples.
    """
    results: List[Tuple[str, str]] = []
    numeric_only = re.compile(r'^[\d\s\.,\-\+]+$')

    for loop in re.finditer(r'\\foreach[^{]+in\s*\{([^}]+)\}', tikz_content, re.DOTALL):
        # Each /{...}/ item inside the loop list is candidate label text.
        for raw in re.findall(r'/\{([^}]+)\}/', loop.group(1)):
            cleaned = clean_latex_text(raw)
            # Keep only material that looks like words, not coordinates
            # or bare numbers.
            if cleaned and len(cleaned) > 2 and not numeric_only.match(cleaned):
                results.append((cleaned, f'\\foreach loop: /{{{raw}}}/'))

    return results
|
||||
|
||||
|
||||
def extract_text_from_tikz(tikz_content: str) -> List[Tuple[str, str]]:
    """
    Extract ALL human-readable text from TikZ code.

    Aggregates text from brace-delimited content, \\foreach labels, pic
    names and usages, comments, and \\def variable names, de-duplicating
    on (lower-cased text, source kind).

    Args:
        tikz_content: Source of a single TikZ environment.

    Returns:
        List of (text, context) tuples where context shows where the text was found
    """
    texts = []
    seen_texts = set()  # Avoid duplicates across extraction passes

    # Extract all text from curly braces (nodes, labels, legends, ...)
    for raw_text, context, pos in extract_all_curly_brace_text(tikz_content):
        # Clean the text of LaTeX markup before filtering
        cleaned = clean_latex_text(raw_text)

        # Skip if it's just numbers, coordinates, colors, or TikZ commands
        if not cleaned:
            continue
        if re.match(r'^[\d\s\.,\-\+\*\/\(\)_\^]+$', cleaned):  # Just numbers/math/subscripts
            continue
        if re.match(r'^[a-z]+!?\d*$', cleaned):  # Colors like "red", "blue!50"
            continue
        if len(cleaned) < 2:  # Too short to be meaningful text
            continue

        # Avoid duplicates: same cleaned text from the same construct kind
        key = (cleaned.lower(), context)
        if key not in seen_texts:
            seen_texts.add(key)
            texts.append((cleaned, f'{context}: "{raw_text}"'))

    # Extract text from \foreach loops
    for text, context in extract_text_from_foreach(tikz_content):
        key = (text.lower(), 'foreach')
        if key not in seen_texts:
            seen_texts.add(key)
            texts.append((text, context))

    # Extract text from pic names (custom TikZ pictures)
    pic_name_pattern = r'pics/([a-zA-Z_]+)/'
    for match in re.finditer(pic_name_pattern, tikz_content):
        name = match.group(1)
        if len(name) > 2:
            key = (name.lower(), 'pics')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((name, f'pics/{name}/'))

    # Extract text from pic usage (\pic ... {name})
    pic_usage_pattern = r'\\pic\s*(?:\[[^\]]*\])?\s*(?:at\s*\([^)]*\))?\s*\{([^}]+)\}'
    for match in re.finditer(pic_usage_pattern, tikz_content):
        name = match.group(1)
        if len(name) > 2 and not re.match(r'^[\d\s]+$', name):
            key = (name.lower(), 'pic_usage')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((name, f'\\pic{{...}}{{{name}}}'))

    # Extract comments (often contain descriptive text)
    comment_pattern = r'%\s*(.+?)(?:\n|$)'
    for match in re.finditer(comment_pattern, tikz_content):
        comment = match.group(1).strip()
        # Skip comments that are just separators or structure
        if comment and not re.match(r'^[\-\=\*\s]+$', comment):
            key = (comment.lower(), 'comment')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((comment, f'% {comment}'))

    # Extract variable names from \def that might be words
    def_pattern = r'\\def\\([a-zA-Z]+)\{'
    for match in re.finditer(def_pattern, tikz_content):
        name = match.group(1)
        # Only check if it looks like a word (not all caps, reasonable length)
        # NOTE(review): names starting with 'r' are skipped — presumably to
        # ignore macros like \rgpoly, but this also drops real words such
        # as 'radius'; confirm the heuristic is intended.
        if len(name) > 3 and not name.isupper() and not name.startswith('r'):
            key = (name.lower(), 'def')
            if key not in seen_texts:
                seen_texts.add(key)
                texts.append((name, f'\\def\\{name}'))

    return texts
|
||||
|
||||
|
||||
def check_spelling_with_aspell(text: str) -> List[str]:
    """
    Check spelling using aspell if available, filtering out TikZ/LaTeX technical terms.

    Pipes *text* through ``aspell list --lang=en`` and filters the words
    aspell reports against a built-in ignore list of TikZ syntax, LaTeX
    commands, color/variable names, and diagram abbreviations.

    Args:
        text: Raw text extracted from a TikZ diagram.

    Returns:
        List of misspelled words (excluding known technical terms).
        Empty list if aspell is not installed, cannot be launched, or
        exits with a non-zero status.
    """
    # Terms to ignore (TikZ syntax, LaTeX commands, common technical terms, etc.)
    ignore_terms = {
        # TikZ pic parameters
        'scalefac', 'picname', 'filllcolor', 'drawcolor', 'linewidth',
        'filllcirclecolor', 'drawcircle', 'bodycolor', 'tiecolor', 'stetcolor',
        'drawchannelcolor', 'channelcolor',

        # Color names
        'brownline', 'redline', 'blueline', 'violetline', 'greenline', 'orangeline',
        'violetl', 'greenl', 'bluel', 'redl', 'orangel',
        'greend',

        # TikZ/LaTeX commands
        'tikzset', 'foreach', 'tikz', 'usefont', 'phv', 'bfseries', 'textbf',
        'pgfmathparse', 'addplot', 'sqrt',

        # Common variable names
        'cellsize', 'cellheight', 'xmax', 'ymin', 'newx', 'pos', 'sep',

        # Technical diagram terms
        'mycylinder', 'mycycle', 'myline', 'rgpoly', 'zerofill',

        # Display/UI elements
        'displaye', 'autotext',

        # Abbreviations used in diagrams
        'zgl', 'zgd', 'da', 'dcd', 'dcl', 'dsc', 'ggb', 'lca', 'sre',

        # Common acronyms and abbreviations
        'ui', 'kpis', 'oss', 'rtx', 'tpus', 'bg', 'eniac', 'fp',

        # Technical terms (keep legitimate ones but add clearly technical)
        'preprocessing', 'backprop', 'weightgradient', 'davit', 'tokenize',
        'multimodality', 'microarchitecture', 'hypercomputing', 'curation',
        'transformative',

        # Misc
        'helvetica', 'geeksforgeeks', 'lightgray', 'gaussian', 'yshift',
        'ack', 'zz', 'yy',
    }

    try:
        # Single invocation: a missing binary raises FileNotFoundError here,
        # so no separate `aspell --version` availability probe is needed
        # (previously two subprocesses were spawned per call).
        result = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError):
        # aspell unavailable or failed to launch; degrade gracefully,
        # matching the original "return no findings" behavior.
        return []

    if result.returncode != 0:
        # e.g. the English dictionary is not installed.
        return []

    words = [word for word in result.stdout.strip().split('\n') if word]
    # Drop known technical terms so only genuine typo candidates remain.
    return [w for w in words if w.lower() not in ignore_terms]
|
||||
|
||||
|
||||
def simple_spell_check(text: str) -> List[str]:
    """
    Pattern-based spell checking against a fixed table of common mistakes.

    Tokenizes *text* into lowercase alphabetic words and reports every
    occurrence that matches a known typo, in order of appearance.

    Returns:
        List of potential typos, each formatted as
        ``"<typo> (suggest: <correction>)"``.
    """
    common_typos = {
        'teh': 'the',
        'htat': 'that',
        'taht': 'that',
        'adn': 'and',
        'nad': 'and',
        'gatewey': 'gateway',
        'poihnts': 'points',
        'poitns': 'points',
        'recieve': 'receive',
        'seperate': 'separate',
        'occured': 'occurred',
        'occurance': 'occurrence',
        'begining': 'beginning',
        'lenght': 'length',
        'widht': 'width',
        'heigth': 'height',
        'coordiante': 'coordinate',
        'cooridate': 'coordinate',
        'paramter': 'parameter',
        'paramters': 'parameters',
        'intellignet': 'intelligent',
    }

    # Comprehension keeps the original left-to-right reporting order.
    return [
        f'{token} (suggest: {common_typos[token]})'
        for token in re.findall(r'\b[a-zA-Z]+\b', text.lower())
        if token in common_typos
    ]
|
||||
|
||||
|
||||
def check_file(filepath: Path, use_aspell: bool = True) -> List[dict]:
    """
    Check a single file for spelling errors in its TikZ diagrams.

    Args:
        filepath: The .qmd file to scan.
        use_aspell: When True, also run the aspell-based check in
            addition to the pattern-based one.

    Returns:
        List of error dictionaries with file, line, text, context,
        and suggestions. Empty if the file cannot be read.
    """
    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as exc:
        print(f"Error reading {filepath}: {exc}", file=sys.stderr)
        return []

    findings: List[dict] = []

    for block_text, block_line in extract_tikz_blocks(content, str(filepath)):
        for snippet, ctx in extract_text_from_tikz(block_text):
            # Pattern check always runs; aspell check is optional.
            # Each checker that reports issues yields its own record,
            # mirroring the pattern-first ordering.
            issue_lists = [simple_spell_check(snippet)]
            if use_aspell:
                issue_lists.append(check_spelling_with_aspell(snippet))

            for issues in issue_lists:
                if issues:
                    findings.append({
                        'file': str(filepath),
                        'line': block_line,
                        'text': snippet,
                        'context': ctx,
                        'suggestions': issues,
                    })

    return findings
|
||||
|
||||
|
||||
def main():
    """Entry point: scan every .qmd file under quarto/contents for TikZ spelling errors.

    Returns:
        0 when no issues were found, 1 when errors were reported or the
        contents directory is missing.
    """
    # Script lives at tools/scripts/content/, so the repo root is 3 levels up.
    repo_root = Path(__file__).resolve().parents[3]
    contents_dir = repo_root / 'quarto' / 'contents'

    if not contents_dir.exists():
        print(f"Error: Contents directory not found at {contents_dir}", file=sys.stderr)
        return 1

    qmd_files = list(contents_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for TikZ spelling errors...\n")

    # Probe once for aspell; fall back to pattern-only checking without it.
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("aspell not found. Using pattern-based checking only.")
        print("Install aspell for more comprehensive checking: brew install aspell\n")
        use_aspell = False
    else:
        print("Using aspell for comprehensive spell checking.")
        use_aspell = True

    # Scan every file once, then derive both aggregates from the results.
    per_file = [(path, check_file(path, use_aspell)) for path in sorted(qmd_files)]
    all_errors = [err for _, errs in per_file for err in errs]
    files_with_errors = sum(1 for _, errs in per_file if errs)

    if not all_errors:
        print("\n✓ No spelling errors found in TikZ diagrams!")
        return 0

    print(f"\nFound {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")

    # Group output by file: print a header whenever the file changes.
    previous_file = None
    for err in sorted(all_errors, key=lambda e: (e['file'], e['line'])):
        if err['file'] != previous_file:
            previous_file = err['file']
            rel_path = Path(err['file']).relative_to(repo_root)
            print(f"\n{rel_path}")
            print("=" * len(str(rel_path)))

        print(f"  Line {err['line']}: {err['context']}")
        print(f"  → Issues: {', '.join(err['suggestions'])}")

    print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
    return 1
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Propagate main()'s return code as the process exit status.
    raise SystemExit(main())
|
||||
|
||||
Reference in New Issue
Block a user