#!/usr/bin/env python3 r""" Spell check prose content in QMD files using aspell. Intelligently parses QMD file structure to only check actual prose text, excluding YAML frontmatter, code blocks, TikZ diagrams, inline code, URLs, etc. Usage: python3 tools/scripts/content/check_prose_spelling.py [directory] Requirements: - aspell must be installed (brew install aspell) - No Python dependencies beyond standard library Checks: - Paragraph text - Headings - List items - Callout content Ignores: - YAML frontmatter - Code blocks (```...```) - Inline code (`...`) - TikZ diagrams - URLs and links - LaTeX math ($...$, $$...$$) - Special Quarto syntax """ import re import sys import subprocess from pathlib import Path from typing import List, Tuple, Set def extract_yaml_frontmatter(content: str) -> Tuple[int, int]: """ Find the start and end positions of YAML frontmatter. Returns: Tuple of (start_pos, end_pos) or (0, 0) if no frontmatter """ if not content.startswith('---'): return (0, 0) # Find the closing --- lines = content.split('\n') for i, line in enumerate(lines[1:], 1): if line.strip() == '---': # Return character positions start = 0 end = sum(len(lines[j]) + 1 for j in range(i + 1)) return (start, end) return (0, 0) def extract_code_blocks(content: str) -> List[Tuple[int, int]]: """ Find all code blocks (```...``` and TikZ blocks). Returns: List of (start_pos, end_pos) tuples """ blocks = [] # Find ``` code blocks pattern = r'```.*?```' for match in re.finditer(pattern, content, re.DOTALL): blocks.append((match.start(), match.end())) # Find TikZ blocks specifically (in case they're not in ```) tikz_pattern = r'\\begin\{tikzpicture\}.*?\\end\{tikzpicture\}' for match in re.finditer(tikz_pattern, content, re.DOTALL): blocks.append((match.start(), match.end())) return blocks def extract_inline_code(content: str) -> List[Tuple[int, int]]: """ Find all inline code spans (`...`). Returns: List of (start_pos, end_pos) tuples """ spans = [] pattern = r'`[^`]+?`' for match in re.finditer(pattern, content): spans.append((match.start(), match.end())) return spans def extract_math_blocks(content: str) -> List[Tuple[int, int]]: """ Find all LaTeX math blocks ($...$, $$...$$). Returns: List of (start_pos, end_pos) tuples """ blocks = [] # Display math $$...$$ pattern = r'\$\$.*?\$\$' for match in re.finditer(pattern, content, re.DOTALL): blocks.append((match.start(), match.end())) # Inline math $...$ pattern = r'(? List[Tuple[int, int]]: """ Find all markdown links and URLs. Returns: List of (start_pos, end_pos) tuples """ spans = [] # Markdown links [text](url) pattern = r'\[([^\]]+)\]\([^\)]+\)' for match in re.finditer(pattern, content): # Only exclude the URL part, keep the link text url_start = match.group(0).find('](') + match.start() + 1 url_end = match.end() - 1 spans.append((url_start, url_end)) # Reference-style links [@ref], {#id}, @sec-name pattern = r'(\[@[^\]]+\]|\{#[^\}]+\}|@[a-z]+-[a-z0-9-]+)' for match in re.finditer(pattern, content): spans.append((match.start(), match.end())) # Plain URLs pattern = r'https?://[^\s\)>]+' for match in re.finditer(pattern, content): spans.append((match.start(), match.end())) return spans def extract_quarto_syntax(content: str) -> List[Tuple[int, int]]: """ Find Quarto-specific syntax to exclude. Returns: List of (start_pos, end_pos) tuples """ spans = [] # Quarto divs ::: {.classname} pattern = r':::\s*\{[^\}]+\}' for match in re.finditer(pattern, content): spans.append((match.start(), match.end())) # Quarto shortcodes {{< ... >}} pattern = r'\{\{<.*?>\}\}' for match in re.finditer(pattern, content, re.DOTALL): spans.append((match.start(), match.end())) return spans def should_exclude_position(pos: int, exclude_ranges: List[Tuple[int, int]]) -> bool: """Check if a position falls within any exclude range.""" for start, end in exclude_ranges: if start <= pos < end: return True return False def extract_prose_text(content: str) -> List[Tuple[str, int]]: """ Extract only prose text from QMD content. Returns: List of (text, line_number) tuples """ # Build exclude ranges exclude_ranges = [] yaml_start, yaml_end = extract_yaml_frontmatter(content) if yaml_end > 0: exclude_ranges.append((yaml_start, yaml_end)) exclude_ranges.extend(extract_code_blocks(content)) exclude_ranges.extend(extract_inline_code(content)) exclude_ranges.extend(extract_math_blocks(content)) exclude_ranges.extend(extract_links_and_urls(content)) exclude_ranges.extend(extract_quarto_syntax(content)) # Sort and merge overlapping ranges exclude_ranges.sort() merged = [] for start, end in exclude_ranges: if merged and start <= merged[-1][1]: merged[-1] = (merged[-1][0], max(merged[-1][1], end)) else: merged.append((start, end)) # Extract prose text prose_segments = [] lines = content.split('\n') pos = 0 for line_num, line in enumerate(lines, 1): line_start = pos line_end = pos + len(line) # Check if any part of this line is prose if not should_exclude_position(line_start, merged): # Extract prose parts from this line prose_text = "" for i, char in enumerate(line): char_pos = line_start + i if not should_exclude_position(char_pos, merged): prose_text += char else: if prose_text.strip(): prose_segments.append((prose_text.strip(), line_num)) prose_text = "" if prose_text.strip(): prose_segments.append((prose_text.strip(), line_num)) pos = line_end + 1 # +1 for newline return prose_segments def clean_prose_text(text: str) -> str: """ Clean prose text of markdown formatting while keeping words. Args: text: Raw prose text with markdown Returns: Cleaned text for spell checking """ # Remove markdown formatting text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text) # Bold text = re.sub(r'\*([^\*]+)\*', r'\1', text) # Italic text = re.sub(r'_([^_]+)_', r'\1', text) # Italic text = re.sub(r'~~([^~]+)~~', r'\1', text) # Strikethrough # Remove remaining markdown symbols text = re.sub(r'[#\*_~]', '', text) # Remove special characters but keep apostrophes in words text = re.sub(r'[^\w\s\'-]', ' ', text) return text.strip() def check_with_aspell(text: str, ignore_terms: Set[str]) -> List[str]: """ Check text with aspell. Returns: List of misspelled words """ try: result = subprocess.run( ['aspell', 'list', '--lang=en'], input=text, capture_output=True, text=True, check=False ) if result.returncode == 0: words = [w for w in result.stdout.strip().split('\n') if w] # Filter ignore terms filtered = [w for w in words if w.lower() not in ignore_terms] return filtered return [] except Exception as e: print(f"Error running aspell: {e}", file=sys.stderr) return [] def check_file(filepath: Path) -> List[dict]: """ Check a single QMD file for spelling errors. Returns: List of error dictionaries """ # Common technical terms to ignore ignore_terms = { # File formats and common abbreviations 'qmd', 'yml', 'json', 'png', 'jpg', 'svg', 'pdf', 'tikz', 'quarto', 'pandoc', 'latex', 'tensorflow', 'pytorch', 'gpu', 'cpu', 'tpu', 'ram', 'api', 'ui', 'ux', 'cli', 'sdk', 'yaml', 'toml', 'html', 'css', 'javascript', 'typescript', 'numpy', 'pandas', 'matplotlib', 'jupyter', 'colab', 'github', 'gitlab', 'bitbucket', 'ai', 'ml', 'dl', 'cv', 'nlp', 'iot', 'rl', 'gan', 'lstm', 'gru', 'rnn', 'cnn', 'vgg', 'resnet', 'bert', # ML systems and techniques 'tinyml', 'microcontroller', 'microcontrollers', 'preprocessing', 'convolutional', 'latencies', 'dns', 'dennard', 'triadic', 'benchmarking', 'gdpr', 'hipaa', 'backpropagation', 'quantized', 'autoregressive', 'overfitting', 'checkpointing', 'hyperparameters', 'embeddings', 'spectrograms', 'mfcc', 'kws', 'activations', 'mnist', 'feedforward', 'softmax', 'relu', 'sigmoid', 'thresholding', 'postprocessing', 'suboptimal', 'multilayer', 'perceptrons', 'cnns', 'rnns', 'mlps', 'dnn', 'translational', 'invariance', 'parallelizable', 'uat', 'discriminative', 'fpgas', 'asics', 'topologies', 'reconceptualization', 'orchestrators', 'bfloat', # Product and project names 'plantvillage', 'nuru', 'farmbeats', 'respira', 'colabs', 'edgeml', 'mlperf', 'linpack', 'specpowerssj', 'datahub', 'kubeflow', 'mobilenets', 'efficientnets', 'gpt', 'palm', # Company and organization names 'mckinsey', 'espressif', 'hortonworks', 'linkedin', 'uber', 'cloudtrail', # Acronyms and abbreviations 'cmd', 'cbsd', 'mw', 'sram', 'sox', 'sdg', 'sdgs', 'agi', 'tco', 'gpus', 'mlops', 'gigaflops', 'eniac', 'cpus', 'tpus', 'fp', 'nist', # Legitimate English words often flagged 'underserved', 'sociotechnical', 'ebola', 'forecasted', 'unmonitored', 'transformative', 'microclimates', 'microclimate', 'responders', 'scalable', 'aspirational', 'lifecycle', 'lifecycles', 'representativeness', 'reproducibility', 'milliwatt', 'milliwatts', 'decomposable', 'interpretability', 'modularity', 'architecting', 'instantiations', 'crowdsourcing', 'crowdsourced', 'interdependencies', 'degradations', 'natively', 'detections', 'observability', 'exfiltration', 'auditable', 'cryptographic', 'curation', 'engineerable', 'subfield', 'misrouted', 'tradeoff', 'tradeoffs', 'pre', # People names (for attributions) 'vijay', 'janapa', 'reddi', 'yann', 'lecun', 'corinna', 'burges', 'cybenko', 'hornik', 'augereau', # Image filename patterns (without extensions) 'covermlsystems', 'coveraigood', 'coveraibenchmarking', 'coverconclusion', 'coverdataengineering', 'covernnprimer', 'coverdlarch', # LaTeX commands 'noindent', # AI tools 'dall', 'dalle', # Short codes/patterns 'fn', # Additional comprehensive technical terms (auto-generated from book content) 'accelerometers', 'acm', 'adamw', 'additionality', 'adreno', 'aes', 'agentic', 'aiops', 'airbnb', 'aitraining', 'akida', 'al', 'alexa', 'alexnet', 'algorithmically', 'alphafold', 'ambri', 'amodei', 'anonymization', 'anonymized', 'anthropic', 'asilomar', 'auditability', 'autocorrect', 'autocorrection', 'autocorrections', 'automatable', 'automl', 'avr', 'axonal', 'backdoored', 'backdoors', 'backend', 'backends', 'balancers', 'batchsize', 'bibliometric', 'binarization', 'biometric', 'bist', 'blas', 'bostrom', 'bottlenecked', 'brominated', 'carlini', 'cfe', 'channelwise', 'chatbot', 'chatbots', 'chatgpt', 'checkmark', 'chiplet', 'chiplets', 'clinaiops', 'cloudlets', 'cmsis', 'codecarbon', 'compas', 'conda', 'contestability', 'coprocessor', 'coprocessors', 'coveraihardware', 'coveraiworkflow', 'coverefficientai', 'coverfrontiers', 'coverintroduction', 'covermlframeworks', 'covermlops', 'covermodeloptimizations', 'coverondevicelearning', 'coverresponsibleai', 'coverrobustai', 'coversecurityprivacy', 'coversustainableai', 'cublas', 'cuda', 'customizations', 'cybersecurity', 'cyberweapon', 'de', 'debois', 'debuggable', 'deepsparse', 'deepspeed', 'devops', 'distilbert', 'dma', 'dp', 'dsp', 'dsps', 'dvfs', 'dwork', 'dx', 'eacs', 'electrodermal', 'electromechanical', 'epistemologically', 'esg', 'esrs', 'et', 'ethnicities', 'ets', 'ewc', 'exaflops', 'explainability', 'explanations', 'expressivity', 'externality', 'facto', 'failover', 'fairlearn', 'fairscale', 'fe', 'fedavgm', 'fedprox', 'fi', 'flops', 'forrester', 'fpu', 'frac', 'freertos', 'fx', 'gapped', 'gboard', 'gemm', 'gflops', 'giga', 'goertzel', 'gradcam', 'greenwashing', 'groupwise', 'handlin', 'hbm', 'hd', 'hdfs', 'hitl', 'homomorphic', 'hsms', 'huggingface', 'hwacc', 'hyperscale', 'iid', 'imagenet', 'imbalancing', 'incentivized', 'incentivizing', 'instantiation', 'intentioned', 'interdependency', 'intra', 'jax', 'jenkins', 'jpeg', 'kaggle', 'kanies', 'kawaguchi', 'kdd', 'keras', 'kinetis', 'kleinberg', 'kohsuke', 'kolmogorov', 'krum', 'kryo', 'kubernetes', 'lapack', 'lca', 'leaderboards', 'lidar', 'llms', 'ln', 'loihi', 'lora', 'lpddr', 'mah', 'maml', 'mance', 'mapa', 'mbed', 'mbps', 'mcus', 'medskip', 'metux', 'metuxs', 'micronpu', 'microservices', 'microsystems', 'millijoules', 'misalignments', 'misclassification', 'misclassifies', 'misclassify', 'misconfigured', 'mitigations', 'mj', 'mlcommons', 'mlflow', 'mlir', 'mlp', 'mobilenetv', 'modelscaling', 'moores', 'msqe', 'multimodal', 'multiphase', 'mwh', 'nas', 'natanz', 'nbsp', 'netron', 'neurosymbolic', 'ngo', 'nm', 'nn', 'npu', 'npus', 'npv', 'nsight', 'numenta', 'numerics', 'nvlink', 'nwp', 'nxp', 'oecd', 'onnx', 'ons', 'openai', 'opencl', 'openvino', 'openwebtext', 'operationalization', 'operationalize', 'operationalizing', 'optum', 'ota', 'overcorrecting', 'overfit', 'overreliance', 'parallelizes', 'pcie', 'perceptron', 'performant', 'personalization', 'pes', 'picojoules', 'pipelining', 'pj', 'plcs', 'ppv', 'prefetched', 'prefetching', 'pretrained', 'programmability', 'proliferative', 'proprioception', 'propublica', 'ptq', 'pufs', 'pypi', 'qat', 'qos', 'quadratically', 'quant', 'rbac', 'recalibrate', 'recalibrating', 'recommender', 'reconceptualizes', 'recyclability', 'reframing', 'reimagined', 'reimagining', 'reimplement', 'reimplementing', 'renewables', 'repairability', 'rescoring', 'reskilling', 'retinopathy', 'reusability', 'ridesharing', 'rlhf', 'roadmap', 'rollout', 'rollouts', 'rss', 'runtimes', 'sagemaker', 'sanitization', 'scipy', 'scopus', 'sdt', 'sgd', 'shader', 'shaders', 'shap', 'shapley', 'simd', 'siri', 'situationally', 'slas', 'smi', 'smirnov', 'snns', 'snpe', 'soc', 'socs', 'sparseml', 'sparsification', 'spinoff', 'sprase', 'spss', 'stationarity', 'stm', 'stuxnet', 'swappable', 'synergistically', 'tcp', 'tdsp', 'tees', 'tensorboard', 'tensorrt', 'tera', 'ternarization', 'tflite', 'tfx', 'thresholded', 'timm', 'titration', 'tls', 'tokenization', 'toolchains', 'torchscript', 'torchserve', 'tpr', 'tpuv', 'tradeable', 'trojan', 'truenorth', 'tvm', 'ultrapure', 'unbundled', 'underutilization', 'unexplainable', 'unimodal', 'unoptimized', 'untrusted', 'upgradable', 'upgradeable', 'upskilling', 'uptime', 'usb', 'utensor', 'utopian', 'vectornet', 'virusbokbok', 'vitis', 'von', 'vr', 'vtune', 'vulkan', 'waymo', 'wearables', 'wellbeing', 'wi', 'xla', 'zero', } try: content = filepath.read_text(encoding='utf-8') except Exception as e: print(f"Error reading {filepath}: {e}", file=sys.stderr) return [] prose_segments = extract_prose_text(content) errors = [] for text, line_num in prose_segments: cleaned = clean_prose_text(text) if not cleaned: continue misspelled = check_with_aspell(cleaned, ignore_terms) if misspelled: errors.append({ 'file': filepath.resolve(), # Store absolute path 'line': line_num, 'text': text[:100] + ('...' if len(text) > 100 else ''), 'misspelled': misspelled }) return errors def main(): """Main function.""" # Check if aspell is available try: subprocess.run(['aspell', '--version'], capture_output=True, check=True) except (FileNotFoundError, subprocess.CalledProcessError): print("Error: aspell not found. Install it with: brew install aspell", file=sys.stderr) return 1 # Get directory to check repo_root = Path(__file__).resolve().parents[3] if len(sys.argv) > 1: target_dir = Path(sys.argv[1]) else: target_dir = repo_root / 'quarto' / 'contents' / 'core' if not target_dir.exists(): print(f"Error: Directory not found: {target_dir}", file=sys.stderr) return 1 # Find all QMD files qmd_files = list(target_dir.rglob('*.qmd')) print(f"Checking {len(qmd_files)} .qmd files for prose spelling errors...\n") all_errors = [] files_with_errors = 0 for qmd_file in sorted(qmd_files): errors = check_file(qmd_file) if errors: files_with_errors += 1 all_errors.extend(errors) # Print results if all_errors: print(f"Found {len(all_errors)} potential spelling errors in {files_with_errors} files:\n") current_file = None for error in sorted(all_errors, key=lambda e: (str(e['file']), e['line'])): if error['file'] != current_file: current_file = error['file'] try: rel_path = error['file'].relative_to(repo_root) except ValueError: rel_path = error['file'] print(f"\n{rel_path}") print("=" * len(str(rel_path))) print(f" Line {error['line']}: {error['text']}") print(f" → Misspelled: {', '.join(error['misspelled'])}") print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files") return 1 else: print("✓ No spelling errors found in prose text!") return 0 if __name__ == '__main__': sys.exit(main())