Files
cs249r_book/book/tools/scripts/content/check_prose_spelling.py

497 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
r"""
Spell check prose content in QMD files using aspell.
Intelligently parses QMD file structure to only check actual prose text,
excluding YAML frontmatter, code blocks, TikZ diagrams, inline code, URLs, etc.
Usage:
python3 tools/scripts/content/check_prose_spelling.py [directory]
Requirements:
- aspell must be installed (brew install aspell)
- No Python dependencies beyond standard library
Checks:
- Paragraph text
- Headings
- List items
- Callout content
Ignores:
- YAML frontmatter
- Code blocks (```...```)
- Inline code (`...`)
- TikZ diagrams
- URLs and links
- LaTeX math ($...$, $$...$$)
- Special Quarto syntax
"""
import re
import sys
import subprocess
from pathlib import Path
from typing import List, Tuple, Set
def extract_yaml_frontmatter(content: str) -> Tuple[int, int]:
    """
    Find the start and end positions of YAML frontmatter.

    Frontmatter is delimited by a line that is exactly ``---`` at the very
    top of the file and a matching closing ``---`` line.

    Args:
        content: Full text of the QMD file.

    Returns:
        Tuple of (start_pos, end_pos) character offsets, or (0, 0) if the
        file has no (closed) frontmatter block.
    """
    lines = content.split('\n')
    # The opening delimiter must be exactly '---' on the first line.
    # A bare startswith('---') would misfire on '----' or '--- title'.
    if not lines or lines[0].strip() != '---':
        return (0, 0)
    for i, line in enumerate(lines[1:], 1):
        if line.strip() == '---':
            # End offset counts every line through the closing delimiter,
            # plus one newline per line.
            end = sum(len(lines[j]) + 1 for j in range(i + 1))
            return (0, end)
    # Opening delimiter with no closing one: treat as no frontmatter.
    return (0, 0)
def extract_code_blocks(content: str) -> List[Tuple[int, int]]:
    """
    Locate fenced code blocks (```...```) and raw TikZ environments.

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    patterns = (
        r'```.*?```',                                      # fenced blocks
        r'\\begin\{tikzpicture\}.*?\\end\{tikzpicture\}',  # bare TikZ
    )
    return [
        (m.start(), m.end())
        for pattern in patterns
        for m in re.finditer(pattern, content, re.DOTALL)
    ]
def extract_inline_code(content: str) -> List[Tuple[int, int]]:
    """
    Locate inline code spans delimited by single backticks (`...`).

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    return [m.span() for m in re.finditer(r'`[^`]+?`', content)]
def extract_math_blocks(content: str) -> List[Tuple[int, int]]:
    """
    Locate LaTeX math regions: display math ($$...$$) first, then inline
    math ($...$) whose dollar signs are not part of a display delimiter.

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    spans = [m.span() for m in re.finditer(r'\$\$.*?\$\$', content, re.DOTALL)]
    inline_pattern = r'(?<!\$)\$(?!\$)[^\$]+?\$(?!\$)'
    spans.extend(m.span() for m in re.finditer(inline_pattern, content))
    return spans
def extract_links_and_urls(content: str) -> List[Tuple[int, int]]:
"""
Find all markdown links and URLs.
Returns:
List of (start_pos, end_pos) tuples
"""
spans = []
# Markdown links [text](url)
pattern = r'\[([^\]]+)\]\([^\)]+\)'
for match in re.finditer(pattern, content):
# Only exclude the URL part, keep the link text
url_start = match.group(0).find('](') + match.start() + 1
url_end = match.end() - 1
spans.append((url_start, url_end))
# Reference-style links [@ref], {#id}, @sec-name
pattern = r'(\[@[^\]]+\]|\{#[^\}]+\}|@[a-z]+-[a-z0-9-]+)'
for match in re.finditer(pattern, content):
spans.append((match.start(), match.end()))
# Plain URLs
pattern = r'https?://[^\s\)>]+'
for match in re.finditer(pattern, content):
spans.append((match.start(), match.end()))
return spans
def extract_quarto_syntax(content: str) -> List[Tuple[int, int]]:
    """
    Locate Quarto-specific markup that should not be spell checked.

    Covers fenced div openers (``::: {.class}``) and shortcodes
    (``{{< ... >}}``).

    Returns:
        List of (start_pos, end_pos) character-offset tuples.
    """
    spans = [m.span() for m in re.finditer(r':::\s*\{[^\}]+\}', content)]
    spans.extend(
        m.span() for m in re.finditer(r'\{\{<.*?>\}\}', content, re.DOTALL)
    )
    return spans
def should_exclude_position(pos: int, exclude_ranges: List[Tuple[int, int]]) -> bool:
    """Return True when *pos* falls inside any half-open [start, end) range."""
    return any(start <= pos < end for start, end in exclude_ranges)
def extract_prose_text(content: str) -> List[Tuple[str, int]]:
    """
    Extract only prose text from QMD content.

    Builds the set of excluded regions (frontmatter, code, math, links,
    Quarto syntax), merges overlaps, then walks each line collecting the
    characters that fall outside every excluded region.

    Args:
        content: Full text of the QMD file.

    Returns:
        List of (text, line_number) tuples; line numbers are 1-based.
    """
    # Build exclude ranges from every non-prose construct.
    exclude_ranges = []
    yaml_start, yaml_end = extract_yaml_frontmatter(content)
    if yaml_end > 0:
        exclude_ranges.append((yaml_start, yaml_end))
    exclude_ranges.extend(extract_code_blocks(content))
    exclude_ranges.extend(extract_inline_code(content))
    exclude_ranges.extend(extract_math_blocks(content))
    exclude_ranges.extend(extract_links_and_urls(content))
    exclude_ranges.extend(extract_quarto_syntax(content))
    # Sort and merge overlapping ranges so each position lookup is unambiguous.
    exclude_ranges.sort()
    merged = []
    for start, end in exclude_ranges:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    # Extract prose text line by line.
    prose_segments = []
    pos = 0
    for line_num, line in enumerate(content.split('\n'), 1):
        line_start = pos
        line_end = pos + len(line)
        pos = line_end + 1  # +1 for the newline
        # Fast path: skip the per-character scan only when the ENTIRE line
        # sits inside one excluded region.  (The previous check looked only
        # at the first character, which silently dropped prose on lines that
        # *start* with inline code or a link.)
        if line and any(s <= line_start and line_end <= e for s, e in merged):
            continue
        # Collect runs of non-excluded characters as separate segments.
        prose_text = ""
        for i, char in enumerate(line):
            if should_exclude_position(line_start + i, merged):
                if prose_text.strip():
                    prose_segments.append((prose_text.strip(), line_num))
                prose_text = ""
            else:
                prose_text += char
        if prose_text.strip():
            prose_segments.append((prose_text.strip(), line_num))
    return prose_segments
def clean_prose_text(text: str) -> str:
    """
    Strip markdown formatting from prose, leaving only the words.

    Args:
        text: Raw prose text containing markdown markup.

    Returns:
        Cleaned text suitable for spell checking.
    """
    # Unwrap emphasis markers, keeping the wrapped words.
    for wrapper in (r'\*\*([^\*]+)\*\*',   # bold
                    r'\*([^\*]+)\*',       # italic (asterisks)
                    r'_([^_]+)_',          # italic (underscores)
                    r'~~([^~]+)~~'):       # strikethrough
        text = re.sub(wrapper, r'\1', text)
    # Drop any stray markdown symbols that were not part of a pair.
    text = re.sub(r'[#\*_~]', '', text)
    # Replace other punctuation with spaces, keeping apostrophes and hyphens.
    return re.sub(r'[^\w\s\'-]', ' ', text).strip()
def check_with_aspell(text: str, ignore_terms: Set[str]) -> List[str]:
    """
    Run *text* through ``aspell list`` and collect misspelled words.

    Args:
        text: Cleaned prose text to check.
        ignore_terms: Lowercase terms to drop from aspell's output.

    Returns:
        List of misspelled words not in *ignore_terms*; empty list on any
        aspell failure (error is reported to stderr).
    """
    try:
        result = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True,
            check=False
        )
    except Exception as e:
        print(f"Error running aspell: {e}", file=sys.stderr)
        return []
    if result.returncode != 0:
        return []
    # aspell prints one misspelled word per line; filter ignored terms.
    return [
        word
        for word in result.stdout.strip().split('\n')
        if word and word.lower() not in ignore_terms
    ]
# Common technical terms to ignore, hoisted to module level so the (large)
# set is built once rather than on every check_file() call.
_IGNORE_TERMS = frozenset({
    # File formats and common abbreviations
    'qmd', 'yml', 'json', 'png', 'jpg', 'svg', 'pdf',
    'tikz', 'quarto', 'pandoc', 'latex', 'tensorflow', 'pytorch',
    'gpu', 'cpu', 'tpu', 'ram', 'api', 'ui', 'ux', 'cli', 'sdk',
    'yaml', 'toml', 'html', 'css', 'javascript', 'typescript',
    'numpy', 'pandas', 'matplotlib', 'jupyter', 'colab',
    'github', 'gitlab', 'bitbucket',
    'ai', 'ml', 'dl', 'cv', 'nlp', 'iot', 'rl', 'gan',
    'lstm', 'gru', 'rnn', 'cnn', 'vgg', 'resnet', 'bert',
    # ML systems and techniques
    'tinyml', 'microcontroller', 'microcontrollers', 'preprocessing',
    'convolutional', 'latencies', 'dns', 'dennard', 'triadic',
    'benchmarking', 'gdpr', 'hipaa', 'backpropagation', 'quantized',
    'autoregressive', 'overfitting', 'checkpointing', 'hyperparameters',
    'embeddings', 'spectrograms', 'mfcc', 'kws', 'activations',
    'mnist', 'feedforward', 'softmax', 'relu', 'sigmoid', 'thresholding',
    'postprocessing', 'suboptimal', 'multilayer', 'perceptrons',
    'cnns', 'rnns', 'mlps', 'dnn', 'translational', 'invariance',
    'parallelizable', 'uat', 'discriminative', 'fpgas', 'asics',
    'topologies', 'reconceptualization', 'orchestrators', 'bfloat',
    # Product and project names
    'plantvillage', 'nuru', 'farmbeats', 'respira', 'colabs', 'edgeml',
    'mlperf', 'linpack', 'specpowerssj', 'datahub', 'kubeflow',
    'mobilenets', 'efficientnets', 'gpt', 'palm',
    # Company and organization names
    'mckinsey', 'espressif', 'hortonworks', 'linkedin', 'uber', 'cloudtrail',
    # Acronyms and abbreviations
    'cmd', 'cbsd', 'mw', 'sram', 'sox', 'sdg', 'sdgs', 'agi', 'tco',
    'gpus', 'mlops', 'gigaflops', 'eniac', 'cpus', 'tpus', 'fp', 'nist',
    # Legitimate English words often flagged
    'underserved', 'sociotechnical', 'ebola', 'forecasted', 'unmonitored',
    'transformative', 'microclimates', 'microclimate', 'responders',
    'scalable', 'aspirational', 'lifecycle', 'lifecycles',
    'representativeness', 'reproducibility', 'milliwatt', 'milliwatts',
    'decomposable', 'interpretability', 'modularity', 'architecting',
    'instantiations', 'crowdsourcing', 'crowdsourced', 'interdependencies',
    'degradations', 'natively', 'detections', 'observability', 'exfiltration',
    'auditable', 'cryptographic', 'curation', 'engineerable', 'subfield',
    'misrouted', 'tradeoff', 'tradeoffs', 'pre',
    # People names (for attributions)
    'vijay', 'janapa', 'reddi', 'yann', 'lecun', 'corinna', 'burges',
    'cybenko', 'hornik', 'augereau',
    # Image filename patterns (without extensions)
    'covermlsystems', 'coveraigood', 'coveraibenchmarking',
    'coverconclusion', 'coverdataengineering', 'covernnprimer',
    'coverdlarch',
    # LaTeX commands
    'noindent',
    # AI tools
    'dall', 'dalle',
    # Short codes/patterns
    'fn',
    # Additional comprehensive technical terms (auto-generated from book content)
    'accelerometers', 'acm', 'adamw', 'additionality', 'adreno', 'aes', 'agentic', 'aiops',
    'airbnb', 'aitraining', 'akida', 'al', 'alexa', 'alexnet', 'algorithmically', 'alphafold',
    'ambri', 'amodei', 'anonymization', 'anonymized', 'anthropic', 'asilomar', 'auditability',
    'autocorrect', 'autocorrection', 'autocorrections', 'automatable', 'automl', 'avr', 'axonal',
    'backdoored', 'backdoors', 'backend', 'backends', 'balancers', 'batchsize', 'bibliometric',
    'binarization', 'biometric', 'bist', 'blas', 'bostrom', 'bottlenecked', 'brominated', 'carlini',
    'cfe', 'channelwise', 'chatbot', 'chatbots', 'chatgpt', 'checkmark', 'chiplet', 'chiplets',
    'clinaiops', 'cloudlets', 'cmsis', 'codecarbon', 'compas', 'conda', 'contestability', 'coprocessor',
    'coprocessors', 'coveraihardware', 'coveraiworkflow', 'coverefficientai', 'coverfrontiers',
    'coverintroduction', 'covermlframeworks', 'covermlops', 'covermodeloptimizations',
    'coverondevicelearning', 'coverresponsibleai', 'coverrobustai', 'coversecurityprivacy',
    'coversustainableai', 'cublas', 'cuda', 'customizations', 'cybersecurity', 'cyberweapon',
    'de', 'debois', 'debuggable', 'deepsparse', 'deepspeed', 'devops', 'distilbert', 'dma', 'dp',
    'dsp', 'dsps', 'dvfs', 'dwork', 'dx', 'eacs', 'electrodermal', 'electromechanical',
    'epistemologically', 'esg', 'esrs', 'et', 'ethnicities', 'ets', 'ewc', 'exaflops',
    'explainability', 'explanations', 'expressivity', 'externality', 'facto', 'failover', 'fairlearn',
    'fairscale', 'fe', 'fedavgm', 'fedprox', 'fi', 'flops', 'forrester', 'fpu', 'frac', 'freertos',
    'fx', 'gapped', 'gboard', 'gemm', 'gflops', 'giga', 'goertzel', 'gradcam', 'greenwashing',
    'groupwise', 'handlin', 'hbm', 'hd', 'hdfs', 'hitl', 'homomorphic', 'hsms', 'huggingface',
    'hwacc', 'hyperscale', 'iid', 'imagenet', 'imbalancing', 'incentivized', 'incentivizing',
    'instantiation', 'intentioned', 'interdependency', 'intra', 'jax', 'jenkins', 'jpeg', 'kaggle',
    'kanies', 'kawaguchi', 'kdd', 'keras', 'kinetis', 'kleinberg', 'kohsuke', 'kolmogorov', 'krum',
    'kryo', 'kubernetes', 'lapack', 'lca', 'leaderboards', 'lidar', 'llms', 'ln', 'loihi', 'lora',
    'lpddr', 'mah', 'maml', 'mance', 'mapa', 'mbed', 'mbps', 'mcus', 'medskip', 'metux', 'metuxs',
    'micronpu', 'microservices', 'microsystems', 'millijoules', 'misalignments', 'misclassification',
    'misclassifies', 'misclassify', 'misconfigured', 'mitigations', 'mj', 'mlcommons', 'mlflow',
    'mlir', 'mlp', 'mobilenetv', 'modelscaling', 'moores', 'msqe', 'multimodal', 'multiphase',
    'mwh', 'nas', 'natanz', 'nbsp', 'netron', 'neurosymbolic', 'ngo', 'nm', 'nn', 'npu', 'npus',
    'npv', 'nsight', 'numenta', 'numerics', 'nvlink', 'nwp', 'nxp', 'oecd', 'onnx', 'ons', 'openai',
    'opencl', 'openvino', 'openwebtext', 'operationalization', 'operationalize', 'operationalizing',
    'optum', 'ota', 'overcorrecting', 'overfit', 'overreliance', 'parallelizes', 'pcie', 'perceptron',
    'performant', 'personalization', 'pes', 'picojoules', 'pipelining', 'pj', 'plcs', 'ppv',
    'prefetched', 'prefetching', 'pretrained', 'programmability', 'proliferative', 'proprioception',
    'propublica', 'ptq', 'pufs', 'pypi', 'qat', 'qos', 'quadratically', 'quant', 'rbac', 'recalibrate',
    'recalibrating', 'recommender', 'reconceptualizes', 'recyclability', 'reframing', 'reimagined',
    'reimagining', 'reimplement', 'reimplementing', 'renewables', 'repairability', 'rescoring',
    'reskilling', 'retinopathy', 'reusability', 'ridesharing', 'rlhf', 'roadmap', 'rollout', 'rollouts',
    'rss', 'runtimes', 'sagemaker', 'sanitization', 'scipy', 'scopus', 'sdt', 'sgd', 'shader', 'shaders',
    'shap', 'shapley', 'simd', 'siri', 'situationally', 'slas', 'smi', 'smirnov', 'snns', 'snpe',
    'soc', 'socs', 'sparseml', 'sparsification', 'spinoff', 'sprase', 'spss', 'stationarity', 'stm',
    'stuxnet', 'swappable', 'synergistically', 'tcp', 'tdsp', 'tees', 'tensorboard', 'tensorrt',
    'tera', 'ternarization', 'tflite', 'tfx', 'thresholded', 'timm', 'titration', 'tls', 'tokenization',
    'toolchains', 'torchscript', 'torchserve', 'tpr', 'tpuv', 'tradeable', 'trojan', 'truenorth',
    'tvm', 'ultrapure', 'unbundled', 'underutilization', 'unexplainable', 'unimodal', 'unoptimized',
    'untrusted', 'upgradable', 'upgradeable', 'upskilling', 'uptime', 'usb', 'utensor', 'utopian',
    'vectornet', 'virusbokbok', 'vitis', 'von', 'vr', 'vtune', 'vulkan', 'waymo', 'wearables',
    'wellbeing', 'wi', 'xla', 'zero',
})


def check_file(filepath: Path) -> List[dict]:
    """
    Check a single QMD file for spelling errors.

    Reads the file, extracts prose segments, cleans markdown formatting,
    and runs each segment through aspell, filtering known technical terms.

    Args:
        filepath: Path to the .qmd file to check.

    Returns:
        List of error dictionaries with keys 'file' (absolute Path),
        'line' (1-based), 'text' (excerpt truncated to 100 chars), and
        'misspelled' (list of flagged words); empty list when the file
        cannot be read or has no errors.
    """
    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return []
    prose_segments = extract_prose_text(content)
    errors = []
    for text, line_num in prose_segments:
        cleaned = clean_prose_text(text)
        if not cleaned:
            continue
        misspelled = check_with_aspell(cleaned, _IGNORE_TERMS)
        if misspelled:
            errors.append({
                'file': filepath.resolve(),  # Store absolute path
                'line': line_num,
                'text': text[:100] + ('...' if len(text) > 100 else ''),
                'misspelled': misspelled
            })
    return errors
def main():
    """Entry point: spell check all .qmd files under the target directory.

    Returns:
        0 when no spelling errors are found; 1 on errors or setup failure.
    """
    # Bail out early when aspell is not installed.
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("Error: aspell not found. Install it with: brew install aspell", file=sys.stderr)
        return 1
    # Resolve the directory to scan: CLI argument or the book's core content.
    repo_root = Path(__file__).resolve().parents[3]
    if len(sys.argv) > 1:
        target_dir = Path(sys.argv[1])
    else:
        target_dir = repo_root / 'quarto' / 'contents' / 'core'
    if not target_dir.exists():
        print(f"Error: Directory not found: {target_dir}", file=sys.stderr)
        return 1
    qmd_files = list(target_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for prose spelling errors...\n")
    all_errors = []
    files_with_errors = 0
    for qmd_file in sorted(qmd_files):
        file_errors = check_file(qmd_file)
        if file_errors:
            files_with_errors += 1
            all_errors.extend(file_errors)
    if not all_errors:
        print("✓ No spelling errors found in prose text!")
        return 0
    # Report errors grouped by file, sorted by path then line number.
    print(f"Found {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")
    current_file = None
    for error in sorted(all_errors, key=lambda e: (str(e['file']), e['line'])):
        if error['file'] != current_file:
            current_file = error['file']
            # Show paths relative to the repo root when possible.
            try:
                rel_path = error['file'].relative_to(repo_root)
            except ValueError:
                rel_path = error['file']
            print(f"\n{rel_path}")
            print("=" * len(str(rel_path)))
        print(f" Line {error['line']}: {error['text']}")
        print(f" → Misspelled: {', '.join(error['misspelled'])}")
    print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
    return 1
# Script entry point: exit status mirrors main() (1 = errors/setup failure).
if __name__ == '__main__':
    sys.exit(main())