mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-09 07:15:51 -05:00
497 lines
18 KiB
Python
Executable File
497 lines
18 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
r"""
|
|
Spell check prose content in QMD files using aspell.
|
|
|
|
Intelligently parses QMD file structure to only check actual prose text,
|
|
excluding YAML frontmatter, code blocks, TikZ diagrams, inline code, URLs, etc.
|
|
|
|
Usage:
|
|
python3 tools/scripts/content/check_prose_spelling.py [directory]
|
|
|
|
Requirements:
|
|
- aspell must be installed (brew install aspell)
|
|
- No Python dependencies beyond standard library
|
|
|
|
Checks:
|
|
- Paragraph text
|
|
- Headings
|
|
- List items
|
|
- Callout content
|
|
|
|
Ignores:
|
|
- YAML frontmatter
|
|
- Code blocks (```...```)
|
|
- Inline code (`...`)
|
|
- TikZ diagrams
|
|
- URLs and link targets (the visible link text is still checked)
|
|
- LaTeX math ($...$, $$...$$)
|
|
- Special Quarto syntax
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
import subprocess
|
|
from pathlib import Path
|
|
from typing import List, Tuple, Set
|
|
|
|
|
|
def extract_yaml_frontmatter(content: str) -> Tuple[int, int]:
    """
    Find the start and end positions of YAML frontmatter.

    Frontmatter is recognised only when the very first line of the
    document is exactly ``---`` (trailing whitespace allowed) and a later
    line, once stripped, is also ``---``.

    Args:
        content: Full text of the QMD file.

    Returns:
        Tuple of (start_pos, end_pos) character offsets, with end_pos
        pointing just past the closing delimiter line (including its
        newline when there is one), or (0, 0) if no frontmatter.
    """
    lines = content.split('\n')

    # The opening delimiter must be the whole first line; a line that only
    # *begins* with three dashes (e.g. a "----" rule) is not frontmatter.
    if lines[0].rstrip() != '---':
        return (0, 0)

    # Running offset of the character just past the current line's newline.
    offset = len(lines[0]) + 1
    for line in lines[1:]:
        offset += len(line) + 1
        if line.strip() == '---':
            # The last line of a file has no trailing newline, so clamp to
            # avoid reporting an end position beyond the content.
            return (0, min(offset, len(content)))

    # Opening delimiter without a closing one: treat as no frontmatter.
    return (0, 0)
|
|
|
|
|
|
def extract_code_blocks(content: str) -> List[Tuple[int, int]]:
    """
    Locate fenced code blocks and TikZ picture environments.

    All triple-backtick spans are listed first (in document order),
    followed by all ``tikzpicture`` environments; the two groups may
    overlap when a TikZ picture sits inside a fence.

    Returns:
        List of (start_pos, end_pos) tuples
    """
    fence_re = re.compile(r'```.*?```', re.DOTALL)
    # TikZ is matched separately in case the picture is not inside a fence.
    tikz_re = re.compile(
        r'\\begin\{tikzpicture\}.*?\\end\{tikzpicture\}', re.DOTALL
    )

    found = [m.span() for m in fence_re.finditer(content)]
    found += [m.span() for m in tikz_re.finditer(content)]
    return found
|
|
|
|
|
|
def extract_inline_code(content: str) -> List[Tuple[int, int]]:
    """
    Locate inline code spans: a backtick, one or more non-backtick
    characters, and a closing backtick.

    Returns:
        List of (start_pos, end_pos) tuples
    """
    return [m.span() for m in re.finditer(r'`[^`]+?`', content)]
|
|
|
|
|
|
def extract_math_blocks(content: str) -> List[Tuple[int, int]]:
    """
    Find all LaTeX math blocks ($...$, $$...$$).

    Inline math follows Pandoc's ``tex_math_dollars`` delimiter rules so
    that currency amounts are not mistaken for math: the opening ``$``
    must be followed by a non-space character, the closing ``$`` must be
    preceded by a non-space character and must not be followed by a
    digit, and a backslash-escaped ``\\$`` is never a delimiter. An inline
    span may wrap across a line break but never across a blank line.

    Args:
        content: Full text of the QMD file.

    Returns:
        List of (start_pos, end_pos) tuples: every display-math span in
        document order, followed by every inline-math span.
    """
    # Display math $$...$$ (may span several lines).
    display_math = r'\$\$.*?\$\$'

    # Inline math $...$
    inline_math = (
        r'(?<![\\$])\$'                 # opening $: not escaped, not part of $$
        r'(?![\s$])'                    # next char is neither space nor $
        r'(?:[^$\n]|\n(?![ \t]*\n))*?'  # body: no $, never crosses a blank line
        r'(?<![\s\\])\$'                # closing $: after non-space, not escaped
        r'(?![$\d])'                    # not followed by $ or a digit ("$5-$10")
    )

    blocks = [m.span() for m in re.finditer(display_math, content, re.DOTALL)]
    blocks.extend(m.span() for m in re.finditer(inline_math, content))
    return blocks
|
|
|
|
|
|
def extract_links_and_urls(content: str) -> List[Tuple[int, int]]:
    """
    Locate link targets, cross-reference markers and bare URLs.

    Three passes are made and their results concatenated in this order:
    markdown link targets, reference-style markers, plain http(s) URLs.

    Returns:
        List of (start_pos, end_pos) tuples
    """
    md_link_re = re.compile(r'\[([^\]]+)\]\([^\)]+\)')
    reference_re = re.compile(r'(\[@[^\]]+\]|\{#[^\}]+\}|@[a-z]+-[a-z0-9-]+)')
    bare_url_re = re.compile(r'https?://[^\s\)>]+')

    found = []

    # [text](url): keep the link text as prose, drop only the target.
    # The span runs from the opening "(" up to (not including) the ")".
    for m in md_link_re.finditer(content):
        paren_at = m.start() + m.group(0).find('](') + 1
        found.append((paren_at, m.end() - 1))

    # [@ref], {#id} and @sec-name style markers are dropped whole.
    found.extend((m.start(), m.end()) for m in reference_re.finditer(content))

    # Bare http:// and https:// URLs.
    found.extend((m.start(), m.end()) for m in bare_url_re.finditer(content))

    return found
|
|
|
|
|
|
def extract_quarto_syntax(content: str) -> List[Tuple[int, int]]:
    """
    Locate Quarto markup that should not be spell checked.

    Div openers with attributes (``::: {.classname}``) are listed first,
    followed by shortcodes (``{{< ... >}}``).

    Returns:
        List of (start_pos, end_pos) tuples
    """
    div_open_re = re.compile(r':::\s*\{[^\}]+\}')
    # Shortcodes may wrap across lines, hence DOTALL.
    shortcode_re = re.compile(r'\{\{<.*?>\}\}', re.DOTALL)

    found = [m.span() for m in div_open_re.finditer(content)]
    found += [m.span() for m in shortcode_re.finditer(content)]
    return found
|
|
|
|
|
|
def should_exclude_position(pos: int, exclude_ranges: List[Tuple[int, int]]) -> bool:
    """Return True when pos lies inside any half-open [start, end) range."""
    return any(lo <= pos < hi for lo, hi in exclude_ranges)
|
|
|
|
|
|
def extract_prose_text(content: str) -> List[Tuple[str, int]]:
    """
    Extract only prose text from QMD content.

    Every range reported by the extract_* helpers (frontmatter, code
    blocks, inline code, math, link targets/references, Quarto syntax) is
    removed, and what remains of each line is returned as one or more
    whitespace-stripped segments.

    A line that merely *starts* with an excluded span (for example inline
    code or a citation) still contributes the prose that follows it.

    Args:
        content: Full text of the QMD file.

    Returns:
        List of (text, line_number) tuples, with 1-based line numbers, in
        document order. Segments that are empty after stripping are omitted.
    """
    # Build exclude ranges
    exclude_ranges: List[Tuple[int, int]] = []

    yaml_start, yaml_end = extract_yaml_frontmatter(content)
    if yaml_end > 0:
        exclude_ranges.append((yaml_start, yaml_end))

    for finder in (
        extract_code_blocks,
        extract_inline_code,
        extract_math_blocks,
        extract_links_and_urls,
        extract_quarto_syntax,
    ):
        exclude_ranges.extend(finder(content))

    # Sort and merge overlapping (or touching) ranges so that the sweep
    # below only ever has to look at one range at a time.
    exclude_ranges.sort()
    merged: List[Tuple[int, int]] = []
    for start, end in exclude_ranges:
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    # Walk the lines and the merged ranges together. Positions only ever
    # increase, so range_idx never has to move backwards.
    prose_segments: List[Tuple[str, int]] = []
    range_idx = 0
    pos = 0

    for line_num, line in enumerate(content.split('\n'), 1):
        line_end = pos + len(line)
        cursor = pos

        while cursor < line_end:
            # Drop ranges that end at or before the cursor.
            while range_idx < len(merged) and merged[range_idx][1] <= cursor:
                range_idx += 1

            if range_idx < len(merged) and merged[range_idx][0] <= cursor:
                # Cursor is inside an excluded range: jump to its end, but
                # never past this line (the range may span several lines).
                cursor = min(merged[range_idx][1], line_end)
                continue

            # Prose runs until the next excluded range or the end of line.
            run_end = line_end
            if range_idx < len(merged):
                run_end = min(run_end, merged[range_idx][0])

            segment = content[cursor:run_end].strip()
            if segment:
                prose_segments.append((segment, line_num))
            cursor = run_end

        pos = line_end + 1  # +1 for newline

    return prose_segments
|
|
|
|
|
|
def clean_prose_text(text: str) -> str:
    """
    Clean prose text of markdown formatting while keeping words.

    Args:
        text: Raw prose text with markdown

    Returns:
        Cleaned text for spell checking: emphasis markers and heading
        hashes are removed, and every character other than word
        characters, whitespace, apostrophes and hyphens is replaced by a
        space. Leading and trailing whitespace is stripped.
    """
    # Typographic apostrophes would otherwise be turned into spaces by the
    # final substitution, splitting contractions ("doesn’t" -> "doesn t")
    # and producing bogus misspellings. Normalise them to ASCII first.
    text = text.replace('\u2019', "'").replace('\u02bc', "'")

    # Remove markdown formatting
    text = re.sub(r'\*\*([^\*]+)\*\*', r'\1', text)  # Bold
    text = re.sub(r'\*([^\*]+)\*', r'\1', text)  # Italic
    text = re.sub(r'_([^_]+)_', r'\1', text)  # Italic
    text = re.sub(r'~~([^~]+)~~', r'\1', text)  # Strikethrough

    # Remove remaining markdown symbols
    text = re.sub(r'[#\*_~]', '', text)

    # Remove special characters but keep apostrophes in words
    text = re.sub(r'[^\w\s\'-]', ' ', text)

    return text.strip()
|
|
|
|
|
|
def check_with_aspell(text: str, ignore_terms: Set[str]) -> List[str]:
    """
    Check text with aspell.

    Runs ``aspell list --lang=en`` with ``text`` on standard input and
    filters its output against ``ignore_terms``.

    Args:
        text: Cleaned prose to check.
        ignore_terms: Lower-case words that must never be reported; the
            comparison lower-cases each word aspell returns.

    Returns:
        List of misspelled words, in the order aspell printed them. An
        empty list is returned for blank input and whenever aspell could
        not be run or exited with an error; in the latter two cases a
        message is written to stderr so the failure is not mistaken for a
        clean result.
    """
    # Nothing to check: skip the subprocess entirely.
    if not text.strip():
        return []

    try:
        result = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True,
            check=False,
        )
    except Exception as e:
        # Best effort: a failure to launch aspell must not abort the run.
        print(f"Error running aspell: {e}", file=sys.stderr)
        return []

    if result.returncode != 0:
        # Previously this case was silently reported as "no misspellings".
        detail = result.stderr.strip() or f"exit status {result.returncode}"
        print(f"Error running aspell: {detail}", file=sys.stderr)
        return []

    words = [w for w in result.stdout.strip().split('\n') if w]
    # Filter ignore terms
    return [w for w in words if w.lower() not in ignore_terms]
|
|
|
|
|
|
def check_file(filepath: Path) -> List[dict]:
    """
    Spell check the prose of one QMD file.

    The file is read as UTF-8, reduced to prose segments, and each cleaned
    segment is passed to aspell. A file that cannot be read is reported on
    stderr and yields no errors.

    Returns:
        List of error dictionaries with the keys 'file' (absolute path),
        'line', 'text' (segment, truncated to 100 characters plus '...')
        and 'misspelled' (list of words).
    """
    # Common technical terms to ignore
    ignore_terms = {
        # File formats and common abbreviations
        'qmd', 'yml', 'json', 'png', 'jpg', 'svg', 'pdf',
        'tikz', 'quarto', 'pandoc', 'latex', 'tensorflow', 'pytorch',
        'gpu', 'cpu', 'tpu', 'ram', 'api', 'ui', 'ux', 'cli', 'sdk',
        'yaml', 'toml', 'html', 'css', 'javascript', 'typescript',
        'numpy', 'pandas', 'matplotlib', 'jupyter', 'colab',
        'github', 'gitlab', 'bitbucket',
        'ai', 'ml', 'dl', 'cv', 'nlp', 'iot', 'rl', 'gan',
        'lstm', 'gru', 'rnn', 'cnn', 'vgg', 'resnet', 'bert',

        # ML systems and techniques
        'tinyml', 'microcontroller', 'microcontrollers', 'preprocessing',
        'convolutional', 'latencies', 'dns', 'dennard', 'triadic',
        'benchmarking', 'gdpr', 'hipaa', 'backpropagation', 'quantized',
        'autoregressive', 'overfitting', 'checkpointing', 'hyperparameters',
        'embeddings', 'spectrograms', 'mfcc', 'kws', 'activations',
        'mnist', 'feedforward', 'softmax', 'relu', 'sigmoid', 'thresholding',
        'postprocessing', 'suboptimal', 'multilayer', 'perceptrons',
        'cnns', 'rnns', 'mlps', 'dnn', 'translational', 'invariance',
        'parallelizable', 'uat', 'discriminative', 'fpgas', 'asics',
        'topologies', 'reconceptualization', 'orchestrators', 'bfloat',

        # Product and project names
        'plantvillage', 'nuru', 'farmbeats', 'respira', 'colabs', 'edgeml',
        'mlperf', 'linpack', 'specpowerssj', 'datahub', 'kubeflow',
        'mobilenets', 'efficientnets', 'gpt', 'palm',

        # Company and organization names
        'mckinsey', 'espressif', 'hortonworks', 'linkedin', 'uber', 'cloudtrail',

        # Acronyms and abbreviations
        'cmd', 'cbsd', 'mw', 'sram', 'sox', 'sdg', 'sdgs', 'agi', 'tco',
        'gpus', 'mlops', 'gigaflops', 'eniac', 'cpus', 'tpus', 'fp', 'nist',

        # Legitimate English words often flagged
        'underserved', 'sociotechnical', 'ebola', 'forecasted', 'unmonitored',
        'transformative', 'microclimates', 'microclimate', 'responders',
        'scalable', 'aspirational', 'lifecycle', 'lifecycles',
        'representativeness', 'reproducibility', 'milliwatt', 'milliwatts',
        'decomposable', 'interpretability', 'modularity', 'architecting',
        'instantiations', 'crowdsourcing', 'crowdsourced', 'interdependencies',
        'degradations', 'natively', 'detections', 'observability', 'exfiltration',
        'auditable', 'cryptographic', 'curation', 'engineerable', 'subfield',
        'misrouted', 'tradeoff', 'tradeoffs', 'pre',

        # People names (for attributions)
        'vijay', 'janapa', 'reddi', 'yann', 'lecun', 'corinna', 'burges',
        'cybenko', 'hornik', 'augereau',

        # Image filename patterns (without extensions)
        'covermlsystems', 'coveraigood', 'coveraibenchmarking',
        'coverconclusion', 'coverdataengineering', 'covernnprimer',
        'coverdlarch',

        # LaTeX commands
        'noindent',

        # AI tools
        'dall', 'dalle',

        # Short codes/patterns
        'fn',

        # Additional comprehensive technical terms (auto-generated from book content)
        # NOTE(review): 'sprase' below looks like a typo of "sparse" that is
        # being whitelisted rather than fixed in the book - confirm.
        'accelerometers', 'acm', 'adamw', 'additionality', 'adreno', 'aes', 'agentic', 'aiops',
        'airbnb', 'aitraining', 'akida', 'al', 'alexa', 'alexnet', 'algorithmically', 'alphafold',
        'ambri', 'amodei', 'anonymization', 'anonymized', 'anthropic', 'asilomar', 'auditability',
        'autocorrect', 'autocorrection', 'autocorrections', 'automatable', 'automl', 'avr', 'axonal',
        'backdoored', 'backdoors', 'backend', 'backends', 'balancers', 'batchsize', 'bibliometric',
        'binarization', 'biometric', 'bist', 'blas', 'bostrom', 'bottlenecked', 'brominated', 'carlini',
        'cfe', 'channelwise', 'chatbot', 'chatbots', 'chatgpt', 'checkmark', 'chiplet', 'chiplets',
        'clinaiops', 'cloudlets', 'cmsis', 'codecarbon', 'compas', 'conda', 'contestability', 'coprocessor',
        'coprocessors', 'coveraihardware', 'coveraiworkflow', 'coverefficientai', 'coverfrontiers',
        'coverintroduction', 'covermlframeworks', 'covermlops', 'covermodeloptimizations',
        'coverondevicelearning', 'coverresponsibleai', 'coverrobustai', 'coversecurityprivacy',
        'coversustainableai', 'cublas', 'cuda', 'customizations', 'cybersecurity', 'cyberweapon',
        'de', 'debois', 'debuggable', 'deepsparse', 'deepspeed', 'devops', 'distilbert', 'dma', 'dp',
        'dsp', 'dsps', 'dvfs', 'dwork', 'dx', 'eacs', 'electrodermal', 'electromechanical',
        'epistemologically', 'esg', 'esrs', 'et', 'ethnicities', 'ets', 'ewc', 'exaflops',
        'explainability', 'explanations', 'expressivity', 'externality', 'facto', 'failover', 'fairlearn',
        'fairscale', 'fe', 'fedavgm', 'fedprox', 'fi', 'flops', 'forrester', 'fpu', 'frac', 'freertos',
        'fx', 'gapped', 'gboard', 'gemm', 'gflops', 'giga', 'goertzel', 'gradcam', 'greenwashing',
        'groupwise', 'handlin', 'hbm', 'hd', 'hdfs', 'hitl', 'homomorphic', 'hsms', 'huggingface',
        'hwacc', 'hyperscale', 'iid', 'imagenet', 'imbalancing', 'incentivized', 'incentivizing',
        'instantiation', 'intentioned', 'interdependency', 'intra', 'jax', 'jenkins', 'jpeg', 'kaggle',
        'kanies', 'kawaguchi', 'kdd', 'keras', 'kinetis', 'kleinberg', 'kohsuke', 'kolmogorov', 'krum',
        'kryo', 'kubernetes', 'lapack', 'lca', 'leaderboards', 'lidar', 'llms', 'ln', 'loihi', 'lora',
        'lpddr', 'mah', 'maml', 'mance', 'mapa', 'mbed', 'mbps', 'mcus', 'medskip', 'metux', 'metuxs',
        'micronpu', 'microservices', 'microsystems', 'millijoules', 'misalignments', 'misclassification',
        'misclassifies', 'misclassify', 'misconfigured', 'mitigations', 'mj', 'mlcommons', 'mlflow',
        'mlir', 'mlp', 'mobilenetv', 'modelscaling', 'moores', 'msqe', 'multimodal', 'multiphase',
        'mwh', 'nas', 'natanz', 'nbsp', 'netron', 'neurosymbolic', 'ngo', 'nm', 'nn', 'npu', 'npus',
        'npv', 'nsight', 'numenta', 'numerics', 'nvlink', 'nwp', 'nxp', 'oecd', 'onnx', 'ons', 'openai',
        'opencl', 'openvino', 'openwebtext', 'operationalization', 'operationalize', 'operationalizing',
        'optum', 'ota', 'overcorrecting', 'overfit', 'overreliance', 'parallelizes', 'pcie', 'perceptron',
        'performant', 'personalization', 'pes', 'picojoules', 'pipelining', 'pj', 'plcs', 'ppv',
        'prefetched', 'prefetching', 'pretrained', 'programmability', 'proliferative', 'proprioception',
        'propublica', 'ptq', 'pufs', 'pypi', 'qat', 'qos', 'quadratically', 'quant', 'rbac', 'recalibrate',
        'recalibrating', 'recommender', 'reconceptualizes', 'recyclability', 'reframing', 'reimagined',
        'reimagining', 'reimplement', 'reimplementing', 'renewables', 'repairability', 'rescoring',
        'reskilling', 'retinopathy', 'reusability', 'ridesharing', 'rlhf', 'roadmap', 'rollout', 'rollouts',
        'rss', 'runtimes', 'sagemaker', 'sanitization', 'scipy', 'scopus', 'sdt', 'sgd', 'shader', 'shaders',
        'shap', 'shapley', 'simd', 'siri', 'situationally', 'slas', 'smi', 'smirnov', 'snns', 'snpe',
        'soc', 'socs', 'sparseml', 'sparsification', 'spinoff', 'sprase', 'spss', 'stationarity', 'stm',
        'stuxnet', 'swappable', 'synergistically', 'tcp', 'tdsp', 'tees', 'tensorboard', 'tensorrt',
        'tera', 'ternarization', 'tflite', 'tfx', 'thresholded', 'timm', 'titration', 'tls', 'tokenization',
        'toolchains', 'torchscript', 'torchserve', 'tpr', 'tpuv', 'tradeable', 'trojan', 'truenorth',
        'tvm', 'ultrapure', 'unbundled', 'underutilization', 'unexplainable', 'unimodal', 'unoptimized',
        'untrusted', 'upgradable', 'upgradeable', 'upskilling', 'uptime', 'usb', 'utensor', 'utopian',
        'vectornet', 'virusbokbok', 'vitis', 'von', 'vr', 'vtune', 'vulkan', 'waymo', 'wearables',
        'wellbeing', 'wi', 'xla', 'zero',
    }

    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return []

    findings = []
    for segment, line_num in extract_prose_text(content):
        checkable = clean_prose_text(segment)
        if checkable:
            flagged = check_with_aspell(checkable, ignore_terms)
            if flagged:
                # Keep the report readable: show at most 100 characters.
                snippet = segment[:100]
                if len(segment) > 100:
                    snippet += '...'
                findings.append({
                    'file': filepath.resolve(),  # Store absolute path
                    'line': line_num,
                    'text': snippet,
                    'misspelled': flagged,
                })

    return findings
|
|
|
|
|
|
def main():
    """
    Command-line entry point.

    Verifies that aspell can be run, spell checks every .qmd file below
    the directory given as the first argument (or the default content
    directory under the repository root), and prints a per-file report.

    Returns:
        0 when no spelling errors were found, 1 otherwise (including when
        aspell is unavailable or the directory does not exist).
    """
    # Check if aspell is available
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        print("Error: aspell not found. Install it with: brew install aspell", file=sys.stderr)
        return 1

    # The repository root is taken to be three levels above this script.
    repo_root = Path(__file__).resolve().parents[3]

    # Get directory to check
    target_dir = (
        Path(sys.argv[1])
        if len(sys.argv) > 1
        else repo_root / 'quarto' / 'contents' / 'core'
    )
    if not target_dir.exists():
        print(f"Error: Directory not found: {target_dir}", file=sys.stderr)
        return 1

    # Find all QMD files
    qmd_files = list(target_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for prose spelling errors...\n")

    all_errors = []
    files_with_errors = 0
    for qmd_file in sorted(qmd_files):
        file_errors = check_file(qmd_file)
        if not file_errors:
            continue
        files_with_errors += 1
        all_errors.extend(file_errors)

    # Print results
    if not all_errors:
        print("✓ No spelling errors found in prose text!")
        return 0

    print(f"Found {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")

    def report_order(entry):
        # Group by file, then by line within the file.
        return (str(entry['file']), entry['line'])

    current_file = None
    for error in sorted(all_errors, key=report_order):
        if error['file'] != current_file:
            # First error of a new file: print an underlined heading.
            current_file = error['file']
            try:
                rel_path = error['file'].relative_to(repo_root)
            except ValueError:
                # File lies outside the repository; show the full path.
                rel_path = error['file']
            print(f"\n{rel_path}")
            print("=" * len(str(rel_path)))

        print(f" Line {error['line']}: {error['text']}")
        print(f" → Misspelled: {', '.join(error['misspelled'])}")

    print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
    return 1
|
|
|
|
|
|
# Run the checker only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|