mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-04 00:29:10 -05:00
494 lines
17 KiB
Python
Executable File
494 lines
17 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
r"""
|
|
Spell check text content within TikZ diagrams.
|
|
|
|
Extracts and validates all visible text from TikZ diagrams in .qmd files,
|
|
including node labels, inline annotations, formatted text, and comments.
|
|
|
|
Usage:
|
|
python3 tools/scripts/content/check_tikz_spelling.py
|
|
|
|
Checks text in:
|
|
- Node commands: \node{text}, node{text} in \draw/\path/\fill
|
|
- Formatted text: \textbf{}, \textit{}, \emph{}, etc.
|
|
- Drawing annotations: \draw--node{label}--
|
|
- Custom pics: pics/name/, \pic{name}
|
|
- Foreach loops: /{Text}/ patterns
|
|
- Labels: label={text}, pin={text}
|
|
- Legends: \legend{Item 1, Item 2}
|
|
- Comments: % text
|
|
|
|
Optional: Install aspell for comprehensive dictionary checking
|
|
macOS: brew install aspell
|
|
Ubuntu: sudo apt-get install aspell
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import List, Tuple, Set
|
|
import subprocess
|
|
|
|
|
|
def extract_tikz_blocks(content: str, filepath: str) -> List[Tuple[str, int]]:
    r"""
    Extract TikZ code blocks with their starting line numbers.

    A block runs from the line containing ``\begin{tikzpicture}`` through the
    line containing the next ``\end{tikzpicture}``; both marker lines are kept
    whole. A picture written entirely on one line is returned as a one-line
    block. A new ``\begin{tikzpicture}`` seen before the current block is
    closed restarts the block at that line, and a block that is never closed
    is dropped.

    Args:
        content: Full text of the document to scan.
        filepath: Path of the document. Not used by the extraction itself;
            kept so existing callers keep working.

    Returns:
        List of (tikz_content, start_line_number) tuples; line numbers are
        1-based.
    """
    begin_marker = r'\begin{tikzpicture}'
    end_marker = r'\end{tikzpicture}'

    blocks = []
    current_block = []
    in_tikz = False
    start_line = 0

    for line_number, line in enumerate(content.split('\n'), 1):
        begin_at = line.find(begin_marker)
        if begin_at != -1:
            in_tikz = True
            start_line = line_number
            current_block = [line]
            # Only an \end that follows the \begin can close this block.
            end_search_from = begin_at + len(begin_marker)
        elif in_tikz:
            current_block.append(line)
            end_search_from = 0
        else:
            continue

        # Checked for every line of an open block, including the opening
        # line, so single-line pictures are closed immediately.
        if line.find(end_marker, end_search_from) != -1:
            blocks.append(('\n'.join(current_block), start_line))
            in_tikz = False
            current_block = []

    return blocks
|
|
|
|
|
|
def clean_latex_text(text: str) -> str:
    r"""
    Clean LaTeX formatting from text to get readable content.

    Line breaks (``\\``) become spaces, font/size switches are dropped,
    formatting wrappers such as ``\textbf{...}`` are replaced by their
    argument, escaped special characters (``\&``, ``\%``, ``\#``, ``\_``)
    are unescaped, math ``$`` signs and any remaining ``\command`` names are
    removed, and leftover grouping braces are stripped. Whitespace is
    collapsed to single spaces.

    Args:
        text: Raw text from LaTeX/TikZ

    Returns:
        Cleaned text with LaTeX commands removed
    """
    # Replace \\ (line breaks) with spaces first, so the two backslashes are
    # not mistaken for the start of an escape or a command below.
    text = text.replace('\\\\', ' ')

    # Unescape special characters; an escaped "$" then falls under the
    # math-sign removal further down instead of leaving a stray backslash.
    text = re.sub(r'\\([&%#_$])', r'\1', text)

    # Remove size commands that appear before text (like {\huge ?})
    text = re.sub(r'\\(tiny|scriptsize|footnotesize|small|normalsize|large|Large|LARGE|huge|Huge)\s+', ' ', text)

    # Remove font commands
    text = re.sub(r'\\usefont\{[^}]*\}\{[^}]*\}\{[^}]*\}\{[^}]*\}', ' ', text)
    text = re.sub(r'\\fontsize\{[^}]*\}\{[^}]*\}\\selectfont', ' ', text)
    text = re.sub(r'\\bfseries\s*', ' ', text)

    # Formatting wrappers that are replaced by their argument. The order
    # matters for nested commands and is applied as listed on every pass.
    wrappers = (
        (r'\\textbf\{([^}]+)\}', r'\1'),
        (r'\\textit\{([^}]+)\}', r'\1'),
        (r'\\emph\{([^}]+)\}', r'\1'),
        (r'\\text\{([^}]+)\}', r'\1'),
        (r'\\mathbf\{([^}]+)\}', r'\1'),
        (r'\\mathrm\{([^}]+)\}', r'\1'),
        (r'\\textsubscript\{([^}]+)\}', r'_\1'),
        (r'\\textsuperscript\{([^}]+)\}', r'^\1'),
        (r'\\textcolor\{[^}]*\}\{([^}]+)\}', r'\1'),
    )
    for _ in range(3):  # Up to 3 levels of nesting
        for pattern, replacement in wrappers:
            text = re.sub(pattern, replacement, text)

    # Remove $ signs (math mode)
    text = text.replace('$', '')

    # Remove other common LaTeX commands (but preserve the text after them)
    text = re.sub(r'\\[a-zA-Z]+\s*', ' ', text)

    # Grouping braces never render; they are left behind by scoped switches
    # ({\huge ?}) and by input that was cut at its first closing brace.
    text = text.replace('{', '').replace('}', '')

    # Clean up whitespace (also trims both ends)
    return ' '.join(text.split())
|
|
|
|
|
|
def extract_all_curly_brace_text(tikz_content: str) -> List[Tuple[str, str, int]]:
    r"""
    Extract all text content from curly braces that could be visible text.

    Each pattern is scanned over the whole input in turn, so results are
    grouped by pattern (in the order listed in the code) and ordered by
    position within each group. Captured text stops at the first ``}``, so
    text containing nested braces is returned truncated; the inner
    formatting command is picked up separately by its own pattern.

    Args:
        tikz_content: TikZ source to scan.

    Returns:
        List of (text, context, char_position) tuples, where char_position
        is the offset in tikz_content at which the matched command starts.
    """
    # Optional parts that may sit between "node" and its text, in any order
    # and any number: [options], (name), at (coordinate).
    node_prefix = r'(?:\s*(?:\[[^\]]*\]|\([^)]*\)|at\s*\([^)]*\)))*\s*'
    braced_text = r'\{([^}]+)\}'

    patterns = [
        # Pattern 1: \node[options](name) at (x,y) {text}
        (r'\\node' + node_prefix + braced_text, '\\node{...}'),
        # Pattern 2: node[options](name){text} inside \draw, \fill, or \path;
        # the lookbehind keeps it from re-matching \node.
        (r'(?<!\\)node' + node_prefix + braced_text, 'node{...} in draw/path/fill'),
        # Pattern 3: Text formatting commands
        (r'\\textbf\{([^}]+)\}', '\\textbf{...}'),
        (r'\\textit\{([^}]+)\}', '\\textit{...}'),
        (r'\\emph\{([^}]+)\}', '\\emph{...}'),
        (r'\\text\{([^}]+)\}', '\\text{...}'),
        # Pattern 4: label={text} and similar options
        (r'(?:label|pin|xlabel|ylabel)\s*=\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', 'label={...}'),
        # Pattern 5: legend command
        (r'\\legend\s*\{([^}]+)\}', '\\legend{...}'),
    ]

    return [
        (match.group(1), context, match.start())
        for pattern, context in patterns
        for match in re.finditer(pattern, tikz_content)
    ]
|
|
|
|
|
|
def _find_closing_brace(source: str, open_index: int) -> int:
    """Return the index of the brace that closes the one at open_index, or -1."""
    depth = 0
    index = open_index
    while index < len(source):
        char = source[index]
        if char == '\\':
            # Skip the escaped character so \{ and \} do not affect nesting.
            index += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return index
        index += 1
    return -1


def extract_text_from_foreach(tikz_content: str) -> List[Tuple[str, str]]:
    r"""
    Extract text from \foreach loops which often contain labels.

    Pattern: \\foreach \\i/\\j/... in {val1/{Text 1}/val2, val2/{Text 2}/val3, ...}

    The loop's value list is located by balanced-brace matching, so braced
    fields inside the list are seen in full. A braced field counts as label
    text when it is adjacent to a ``/`` separator on either side. Fields
    whose cleaned text is two characters or shorter, or purely numeric, are
    skipped.

    Args:
        tikz_content: TikZ source to scan.

    Returns:
        List of (text, context) tuples
    """
    texts = []

    # Find \foreach statements up to the brace that opens the value list.
    for header in re.finditer(r'\\foreach[^{]+in\s*\{', tikz_content):
        list_open = header.end() - 1
        list_close = _find_closing_brace(tikz_content, list_open)
        if list_close == -1:
            continue  # unterminated list: nothing reliable to extract
        content = tikz_content[list_open + 1:list_close]

        cursor = 0
        while True:
            field_open = content.find('{', cursor)
            if field_open == -1:
                break
            field_close = _find_closing_brace(content, field_open)
            if field_close == -1:
                break
            cursor = field_close + 1

            # Only slash-separated fields are labels; a bare {..} item is
            # usually just comma protection.
            preceded = content[:field_open].rstrip().endswith('/')
            followed = content[field_close + 1:].lstrip().startswith('/')
            if not (preceded or followed):
                continue

            text = content[field_open + 1:field_close]
            cleaned = clean_latex_text(text)
            if cleaned and len(cleaned) > 2:
                # Skip if it's just a number or coordinate
                if not re.match(r'^[\d\s\.,\-\+]+$', cleaned):
                    texts.append((cleaned, f'\\foreach loop: /{{{text}}}/'))

    return texts
|
|
|
|
|
|
def extract_text_from_tikz(tikz_content: str) -> List[Tuple[str, str]]:
    r"""
    Extract ALL human-readable text from TikZ code.

    Sources, in output order: braced text found by
    extract_all_curly_brace_text, \foreach labels, pic definition names
    (pics/name/), \pic usages, % comments, and \def macro names. Within each
    source an entry is reported once (compared case-insensitively).

    Args:
        tikz_content: Source of a single TikZ picture.

    Returns:
        List of (text, context) tuples where context shows where the text was found
    """
    texts = []
    seen_texts = set()  # Avoid duplicates

    def remember(text, bucket, context):
        """Record (text, context) unless the same text was already seen in this bucket."""
        key = (text.lower(), bucket)
        if key not in seen_texts:
            seen_texts.add(key)
            texts.append((text, context))

    # Extract all text from curly braces
    for raw_text, context, _pos in extract_all_curly_brace_text(tikz_content):
        cleaned = clean_latex_text(raw_text)

        # Skip if it's just numbers, coordinates, colors, or TikZ commands
        if not cleaned:
            continue
        if re.match(r'^[\d\s\.,\-\+\*\/\(\)_\^]+$', cleaned):  # Just numbers/math/subscripts
            continue
        if re.match(r'^[a-z]+!?\d*$', cleaned):  # Colors like "red", "blue!50"
            continue
        if len(cleaned) < 2:  # Too short to be meaningful text
            continue

        remember(cleaned, context, f'{context}: "{raw_text}"')

    # Extract text from \foreach loops
    for text, context in extract_text_from_foreach(tikz_content):
        remember(text, 'foreach', context)

    # Extract text from pic names (custom TikZ pictures)
    for match in re.finditer(r'pics/([a-zA-Z_]+)/', tikz_content):
        name = match.group(1)
        if len(name) > 2:
            remember(name, 'pics', f'pics/{name}/')

    # Extract text from pic usage
    pic_usage_pattern = r'\\pic\s*(?:\[[^\]]*\])?\s*(?:at\s*\([^)]*\))?\s*\{([^}]+)\}'
    for match in re.finditer(pic_usage_pattern, tikz_content):
        name = match.group(1)
        if len(name) > 2 and not re.match(r'^[\d\s]+$', name):
            remember(name, 'pic_usage', f'\\pic{{...}}{{{name}}}')

    # Extract comments (often contain descriptive text). The lookbehind
    # ignores an escaped percent sign (\%), and only same-line whitespace is
    # skipped after "%" so an empty comment cannot swallow the next line.
    for match in re.finditer(r'(?<!\\)%[^\S\n]*(.+)', tikz_content):
        comment = match.group(1).strip()
        # Skip comments that are just separators or structure
        if comment and not re.match(r'^[\-\=\*\s]+$', comment):
            remember(comment, 'comment', f'% {comment}')

    # Extract variable names from \def that might be words
    for match in re.finditer(r'\\def\\([a-zA-Z]+)\{', tikz_content):
        name = match.group(1)
        # Only check if it looks like a word (not all caps, reasonable length)
        if len(name) > 3 and not name.isupper() and not name.startswith('r'):
            remember(name, 'def', f'\\def\\{name}')

    return texts
|
|
|
|
|
|
# Terms to ignore (TikZ syntax, LaTeX commands, common technical terms, etc.).
# Compared against the lower-cased word reported by aspell.
_ASPELL_IGNORE_TERMS = frozenset({
    # TikZ pic parameters
    'scalefac', 'picname', 'filllcolor', 'drawcolor', 'linewidth',
    'filllcirclecolor', 'drawcircle', 'bodycolor', 'tiecolor', 'stetcolor',
    'drawchannelcolor', 'channelcolor',

    # Color names
    'brownline', 'redline', 'blueline', 'violetline', 'greenline', 'orangeline',
    'violetl', 'greenl', 'bluel', 'redl', 'orangel',
    'greend',

    # TikZ/LaTeX commands
    'tikzset', 'foreach', 'tikz', 'usefont', 'phv', 'bfseries', 'textbf',
    'pgfmathparse', 'addplot', 'sqrt',

    # Common variable names
    'cellsize', 'cellheight', 'xmax', 'ymin', 'newx', 'pos', 'sep',

    # Technical diagram terms
    'mycylinder', 'mycycle', 'myline', 'rgpoly', 'zerofill',

    # Display/UI elements
    'displaye', 'autotext',

    # Abbreviations used in diagrams
    'zgl', 'zgd', 'da', 'dcd', 'dcl', 'dsc', 'ggb', 'lca', 'sre',

    # Common acronyms and abbreviations
    'ui', 'kpis', 'oss', 'rtx', 'tpus', 'bg', 'eniac', 'fp',

    # Technical terms (keep legitimate ones but add clearly technical)
    'preprocessing', 'backprop', 'weightgradient', 'davit', 'tokenize',
    'multimodality', 'microarchitecture', 'hypercomputing', 'curation',
    'transformative',

    # Misc
    'helvetica', 'geeksforgeeks', 'lightgray', 'gaussian', 'yshift',
    'ack', 'zz', 'yy',
})


def check_spelling_with_aspell(text: str) -> List[str]:
    """
    Check spelling using aspell if available, filtering out TikZ/LaTeX technical terms.

    This is best-effort: when aspell is not installed, cannot be run, or
    exits with a non-zero status, an empty list is returned instead of
    raising.

    Args:
        text: Plain text to check.

    Returns:
        List of misspelled words (excluding known technical terms)
    """
    if not text.strip():
        return []  # nothing to check; avoid spawning a process

    # Run the check directly: a missing or broken aspell shows up here as an
    # exception or a non-zero exit, so no separate availability probe is
    # needed for every snippet.
    try:
        result = subprocess.run(
            ['aspell', 'list', '--lang=en'],
            input=text,
            capture_output=True,
            text=True,
        )
    except (OSError, subprocess.SubprocessError, UnicodeError):
        return []

    if result.returncode != 0:
        return []

    # aspell prints one unknown word per line.
    reported = (line.strip() for line in result.stdout.splitlines())
    return [
        word for word in reported
        if word and word.lower() not in _ASPELL_IGNORE_TERMS
    ]
|
|
|
|
|
|
def simple_spell_check(text: str) -> List[str]:
    """
    Look up every alphabetic word of the text in a fixed table of common typos.

    The text is lower-cased first and each occurrence is reported, so a typo
    that appears twice yields two entries, in order of appearance.

    Args:
        text: Text to scan.

    Returns:
        List of "<typo> (suggest: <correction>)" strings, empty if none match
    """
    corrections = {
        'teh': 'the',
        'htat': 'that',
        'taht': 'that',
        'adn': 'and',
        'nad': 'and',
        'gatewey': 'gateway',
        'poihnts': 'points',
        'poitns': 'points',
        'recieve': 'receive',
        'seperate': 'separate',
        'occured': 'occurred',
        'occurance': 'occurrence',
        'begining': 'beginning',
        'lenght': 'length',
        'widht': 'width',
        'heigth': 'height',
        'coordiante': 'coordinate',
        'cooridate': 'coordinate',
        'paramter': 'parameter',
        'paramters': 'parameters',
        'intellignet': 'intelligent',
    }

    lowered = text.lower()
    return [
        f'{token} (suggest: {corrections[token]})'
        for token in re.findall(r'\b[a-zA-Z]+\b', lowered)
        if token in corrections
    ]
|
|
|
|
|
|
def check_file(filepath: Path, use_aspell: bool = True) -> List[dict]:
    """
    Spell check the TikZ diagrams of one file.

    Every text found in a diagram goes through the pattern-based check and,
    when use_aspell is true, through aspell as well; each check that reports
    something contributes its own entry. The line recorded for an entry is
    the line on which the enclosing TikZ block starts.

    Args:
        filepath: File to read (decoded as UTF-8).
        use_aspell: Whether to run the aspell check in addition to the
            pattern-based one.

    Returns:
        List of dictionaries with the keys file, line, text, context and
        suggestions; empty when nothing is found or the file cannot be read
        (the read error is printed to stderr).
    """
    try:
        content = filepath.read_text(encoding='utf-8')
    except Exception as e:
        print(f"Error reading {filepath}: {e}", file=sys.stderr)
        return []

    file_label = str(filepath)
    errors = []

    for tikz_content, start_line in extract_tikz_blocks(content, file_label):
        for text, context in extract_text_from_tikz(tikz_content):
            # Pattern check always runs; aspell only when requested.
            reports = [simple_spell_check(text)]
            if use_aspell:
                reports.append(check_spelling_with_aspell(text))

            for suggestions in reports:
                if not suggestions:
                    continue
                errors.append({
                    'file': file_label,
                    'line': start_line,
                    'text': text,
                    'context': context,
                    'suggestions': suggestions
                })

    return errors
|
|
|
|
|
|
def main():
    """
    Spell check the TikZ diagrams of every .qmd file under quarto/contents.

    The repository root is taken to be three directories above this script.
    Findings are printed grouped by file.

    Returns:
        0 when no issues are found; 1 when issues are found or the contents
        directory does not exist.
    """
    repo_root = Path(__file__).resolve().parents[3]
    contents_dir = repo_root / 'quarto' / 'contents'

    if not contents_dir.exists():
        print(f"Error: Contents directory not found at {contents_dir}", file=sys.stderr)
        return 1

    qmd_files = sorted(contents_dir.rglob('*.qmd'))
    print(f"Checking {len(qmd_files)} .qmd files for TikZ spelling errors...\n")

    # Probe for aspell once; fall back to the pattern check when it is missing.
    try:
        subprocess.run(['aspell', '--version'], capture_output=True, check=True)
    except (FileNotFoundError, subprocess.CalledProcessError):
        use_aspell = False
        print("aspell not found. Using pattern-based checking only.")
        print("Install aspell for more comprehensive checking: brew install aspell\n")
    else:
        use_aspell = True
        print("Using aspell for comprehensive spell checking.")

    all_errors = []
    files_with_errors = 0
    for qmd_file in qmd_files:
        file_errors = check_file(qmd_file, use_aspell)
        if not file_errors:
            continue
        files_with_errors += 1
        all_errors += file_errors

    if not all_errors:
        print("\n✓ No spelling errors found in TikZ diagrams!")
        return 0

    print(f"\nFound {len(all_errors)} potential spelling errors in {files_with_errors} files:\n")

    previous_file = None
    for error in sorted(all_errors, key=lambda e: (e['file'], e['line'])):
        # Print a heading whenever the report moves on to another file.
        if error['file'] != previous_file:
            previous_file = error['file']
            rel_path = Path(error['file']).relative_to(repo_root)
            print(f"\n{rel_path}")
            print("=" * len(str(rel_path)))

        print(f" Line {error['line']}: {error['context']}")
        print(f" → Issues: {', '.join(error['suggestions'])}")

    print(f"\n\nSummary: {len(all_errors)} potential errors in {files_with_errors} files")
    return 1
|
|
|
|
|
|
if __name__ == '__main__':
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())
|