Files
cs249r_book/book/tools/scripts/content/format_tables.py
Vijay Janapa Reddi ccd7e5f7a9 Fix table formatter HTML escaping bug and add rendering validator
format_tables.py was escaping <, >, & to HTML entities inside Markdown
grid tables, breaking LaTeX math and comparison operators in rendered
output. Removed the escape_html_entities() calls since Quarto grid
tables are Markdown, not HTML.

New validate_tables.py catches rendering issues the structural formatter
misses: bare pipes in LaTeX math, \frac in multiline cells, HTML
entities, and missing table labels.
2026-02-01 12:41:55 -05:00

990 lines
34 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Production Table Formatter for MLSysBook
This script formats and validates grid tables in Quarto .qmd files.
Designed for use in CI/CD pipelines and pre-commit hooks.
Key Features:
- Smart column header bolding (always, including multiline headers)
- Intelligent first column bolding (based on content analysis)
- Proper spacing calculation accounting for bold markers
- Handles multiline headers, multiline cells, empty cells, and Unicode
- Comprehensive validation with detailed error reporting
- Exit codes suitable for CI/CD integration
Usage:
# Check single file
python format_tables.py --check -f quarto/contents/vol1/efficient_ai/efficient_ai.qmd
# Fix single file
python format_tables.py --fix -f quarto/contents/vol1/efficient_ai/efficient_ai.qmd
# Check all files in a directory
python format_tables.py --check -d quarto/contents/vol1/optimizations
# Fix all chapter files
python format_tables.py --fix --all
# With text wrapping
python format_tables.py --fix --all --max-width 60
Exit Codes:
0: Success (all tables properly formatted)
1: Formatting issues found
2: Validation errors (structural problems)
3: File errors
"""
import argparse
import sys
import unicodedata
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from enum import Enum
class ExitCode(Enum):
    """Process exit statuses returned by the command-line entry point."""

    SUCCESS = 0            # all tables properly formatted
    FORMATTING_ISSUES = 1  # formatting issues found
    VALIDATION_ERRORS = 2  # structural problems in at least one table
    FILE_ERRORS = 3        # a file or directory could not be used
@dataclass
class TableIssue:
    """One problem recorded while parsing or checking a table."""

    line_num: int  # line position supplied by whoever records the issue
    severity: str  # 'error' or 'warning'
    message: str   # human-readable description of the problem
class GridTableParser:
    """Split one grid-style Markdown table into headers, alignments and rows.

    Construct it with the raw table lines, call :meth:`parse`, then read the
    result attributes. Problems are collected in ``self.issues`` instead of
    being raised.
    """

    def __init__(self, lines: List[str], start_line: int = 0):
        self.lines = lines
        self.start_line = start_line
        self.issues: List[TableIssue] = []
        # Results filled in by parse()
        self.header_rows: List[List[str]] = []   # every header line (multiline headers)
        self.header_cells: List[str] = []        # first header line only (legacy accessor)
        self.data_rows: List[List[str]] = []
        self.alignments: List[str] = []
        self.num_columns = 0
        # One flag per entry of data_rows: was that line followed by a border?
        self.row_has_border_after: List[bool] = []

    def _line_starts_with(self, idx: int, marker: str) -> bool:
        """Return True when line ``idx`` exists and, once stripped, starts with ``marker``."""
        return idx < len(self.lines) and self.lines[idx].strip().startswith(marker)

    def _fail(self, offset: int, message: str) -> bool:
        """Record an error ``offset`` lines into the table and return False."""
        self.issues.append(TableIssue(self.start_line + offset, 'error', message))
        return False

    def parse(self) -> bool:
        """
        Parse ``self.lines``.

        Returns:
            True when the table was understood, False otherwise (the reason
            is appended to ``self.issues``).
        """
        if not self._line_starts_with(0, '+'):
            return self._fail(0, 'Table must start with border line (+----+...)')

        try:
            pos = 1  # line 0 is the top border

            # --- header lines: everything up to the separator ---
            if not self._line_starts_with(pos, '|'):
                return self._fail(pos, 'Expected header row after top border')

            while self._line_starts_with(pos, '|'):
                header_row = self._parse_row(self.lines[pos])
                if not self.header_rows:
                    # The first header line fixes the column count.
                    self.num_columns = len(header_row)
                    self.header_cells = header_row
                elif len(header_row) != self.num_columns:
                    return self._fail(
                        pos,
                        f'Header row has {len(header_row)} columns, expected {self.num_columns}'
                    )
                self.header_rows.append(header_row)
                pos += 1

            if not self.header_rows:
                return self._fail(pos, 'No header rows found')

            # --- separator carrying the alignment markers ---
            if not self._line_starts_with(pos, '+'):
                return self._fail(pos, 'Expected separator with alignment markers (+:===+...)')

            self.alignments = self._extract_alignments(self.lines[pos])
            if len(self.alignments) != self.num_columns:
                return self._fail(
                    pos,
                    f'Alignment count ({len(self.alignments)}) != column count ({self.num_columns})'
                )
            pos += 1

            # --- data lines, each optionally followed by a border ---
            while pos < len(self.lines):
                current = self.lines[pos].strip()
                if current.startswith('|'):
                    cells = self._parse_row(current)
                    if len(cells) != self.num_columns:
                        return self._fail(
                            pos,
                            f'Row has {len(cells)} columns, expected {self.num_columns}'
                        )
                    self.data_rows.append(cells)
                    pos += 1

                    followed_by_border = self._line_starts_with(pos, '+')
                    self.row_has_border_after.append(followed_by_border)
                    if followed_by_border:
                        pos += 1  # step over the border
                        if not self._line_starts_with(pos, '|'):
                            break  # nothing but the closing border was left
                elif current.startswith('+'):
                    # Stray border; tolerate it and stop if no row follows.
                    pos += 1
                    if not self._line_starts_with(pos, '|'):
                        break
                else:
                    break  # first non-table line ends the table

            return True
        except Exception as exc:
            self.issues.append(TableIssue(
                self.start_line, 'error',
                f'Parsing error: {str(exc)}'
            ))
            return False

    def _parse_row(self, row: str) -> List[str]:
        """Split a ``| a | b |`` line into its stripped cell texts."""
        body = row.strip()
        if body.startswith('|'):
            body = body[1:]
        if body.endswith('|'):
            body = body[:-1]
        return list(map(str.strip, body.split('|')))

    def _extract_alignments(self, separator: str) -> List[str]:
        """Read 'left' / 'center' / 'right' for each segment of a separator line."""
        # Drop the pieces before the first '+' and after the last '+'.
        segments = separator.strip().split('+')[1:-1]
        found = []
        for segment in (piece.strip() for piece in segments):
            if not segment:
                continue  # empty segments contribute no column
            colon_left = segment.startswith(':')
            colon_right = segment.endswith(':')
            if colon_left and colon_right:
                found.append('center')
            elif colon_right:
                found.append('right')
            else:
                # A leading colon or no colon at all both mean left.
                found.append('left')
        return found
def display_width(text: str) -> int:
    """
    Return the on-screen width of ``text``.

    Every ``**`` pair of characters is ignored, and East Asian fullwidth ('F')
    or wide ('W') characters occupy two columns; everything else occupies one.
    """
    visible = text.replace('**', '')
    return sum(
        2 if unicodedata.east_asian_width(ch) in ('F', 'W') else 1
        for ch in visible
    )
def is_bolded(text: str) -> bool:
    """Return True when the stripped text is wrapped in ``**`` and has content between them."""
    candidate = text.strip()
    if len(candidate) <= 4:
        # '****' or shorter leaves no room for content between the markers.
        return False
    return candidate.startswith('**') and candidate.endswith('**')
def add_bold(text: str) -> str:
    """Wrap the stripped text in ``**`` unless it is empty or already bolded.

    Empty or whitespace-only input yields an empty string.
    """
    stripped = text.strip()
    if not stripped:
        return ''
    return stripped if is_bolded(stripped) else f"**{stripped}**"
def remove_bold(text: str) -> str:
    """Return the stripped text without its surrounding ``**`` markers, if it has them."""
    stripped = text.strip()
    return stripped[2:-2] if is_bolded(stripped) else stripped
def detect_column_alignments(header_rows: List[List[str]], data_rows: List[List[str]]) -> List[str]:
    """
    Detect the alignment each column should use, based on its content.

    Rules:
    - First column: always left-aligned (book style guide requirement).
    - Numeric columns (more than 70% of the non-empty cells are numbers):
      right-aligned.
    - Everything else (text, mixed, empty): left-aligned.

    A cell counts as a number when, after optional surrounding ``**``, it
    *starts* with a number, optionally preceded by comparators, signs,
    currency symbols or an opening parenthesis ("100 mW", "~2.5 W", "<1 ms",
    "$3,000"). Merely containing a digit is not enough: identifiers such as
    "FP16" or "ResNet-50" are text and must not push a column to the right.

    Args:
        header_rows: Header lines; the first one defines the column count.
        data_rows: Data lines; rows shorter than the header are tolerated.

    Returns:
        One of 'left' / 'right' per column, or an empty list when there are
        no header rows.
    """
    import re

    if not header_rows:
        return []
    num_columns = len(header_rows[0])
    if not data_rows:
        return ['left'] * num_columns

    # Optional prefix of comparator / sign / currency / '(' / whitespace
    # characters, an optional leading '.', then a digit.
    starts_with_number = re.compile(r'^[\\<>~≤≥≈±+\-−=$€£(\s]*\.?\d')

    alignments = []
    for col_idx in range(num_columns):
        # RULE: first column is ALWAYS left-aligned (book style guide).
        if col_idx == 0:
            alignments.append('left')
            continue

        # Gather the non-empty cells of this column, without bold markers.
        column_values = []
        for row in data_rows:
            if col_idx >= len(row):
                continue
            cell = row[col_idx].strip()
            if cell.startswith('**') and cell.endswith('**'):
                cell = cell[2:-2].strip()
            if cell:
                column_values.append(cell)

        if not column_values:
            alignments.append('left')
            continue

        numeric_count = sum(1 for value in column_values if starts_with_number.match(value))
        if numeric_count / len(column_values) > 0.7:
            alignments.append('right')
        else:
            alignments.append('left')

    return alignments
def should_bold_first_column(header_cells: List[str], data_rows: List[List[str]]) -> bool:
    """
    Decide whether the first column of a table should be bolded.

    The decision is taken in this order:

    1. No header cells -> False.
    2. The first header contains a "category" keyword (Aspect, Technique,
       Metric, ...) -> True.
    3. The first header contains an "enumeration" keyword (ID, #, Year,
       Count, ...) -> False.
    4. More than 70% of the non-empty first-column cells parse as numbers
       (after removing ``%``, ``$`` and ``,``) -> False.
    5. Otherwise -> True (comparison-style table is the default).

    Keywords are matched on word boundaries, not as raw substrings: "id" must
    not fire on "Provider" or "Validation", nor "era" on "Operation". Category
    keywords match at the start of a word so plurals such as "Techniques"
    still count; enumeration keywords match whole words with an optional
    plural ending.

    Args:
        header_cells: Cells of the first header line.
        data_rows: Parsed data rows.

    Returns:
        True when the first column should be bolded.
    """
    import re

    if not header_cells:
        return False

    first_header = remove_bold(header_cells[0]).lower()

    # Headers that announce a category / definition column.
    bold_indicators = [
        'aspect', 'technique', 'category', 'architecture', 'challenge',
        'criterion', 'criteria', 'feature', 'characteristic', 'dimension',
        'metric', 'property', 'attribute', 'method', 'approach', 'strategy',
        'type', 'principle', 'factor', 'component', 'element', 'term',
        'concept', 'deployment context', 'system aspect', 'design pattern',
        'era', 'role', 'threat type', 'mechanism', 'resource type',
        'storage tier', 'stage'
    ]
    bold_pattern = r'(?<![a-z])(?:' + '|'.join(map(re.escape, bold_indicators)) + ')'
    if re.search(bold_pattern, first_header):
        return True

    # Headers that announce an enumeration / plain data column.
    no_bold_indicators = [
        'id', 'number', 'index', 'rank', 'year', 'date', 'time',
        'count', 'order'
    ]
    no_bold_pattern = (
        r'(?<![a-z])(?:' + '|'.join(map(re.escape, no_bold_indicators)) + r')(?:e?s)?(?![a-z])'
    )
    if '#' in first_header or re.search(no_bold_pattern, first_header):
        return False

    # No verdict from the header: look at the column content.
    if not data_rows:
        return True  # default to bolding when there is nothing to analyze

    first_col_values = [row[0] for row in data_rows if row and row[0].strip()]
    if not first_col_values:
        return True

    numeric_count = 0
    for value in first_col_values:
        clean = remove_bold(value).replace('%', '').replace('$', '').replace(',', '').strip()
        try:
            float(clean)
        except ValueError:
            continue
        numeric_count += 1

    if numeric_count > len(first_col_values) * 0.7:
        return False  # mostly numbers: a data table

    # Non-numeric first columns are bolded whether or not their entries are
    # multi-word phrases, so no further analysis is needed.
    return True
def calculate_column_widths(parser: GridTableParser,
                            bold_headers: bool = True,
                            bold_first_col: bool = False) -> List[int]:
    """
    Return the display width needed by each column of a parsed table.

    Only the first header line (``parser.header_cells``) and the data rows
    are measured. Cells that would be bolded are measured in bolded form.
    """
    def _as_rendered(cell: str, make_bold: bool) -> str:
        # Non-empty, not-yet-bold cells are measured after bolding.
        if make_bold and cell.strip() and not is_bolded(cell):
            return add_bold(cell)
        return cell

    widths = [0] * parser.num_columns

    for col, cell in enumerate(parser.header_cells):
        needed = display_width(_as_rendered(cell, bold_headers))
        widths[col] = max(widths[col], needed)

    for row in parser.data_rows:
        for col, cell in enumerate(row):
            # Only the first column is ever bolded in data rows.
            needed = display_width(_as_rendered(cell, bold_first_col and col == 0))
            widths[col] = max(widths[col], needed)

    return widths
def build_border(widths: List[int]) -> str:
    """Return a border line such as ``+----+----+`` for the given column widths."""
    # Each segment is two dashes wider than its column: one padding space per side.
    segments = '+'.join('-' * (width + 2) for width in widths)
    return f'+{segments}+'
def build_separator(widths: List[int], alignments: List[str]) -> str:
    """Return the header separator line, e.g. ``+:===+:===:+====:+``.

    Every segment is ``width + 2`` characters long so the separator lines up
    with the borders from ``build_border``. Colons mark the alignment:
    leading for 'left', trailing for 'right', both for 'center', none for any
    other value.
    """
    # (leading marker, trailing marker) per alignment name
    markers = {
        'center': (':', ':'),
        'left': (':', ''),
        'right': ('', ':'),
    }
    segments = []
    for width, align in zip(widths, alignments):
        lead, trail = markers.get(align, ('', ''))
        fill = '=' * (width + 2 - len(lead) - len(trail))
        segments.append(lead + fill + trail)
    return '+' + '+'.join(segments) + '+'
def escape_html_entities(content: str) -> str:
    r"""Replace bare ``<`` and ``>`` with ``&lt;`` and ``&gt;``.

    Left untouched:
    - backslash-escaped sequences ``\>`` and ``\<``
    - HTML tags such as <li>, </li>, <ul>, <br />
    """
    import re

    # Step 1: hide the backslash-escaped comparators.
    for raw, token in (('\\>', '\x00ESCAPED_GT\x00'), ('\\<', '\x00ESCAPED_LT\x00')):
        content = content.replace(raw, token)

    # Step 2: hide HTML tags (opening, closing, self-closing, with attributes),
    # remembering each one so it can be put back verbatim.
    stashed_tags = []

    def _stash(match):
        stashed_tags.append(match.group(0))
        return f'\x00TAG_{len(stashed_tags) - 1}\x00'

    content = re.sub(r'</?[a-zA-Z][a-zA-Z0-9]*(?:\s+[^>]*)?/?>', _stash, content)

    # Step 3: whatever angle brackets remain are bare ones.
    content = content.replace('>', '&gt;').replace('<', '&lt;')

    # Step 4: put tags and escaped sequences back.
    for position, tag in enumerate(stashed_tags):
        content = content.replace(f'\x00TAG_{position}\x00', tag)
    content = content.replace('\x00ESCAPED_GT\x00', '\\>')
    content = content.replace('\x00ESCAPED_LT\x00', '\\<')
    return content
def format_cell(content: str, width: int, alignment: str = 'left') -> str:
    """Return the stripped cell text padded with spaces on the right to ``width``.

    ``width`` is a literal character count, so ``**`` markers and HTML
    entities count in full. Text longer than ``width`` is returned unpadded
    and never truncated. ``alignment`` is accepted for signature
    compatibility but does not affect the result: cell text is always
    left-aligned, and column alignment is expressed only in the separator
    row.
    """
    return content.strip().ljust(width)
def wrap_cell_text(text: str, max_width: int) -> List[str]:
    """
    Wrap cell text to ``max_width`` characters, breaking only after commas.

    A break is taken only after a comma that is followed by whitespace (a
    list separator). Commas inside a token, such as the thousands separators
    in "1,000,000", are never break points, and the text between break points
    is kept exactly as written. A piece longer than ``max_width`` that
    contains no break point is returned on its own line, unbroken.

    Args:
        text: Cell content; surrounding whitespace is ignored.
        max_width: Maximum literal length of a line.

    Returns:
        The wrapped lines (a single-element list when the text already fits).
    """
    import re

    text = text.strip()
    if len(text) <= max_width:
        return [text]

    # Split *after* each ", " style separator; the comma stays attached to
    # the piece on its left and the whitespace run is dropped.
    pieces = re.split(r'(?<=,)\s+', text)

    lines = []
    current_line = ''
    for piece in pieces:
        if not current_line:
            current_line = piece
        elif len(current_line) + 1 + len(piece) <= max_width:
            current_line = f'{current_line} {piece}'
        else:
            lines.append(current_line)
            current_line = piece
    if current_line:
        lines.append(current_line)

    # Always hand back at least one line so callers can index line 0.
    return lines or [text]
def wrap_table_rows(data_rows: List[List[str]], max_width: int) -> List[List[str]]:
    """
    Wrap every cell that exceeds ``max_width``, adding continuation rows.

    A row whose cells all fit is passed through unchanged. Otherwise it is
    replaced by one row per wrapped line, with empty strings where a cell has
    no further text. When ``max_width`` is None the input list is returned
    as is.
    """
    if max_width is None:
        return data_rows

    result = []
    for row in data_rows:
        wrapped_columns = [wrap_cell_text(cell, max_width) for cell in row]

        if all(len(cell_lines) <= 1 for cell_lines in wrapped_columns):
            # Nothing wrapped: keep the original row object.
            result.append(row)
            continue

        # At least one cell spans several lines; emit one row per line.
        height = max(len(cell_lines) for cell_lines in wrapped_columns)
        for line_no in range(height):
            result.append([
                cell_lines[line_no] if line_no < len(cell_lines) else ''
                for cell_lines in wrapped_columns
            ])
    return result
def format_table_lines(parser: GridTableParser, max_width: Optional[int] = None) -> List[str]:
    """
    Render a parsed table as canonically formatted grid-table lines.

    Formatting rules applied:
    - every non-empty header cell is bolded;
    - first-column cells are bolded when ``should_bold_first_column`` says so;
    - alignment markers come from ``detect_column_alignments``;
    - column widths are the literal length (``len``) of the widest formatted
      cell, because the ``**`` markers occupy real characters in the grid;
    - a border follows a row wherever the original table had one, and the
      table always ends with a border.

    Wrapping is done row by row so the "border after this row" flag of each
    original row moves to the last continuation line that row produces.
    Wrapping all rows first and then indexing the flags by output row would
    put borders in the middle of wrapped cells and could drop the closing
    border.

    Args:
        parser: A successfully parsed table.
        max_width: Maximum cell width before wrapping; None disables wrapping.

    Returns:
        The formatted table, one string per line.
    """
    bold_first_col = should_bold_first_column(parser.header_cells, parser.data_rows)
    # Alignments are detected on the original rows, before any wrapping.
    optimal_alignments = detect_column_alignments(parser.header_rows, parser.data_rows)

    def _bold_if_needed(cell: str) -> str:
        # Empty cells stay empty so multiline continuation cells are preserved.
        if cell.strip() and not is_bolded(cell):
            return add_bold(cell)
        return cell

    # Wrap each source row on its own, keeping the border flags aligned.
    wrapped_data = []
    border_after = []
    original_flags = parser.row_has_border_after
    for row_idx, row in enumerate(parser.data_rows):
        pieces = wrap_table_rows([row], max_width)
        if not pieces:
            continue
        has_border = row_idx < len(original_flags) and original_flags[row_idx]
        wrapped_data.extend(pieces)
        border_after.extend([False] * (len(pieces) - 1))
        border_after.append(has_border)

    # Headers are always bolded; wrapping happens before bolding.
    formatted_header_rows = [
        [_bold_if_needed(cell) for cell in header_row]
        for header_row in parser.header_rows
    ]
    formatted_data = [
        [
            _bold_if_needed(cell) if (col == 0 and bold_first_col) else cell
            for col, cell in enumerate(row)
        ]
        for row in wrapped_data
    ]

    # Literal widths: len(), not display_width(), so '**' is counted.
    widths = [0] * parser.num_columns
    for row in formatted_header_rows + formatted_data:
        for col, cell in enumerate(row):
            widths[col] = max(widths[col], len(cell.strip()))

    def _render_row(cells: List[str]) -> str:
        padded = [
            format_cell(cell, width, align)
            for cell, width, align in zip(cells, widths, optimal_alignments)
        ]
        return '| ' + ' | '.join(padded) + ' |'

    border = build_border(widths)
    lines = [border]
    lines.extend(_render_row(header_row) for header_row in formatted_header_rows)
    lines.append(build_separator(widths, optimal_alignments))

    for row, has_border in zip(formatted_data, border_after):
        lines.append(_render_row(row))
        if has_border:
            lines.append(border)

    # Close the table unless the last row already emitted its border.
    if not (border_after and border_after[-1]):
        lines.append(border)

    return lines
def validate_table(parser: GridTableParser, max_width: Optional[int] = None) -> Tuple[bool, List[str]]:
    """
    Compare a parsed table with the formatting rules.

    Args:
        parser: A successfully parsed table.
        max_width: Wrapping width passed on to ``format_table_lines``.

    Returns:
        (is_valid, list_of_warnings) - ``is_valid`` is True when no warning
        was produced.
    """
    warnings = []

    # Style guide: the first column is always left-aligned.
    if parser.alignments and parser.alignments[0] != 'left':
        warnings.append(f"First column must be left-aligned (found: {parser.alignments[0]})")

    # Headers: every non-empty cell of every header line must be bold.
    # Columns are reported 1-based, each at most once.
    plain_header_columns = sorted({
        col_idx + 1
        for header_row in parser.header_rows
        for col_idx, cell in enumerate(header_row)
        if cell.strip() and not is_bolded(cell)
    })
    if plain_header_columns:
        warnings.append(f"Headers not bolded in columns: {', '.join(map(str, plain_header_columns))}")

    # First column: only checked when the table type calls for bolding.
    if should_bold_first_column(parser.header_cells, parser.data_rows):
        plain_rows = [
            row_number
            for row_number, row in enumerate(parser.data_rows, start=1)
            if row[0].strip() and not is_bolded(row[0])
        ]
        if plain_rows:
            warnings.append(f"First column not bolded in rows: {', '.join(map(str, plain_rows[:5]))}")
            if len(plain_rows) > 5:
                warnings.append(f" ... and {len(plain_rows) - 5} more rows")

    # Spacing: the first border of the table must equal the canonical one.
    formatted_lines = format_table_lines(parser, max_width)
    first_original_border = next(
        (line for line in parser.lines if line.strip().startswith('+')), None
    )
    first_formatted_border = next(
        (line for line in formatted_lines if line.startswith('+')), None
    )
    if first_original_border is not None and first_formatted_border is not None:
        if first_original_border.strip() != first_formatted_border.strip():
            warnings.append("Table spacing is incorrect (column widths don't match content)")

    # Content lines: any difference means cells need re-padding or re-bolding.
    original_content = [
        line.strip() for line in parser.lines
        if line.strip().startswith('|') and '|' in line[1:]
    ]
    formatted_content = [
        line.strip() for line in formatted_lines
        if line.startswith('|') and '|' in line[1:]
    ]
    if original_content and formatted_content and original_content != formatted_content:
        warnings.append("Cell content alignment needs updating")

    return not warnings, warnings
def extract_tables_from_file(file_path: Path) -> List[Tuple[int, List[str], int]]:
    """
    Locate every grid table in a file.

    A table starts at a line that, once stripped, begins with ``+`` and
    contains ``---``. It continues while lines (right-stripped) begin with
    ``+`` or ``|``. Runs shorter than five lines are ignored.

    Returns:
        List of (start_line, table_lines, end_line) tuples, where both line
        values are 0-based indexes into the file and ``end_line`` is the
        index just past the table.
    """
    lines = file_path.read_text(encoding='utf-8').split('\n')
    total = len(lines)
    tables = []

    cursor = 0
    while cursor < total:
        candidate = lines[cursor]
        if not (candidate.strip().startswith('+') and '---' in candidate):
            cursor += 1
            continue

        # Possible table: collect the run of border / row lines.
        start_line = cursor
        table_lines = []
        while cursor < total:
            line = lines[cursor].rstrip()
            if not line.startswith(('+', '|')):
                break  # blank or unrelated line ends the run
            table_lines.append(line)
            cursor += 1

        if len(table_lines) >= 5:  # minimum valid table
            tables.append((start_line, table_lines, cursor))
        else:
            # Too short to be a table; resume right after the candidate line.
            cursor = start_line + 1

    return tables
def process_file(file_path: Path, mode: str, verbose: bool = False, max_width: Optional[int] = None) -> Tuple[int, int, int]:
    """
    Check or reformat every grid table in one file.

    Args:
        file_path: Path to the file.
        mode: 'check' (report only) or 'format' (rewrite the file in place).
            Any other value only counts the tables.
        verbose: In 'format' mode, print what is being fixed for each table.
        max_width: Maximum cell width before wrapping; None disables wrapping.

    Returns:
        (tables_found, tables_with_issues, tables_with_errors). A missing or
        unreadable file is reported as (0, 0, 1).
    """
    if not file_path.exists():
        print(f"Error: File not found: {file_path}")
        return 0, 0, 1

    try:
        tables = extract_tables_from_file(file_path)
    except (OSError, UnicodeError) as e:
        print(f"Error reading {file_path}: {e}")
        return 0, 0, 1

    tables_found = len(tables)
    tables_with_issues = 0
    tables_with_errors = 0

    if mode == 'check':
        for start_line, table_lines, _ in tables:
            parser = GridTableParser(table_lines, start_line)
            if not parser.parse():
                tables_with_errors += 1
                print(f"\n{file_path}:{start_line + 1}: Table validation errors:")
                for issue in parser.issues:
                    print(f" {issue.severity.upper()}: {issue.message}")
                continue

            is_valid, warnings = validate_table(parser, max_width)
            if not is_valid:
                tables_with_issues += 1
                # Findings are always shown in check mode, verbose or not.
                print(f"\n{file_path}:{start_line + 1}: Table formatting issues:")
                for warning in warnings:
                    print(f" - {warning}")

    elif mode == 'format':
        lines = file_path.read_text(encoding='utf-8').split('\n')
        new_lines = []
        cursor = 0  # index of the next source line not yet copied

        for start_line, table_lines, end_line in tables:
            # Copy everything between the previous table and this one.
            new_lines.extend(lines[cursor:start_line])
            cursor = end_line

            parser = GridTableParser(table_lines, start_line)
            if not parser.parse():
                # Invalid tables cannot be formatted; keep them as found.
                tables_with_errors += 1
                print(f"\n{file_path}:{start_line + 1}: Cannot format (validation errors):")
                for issue in parser.issues:
                    print(f" {issue.severity.upper()}: {issue.message}")
                new_lines.extend(table_lines)
                continue

            # Validate with the same max_width used for formatting, so a table
            # that only needs wrapping still counts and the file gets written.
            is_valid, warnings = validate_table(parser, max_width)
            if not is_valid:
                tables_with_issues += 1
                if verbose:
                    print(f"\n{file_path}:{start_line + 1}: Formatting table...")
                    for warning in warnings:
                        print(f" Fixing: {warning}")

            new_lines.extend(format_table_lines(parser, max_width))

        # Copy whatever follows the last table.
        new_lines.extend(lines[cursor:])

        # Only touch the file when at least one table actually changes.
        if tables_with_issues > 0:
            file_path.write_text('\n'.join(new_lines), encoding='utf-8')
            print(f"{file_path}: Formatted {tables_with_issues} tables")

    return tables_found, tables_with_issues, tables_with_errors
def main():
    """
    Command-line entry point.

    Parses the arguments, selects the files to process, runs ``process_file``
    on each and prints a summary.

    Returns:
        An integer exit status (see ``ExitCode``):
        0 on success, or when nothing was selected and help was printed;
        1 when check mode found formatting issues;
        2 when a table could not be parsed or a file could not be read;
        3 when the requested file or directory, or the ``--all`` content
          directory, does not exist.
    """
    arg_parser = argparse.ArgumentParser(
        description='Production table formatter for MLSysBook',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__
    )

    # File/directory selection (consistent with other scripts)
    file_group = arg_parser.add_mutually_exclusive_group()
    file_group.add_argument('-f', '--file', type=str,
                            help='Process a specific .qmd file')
    file_group.add_argument('-d', '--directory', type=str,
                            help='Process all .qmd files in a directory recursively')
    # NOTE(review): this help text says "vol1" but the code below scans
    # quarto/contents/core - confirm which one is intended.
    file_group.add_argument('--all', action='store_true',
                            help='Process all .qmd files in quarto/contents/vol1')

    # Action selection
    action_group = arg_parser.add_mutually_exclusive_group(required=False)
    action_group.add_argument('--check', action='store_true',
                              help='Check formatting only (default)')
    action_group.add_argument('--fix', action='store_true',
                              help='Fix table formatting in place')

    # Options
    arg_parser.add_argument('--max-width', type=int, default=None,
                            help='Maximum cell width before wrapping (default: no wrapping)')
    arg_parser.add_argument('--verbose', '-v', action='store_true', help='Verbose output')

    args = arg_parser.parse_args()

    # Checking is the default action.
    mode = 'format' if args.fix else 'check'

    # Script is at book/tools/scripts/content/format_tables.py, need 5 parents to get to repo root
    script_path = Path(__file__).resolve()
    workspace_root = script_path.parent.parent.parent.parent.parent

    def _resolve(raw: str) -> Path:
        """Resolve a user path: absolute as given, else under the workspace
        root, else (if only that exists) relative to the current directory."""
        candidate = Path(raw)
        if candidate.is_absolute():
            return candidate
        rooted = workspace_root / candidate
        if not rooted.exists() and candidate.exists():
            return candidate.resolve()
        return rooted

    files_to_process = []
    if args.file:
        file_path = _resolve(args.file)
        if not file_path.exists():
            # A missing input is a file error (exit code 3), not a table
            # validation error.
            print(f"Error: File not found: {file_path}")
            return ExitCode.FILE_ERRORS.value
        files_to_process = [file_path]
    elif args.directory:
        dir_path = _resolve(args.directory)
        if not dir_path.exists():
            print(f"Error: Directory not found: {dir_path}")
            return ExitCode.FILE_ERRORS.value
        files_to_process = sorted(dir_path.rglob('*.qmd'))
    elif args.all:
        core_path = workspace_root / 'quarto' / 'contents' / 'core'
        if not core_path.exists():
            print(f"Error: {core_path} does not exist")
            return ExitCode.FILE_ERRORS.value
        files_to_process = sorted(core_path.rglob('*.qmd'))
    else:
        arg_parser.print_help()
        return ExitCode.SUCCESS.value

    # Process files and accumulate the per-file counters.
    total_tables = 0
    total_issues = 0
    total_errors = 0
    for file_path in files_to_process:
        tables, issues, errors = process_file(file_path, mode, args.verbose, args.max_width)
        total_tables += tables
        total_issues += issues
        total_errors += errors

    print(f"\nSummary: {total_tables} tables, {total_issues} with formatting issues, {total_errors} with errors")

    # Errors outrank formatting issues; issues only fail the run in check
    # mode, because format mode has just fixed them.
    if total_errors > 0:
        return ExitCode.VALIDATION_ERRORS.value
    if total_issues > 0 and mode == 'check':
        return ExitCode.FORMATTING_ISSUES.value
    return ExitCode.SUCCESS.value
if __name__ == '__main__':
    # Hand main()'s integer result to the interpreter as the exit status.
    raise SystemExit(main())