#!/usr/bin/env python3
"""
Production Table Formatter for MLSysBook

This script formats and validates grid tables in Quarto .qmd files.
Designed for use in CI/CD pipelines and pre-commit hooks.

Key Features:
- Smart column header bolding (always, including multiline headers)
- Intelligent first column bolding (based on content analysis)
- Proper spacing calculation accounting for bold markers
- Handles multiline headers, multiline cells, empty cells, and Unicode
- Comprehensive validation with detailed error reporting
- Exit codes suitable for CI/CD integration

Usage:
    # Check single file
    python format_tables.py --check -f quarto/contents/vol1/efficient_ai/efficient_ai.qmd

    # Fix single file
    python format_tables.py --fix -f quarto/contents/vol1/efficient_ai/efficient_ai.qmd

    # Check all files in a directory
    python format_tables.py --check -d quarto/contents/vol1/optimizations

    # Fix all chapter files
    python format_tables.py --fix --all

    # With text wrapping
    python format_tables.py --fix --all --max-width 60

Exit Codes:
    0: Success (all tables properly formatted)
    1: Formatting issues found
    2: Validation errors (structural problems)
    3: File errors
"""

import argparse
import sys
import unicodedata
from pathlib import Path
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass
from enum import Enum


class ExitCode(Enum):
    """Exit codes for CI/CD integration."""
    SUCCESS = 0
    FORMATTING_ISSUES = 1
    VALIDATION_ERRORS = 2
    FILE_ERRORS = 3


@dataclass
class TableIssue:
    """Represents an issue found in a table."""
    line_num: int
    severity: str  # 'error' or 'warning'
    message: str


class GridTableParser:
    """Parser for grid-style markdown tables.

    The expected layout is: a top border line starting with '+', one or more
    header rows starting with '|', a separator line starting with '+' that
    carries the alignment markers, then data rows starting with '|' that may
    be followed by border lines.
    """

    def __init__(self, lines: List[str], start_line: int = 0):
        """
        Store the table lines; nothing is parsed until parse() is called.

        Args:
            lines: The lines of one table, top border first.
            start_line: Offset added to the index of a line within `lines`
                when reporting the line number of an issue.
        """
        self.lines = lines
        self.start_line = start_line
        self.issues: List[TableIssue] = []

        # Parsed components
        self.header_rows: List[List[str]] = []  # All header rows (multiline headers)
        self.header_cells: List[str] = []  # Kept for backward compatibility (first row)
        self.data_rows: List[List[str]] = []
        self.alignments: List[str] = []
        self.num_columns = 0
        # One entry per data row: True if a border line follows that row
        self.row_has_border_after: List[bool] = []

    def _line_starts_with(self, idx: int, prefix: str) -> bool:
        """Return True if line `idx` exists and, once stripped, starts with `prefix`."""
        return idx < len(self.lines) and self.lines[idx].strip().startswith(prefix)

    def _add_error(self, idx: int, message: str) -> None:
        """Record an error for the line at index `idx` within the table."""
        self.issues.append(TableIssue(self.start_line + idx, 'error', message))

    def parse(self) -> bool:
        """
        Parse the table.

        Returns True if successful, False otherwise.
        Issues are stored in self.issues (they accumulate; this method only
        appends to that list).
        """
        # Start from a clean slate so that calling parse() again does not
        # append to the rows collected by a previous call.
        self.header_rows = []
        self.header_cells = []
        self.data_rows = []
        self.alignments = []
        self.num_columns = 0
        self.row_has_border_after = []

        if not self._line_starts_with(0, '+'):
            self._add_error(0, 'Table must start with border line (+----+...)')
            return False

        try:
            # Skip first border
            idx = 1

            # Parse header rows (may be multiline)
            if not self._line_starts_with(idx, '|'):
                self._add_error(idx, 'Expected header row after top border')
                return False

            # Read all header rows until we hit the separator
            while self._line_starts_with(idx, '|'):
                header_row = self._parse_row(self.lines[idx])
                if not self.header_rows:
                    # The first header row fixes the column count
                    self.num_columns = len(header_row)
                    self.header_cells = header_row  # For backward compatibility
                elif len(header_row) != self.num_columns:
                    self._add_error(
                        idx,
                        f'Header row has {len(header_row)} columns, expected {self.num_columns}'
                    )
                    return False
                self.header_rows.append(header_row)
                idx += 1

            # Defensive: the check before the loop already guarantees one row
            if not self.header_rows:
                self._add_error(idx, 'No header rows found')
                return False

            # Parse separator with alignments
            if not self._line_starts_with(idx, '+'):
                self._add_error(idx, 'Expected separator with alignment markers (+:===+...)')
                return False

            self.alignments = self._extract_alignments(self.lines[idx])
            if len(self.alignments) != self.num_columns:
                self._add_error(
                    idx,
                    f'Alignment count ({len(self.alignments)}) != column count ({self.num_columns})'
                )
                return False
            idx += 1

            # Parse data rows
            while idx < len(self.lines):
                line = self.lines[idx].strip()
                if line.startswith('|'):
                    cells = self._parse_row(line)
                    if len(cells) != self.num_columns:
                        self._add_error(
                            idx,
                            f'Row has {len(cells)} columns, expected {self.num_columns}'
                        )
                        return False
                    self.data_rows.append(cells)
                    idx += 1

                    # Check if next line is a border
                    if self._line_starts_with(idx, '+'):
                        self.row_has_border_after.append(True)
                        idx += 1  # Skip the border
                        if not self._line_starts_with(idx, '|'):
                            # No row after the border: end of table
                            break
                    else:
                        self.row_has_border_after.append(False)
                elif line.startswith('+'):
                    # Unexpected border (the branch above normally consumes borders)
                    idx += 1
                    if not self._line_starts_with(idx, '|'):
                        break
                else:
                    # Anything else ends the table
                    break

            return True

        except Exception as e:
            # Boundary handler: report any unexpected failure as a table error
            self.issues.append(TableIssue(
                self.start_line,
                'error',
                f'Parsing error: {str(e)}'
            ))
            return False

    def _parse_row(self, row: str) -> List[str]:
        """Parse a table row into cells.

        The outer '|' characters are dropped, the rest is split on '|', and
        each cell is stripped of surrounding whitespace.
        """
        row = row.strip()
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    def _extract_alignments(self, separator: str) -> List[str]:
        """Extract alignment from separator line.

        Each non-empty segment between '+' characters yields one entry:
        'center' for colons on both ends, 'left' for a leading colon,
        'right' for a trailing colon, and 'left' when there is no colon.
        """
        parts = separator.strip().split('+')[1:-1]
        alignments = []
        for part in parts:
            part = part.strip()
            if not part:
                continue
            has_left = part.startswith(':')
            has_right = part.endswith(':')
            if has_left and has_right:
                alignments.append('center')
            elif has_right:
                alignments.append('right')
            else:
                # Leading colon or no marker at all: left-aligned
                alignments.append('left')
        return alignments
def display_width(text: str) -> int:
    """
    Calculate display width of text.

    Bold markers (**) don't count toward width.
    Unicode wide characters count as 2.
    Combining marks (Unicode categories Mn and Me) count as 0 because they
    are drawn on top of the preceding character.
    """
    # Remove bold markers
    clean_text = text.replace('**', '')

    width = 0
    for char in clean_text:
        if unicodedata.category(char) in ('Mn', 'Me'):
            # Nonspacing / enclosing mark: occupies no column of its own
            continue
        if unicodedata.east_asian_width(char) in ('F', 'W'):  # Fullwidth or Wide
            width += 2
        else:
            width += 1
    return width


def is_bolded(text: str) -> bool:
    """Check if text is already bolded.

    True when the stripped text starts and ends with '**' and is longer than
    four characters, i.e. there is something between the markers.
    """
    text = text.strip()
    return (text.startswith('**') and text.endswith('**') and len(text) > 4)


def add_bold(text: str) -> str:
    """Add bold markers to text if not already bolded. Returns empty string for empty text."""
    text = text.strip()
    if not text:
        return ''
    if is_bolded(text):
        return text
    return f"**{text}**"


def remove_bold(text: str) -> str:
    """Remove bold markers from text.

    The text is stripped first; only one outer pair of '**' is removed.
    """
    text = text.strip()
    if is_bolded(text):
        return text[2:-2]
    return text


def detect_column_alignments(header_rows: List[List[str]], data_rows: List[List[str]]) -> List[str]:
    """
    Detect optimal alignment for each column based on content.

    Rules:
    - FIRST COLUMN: Always left-aligned (book style guide requirement)
    - Numeric columns (>70% of non-empty cells contain a digit): right-aligned
    - Everything else: left-aligned

    A cell counts as numeric as soon as it contains at least one digit, so
    values with units ("3.5 GB") qualify, but so do labels such as "ResNet-50".

    Returns one alignment per column of the first header row; an empty list
    when there are no header rows, and all 'left' when there are no data rows.
    """
    if not header_rows:
        return []

    num_columns = len(header_rows[0])
    if not data_rows:
        return ['left'] * num_columns

    alignments = []
    for col_idx in range(num_columns):
        # RULE: First column is ALWAYS left-aligned (book style guide)
        if col_idx == 0:
            alignments.append('left')
            continue

        # Collect the non-empty values of this column; short rows are skipped
        column_values = []
        for row in data_rows:
            if col_idx >= len(row):
                continue
            cell = row[col_idx].strip()
            # Remove bold markers for analysis
            if cell.startswith('**') and cell.endswith('**'):
                cell = cell[2:-2].strip()
            if cell:
                column_values.append(cell)

        if not column_values:
            alignments.append('left')
            continue

        # Formatting characters, signs and units are irrelevant to the test:
        # the only thing that matters is whether any digit is present.
        numeric_count = sum(
            1 for value in column_values if any(c.isdigit() for c in value)
        )

        # If >70% of cells are numeric, right-align
        if numeric_count / len(column_values) > 0.7:
            alignments.append('right')
        else:
            alignments.append('left')

    return alignments


def should_bold_first_column(header_cells: List[str], data_rows: List[List[str]]) -> bool:
    """
    Determine if first column should be bolded based on intelligent analysis.

    Decision order:
    1. No header cells: False.
    2. The first header contains one of the bold indicator keywords
       (substring match, so plurals such as "Techniques" match): True.
    3. The first header contains '#' or one of the no-bold keywords as a
       whole word (simple plurals such as "IDs" or "Indexes" included): False.
       Whole-word matching keeps 'id' from firing on "Provider" or "Width".
    4. More than 70% of the non-empty first-column values parse as numbers
       (after dropping '%', '$' and ','): False.
    5. Otherwise: True (comparison-style tables are the default).
    """
    if not header_cells:
        return False

    first_header = remove_bold(header_cells[0]).lower()

    # Keywords that indicate first column should be bolded
    bold_indicators = [
        'aspect', 'technique', 'category', 'architecture', 'challenge',
        'criterion', 'criteria', 'feature', 'characteristic', 'dimension',
        'metric', 'property', 'attribute', 'method', 'approach', 'strategy',
        'type', 'principle', 'factor', 'component', 'element', 'term',
        'concept', 'deployment context', 'system aspect', 'design pattern',
        'era', 'role', 'threat type', 'mechanism', 'resource type',
        'storage tier', 'stage'
    ]

    # Check if header matches bold indicators
    if any(indicator in first_header for indicator in bold_indicators):
        return True

    # Keywords that indicate DON'T bold
    no_bold_words = {
        'id', 'number', 'index', 'rank', 'year', 'date', 'time', 'count', 'order'
    }

    # '#' is a symbol, not a word, so a plain containment test is safe for it
    if '#' in first_header:
        return False

    # Split the header into alphanumeric words and compare whole words only
    header_words = ''.join(c if c.isalnum() else ' ' for c in first_header).split()
    for word in header_words:
        if word in no_bold_words:
            return False
        if word.endswith('s') and word[:-1] in no_bold_words:
            return False
        if word.endswith('es') and word[:-2] in no_bold_words:
            return False

    # Analyze first column content
    if not data_rows:
        return True  # Default to bolding if no data

    first_col_values = [row[0] for row in data_rows if row and row[0].strip()]
    if not first_col_values:
        return True

    # Check if mostly numeric (data table)
    numeric_count = 0
    for value in first_col_values:
        clean = remove_bold(value).replace('%', '').replace('$', '').replace(',', '').strip()
        try:
            float(clean)
        except ValueError:
            continue
        numeric_count += 1

    if numeric_count > len(first_col_values) * 0.7:
        return False

    # Default: bold for comparison-style tables. Descriptive multi-word
    # values and everything else share this outcome, so no further check.
    return True
def calculate_column_widths(parser: "GridTableParser", bold_headers: bool = True,
                            bold_first_col: bool = False) -> List[int]:
    """
    Calculate required width for each column, accounting for bolding.

    Args:
        parser: Parsed table; num_columns, header_rows (or header_cells when
            header_rows is empty) and data_rows are read.
        bold_headers: Measure non-empty header cells as if they were bolded.
        bold_first_col: Measure non-empty first-column data cells as if they
            were bolded.

    Returns:
        One display width per column: the widest header or data cell.
    """
    widths = [0] * parser.num_columns

    # Every header row counts, not just the first one: a multiline header can
    # have its widest cell on a later row.
    header_rows = parser.header_rows or [parser.header_cells]

    # Header widths (with potential bolding)
    for header_row in header_rows:
        for i, cell in enumerate(header_row):
            text = cell
            if bold_headers and not is_bolded(cell) and cell.strip():
                text = add_bold(cell)
            widths[i] = max(widths[i], display_width(text))

    # Data row widths
    for row in parser.data_rows:
        for i, cell in enumerate(row):
            text = cell
            # First column might need bolding (but not if empty - multiline cells)
            if i == 0 and bold_first_col and cell.strip() and not is_bolded(cell):
                text = add_bold(cell)
            widths[i] = max(widths[i], display_width(text))

    return widths


def build_border(widths: List[int]) -> str:
    """Build border line: +----+----+----+

    Each segment is the column width plus 2 dashes (one padding space on
    either side of the cell content).
    """
    parts = ['-' * (w + 2) for w in widths]  # +2 for padding spaces
    return '+' + '+'.join(parts) + '+'


def build_separator(widths: List[int], alignments: List[str]) -> str:
    """Build separator line: +:===+:===:+====:+

    The separator must match the border length exactly.
    Border segment for width W: '-' * (W + 2) [+2 for padding spaces]
    So separator segment must also be length (W + 2).

    One segment is produced per entry in `widths`. A column with no entry in
    `alignments`, or with an unrecognised value, gets a segment without
    colons, so the separator always has as many columns as the border.
    """
    parts = []
    for idx, width in enumerate(widths):
        align = alignments[idx] if idx < len(alignments) else None
        if align == 'center':
            # :===: format - colon + equals + colon = W+2
            parts.append(':' + '=' * width + ':')
        elif align == 'left':
            # :==== format - colon + equals = W+2
            parts.append(':' + '=' * (width + 1))
        elif align == 'right':
            # ====: format - equals + colon = W+2
            parts.append('=' * (width + 1) + ':')
        else:
            # ===== format - no alignment markers = W+2
            parts.append('=' * (width + 2))
    return '+' + '+'.join(parts) + '+'
  • ,
  • ,