Files
cs249r_book/book/tools/scripts/content/format_div_spacing.py
Vijay Janapa Reddi 7ad6d51f96 Update two-volume textbook content, config, and tooling
- Edit all Vol 1 and Vol 2 chapters for print readiness and pedagogical clarity
- Update Quarto config files for both volumes (PDF, HTML, EPUB)
- Add frontmatter updates (about, acknowledgements, socratiq)
- Remove unused _brand assets (scss, favicon, scripts, manifest)
- Add new utility scripts (audit_figure_placement, format_div_spacing, audit_refs)
- Update format_python_in_qmd script
- Add references.bib entries and seminal papers corpus
2026-01-30 02:42:59 -05:00

362 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Script to ensure proper blank line spacing inside Quarto div blocks (callouts).
Problem: When content inside a div (like a callout) has a paragraph immediately
followed by a list without a blank line between them, Pandoc may render them
incorrectly in PDF output (content gets "mushed" together on one line).
Additionally, blank lines BETWEEN list items create "loose lists" with extra
spacing, which is usually undesirable.
Solution: This script detects and fixes these patterns by ensuring proper
blank lines exist between block elements inside divs, while removing
unnecessary blank lines between list items.
Patterns fixed:
1. Paragraph → List: Ensures blank line between a paragraph and a list
2. List → Paragraph: Ensures blank line between end of list and new paragraph
3. Bold header → List: Ensures blank line after standalone bold headers before lists
4. List item → blank → List item: Removes blank line to create tight list
Usage:
python format_div_spacing.py -f <file.qmd> # Process single file
python format_div_spacing.py -d <directory> # Process directory
python format_div_spacing.py -f <file.qmd> --check # Check only (no changes)
python format_div_spacing.py -f <file.qmd> --verbose # Show details
"""
import argparse
import os
import re
import sys
from dataclasses import dataclass
from enum import Enum
from typing import Optional
class LineType(Enum):
    """Category assigned to a single line of markdown for spacing analysis.

    The spacing checks in this file work by comparing the category of a line
    with the category of its neighbouring lines.
    """

    # Whitespace-only line.
    BLANK = "blank"

    # Fenced-div boundaries (lines beginning with ``:::``).
    DIV_START = "div_start"
    DIV_END = "div_end"

    # Block-level content.
    LIST_ITEM = "list_item"
    PARAGRAPH = "paragraph"
    CODE_FENCE = "code_fence"
    HEADER = "header"

    # A line that consists solely of **Bold Text** (optionally with a colon).
    BOLD_HEADER = "bold_header"

    # Catch-all member; nothing in the visible code produces it.
    OTHER = "other"
@dataclass
class SpacingIssue:
    """A single spacing problem located in a file.

    Attributes:
        line_number: 1-indexed line number the issue is reported at.
        issue_type: Short identifier for the kind of problem, for example
            ``"missing_blank_before_list"``.
        before_line: Text of the line preceding the problem spot.
        after_line: Text of the line following the problem spot.
        context: Human-readable description of where the issue occurs.
    """

    line_number: int
    issue_type: str
    before_line: str
    after_line: str
    context: str
def classify_line(line: str) -> LineType:
    """Classify a single line by its markdown role.

    Surrounding whitespace is ignored, so indented (nested) list items are
    still reported as list items. The checks run in a fixed order and the
    first match wins: blank, div fence, code fence, header, list item,
    standalone bold header, and finally paragraph as the fallback.

    Args:
        line: One line of text, without its trailing newline.

    Returns:
        The ``LineType`` member describing the line.
    """
    stripped = line.strip()
    if not stripped:
        return LineType.BLANK

    # Div boundaries. Pandoc allows a fence of *any* length >= 3, and nested
    # divs are commonly closed with five or more colons, so a closing fence
    # is recognised as "nothing but colons" rather than by an exact length.
    # An opening fence always carries a class name or attributes.
    if stripped.startswith(":::"):
        if not stripped.strip(":"):
            return LineType.DIV_END
        return LineType.DIV_START

    # Code fences (opening or closing; callers toggle on each occurrence).
    if stripped.startswith("```"):
        return LineType.CODE_FENCE

    # Headers
    if stripped.startswith("#"):
        return LineType.HEADER

    # List items: a -, * or + bullet, or "<digits>." followed by whitespace.
    if re.match(r'^[-*+]\s', stripped) or re.match(r'^\d+\.\s', stripped):
        return LineType.LIST_ITEM

    # Bold header lines: the entire line is just bold text, optionally
    # followed by a colon, e.g. "**Some Header**" or "**Some Header**:".
    if re.match(r'^\*\*[^*]+\*\*:?\s*$', stripped):
        return LineType.BOLD_HEADER

    # Anything else is treated as regular paragraph text.
    return LineType.PARAGRAPH
def find_spacing_issues(content: str) -> list[SpacingIssue]:
    """Scan *content* and report spacing problems found inside div blocks.

    Lines outside any div, and lines inside fenced code blocks, are never
    reported. Each remaining line is compared with the line directly above
    it (and, for loose lists, the line above that).

    Args:
        content: Full text of a document, with ``\\n`` line separators.

    Returns:
        The issues found, in the order they occur in the document.
    """
    all_lines = content.split('\n')
    found: list[SpacingIssue] = []
    depth = 0
    inside_fence = False

    for idx, current in enumerate(all_lines):
        kind = classify_line(current)

        # A fence line toggles code-block state and is never checked itself.
        if kind is LineType.CODE_FENCE:
            inside_fence = not inside_fence
            continue
        if inside_fence:
            continue

        # Div fences only adjust the nesting depth.
        if kind is LineType.DIV_START:
            depth += 1
            continue
        if kind is LineType.DIV_END:
            depth = max(depth - 1, 0)
            continue

        # Nothing to compare outside a div or on the very first line.
        if depth == 0 or idx == 0:
            continue

        previous = all_lines[idx - 1]
        previous_kind = classify_line(previous)

        if kind is LineType.LIST_ITEM:
            if previous_kind in (LineType.BOLD_HEADER, LineType.PARAGRAPH):
                # Bold header or paragraph runs straight into a list.
                found.append(SpacingIssue(
                    line_number=idx + 1,  # 1-indexed
                    issue_type="missing_blank_before_list",
                    before_line=previous.strip(),
                    after_line=current.strip(),
                    context=f"Line {idx}: {previous.strip()[:50]}...",
                ))
            elif previous_kind is LineType.BLANK and idx >= 2:
                # Loose list: item, one blank line, item.
                two_back = all_lines[idx - 2]
                if classify_line(two_back) is LineType.LIST_ITEM:
                    found.append(SpacingIssue(
                        line_number=idx,  # the blank line, 1-indexed
                        issue_type="loose_list_blank_line",
                        before_line=two_back.strip(),
                        after_line=current.strip(),
                        context=f"Line {idx - 1}: (blank line between list items)",
                    ))
        elif kind is LineType.PARAGRAPH and previous_kind is LineType.LIST_ITEM:
            # List runs straight into a paragraph.
            found.append(SpacingIssue(
                line_number=idx + 1,
                issue_type="missing_blank_after_list",
                before_line=previous.strip(),
                after_line=current.strip(),
                context=f"Line {idx}: {previous.strip()[:50]}...",
            ))

    return found
def fix_spacing(content: str) -> str:
    """Return *content* with the spacing inside div blocks normalised.

    Inside a div (and outside fenced code blocks) three adjustments are made:
    a single blank line sitting between two list items is dropped, a blank
    line is inserted before a list item that directly follows a paragraph or
    bold header, and a blank line is inserted before a paragraph that
    directly follows a list item. Every other line is copied through as is.

    Args:
        content: Full text of a document, with ``\\n`` line separators.

    Returns:
        The adjusted document text.
    """
    source_lines = content.split('\n')
    last_index = len(source_lines) - 1
    fixed: list[str] = []
    depth = 0
    inside_fence = False

    for idx, current in enumerate(source_lines):
        kind = classify_line(current)

        if kind is LineType.CODE_FENCE:
            inside_fence = not inside_fence
        elif inside_fence:
            pass  # code block contents are preserved verbatim
        elif kind is LineType.DIV_START:
            depth += 1
        elif kind is LineType.DIV_END:
            depth = max(depth - 1, 0)
        elif depth > 0:
            # Compare against what has been emitted so far, so that blank
            # lines inserted or dropped earlier are taken into account.
            previous_kind = classify_line(fixed[-1]) if fixed else None

            if kind is LineType.BLANK:
                next_is_item = (
                    idx < last_index
                    and classify_line(source_lines[idx + 1]) is LineType.LIST_ITEM
                )
                if previous_kind is LineType.LIST_ITEM and next_is_item:
                    continue  # tighten the list: drop this blank line
            elif kind is LineType.LIST_ITEM:
                if previous_kind in (LineType.BOLD_HEADER, LineType.PARAGRAPH):
                    fixed.append('')
            elif kind is LineType.PARAGRAPH:
                if previous_kind is LineType.LIST_ITEM:
                    fixed.append('')

        fixed.append(current)

    return '\n'.join(fixed)
def process_file(filepath: str, check_only: bool = False, verbose: bool = False) -> tuple[bool, list[SpacingIssue]]:
    """Check one file for spacing issues and, unless told not to, fix it.

    Args:
        filepath: Path of the file to read (as UTF-8) and possibly rewrite.
        check_only: When true, the file is never written.
        verbose: When true, each issue found is printed.

    Returns:
        A ``(was_modified, issues_found)`` pair. ``was_modified`` is true
        only when the file was actually rewritten.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        original = handle.read()

    issues = find_spacing_issues(original)

    if verbose and issues:
        print(f"\n{filepath}:")
        for issue in issues:
            print(f" Line {issue.line_number}: {issue.issue_type}")
            print(f" Before: {issue.before_line[:60]}...")
            print(f" After: {issue.after_line[:60]}...")

    # Nothing to write when only checking or when the file is already clean.
    if check_only or not issues:
        return False, issues

    updated = fix_spacing(original)
    if updated == original:
        return False, issues

    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(updated)
    return True, issues
def process_directory(directory: str, check_only: bool = False, verbose: bool = False) -> tuple[list[str], int]:
    """Run ``process_file`` on every ``.qmd`` file beneath *directory*.

    Args:
        directory: Root of the tree to walk.
        check_only: Passed through to ``process_file``.
        verbose: Passed through to ``process_file``.

    Returns:
        A ``(modified_files, total_issues)`` pair: the paths that were
        rewritten and the number of issues found across all files.
    """
    modified_files: list[str] = []
    total_issues = 0

    for root, _subdirs, names in os.walk(directory):
        for name in names:
            if not name.endswith('.qmd'):
                continue

            filepath = os.path.join(root, name)
            was_modified, issues = process_file(filepath, check_only, verbose)
            issue_count = len(issues)
            total_issues += issue_count

            if was_modified:
                modified_files.append(filepath)
                print(f"Modified: {filepath} ({issue_count} issues fixed)")
            elif issues and check_only:
                print(f"Issues found: {filepath} ({issue_count} issues)")

    return modified_files, total_issues
def main():
    """Command-line entry point: parse arguments, process inputs, and exit.

    ``-f`` and ``-d`` may be combined; the results of both are added together
    for the summary and for the exit status.

    Exit status:
        0 -- nothing was reported (check mode) or nothing was modified.
        1 -- no input given, a path does not exist, issues were found in
             ``--check`` mode, or at least one file was modified.
    """
    parser = argparse.ArgumentParser(
        description="Fix spacing inside Quarto div blocks (callouts) for proper PDF rendering.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Fix a single file
python format_div_spacing.py -f chapter.qmd
# Check all files in a directory (no changes)
python format_div_spacing.py -d contents/ --check
# Fix all files with verbose output
python format_div_spacing.py -d contents/ --verbose
"""
    )
    parser.add_argument('-f', '--file', help='Process a single .qmd file')
    parser.add_argument('-d', '--directory', help='Process all .qmd files in directory')
    parser.add_argument('--check', action='store_true',
                        help='Check only, do not modify files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed output for each issue')
    args = parser.parse_args()

    if not args.file and not args.directory:
        parser.print_help()
        sys.exit(1)

    # Validate every path before touching anything, so that a bad --directory
    # cannot leave the --file argument already rewritten.
    if args.file and not os.path.exists(args.file):
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        sys.exit(1)
    if args.directory and not os.path.isdir(args.directory):
        print(f"Error: Directory not found: {args.directory}", file=sys.stderr)
        sys.exit(1)

    total_issues = 0
    modified_files = []

    if args.file:
        was_modified, issues = process_file(args.file, args.check, args.verbose)
        total_issues += len(issues)
        if was_modified:
            modified_files.append(args.file)
            print(f"Modified: {args.file} ({len(issues)} issues fixed)")
        elif issues:
            if args.check:
                print(f"Issues found: {args.file} ({len(issues)} issues)")
            else:
                print(f"No changes needed: {args.file}")
        else:
            print(f"No issues found: {args.file}")

    if args.directory:
        dir_modified, dir_issues = process_directory(args.directory, args.check, args.verbose)
        # Accumulate rather than overwrite, so the single-file result above
        # still counts towards the summary and the exit status.
        modified_files.extend(dir_modified)
        total_issues += dir_issues

    # Summary
    print(f"\n{'=' * 50}")
    if args.check:
        print(f"Total issues found: {total_issues}")
        if total_issues > 0:
            sys.exit(1)
    else:
        print(f"Total files modified: {len(modified_files)}")
        print(f"Total issues fixed: {total_issues}")
        if modified_files:
            sys.exit(1)  # Pre-commit convention: exit 1 if files were modified
    sys.exit(0)
# Run the command-line interface only when executed as a script.
if __name__ == '__main__':
    main()