mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-03 08:08:51 -05:00
- Edit all Vol 1 and Vol 2 chapters for print readiness and pedagogical clarity - Update Quarto config files for both volumes (PDF, HTML, EPUB) - Add frontmatter updates (about, acknowledgements, socratiq) - Remove unused _brand assets (scss, favicon, scripts, manifest) - Add new utility scripts (audit_figure_placement, format_div_spacing, audit_refs) - Update format_python_in_qmd script - Add references.bib entries and seminal papers corpus
362 lines
12 KiB
Python
362 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Script to ensure proper blank line spacing inside Quarto div blocks (callouts).
|
|
|
|
Problem: When content inside a div (like a callout) has a paragraph immediately
|
|
followed by a list without a blank line between them, Pandoc may render them
|
|
incorrectly in PDF output (content gets "mushed" together on one line).
|
|
|
|
Additionally, blank lines BETWEEN list items create "loose lists" with extra
|
|
spacing, which is usually undesirable.
|
|
|
|
Solution: This script detects and fixes these patterns by ensuring proper
|
|
blank lines exist between block elements inside divs, while removing
|
|
unnecessary blank lines between list items.
|
|
|
|
Patterns fixed:
|
|
1. Paragraph → List: Ensures blank line between a paragraph and a list
|
|
2. List → Paragraph: Ensures blank line between end of list and new paragraph
|
|
3. Bold header → List: Ensures blank line after standalone bold headers before lists
|
|
4. List item → blank → List item: Removes blank line to create tight list
|
|
|
|
Usage:
|
|
python format_div_spacing.py -f <file.qmd> # Process single file
|
|
python format_div_spacing.py -d <directory> # Process directory
|
|
python format_div_spacing.py -f <file.qmd> --check # Check only (no changes)
|
|
python format_div_spacing.py -f <file.qmd> --verbose # Show details
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import re
|
|
import sys
|
|
from dataclasses import dataclass
|
|
from enum import Enum
|
|
from typing import Optional
|
|
|
|
|
|
class LineType(Enum):
    """Kinds of markdown line distinguished by the spacing analysis.

    Each member's value is its lowercase name, so a member can be looked up
    from its string form with ``LineType("list_item")``.
    """

    # Empty or whitespace-only line.
    BLANK = "blank"

    # Opening and closing fences of a ``:::`` div block.
    DIV_START = "div_start"
    DIV_END = "div_end"

    # Line that begins with a bullet or numbered list marker.
    LIST_ITEM = "list_item"

    # Ordinary text that matches none of the other categories.
    PARAGRAPH = "paragraph"

    # Line that begins with a triple-backtick fence.
    CODE_FENCE = "code_fence"

    # Line that begins with ``#``.
    HEADER = "header"

    # Line consisting only of **Bold Text**, optionally followed by a colon.
    BOLD_HEADER = "bold_header"

    # Catch-all member.
    OTHER = "other"
|
|
|
|
|
|
@dataclass
class SpacingIssue:
    """One spacing problem detected in a document."""

    # 1-indexed line number the issue is reported at.
    line_number: int
    # Short machine-readable tag naming the kind of problem.
    issue_type: str
    # Stripped text of the line that precedes the problem spot.
    before_line: str
    # Stripped text of the line that follows the problem spot.
    after_line: str
    # Free-form, human-readable description of where the issue sits.
    context: str
|
|
|
|
|
|
def classify_line(line: str) -> LineType:
    """Classify a single line by its markdown type.

    The line is stripped of surrounding whitespace first, so indentation
    never influences the result. Checks run in a fixed order and the first
    match wins: blank, div fence, code fence, header, list item, bold header,
    and finally paragraph as the fallback.

    Args:
        line: One line of text, without its trailing newline.

    Returns:
        The ``LineType`` member describing the line.
    """
    stripped = line.strip()

    if not stripped:
        return LineType.BLANK

    # Div boundaries. A closing fence is a line made up solely of colons
    # (three or more); anything else starting with ":::" opens a div. Testing
    # for "only colons" rather than the two literals ":::" / "::::" keeps
    # longer closing fences from being counted as a new opening fence.
    if stripped.startswith(":::"):
        if not stripped.strip(":"):
            return LineType.DIV_END
        return LineType.DIV_START

    # Code fences (opening and closing look the same to this classifier)
    if stripped.startswith("```"):
        return LineType.CODE_FENCE

    # Headers
    if stripped.startswith("#"):
        return LineType.HEADER

    # List items: "-", "*" or "+" bullets, or "<digits>." numbering, each
    # followed by whitespace
    if re.match(r'^[-*+]\s', stripped) or re.match(r'^\d+\.\s', stripped):
        return LineType.LIST_ITEM

    # Bold header lines (entire line is just bold text, possibly with colon)
    # Matches: **Some Header** or **Some Header**:
    if re.match(r'^\*\*[^*]+\*\*:?\s*$', stripped):
        return LineType.BOLD_HEADER

    # Regular paragraph
    return LineType.PARAGRAPH
|
|
|
|
|
|
def find_spacing_issues(content: str) -> list[SpacingIssue]:
    """Find spacing issues inside div blocks.

    The text is scanned line by line. Lines inside fenced code blocks and
    lines outside every ``:::`` div are ignored. Three patterns are reported:

    * ``missing_blank_before_list`` - a list item directly after a paragraph
      or bold header line.
    * ``missing_blank_after_list`` - a paragraph directly after a list item.
    * ``loose_list_blank_line`` - a single blank line between two list items.

    An indented paragraph or bold line that directly follows a list item (or
    another such line) is treated as a continuation of that item, so a
    wrapped list item is not reported as "list followed by paragraph".

    Args:
        content: Full text of the document.

    Returns:
        The issues found, in document order. ``line_number`` is 1-indexed.
    """
    lines = content.split('\n')
    issues: list[SpacingIssue] = []

    div_depth = 0
    in_code_block = False
    # list_member[k] is True when line k lies inside a div and is either a
    # list item or an indented continuation line of one.
    list_member = [False] * len(lines)

    for i, line in enumerate(lines):
        line_type = classify_line(line)

        # Track code blocks
        if line_type == LineType.CODE_FENCE:
            in_code_block = not in_code_block
            continue

        # Skip processing inside code blocks
        if in_code_block:
            continue

        # Track div depth
        if line_type == LineType.DIV_START:
            div_depth += 1
            continue
        if line_type == LineType.DIV_END:
            div_depth = max(0, div_depth - 1)
            continue

        # Only check inside divs
        if div_depth == 0:
            continue

        prev_in_list = i > 0 and list_member[i - 1]
        is_continuation = (
            prev_in_list
            and line[:1] in (' ', '\t')
            and line_type in (LineType.PARAGRAPH, LineType.BOLD_HEADER)
        )
        list_member[i] = line_type == LineType.LIST_ITEM or is_continuation

        if i == 0:
            continue

        prev_idx = i - 1
        prev_type = classify_line(lines[prev_idx])

        # Issue 1: Bold header or paragraph immediately followed by list item
        # (no blank line between). A continuation line of the previous item
        # does not count as a paragraph here.
        if (
            line_type == LineType.LIST_ITEM
            and not prev_in_list
            and prev_type in (LineType.BOLD_HEADER, LineType.PARAGRAPH)
        ):
            issues.append(SpacingIssue(
                line_number=i + 1,  # 1-indexed
                issue_type="missing_blank_before_list",
                before_line=lines[prev_idx].strip(),
                after_line=line.strip(),
                context=f"Line {prev_idx + 1}: {lines[prev_idx].strip()[:50]}..."
            ))

        # Issue 2: List content immediately followed by a new paragraph
        # (no blank line)
        if line_type == LineType.PARAGRAPH and prev_in_list and not is_continuation:
            issues.append(SpacingIssue(
                line_number=i + 1,
                issue_type="missing_blank_after_list",
                before_line=lines[prev_idx].strip(),
                after_line=line.strip(),
                context=f"Line {prev_idx + 1}: {lines[prev_idx].strip()[:50]}..."
            ))

        # Issue 3: Loose list - blank line between list items
        if (
            i >= 2
            and line_type == LineType.LIST_ITEM
            and prev_type == LineType.BLANK
            and list_member[i - 2]
        ):
            issues.append(SpacingIssue(
                line_number=i,  # The blank line (index i - 1, so 1-indexed i)
                issue_type="loose_list_blank_line",
                before_line=lines[i - 2].strip(),
                after_line=line.strip(),
                # Same 1-indexed number as line_number above.
                context=f"Line {i}: (blank line between list items)"
            ))

    return issues
|
|
|
|
|
|
def fix_spacing(content: str) -> str:
    """Fix spacing issues inside div blocks.

    Lines inside fenced code blocks and lines outside every ``:::`` div are
    copied through unchanged. Inside a div:

    * a blank line is inserted before a list item that directly follows a
      paragraph or bold header line;
    * a blank line is inserted before a paragraph that directly follows list
      content;
    * a blank line sitting between list content and the next list item is
      dropped, producing a tight list.

    An indented paragraph or bold line that directly follows a list item (or
    another such line) is treated as a continuation of that item and is kept
    attached to it; no blank line is inserted before it or before the list
    item that follows it.

    Args:
        content: Full text of the document.

    Returns:
        The text with the spacing fixes applied, joined with ``"\\n"``.
    """
    lines = content.split('\n')
    result: list[str] = []

    div_depth = 0
    in_code_block = False
    # True while the most recently emitted line inside a div belongs to a
    # list item: the item line itself or one of its indented continuations.
    tail_in_list = False

    for i, line in enumerate(lines):
        line_type = classify_line(line)

        # Track code blocks
        if line_type == LineType.CODE_FENCE:
            in_code_block = not in_code_block
            result.append(line)
            tail_in_list = False
            continue

        # Preserve content inside code blocks
        if in_code_block:
            result.append(line)
            tail_in_list = False
            continue

        # Track div depth; fence lines themselves are copied verbatim
        if line_type == LineType.DIV_START:
            div_depth += 1
            result.append(line)
            tail_in_list = False
            continue
        if line_type == LineType.DIV_END:
            div_depth = max(0, div_depth - 1)
            result.append(line)
            tail_in_list = False
            continue

        # Only fix inside divs
        if div_depth == 0:
            result.append(line)
            tail_in_list = False
            continue

        if line_type == LineType.BLANK:
            next_is_item = (
                i + 1 < len(lines)
                and classify_line(lines[i + 1]) == LineType.LIST_ITEM
            )
            # Fix loose lists: drop a blank line between list content and the
            # next item. tail_in_list stays True because the last emitted
            # line is still list content.
            if tail_in_list and next_is_item:
                continue
            result.append(line)
            tail_in_list = False
            continue

        # Indented text directly after list content continues that item;
        # splitting it off with a blank line would change the list structure.
        if (
            tail_in_list
            and line[:1] in (' ', '\t')
            and line_type in (LineType.PARAGRAPH, LineType.BOLD_HEADER)
        ):
            result.append(line)
            continue

        if line_type == LineType.LIST_ITEM:
            prev_type = classify_line(result[-1]) if result else None
            # Insert blank line before list if previous was bold header or
            # paragraph (but not a continuation line of the previous item)
            if not tail_in_list and prev_type in (LineType.BOLD_HEADER, LineType.PARAGRAPH):
                result.append('')
            result.append(line)
            tail_in_list = True
            continue

        # Insert blank line before paragraph if previous was list content
        if line_type == LineType.PARAGRAPH and tail_in_list:
            result.append('')

        result.append(line)
        tail_in_list = False

    return '\n'.join(result)
|
|
|
|
|
|
def process_file(filepath: str, check_only: bool = False, verbose: bool = False) -> tuple[bool, list[SpacingIssue]]:
    """Scan one file for spacing issues and optionally rewrite it.

    Args:
        filepath: Path of the file to read (UTF-8).
        check_only: When true, report issues but never write to the file.
        verbose: When true, print each issue with its surrounding lines.

    Returns:
        ``(was_modified, issues_found)``. ``was_modified`` is true only when
        the file was rewritten with different content.
    """
    with open(filepath, 'r', encoding='utf-8') as src:
        original_text = src.read()

    found = find_spacing_issues(original_text)

    if verbose and found:
        print(f"\n{filepath}:")
        for item in found:
            print(f" Line {item.line_number}: {item.issue_type}")
            print(f" Before: {item.before_line[:60]}...")
            print(f" After: {item.after_line[:60]}...")

    # Nothing is written in check mode or when the scan came back clean.
    if check_only or not found:
        return False, found

    updated_text = fix_spacing(original_text)
    changed = updated_text != original_text
    if changed:
        with open(filepath, 'w', encoding='utf-8') as dst:
            dst.write(updated_text)

    return changed, found
|
|
|
|
|
|
def process_directory(directory: str, check_only: bool = False, verbose: bool = False) -> tuple[list[str], int]:
    """Run ``process_file`` on every ``.qmd`` file below a directory.

    The tree is walked recursively with ``os.walk``. A line is printed for
    each file that was modified, and in check mode for each file with issues.

    Args:
        directory: Root of the tree to walk.
        check_only: Passed through to ``process_file``.
        verbose: Passed through to ``process_file``.

    Returns:
        ``(modified_files, total_issues)``: the paths that were rewritten and
        the number of issues found across all processed files.
    """
    changed_paths = []
    issue_count = 0

    for dirpath, _, filenames in os.walk(directory):
        for name in filenames:
            if not name.endswith('.qmd'):
                continue

            path = os.path.join(dirpath, name)
            modified, found = process_file(path, check_only, verbose)
            issue_count += len(found)

            if modified:
                changed_paths.append(path)
                print(f"Modified: {path} ({len(found)} issues fixed)")
            elif found and check_only:
                print(f"Issues found: {path} ({len(found)} issues)")

    return changed_paths, issue_count
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, processes the file given with ``-f`` and/or the
    directory given with ``-d``, prints a summary and exits. When both
    options are supplied, the results of both are added together in the
    summary and exit status.

    Exit status:
        1 when neither ``-f`` nor ``-d`` is given, when the given path does
        not exist, when ``--check`` finds issues, or when files were modified;
        0 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Fix spacing inside Quarto div blocks (callouts) for proper PDF rendering.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Fix a single file
python format_div_spacing.py -f chapter.qmd

# Check all files in a directory (no changes)
python format_div_spacing.py -d contents/ --check

# Fix all files with verbose output
python format_div_spacing.py -d contents/ --verbose
"""
    )
    parser.add_argument('-f', '--file', help='Process a single .qmd file')
    parser.add_argument('-d', '--directory', help='Process all .qmd files in directory')
    parser.add_argument('--check', action='store_true',
                        help='Check only, do not modify files')
    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Show detailed output for each issue')

    args = parser.parse_args()

    if not args.file and not args.directory:
        parser.print_help()
        sys.exit(1)

    total_issues = 0
    modified_files = []

    if args.file:
        if not os.path.exists(args.file):
            print(f"Error: File not found: {args.file}", file=sys.stderr)
            sys.exit(1)
        was_modified, issues = process_file(args.file, args.check, args.verbose)
        total_issues += len(issues)
        if was_modified:
            modified_files.append(args.file)
            print(f"Modified: {args.file} ({len(issues)} issues fixed)")
        elif issues:
            if args.check:
                print(f"Issues found: {args.file} ({len(issues)} issues)")
            else:
                print(f"No changes needed: {args.file}")
        else:
            print(f"No issues found: {args.file}")

    if args.directory:
        if not os.path.isdir(args.directory):
            print(f"Error: Directory not found: {args.directory}", file=sys.stderr)
            sys.exit(1)
        dir_modified, dir_issues = process_directory(args.directory, args.check, args.verbose)
        # Accumulate rather than overwrite so a preceding -f result still
        # counts toward the summary and the exit status.
        modified_files.extend(dir_modified)
        total_issues += dir_issues

    # Summary
    print(f"\n{'=' * 50}")
    if args.check:
        print(f"Total issues found: {total_issues}")
        if total_issues > 0:
            sys.exit(1)
    else:
        print(f"Total files modified: {len(modified_files)}")
        print(f"Total issues fixed: {total_issues}")
        if modified_files:
            sys.exit(1)  # Pre-commit convention: exit 1 if files were modified

    sys.exit(0)
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|