mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-09 07:15:51 -05:00
Adds a pypandoc-based section splitter utility that parses .qmd chapter files and extracts individual sections for processing. This enables guaranteed 100% coverage in editorial workflows by processing each section independently rather than entire chapters at once. Key features: - Uses pypandoc JSON AST for robust parsing (correctly ignores headers inside code blocks, callouts, and TikZ diagrams) - Falls back to regex-based block tracking if pypandoc unavailable - Extracts section metadata: title, ID, line numbers, word count - Supports listing, extraction to files, and JSON manifest output - Designed for integration with polish workflow agents Usage: python3 section_splitter.py -f chapter.qmd --list python3 section_splitter.py -f chapter.qmd --manifest python3 section_splitter.py -f chapter.qmd --get-section 3
574 lines
19 KiB
Python
574 lines
19 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
section_splitter.py
|
|
|
|
Splits .qmd chapter files into individual sections for processing.
|
|
Designed to support section-by-section editorial workflows where each
|
|
section needs to be processed independently (e.g., by stylist agent).
|
|
|
|
Key Features:
|
|
- Uses pypandoc JSON AST for robust parsing (handles code blocks, callouts correctly)
|
|
- Extracts sections based on ## headers (level 2)
|
|
- Preserves YAML frontmatter separately
|
|
- Tracks section metadata (line numbers, word counts)
|
|
- Supports both extraction (to files) and in-memory operation
|
|
- Can reassemble sections back into complete chapter
|
|
|
|
Usage:
|
|
# List sections in a chapter
|
|
python3 section_splitter.py -f path/to/chapter.qmd --list
|
|
|
|
# Extract sections to individual files
|
|
python3 section_splitter.py -f path/to/chapter.qmd --extract --output-dir ./sections/
|
|
|
|
# Get JSON manifest of sections (for programmatic use)
|
|
python3 section_splitter.py -f path/to/chapter.qmd --manifest
|
|
|
|
Requirements:
|
|
- pypandoc (pip install pypandoc) - optional; without it a regex-based fallback is used
|
|
- pandoc must be installed
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
import json
|
|
import argparse
|
|
from pathlib import Path
|
|
from dataclasses import dataclass, asdict
|
|
from typing import Optional
|
|
|
|
try:
|
|
import pypandoc
|
|
PYPANDOC_AVAILABLE = True
|
|
except ImportError:
|
|
PYPANDOC_AVAILABLE = False
|
|
|
|
|
|
@dataclass
class Section:
    """One ``##`` section of a chapter together with its bookkeeping metadata."""

    index: int                   # 0-based position among the chapter's sections
    title: str                   # header text without the trailing {...} attributes
    section_id: Optional[str]    # "sec-..." identifier taken from the header, if any
    level: int                   # number of '#' symbols (2 for ##, 3 for ###)
    start_line: int              # 1-based line number of the section header
    end_line: int                # 1-based number of the section's last line
    word_count: int              # word count of the section text
    content: str                 # raw section text, header line included
    is_unnumbered: bool = False  # True for {.unnumbered} sections like Purpose

    def to_dict(self) -> dict:
        """Return the metadata as a dict, omitting the bulky ``content`` field.

        Used for the manifest, which should describe sections without
        embedding their full text.
        """
        return {key: value for key, value in asdict(self).items() if key != 'content'}
|
|
|
|
|
|
@dataclass
class ChapterStructure:
    """Complete structure of a chapter: frontmatter, preamble and ``##`` sections.

    Instances are built by split_chapter(); field order is part of the
    positional constructor interface.
    """
    file_path: str  # Path of the source .qmd file, stored as a string
    chapter_title: str  # Text of the level-1 (#) header; '' if none was found
    chapter_id: Optional[str]  # Identifier attached to the chapter header, if any
    frontmatter: str  # YAML frontmatter including both '---' lines; '' when absent
    pre_content: str  # Content before first ## section (includes # title)
    sections: list[Section]  # The ## sections in document order
    post_content: str  # Any content after last section (rare); split_chapter() stores ''
    total_lines: int  # Number of '\n'-separated lines in the whole file
    total_words: int  # Word count of the whole file as computed by count_words()
|
|
|
|
|
|
def count_words(text: str) -> int:
    """Return the number of whitespace-separated words left after stripping code.

    Fenced code blocks, ``{.tikz}`` regions and inline code spans are removed,
    in that order, before the remaining text is split on whitespace.
    """
    strip_patterns = (
        r'```[\s\S]*?```',                     # fenced code blocks
        r'\{\.tikz\}[\s\S]*?(?=\n##|\n#|\Z)',  # TikZ: up to the next header line or end of text
        r'`[^`]+`',                            # inline code spans
    )
    prose = text
    for pattern in strip_patterns:
        prose = re.sub(pattern, '', prose)
    return len(prose.split())
|
|
|
|
|
|
def parse_header(line: str) -> tuple[int, str, Optional[str], bool]:
    """
    Break a markdown header line into its parts.

    Args:
        line: A single source line, e.g. ``## Title {#sec-title .unnumbered}``

    Returns:
        (level, title, section_id, is_unnumbered). ``level`` is the number of
        leading ``#`` characters, ``section_id`` is the ``sec-...`` identifier
        from the attribute braces (None if there is none) and ``is_unnumbered``
        tells whether ``.unnumbered`` appears in the attributes. A line that is
        not a header yields ``(0, '', None, False)``.
    """
    parsed = re.match(r'^(#{1,6})\s+(.+?)(?:\s*\{([^}]+)\})?\s*$', line)
    if parsed is None:
        return (0, '', None, False)

    hashes, raw_title, attr_text = parsed.groups()
    attr_text = attr_text or ''  # header without a trailing {...} block

    # Only identifiers following the "sec-" convention are reported.
    found_id = re.search(r'#(sec-[^\s}]+)', attr_text)
    section_id = found_id.group(1) if found_id else None

    return (len(hashes), raw_title.strip(), section_id, '.unnumbered' in attr_text)
|
|
|
|
|
|
def extract_text_from_inlines(inlines: list) -> str:
    """Extract plain text from a list of pandoc JSON inline elements.

    Handles the inline types that can plausibly occur in a header title:
    plain strings, spaces and line breaks, formatting wrappers, links,
    quoted text, spans, inline code and inline math. Other inline types
    (notes, raw inlines, images, citations) contribute no text.

    Args:
        inlines: Inline elements as found in the pandoc JSON AST. Each is a
            dict with a ``'t'`` (type) key and usually a ``'c'`` (content)
            key; bare strings are accepted and copied through.

    Returns:
        The concatenated text of all recognised elements.
    """
    # Types whose content is directly a list of inlines.
    wrapper_types = ('Emph', 'Underline', 'Strong', 'Strikeout',
                     'Superscript', 'Subscript', 'SmallCaps')
    # Types whose content is a list with nested inlines at position 1:
    # Link [attr, inlines, target], Quoted [quote type, inlines], Span [attr, inlines].
    nested_inline_types = ('Link', 'Quoted', 'Span')
    # Types whose content is a list with literal text at position 1:
    # Code [attr, text], Math [math type, text].
    literal_text_types = ('Code', 'Math')
    # Every kind of inter-word break reads as a single space in plain text.
    break_types = ('Space', 'SoftBreak', 'LineBreak')

    text_parts = []
    for inline in inlines:
        if isinstance(inline, str):
            text_parts.append(inline)
            continue
        if not isinstance(inline, dict):
            continue

        kind = inline.get('t', '')
        if kind == 'Str':
            text_parts.append(inline.get('c', ''))
        elif kind in break_types:
            text_parts.append(' ')
        elif kind in wrapper_types:
            text_parts.append(extract_text_from_inlines(inline.get('c', [])))
        elif kind in nested_inline_types or kind in literal_text_types:
            payload = inline.get('c')
            # Guard against a malformed payload instead of raising IndexError.
            if not isinstance(payload, list) or len(payload) < 2:
                continue
            if kind in nested_inline_types:
                text_parts.append(extract_text_from_inlines(payload[1]))
            elif isinstance(payload[1], str):
                text_parts.append(payload[1])
    return ''.join(text_parts)
|
|
|
|
|
|
def get_section_headers_from_ast(content: str) -> list[dict]:
    """
    Use pypandoc to parse the document and extract the real section headers.

    Only headers that are top-level blocks of the document are returned.
    Lines that merely look like headers inside code blocks never become
    Header nodes, and headers nested inside other blocks (fenced divs such as
    callouts, block quotes, list items) are skipped because they do not start
    a document section.

    The ``smart`` extension is disabled so that quotes, dashes and ellipses in
    the returned titles stay exactly as written in the source; callers compare
    these titles with the raw header text.

    Args:
        content: The markdown content

    Returns:
        List of dicts with 'level', 'id' and 'title', in document order.
        Empty when pypandoc is unavailable or the conversion fails (a warning
        is written to stderr in the latter case).
    """
    if not PYPANDOC_AVAILABLE:
        return []

    import sys  # local import: only needed for the warning below

    try:
        ast_json = pypandoc.convert_text(
            content,
            'json',
            format='markdown-smart',
            extra_args=['--preserve-tabs']
        )
        ast = json.loads(ast_json)

        headers = []
        for block in ast.get('blocks', []):
            if not isinstance(block, dict) or block.get('t') != 'Header':
                continue
            # Header: [level, [id, classes, attrs], inlines]
            c = block.get('c', [])
            if len(c) < 3:
                continue
            level, attr, inlines = c[0], c[1], c[2]
            headers.append({
                'level': level,
                'id': attr[0] if attr else None,
                'title': extract_text_from_inlines(inlines)
            })
        return headers

    except Exception as e:
        # Best effort: any failure makes the caller fall back to line-based
        # block tracking instead of aborting the whole split.
        print(f"Warning: pypandoc parsing failed: {e}", file=sys.stderr)
        return []
|
|
|
|
|
|
def is_real_section_header(line: str, in_code_block: bool, in_callout: bool) -> bool:
    """
    Decide whether ``line`` opens a genuine ``##`` section.

    This is the fallback check for when pypandoc is not available: the caller
    tracks the surrounding block state and passes it in.

    Args:
        line: The line to check
        in_code_block: Whether the line lies inside a fenced code block
        in_callout: Whether the line lies inside a callout

    Returns:
        True only for a line starting with ``## `` that is outside both code
        blocks and callouts. A section ID is not required (the Purpose section
        may not have one).
    """
    return line.startswith('## ') and not (in_code_block or in_callout)
|
|
|
|
|
|
def _title_key(title: str) -> str:
    """Reduce a header title to a comparison key that ignores punctuation and markup."""
    return re.sub(r'[\W_]+', '', title)


def _advance_block_state(stripped: str, in_code_block: bool, callout_depth: int) -> tuple[bool, int]:
    """Update the code-fence / fenced-div tracking state for one stripped source line."""
    # Any ``` line (```python, ```{.tikz}, bare ```) toggles the fence state.
    if stripped.startswith('```'):
        in_code_block = not in_code_block

    # Div markers only count outside code fences: "::: {...}" opens, bare ":::" closes.
    if not in_code_block:
        if stripped == ':::':
            callout_depth = max(callout_depth - 1, 0)
        elif stripped.startswith(':::') and '{' in stripped:
            callout_depth += 1

    return in_code_block, callout_depth


def split_chapter(file_path: str) -> ChapterStructure:
    """
    Split a chapter file into its component sections.

    Uses pypandoc AST parsing when available for robust handling of:
    - Code blocks (``` ... ```) - headers inside are ignored
    - Callouts (::: ... :::) - headers inside are ignored
    - TikZ blocks - headers inside are ignored

    Falls back to line-based block tracking if pypandoc is unavailable or
    reports no level-2 headers.

    AST titles and raw header text are compared through a punctuation-
    insensitive key, because pandoc strips inline markup that the raw line
    still contains.

    Args:
        file_path: Path to the .qmd file

    Returns:
        ChapterStructure with all sections parsed. Every section, including
        the first one, carries the title, ID and unnumbered flag parsed from
        its own header line.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    lines = content.split('\n')
    total_lines = len(lines)

    # Extract YAML frontmatter (a leading '---' ... '---' block)
    frontmatter = ''
    content_start = 0
    if lines[0].strip() == '---':
        for i, line in enumerate(lines[1:], 1):
            if line.strip() == '---':
                frontmatter = '\n'.join(lines[:i + 1])
                content_start = i + 1
                break

    # Headers reported by the AST tell us which "## " lines are real sections.
    chapter_title = ''
    chapter_id = None
    valid_section_keys = set()
    for h in get_section_headers_from_ast(content):
        if h['level'] == 1 and not chapter_title:
            chapter_title = h['title']
            chapter_id = h['id']
        elif h['level'] == 2:
            valid_section_keys.add(_title_key(h['title']))

    def is_real_header(header_line: str, in_code: bool, depth: int) -> bool:
        """Validate a '## ' line via the AST when available, else via block state."""
        if valid_section_keys:
            _, title, _, _ = parse_header(header_line)
            return _title_key(title) in valid_section_keys
        return not in_code and depth == 0

    # Pass 1: collect the preamble and locate the first real section header.
    pre_content_lines = []
    first_section_line = None
    in_code_block, callout_depth = False, 0
    for i, line in enumerate(lines[content_start:], content_start):
        in_code_block, callout_depth = _advance_block_state(
            line.strip(), in_code_block, callout_depth)

        if not chapter_title and line.startswith('# '):
            # Chapter title fallback when the AST did not provide one.
            if not in_code_block and callout_depth == 0:
                level, title, sec_id, _ = parse_header(line)
                if level == 1:
                    chapter_title = title
                    chapter_id = sec_id
        elif line.startswith('## '):
            if is_real_header(line, in_code_block, callout_depth):
                first_section_line = i
                break

        pre_content_lines.append(line)

    pre_content = '\n'.join(pre_content_lines)

    # Pass 2: cut the remainder into ## sections.
    sections: list[Section] = []

    def close_section(start: int, body_lines: list, end_line: int) -> None:
        """Append the section starting at 0-based line ``start`` to ``sections``."""
        # Metadata comes from the section's own header line, so the first
        # section is described just like every later one.
        _, title, sec_id, unnumbered = parse_header(body_lines[0])
        body = '\n'.join(body_lines)
        sections.append(Section(
            index=len(sections),
            title=title,
            section_id=sec_id,
            level=2,
            start_line=start + 1,  # 1-indexed
            end_line=end_line,
            word_count=count_words(body),
            content=body,
            is_unnumbered=unnumbered
        ))

    if first_section_line is not None:
        # Block tracking restarts at the first section header.
        in_code_block, callout_depth = False, 0
        section_start = first_section_line
        section_lines = []

        for i, line in enumerate(lines[first_section_line:], first_section_line):
            in_code_block, callout_depth = _advance_block_state(
                line.strip(), in_code_block, callout_depth)

            starts_new_section = (
                i > first_section_line
                and line.startswith('## ')
                and is_real_header(line, in_code_block, callout_depth)
            )
            if starts_new_section:
                # 0-based index of the new header == 1-based line before it.
                close_section(section_start, section_lines, end_line=i)
                section_start = i
                section_lines = [line]
            else:
                section_lines.append(line)

        # Don't forget the last section
        close_section(section_start, section_lines, end_line=total_lines)

    return ChapterStructure(
        file_path=str(file_path),
        chapter_title=chapter_title,
        chapter_id=chapter_id,
        frontmatter=frontmatter,
        pre_content=pre_content,
        sections=sections,
        post_content='',  # Typically empty
        total_lines=total_lines,
        total_words=count_words(content)
    )
|
|
|
|
|
|
def extract_sections(chapter: ChapterStructure, output_dir: str) -> list[str]:
    """
    Extract each section to its own file.

    The preamble (frontmatter plus everything before the first section) is
    written to ``section_00_preamble.qmd``. Each section goes to
    ``section_NN_<slug>.qmd``, where NN is the 1-based section number and the
    slug is derived from the section title.

    Args:
        chapter: Parsed chapter structure
        output_dir: Directory to write section files (created if missing)

    Returns:
        List of created file paths, preamble first
    """
    os.makedirs(output_dir, exist_ok=True)
    created_files = []

    # Join with a newline only when there is frontmatter to separate;
    # otherwise the preamble would start with a blank line that is not in
    # the source chapter.
    if chapter.frontmatter:
        preamble = chapter.frontmatter + '\n' + chapter.pre_content
    else:
        preamble = chapter.pre_content

    pre_path = os.path.join(output_dir, 'section_00_preamble.qmd')
    with open(pre_path, 'w', encoding='utf-8') as f:
        f.write(preamble)
    created_files.append(pre_path)

    for section in chapter.sections:
        # Filesystem-safe slug: drop punctuation, collapse whitespace to '_',
        # cap the length at 40 characters.
        safe_title = re.sub(r'[^\w\s-]', '', section.title.lower())
        safe_title = re.sub(r'\s+', '_', safe_title)[:40]
        filename = f'section_{section.index + 1:02d}_{safe_title}.qmd'

        section_path = os.path.join(output_dir, filename)
        with open(section_path, 'w', encoding='utf-8') as f:
            f.write(section.content)
        created_files.append(section_path)

    return created_files
|
|
|
|
|
|
def reassemble_chapter(chapter: ChapterStructure, modified_sections: Optional[dict[int, str]] = None) -> str:
    """
    Stitch a chapter back together from its parts.

    Frontmatter and pre-content are included only when non-empty. Each section
    contributes its replacement text from ``modified_sections`` if its index
    is a key there, otherwise its original content. All pieces are joined with
    a single newline.

    Args:
        chapter: Original chapter structure
        modified_sections: Optional dict mapping section index to new content

    Returns:
        Complete chapter content
    """
    overrides = modified_sections or {}
    head = [piece for piece in (chapter.frontmatter, chapter.pre_content) if piece]
    body = [overrides.get(section.index, section.content) for section in chapter.sections]
    return '\n'.join(head + body)
|
|
|
|
|
|
def generate_manifest(chapter: ChapterStructure) -> dict:
    """
    Build the JSON-serializable manifest describing a chapter.

    The manifest carries chapter-level facts (path, title, ID, totals) and one
    metadata dict per section as returned by the section's ``to_dict()``.

    Returns:
        Dictionary suitable for JSON serialization
    """
    all_sections = chapter.sections
    section_entries = [entry.to_dict() for entry in all_sections]
    return {
        'file_path': chapter.file_path,
        'chapter_title': chapter.chapter_title,
        'chapter_id': chapter.chapter_id,
        'total_sections': len(all_sections),
        'total_lines': chapter.total_lines,
        'total_words': chapter.total_words,
        'sections': section_entries
    }
|
|
|
|
|
|
def list_sections(chapter: ChapterStructure) -> None:
    """Print a table of the chapter's sections to stdout.

    The output starts with a blank line, then the chapter title, file path
    and totals, followed by one row per section showing its 1-based number,
    line range, word count, ID (or "(none)") and title.
    """
    divider = "-" * 80
    rows = [
        f"\nChapter: {chapter.chapter_title}",
        f"File: {chapter.file_path}",
        f"Total: {len(chapter.sections)} sections, {chapter.total_words:,} words, {chapter.total_lines:,} lines",
        divider,
        f"{'#':<3} {'Lines':<12} {'Words':<8} {'ID':<40} Title",
        divider,
    ]

    for section in chapter.sections:
        line_range = f"{section.start_line}-{section.end_line}"
        sec_id = section.section_id if section.section_id else "(none)"
        unnumbered = " [unnumbered]" if section.is_unnumbered else ""
        rows.append(
            f"{section.index + 1:<3} {line_range:<12} {section.word_count:<8} {sec_id:<40} {section.title}{unnumbered}"
        )

    # One print call; the text is the same as printing each row separately.
    print('\n'.join(rows))
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, splits the chapter given with ``-f/--file`` and
    performs exactly one action: ``--list``, ``--extract``, ``--manifest`` or
    ``--get-section N``. Exits with status 1 when the requested section
    number does not exist.
    """
    import sys  # local import keeps this block self-contained

    parser = argparse.ArgumentParser(
        description="Split .qmd chapter files into sections for processing"
    )
    parser.add_argument('-f', '--file', required=True,
                        help='Path to the .qmd chapter file')

    action = parser.add_mutually_exclusive_group(required=True)
    action.add_argument('--list', action='store_true',
                        help='List all sections in the chapter')
    action.add_argument('--extract', action='store_true',
                        help='Extract sections to individual files')
    action.add_argument('--manifest', action='store_true',
                        help='Output JSON manifest of chapter structure')
    action.add_argument('--get-section', type=int, metavar='N',
                        help='Get content of section N (1-indexed)')

    parser.add_argument('--output-dir', default='./sections',
                        help='Directory for extracted sections (default: ./sections)')

    args = parser.parse_args()

    # Parse the chapter
    chapter = split_chapter(args.file)

    if args.list:
        list_sections(chapter)

    elif args.extract:
        files = extract_sections(chapter, args.output_dir)
        print(f"Extracted {len(files)} files to {args.output_dir}/")
        for f in files:
            print(f"  {f}")

    elif args.manifest:
        manifest = generate_manifest(chapter)
        print(json.dumps(manifest, indent=2))

    elif args.get_section is not None:
        idx = args.get_section - 1  # Convert to 0-indexed
        if 0 <= idx < len(chapter.sections):
            print(chapter.sections[idx].content)
        else:
            # stdout carries section content for callers, so the error goes
            # to stderr. sys.exit is used because the bare exit() helper is
            # injected by the site module and is not always available.
            print(f"Error: Section {args.get_section} not found. Chapter has {len(chapter.sections)} sections.",
                  file=sys.stderr)
            sys.exit(1)


if __name__ == "__main__":
    main()
|