# cs249r_book/book/tools/scripts/content/section_splitter.py

#!/usr/bin/env python3
"""
section_splitter.py

Splits .qmd chapter files into individual sections for processing.
Designed to support section-by-section editorial workflows where each
section needs to be processed independently (e.g., by stylist agent).

Key Features:
- Uses pypandoc JSON AST for robust parsing (handles code blocks, callouts correctly)
- Extracts sections based on ## headers (level 2)
- Preserves YAML frontmatter separately
- Tracks section metadata (line numbers, word counts)
- Supports both extraction (to files) and in-memory operation
- Can reassemble sections back into complete chapter

Usage:
    # List sections in a chapter
    python3 section_splitter.py -f path/to/chapter.qmd --list

    # Extract sections to individual files
    python3 section_splitter.py -f path/to/chapter.qmd --extract --output-dir ./sections/

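    # Get JSON manifest of sections (for programmatic use)
    python3 section_splitter.py -f path/to/chapter.qmd --manifest

    # Print the content of a single section (1-indexed)
    python3 section_splitter.py -f path/to/chapter.qmd --get-section 3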

Requirements:
    - pypandoc (pip install pypandoc)
    - pandoc must be installed
"""

import os
import re
import json
import argparse
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import Optional

try:
    import pypandoc
    PYPANDOC_AVAILABLE = True
except ImportError:
    PYPANDOC_AVAILABLE = False


@dataclass
class Section:
    """One ``##`` section of a chapter: its metadata plus the raw text."""
    index: int  # 0-based position among the chapter's sections
    title: str
    section_id: Optional[str]
    level: int  # Header depth, i.e. count of leading '#' (2 for ##, 3 for ###)
    start_line: int
    end_line: int
    word_count: int
    content: str
    is_unnumbered: bool = False  # Set for {.unnumbered} headers such as Purpose

    def to_dict(self) -> dict:
        """Return the metadata fields as a dict, leaving out the body text.

        The ``content`` field is omitted so that manifests stay small.
        """
        return {
            name: value
            for name, value in asdict(self).items()
            if name != 'content'
        }


@dataclass
class ChapterStructure:
    """Parsed layout of one chapter file: frontmatter, preamble and sections."""
    file_path: str
    chapter_title: str
    chapter_id: Optional[str]
    frontmatter: str  # YAML frontmatter block, including its --- delimiter lines
    pre_content: str  # Lines between the frontmatter and the first ## section (includes # title)
    sections: list[Section]
    post_content: str  # Content after the last section; split_chapter always sets this to ''
    total_lines: int
    total_words: int


def count_words(text: str) -> int:
    """Return the number of whitespace-separated words in the prose of *text*.

    Fenced code blocks, ``{.tikz}`` spans (up to the next header or the end
    of the text) and inline code are stripped before counting.
    """
    # Order matters: fenced blocks must go before the inline-code pattern,
    # otherwise the backticks of a fence would be read as inline code.
    strip_patterns = (
        r'```[\s\S]*?```',                      # fenced code blocks
        r'\{\.tikz\}[\s\S]*?(?=\n##|\n#|\Z)',   # TikZ blocks
        r'`[^`]+`',                             # inline code
    )

    prose = text
    for pattern in strip_patterns:
        prose = re.sub(pattern, '', prose)

    return len(prose.split())


def parse_header(line: str) -> tuple[int, str, Optional[str], bool]:
    """
    Break an ATX markdown header line into its parts.

    Args:
        line: A single line of markdown, e.g. ``## Title {#sec-x .unnumbered}``

    Returns:
        (level, title, section_id, is_unnumbered). ``level`` is the number of
        leading ``#`` characters; ``section_id`` is the ``sec-...`` identifier
        from the attribute braces, or None. A line that is not a header
        yields ``(0, '', None, False)``.
    """
    header = re.match(r'^(#{1,6})\s+(.+?)(?:\s*\{([^}]+)\})?\s*$', line)
    if header is None:
        return (0, '', None, False)

    hashes, raw_title, attributes = header.groups()
    attributes = attributes or ''  # No {...} block on the header

    # Only identifiers following the sec- naming convention are recognised
    id_found = re.search(r'#(sec-[^\s}]+)', attributes)

    return (
        len(hashes),
        raw_title.strip(),
        id_found.group(1) if id_found else None,
        '.unnumbered' in attributes,
    )


def extract_text_from_inlines(inlines: list) -> str:
    """
    Flatten a list of pandoc inline elements into plain text.

    Handles the inline types that commonly occur in header text: plain
    strings, spaces and line breaks, emphasis-style wrappers, links, spans,
    quoted text, inline code and inline math. Any other element type is
    skipped.

    Args:
        inlines: Inline elements as they appear in pandoc's JSON AST (dicts
            with a 't' type tag and an optional 'c' payload). Bare strings
            in the list are passed through unchanged.

    Returns:
        The concatenated text. Quoted spans are wrapped in straight quotes
        and code/math contribute their literal source text.
    """
    # Inline types whose payload is just a list of child inlines.
    wrapper_types = ('Emph', 'Strong', 'Strikeout', 'Superscript',
                     'Subscript', 'SmallCaps', 'Underline')
    # Straight marks, i.e. what the markdown source contained before pandoc
    # turned the quotation into a Quoted element.
    quote_marks = {'SingleQuote': "'", 'DoubleQuote': '"'}

    text_parts = []
    for inline in inlines:
        if isinstance(inline, str):
            text_parts.append(inline)
            continue
        if not isinstance(inline, dict):
            continue

        kind = inline.get('t', '')
        payload = inline.get('c')

        if kind == 'Str':
            text_parts.append(payload or '')
        elif kind in ('Space', 'SoftBreak', 'LineBreak'):
            # Breaks separate words just like a space does.
            text_parts.append(' ')
        elif kind in wrapper_types:
            text_parts.append(extract_text_from_inlines(payload or []))
        elif kind in ('Link', 'Span', 'Quoted', 'Code', 'Math'):
            # All of these carry their text as the second payload item:
            #   Link [attr, inlines, target]   Span [attr, inlines]
            #   Quoted [quote_type, inlines]   Code [attr, text]
            #   Math [math_type, text]
            if not isinstance(payload, list) or len(payload) < 2:
                continue  # Malformed element: nothing to extract
            first, second = payload[0], payload[1]

            if kind in ('Code', 'Math'):
                text_parts.append(second)
            elif kind == 'Quoted':
                quote_type = first.get('t') if isinstance(first, dict) else None
                mark = quote_marks.get(quote_type, '')
                text_parts.append(f'{mark}{extract_text_from_inlines(second)}{mark}')
            else:
                text_parts.append(extract_text_from_inlines(second))

    return ''.join(text_parts)


def get_section_headers_from_ast(content: str) -> list[dict]:
    """
    Use pypandoc to parse the document and extract real section headers.

    Only headers that are direct children of the document body are
    reported. Text inside code blocks never becomes a Header node, and
    headers nested inside fenced divs (e.g. the title line of a callout),
    block quotes or list items are not section boundaries, so they are
    left out.

    Args:
        content: The markdown content

    Returns:
        List of dicts with 'level' (int), 'id' (str, or None when the header
        has no identifier) and 'title' (plain text), in document order.
        Returns an empty list when pypandoc is unavailable or parsing fails.
    """
    if not PYPANDOC_AVAILABLE:
        return []

    import sys  # Local import: only needed for the warning below

    # NOTE(review): with +smart, pandoc converts straight apostrophes, quotes
    # and dashes to their typographic forms, so returned titles may differ
    # from the raw source text in those characters.
    try:
        ast_json = pypandoc.convert_text(
            content,
            'json',
            format='markdown+smart',
            extra_args=['--preserve-tabs']
        )
        ast = json.loads(ast_json)

        headers = []
        for block in ast.get('blocks', []):
            if not isinstance(block, dict) or block.get('t') != 'Header':
                continue

            # Header: [level, [id, classes, attrs], inlines]
            c = block.get('c', [])
            if len(c) < 3:
                continue
            level, attr, inlines = c[0], c[1], c[2]

            headers.append({
                'level': level,
                # Pandoc uses '' for "no identifier"; expose that as None
                'id': (attr[0] if attr else None) or None,
                'title': extract_text_from_inlines(inlines)
            })

        return headers

    except Exception as e:
        # Best effort: callers fall back to line-based parsing on []
        print(f"Warning: pypandoc parsing failed: {e}", file=sys.stderr)
        return []


def is_real_section_header(line: str, in_code_block: bool, in_callout: bool) -> bool:
    """
    Decide whether *line* opens a genuine level-2 section.

    Intended as the line-based fallback for when pypandoc is not available.
    A section ID such as {#sec-...} is not required, because some sections
    (e.g. Purpose) may not carry one.

    Args:
        line: The line to check
        in_code_block: True while the caller is inside a fenced code block
        in_callout: True while the caller is inside a callout

    Returns:
        True only for a line starting with '## ' that lies outside both
        code blocks and callouts
    """
    inside_block = in_code_block or in_callout
    return line.startswith('## ') and not inside_block


def _advance_block_state(stripped: str, in_code_block: bool, callout_depth: int) -> tuple[bool, int]:
    """Return the code-fence / fenced-div state after consuming one stripped line."""
    # A ``` line (bare, ```python, ```{.tikz}, ...) opens or closes a code block
    if stripped.startswith('```'):
        in_code_block = not in_code_block

    # Div fences only count outside code blocks. A fence made of colons only
    # (":::", "::::", ...) closes the innermost div; a fence followed by
    # attributes ("::: {.callout-note}", "::: warning") opens a new one.
    if not in_code_block and stripped.startswith(':::'):
        if stripped.strip(':').strip():
            callout_depth += 1
        elif callout_depth > 0:
            callout_depth -= 1

    return in_code_block, callout_depth


def split_chapter(file_path: str) -> ChapterStructure:
    """
    Split a chapter file into its component sections.

    Uses pypandoc AST parsing when available to decide which ``## `` lines
    are real section headers. A line is accepted when its ``sec-`` id or its
    title matches a level-2 header reported by the AST, so headers whose
    rendered title differs from the raw text (inline formatting, smart
    punctuation) are still recognised through their id.

    Falls back to line-based tracking of code fences (``` ... ```) and
    fenced divs (::: ... :::) when the AST yields no level-2 headers
    (pypandoc missing, parse failure, or no sections): a ``## `` line is
    then a section header only outside those blocks.

    Args:
        file_path: Path to the .qmd file

    Returns:
        ChapterStructure with all sections parsed. Section line numbers are
        1-indexed and inclusive; ``post_content`` is always ''.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    lines = content.split('\n')
    total_lines = len(lines)

    # Extract YAML frontmatter (only if the opening --- is properly closed)
    frontmatter = ''
    content_start = 0
    if lines[0].strip() == '---':
        for i, line in enumerate(lines[1:], 1):
            if line.strip() == '---':
                frontmatter = '\n'.join(lines[:i + 1])
                content_start = i + 1
                break

    # Collect what the AST knows: chapter title/id plus the titles and ids
    # of every level-2 header (most reliable source when available)
    valid_section_titles = set()
    valid_section_ids = set()
    chapter_title = ''
    chapter_id = None
    for h in get_section_headers_from_ast(content):
        if h['level'] == 1 and not chapter_title:
            chapter_title = h['title']
            chapter_id = h['id'] or None
        elif h['level'] == 2:
            valid_section_titles.add(h['title'])
            if h['id']:
                valid_section_ids.add(h['id'])

    use_ast = bool(valid_section_titles)

    # Single pass over the body: find the chapter title (if the AST did not
    # supply one) and the line index of every real section header
    section_starts: list[int] = []
    in_code_block = False
    callout_depth = 0

    for i, line in enumerate(lines[content_start:], content_start):
        in_code_block, callout_depth = _advance_block_state(
            line.strip(), in_code_block, callout_depth
        )
        outside_blocks = not in_code_block and callout_depth == 0

        # The chapter title is only looked for before the first section
        if not section_starts and not chapter_title and line.startswith('# '):
            if outside_blocks:
                level, title, sec_id, _ = parse_header(line)
                if level == 1:
                    chapter_title = title
                    chapter_id = sec_id

        elif line.startswith('## '):
            _, title, sec_id, _ = parse_header(line)
            if use_ast:
                is_real = (
                    title in valid_section_titles
                    or (sec_id is not None and sec_id in valid_section_ids)
                )
            else:
                is_real = outside_blocks

            if is_real:
                section_starts.append(i)

    # Everything between the frontmatter and the first section is preamble;
    # with no sections at all, the whole body is preamble
    first_section_line = section_starts[0] if section_starts else total_lines
    pre_content = '\n'.join(lines[content_start:first_section_line])

    # Each section runs from its header up to (not including) the next
    # header; the last one runs to the end of the file
    sections = []
    boundaries = section_starts + [total_lines]
    for index, (start, end) in enumerate(zip(boundaries, boundaries[1:])):
        section_content = '\n'.join(lines[start:end])
        # Parse every section's own header line, including the first one
        _, title, sec_id, is_unnumbered = parse_header(lines[start])
        sections.append(Section(
            index=index,
            title=title,
            section_id=sec_id,
            level=2,
            start_line=start + 1,  # 1-indexed
            end_line=end,  # 1-indexed line before the next header / last line
            word_count=count_words(section_content),
            content=section_content,
            is_unnumbered=is_unnumbered
        ))

    return ChapterStructure(
        file_path=str(file_path),
        chapter_title=chapter_title,
        chapter_id=chapter_id,
        frontmatter=frontmatter,
        pre_content=pre_content,
        sections=sections,
        post_content='',  # Typically empty
        total_lines=total_lines,
        total_words=count_words(content)
    )


def extract_sections(chapter: ChapterStructure, output_dir: str) -> list[str]:
    """
    Extract each section to its own file.

    Writes ``section_00_preamble.qmd`` (frontmatter followed by the
    pre-section content) and then one ``section_NN_<title>.qmd`` file per
    section, numbered from 01. The output directory is created if needed
    and existing files with the same names are overwritten.

    Args:
        chapter: Parsed chapter structure
        output_dir: Directory to write section files

    Returns:
        List of created file paths, preamble first
    """
    os.makedirs(output_dir, exist_ok=True)
    created_files = []

    # Write frontmatter + pre-content as section 0. Without frontmatter the
    # preamble is just the pre-content, so the file does not start with a
    # blank line that was never in the source.
    if chapter.frontmatter:
        preamble = chapter.frontmatter + '\n' + chapter.pre_content
    else:
        preamble = chapter.pre_content

    pre_path = os.path.join(output_dir, 'section_00_preamble.qmd')
    with open(pre_path, 'w', encoding='utf-8') as f:
        f.write(preamble)
    created_files.append(pre_path)

    # Write each section
    for section in chapter.sections:
        # Create safe filename from title: drop punctuation, collapse
        # whitespace to underscores, cap the length at 40 characters
        safe_title = re.sub(r'[^\w\s-]', '', section.title.lower())
        safe_title = re.sub(r'\s+', '_', safe_title)[:40]
        filename = f'section_{section.index + 1:02d}_{safe_title}.qmd'

        section_path = os.path.join(output_dir, filename)
        with open(section_path, 'w', encoding='utf-8') as f:
            f.write(section.content)
        created_files.append(section_path)

    return created_files


def reassemble_chapter(chapter: ChapterStructure, modified_sections: Optional[dict[int, str]] = None) -> str:
    """
    Join a chapter's parts back into a single document string.

    Args:
        chapter: Original chapter structure
        modified_sections: Optional replacements, keyed by section index;
            sections without an entry keep their original content

    Returns:
        Frontmatter, pre-content and all sections joined by newlines
        (empty frontmatter / pre-content are left out)
    """
    overrides = modified_sections if modified_sections else {}

    # Leading parts are optional; the pre-content carries the # title
    pieces = [part for part in (chapter.frontmatter, chapter.pre_content) if part]

    pieces.extend(
        overrides[section.index] if section.index in overrides else section.content
        for section in chapter.sections
    )

    return '\n'.join(pieces)


def generate_manifest(chapter: ChapterStructure) -> dict:
    """
    Summarise the chapter structure as a plain dictionary.

    Returns:
        Chapter-level fields plus one metadata dict per section (as
        produced by each section's ``to_dict``), ready for ``json.dumps``
    """
    manifest = {
        'file_path': chapter.file_path,
        'chapter_title': chapter.chapter_title,
        'chapter_id': chapter.chapter_id,
        'total_sections': len(chapter.sections),
        'total_lines': chapter.total_lines,
        'total_words': chapter.total_words,
    }

    section_entries = []
    for section in chapter.sections:
        section_entries.append(section.to_dict())
    manifest['sections'] = section_entries

    return manifest


def list_sections(chapter: ChapterStructure) -> None:
    """Write a table of the chapter's sections to stdout.

    Prints a short chapter summary, then one row per section with its
    1-based number, line range, word count, section ID and title.
    """
    rule = "-" * 80
    section_count = len(chapter.sections)

    print(f"\nChapter: {chapter.chapter_title}")
    print(f"File: {chapter.file_path}")
    print(f"Total: {section_count} sections, {chapter.total_words:,} words, {chapter.total_lines:,} lines")
    print(rule)
    print(f"{'#':<3} {'Lines':<12} {'Words':<8} {'ID':<40} Title")
    print(rule)

    for section in chapter.sections:
        number = section.index + 1  # Shown 1-based
        span = f"{section.start_line}-{section.end_line}"
        ident = section.section_id if section.section_id else "(none)"
        suffix = " [unnumbered]" if section.is_unnumbered else ""
        print(f"{number:<3} {span:<12} {section.word_count:<8} {ident:<40} {section.title}{suffix}")


def main():
    """Command-line entry point.

    Parses the arguments, splits the requested chapter and performs exactly
    one action: list the sections, extract them to files, print a JSON
    manifest, or print a single section. Errors are reported on stderr and
    the process exits with status 1.
    """
    import sys  # Local import: used only for stderr and the exit status

    parser = argparse.ArgumentParser(
        description="Split .qmd chapter files into sections for processing"
    )
    parser.add_argument('-f', '--file', required=True,
                        help='Path to the .qmd chapter file')

    action = parser.add_mutually_exclusive_group(required=True)
    action.add_argument('--list', action='store_true',
                        help='List all sections in the chapter')
    action.add_argument('--extract', action='store_true',
                        help='Extract sections to individual files')
    action.add_argument('--manifest', action='store_true',
                        help='Output JSON manifest of chapter structure')
    action.add_argument('--get-section', type=int, metavar='N',
                        help='Get content of section N (1-indexed)')

    parser.add_argument('--output-dir', default='./sections',
                        help='Directory for extracted sections (default: ./sections)')

    args = parser.parse_args()

    # Parse the chapter; a missing or unreadable file gets a clean message
    # instead of a traceback
    try:
        chapter = split_chapter(args.file)
    except OSError as err:
        print(f"Error: cannot read {args.file}: {err}", file=sys.stderr)
        sys.exit(1)

    if args.list:
        list_sections(chapter)

    elif args.extract:
        files = extract_sections(chapter, args.output_dir)
        print(f"Extracted {len(files)} files to {args.output_dir}/")
        for f in files:
            print(f"  {f}")

    elif args.manifest:
        manifest = generate_manifest(chapter)
        print(json.dumps(manifest, indent=2))

    elif args.get_section is not None:
        idx = args.get_section - 1  # Convert to 0-indexed
        if 0 <= idx < len(chapter.sections):
            print(chapter.sections[idx].content)
        else:
            # stderr keeps the message out of output that callers may be
            # capturing as section content
            print(f"Error: Section {args.get_section} not found. Chapter has {len(chapter.sections)} sections.",
                  file=sys.stderr)
            sys.exit(1)


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()