# cs249r_book/interviews/paper/scripts/generate_figures.py

#!/usr/bin/env python3
"""Generate publication-quality data figures for the StaffML paper.

Pipeline: generated corpus.json (``vault build --local-json``) + chains
  → analyze_corpus.py → corpus_stats.json → THIS → PDFs

Run: python3 generate_figures.py
  (or: make figures)

Reads: corpus_stats.json (structured stats from analyze_corpus.py)
Writes: fig-corpus-distribution.pdf, fig-format-balance.pdf,
        fig-zone-distribution.pdf, fig-zone-level-heatmap.pdf
"""

import json
import sys
from collections import Counter, defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns

# ── Config ──────────────────────────────────────────────────────
# Paths are resolved relative to this script: <paper>/scripts/<this file>.
SCRIPTS_DIR = Path(__file__).parent
PAPER_DIR = SCRIPTS_DIR.parent
FIGURES_DIR = PAPER_DIR.joinpath("figures")
STATS_PATH = PAPER_DIR.joinpath("corpus_stats.json")

# Axis orderings shared by the figures below.
TRACKS = ["cloud", "edge", "mobile", "tinyml", "global"]
LEVELS = ["L1", "L2", "L3", "L4", "L5", "L6+"]

# Label printed under each mastery level on the format-balance x axis.
BLOOM_LABELS = {
    "L1": "Remember",
    "L2": "Understand",
    "L3": "Apply",
    "L4": "Analyze",
    "L5": "Evaluate",
    "L6+": "Create",
}

# Harvard/MIT color palette
CRIMSON = "#A31F34"
BLUE = "#4A90C4"
GREEN = "#3D9E5A"
ORANGE = "#C87B2A"
RED = "#C44444"
GRAY = "#888888"

# One palette color per deployment track (same hex values as the names above).
TRACK_COLORS = {
    "cloud": BLUE,
    "edge": GREEN,
    "mobile": ORANGE,
    "tinyml": CRIMSON,
    "global": GRAY,
}

# Pastel fills for the stacked format bars; FORMAT_EDGES holds the matching outlines.
FORMAT_COLORS = {
    # blue — compute / processing
    "calculation": "#cfe2f3",
    # green — architecture / data flow
    "design": "#d4edda",
    # orange — routing / scheduling
    "conceptual": "#fdebd0",
    # purple — improvement (distinct hue from green so the stacked-bar
    # reading is unambiguous)
    "optimization": "#e7d8ed",
    # red — failure / cost
    "diagnosis": "#f9d6d5",
    # gray — neutral
    "tradeoff": "#f7f7f7",
}
FORMAT_EDGES = {
    "calculation": "#4a90c4",
    "design": "#3d9e5a",
    "conceptual": "#c87b2a",
    "optimization": "#7d4f96",
    "diagnosis": "#c44",
    "tradeoff": "#bbb",
}

# Matplotlib defaults for paper — Helvetica to match SVG figures.
# Each plt.rc call assigns rcParams["<group>.<name>"] for its keyword arguments.
plt.rc(
    "font",
    family="sans-serif",
    size=9,
    # "sans-serif" is not a valid keyword name, so it is passed by unpacking.
    **{"sans-serif": ["Helvetica", "Helvetica Neue", "Arial", "DejaVu Sans"]},
)
plt.rc("axes", titlesize=10, labelsize=9)
plt.rc(("xtick", "ytick"), labelsize=8)
plt.rc("legend", fontsize=7.5)
plt.rc("figure", dpi=300)
plt.rc("savefig", dpi=300, bbox="tight", pad_inches=0.1)


def load_stats():
    """Load the pre-computed statistics written by analyze_corpus.py.

    Returns:
        The parsed JSON content of ``STATS_PATH``.

    Raises:
        SystemExit: with status 1 when the stats file does not exist; the
            error message is written to stderr so it is not mixed into
            regular progress output.
        json.JSONDecodeError: when the file is not valid JSON.
    """
    try:
        # Decode explicitly as UTF-8 instead of relying on the locale default.
        raw = STATS_PATH.read_text(encoding="utf-8")
    except FileNotFoundError:
        print(
            "Error: corpus_stats.json not found. Run: python3 analyze_corpus.py",
            file=sys.stderr,
        )
        sys.exit(1)
    return json.loads(raw)


def classify_format(scenario: str) -> list[str]:
    """Tag a scenario with every question format whose cue phrases it contains.

    Matching is a case-insensitive substring test. Formats are returned in a
    fixed order (calculation, design, conceptual, optimization, diagnosis,
    tradeoff); when no cue matches, the result is ``["conceptual"]``.
    """
    cue_table = (
        ("calculation", ("calculate", "compute", "estimate", "how many", "how much")),
        ("design", ("design", "architect", "propose", "how would you build")),
        ("conceptual", ("explain", "what is", "define", "describe")),
        ("optimization", ("optimize", "improve", "reduce", "speed up")),
        ("diagnosis", ("diagnose", "debug", "why is", "root cause", "fails")),
        ("tradeoff", ("compare", "trade-off", "tradeoff", "versus", " vs ")),
    )
    text = scenario.lower()
    matched = [
        label
        for label, cues in cue_table
        if any(cue in text for cue in cues)
    ]
    return matched or ["conceptual"]


# ── Figure 1: Track × Level Heatmap + Competency Bars ───────────
def fig_corpus_distribution(stats):
    """Render the two-panel corpus overview and save it as a PDF.

    Left panel: a track × level heatmap of the counts found in
    ``stats["track_level_matrix"]["data"]``, with each row total folded into
    its y tick label. Right panel: horizontal bars for
    ``stats["competency_areas"]``, drawn in the order that mapping provides
    (no sorting happens here).

    A track or level missing from the matrix is plotted as 0 rather than
    raising ``KeyError``, which matches how the zone × level heatmap treats
    missing cells.

    Args:
        stats: Parsed corpus_stats.json content.

    Writes ``fig-corpus-distribution.pdf`` into ``FIGURES_DIR``.
    """
    # Wide enough: heatmap + cbar + row totals, then gap, then bar panel
    fig, (ax_heat, ax_bar) = plt.subplots(
        1, 2, figsize=(7.8, 3.35), width_ratios=[1.35, 1],
        gridspec_kw={"wspace": 0.42},
    )

    # Heatmap from stats; absent cells default to 0.
    cells = stats["track_level_matrix"]["data"]
    matrix = np.zeros((len(TRACKS), len(LEVELS)), dtype=int)
    for i, track in enumerate(TRACKS):
        track_row = cells.get(track, {})
        for j, level in enumerate(LEVELS):
            matrix[i, j] = track_row.get(level, 0)

    # Row sums on the same line as the track (avoids colorbar + extra-column clash)
    row_totals = [int(row.sum()) for row in matrix]
    y_hm_labels = [f"{t.capitalize()} ({n:,})" for t, n in zip(TRACKS, row_totals)]

    sns.heatmap(
        matrix, ax=ax_heat, annot=True, fmt="d",
        xticklabels=LEVELS, yticklabels=y_hm_labels,
        cmap="Blues", linewidths=0.5, linecolor="white",
        annot_kws={"size": 7.5},
        cbar_kws={
            "label": "Questions",
            "shrink": 0.72,
            "pad": 0.04,
        },
    )
    ax_heat.set_xlabel("Mastery level", labelpad=6)
    ax_heat.set_ylabel("Deployment track", labelpad=6)
    ax_heat.tick_params(axis="y", which="major", labelsize=7, rotation=0)

    # Competency bar chart from stats, in the order stored in the stats file.
    competency = stats["competency_areas"]
    labels = list(competency.keys())
    counts = list(competency.values())

    # Color by semantic category; areas not listed here fall back to gray.
    category_color = {
        **dict.fromkeys(("compute", "memory", "architecture", "parallelism"), BLUE),
        **dict.fromkeys(("deployment", "data", "networking"), GREEN),
        **dict.fromkeys(("latency", "precision", "optimization"), ORANGE),
        **dict.fromkeys(("power", "reliability"), RED),
    }
    area_colors = [category_color.get(area, GRAY) for area in labels]

    positions = range(len(labels))
    bars = ax_bar.barh(positions, counts, color=area_colors, alpha=0.8, height=0.7)
    ax_bar.set_yticks(positions)
    ax_bar.set_yticklabels(labels, fontsize=7.5)
    ax_bar.invert_yaxis()
    ax_bar.set_ylabel("Competency area", labelpad=4)
    ax_bar.set_xlabel("Questions", labelpad=3)
    # title in LaTeX caption

    # Count labels at bar end; expand x so labels do not clip
    nmax = max(counts) if counts else 0
    for bar, count in zip(bars, counts):
        ax_bar.text(
            bar.get_width() + 0.02 * nmax,
            bar.get_y() + bar.get_height() / 2,
            f"{count:,}",
            va="center",
            fontsize=6.5,
            color="#555",
        )
    if nmax:
        ax_bar.set_xlim(0, nmax * 1.2)

    for ax in (ax_heat, ax_bar):
        for spine in ax.spines.values():
            spine.set_linewidth(0.8)
    # No tight_layout: seaborn heatmap + cbar is not always compatible; bbox tight handles margins.

    fig.savefig(FIGURES_DIR / "fig-corpus-distribution.pdf")
    print("  Saved figures/fig-corpus-distribution.pdf")
    plt.close(fig)


# ── Figure 2: Question Format by Level (Stacked Bar) ────────────
def fig_format_balance(stats):
    """Draw one stacked bar per mastery level showing the question-format mix.

    Values come from ``stats["format_by_level"][level]["format_pct"]`` for each
    entry of ``LEVELS``; a format absent from a level contributes 0. Segments
    with a value above 10 get a rounded percentage label at their midpoint.

    Args:
        stats: Parsed corpus_stats.json content.

    Writes ``fig-format-balance.pdf`` into ``FIGURES_DIR``.
    """
    # Legend above axes so it does not collide with two-line x tick labels
    fig, ax = plt.subplots(figsize=(4.8, 3.55))

    format_order = (
        "calculation", "design", "conceptual",
        "optimization", "diagnosis", "tradeoff",
    )
    per_level = stats["format_by_level"]
    shares = {
        name: [per_level[level]["format_pct"].get(name, 0) for level in LEVELS]
        for name in format_order
    }

    positions = np.arange(len(LEVELS))
    bar_width = 0.65
    stack_top = np.zeros(len(LEVELS))  # running height of each level's stack

    for name in format_order:
        heights = shares[name]
        ax.bar(
            positions, heights, bar_width, bottom=stack_top,
            label=name.capitalize(),
            color=FORMAT_COLORS[name],
            edgecolor=FORMAT_EDGES[name],
            linewidth=0.5,
        )
        # Label percentages > 10%, centred vertically inside the segment
        for xpos, base, height in zip(positions, stack_top, heights):
            if height > 10:
                ax.text(
                    xpos, base + height / 2, f"{height:.0f}%",
                    ha="center", va="center", fontsize=5.5, color="#333", zorder=3,
                )
        stack_top += heights

    # Two-line tick labels: the level code above its Bloom label.
    tick_text = [f"{level}\n({BLOOM_LABELS[level]})" for level in LEVELS]
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_text, fontsize=6.5, ma="center")
    plt.setp(ax.get_xticklabels(), linespacing=1.12)

    ax.set_ylabel("Percent of questions in level", labelpad=5)
    ax.set_xlabel("Mastery level", labelpad=4)
    # title in LaTeX caption
    ax.set_ylim(0, 105)
    ax.set_xlim(positions.min() - 0.5, positions.max() + 0.5)
    ax.yaxis.set_major_locator(ticker.MultipleLocator(20))
    ax.set_axisbelow(True)
    ax.grid(axis="y", linestyle=":", alpha=0.4, linewidth=0.8, zorder=0)
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_linewidth(0.8)
    ax.tick_params(axis="both", which="major", width=0.8, length=3.5)

    ax.legend(
        loc="lower center", bbox_to_anchor=(0.5, 1.0), ncol=3, fontsize=6.5,
        frameon=True, framealpha=0.95, edgecolor="#cccccc", fancybox=False,
    )
    fig.subplots_adjust(top=0.80, bottom=0.20, left=0.12, right=0.98)

    out_path = FIGURES_DIR / "fig-format-balance.pdf"
    fig.savefig(out_path, bbox_inches="tight", pad_inches=0.12)
    print("  Saved figures/fig-format-balance.pdf")
    plt.close(fig)


# ── Figure 3: Zone Distribution (Bar Chart) ──────────────────
def fig_zone_distribution(stats):
    """Draw a horizontal bar chart of question counts per zone.

    Zones are read from ``stats["zone_distribution"]`` and ordered by count,
    largest first. Each bar is annotated with its count and its share of the
    total. Bars are colored by zone type (pure / compound / mastery); a zone
    name that belongs to none of those groups is drawn in gray and gets its
    own "Other" legend entry instead of being presented as a compound zone.

    The figure is skipped (with a console notice) when the stats have no
    ``zone_distribution`` or when all of its counts sum to zero, since
    percentages are undefined in that case.

    Args:
        stats: Parsed corpus_stats.json content.

    Writes ``fig-zone-distribution.pdf`` into ``FIGURES_DIR``.
    """
    zd = stats.get("zone_distribution", {})
    if not zd:
        print("  ⚠️ No zone_distribution in stats, skipping")
        return

    # Order by count descending
    sorted_zones = sorted(zd.items(), key=lambda item: -item[1])
    labels = [zone for zone, _ in sorted_zones]
    counts = [count for _, count in sorted_zones]

    total = sum(counts)
    if total == 0:
        # Avoids a ZeroDivisionError in the percentage labels below.
        print("  ⚠️ zone_distribution has no questions, skipping")
        return

    # Color by zone type
    pure_zones = {"recall", "analyze", "design", "implement"}
    compound_zones = {
        "diagnosis", "specification", "fluency",
        "evaluation", "realization", "optimization",
    }

    zone_colors = []
    for zone in labels:
        if zone == "mastery":
            zone_colors.append(CRIMSON)
        elif zone in pure_zones:
            zone_colors.append(BLUE)
        elif zone in compound_zones:
            zone_colors.append(GREEN)
        else:
            zone_colors.append(GRAY)
    has_other = any(
        zone != "mastery" and zone not in pure_zones and zone not in compound_zones
        for zone in labels
    )

    cmax = max(counts)
    fig, ax = plt.subplots(figsize=(4.7, 3.1))
    positions = range(len(labels))
    bars = ax.barh(positions, counts, color=zone_colors, alpha=0.85, height=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels([zone.capitalize() for zone in labels], fontsize=7.5)
    ax.invert_yaxis()
    ax.set_xlabel("Questions", labelpad=4)
    # title in LaTeX caption

    for bar, count in zip(bars, counts):
        pct = 100 * count / total
        ax.text(
            bar.get_width() + 0.01 * cmax,
            bar.get_y() + bar.get_height() / 2,
            f"{count:,} ({pct:.1f}%)",
            va="center",
            fontsize=6.2,
            color="#555",
        )
    # Extra room on the right so the count/percent labels do not clip.
    ax.set_xlim(0, cmax * 1.14)

    # Legend
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor=BLUE, alpha=0.85, label="Pure (single skill)"),
        Patch(facecolor=GREEN, alpha=0.85, label="Compound (two skills)"),
        Patch(facecolor=CRIMSON, alpha=0.85, label="Mastery (all four)"),
    ]
    if has_other:
        legend_elements.append(Patch(facecolor=GRAY, alpha=0.85, label="Other"))
    ax.legend(handles=legend_elements, loc="lower right", fontsize=6.5, frameon=True, framealpha=0.95)
    for spine in ax.spines.values():
        spine.set_linewidth(0.8)
    fig.subplots_adjust(left=0.22, right=0.98, top=0.98, bottom=0.12)
    fig.savefig(FIGURES_DIR / "fig-zone-distribution.pdf", bbox_inches="tight", pad_inches=0.1)
    print("  Saved figures/fig-zone-distribution.pdf")
    plt.close(fig)


# ── Figure 4: Zone × Level Heatmap ───────────────────────────
def fig_zone_level_heatmap(stats):
    """Draw the zone × mastery-level heatmap of question counts.

    Rows follow a fixed zone order and columns follow ``LEVELS``. Any zone or
    level missing from ``stats["zone_level_matrix"]`` is shown as 0. The figure
    is skipped (with a console notice) when the matrix is absent or empty.

    Args:
        stats: Parsed corpus_stats.json content.

    Writes ``fig-zone-level-heatmap.pdf`` into ``FIGURES_DIR``.
    """
    zone_matrix = stats.get("zone_level_matrix", {})
    if not zone_matrix:
        print("  ⚠️ No zone_level_matrix in stats, skipping")
        return

    zone_rows = (
        "recall", "implement", "fluency",
        "analyze", "diagnosis",
        "design", "specification", "optimization",
        "evaluation", "realization",
        "mastery",
    )

    # NOTE(review): zone rows are looked up directly on zone_level_matrix, while
    # Figure 1 reads track rows under a "data" key — confirm against
    # analyze_corpus.py that the two matrices really are shaped differently.
    grid = np.zeros((len(zone_rows), len(LEVELS)), dtype=int)
    for row_idx, zone in enumerate(zone_rows):
        level_counts = zone_matrix.get(zone, {})
        grid[row_idx, :] = [level_counts.get(level, 0) for level in LEVELS]

    row_labels = [zone.capitalize() for zone in zone_rows]
    colorbar_opts = {"label": "Questions", "shrink": 0.68, "pad": 0.02}

    fig, ax = plt.subplots(figsize=(4.7, 3.5))
    sns.heatmap(
        grid,
        ax=ax,
        annot=True,
        fmt="d",
        xticklabels=LEVELS,
        yticklabels=row_labels,
        cmap="YlOrRd",
        linewidths=0.4,
        linecolor="white",
        annot_kws={"size": 5.5},
        cbar_kws=colorbar_opts,
    )
    # title in LaTeX caption — match typography with other data figures
    ax.set_xlabel("Mastery level", labelpad=5)
    ax.set_ylabel("Cognitive zone", labelpad=5)
    ax.tick_params(axis="x", which="major", labelsize=7.5)
    ax.tick_params(axis="y", which="major", labelsize=6.0, pad=2)
    for spine in ax.spines.values():
        spine.set_linewidth(0.8)
    # Leave room for 11 y-labels and colorbar; avoid tight_layout+heatmap cbar glitches
    fig.subplots_adjust(left=0.22, right=0.90, top=0.98, bottom=0.12)

    out_path = FIGURES_DIR / "fig-zone-level-heatmap.pdf"
    fig.savefig(out_path, bbox_inches="tight", pad_inches=0.1)
    print("  Saved figures/fig-zone-level-heatmap.pdf")
    plt.close(fig)


# ── Main ────────────────────────────────────────────────────────
def _fingerprint_line(entry, fallback_label):
    """Return the fingerprint summary line for one `_meta` entry, or None.

    An entry is reported only when it is a dict carrying a truthy
    ``sha256_12``; ``fallback_label`` is shown when the entry has no ``path``.
    """
    if not isinstance(entry, dict) or not entry.get("sha256_12"):
        return None
    return (
        f"    {entry.get('path', fallback_label)}: {entry.get('bytes', 0):,} bytes, "
        f"sha256…{entry['sha256_12']}"
    )


def _print_provenance(meta):
    """Print the provenance header and input-file fingerprints from `_meta`."""
    source = meta.get("source", "unknown")
    print(
        f"  Provenance: {meta.get('generated_utc', '?')} "
        f"({meta.get('pipeline', 'analyze_corpus')})  [source={source}]"
    )

    # The corpus fingerprint is read from "data", falling back to "corpus".
    corpus_entry = meta.get("data") or meta.get("corpus")
    line = _fingerprint_line(corpus_entry, "data")
    if line is not None:
        print(line)
        if corpus_entry.get("resolved_path"):
            print(f"      ← {corpus_entry['resolved_path']}")

    for key in ("chains_registry", "taxonomy_data_yaml", "chains"):
        line = _fingerprint_line(meta.get(key), key)
        if line is not None:
            print(line)


def main():
    """Load corpus_stats.json, report its provenance, and render every figure.

    Prints the published/chain totals from ``stats["summary"]``, then either
    the ``_meta`` provenance block or a hint on how to generate it. The
    output directory is created if it does not exist so that ``savefig`` does
    not fail on a fresh checkout.
    """
    print("Generating paper figures from corpus_stats.json...\n")
    stats = load_stats()
    print(f"  Published: {stats['summary']['published']}")
    print(f"  Chains: {stats['summary']['chains_total']}")

    meta = stats.get("_meta", {})
    if meta:
        _print_provenance(meta)
    else:
        print(
            "  [Hint] Re-run `python3 scripts/analyze_corpus.py` after changing the vault to "
            "record `_meta` (input file fingerprints) in corpus_stats.json."
        )
    print()

    # savefig does not create missing parent directories.
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)

    fig_corpus_distribution(stats)
    fig_format_balance(stats)

    fig_zone_distribution(stats)
    fig_zone_level_heatmap(stats)

    print(f"\nDone. All figures saved to {FIGURES_DIR}/")


# Render the figures only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()