Files
cs249r_book/interviews/paper/scripts/generate_figures.py
Vijay Janapa Reddi 2b381bb949 refactor(vault-cli): rename --legacy-json to --local-json
The flag is the StaffML frontend's local-dev fallback (read corpus.json
from disk via NEXT_PUBLIC_VAULT_FALLBACK=static), not a deprecated path.
"Legacy" implied "soon to be removed"; "local-json" describes its actual
role and reads correctly in scripts and docs.

- vault-cli: rename CLI flag, parameter, result key, and help text.
- CI workflows + pre-commit config: invoke the new flag name.
- All scripts that print the command (suggest_exemplars,
  pre_commit_corpus_guard, promote_validated, rename_legacy_ids,
  export_to_staffml, the paper analyze_corpus/generate_*) updated.
- Comments and docs (ARCHITECTURE, CHANGELOG, REVIEWS, TESTING,
  MASSIVE_BUILD_RUNBOOK, DEPRECATED, AUTHORING, plus frontend
  comments and .env.example / .gitignore) updated.

The "legacy_json" sentinel string in corpus_stats.json._meta.source
is intentionally NOT renamed — it is a stable artifact format read
by downstream paper-generation tooling.
2026-04-30 09:30:28 -04:00

421 lines
15 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""Generate publication-quality data figures for the StaffML paper.
Pipeline: generated corpus.json (``vault build --local-json``) + chains
→ analyze_corpus.py → corpus_stats.json → THIS → PDFs
Run: python3 generate_figures.py
(or: make figures)
Reads: corpus_stats.json (structured stats from analyze_corpus.py)
Writes: fig-corpus-distribution.pdf, fig-format-balance.pdf,
fig-zone-distribution.pdf, fig-zone-level-heatmap.pdf
"""
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np
import seaborn as sns
# ── Config ──────────────────────────────────────────────────────
# Filesystem layout, all derived from the location of this script.
SCRIPTS_DIR = Path(__file__).parent
PAPER_DIR = SCRIPTS_DIR.parent
FIGURES_DIR = PAPER_DIR.joinpath("figures")            # where the PDFs are written
STATS_PATH = PAPER_DIR.joinpath("corpus_stats.json")   # stats file read by load_stats()

TRACKS = ["cloud", "edge", "mobile", "tinyml", "global"]

# Mastery levels paired with their Bloom's-taxonomy verbs, in display order.
_LEVEL_BLOOM_PAIRS = (
    ("L1", "Remember"),
    ("L2", "Understand"),
    ("L3", "Apply"),
    ("L4", "Analyze"),
    ("L5", "Evaluate"),
    ("L6+", "Create"),
)
LEVELS = [level for level, _verb in _LEVEL_BLOOM_PAIRS]
BLOOM_LABELS = dict(_LEVEL_BLOOM_PAIRS)

# Harvard/MIT color palette
CRIMSON = "#A31F34"
BLUE = "#4A90C4"
GREEN = "#3D9E5A"
ORANGE = "#C87B2A"
RED = "#C44444"
GRAY = "#888888"

# One palette color per deployment track.
TRACK_COLORS = {
    "cloud": BLUE,
    "edge": GREEN,
    "mobile": ORANGE,
    "tinyml": CRIMSON,
    "global": GRAY,
}

# Question format -> (light fill, darker edge) used by the stacked-bar figure.
_FORMAT_STYLES = {
    "calculation": ("#cfe2f3", "#4a90c4"),   # blue — compute / processing
    "design": ("#d4edda", "#3d9e5a"),        # green — architecture / data flow
    "conceptual": ("#fdebd0", "#c87b2a"),    # orange — routing / scheduling
    # purple — improvement (distinct hue from green so the stacked-bar
    # reading is unambiguous)
    "optimization": ("#e7d8ed", "#7d4f96"),
    "diagnosis": ("#f9d6d5", "#c44"),        # red — failure / cost
    "tradeoff": ("#f7f7f7", "#bbb"),         # gray — neutral
}
FORMAT_COLORS = {name: fill for name, (fill, _edge) in _FORMAT_STYLES.items()}
FORMAT_EDGES = {name: edge for name, (_fill, edge) in _FORMAT_STYLES.items()}
def _apply_paper_rcparams():
    """Install the Matplotlib defaults used by every figure in the paper."""
    # Helvetica first so the PDFs match the SVG figures; the rest are fallbacks.
    typography = {
        "font.family": "sans-serif",
        "font.sans-serif": ["Helvetica", "Helvetica Neue", "Arial", "DejaVu Sans"],
        "font.size": 9,
        "axes.titlesize": 10,
        "axes.labelsize": 9,
        "xtick.labelsize": 8,
        "ytick.labelsize": 8,
        "legend.fontsize": 7.5,
    }
    # Resolution and bounding-box settings applied when figures are rendered/saved.
    output = {
        "figure.dpi": 300,
        "savefig.dpi": 300,
        "savefig.bbox": "tight",
        "savefig.pad_inches": 0.1,
    }
    plt.rcParams.update({**typography, **output})


_apply_paper_rcparams()
def load_stats():
    """Load the pre-computed corpus statistics from ``STATS_PATH``.

    Returns:
        The parsed JSON document (whatever ``json.loads`` yields for the file).

    Exits:
        Terminates the process with status 1, after printing a hint to
        stderr, when ``STATS_PATH`` does not exist.
    """
    if not STATS_PATH.exists():
        print(
            "Error: corpus_stats.json not found. Run: python3 analyze_corpus.py",
            file=sys.stderr,
        )
        sys.exit(1)
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    return json.loads(STATS_PATH.read_text(encoding="utf-8"))
def classify_format(scenario: str) -> list[str]:
    """Tag a scenario with every question format whose cue phrases it contains.

    Matching is a case-insensitive *substring* search, so a cue also matches
    inside a longer word (e.g. "design" inside "redesigned").

    Args:
        scenario: Free-text scenario/question.

    Returns:
        The matching format names in the fixed order calculation, design,
        conceptual, optimization, diagnosis, tradeoff; ``["conceptual"]``
        when no cue matches.
    """
    cue_table = (
        ("calculation", ("calculate", "compute", "estimate", "how many", "how much")),
        ("design", ("design", "architect", "propose", "how would you build")),
        ("conceptual", ("explain", "what is", "define", "describe")),
        ("optimization", ("optimize", "improve", "reduce", "speed up")),
        ("diagnosis", ("diagnose", "debug", "why is", "root cause", "fails")),
        ("tradeoff", ("compare", "trade-off", "tradeoff", "versus", " vs ")),
    )
    text = scenario.lower()
    matched = [
        label
        for label, cues in cue_table
        if any(cue in text for cue in cues)
    ]
    return matched or ["conceptual"]
# ── Figure 1: Track × Level Heatmap + Competency Bars ───────────
def fig_corpus_distribution(stats):
    """Render Figure 1: track × level heatmap beside a competency-area bar chart.

    Args:
        stats: Parsed corpus_stats.json. Reads
            ``stats["track_level_matrix"]["data"]`` (looked up as
            ``[track][level]``; a missing cell counts as 0) and
            ``stats["competency_areas"]`` (area name -> count, plotted in the
            mapping's own order).

    Side effects:
        Saves ``fig-corpus-distribution.pdf`` into ``FIGURES_DIR`` and prints
        a confirmation line.
    """
    # Wide enough: heatmap + cbar + row totals, then gap, then bar panel.
    # width_ratios is passed through gridspec_kw (rather than as a direct
    # keyword) so this also works on Matplotlib releases older than 3.6.
    fig, (ax_heat, ax_bar) = plt.subplots(
        1, 2, figsize=(7.8, 3.35),
        gridspec_kw={"wspace": 0.42, "width_ratios": [1.35, 1]},
    )

    # Heatmap from stats; absent track/level cells default to 0, matching
    # how the zone × level heatmap treats missing entries.
    cells = stats["track_level_matrix"]["data"]
    matrix = np.zeros((len(TRACKS), len(LEVELS)), dtype=int)
    for i, track in enumerate(TRACKS):
        per_level = cells.get(track, {})
        for j, level in enumerate(LEVELS):
            matrix[i, j] = per_level.get(level, 0)

    # Row sums on the same line as the track (avoids colorbar + extra-column clash)
    row_totals = [int(total) for total in matrix.sum(axis=1)]
    y_hm_labels = [f"{t.capitalize()} ({n:,})" for t, n in zip(TRACKS, row_totals)]
    sns.heatmap(
        matrix, ax=ax_heat, annot=True, fmt="d",
        xticklabels=LEVELS, yticklabels=y_hm_labels,
        cmap="Blues", linewidths=0.5, linecolor="white",
        annot_kws={"size": 7.5},
        cbar_kws={
            "label": "Questions",
            "shrink": 0.72,
            "pad": 0.04,
        },
    )
    ax_heat.set_xlabel("Mastery level", labelpad=6)
    ax_heat.set_ylabel("Deployment track", labelpad=6)
    ax_heat.tick_params(axis="y", which="major", labelsize=7, rotation=0)

    # Competency bar chart from stats (order is whatever the stats file provides)
    areas = list(stats["competency_areas"].items())
    labels = [area for area, _ in areas]
    counts = [count for _, count in areas]

    # Color by semantic category; anything unlisted falls back to gray.
    category_members = {
        BLUE: ("compute", "memory", "architecture", "parallelism"),
        GREEN: ("deployment", "data", "networking"),
        ORANGE: ("latency", "precision", "optimization"),
        RED: ("power", "reliability"),
    }
    color_of = {
        area: color
        for color, members in category_members.items()
        for area in members
    }
    area_colors = [color_of.get(area, GRAY) for area in labels]

    positions = range(len(labels))
    bars = ax_bar.barh(positions, counts, color=area_colors, alpha=0.8, height=0.7)
    ax_bar.set_yticks(positions)
    ax_bar.set_yticklabels(labels, fontsize=7.5)
    ax_bar.invert_yaxis()
    ax_bar.set_ylabel("Competency area", labelpad=4)
    ax_bar.set_xlabel("Questions", labelpad=3)
    # title in LaTeX caption

    # Count labels at bar end; expand x so labels do not clip
    nmax = max(counts, default=0)
    for bar, count in zip(bars, counts):
        ax_bar.text(
            bar.get_width() + 0.02 * nmax,
            bar.get_y() + bar.get_height() / 2,
            f"{count:,}",
            va="center",
            fontsize=6.5,
            color="#555",
        )
    if nmax:
        ax_bar.set_xlim(0, nmax * 1.2)

    # title in LaTeX caption (removed suptitle)
    for ax in (ax_heat, ax_bar):
        for spine in ax.spines.values():
            spine.set_linewidth(0.8)

    # No tight_layout: seaborn heatmap + cbar is not always compatible; bbox tight handles margins.
    fig.savefig(FIGURES_DIR / "fig-corpus-distribution.pdf")
    print(" Saved figures/fig-corpus-distribution.pdf")
    plt.close(fig)
# ── Figure 2: Question Format by Level (Stacked Bar) ────────────
def fig_format_balance(stats):
    """Render Figure 2: stacked bars of question-format share per mastery level.

    Args:
        stats: Parsed corpus_stats.json. Reads
            ``stats["format_by_level"][level]["format_pct"][fmt]``; a level or
            format that is absent contributes 0 to the stack.

    Side effects:
        Saves ``fig-format-balance.pdf`` into ``FIGURES_DIR`` and prints a
        confirmation line.
    """
    # Legend above axes so it does not collide with two-line x tick labels
    fig, ax = plt.subplots(figsize=(4.8, 3.55))
    formats = ["calculation", "design", "conceptual", "optimization", "diagnosis", "tradeoff"]
    fbl = stats["format_by_level"]

    x = np.arange(len(LEVELS))
    width = 0.65
    bottom = np.zeros(len(LEVELS))  # running top of the stack for each level
    for fmt in formats:
        values = [
            fbl.get(level, {}).get("format_pct", {}).get(fmt, 0)
            for level in LEVELS
        ]
        ax.bar(
            x, values, width, bottom=bottom,
            label=fmt.capitalize(),
            color=FORMAT_COLORS[fmt],
            edgecolor=FORMAT_EDGES[fmt],
            linewidth=0.5,
        )
        # Label percentages > 10%, centred vertically inside their segment
        for xi, base, v in zip(x, bottom, values):
            if v > 10:
                ax.text(
                    xi, base + v / 2, f"{v:.0f}%",
                    ha="center", va="center", fontsize=5.5, color="#333", zorder=3,
                )
        bottom += values

    ax.set_xticks(x)
    xlabels = [f"{level}\n({BLOOM_LABELS[level]})" for level in LEVELS]
    ax.set_xticklabels(xlabels, fontsize=6.5, ma="center")
    plt.setp(ax.get_xticklabels(), linespacing=1.12)
    ax.set_ylabel("Percent of questions in level", labelpad=5)
    ax.set_xlabel("Mastery level", labelpad=4)
    # title in LaTeX caption
    ax.set_ylim(0, 105)
    ax.set_xlim(x.min() - 0.5, x.max() + 0.5)
    ax.yaxis.set_major_locator(ticker.MultipleLocator(20))
    ax.set_axisbelow(True)
    ax.grid(axis="y", linestyle=":", alpha=0.4, linewidth=0.8, zorder=0)
    for spine in ax.spines.values():
        spine.set_visible(True)
        spine.set_linewidth(0.8)
    ax.tick_params(axis="both", which="major", width=0.8, length=3.5)
    ax.legend(
        loc="lower center", bbox_to_anchor=(0.5, 1.0), ncol=3, fontsize=6.5,
        frameon=True, framealpha=0.95, edgecolor="#cccccc", fancybox=False,
    )
    fig.subplots_adjust(
        top=0.80,
        bottom=0.20,
        left=0.12,
        right=0.98,
    )
    fig.savefig(FIGURES_DIR / "fig-format-balance.pdf", bbox_inches="tight", pad_inches=0.12)
    print(" Saved figures/fig-format-balance.pdf")
    plt.close(fig)
# ── Figure 3: Zone Distribution (Bar Chart) ──────────────────
def fig_zone_distribution(stats):
    """Render Figure 3: horizontal bars of question counts per zone.

    Args:
        stats: Parsed corpus_stats.json. Reads the optional
            ``stats["zone_distribution"]`` mapping (zone name -> count).

    Side effects:
        Prints a notice and returns without plotting when the mapping is
        missing or empty; otherwise saves ``fig-zone-distribution.pdf`` into
        ``FIGURES_DIR`` and prints a confirmation line.
    """
    from matplotlib.patches import Patch

    zd = stats.get("zone_distribution", {})
    if not zd:
        print(" ⚠️ No zone_distribution in stats, skipping")
        return

    # Order by count descending (ties keep the order they have in the stats)
    sorted_zones = sorted(zd.items(), key=lambda item: item[1], reverse=True)
    labels = [zone for zone, _ in sorted_zones]
    counts = [count for _, count in sorted_zones]

    # Color by zone type: "mastery" is crimson, the pure zones are blue, and
    # every other zone is drawn green (the compound zones: diagnosis,
    # specification, fluency, evaluation, realization, optimization).
    pure_zones = {"recall", "analyze", "design", "implement"}
    zone_colors = []
    for zone in labels:
        if zone == "mastery":
            zone_colors.append(CRIMSON)
        elif zone in pure_zones:
            zone_colors.append(BLUE)
        else:
            zone_colors.append(GREEN)

    cmax = max(counts) if counts else 1
    total = sum(counts)

    fig, ax = plt.subplots(figsize=(4.7, 3.1))
    positions = range(len(labels))
    bars = ax.barh(positions, counts, color=zone_colors, alpha=0.85, height=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels([zone.capitalize() for zone in labels], fontsize=7.5)
    ax.invert_yaxis()
    ax.set_xlabel("Questions", labelpad=4)
    # title in LaTeX caption

    for bar, count in zip(bars, counts):
        # Guard the share computation: all-zero counts would divide by zero.
        pct = 100 * count / total if total else 0.0
        ax.text(
            bar.get_width() + 0.01 * cmax,
            bar.get_y() + bar.get_height() / 2,
            f"{count:,} ({pct:.1f}%)",
            va="center",
            fontsize=6.2,
            color="#555",
        )
    # Keep a non-degenerate x range even when every count is zero.
    ax.set_xlim(0, cmax * 1.14 if cmax > 0 else 1)

    # Legend
    legend_elements = [
        Patch(facecolor=BLUE, alpha=0.85, label="Pure (single skill)"),
        Patch(facecolor=GREEN, alpha=0.85, label="Compound (two skills)"),
        Patch(facecolor=CRIMSON, alpha=0.85, label="Mastery (all four)"),
    ]
    ax.legend(handles=legend_elements, loc="lower right", fontsize=6.5, frameon=True, framealpha=0.95)
    for spine in ax.spines.values():
        spine.set_linewidth(0.8)
    fig.subplots_adjust(left=0.22, right=0.98, top=0.98, bottom=0.12)
    fig.savefig(FIGURES_DIR / "fig-zone-distribution.pdf", bbox_inches="tight", pad_inches=0.1)
    print(" Saved figures/fig-zone-distribution.pdf")
    plt.close(fig)
# ── Figure 4: Zone × Level Heatmap ───────────────────────────
def fig_zone_level_heatmap(stats):
    """Render Figure 4: heatmap of question counts per zone and mastery level.

    Args:
        stats: Parsed corpus_stats.json. Reads the optional
            ``stats["zone_level_matrix"]`` mapping, looked up as
            ``[zone][level]``; missing zones or levels count as 0.

    Side effects:
        Prints a notice and returns without plotting when the mapping is
        missing or empty; otherwise saves ``fig-zone-level-heatmap.pdf`` into
        ``FIGURES_DIR`` and prints a confirmation line.
    """
    zone_level_counts = stats.get("zone_level_matrix", {})
    if not zone_level_counts:
        print(" ⚠️ No zone_level_matrix in stats, skipping")
        return

    # Fixed row order for the heatmap; zones outside this list are not drawn.
    zone_rows = [
        "recall", "implement", "fluency",
        "analyze", "diagnosis",
        "design", "specification", "optimization",
        "evaluation", "realization",
        "mastery",
    ]
    matrix = np.array(
        [
            [zone_level_counts.get(zone, {}).get(level, 0) for level in LEVELS]
            for zone in zone_rows
        ],
        dtype=int,
    )
    row_labels = [zone.capitalize() for zone in zone_rows]

    fig, ax = plt.subplots(figsize=(4.7, 3.5))
    sns.heatmap(
        matrix,
        ax=ax,
        annot=True,
        fmt="d",
        xticklabels=LEVELS,
        yticklabels=row_labels,
        cmap="YlOrRd",
        linewidths=0.4,
        linecolor="white",
        annot_kws={"size": 5.5},
        cbar_kws={"label": "Questions", "shrink": 0.68, "pad": 0.02},
    )

    # title in LaTeX caption — match typography with other data figures
    ax.set_xlabel("Mastery level", labelpad=5)
    ax.set_ylabel("Cognitive zone", labelpad=5)
    ax.tick_params(axis="x", which="major", labelsize=7.5)
    ax.tick_params(axis="y", which="major", labelsize=6.0, pad=2)
    for spine in ax.spines.values():
        spine.set_linewidth(0.8)

    # Leave room for 11 y-labels and colorbar; avoid tight_layout+heatmap cbar glitches
    fig.subplots_adjust(left=0.22, right=0.90, top=0.98, bottom=0.12)
    out_path = FIGURES_DIR / "fig-zone-level-heatmap.pdf"
    fig.savefig(out_path, bbox_inches="tight", pad_inches=0.1)
    print(" Saved figures/fig-zone-level-heatmap.pdf")
    plt.close(fig)
# ── Main ────────────────────────────────────────────────────────
def _print_fingerprint(entry, default_label):
    """Print one input-file fingerprint line: path, size in bytes, short SHA-256."""
    print(
        f" {entry.get('path', default_label)}: {entry.get('bytes', 0):,} bytes, "
        f"sha256…{entry['sha256_12']}"
    )


def _has_fingerprint(entry):
    """Return True when *entry* is a dict carrying a non-empty ``sha256_12``."""
    return isinstance(entry, dict) and bool(entry.get("sha256_12"))


def main():
    """Load corpus_stats.json, report its provenance, and write all four figures.

    Prints the published/chain totals from ``stats["summary"]``, then the
    ``_meta`` provenance block when present (or a hint on how to produce one),
    and finally calls each ``fig_*`` generator in turn.
    """
    print("Generating paper figures from corpus_stats.json...\n")
    stats = load_stats()
    print(f" Published: {stats['summary']['published']}")
    print(f" Chains: {stats['summary']['chains_total']}")

    meta = stats.get("_meta", {})
    if meta:
        sk = meta.get("source", "unknown")
        print(
            f" Provenance: {meta.get('generated_utc', '?')} "
            f"({meta.get('pipeline', 'analyze_corpus')}) [source={sk}]"
        )
        # The primary input is recorded under "data", with "corpus" as fallback.
        primary = meta.get("data") or meta.get("corpus")
        if _has_fingerprint(primary):
            _print_fingerprint(primary, "data")
            if primary.get("resolved_path"):
                print(f"{primary['resolved_path']}")
        for key in ("chains_registry", "taxonomy_data_yaml", "chains"):
            secondary = meta.get(key)
            if _has_fingerprint(secondary):
                _print_fingerprint(secondary, key)
    else:
        print(
            " [Hint] Re-run `python3 scripts/analyze_corpus.py` after changing the vault to "
            "record `_meta` (input file fingerprints) in corpus_stats.json."
        )
    print()

    # Make sure the output directory exists so savefig does not fail on a
    # checkout where figures/ has not been created yet.
    FIGURES_DIR.mkdir(parents=True, exist_ok=True)

    fig_corpus_distribution(stats)
    fig_format_balance(stats)
    fig_zone_distribution(stats)
    fig_zone_level_heatmap(stats)
    print(f"\nDone. All figures saved to {FIGURES_DIR}/")


if __name__ == "__main__":
    main()