mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-05-07 02:03:55 -05:00
Replace the 839 LLM-extracted concepts with 79 human-curated topics organized into 13 competency areas. Each topic has typed edges (prerequisite, broader, narrower, related) using SKOS vocabulary. Introduce the ikigai competency model: 4 fundamental skills (recall, analyze, design, implement) whose intersections produce 11 cognitive zones for classifying HOW questions test topics. Schema defined in LinkML (staffml_taxonomy.yaml) which generates Pydantic, JSON Schema, and TypeScript from a single source of truth. Key files: - schema/staffml_taxonomy.yaml: LinkML schema definition - schema/taxonomy_data.yaml: 79 topics + 123 typed edges - schema/zones.py: ikigai zone model (4 skills x 11 zones) - schema/graph.py: NetworkX graph explorer + Graphviz DOT export - schema/resolve.py: maps corpus questions to new topic+zone system - topics.json: simplified JSON view (auto-generated from YAML) - topic_schema.py: Pydantic validator with DAG cycle detection
681 lines
23 KiB
JSON
681 lines
23 KiB
JSON
{
|
|
"version": "1.0.0",
|
|
"description": "Curated knowledge graph of ML systems topics for Staff-level technical interview preparation. 79 topics across 13 competency areas with prerequisite, hierarchical, and lateral relationships.",
|
|
"last_updated": "2026-03-30",
|
|
"areas": [
|
|
"architecture",
|
|
"compute",
|
|
"cross-cutting",
|
|
"data",
|
|
"deployment",
|
|
"latency",
|
|
"memory",
|
|
"networking",
|
|
"optimization",
|
|
"parallelism",
|
|
"power",
|
|
"precision",
|
|
"reliability"
|
|
],
|
|
"topics": [
|
|
{
|
|
"id": "roofline-analysis",
|
|
"name": "Roofline Analysis",
|
|
"area": "compute",
|
|
"prerequisites": [],
|
|
"description": "Using the roofline model to diagnose compute-bound vs memory-bound workloads and predict accelerator utilization."
|
|
},
|
|
{
|
|
"id": "gpu-compute-architecture",
|
|
"name": "GPU Compute Architecture",
|
|
"area": "compute",
|
|
"prerequisites": [
|
|
"roofline-analysis"
|
|
],
|
|
"description": "GPU execution model: warps, thread blocks, occupancy, Tensor Cores, and memory coalescing."
|
|
},
|
|
{
|
|
"id": "accelerator-comparison",
|
|
"name": "Accelerator Comparison",
|
|
"area": "compute",
|
|
"prerequisites": [
|
|
"roofline-analysis"
|
|
],
|
|
"description": "Comparing CPUs, GPUs, TPUs, NPUs, and custom ASICs across the programmability-efficiency spectrum."
|
|
},
|
|
{
|
|
"id": "mcu-compute-constraints",
|
|
"name": "MCU Compute Constraints",
|
|
"area": "compute",
|
|
"prerequisites": [
|
|
"roofline-analysis"
|
|
],
|
|
"description": "Compute limits on microcontrollers: no FPU, SIMD utilization (CMSIS-NN), MHz-level clock speeds, integer-only arithmetic."
|
|
},
|
|
{
|
|
"id": "systolic-dataflow",
|
|
"name": "Systolic Arrays & Dataflow",
|
|
"area": "compute",
|
|
"prerequisites": [
|
|
"gpu-compute-architecture"
|
|
],
|
|
"description": "Systolic array architectures, weight-stationary vs output-stationary dataflows, and tiling strategies."
|
|
},
|
|
{
|
|
"id": "compute-cost-estimation",
|
|
"name": "Compute Cost Estimation",
|
|
"area": "compute",
|
|
"prerequisites": [
|
|
"roofline-analysis"
|
|
],
|
|
"description": "Estimating FLOPs, GPU-hours, and dollar cost for training and inference workloads."
|
|
},
|
|
{
|
|
"id": "vram-budgeting",
|
|
"name": "VRAM Budgeting",
|
|
"area": "memory",
|
|
"prerequisites": [],
|
|
"description": "Accounting for weights, optimizer state, activations, and KV-cache in GPU memory."
|
|
},
|
|
{
|
|
"id": "kv-cache-management",
|
|
"name": "KV-Cache Management",
|
|
"area": "memory",
|
|
"prerequisites": [
|
|
"vram-budgeting"
|
|
],
|
|
"description": "KV-cache sizing, paged attention, cache eviction policies, and memory pressure from long contexts."
|
|
},
|
|
{
|
|
"id": "memory-hierarchy-design",
|
|
"name": "Memory Hierarchy Design",
|
|
"area": "memory",
|
|
"prerequisites": [],
|
|
"description": "Registers, SRAM, HBM, DRAM, and storage tiers: capacity-bandwidth-latency tradeoffs."
|
|
},
|
|
{
|
|
"id": "activation-memory",
|
|
"name": "Activation Memory & Checkpointing",
|
|
"area": "memory",
|
|
"prerequisites": [
|
|
"vram-budgeting"
|
|
],
|
|
"description": "Forward-pass activation storage, gradient checkpointing, and the compute-memory tradeoff."
|
|
},
|
|
{
|
|
"id": "memory-mapped-inference",
|
|
"name": "Memory-Mapped Inference",
|
|
"area": "memory",
|
|
"prerequisites": [
|
|
"memory-hierarchy-design"
|
|
],
|
|
"description": "Memory-mapped weight loading, mmap strategies, cold start avoidance, and shared memory across processes."
|
|
},
|
|
{
|
|
"id": "tensor-arena-planning",
|
|
"name": "Tensor Arena Planning",
|
|
"area": "memory",
|
|
"prerequisites": [
|
|
"memory-hierarchy-design"
|
|
],
|
|
"description": "Static memory planning for MCUs: flat tensor arenas, operator scheduling for peak SRAM, flash vs SRAM placement."
|
|
},
|
|
{
|
|
"id": "dma-data-movement",
|
|
"name": "DMA & Data Movement",
|
|
"area": "memory",
|
|
"prerequisites": [
|
|
"memory-hierarchy-design"
|
|
],
|
|
"description": "DMA transfers, zero-copy techniques, pinned memory, and host-device data movement overhead."
|
|
},
|
|
{
|
|
"id": "memory-pressure-management",
|
|
"name": "Memory Pressure Management",
|
|
"area": "memory",
|
|
"prerequisites": [
|
|
"vram-budgeting"
|
|
],
|
|
"description": "OOM handling, memory fragmentation, gradient accumulation to reduce batch memory, and OS-level eviction."
|
|
},
|
|
{
|
|
"id": "latency-decomposition",
|
|
"name": "Latency Decomposition",
|
|
"area": "latency",
|
|
"prerequisites": [],
|
|
"description": "Breaking end-to-end latency into components: TTFT, TPOT, network, preprocessing, and postprocessing."
|
|
},
|
|
{
|
|
"id": "batching-strategies",
|
|
"name": "Batching Strategies",
|
|
"area": "latency",
|
|
"prerequisites": [
|
|
"latency-decomposition"
|
|
],
|
|
"description": "Static, dynamic, and continuous batching: throughput-latency tradeoffs and scheduling policies."
|
|
},
|
|
{
|
|
"id": "tail-latency",
|
|
"name": "Tail Latency & SLAs",
|
|
"area": "latency",
|
|
"prerequisites": [
|
|
"latency-decomposition"
|
|
],
|
|
"description": "P99/P999 latency, straggler mitigation, hedged requests, and SLA-driven system design."
|
|
},
|
|
{
|
|
"id": "real-time-deadlines",
|
|
"name": "Real-Time Deadlines",
|
|
"area": "latency",
|
|
"prerequisites": [
|
|
"latency-decomposition"
|
|
],
|
|
"description": "Frame budgets (16ms/33ms), WCET analysis, jank prevention, ANR timeouts, and interrupt-driven pipelines."
|
|
},
|
|
{
|
|
"id": "profiling-bottleneck-analysis",
|
|
"name": "Profiling & Bottleneck Analysis",
|
|
"area": "latency",
|
|
"prerequisites": [
|
|
"latency-decomposition",
|
|
"roofline-analysis"
|
|
],
|
|
"description": "Using profilers, flame graphs, and trace tools to identify compute, memory, and I/O bottlenecks."
|
|
},
|
|
{
|
|
"id": "queueing-theory",
|
|
"name": "Queueing Theory",
|
|
"area": "latency",
|
|
"prerequisites": [
|
|
"latency-decomposition"
|
|
],
|
|
"description": "Little's Law, arrival rate vs service rate, queue depth sizing, and capacity planning."
|
|
},
|
|
{
|
|
"id": "quantization-fundamentals",
|
|
"name": "Quantization Fundamentals",
|
|
"area": "precision",
|
|
"prerequisites": [],
|
|
"description": "INT8/INT4 quantization, zero-point arithmetic, calibration strategies, PTQ vs QAT, per-tensor vs per-channel."
|
|
},
|
|
{
|
|
"id": "mixed-precision-training",
|
|
"name": "Mixed-Precision Training & Inference",
|
|
"area": "precision",
|
|
"prerequisites": [
|
|
"quantization-fundamentals"
|
|
],
|
|
"description": "FP16/BF16/FP8 formats, loss scaling, mixed-precision recipes, and precision-accuracy tradeoffs."
|
|
},
|
|
{
|
|
"id": "extreme-quantization",
|
|
"name": "Extreme Quantization",
|
|
"area": "precision",
|
|
"prerequisites": [
|
|
"quantization-fundamentals"
|
|
],
|
|
"description": "Sub-4-bit quantization, binary/ternary networks, GPTQ, AWQ, and accuracy recovery techniques."
|
|
},
|
|
{
|
|
"id": "power-budgeting",
|
|
"name": "Power Budgeting",
|
|
"area": "power",
|
|
"prerequisites": [],
|
|
"description": "TDP, power caps, DVFS P-states, the CMOS power equation, and energy-per-inference calculations."
|
|
},
|
|
{
|
|
"id": "thermal-management",
|
|
"name": "Thermal Management",
|
|
"area": "power",
|
|
"prerequisites": [
|
|
"power-budgeting"
|
|
],
|
|
"description": "Thermal throttling, sustained vs burst performance, cooling strategies, and ambient temperature effects."
|
|
},
|
|
{
|
|
"id": "energy-per-operation",
|
|
"name": "Energy Per Operation",
|
|
"area": "power",
|
|
"prerequisites": [
|
|
"power-budgeting"
|
|
],
|
|
"description": "Horowitz energy table, energy cost of memory access vs compute, and energy-aware operator selection."
|
|
},
|
|
{
|
|
"id": "duty-cycling",
|
|
"name": "Duty Cycling & Energy Harvesting",
|
|
"area": "power",
|
|
"prerequisites": [
|
|
"power-budgeting"
|
|
],
|
|
"description": "Sleep/wake scheduling, coin cell budgets, solar harvesting, and always-on vs triggered inference."
|
|
},
|
|
{
|
|
"id": "datacenter-efficiency",
|
|
"name": "Datacenter Efficiency",
|
|
"area": "power",
|
|
"prerequisites": [
|
|
"power-budgeting"
|
|
],
|
|
"description": "PUE, rack power budgets, liquid cooling, carbon-aware scheduling, and embodied vs operational carbon."
|
|
},
|
|
{
|
|
"id": "transformer-systems-cost",
|
|
"name": "Transformer Systems Cost",
|
|
"area": "architecture",
|
|
"prerequisites": [],
|
|
"description": "Scaling laws, parameter-FLOP relationships, attention complexity (O(n²) vs linear), and KV-cache growth."
|
|
},
|
|
{
|
|
"id": "cnn-efficient-design",
|
|
"name": "CNN Efficient Design",
|
|
"area": "architecture",
|
|
"prerequisites": [],
|
|
"description": "Depthwise separable convolutions, MobileNet/EfficientNet design, inverted residuals, and FLOP-accuracy tradeoffs."
|
|
},
|
|
{
|
|
"id": "attention-scaling",
|
|
"name": "Attention Scaling & Variants",
|
|
"area": "architecture",
|
|
"prerequisites": [
|
|
"transformer-systems-cost"
|
|
],
|
|
"description": "Multi-head, grouped-query, and multi-query attention; context length scaling; sliding window attention."
|
|
},
|
|
{
|
|
"id": "mixture-of-experts",
|
|
"name": "Mixture of Experts",
|
|
"area": "architecture",
|
|
"prerequisites": [
|
|
"transformer-systems-cost"
|
|
],
|
|
"description": "MoE routing, expert parallelism, capacity factors, load balancing, and the memory-FLOPs decoupling."
|
|
},
|
|
{
|
|
"id": "model-size-estimation",
|
|
"name": "Model Size Estimation",
|
|
"area": "architecture",
|
|
"prerequisites": [],
|
|
"description": "Parameter counting, memory footprint estimation, and feasibility checks for target hardware."
|
|
},
|
|
{
|
|
"id": "neural-architecture-search",
|
|
"name": "Neural Architecture Search",
|
|
"area": "architecture",
|
|
"prerequisites": [
|
|
"model-size-estimation"
|
|
],
|
|
"description": "Hardware-aware NAS, search spaces constrained by SRAM/FLOPs/latency, and MCUNet-style approaches."
|
|
},
|
|
{
|
|
"id": "encoder-decoder-tradeoffs",
|
|
"name": "Encoder-Decoder Tradeoffs",
|
|
"area": "architecture",
|
|
"prerequisites": [
|
|
"transformer-systems-cost",
|
|
"cnn-efficient-design"
|
|
],
|
|
"description": "Choosing architectures for deployment constraints: encoder-only vs decoder-only vs encoder-decoder system costs."
|
|
},
|
|
{
|
|
"id": "pruning-sparsity",
|
|
"name": "Pruning & Sparsity",
|
|
"area": "optimization",
|
|
"prerequisites": [],
|
|
"description": "Structured vs unstructured pruning, sparsity patterns for accelerator alignment, and accuracy-speedup tradeoffs."
|
|
},
|
|
{
|
|
"id": "knowledge-distillation",
|
|
"name": "Knowledge Distillation",
|
|
"area": "optimization",
|
|
"prerequisites": [],
|
|
"description": "Teacher-student training, logit matching, feature distillation, and when distillation beats pruning."
|
|
},
|
|
{
|
|
"id": "kernel-fusion",
|
|
"name": "Kernel & Operator Fusion",
|
|
"area": "optimization",
|
|
"prerequisites": [
|
|
"gpu-compute-architecture"
|
|
],
|
|
"description": "Fusing memory-bound operators, reducing kernel launch overhead, and custom CUDA kernel design."
|
|
},
|
|
{
|
|
"id": "graph-compilation",
|
|
"name": "Graph Compilation & Optimization",
|
|
"area": "optimization",
|
|
"prerequisites": [
|
|
"kernel-fusion"
|
|
],
|
|
"description": "Ahead-of-time compilation, operator lowering, constant folding, and framework compilers."
|
|
},
|
|
{
|
|
"id": "operator-scheduling",
|
|
"name": "Operator Scheduling",
|
|
"area": "optimization",
|
|
"prerequisites": [],
|
|
"description": "Execution order optimization for memory reuse, parallel operator execution, and layer fusion scheduling."
|
|
},
|
|
{
|
|
"id": "flash-attention",
|
|
"name": "IO-Aware Attention",
|
|
"area": "optimization",
|
|
"prerequisites": [
|
|
"kernel-fusion",
|
|
"roofline-analysis"
|
|
],
|
|
"description": "FlashAttention-style tiling, IO-aware algorithm design, and online softmax for memory-efficient attention."
|
|
},
|
|
{
|
|
"id": "speculative-decoding",
|
|
"name": "Speculative Decoding",
|
|
"area": "optimization",
|
|
"prerequisites": [
|
|
"latency-decomposition",
|
|
"transformer-systems-cost"
|
|
],
|
|
"description": "Draft-verify decoding, acceptance rates, draft model selection, and latency reduction for autoregressive generation."
|
|
},
|
|
{
|
|
"id": "data-parallelism",
|
|
"name": "Data Parallelism",
|
|
"area": "parallelism",
|
|
"prerequisites": [],
|
|
"description": "Replicated model training across devices, gradient averaging, FSDP/ZeRO memory sharding."
|
|
},
|
|
{
|
|
"id": "model-tensor-parallelism",
|
|
"name": "Model & Tensor Parallelism",
|
|
"area": "parallelism",
|
|
"prerequisites": [
|
|
"data-parallelism"
|
|
],
|
|
"description": "Splitting model layers across devices, column/row partitioning, and communication-computation overlap."
|
|
},
|
|
{
|
|
"id": "pipeline-parallelism",
|
|
"name": "Pipeline Parallelism",
|
|
"area": "parallelism",
|
|
"prerequisites": [
|
|
"data-parallelism"
|
|
],
|
|
"description": "Splitting model stages across devices, micro-batching, bubble overhead, and interleaved schedules."
|
|
},
|
|
{
|
|
"id": "3d-parallelism",
|
|
"name": "3D Parallelism",
|
|
"area": "parallelism",
|
|
"prerequisites": [
|
|
"data-parallelism",
|
|
"model-tensor-parallelism",
|
|
"pipeline-parallelism"
|
|
],
|
|
"description": "Combining data, tensor, and pipeline parallelism for frontier model training."
|
|
},
|
|
{
|
|
"id": "gradient-synchronization",
|
|
"name": "Gradient Synchronization",
|
|
"area": "parallelism",
|
|
"prerequisites": [
|
|
"data-parallelism",
|
|
"collective-communication"
|
|
],
|
|
"description": "AllReduce algorithms, gradient compression, asynchronous SGD, and stale gradient handling."
|
|
},
|
|
{
|
|
"id": "scheduling-resource-management",
|
|
"name": "Scheduling & Resource Management",
|
|
"area": "parallelism",
|
|
"prerequisites": [],
|
|
"description": "GPU scheduling, MIG/MPS partitioning, gang scheduling, preemption, and multi-tenant resource sharing."
|
|
},
|
|
{
|
|
"id": "collective-communication",
|
|
"name": "Collective Communication",
|
|
"area": "networking",
|
|
"prerequisites": [],
|
|
"description": "AllReduce, AllGather, ReduceScatter: algorithms, bandwidth-optimal implementations, and ring vs tree topologies."
|
|
},
|
|
{
|
|
"id": "interconnect-topology",
|
|
"name": "Interconnect Topology",
|
|
"area": "networking",
|
|
"prerequisites": [
|
|
"collective-communication"
|
|
],
|
|
"description": "Fat-tree, torus, dragonfly topologies; NVLink, NVSwitch, PCIe, and InfiniBand interconnects."
|
|
},
|
|
{
|
|
"id": "network-bandwidth-bottlenecks",
|
|
"name": "Network Bandwidth Bottlenecks",
|
|
"area": "networking",
|
|
"prerequisites": [
|
|
"collective-communication"
|
|
],
|
|
"description": "Bisection bandwidth, communication-computation ratio, network-bound training, and bandwidth cost modeling."
|
|
},
|
|
{
|
|
"id": "rdma-transport",
|
|
"name": "RDMA & High-Performance Transport",
|
|
"area": "networking",
|
|
"prerequisites": [
|
|
"interconnect-topology"
|
|
],
|
|
"description": "RDMA, RoCE, InfiniBand verbs, zero-copy networking, and kernel bypass for distributed training."
|
|
},
|
|
{
|
|
"id": "load-balancing",
|
|
"name": "Load Balancing & Routing",
|
|
"area": "networking",
|
|
"prerequisites": [],
|
|
"description": "Request routing, consistent hashing, weighted round-robin, and inference traffic management."
|
|
},
|
|
{
|
|
"id": "congestion-control",
|
|
"name": "Congestion & Flow Control",
|
|
"area": "networking",
|
|
"prerequisites": [
|
|
"network-bandwidth-bottlenecks"
|
|
],
|
|
"description": "ECN, PFC, DCQCN, incast congestion, and flow scheduling in GPU cluster networks."
|
|
},
|
|
{
|
|
"id": "model-serving-infrastructure",
|
|
"name": "Model Serving Infrastructure",
|
|
"area": "deployment",
|
|
"prerequisites": [],
|
|
"description": "Inference servers, model loading, request handling, autoscaling, and cold start optimization."
|
|
},
|
|
{
|
|
"id": "mlops-lifecycle",
|
|
"name": "MLOps Lifecycle",
|
|
"area": "deployment",
|
|
"prerequisites": [
|
|
"model-serving-infrastructure"
|
|
],
|
|
"description": "Model registries, CI/CD for ML, experiment tracking, reproducibility, and training-serving consistency."
|
|
},
|
|
{
|
|
"id": "ota-firmware-updates",
|
|
"name": "OTA & Firmware Updates",
|
|
"area": "deployment",
|
|
"prerequisites": [],
|
|
"description": "A/B partition schemes, firmware-over-the-air, rollback mechanisms, and flash programming constraints."
|
|
},
|
|
{
|
|
"id": "container-orchestration",
|
|
"name": "Container & Cluster Orchestration",
|
|
"area": "deployment",
|
|
"prerequisites": [
|
|
"model-serving-infrastructure"
|
|
],
|
|
"description": "Kubernetes for ML, GPU device plugins, node affinity, and job scheduling for training clusters."
|
|
},
|
|
{
|
|
"id": "model-format-conversion",
|
|
"name": "Model Format & Runtime Conversion",
|
|
"area": "deployment",
|
|
"prerequisites": [],
|
|
"description": "ONNX, TFLite, CoreML, TensorRT conversion; operator coverage gaps; and delegation strategies."
|
|
},
|
|
{
|
|
"id": "ab-rollout-strategies",
|
|
"name": "A/B & Rollout Strategies",
|
|
"area": "deployment",
|
|
"prerequisites": [
|
|
"model-serving-infrastructure"
|
|
],
|
|
"description": "Blue-green, canary, shadow deployments; traffic splitting; and progressive rollout with rollback."
|
|
},
|
|
{
|
|
"id": "compound-ai-systems",
|
|
"name": "Compound AI Systems",
|
|
"area": "deployment",
|
|
"prerequisites": [
|
|
"model-serving-infrastructure"
|
|
],
|
|
"description": "Multi-model pipelines, RAG architectures, agent orchestration, routing between models, and end-to-end latency management across chained inference calls."
|
|
},
|
|
{
|
|
"id": "fault-tolerance-checkpointing",
|
|
"name": "Fault Tolerance & Checkpointing",
|
|
"area": "reliability",
|
|
"prerequisites": [],
|
|
"description": "Checkpoint strategies, Young-Daly formula, preemption recovery, and failure modes at scale."
|
|
},
|
|
{
|
|
"id": "distribution-drift-detection",
|
|
"name": "Distribution & Drift Detection",
|
|
"area": "reliability",
|
|
"prerequisites": [],
|
|
"description": "Data drift, concept drift, training-serving skew, and statistical detection methods (KL divergence, PSI)."
|
|
},
|
|
{
|
|
"id": "graceful-degradation",
|
|
"name": "Graceful Degradation",
|
|
"area": "reliability",
|
|
"prerequisites": [
|
|
"fault-tolerance-checkpointing"
|
|
],
|
|
"description": "Degradation ladders, model fallbacks, fail-safe vs fail-operational, and quality-of-service shedding."
|
|
},
|
|
{
|
|
"id": "safety-certification",
|
|
"name": "Safety & Certification",
|
|
"area": "reliability",
|
|
"prerequisites": [],
|
|
"description": "ISO 26262, functional safety levels, watchdog timers, self-test routines, and deterministic execution."
|
|
},
|
|
{
|
|
"id": "adversarial-robustness",
|
|
"name": "Adversarial Robustness & Security",
|
|
"area": "reliability",
|
|
"prerequisites": [],
|
|
"description": "Adversarial attacks, prompt injection, model extraction, side-channel attacks, and defense strategies."
|
|
},
|
|
{
|
|
"id": "monitoring-observability",
|
|
"name": "Monitoring & Observability",
|
|
"area": "reliability",
|
|
"prerequisites": [],
|
|
"description": "Telemetry, alerting, MTBF/MTTR, straggler detection, and production health dashboards."
|
|
},
|
|
{
|
|
"id": "data-pipeline-engineering",
|
|
"name": "Data Pipeline Engineering",
|
|
"area": "data",
|
|
"prerequisites": [],
|
|
"description": "ETL/ELT pipelines, throughput bottlenecks, data loading optimization, and the data pipeline equation."
|
|
},
|
|
{
|
|
"id": "feature-store-management",
|
|
"name": "Feature Store & Feature Engineering",
|
|
"area": "data",
|
|
"prerequisites": [
|
|
"data-pipeline-engineering"
|
|
],
|
|
"description": "Online/offline feature stores, feature freshness, point-in-time correctness, and feature pipelines."
|
|
},
|
|
{
|
|
"id": "data-quality-validation",
|
|
"name": "Data Quality & Validation",
|
|
"area": "data",
|
|
"prerequisites": [
|
|
"data-pipeline-engineering"
|
|
],
|
|
"description": "Schema validation, data contracts, quality gates, lineage tracking, and anomaly detection in data."
|
|
},
|
|
{
|
|
"id": "dataset-curation",
|
|
"name": "Dataset Curation & Labeling",
|
|
"area": "data",
|
|
"prerequisites": [],
|
|
"description": "Data selection, annotation workflows, inter-annotator agreement, active learning, and dataset bias."
|
|
},
|
|
{
|
|
"id": "streaming-ingestion",
|
|
"name": "Streaming & Real-Time Ingestion",
|
|
"area": "data",
|
|
"prerequisites": [
|
|
"data-pipeline-engineering"
|
|
],
|
|
"description": "Stream processing, event-driven pipelines, sensor data ingestion, and real-time feature computation."
|
|
},
|
|
{
|
|
"id": "storage-format-selection",
|
|
"name": "Storage Format & Selection",
|
|
"area": "data",
|
|
"prerequisites": [],
|
|
"description": "Parquet, TFRecord, columnar vs row formats, compression tradeoffs, and storage tier selection."
|
|
},
|
|
{
|
|
"id": "data-efficiency-selection",
|
|
"name": "Data Efficiency & Selection",
|
|
"area": "data",
|
|
"prerequisites": [],
|
|
"description": "Coreset selection, curriculum learning, data pruning, the data wall problem, synthetic data generation, model collapse risks, and the Information-Compute Ratio (ICR) framework."
|
|
},
|
|
{
|
|
"id": "federated-learning",
|
|
"name": "Federated Learning",
|
|
"area": "cross-cutting",
|
|
"prerequisites": [
|
|
"data-parallelism",
|
|
"differential-privacy"
|
|
],
|
|
"description": "Federated averaging, communication efficiency, non-IID data challenges, and cross-device vs cross-silo."
|
|
},
|
|
{
|
|
"id": "differential-privacy",
|
|
"name": "Differential Privacy",
|
|
"area": "cross-cutting",
|
|
"prerequisites": [],
|
|
"description": "DP-SGD, privacy budgets (epsilon), noise calibration, and the privacy-utility tradeoff."
|
|
},
|
|
{
|
|
"id": "fairness-evaluation",
|
|
"name": "Fairness & Bias Evaluation",
|
|
"area": "cross-cutting",
|
|
"prerequisites": [],
|
|
"description": "Demographic parity, equalized odds, intersectional fairness, and subgroup evaluation methodology."
|
|
},
|
|
{
|
|
"id": "responsible-ai",
|
|
"name": "Responsible AI & Governance",
|
|
"area": "cross-cutting",
|
|
"prerequisites": [],
|
|
"description": "Model cards, impact assessments, red-teaming, guardrails, and organizational accountability frameworks."
|
|
},
|
|
{
|
|
"id": "tco-cost-modeling",
|
|
"name": "TCO & Cost Modeling",
|
|
"area": "cross-cutting",
|
|
"prerequisites": [
|
|
"compute-cost-estimation",
|
|
"datacenter-efficiency"
|
|
],
|
|
"description": "Total cost of ownership, buy vs rent, spot vs reserved instances, and cost-performance Pareto analysis."
|
|
}
|
|
]
|
|
}
|