diff --git a/book/docker/windows/Dockerfile b/book/docker/windows/Dockerfile index f48fac6c0..afff8df2d 100644 --- a/book/docker/windows/Dockerfile +++ b/book/docker/windows/Dockerfile @@ -173,7 +173,7 @@ RUN Write-Host '=== STARTING TEX LIVE INSTALLATION ===' ; ` $texLiveBin = Join-Path $texYearDir.FullName 'bin\windows' ; ` Write-Host "📁 TeX Live bin: $texLiveBin" ; ` $env:PATH = "$texLiveBin;$env:PATH" ; ` - [Environment]::SetEnvironmentVariable('PATH', "$texLiveBin;$([Environment]::GetEnvironmentVariable('PATH','Machine'))", 'Machine') ; ` + [Environment]::SetEnvironmentVariable('PATH', ($texLiveBin + ';' + [Environment]::GetEnvironmentVariable('PATH','Machine')), 'Machine') ; ` Write-Host "✅ PATH updated" ; ` ` Write-Host '🔧 Pinning tlmgr repository to stable mirror...' ; ` diff --git a/book/quarto/contents/vol1/data_selection/data_selection.qmd b/book/quarto/contents/vol1/data_selection/data_selection.qmd index dc4387cec..f5c6f76e5 100644 --- a/book/quarto/contents/vol1/data_selection/data_selection.qmd +++ b/book/quarto/contents/vol1/data_selection/data_selection.qmd @@ -886,6 +886,7 @@ def compute_el2n_scores(model, dataloader, num_epochs=5): scores.extend(el2n.tolist()) return scores + def select_coreset(scores, dataset, fraction=0.1): """Select top-k highest-scoring (most uncertain) samples.""" k = int(len(dataset) * fraction) @@ -893,6 +894,7 @@ def select_coreset(scores, dataset, fraction=0.1): indices = argsort(scores, descending=True)[:k] return Subset(dataset, indices) + # Usage: 10x data reduction with minimal accuracy loss scores = compute_el2n_scores(proxy_model, full_loader) coreset = select_coreset(scores, full_dataset, fraction=0.1) diff --git a/book/quarto/contents/vol1/frameworks/frameworks.qmd b/book/quarto/contents/vol1/frameworks/frameworks.qmd index 5a664f81e..a4fc7d87b 100644 --- a/book/quarto/contents/vol1/frameworks/frameworks.qmd +++ b/book/quarto/contents/vol1/frameworks/frameworks.qmd @@ -733,11 +733,13 @@ PyTorch's TorchScript exemplifies both strategies. Tracing\index{JIT Compilation ```{.python} import torch + def forward(x): y = x * 2 z = y + 1 return z + # Trace the function by running it once x_example = torch.tensor([1.0]) traced = torch.jit.trace(forward, x_example) @@ -759,6 +761,7 @@ def conditional_forward(x): else: return x * 3 + traced = torch.jit.trace(conditional_forward, torch.tensor([1.0])) # Tracing captures ONLY the x.sum() > 0 branch # If input later has sum <= 0, traced version @@ -780,6 +783,7 @@ def forward(x): z = y + 1 return z + # Compiles Python source code to TorchScript IR # No example inputs needed # Preserves control flow structure @@ -797,6 +801,7 @@ def conditional_forward(x: torch.Tensor) -> torch.Tensor: else: return x * 3 + # Both branches preserved in IR # Correct branch executes based on runtime input values ``` @@ -810,6 +815,7 @@ To understand what the compiler produces, we can inspect the generated intermedi def example(x: torch.Tensor) -> torch.Tensor: return x * 2 + 1 + # Inspect generated IR: print(example.graph) # graph(%x : Tensor): @@ -833,6 +839,7 @@ def invalid_script(x): print(f"Debug: {x}") # ERROR: f-strings not supported return result + # Valid alternative: @torch.jit.script def valid_script(x: torch.Tensor) -> torch.Tensor: @@ -871,6 +878,7 @@ PyTorch 2.0's `torch.compile` [@ansel2024pytorch2] represents this approach: dev def forward(x): return x * 2 + 1 + # First call: captures execution, compiles optimized kernel (~100ms) result1 = forward(torch.tensor([1.0])) @@ -1072,6 +1080,7 @@ def conditional_compute(x): else: return x * 3 + # Creates two compiled regions: operations before # and after the if statement # The if statement itself executes eagerly @@ -1091,6 +1100,7 @@ def debug_compute(x): z = y + 1 return z + # Creates two compiled regions: before and after print ``` ::: @@ -1105,6 +1115,7 @@ Shape changes prevent compiled code reuse, as @lst-graph-break-shapes illustrate def variable_length(x, length): return x[:, :length] # Shape changes each call + # Each unique length triggers recompilation for i in range(10): result = variable_length(x, i) # 10 recompilations @@ -1137,9 +1148,11 @@ The compilation mode controls *how aggressively* to optimize; the backend contro import torch import time + def forward(x, w): return torch.matmul(x, w).relu() + x = torch.randn(1024, 1024, device="cuda") w = torch.randn(1024, 512, device="cuda") @@ -1693,6 +1706,7 @@ def simple_network(x, w1, w2): output = activated * w2 # Second layer return output + # --- Forward pass stores intermediates --- # x=1.0, w1=2.0, w2=3.0 # hidden=2.0, activated=2.0, output=6.0 @@ -1983,6 +1997,7 @@ class MultiplyAdd(torch.autograd.Function): return grad_x, grad_y, grad_z + # Usage x = torch.tensor([2.0], requires_grad=True) y = torch.tensor([3.0], requires_grad=True) @@ -2009,6 +2024,7 @@ def gradient_hook(grad): # Modify gradient (e.g., gradient clipping) return grad.clamp(-1.0, 1.0) + x = torch.tensor([2.0], requires_grad=True) x.register_hook(gradient_hook) @@ -3240,6 +3256,7 @@ The systems consequence is significant. Automatic parameter discovery enables `o import torch import torch.nn as nn + class CustomLayer(nn.Module): def __init__(self, input_size, output_size): super().__init__() @@ -3252,6 +3269,7 @@ class CustomLayer(nn.Module): def forward(self, x): return torch.matmul(x, self.weight.t()) + self.bias + layer = CustomLayer(10, 20) # Framework discovers both parameters automatically: for name, param in layer.named_parameters(): @@ -3300,6 +3318,7 @@ The state dictionary mechanism provides the serialization half of this principle import torch import torch.nn as nn + class ResidualBlock(nn.Module): def __init__(self, channels): super().__init__() @@ -3314,6 +3333,7 @@ class ResidualBlock(nn.Module): x = self.bn2(self.conv2(x)) return torch.relu(x + residual) + class ResNet(nn.Module): def __init__(self, num_blocks, channels=64): super().__init__() @@ -3330,6 +3350,7 @@ class ResNet(nn.Module): x = x.mean(dim=[2, 3]) # Global average pooling return self.fc(x) + model = ResNet(num_blocks=4) total = sum(p.numel() for p in model.parameters()) print(f"Total parameters: {total}") @@ -3373,6 +3394,7 @@ import torch.nn as nn model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5)) + # Forward hook to inspect activations def forward_hook(module, input, output): print( @@ -3382,10 +3404,12 @@ def forward_hook(module, input, output): f"std={output.std():.3f}" ) + # Backward hook to inspect gradients def backward_hook(module, grad_input, grad_output): print(f"Gradient norm: {grad_output[0].norm():.3f}") + # Register hooks on specific layer handle_fwd = model[0].register_forward_hook(forward_hook) handle_bwd = model[0].register_full_backward_hook(backward_hook) @@ -3517,10 +3541,12 @@ While PyTorch and TensorFlow build computational graphs (dynamically or statical import jax import jax.numpy as jnp + def loss_fn(params, x, y): pred = jnp.dot(x, params["w"]) + params["b"] return jnp.mean((pred - y) ** 2) + # Transform: compute gradients grad_fn = jax.grad(loss_fn) @@ -3576,6 +3602,7 @@ How do these architectural differences look in practice? @lst-framework-hello-wo # PyTorch - Dynamic, Pythonic import torch.nn as nn + class SimpleNet(nn.Module): def __init__(self): super().__init__() @@ -3584,6 +3611,7 @@ class SimpleNet(nn.Module): def forward(self, x): return self.fc(x) + # TensorFlow/Keras - High-level API import tensorflow as tf @@ -3595,9 +3623,11 @@ model = tf.keras.Sequential( import jax.numpy as jnp from jax import random + def simple_net(params, x): return jnp.dot(x, params["w"]) + params["b"] + key = random.PRNGKey(0) params = { "w": random.normal(key, (10, 1)), diff --git a/book/quarto/contents/vol1/hw_acceleration/hw_acceleration.qmd b/book/quarto/contents/vol1/hw_acceleration/hw_acceleration.qmd index bb494178f..502acb902 100644 --- a/book/quarto/contents/vol1/hw_acceleration/hw_acceleration.qmd +++ b/book/quarto/contents/vol1/hw_acceleration/hw_acceleration.qmd @@ -896,7 +896,9 @@ We call the hardware units that exploit these patterns *AI compute primitives*: ::: {#lst-dense_layer_def lst-cap="**Dense Layer Abstraction**: High-level framework APIs encapsulate 131,072 multiply-accumulate operations (256 inputs times 512 outputs) in a single function call, hiding the computational complexity from developers while enabling automatic hardware optimization."} ```{.python} # Framework abstracts compute-intensive operations -dense = Dense(512)(input_tensor) # $256\times512$ = 131K MACs per sample +dense = Dense(512)( + input_tensor +) # $256\times512$ = 131K MACs per sample ``` ::: @@ -909,7 +911,9 @@ This single line of code conceals the computational complexity that accelerators output = ( matmul(input, weights) + bias ) # Matrix multiply dominates cost -output = activation(output) # Element-wise: O(output_dim$\times$batch) +output = activation( + output +) # Element-wise: O(output_dim$\times$batch) ``` ::: diff --git a/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd b/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd index 8235e6cfc..7d3a6c1ae 100644 --- a/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd +++ b/book/quarto/contents/vol1/nn_architectures/nn_architectures.qmd @@ -2489,6 +2489,7 @@ def attention_layer_matrix(Q, K, V): output = matmul(weights, V) # Combine values return output + # Core computational pattern def attention_layer_compute(Q, K, V): # Initialize outputs @@ -2814,6 +2815,7 @@ def self_attention_layer(X, W_Q, W_K, W_V, d_k): return output + def multi_head_attention(X, W_Q, W_K, W_V, W_O, num_heads, d_k): outputs = [] for i in range(num_heads): diff --git a/book/quarto/contents/vol1/optimizations/model_compression.qmd b/book/quarto/contents/vol1/optimizations/model_compression.qmd index 783993ff2..c3736f046 100644 --- a/book/quarto/contents/vol1/optimizations/model_compression.qmd +++ b/book/quarto/contents/vol1/optimizations/model_compression.qmd @@ -5011,6 +5011,7 @@ conv_out = conv2d(input, weight) bn_out = batch_norm(conv_out, ...) relu_out = relu(bn_out) + # === FUSED: 1 kernel launch, 2 memory transfers === def conv_bn_relu_fused(input, weight, gamma, beta, mean, var): # Read input and weight once diff --git a/book/quarto/contents/vol1/responsible_engr/responsible_engr.qmd b/book/quarto/contents/vol1/responsible_engr/responsible_engr.qmd index 3a93edb0a..e58e27c34 100644 --- a/book/quarto/contents/vol1/responsible_engr/responsible_engr.qmd +++ b/book/quarto/contents/vol1/responsible_engr/responsible_engr.qmd @@ -693,6 +693,7 @@ def compute_fairness_metrics(confusion_matrix): "fpr": fp / (fp + tn) if (fp + tn) else 0, } + # Compare groups and flag disparities exceeding threshold for metric in ["approval_rate", "tpr", "fpr"]: disparity = abs(metrics_a[metric] - metrics_b[metric]) diff --git a/book/quarto/contents/vol1/training/training.qmd b/book/quarto/contents/vol1/training/training.qmd index 612ba20ed..c8df07af8 100644 --- a/book/quarto/contents/vol1/training/training.qmd +++ b/book/quarto/contents/vol1/training/training.qmd @@ -3992,6 +3992,7 @@ Flash Attention's performance gains materialize through careful exploitation of import torch import torch.nn.functional as F + # Standard attention (materializes n$\times$ n matrix) def standard_attention(q, k, v): # q, k, v: [batch, heads, seq_len, head_dim] @@ -4002,15 +4003,18 @@ def standard_attention(q, k, v): output = torch.matmul(attn, v) return output + # Flash Attention (no n$\times$ n materialization) def flash_attention(q, k, v): # Automatically uses Flash Attention if available output = F.scaled_dot_product_attention(q, k, v) return output + # Explicit Flash Attention 2 (flash-attn library) from flash_attn import flash_attn_func + def flash_attn_2(q, k, v): # q, k, v: [batch, seq_len, heads, head_dim] # Different layout for optimized memory access diff --git a/book/quarto/contents/vol2/inference/inference.qmd b/book/quarto/contents/vol2/inference/inference.qmd index 9302722d4..7bb6fb82c 100644 --- a/book/quarto/contents/vol2/inference/inference.qmd +++ b/book/quarto/contents/vol2/inference/inference.qmd @@ -2747,6 +2747,7 @@ async def swap_to_cpu(sequence_id): cpu_cache[sequence_id] = kv_cache.cpu() # Async transfer gpu_cache.free(sequence_id) + async def swap_to_gpu(sequence_id): cpu_kv = cpu_cache[sequence_id] gpu_cache[sequence_id] = cpu_kv.cuda() # Async transfer @@ -4256,6 +4257,7 @@ class TenantQuota: max_qps: int # e.g., 1,000 max_batch_tokens: int # e.g., 50,000 + def admit_request(tenant_id, request): quota = get_quota(tenant_id) usage = get_usage(tenant_id) diff --git a/book/quarto/contents/vol2/performance_engineering/performance_engineering.qmd b/book/quarto/contents/vol2/performance_engineering/performance_engineering.qmd index 65982dc84..5d92bc020 100644 --- a/book/quarto/contents/vol2/performance_engineering/performance_engineering.qmd +++ b/book/quarto/contents/vol2/performance_engineering/performance_engineering.qmd @@ -1134,11 +1134,12 @@ A minimal example illustrates the usage: ```python import torch + def transformer_block(x, w1, w2, ln_weight, ln_bias): """Unfused transformer FFN block.""" - h = x @ w1 # Linear projection + h = x @ w1 # Linear projection h = torch.nn.functional.gelu(h) # Activation - h = h @ w2 # Output projection + h = h @ w2 # Output projection # Layer normalization mean = h.mean(dim=-1, keepdim=True) var = h.var(dim=-1, keepdim=True, unbiased=False) @@ -1146,6 +1147,7 @@ def transformer_block(x, w1, w2, ln_weight, ln_bias): h = h * ln_weight + ln_bias return h + # Compile the function — TorchDynamo traces, TorchInductor optimizes compiled_block = torch.compile(transformer_block) @@ -1213,9 +1215,11 @@ A Triton kernel for fused GELU activation illustrates the programming model: import triton import triton.language as tl + @triton.jit def fused_gelu_kernel( - input_ptr, output_ptr, + input_ptr, + output_ptr, n_elements, BLOCK_SIZE: tl.constexpr, ): @@ -1618,7 +1622,11 @@ The PyTorch Profiler integrates with the training loop to capture detailed trace ```python import torch -from torch.profiler import profile, schedule, tensorboard_trace_handler +from torch.profiler import ( + profile, + schedule, + tensorboard_trace_handler, +) # Profile 2 warmup steps + 3 active steps with profile( diff --git a/book/quarto/contents/vol2/responsible_ai/responsible_ai.qmd b/book/quarto/contents/vol2/responsible_ai/responsible_ai.qmd index 36c5e9209..39254d1b7 100644 --- a/book/quarto/contents/vol2/responsible_ai/responsible_ai.qmd +++ b/book/quarto/contents/vol2/responsible_ai/responsible_ai.qmd @@ -1563,6 +1563,7 @@ from typing import Dict, List, Optional import numpy as np from sklearn.metrics import confusion_matrix + @dataclass class FairnessMetrics: demographic_parity_diff: float @@ -1570,6 +1571,7 @@ class FairnessMetrics: equality_opportunity_diff: float group_counts: Dict[str, int] + class RealTimeFairnessMonitor: def __init__( self, window_size: int = 1000, alert_threshold: float = 0.05 diff --git a/book/quarto/contents/vol2/sustainable_ai/sustainable_ai.qmd b/book/quarto/contents/vol2/sustainable_ai/sustainable_ai.qmd index e78fe01cb..878f7d80f 100644 --- a/book/quarto/contents/vol2/sustainable_ai/sustainable_ai.qmd +++ b/book/quarto/contents/vol2/sustainable_ai/sustainable_ai.qmd @@ -1107,6 +1107,7 @@ Intel's Running Average Power Limit (RAPL) interface exposes power measurements import subprocess import time + def read_rapl_energy(): """Read current RAPL energy counters. @@ -1122,6 +1123,7 @@ def read_rapl_energy(): ) return int(result.stdout.strip()) # Returns microjoules + # Measure training energy start_energy = read_rapl_energy() start_time = time.time() @@ -1156,6 +1158,7 @@ import time pynvml.nvmlInit() handle = pynvml.nvmlDeviceGetHandleByIndex(0) # First GPU + def measure_inference_power(model, input_data, num_iterations=100): """Measure average GPU power during inference.""" power_readings = [] @@ -1174,6 +1177,7 @@ def measure_inference_power(model, input_data, num_iterations=100): avg_power = sum(power_readings) / len(power_readings) return avg_power + avg_power = measure_inference_power(model, sample_input) print(f"Average inference power: {avg_power:.1f} W") ``` @@ -1558,6 +1562,7 @@ def calculate_carbon_footprint( / (operational_kg + embodied_kg), } + # Example: 7B model training result = calculate_carbon_footprint( gpu_power_watts=400,