mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-03-11 17:49:25 -05:00
fix(ci): reformat Python blocks with Black 24.10.0 and fix PS string interpolation
CI pins black==24.10.0, but requirements.txt had black>=23.0.0, causing pre-commit to reformat 11 QMD files on the CI run and fail. Format all affected files locally with 24.10.0 to match CI expectations. Also fix PowerShell PATH string interpolation in the Windows Dockerfile: use explicit concatenation instead of a nested method call inside a double-quoted string, which can be unreliable in some PowerShell contexts.
This commit is contained in:
@@ -173,7 +173,7 @@ RUN Write-Host '=== STARTING TEX LIVE INSTALLATION ===' ; `
|
||||
$texLiveBin = Join-Path $texYearDir.FullName 'bin\windows' ; `
|
||||
Write-Host "📁 TeX Live bin: $texLiveBin" ; `
|
||||
$env:PATH = "$texLiveBin;$env:PATH" ; `
|
||||
[Environment]::SetEnvironmentVariable('PATH', "$texLiveBin;$([Environment]::GetEnvironmentVariable('PATH','Machine'))", 'Machine') ; `
|
||||
[Environment]::SetEnvironmentVariable('PATH', ($texLiveBin + ';' + [Environment]::GetEnvironmentVariable('PATH','Machine')), 'Machine') ; `
|
||||
Write-Host "✅ PATH updated" ; `
|
||||
`
|
||||
Write-Host '🔧 Pinning tlmgr repository to stable mirror...' ; `
|
||||
|
||||
@@ -886,6 +886,7 @@ def compute_el2n_scores(model, dataloader, num_epochs=5):
|
||||
scores.extend(el2n.tolist())
|
||||
return scores
|
||||
|
||||
|
||||
def select_coreset(scores, dataset, fraction=0.1):
|
||||
"""Select top-k highest-scoring (most uncertain) samples."""
|
||||
k = int(len(dataset) * fraction)
|
||||
@@ -893,6 +894,7 @@ def select_coreset(scores, dataset, fraction=0.1):
|
||||
indices = argsort(scores, descending=True)[:k]
|
||||
return Subset(dataset, indices)
|
||||
|
||||
|
||||
# Usage: 10x data reduction with minimal accuracy loss
|
||||
scores = compute_el2n_scores(proxy_model, full_loader)
|
||||
coreset = select_coreset(scores, full_dataset, fraction=0.1)
|
||||
|
||||
@@ -733,11 +733,13 @@ PyTorch's TorchScript exemplifies both strategies. Tracing\index{JIT Compilation
|
||||
```{.python}
|
||||
import torch
|
||||
|
||||
|
||||
def forward(x):
|
||||
y = x * 2
|
||||
z = y + 1
|
||||
return z
|
||||
|
||||
|
||||
# Trace the function by running it once
|
||||
x_example = torch.tensor([1.0])
|
||||
traced = torch.jit.trace(forward, x_example)
|
||||
@@ -759,6 +761,7 @@ def conditional_forward(x):
|
||||
else:
|
||||
return x * 3
|
||||
|
||||
|
||||
traced = torch.jit.trace(conditional_forward, torch.tensor([1.0]))
|
||||
# Tracing captures ONLY the x.sum() > 0 branch
|
||||
# If input later has sum <= 0, traced version
|
||||
@@ -780,6 +783,7 @@ def forward(x):
|
||||
z = y + 1
|
||||
return z
|
||||
|
||||
|
||||
# Compiles Python source code to TorchScript IR
|
||||
# No example inputs needed
|
||||
# Preserves control flow structure
|
||||
@@ -797,6 +801,7 @@ def conditional_forward(x: torch.Tensor) -> torch.Tensor:
|
||||
else:
|
||||
return x * 3
|
||||
|
||||
|
||||
# Both branches preserved in IR
|
||||
# Correct branch executes based on runtime input values
|
||||
```
|
||||
@@ -810,6 +815,7 @@ To understand what the compiler produces, we can inspect the generated intermedi
|
||||
def example(x: torch.Tensor) -> torch.Tensor:
|
||||
return x * 2 + 1
|
||||
|
||||
|
||||
# Inspect generated IR:
|
||||
print(example.graph)
|
||||
# graph(%x : Tensor):
|
||||
@@ -833,6 +839,7 @@ def invalid_script(x):
|
||||
print(f"Debug: {x}") # ERROR: f-strings not supported
|
||||
return result
|
||||
|
||||
|
||||
# Valid alternative:
|
||||
@torch.jit.script
|
||||
def valid_script(x: torch.Tensor) -> torch.Tensor:
|
||||
@@ -871,6 +878,7 @@ PyTorch 2.0's `torch.compile` [@ansel2024pytorch2] represents this approach: dev
|
||||
def forward(x):
|
||||
return x * 2 + 1
|
||||
|
||||
|
||||
# First call: captures execution, compiles optimized kernel (~100ms)
|
||||
result1 = forward(torch.tensor([1.0]))
|
||||
|
||||
@@ -1072,6 +1080,7 @@ def conditional_compute(x):
|
||||
else:
|
||||
return x * 3
|
||||
|
||||
|
||||
# Creates two compiled regions: operations before
|
||||
# and after the if statement
|
||||
# The if statement itself executes eagerly
|
||||
@@ -1091,6 +1100,7 @@ def debug_compute(x):
|
||||
z = y + 1
|
||||
return z
|
||||
|
||||
|
||||
# Creates two compiled regions: before and after print
|
||||
```
|
||||
:::
|
||||
@@ -1105,6 +1115,7 @@ Shape changes prevent compiled code reuse, as @lst-graph-break-shapes illustrate
|
||||
def variable_length(x, length):
|
||||
return x[:, :length] # Shape changes each call
|
||||
|
||||
|
||||
# Each unique length triggers recompilation
|
||||
for i in range(10):
|
||||
result = variable_length(x, i) # 10 recompilations
|
||||
@@ -1137,9 +1148,11 @@ The compilation mode controls *how aggressively* to optimize; the backend contro
|
||||
import torch
|
||||
import time
|
||||
|
||||
|
||||
def forward(x, w):
|
||||
return torch.matmul(x, w).relu()
|
||||
|
||||
|
||||
x = torch.randn(1024, 1024, device="cuda")
|
||||
w = torch.randn(1024, 512, device="cuda")
|
||||
|
||||
@@ -1693,6 +1706,7 @@ def simple_network(x, w1, w2):
|
||||
output = activated * w2 # Second layer
|
||||
return output
|
||||
|
||||
|
||||
# --- Forward pass stores intermediates ---
|
||||
# x=1.0, w1=2.0, w2=3.0
|
||||
# hidden=2.0, activated=2.0, output=6.0
|
||||
@@ -1983,6 +1997,7 @@ class MultiplyAdd(torch.autograd.Function):
|
||||
|
||||
return grad_x, grad_y, grad_z
|
||||
|
||||
|
||||
# Usage
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
y = torch.tensor([3.0], requires_grad=True)
|
||||
@@ -2009,6 +2024,7 @@ def gradient_hook(grad):
|
||||
# Modify gradient (e.g., gradient clipping)
|
||||
return grad.clamp(-1.0, 1.0)
|
||||
|
||||
|
||||
x = torch.tensor([2.0], requires_grad=True)
|
||||
x.register_hook(gradient_hook)
|
||||
|
||||
@@ -3240,6 +3256,7 @@ The systems consequence is significant. Automatic parameter discovery enables `o
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class CustomLayer(nn.Module):
|
||||
def __init__(self, input_size, output_size):
|
||||
super().__init__()
|
||||
@@ -3252,6 +3269,7 @@ class CustomLayer(nn.Module):
|
||||
def forward(self, x):
|
||||
return torch.matmul(x, self.weight.t()) + self.bias
|
||||
|
||||
|
||||
layer = CustomLayer(10, 20)
|
||||
# Framework discovers both parameters automatically:
|
||||
for name, param in layer.named_parameters():
|
||||
@@ -3300,6 +3318,7 @@ The state dictionary mechanism provides the serialization half of this principle
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class ResidualBlock(nn.Module):
|
||||
def __init__(self, channels):
|
||||
super().__init__()
|
||||
@@ -3314,6 +3333,7 @@ class ResidualBlock(nn.Module):
|
||||
x = self.bn2(self.conv2(x))
|
||||
return torch.relu(x + residual)
|
||||
|
||||
|
||||
class ResNet(nn.Module):
|
||||
def __init__(self, num_blocks, channels=64):
|
||||
super().__init__()
|
||||
@@ -3330,6 +3350,7 @@ class ResNet(nn.Module):
|
||||
x = x.mean(dim=[2, 3]) # Global average pooling
|
||||
return self.fc(x)
|
||||
|
||||
|
||||
model = ResNet(num_blocks=4)
|
||||
total = sum(p.numel() for p in model.parameters())
|
||||
print(f"Total parameters: {total}")
|
||||
@@ -3373,6 +3394,7 @@ import torch.nn as nn
|
||||
|
||||
model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5))
|
||||
|
||||
|
||||
# Forward hook to inspect activations
|
||||
def forward_hook(module, input, output):
|
||||
print(
|
||||
@@ -3382,10 +3404,12 @@ def forward_hook(module, input, output):
|
||||
f"std={output.std():.3f}"
|
||||
)
|
||||
|
||||
|
||||
# Backward hook to inspect gradients
|
||||
def backward_hook(module, grad_input, grad_output):
|
||||
print(f"Gradient norm: {grad_output[0].norm():.3f}")
|
||||
|
||||
|
||||
# Register hooks on specific layer
|
||||
handle_fwd = model[0].register_forward_hook(forward_hook)
|
||||
handle_bwd = model[0].register_full_backward_hook(backward_hook)
|
||||
@@ -3517,10 +3541,12 @@ While PyTorch and TensorFlow build computational graphs (dynamically or statical
|
||||
import jax
|
||||
import jax.numpy as jnp
|
||||
|
||||
|
||||
def loss_fn(params, x, y):
|
||||
pred = jnp.dot(x, params["w"]) + params["b"]
|
||||
return jnp.mean((pred - y) ** 2)
|
||||
|
||||
|
||||
# Transform: compute gradients
|
||||
grad_fn = jax.grad(loss_fn)
|
||||
|
||||
@@ -3576,6 +3602,7 @@ How do these architectural differences look in practice? @lst-framework-hello-wo
|
||||
# PyTorch - Dynamic, Pythonic
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
class SimpleNet(nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -3584,6 +3611,7 @@ class SimpleNet(nn.Module):
|
||||
def forward(self, x):
|
||||
return self.fc(x)
|
||||
|
||||
|
||||
# TensorFlow/Keras - High-level API
|
||||
import tensorflow as tf
|
||||
|
||||
@@ -3595,9 +3623,11 @@ model = tf.keras.Sequential(
|
||||
import jax.numpy as jnp
|
||||
from jax import random
|
||||
|
||||
|
||||
def simple_net(params, x):
|
||||
return jnp.dot(x, params["w"]) + params["b"]
|
||||
|
||||
|
||||
key = random.PRNGKey(0)
|
||||
params = {
|
||||
"w": random.normal(key, (10, 1)),
|
||||
|
||||
@@ -896,7 +896,9 @@ We call the hardware units that exploit these patterns *AI compute primitives*:
|
||||
::: {#lst-dense_layer_def lst-cap="**Dense Layer Abstraction**: High-level framework APIs encapsulate 131,072 multiply-accumulate operations (256 inputs times 512 outputs) in a single function call, hiding the computational complexity from developers while enabling automatic hardware optimization."}
|
||||
```{.python}
|
||||
# Framework abstracts compute-intensive operations
|
||||
dense = Dense(512)(input_tensor) # $256\times512$ = 131K MACs per sample
|
||||
dense = Dense(512)(
|
||||
input_tensor
|
||||
) # $256\times512$ = 131K MACs per sample
|
||||
```
|
||||
:::
|
||||
|
||||
@@ -909,7 +911,9 @@ This single line of code conceals the computational complexity that accelerators
|
||||
output = (
|
||||
matmul(input, weights) + bias
|
||||
) # Matrix multiply dominates cost
|
||||
output = activation(output) # Element-wise: O(output_dim$\times$batch)
|
||||
output = activation(
|
||||
output
|
||||
) # Element-wise: O(output_dim$\times$batch)
|
||||
```
|
||||
:::
|
||||
|
||||
|
||||
@@ -2489,6 +2489,7 @@ def attention_layer_matrix(Q, K, V):
|
||||
output = matmul(weights, V) # Combine values
|
||||
return output
|
||||
|
||||
|
||||
# Core computational pattern
|
||||
def attention_layer_compute(Q, K, V):
|
||||
# Initialize outputs
|
||||
@@ -2814,6 +2815,7 @@ def self_attention_layer(X, W_Q, W_K, W_V, d_k):
|
||||
|
||||
return output
|
||||
|
||||
|
||||
def multi_head_attention(X, W_Q, W_K, W_V, W_O, num_heads, d_k):
|
||||
outputs = []
|
||||
for i in range(num_heads):
|
||||
|
||||
@@ -5011,6 +5011,7 @@ conv_out = conv2d(input, weight)
|
||||
bn_out = batch_norm(conv_out, ...)
|
||||
relu_out = relu(bn_out)
|
||||
|
||||
|
||||
# === FUSED: 1 kernel launch, 2 memory transfers ===
|
||||
def conv_bn_relu_fused(input, weight, gamma, beta, mean, var):
|
||||
# Read input and weight once
|
||||
|
||||
@@ -693,6 +693,7 @@ def compute_fairness_metrics(confusion_matrix):
|
||||
"fpr": fp / (fp + tn) if (fp + tn) else 0,
|
||||
}
|
||||
|
||||
|
||||
# Compare groups and flag disparities exceeding threshold
|
||||
for metric in ["approval_rate", "tpr", "fpr"]:
|
||||
disparity = abs(metrics_a[metric] - metrics_b[metric])
|
||||
|
||||
@@ -3992,6 +3992,7 @@ Flash Attention's performance gains materialize through careful exploitation of
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
# Standard attention (materializes n$\times$ n matrix)
|
||||
def standard_attention(q, k, v):
|
||||
# q, k, v: [batch, heads, seq_len, head_dim]
|
||||
@@ -4002,15 +4003,18 @@ def standard_attention(q, k, v):
|
||||
output = torch.matmul(attn, v)
|
||||
return output
|
||||
|
||||
|
||||
# Flash Attention (no n$\times$ n materialization)
|
||||
def flash_attention(q, k, v):
|
||||
# Automatically uses Flash Attention if available
|
||||
output = F.scaled_dot_product_attention(q, k, v)
|
||||
return output
|
||||
|
||||
|
||||
# Explicit Flash Attention 2 (flash-attn library)
|
||||
from flash_attn import flash_attn_func
|
||||
|
||||
|
||||
def flash_attn_2(q, k, v):
|
||||
# q, k, v: [batch, seq_len, heads, head_dim]
|
||||
# Different layout for optimized memory access
|
||||
|
||||
@@ -2747,6 +2747,7 @@ async def swap_to_cpu(sequence_id):
|
||||
cpu_cache[sequence_id] = kv_cache.cpu() # Async transfer
|
||||
gpu_cache.free(sequence_id)
|
||||
|
||||
|
||||
async def swap_to_gpu(sequence_id):
|
||||
cpu_kv = cpu_cache[sequence_id]
|
||||
gpu_cache[sequence_id] = cpu_kv.cuda() # Async transfer
|
||||
@@ -4256,6 +4257,7 @@ class TenantQuota:
|
||||
max_qps: int # e.g., 1,000
|
||||
max_batch_tokens: int # e.g., 50,000
|
||||
|
||||
|
||||
def admit_request(tenant_id, request):
|
||||
quota = get_quota(tenant_id)
|
||||
usage = get_usage(tenant_id)
|
||||
|
||||
@@ -1134,11 +1134,12 @@ A minimal example illustrates the usage:
|
||||
```python
|
||||
import torch
|
||||
|
||||
|
||||
def transformer_block(x, w1, w2, ln_weight, ln_bias):
|
||||
"""Unfused transformer FFN block."""
|
||||
h = x @ w1 # Linear projection
|
||||
h = x @ w1 # Linear projection
|
||||
h = torch.nn.functional.gelu(h) # Activation
|
||||
h = h @ w2 # Output projection
|
||||
h = h @ w2 # Output projection
|
||||
# Layer normalization
|
||||
mean = h.mean(dim=-1, keepdim=True)
|
||||
var = h.var(dim=-1, keepdim=True, unbiased=False)
|
||||
@@ -1146,6 +1147,7 @@ def transformer_block(x, w1, w2, ln_weight, ln_bias):
|
||||
h = h * ln_weight + ln_bias
|
||||
return h
|
||||
|
||||
|
||||
# Compile the function — TorchDynamo traces, TorchInductor optimizes
|
||||
compiled_block = torch.compile(transformer_block)
|
||||
|
||||
@@ -1213,9 +1215,11 @@ A Triton kernel for fused GELU activation illustrates the programming model:
|
||||
import triton
|
||||
import triton.language as tl
|
||||
|
||||
|
||||
@triton.jit
|
||||
def fused_gelu_kernel(
|
||||
input_ptr, output_ptr,
|
||||
input_ptr,
|
||||
output_ptr,
|
||||
n_elements,
|
||||
BLOCK_SIZE: tl.constexpr,
|
||||
):
|
||||
@@ -1618,7 +1622,11 @@ The PyTorch Profiler integrates with the training loop to capture detailed trace
|
||||
|
||||
```python
|
||||
import torch
|
||||
from torch.profiler import profile, schedule, tensorboard_trace_handler
|
||||
from torch.profiler import (
|
||||
profile,
|
||||
schedule,
|
||||
tensorboard_trace_handler,
|
||||
)
|
||||
|
||||
# Profile 2 warmup steps + 3 active steps
|
||||
with profile(
|
||||
|
||||
@@ -1563,6 +1563,7 @@ from typing import Dict, List, Optional
|
||||
import numpy as np
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
|
||||
@dataclass
|
||||
class FairnessMetrics:
|
||||
demographic_parity_diff: float
|
||||
@@ -1570,6 +1571,7 @@ class FairnessMetrics:
|
||||
equality_opportunity_diff: float
|
||||
group_counts: Dict[str, int]
|
||||
|
||||
|
||||
class RealTimeFairnessMonitor:
|
||||
def __init__(
|
||||
self, window_size: int = 1000, alert_threshold: float = 0.05
|
||||
|
||||
@@ -1107,6 +1107,7 @@ Intel's Running Average Power Limit (RAPL) interface exposes power measurements
|
||||
import subprocess
|
||||
import time
|
||||
|
||||
|
||||
def read_rapl_energy():
|
||||
"""Read current RAPL energy counters.
|
||||
|
||||
@@ -1122,6 +1123,7 @@ def read_rapl_energy():
|
||||
)
|
||||
return int(result.stdout.strip()) # Returns microjoules
|
||||
|
||||
|
||||
# Measure training energy
|
||||
start_energy = read_rapl_energy()
|
||||
start_time = time.time()
|
||||
@@ -1156,6 +1158,7 @@ import time
|
||||
pynvml.nvmlInit()
|
||||
handle = pynvml.nvmlDeviceGetHandleByIndex(0) # First GPU
|
||||
|
||||
|
||||
def measure_inference_power(model, input_data, num_iterations=100):
|
||||
"""Measure average GPU power during inference."""
|
||||
power_readings = []
|
||||
@@ -1174,6 +1177,7 @@ def measure_inference_power(model, input_data, num_iterations=100):
|
||||
avg_power = sum(power_readings) / len(power_readings)
|
||||
return avg_power
|
||||
|
||||
|
||||
avg_power = measure_inference_power(model, sample_input)
|
||||
print(f"Average inference power: {avg_power:.1f} W")
|
||||
```
|
||||
@@ -1558,6 +1562,7 @@ def calculate_carbon_footprint(
|
||||
/ (operational_kg + embodied_kg),
|
||||
}
|
||||
|
||||
|
||||
# Example: 7B model training
|
||||
result = calculate_carbon_footprint(
|
||||
gpu_power_watts=400,
|
||||
|
||||
Reference in New Issue
Block a user