fix(ci): reformat Python blocks with Black 24.10.0 and fix PS string interpolation

CI pins black==24.10.0 but requirements.txt had black>=23.0.0, causing
pre-commit to reformat 11 QMD files on the CI run and fail. Format all
affected files locally with 24.10.0 to match CI expectations.

Also fix PowerShell PATH string interpolation in Windows Dockerfile:
use explicit concatenation instead of a nested method call inside a
double-quoted string, which can be unreliable in some PS contexts.
This commit is contained in:
Vijay Janapa Reddi
2026-03-02 22:21:41 -05:00
parent bb0cecbe3d
commit b316005230
12 changed files with 68 additions and 7 deletions

View File

@@ -173,7 +173,7 @@ RUN Write-Host '=== STARTING TEX LIVE INSTALLATION ===' ; `
$texLiveBin = Join-Path $texYearDir.FullName 'bin\windows' ; `
Write-Host "📁 TeX Live bin: $texLiveBin" ; `
$env:PATH = "$texLiveBin;$env:PATH" ; `
[Environment]::SetEnvironmentVariable('PATH', "$texLiveBin;$([Environment]::GetEnvironmentVariable('PATH','Machine'))", 'Machine') ; `
[Environment]::SetEnvironmentVariable('PATH', ($texLiveBin + ';' + [Environment]::GetEnvironmentVariable('PATH','Machine')), 'Machine') ; `
Write-Host "✅ PATH updated" ; `
`
Write-Host '🔧 Pinning tlmgr repository to stable mirror...' ; `

View File

@@ -886,6 +886,7 @@ def compute_el2n_scores(model, dataloader, num_epochs=5):
scores.extend(el2n.tolist())
return scores
def select_coreset(scores, dataset, fraction=0.1):
"""Select top-k highest-scoring (most uncertain) samples."""
k = int(len(dataset) * fraction)
@@ -893,6 +894,7 @@ def select_coreset(scores, dataset, fraction=0.1):
indices = argsort(scores, descending=True)[:k]
return Subset(dataset, indices)
# Usage: 10x data reduction with minimal accuracy loss
scores = compute_el2n_scores(proxy_model, full_loader)
coreset = select_coreset(scores, full_dataset, fraction=0.1)

View File

@@ -733,11 +733,13 @@ PyTorch's TorchScript exemplifies both strategies. Tracing\index{JIT Compilation
```{.python}
import torch
def forward(x):
y = x * 2
z = y + 1
return z
# Trace the function by running it once
x_example = torch.tensor([1.0])
traced = torch.jit.trace(forward, x_example)
@@ -759,6 +761,7 @@ def conditional_forward(x):
else:
return x * 3
traced = torch.jit.trace(conditional_forward, torch.tensor([1.0]))
# Tracing captures ONLY the x.sum() > 0 branch
# If input later has sum <= 0, traced version
@@ -780,6 +783,7 @@ def forward(x):
z = y + 1
return z
# Compiles Python source code to TorchScript IR
# No example inputs needed
# Preserves control flow structure
@@ -797,6 +801,7 @@ def conditional_forward(x: torch.Tensor) -> torch.Tensor:
else:
return x * 3
# Both branches preserved in IR
# Correct branch executes based on runtime input values
```
@@ -810,6 +815,7 @@ To understand what the compiler produces, we can inspect the generated intermedi
def example(x: torch.Tensor) -> torch.Tensor:
return x * 2 + 1
# Inspect generated IR:
print(example.graph)
# graph(%x : Tensor):
@@ -833,6 +839,7 @@ def invalid_script(x):
print(f"Debug: {x}") # ERROR: f-strings not supported
return result
# Valid alternative:
@torch.jit.script
def valid_script(x: torch.Tensor) -> torch.Tensor:
@@ -871,6 +878,7 @@ PyTorch 2.0's `torch.compile` [@ansel2024pytorch2] represents this approach: dev
def forward(x):
return x * 2 + 1
# First call: captures execution, compiles optimized kernel (~100ms)
result1 = forward(torch.tensor([1.0]))
@@ -1072,6 +1080,7 @@ def conditional_compute(x):
else:
return x * 3
# Creates two compiled regions: operations before
# and after the if statement
# The if statement itself executes eagerly
@@ -1091,6 +1100,7 @@ def debug_compute(x):
z = y + 1
return z
# Creates two compiled regions: before and after print
```
:::
@@ -1105,6 +1115,7 @@ Shape changes prevent compiled code reuse, as @lst-graph-break-shapes illustrate
def variable_length(x, length):
return x[:, :length] # Shape changes each call
# Each unique length triggers recompilation
for i in range(10):
result = variable_length(x, i) # 10 recompilations
@@ -1137,9 +1148,11 @@ The compilation mode controls *how aggressively* to optimize; the backend contro
import torch
import time
def forward(x, w):
return torch.matmul(x, w).relu()
x = torch.randn(1024, 1024, device="cuda")
w = torch.randn(1024, 512, device="cuda")
@@ -1693,6 +1706,7 @@ def simple_network(x, w1, w2):
output = activated * w2 # Second layer
return output
# --- Forward pass stores intermediates ---
# x=1.0, w1=2.0, w2=3.0
# hidden=2.0, activated=2.0, output=6.0
@@ -1983,6 +1997,7 @@ class MultiplyAdd(torch.autograd.Function):
return grad_x, grad_y, grad_z
# Usage
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)
@@ -2009,6 +2024,7 @@ def gradient_hook(grad):
# Modify gradient (e.g., gradient clipping)
return grad.clamp(-1.0, 1.0)
x = torch.tensor([2.0], requires_grad=True)
x.register_hook(gradient_hook)
@@ -3240,6 +3256,7 @@ The systems consequence is significant. Automatic parameter discovery enables `o
import torch
import torch.nn as nn
class CustomLayer(nn.Module):
def __init__(self, input_size, output_size):
super().__init__()
@@ -3252,6 +3269,7 @@ class CustomLayer(nn.Module):
def forward(self, x):
return torch.matmul(x, self.weight.t()) + self.bias
layer = CustomLayer(10, 20)
# Framework discovers both parameters automatically:
for name, param in layer.named_parameters():
@@ -3300,6 +3318,7 @@ The state dictionary mechanism provides the serialization half of this principle
import torch
import torch.nn as nn
class ResidualBlock(nn.Module):
def __init__(self, channels):
super().__init__()
@@ -3314,6 +3333,7 @@ class ResidualBlock(nn.Module):
x = self.bn2(self.conv2(x))
return torch.relu(x + residual)
class ResNet(nn.Module):
def __init__(self, num_blocks, channels=64):
super().__init__()
@@ -3330,6 +3350,7 @@ class ResNet(nn.Module):
x = x.mean(dim=[2, 3]) # Global average pooling
return self.fc(x)
model = ResNet(num_blocks=4)
total = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total}")
@@ -3373,6 +3394,7 @@ import torch.nn as nn
model = nn.Sequential(nn.Linear(10, 20), nn.ReLU(), nn.Linear(20, 5))
# Forward hook to inspect activations
def forward_hook(module, input, output):
print(
@@ -3382,10 +3404,12 @@ def forward_hook(module, input, output):
f"std={output.std():.3f}"
)
# Backward hook to inspect gradients
def backward_hook(module, grad_input, grad_output):
print(f"Gradient norm: {grad_output[0].norm():.3f}")
# Register hooks on specific layer
handle_fwd = model[0].register_forward_hook(forward_hook)
handle_bwd = model[0].register_full_backward_hook(backward_hook)
@@ -3517,10 +3541,12 @@ While PyTorch and TensorFlow build computational graphs (dynamically or statical
import jax
import jax.numpy as jnp
def loss_fn(params, x, y):
pred = jnp.dot(x, params["w"]) + params["b"]
return jnp.mean((pred - y) ** 2)
# Transform: compute gradients
grad_fn = jax.grad(loss_fn)
@@ -3576,6 +3602,7 @@ How do these architectural differences look in practice? @lst-framework-hello-wo
# PyTorch - Dynamic, Pythonic
import torch.nn as nn
class SimpleNet(nn.Module):
def __init__(self):
super().__init__()
@@ -3584,6 +3611,7 @@ class SimpleNet(nn.Module):
def forward(self, x):
return self.fc(x)
# TensorFlow/Keras - High-level API
import tensorflow as tf
@@ -3595,9 +3623,11 @@ model = tf.keras.Sequential(
import jax.numpy as jnp
from jax import random
def simple_net(params, x):
return jnp.dot(x, params["w"]) + params["b"]
key = random.PRNGKey(0)
params = {
"w": random.normal(key, (10, 1)),

View File

@@ -896,7 +896,9 @@ We call the hardware units that exploit these patterns *AI compute primitives*:
::: {#lst-dense_layer_def lst-cap="**Dense Layer Abstraction**: High-level framework APIs encapsulate 131,072 multiply-accumulate operations (256 inputs times 512 outputs) in a single function call, hiding the computational complexity from developers while enabling automatic hardware optimization."}
```{.python}
# Framework abstracts compute-intensive operations
dense = Dense(512)(input_tensor) # $256\times512$ = 131K MACs per sample
dense = Dense(512)(
input_tensor
) # $256\times512$ = 131K MACs per sample
```
:::
@@ -909,7 +911,9 @@ This single line of code conceals the computational complexity that accelerators
output = (
matmul(input, weights) + bias
) # Matrix multiply dominates cost
output = activation(output) # Element-wise: O(output_dim$\times$batch)
output = activation(
output
) # Element-wise: O(output_dim$\times$batch)
```
:::

View File

@@ -2489,6 +2489,7 @@ def attention_layer_matrix(Q, K, V):
output = matmul(weights, V) # Combine values
return output
# Core computational pattern
def attention_layer_compute(Q, K, V):
# Initialize outputs
@@ -2814,6 +2815,7 @@ def self_attention_layer(X, W_Q, W_K, W_V, d_k):
return output
def multi_head_attention(X, W_Q, W_K, W_V, W_O, num_heads, d_k):
outputs = []
for i in range(num_heads):

View File

@@ -5011,6 +5011,7 @@ conv_out = conv2d(input, weight)
bn_out = batch_norm(conv_out, ...)
relu_out = relu(bn_out)
# === FUSED: 1 kernel launch, 2 memory transfers ===
def conv_bn_relu_fused(input, weight, gamma, beta, mean, var):
# Read input and weight once

View File

@@ -693,6 +693,7 @@ def compute_fairness_metrics(confusion_matrix):
"fpr": fp / (fp + tn) if (fp + tn) else 0,
}
# Compare groups and flag disparities exceeding threshold
for metric in ["approval_rate", "tpr", "fpr"]:
disparity = abs(metrics_a[metric] - metrics_b[metric])

View File

@@ -3992,6 +3992,7 @@ Flash Attention's performance gains materialize through careful exploitation of
import torch
import torch.nn.functional as F
# Standard attention (materializes n$\times$ n matrix)
def standard_attention(q, k, v):
# q, k, v: [batch, heads, seq_len, head_dim]
@@ -4002,15 +4003,18 @@ def standard_attention(q, k, v):
output = torch.matmul(attn, v)
return output
# Flash Attention (no n$\times$ n materialization)
def flash_attention(q, k, v):
# Automatically uses Flash Attention if available
output = F.scaled_dot_product_attention(q, k, v)
return output
# Explicit Flash Attention 2 (flash-attn library)
from flash_attn import flash_attn_func
def flash_attn_2(q, k, v):
# q, k, v: [batch, seq_len, heads, head_dim]
# Different layout for optimized memory access

View File

@@ -2747,6 +2747,7 @@ async def swap_to_cpu(sequence_id):
cpu_cache[sequence_id] = kv_cache.cpu() # Async transfer
gpu_cache.free(sequence_id)
async def swap_to_gpu(sequence_id):
cpu_kv = cpu_cache[sequence_id]
gpu_cache[sequence_id] = cpu_kv.cuda() # Async transfer
@@ -4256,6 +4257,7 @@ class TenantQuota:
max_qps: int # e.g., 1,000
max_batch_tokens: int # e.g., 50,000
def admit_request(tenant_id, request):
quota = get_quota(tenant_id)
usage = get_usage(tenant_id)

View File

@@ -1134,11 +1134,12 @@ A minimal example illustrates the usage:
```python
import torch
def transformer_block(x, w1, w2, ln_weight, ln_bias):
"""Unfused transformer FFN block."""
h = x @ w1 # Linear projection
h = x @ w1 # Linear projection
h = torch.nn.functional.gelu(h) # Activation
h = h @ w2 # Output projection
h = h @ w2 # Output projection
# Layer normalization
mean = h.mean(dim=-1, keepdim=True)
var = h.var(dim=-1, keepdim=True, unbiased=False)
@@ -1146,6 +1147,7 @@ def transformer_block(x, w1, w2, ln_weight, ln_bias):
h = h * ln_weight + ln_bias
return h
# Compile the function — TorchDynamo traces, TorchInductor optimizes
compiled_block = torch.compile(transformer_block)
@@ -1213,9 +1215,11 @@ A Triton kernel for fused GELU activation illustrates the programming model:
import triton
import triton.language as tl
@triton.jit
def fused_gelu_kernel(
input_ptr, output_ptr,
input_ptr,
output_ptr,
n_elements,
BLOCK_SIZE: tl.constexpr,
):
@@ -1618,7 +1622,11 @@ The PyTorch Profiler integrates with the training loop to capture detailed trace
```python
import torch
from torch.profiler import profile, schedule, tensorboard_trace_handler
from torch.profiler import (
profile,
schedule,
tensorboard_trace_handler,
)
# Profile 2 warmup steps + 3 active steps
with profile(

View File

@@ -1563,6 +1563,7 @@ from typing import Dict, List, Optional
import numpy as np
from sklearn.metrics import confusion_matrix
@dataclass
class FairnessMetrics:
demographic_parity_diff: float
@@ -1570,6 +1571,7 @@ class FairnessMetrics:
equality_opportunity_diff: float
group_counts: Dict[str, int]
class RealTimeFairnessMonitor:
def __init__(
self, window_size: int = 1000, alert_threshold: float = 0.05

View File

@@ -1107,6 +1107,7 @@ Intel's Running Average Power Limit (RAPL) interface exposes power measurements
import subprocess
import time
def read_rapl_energy():
"""Read current RAPL energy counters.
@@ -1122,6 +1123,7 @@ def read_rapl_energy():
)
return int(result.stdout.strip()) # Returns microjoules
# Measure training energy
start_energy = read_rapl_energy()
start_time = time.time()
@@ -1156,6 +1158,7 @@ import time
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0) # First GPU
def measure_inference_power(model, input_data, num_iterations=100):
"""Measure average GPU power during inference."""
power_readings = []
@@ -1174,6 +1177,7 @@ def measure_inference_power(model, input_data, num_iterations=100):
avg_power = sum(power_readings) / len(power_readings)
return avg_power
avg_power = measure_inference_power(model, sample_input)
print(f"Average inference power: {avg_power:.1f} W")
```
@@ -1558,6 +1562,7 @@ def calculate_carbon_footprint(
/ (operational_kg + embodied_kg),
}
# Example: 7B model training
result = calculate_carbon_footprint(
gpu_power_watts=400,