mirror of https://github.com/harvard-edge/cs249r_book.git
Updated equations
@@ -83,11 +83,11 @@ class DAMTaxonomy:
     ex2_model_size_gb_str = f"{ex2_model_size_gb:.0f}"
 
     ex2_achieved_eq = md(
-        f"$$\text{{Achieved FLOP/s}} = \frac{{{ex2_flops_per_pass_str} \text{{ TFLOPs}}}}"
-        f"{{0.050 \text{{ s}}}} = {ex2_achieved_str} \text{{ TFLOP/s}}$$"
+        rf"$$\text{{Achieved FLOP/s}} = \frac{{{ex2_flops_per_pass_str} \text{{ TFLOPs}}}}"
+        rf"{{0.050 \text{{ s}}}} = {ex2_achieved_str} \text{{ TFLOP/s}}$$"
     )
     ex2_util_eq = md(
-        f"$$\eta = \frac{{{ex2_achieved_str}}}{{{int(h100_fp16_tflops_peak)}}} \approx {ex2_util_str}\%$$"
+        rf"$$\eta = \frac{{{ex2_achieved_str}}}{{{int(h100_fp16_tflops_peak)}}} \approx {ex2_util_str}\%$$"
     )
 
     ex3_params_start_str = "125M"
@@ -96,8 +96,8 @@ class DAMTaxonomy:
     ex3_imp_str = f"{ex3_imp_pct:.1f}"
     ex3_chin_pred_str = f"{ex3_chin_pred_pct}"
 
-    ex4_gpu_old_str = f"{ex4_gpu_old_n}$\times$ A100"
-    ex4_gpu_new_str = f"{ex4_gpu_new_n}$\times$ H100"
+    ex4_gpu_old_str = rf"{ex4_gpu_old_n}$\times$ A100"
+    ex4_gpu_new_str = rf"{ex4_gpu_new_n}$\times$ H100"
     ex4_cost_str = f"${ex4_cost_k}K"
 ```
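Why the `rf` prefix matters in these two hunks: in a plain f-string, Python interprets `\t` (in `\times` and `\text`) and `\f` (in `\frac`) as tab and form-feed escapes, silently corrupting the LaTeX before it ever reaches the renderer. A minimal standalone sketch (variable name illustrative, not the book's):

```python
n_gpus = 8

plain = f"{n_gpus}$\times$ A100"   # "\t" becomes a literal tab: '8$<TAB>imes$ A100'
raw   = rf"{n_gpus}$\times$ A100"  # raw f-string keeps the backslash intact

assert "\t" in plain               # the escape sequence corrupted the LaTeX command
assert "\\times" in raw            # '\times' survives for the math renderer
```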
@@ -1561,7 +1561,7 @@ class SyntheticStorage:
     total_footprint_tb = data_size_tb * (1 + provenance_metadata_overhead) * verification_passes
     amplification = total_footprint_tb / data_size_tb
     # ┌── 3. GUARD ─────────────────────────────────────────
-    check(amplification == 4.2, f"Amp {amplification} unexpected")
+    check(round(amplification, 1) == 4.2, f"Amp {amplification} unexpected")
     # ┌── 4. OUTPUT ────────────────────────────────────────
     amp_str = f"{amplification:.1f}"
     total_tb_str = f"{total_footprint_tb:.1f}"
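Why `round(..., 1)` replaces exact `==`: the amplification factor is a product of binary floats, which cannot represent most decimal fractions exactly. A minimal sketch with illustrative inputs (the actual `data_size_tb`, overhead, and pass count sit outside this hunk; any pair with $(1 + \text{overhead}) \times \text{passes} = 4.2$ behaves the same):

```python
# Illustrative values, chosen so the nominal amplification is 4.2:
data_size_tb = 1.0
provenance_metadata_overhead = 0.4
verification_passes = 3

total_footprint_tb = data_size_tb * (1 + provenance_metadata_overhead) * verification_passes
amplification = total_footprint_tb / data_size_tb

print(amplification)                    # 4.199999999999999 on IEEE-754 doubles
print(amplification == 4.2)             # False -- the old guard failed spuriously
print(round(amplification, 1) == 4.2)   # True  -- the committed fix
```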
@@ -2154,14 +2154,14 @@ ax.text(50.5, 5.4, '50% of peak', fontsize=8, color=COLORS['primary'], alpha=0.6
         va='bottom')
 
 # Annotation: MFU doubled 2020-2022
-ax.annotate('MFU doubled\n2020 → 2022',
+ax.annotate('MFU doubled\n2020 -> 2022',
             xy=(21.3, 0), xytext=(35, -0.8),
             fontsize=8, color=COLORS['VioletLine'], fontweight='bold',
             arrowprops=dict(arrowstyle='->', color=COLORS['VioletLine'], lw=1.2),
             ha='center')
 
 # Annotation: Scaling Tax for Llama 3
-ax.annotate('Scaling Tax:\n43% → 41% as\n8K → 16K GPUs',
+ax.annotate('Scaling Tax:\n43% -> 41% as\n8K -> 16K GPUs',
             xy=(41, 5), xytext=(52, 4.2),
             fontsize=8, color=COLORS['GreenLine'], fontweight='bold',
             arrowprops=dict(arrowstyle='->', color=COLORS['GreenLine'], lw=1.2),
@@ -1608,9 +1608,9 @@ class SDCCollective:
     # ┌── 2. EXECUTE ───────────────────────────────────────
     p_step_sdc = 1 - (1 - p_sdc_per_gpu_hr)**(n_gpus * (training_step_s / 3600))
     # ┌── 3. GUARD ─────────────────────────────────────────
-    check(p_step_sdc > 0.05, f"Prob {p_step_sdc:.4f} unexpected")
+    check(p_step_sdc > 0.00005, f"Prob {p_step_sdc:.6f} unexpected")
     # ┌── 4. OUTPUT ────────────────────────────────────────
-    prob_str = f"{p_step_sdc*100:.2f}%"
+    prob_str = f"{p_step_sdc*100:.4f}%"
 
     @classmethod
     def plot(cls):
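Why the guard threshold drops from 0.05 to 0.00005: with an SDC rate on the order of $10^{-6}$ per GPU-hour (an assumed value; the actual `p_sdc_per_gpu_hr` sits outside this hunk, but it reproduces the figures quoted below), the per-step probability lands near $10^{-5}$, so the old 5% guard could never pass. A standalone sketch:

```python
# Assumed rate: one SDC per million GPU-hours (not shown in this hunk)
p_sdc_per_gpu_hr = 1e-6
n_gpus = 100_000
training_step_s = 2

p_step_sdc = 1 - (1 - p_sdc_per_gpu_hr) ** (n_gpus * (training_step_s / 3600))

print(f"{p_step_sdc:.6f}")     # ~0.000056 -> "0.0056%" with the new .4f format
print(p_step_sdc > 0.05)       # False: the old guard was off by three orders of magnitude
print(p_step_sdc > 0.00005)    # True:  the corrected threshold
```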
@@ -1633,7 +1633,7 @@ class SDCCollective:
 3. **The Exposure**: In a 2-second window, the fleet has $100,000 \times (2/3600) \approx 55$ "GPU-hours" of exposure.
 4. **The Probability**: $P(\text{at least one SDC}) \approx$ **`{python} SDCCollective.prob_str`**.
 
-**The Systems Insight**: In a 100k-GPU fleet, a silent error occurs every 20 steps. If your AllReduce does not implement **Checksummed Collectives** or **Hash-and-Verify** gradients, your model parameters will silently drift into "Numerical Garbage" within minutes. Robustness moves from being a "Restart" problem to a **Verification** problem: the fleet must perform redundant reductions or use parity-protected gradients to catch the silent killer of scale.
+**The Systems Insight**: In a 100k-GPU fleet, a silent error occurs every 18,000 steps (roughly every 10 hours). If your AllReduce does not implement **Checksummed Collectives** or **Hash-and-Verify** gradients, your model parameters will silently drift into "Numerical Garbage" within half a day. Robustness moves from being a "Restart" problem to a **Verification** problem: the fleet must perform redundant reductions or use parity-protected gradients to catch the silent killer of scale.
 
 :::
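The corrected "every 18,000 steps" figure follows directly from the per-step probability sketched above (again assuming the $\sim 5.6 \times 10^{-5}$ per-step value):

$$\mathbb{E}[\text{steps between SDCs}] = \frac{1}{P(\text{SDC per step})} \approx \frac{1}{5.6 \times 10^{-5}} \approx 18{,}000, \qquad 18{,}000 \times 2\,\text{s} = 36{,}000\,\text{s} \approx 10\,\text{hours}$$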
@@ -2563,7 +2563,7 @@ ax.text(8, 50, 'Low-carbon\nwindow', fontsize=9, ha='center', color='#008F45')
 ax.text(20, 900, 'Peak carbon', fontsize=9, ha='center', color='#CB202D')
 
 ax.set_xlabel('Hour of Day', fontsize=11, fontweight='bold')
-ax.set_ylabel('Grid Carbon Intensity (g CO₂/kWh)', fontsize=11, fontweight='bold')
+ax.set_ylabel('Grid Carbon Intensity (g CO2/kWh)', fontsize=11, fontweight='bold')
 ax.set_xlim(0, 23)
 ax.set_ylim(0, 1050)
 ax.legend(loc='upper right', fontsize=9)