diff --git a/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd b/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd index 9861092b6..9d873e014 100644 --- a/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd +++ b/book/quarto/contents/vol2/backmatter/appendix_fleet.qmd @@ -31,7 +31,7 @@ This appendix collects the reference numbers and compact models for fleet-scale import math from mlsys.constants import * -from mlsys.formatting import fmt, check, md, md_math, sci_latex +from mlsys.formatting import fmt, fmt_percent, check, md, md_math, sci_latex from mlsys.formulas import ( calc_mtbf_cluster, calc_mtbf_node, calc_failure_probability, calc_effective_flops, @@ -62,6 +62,10 @@ class FleetFoundations: # Level 3: Cluster cluster_sizes = [CLUSTER_SMALL_GPUS, CLUSTER_MEDIUM_GPUS, CLUSTER_LARGE_GPUS, CLUSTER_MEGA_GPUS] + cl_small = CLUSTER_SMALL_GPUS + cl_medium = CLUSTER_MEDIUM_GPUS + cl_large = CLUSTER_LARGE_GPUS + cl_mega = CLUSTER_MEGA_GPUS # for prose: "At FF.cl_mega GPUs" # Efficiency & Budgets mfu_range = (MFU_TRAINING_LOW, MFU_TRAINING_HIGH) @@ -70,6 +74,12 @@ class FleetFoundations: oh_failure = OVERHEAD_FAILURE_RECOVERY oh_maintenance = OVERHEAD_MAINTENANCE + # Scaling efficiency percentages (for prose: ~FF.eff_1024%) + eff_32 = int(SCALING_EFF_32GPU * 100) + eff_256 = int(SCALING_EFF_256GPU * 100) + eff_1024 = int(SCALING_EFF_1024GPU * 100) + eff_8192 = int(SCALING_EFF_8192GPU * 100) + # ┌── 2. 
EXECUTE (The Compute) ──────────────────────────────────────── # Step 1: Level 4: Ratios & Rework nvlink_h100_bw_val = int(nvlink_h100_qty.m_as(GB / second)) @@ -81,10 +91,18 @@ class FleetFoundations: node_mtbf_params["psu_mttf"], 4 ) + mtbf_256_h = calc_mtbf_cluster(GPU_MTTF_HOURS, 256) + mtbf_2048_h = calc_mtbf_cluster(GPU_MTTF_HOURS, 2048) mtbf_8192_h = calc_mtbf_cluster(GPU_MTTF_HOURS, 8192) mtbf_100k_h = calc_mtbf_cluster(GPU_MTTF_HOURS, CLUSTER_MEGA_GPUS) + mtbf_256_min = mtbf_256_h.m_as(ureg.minute) + mtbf_2048_min = mtbf_2048_h.m_as(ureg.minute) mtbf_8192_min = mtbf_8192_h.m_as(ureg.minute) mtbf_100k_min = mtbf_100k_h.m_as(ureg.minute) + pfail_256_24h = calc_failure_probability(mtbf_256_h, 24 * ureg.hour) + pfail_2048_24h = calc_failure_probability(mtbf_2048_h, 24 * ureg.hour) + pfail_8192_24h = calc_failure_probability(mtbf_8192_h, 24 * ureg.hour) + pfail_100k_24h = calc_failure_probability(mtbf_100k_h, 24 * ureg.hour) # Step 2: Effective FLOPS (1024-GPU cluster) _peak_1024_qty = 1024 * h100_qty @@ -95,7 +113,8 @@ class FleetFoundations: SCALING_EFF_1024GPU, goodput_ratio ) - eff_fraction = _eff_flops_1024_qty / _peak_1024_qty + # Store as plain float so prose/formatters never see a Quantity (avoids display bugs) + eff_fraction = float((_eff_flops_1024_qty / _peak_1024_qty).m_as('')) # ┌── 3. 
GUARD (Invariants) ────────────────────────────────────────── check(nvlink_to_ib > 10, "NVLink must be >10x IB for hierarchy to hold") @@ -115,6 +134,49 @@ class FleetFoundations: oh_failure_pct = int(oh_failure * 100) oh_maintenance_pct = int(oh_maintenance * 100) + # Interconnect scalars (for tables/prose) + nvlink_h100_bw = int(nvlink_h100_qty.m_as(GB / second)) + pcie5_bw = int(pcie5_qty.m_as(GB / second)) + ib_hdr_bw = INFINIBAND_HDR_BW_GBS + ib_xdr_bw = INFINIBAND_XDR_BW_GBS + roce_bw = ROCE_100G_BW_GBS + eth_400g_bw = ETHERNET_400G_BW_GBS + tpuv5_ici = int(TPUV5P_ICI_BW.m_as(GB / second)) + ib_hdr_lat = IB_HDR_LATENCY_US + roce_lat = ROCE_LATENCY_US + tcp_lat = TCP_LATENCY_US + ib_to_tcp_lat = int(TCP_LATENCY_US / IB_NDR_LATENCY_US) + + # Checkpoint sizes (GB or TB for display) + ckpt_7b_gb = calc_checkpoint_size(7e9, 16).m_as(GB) + ckpt_70b_gb = calc_checkpoint_size(70e9, 16).m_as(GB) + ckpt_175b_gb = calc_checkpoint_size(175e9, 16).m_as(GB) + ckpt_1t_tb = calc_checkpoint_size(1e12, 16).m_as(TB) + + # Rack & PUE + rack_trad = RACK_POWER_TRADITIONAL_KW + rack_ai = RACK_POWER_AI_TYPICAL_KW + rack_ai_high = RACK_POWER_AI_HIGH_KW + air_limit = AIR_COOLING_LIMIT_KW + pue_liquid = PUE_LIQUID_COOLED + pue_air = PUE_BEST_AIR + pue_typical = PUE_TYPICAL + pue_legacy = PUE_LEGACY + rack_ratio = rack_ai / rack_trad + + # Accelerator table (scalars for fmt) + h100_flops = int(H100_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)) + h100_bw_tbs = fmt(H100_MEM_BW.m_as(TB / second), precision=2, commas=False) + h100_cap = int(H100_MEM_CAPACITY.m_as(GiB)) + h100_tdp = int(H100_TDP.m_as(watt)) + b200_flops = int(B200_FLOPS_FP16_TENSOR.m_as(TFLOPs / second)) + b200_bw_tbs = fmt(B200_MEM_BW.m_as(TB / second), precision=2, commas=False) + b200_cap = int(B200_MEM_CAPACITY.m_as(GiB)) + b200_tdp = int(B200_TDP.m_as(watt)) + tpuv5_flops = int(TPUV5P_FLOPS_BF16.m_as(TFLOPs / second)) + tpuv5_bw_tbs = fmt(TPUV5P_MEM_BW.m_as(TB / second), precision=2, commas=False) + tpuv5_cap = 
int(TPUV5P_MEM_CAPACITY.m_as(GiB)) + # ── EXPORTS (Bridge to Text) ──────────────────────────────────────────────── FF = FleetFoundations @@ -198,7 +260,7 @@ If you memorize nothing else from this section, memorize these: 2. **MTBF scales as $1/N$**: A cluster's mean time between failures is the single-component MTBF divided by the number of components. At `{python} fmt(FF.cl_mega, precision=0)` GPUs, expect a failure every `{python} fmt(FF.mtbf_100k_min, precision=0)` minutes. -3. **~`{python} fmt(FF.eff_fraction * 100, precision=0)`% effective utilization**: After MFU, scaling efficiency, and overhead losses compound, a 1,024-GPU cluster delivers roughly `{python} fmt(FF.eff_fraction * 100, precision=0)`% of its peak FLOPS as useful training work. +3. **~`{python} fmt_percent(FF.eff_fraction)`% effective utilization**: After MFU, scaling efficiency, and overhead losses compound, a 1,024-GPU cluster delivers roughly `{python} fmt_percent(FF.eff_fraction)`% of its peak FLOPS as useful training work. ::: Quick reference --- @tbl-fleet-numbers-quick-ref condenses the numbers below into one place. Use it for back-of-envelope checks; use the detailed tables in each subsection when designing or debugging. 
@@ -708,7 +770,8 @@ The key insight for fleet-scale ML is that weak scaling is not just a mathematic peak_str = fmt(FF.peak_1024, precision=0) eff_str = fmt(FF.eff_flops_1024, precision=0) -eff_pct_str = fmt(FF.eff_fraction * 100, precision=0, commas=False) +# Effective % of peak (0.50 × 0.50 × 0.77 ≈ 19.25%); use fmt_percent to avoid display bugs +eff_pct_str = fmt_percent(FF.eff_fraction, precision=1) goodput_pct_str = fmt(FF.goodput_ratio * 100, precision=0, commas=False) mfu_pct_str = fmt(MFU_TRAINING_HIGH * 100, precision=0, commas=False) scaling_pct_str = fmt(SCALING_EFF_1024GPU * 100, precision=0, commas=False) diff --git a/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd b/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd index b52593454..d91aa9add 100644 --- a/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd +++ b/book/quarto/contents/vol2/fault_tolerance/fault_tolerance.qmd @@ -270,7 +270,7 @@ tau_opt = np.sqrt(2 * t_write * mtbf) fig, ax = plt.subplots(figsize=(8, 5)) -ax.plot(tau, over_ckpt, label='Checkpoint Overhead ($T_{\text{write}}/\\tau$)', color=COLORS['BlueLine'], linestyle='--') -ax.plot(tau, over_rework, label='Expected Rework ($\\tau/2\\text{\text{MTBF}}$)', color=COLORS['RedLine'], linestyle='--') +ax.plot(tau, over_ckpt, label=r'Checkpoint Overhead ($T_{\text{write}}/\tau$)', color=COLORS['BlueLine'], linestyle='--') +ax.plot(tau, over_rework, label=r'Expected Rework ($\tau/2\text{MTBF}$)', color=COLORS['RedLine'], linestyle='--') ax.plot(tau, total_waste, label='Total Wasted Work', color=COLORS['GreenLine'], linewidth=2.5) # Optimal point @@ -536,96 +536,94 @@ Facebook documented a pervasive SDC issue where a hardware fault caused a valid helvetica/.style={align=flush center,font=\small\usefont{T1}{phv}{m}{n}}, Line/.style={line width=1.0pt,black!50,text=black}, cube/.style={cylinder, draw,shape border rotate=90, aspect=1.8,inner ysep=0pt, - minimum height=34mm,minimum width=25mm, cylinder uses custom fill, - cylinder body fill=GrayL,cylinder end fill=GrayL}, - Box/.style={ - inner xsep=2pt, - node distance=1.1, -
draw=GreenLine, - line width=0.75pt, - font=\usefont{T1}{phv}{m}{n}\small, - align=flush center, - fill=GreenL, - text width=29mm, - minimum width=29mm, minimum height=10mm - }, + minimum height=34mm,minimum width=25mm, cylinder uses custom fill, + cylinder body fill=GrayL,cylinder end fill=GrayL}, + Box/.style={, + inner xsep=2pt, + node distance=1.1, + draw=GreenLine, + line width=0.75pt, + font=\usefont{T1}{phv}{m}{n}\small, + align=flush center, + fill=GreenL, + text width=29mm, + minimum width=29mm, minimum height=10mm + }, Box2/.style={helvetica, - inner xsep=2pt, - node distance=0.8, - draw=VioletLine, - line width=0.75pt, - font=\usefont{T1}{phv}{m}{n}\small, - align=flush center, - fill=VioletL2, - text width=32mm, - minimum width=32mm, minimum height=8mm - }, - arrow/.style={single arrow, draw=black, thick, fill=VioletL, - minimum width=15pt, single arrow head extend=3pt, rotate=270, - minimum height=7mm} + inner xsep=2pt, + node distance=0.8, + draw=VioletLine, + line width=0.75pt, + font=\usefont{T1}{phv}{m}{n}\small, + align=flush center, + fill=VioletL2, + text width=32mm, + minimum width=32mm, minimum height=8mm + }, } \node[Box](B2){Scale math.pow()}; \node[Box,above=of B2](B1){Decompress file size calculation}; -\begin{scope}[local bounding box=CPU, shift={($(B2)+(0,-2.6)$)}, scale=0.7, every node/.append style={transform shape}] - \node[fill=BlueL, minimum width=56, minimum height=56, rounded corners=8, outer sep=2pt] (C1) {}; - \node[fill=white, minimum width=44, minimum height=44] (C2) {}; - \node[fill=BlueL, minimum width=39, minimum height=39, align=center, inner sep=0pt, font=\usefont{T1}{phv}{m}{n}\fontsize{8pt}{9}\selectfont] (C3) {Defective\\CPU}; +\begin{scope}[local bounding box = CPU,shift={($(B2)+(0,-2.6)$)}, scale=0.7, every node/.append style={transform shape}] + \node[fill=BlueL,minimum width=56, minimum height=56, rounded corners=8,outer sep=2pt] (C1) {}; + \node[fill=white,minimum width=44, minimum height=44] (C2) {}; + 
\node[fill=BlueL,minimum width=39, minimum height=39, align=center,inner sep=0pt,font=\usefont{T1}{phv}{m}{n} \fontsize{8pt}{9}\selectfont] (C3) {Defective\\CPU}; \foreach \x/\y in {0.11/1,0.26/2,0.41/3,0.56/4,0.71/5,0.85/6}{ - \node[fill=BlueL, minimum width=3, minimum height=12, inner sep=0pt, anchor=south](GO\y) at ($(C1.north west)!\x!(C1.north east)$) {}; + \node[fill=BlueL,minimum width=3, minimum height=12, inner sep=0pt,anchor=south](GO\y)at($(C1.north west)!\x!(C1.north east)$){}; } \foreach \x/\y in {0.11/1,0.26/2,0.41/3,0.56/4,0.71/5,0.85/6}{ - \node[fill=BlueL, minimum width=3, minimum height=12, inner sep=0pt, anchor=north](DO\y) at ($(C1.south west)!\x!(C1.south east)$) {}; + \node[fill=BlueL,minimum width=3, minimum height=12, inner sep=0pt,anchor=north](DO\y)at($(C1.south west)!\x!(C1.south east)$){}; } \foreach \x/\y in {0.11/1,0.26/2,0.41/3,0.56/4,0.71/5,0.85/6}{ - \node[fill=BlueL, minimum width=12, minimum height=3, inner sep=0pt, anchor=east](LE\y) at ($(C1.north west)!\x!(C1.south west)$) {}; + \node[fill=BlueL,minimum width=12, minimum height=3, inner sep=0pt,anchor=east](LE\y)at($(C1.north west)!\x!(C1.south west)$){}; } \foreach \x/\y in {0.11/1,0.26/2,0.41/3,0.56/4,0.71/5,0.85/6}{ - \node[fill=BlueL, minimum width=12, minimum height=3, inner sep=0pt, anchor=west](DE\y) at ($(C1.north east)!\x!(C1.south east)$) {}; + \node[fill=BlueL,minimum width=12, minimum height=3, inner sep=0pt,anchor=west](DE\y)at($(C1.north east)!\x!(C1.south east)$){}; } \end{scope} -\begin{scope}[local bounding box=CY1, right=of B2, xshift=5mm] +\begin{scope}[local bounding box = CY1,shift={($(B2)+(5,-0.1)$)}] \node (CA1) [cube] {}; - \node (CA2) [cube, minimum height=10pt, fill=BlueL] at ($(CA1.bottom)!0.1!(CA1.top)$) {}; - \node (CA3) [cube, minimum height=10pt, draw=RedLine, fill=RedL] at ($(CA2.bottom)+(0,2.6mm)$) {}; - \node (CA4) [cube, minimum height=10pt, draw=RedLine, fill=RedL] at ($(CA3.bottom)+(0,2.6mm)$) {}; - \node (CA5) [cube, minimum height=10pt, 
fill=BlueL] at ($(CA1.bottom)!0.65!(CA1.top)$) {}; - \node[align=center] at (CA1) {Spark shuffle and\\ merge database}; + \node (CA2) [cube,minimum height=10pt, fill=BlueL]at($(CA1.bottom)!0.1!(CA1.top)$) {}; + \node (CA3) [cube,minimum height=10pt,draw=RedLine, fill=RedL]at($(CA2.bottom)+(0,2.6mm)$){}; + \node (CA4) [cube,minimum height=10pt,draw=RedLine, fill=RedL]at($(CA3.bottom)+(0,2.6mm)$){}; + \node (CA5) [cube,minimum height=10pt, fill=BlueL]at($(CA1.bottom)!0.65!(CA1.top)$) {}; + \node[align=center]at (CA1){Spark shuffle and\\ merge database}; \end{scope} -\begin{scope}[local bounding box=CY2, left=of B2, xshift=-5mm] +\begin{scope}[local bounding box = CY2,shift={($(B2)+(-5,-0.1)$)}] \node (LCA1) [cube] {}; - \node[align=center] at (LCA1) {Spark pre-shuffle \\ data store\\(compressed)}; + \node[align=center]at (LCA1){Spark pre-shuffle \\ data store\\(compressed)}; \end{scope} -\node[arrow] at ($(B2)!0.52!(B1)$) {}; -\node[arrow] at ($(B2)!0.39!(CPU)$) {}; +\node[single arrow, draw=black,thick, fill=VioletL, minimum width = 15pt, single arrow head extend=3pt,rotate=270, minimum height=7mm]at($(B2)!0.52!(B1)$) {}; +\node[single arrow, draw=black,thick, fill=VioletL, minimum width = 15pt, single arrow head extend=3pt,rotate=270, minimum height=7mm]at($(B2)!0.39!(CPU)$) {}; -\coordinate (DES) at ($(DE1)!0.5!(DE6)$); -\coordinate (LEV) at ($(LE1)!0.5!(LE6)$); -\node[single arrow, draw=black, thick, fill=VioletL, inner sep=1pt, minimum width=14pt, single arrow head extend=2pt, anchor=east, minimum height=18mm] (LS) at ($(LEV)+(-0.5,0)$) {}; -\node[single arrow, draw=black, thick, fill=VioletL, inner sep=1pt, minimum width=14pt, single arrow head extend=2pt, anchor=west, minimum height=18mm] (DS) at ($(DES)+(0.5,0)$) {}; +\coordinate(DES)at($(DE1)!0.5!(DE6)$); +\coordinate(LEV)at($(LE1)!0.5!(LE6)$); +\node[single arrow, draw=black,thick, fill=VioletL, inner sep=1pt, minimum width = 14pt, single arrow head extend=2pt,anchor=east, minimum 
height=18mm](LS)at($(LEV)+(-0.5,0)$) {}; +\node[single arrow, draw=black,thick, fill=VioletL, inner sep=1pt, minimum width = 14pt, single arrow head extend=2pt,anchor=west, minimum height=18mm](DS)at($(DES)+(0.5,0)$) {}; +%fitting \scoped[on background layer] -\node[draw=VioletLine, inner xsep=6.5mm, inner ysep=6.5mm, outer sep=0pt, yshift=2mm, fill=none, fit=(CPU)(B1), line width=2.5pt] (BB1) {}; -\node[below=3pt of BB1.north, anchor=north, helvetica] {Shuffle and merge}; +\node[draw=VioletLine,inner xsep=6.5mm,inner ysep=6.5mm,outer sep=0pt, yshift=2mm,fill=none,fit=(CPU)(B1),line width=2.5pt](BB1){}; +\node[below=3pt of BB1.north,anchor=north,helvetica]{Shuffle and merge}; -\node[Box2, below left=0.5 of LS] (N2) {\textbf{2.} Compute (1.1)\textsuperscript{53}}; -\node[Box2, below right=0.5 of DS, fill=BlueL, draw=BlueLine] (R3) {\textbf{3.} Result = 0}; -\node[Box2, below right=0.3 and -2.5 of R3, text width=43mm] (N3) {\textbf{3.} Expected Result = 156.24}; +\node[Box2,below left=0.5 of LS](N2){\textbf{2.} Compute (1.1)\textsuperscript{53}}; +\node[Box2,below right=0.5 of DS,fill=BlueL,draw=BlueLine](R3){\textbf{3.} Result = 0}; +\node[Box2,below right=0.3 and -2.5 of R3,text width=43mm](N3){\textbf{3.} Expected Result = 156.24}; -\node[Box2, above=of CY2] (N1) {\textbf{1.} Compute file size for decompression}; -\node[Box2, above=of CY1] (N4) {\textbf{4.} Write file to database if size $>$ 0}; -\node[Box2, below right=0.2 and -1.15 of CY1] (N5) {\textbf{5.} Missing rows in DB}; +\node[Box2,above= of CY2](N1){\textbf{1.} Compute file size for decompression}; +\node[Box2,above= of CY1](N4){\textbf{4.} Write file to database if size $>$ 0}; +\node[Box2,below right= 0.2 and -1.15of CY1](N5){\textbf{5.} Missing rows in DB}; -\draw[Line, -latex] (N5) |- (CA3.before bottom); -\draw[Line, -latex] (N5.50) |- (CA4.6); -\draw[Line] (N3.20) |- (R3); -\draw[Line, latex-] (LCA1.top) |- (B1); -\draw[Line, latex-] (CA1.top) |- (B1); +\draw[Line,-latex](N5)|-(CA3.before bottom); 
+\draw[Line,-latex](N5.50)|-(CA4.6); +\draw[Line](N3.20)|-(R3); +\draw[Line,-latex](LCA1.top)|-(B1); +\draw[Line,latex-](CA1.top)|-(B1); \end{tikzpicture} ``` ::: @@ -647,103 +645,109 @@ Google addresses this by maintaining "Hot Spares"—running the same computation ::: {#fig-sdc-controller fig-env="figure" fig-pos="htb" fig-cap="**Hot Spare Redundancy**. Google's data centers use hot spare cores to maintain uninterrupted ML training despite hardware failures, seamlessly transitioning workloads from defective machines to backup resources. This approach contrasts with parallel redundancy techniques like DMR/TMR by providing a reactive fault tolerance mechanism that minimizes downtime and preserves data integrity during ML training. Source: Jeff Dean, MLSys 2024 Keynote." fig-alt="Four-panel sequence: normal training grid, defective machine marked red, SDC checker detecting fault, workload transferred to hot spare while defective unit sent for repair."} ```{.tikz} \begin{tikzpicture}[line width=0.75pt,font=\small\usefont{T1}{phv}{m}{n}] -\tikzset{ - helvetica/.style={align=flush center,font=\small\usefont{T1}{phv}{m}{n}}, - Line/.style={line width=1.0pt,black!50,rounded corners=7,-latex}, - main/.style={circle, minimum size=5mm, line width=0.7mm,draw=RedLine,keep name}, - keep name/.style={prefix after command={\pgfextra{\let\fixname\tikzlastnode}}}, - RedLine box/.style={ - append after command={ - node [rotate=-50, - fit=(\fixname), - fill=RedL, - text width=1.3mm, - inner sep=-\pgflinewidth, - rectangle - ] {} - } - }, - Box/.style={helvetica, - inner xsep=2pt, - node distance=0.7, - draw=GreenLine, - line width=0.75pt, - rounded corners, - fill=GreenL, - minimum width=11mm, minimum height=6mm - }, - Text/.style={ - inner sep=2pt, - draw=none, - line width=0.75pt, - fill=GrayL, - helvetica, - align=flush center, - minimum width=10mm, minimum height=6mm - }, +% +\tikzset{% +helvetica/.style={align=flush center,font=\small\usefont{T1}{phv}{m}{n}}, +Line/.style={line 
width=1.0pt,black!50,rounded corners=7,-latex}, +main/.style={circle, minimum size=5mm, line width=0.7mm,draw=RedLine,keep name}, +keep name/.style={prefix after command={\pgfextra{\let\fixname\tikzlastnode}}}, +RedLine box/.style={ + append after command={ + node [rotate=-50, + fit=(\fixname) , + fill=RedL, + text width=1.3mm, + inner sep=-\pgflinewidth, + rectangle + ] {} + } +}, +Box/.style={helvetica, + inner xsep=2pt, + node distance=0.7, + draw=GreenLine, + line width=0.75pt, + rounded corners, + fill=GreenL, + minimum width=11mm, minimum height=6mm +}, +Text/.style={% + inner sep=2pt, + draw=none, + line width=0.75pt, + fill=GrayL, + helvetica, + align=flush center, + minimum width=10mm, minimum height=6mm +}, } -\begin{scope}[local bounding box=M1] - \foreach \x in {1,2,3} { - \foreach \y in {1,2,3} { - \node[Box] (R\y\x) at (\x,-\y) {}; +% First Diagram +\begin{scope}[local bounding box=M1,shift={(0,0)}] +\foreach \x in {1,2,3}{ + \foreach \y in {1,2,3}{ + \node[Box] (R\y\x M1) at (1.3*\x,-0.8*\y) {}; } - } - \node[Box,draw=BlueLine,fill=BlueL] at (R32) {}; - \node[Box,draw=GrayLine,fill=GrayL] at (R33) {}; - \node[below=0.2 of R32] {Normal training state}; +} +\node[Box,draw=BlueLine,fill=BlueL] at (R32M1) {}; +\node[Box,draw=GrayLine,fill=GrayL] at (R33M1) {}; +\node[below=0.2 of R32M1] {Normal training state}; \end{scope} +% Second Diagram \begin{scope}[local bounding box=M2,shift={(4.5,0)}] - \foreach \x in {1,2,3} { - \foreach \y in {1,2,3} { - \node[Box] (R\y\x) at (\x,-\y) {}; +\foreach \x in {1,2,3}{ + \foreach \y in {1,2,3}{ + \node[Box] (R\y\x M2) at (1.3*\x,-0.8*\y) {}; } - } - \node[Box,draw=BlueLine,fill=BlueL] at (R32) {}; - \node[Box,draw=GrayLine,fill=GrayL] at (R33) {}; - \node[below=0.2 of R32,align=center,RedLine] (DM) {Defective machine\\ causes SDC}; - \node[main,RedLine box] (c) at (R23) {}; - \draw[Line,RedLine] (R23) -- ++(0:1) |- (DM); +} +\node[Box,draw=BlueLine,fill=BlueL] at (R32M2) {}; +\node[Box,draw=GrayLine,fill=GrayL] at 
(R33M2) {}; +\node[below=0.2 of R32M2,align=center,RedLine] (DM) {Defective machine\\ causes SDC}; +\node[main,RedLine box] (c) at (R23M2) {}; +\draw[Line,RedLine] (R23M2) -- ++(0:1) |- (DM); \end{scope} +% Third Diagram \begin{scope}[local bounding box=M3,shift={(9.0,0)}] - \foreach \x in {1,2,3} { - \foreach \y in {1,2,3} { - \node[Box] (R\y\x) at (\x,-\y) {}; +\foreach \x in {1,2,3}{ + \foreach \y in {1,2,3}{ + \node[Box] (R\y\x M3) at (1.3*\x,-0.8*\y) {}; } - } - \node[Box,draw=BlueLine,fill=BlueL] at (R32) {}; - \node[Box,draw=BlueLine,fill=none,line width=2pt] at (R23) {}; - \node[Box,draw=GrayLine,fill=GrayL] at (R33) {}; - \node[below=0.2 of R32,align=center,Blue] (SD) {SDC checker\\ automatically\\ identifies SDC}; - \node[main,RedLine box] (c) at (R23) {}; - \draw[Line,Blue] (R23) -- ++(0:1) |- (SD); +} +\node[Box,draw=BlueLine,fill=BlueL] at (R32M3) {}; +\node[Box,draw=BlueLine,fill=none,line width=2pt] at (R23M3) {}; +\node[Box,draw=GrayLine,fill=GrayL] at (R33M3) {}; +\node[below=0.2 of R32M3,align=center,Blue] (SD) {SDC checker\\ automatically\\ identifies SDC}; +\node[main,RedLine box] (c) at (R23M3) {}; +\draw[Line,Blue] (R23M3) -- ++(0:1) |- (SD); \end{scope} +% Fourth Diagram \begin{scope}[local bounding box=M4,shift={(13.5,0)}] - \foreach \x in {1,2,3} { - \foreach \y in {1,2,3} { - \node[Box] (R\y\x) at (\x,-\y) {}; +\foreach \x in {1,2,3}{ + \foreach \y in {1,2,3}{ + \node[Box] (R\y\x M4) at (1.3*\x,-0.8*\y) {}; } - } - \node[Box,draw=BlueLine,fill=BlueL] at (R32) {}; - \node[Box,draw=RedLine,fill=white,line width=2pt] at (R23) {}; - \node[Box,draw=BlueLine,fill=GreenL,line width=2pt] at (R33) {}; - \node[below=0.2 of R32,align=center,Blue] (SD1) {SDC checker moves\\ training to hot spare\\ and sends defective\\ machine for repair}; - \node[main,RedLine box] (c) at (R23) {}; - \draw[Line,Blue] (R33) -- ++(0:1) |- (SD1); +} +\node[Box,draw=BlueLine,fill=BlueL] at (R32M4) {}; +\node[Box,draw=RedLine,fill=white,line width=2pt] at (R23M4) {}; 
+\node[Box,draw=BlueLine,fill=GreenL,line width=2pt] at (R33M4) {}; +\node[below=0.2 of R32M4,align=center,Blue] (SD1) {SDC checker moves\\ training to hot spare\\ and sends defective\\ machine for repair}; +\node[main,RedLine box] (c) at (R23M4) {}; +\draw[Line,Blue] (R33M4) -- ++(0:1) |- (SD1); \end{scope} +% Legend \begin{scope}[local bounding box=LE,shift={(3.5,0.4)}] - \node[Box,draw=GreenLine, fill=GreenL] (ZE) {}; - \node[right=2pt of ZE,font=\small\usefont{T1}{phv}{m}{n}\footnotesize] (L1) {Synchronous Training Worker}; - \node[Box,draw=BlueLine,fill=BlueL,right=of L1] (PL) {}; - \node[right=2pt of PL,font=\small\usefont{T1}{phv}{m}{n}\footnotesize] (L2) {SDC checker}; - \node[Box,draw=GrayLine,fill=GrayL,right=of L2] (SI) {}; - \node[right=2pt of SI,font=\small\usefont{T1}{phv}{m}{n}\footnotesize] (L3) {Hot spare}; - \scoped[on background layer] - \node[draw=BackLine,inner xsep=10,inner ysep=6,yshift=0mm,fill=BackColor!60,fit=(ZE)(L3),line width=0.75pt] (BB1) {}; +\node[Box,draw=GreenLine, fill=GreenL] (ZE) {}; +\node[right=2pt of ZE,font=\small\usefont{T1}{phv}{m}{n}\footnotesize] (L1) {Synchronous Training Worker}; +\node[Box,draw=BlueLine,fill=BlueL,right=of L1] (PL) {}; +\node[right=2pt of PL,font=\small\usefont{T1}{phv}{m}{n}\footnotesize] (L2) {SDC checker}; +\node[Box,draw=GrayLine,fill=GrayL,right=of L2] (SI) {}; +\node[right=2pt of SI,font=\small\usefont{T1}{phv}{m}{n}\footnotesize] (L3) {Hot spare}; +\scoped[on background layer] +\node[draw=BackLine,inner xsep=10,inner ysep=6,yshift=0mm,fill=BackColor!60,fit=(ZE)(L3),line width=0.75pt] (BB1) {}; \end{scope} \end{tikzpicture} ``` @@ -766,9 +770,7 @@ Byzantine failures are particularly dangerous in distributed training because th Msg/.style={draw=GrayLine, line width=0.75pt, rectangle, rounded corners, font=\scriptsize, fill=white}, Title/.style={font=\bfseries, anchor=south}, GreenNode/.style={NodeStyle, draw=GreenLine, fill=GreenL}, - RedNode/.style={NodeStyle, draw=RedLine, fill=RedL}, - 
ArrowStyle/.style={->, line width=1.0pt}, - DashedArrowStyle/.style={->, line width=1.0pt, dashed} + RedNode/.style={NodeStyle, draw=RedLine, fill=RedL} } \begin{scope}[local bounding box=fail_stop] @@ -776,8 +778,8 @@ Byzantine failures are particularly dangerous in distributed training because th \node[GreenNode, above right=of C1] (W1) {W1}; \node[RedNode, below right=of W1, label={right:Silent}] (W2) {W2 (X)}; - \draw[ArrowStyle, GreenLine] (W1) -- node[Msg, above, sloped] {Grad} (C1); - \draw[DashedArrowStyle, RedLine] (W2) -- node[Msg, below, sloped] {Timeout} (C1); + \draw[->, line width=1.0pt, GreenLine] (W1) -- node[Msg, above, sloped] {Grad} (C1); + \draw[->, line width=1.0pt, dashed, RedLine] (W2) -- node[Msg, below, sloped] {Timeout} (C1); \node[Title, above=0.5cm of W1] {\textbf{Fail-Stop Failure}}; \end{scope} @@ -787,8 +789,8 @@ Byzantine failures are particularly dangerous in distributed training because th \node[GreenNode, above right=of C2] (W3) {W1}; \node[RedNode, below right=of W3, label={right:Malicious}] (W4) {W2 (?!)}; - \draw[ArrowStyle, GreenLine] (W3) -- node[Msg, above, sloped] {Grad: 0.5} (C2); - \draw[ArrowStyle, RedLine] (W4) -- node[Msg, below, sloped] {Grad: 9.9} (C2); + \draw[->, line width=1.0pt, GreenLine] (W3) -- node[Msg, above, sloped] {Grad: 0.5} (C2); + \draw[->, line width=1.0pt, RedLine] (W4) -- node[Msg, below, sloped] {Grad: 9.9} (C2); \node[Title, above=0.5cm of W3] {\textbf{Byzantine Failure}}; \node[RedLine, font=\bfseries, below=1.5cm of W3] {Poisoned Update!}; @@ -892,14 +894,14 @@ The practical implication for ML systems is that fleet-wide failure rates depend ::: {#fig-bathtub-curve fig-env="figure" fig-pos="htb" fig-cap="**The Bathtub Curve**. Hardware failure rates $\lambda(t)$ vary over time. (1) **Infant Mortality**: High failure rate initially due to manufacturing defects. (2) **Useful Life**: Constant, low failure rate where random failures dominate. 
(3) **Wear-Out**: Increasing failure rate as components age. Burn-in testing aims to filter out infant mortality failures before deployment." fig-alt="Line graph of failure rate versus component age showing bathtub shape. Three phases: infant mortality with high decreasing rate, useful life with constant low rate, and wear-out with increasing rate. Vertical dashed line marks burn-in period."} ```{.tikz} -\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}] - \tikzset{ - RedLine/.style={draw=red, ultra thick}, - BlueLine/.style={draw=blue, ultra thick}, - OrangeLine/.style={draw=orange, ultra thick}, - annotation/.style={align=center, font=\footnotesize}, - dashedLine/.style={dashed, thick, black!60} - } +\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}, + node distance=1cm and 1cm, + RedLine/.style={draw=red, thick}, + BlueLine/.style={draw=blue, thick}, + OrangeLine/.style={draw=orange, thick}, + axis label/.style={font=\footnotesize, align=center}, + annotation/.style={font=\scriptsize, align=center} +] \begin{axis}[ width=10cm, height=6cm, @@ -912,20 +914,20 @@ The practical implication for ML systems is that fleet-wide failure rates depend grid=none ] % Infant Mortality - \addplot[domain=0.5:2, samples=50, RedLine] {1/(x*2) + 0.2}; - \node[annotation, RedLine] at (axis cs: 1.2, 0.9) {Infant Mortality\\(Defects)}; + \addplot[domain=0.5:2, samples=50, RedLine, ultra thick] {1/(x*2) + 0.2}; + \node[axis label, RedLine] at (axis cs: 1.2, 0.9) {Infant Mortality\\(Defects)}; % Useful Life - \addplot[domain=2:7, samples=2, BlueLine] {0.2}; - \node[annotation, BlueLine] at (axis cs: 4.5, 0.35) {Useful Life\\(Random Failures)}; + \addplot[domain=2:7, samples=2, BlueLine, ultra thick] {0.2}; + \node[axis label, BlueLine] at (axis cs: 4.5, 0.35) {Useful Life\\(Random Failures)}; % Wear Out - \addplot[domain=7:9.5, samples=50, OrangeLine] {0.2 + 0.05*exp(x-7)}; - \node[annotation, OrangeLine] at (axis cs: 8.5, 0.9) {Wear-Out\\(Aging)}; + \addplot[domain=7:9.5, 
samples=50, OrangeLine, ultra thick] {0.2 + 0.05*exp(x-7)}; + \node[axis label, OrangeLine] at (axis cs: 8.5, 0.9) {Wear-Out\\(Aging)}; % Burn-in Line - \draw[dashedLine] (axis cs: 2, 0) -- (axis cs: 2, 1.2); - \node[anchor=north west, font=\scriptsize] at (axis cs: 2, 1.2) {Burn-in Period}; + \draw[dashed, thick, black!60] (axis cs: 2, 0) -- (axis cs: 2, 1.2); + \node[annotation, anchor=north west] at (axis cs: 2, 1.2) {Burn-in Period}; \end{axis} \node[annotation] at (5, -1.2) {Burn-in testing filters infant mortality.\\Proactive replacement preempts wear-out.}; @@ -1016,38 +1018,37 @@ Transient faults are the most common category. @fig-bit-flip illustrates the bas ::: {#fig-bit-flip fig-env="figure" fig-pos="htb" fig-cap="**Bit-Flip Error**: Transient faults can alter individual bits in memory, corrupting data or program instructions and potentially causing system malfunctions. These single-bit errors exemplify the vulnerability of hardware to transient faults like those induced by radiation or electromagnetic interference." 
fig-alt="Two 8-bit binary sequences showing bit flip: top row displays original value, bottom row shows corrupted value with one bit changed from 0 to 1 highlighted in red."} ```{.tikz} \begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}, node distance=0pt] - \colorlet{BrownL}{BrownL!30} - \colorlet{BrownLine}{brown} - \tikzset{ - cell/.style={draw=BrownLine, fill=BrownL, line width=0.75pt, minimum size=9mm, align=center}, - highlight/.style={line width=2pt} - } +\colorlet{BrownL}{BrownL!30} +\tikzset{ + cell/.style={draw=BrownLine, fill=BrownL, line width=0.75pt, minimum size=9mm, align=center}, + memory label/.style={right=0.3cm, align=center} +} - % Memory Before - \begin{scope}[local bounding box=M1] - \node[cell] (m1_1) {1}; - \node[cell, right=of m1_1] (m1_2) {0}; - \node[cell, right=of m1_2] (m1_3) {0}; - \node[cell, right=of m1_3] (m1_4) {1}; - \node[cell, right=of m1_4] (m1_5) {1}; - \node[cell, right=of m1_5] (m1_6) {0}; - \node[right=0.3cm of m1_6] (dots1) {$\bullet$ $\bullet$ $\bullet$}; - \node[right=0.3cm of dots1] {Memory before}; - \end{scope} +% Memory Before +\begin{scope}[local bounding box=M1] + \node[cell] (m1_1) {1}; + \node[cell, right=of m1_1] (m1_2) {0}; + \node[cell, right=of m1_2] (m1_3) {0}; + \node[cell, right=of m1_3] (m1_4) {1}; + \node[cell, right=of m1_4] (m1_5) {1}; + \node[cell, right=of m1_5] (m1_6) {0}; + \node[memory label, right=of m1_6] (dots1) {$\bullet$ $\bullet$ $\bullet$}; + \node[memory label, right=of dots1] {Memory before}; +\end{scope} - % Memory After - \begin{scope}[local bounding box=M2, below=2cm of M1] - \node[cell] (m2_1) {1}; - \node[cell, right=of m2_1] (m2_2) {0}; - \node[cell, right=of m2_2, highlight] (m2_3) {1}; - \node[cell, right=of m2_3] (m2_4) {1}; - \node[cell, right=of m2_4] (m2_5) {1}; - \node[cell, right=of m2_5] (m2_6) {0}; - \node[right=0.3cm of m2_6] (dots2) {$\bullet$ $\bullet$ $\bullet$}; - \node[right=0.3cm of dots2] {Memory after}; +% Memory After +\begin{scope}[local bounding box=M2, 
below=2cm of M1] + \node[cell] (m2_1) {1}; + \node[cell, right=of m2_1] (m2_2) {0}; + \node[cell, right=of m2_2, line width=2pt] (m2_3) {1}; + \node[cell, right=of m2_3] (m2_4) {1}; + \node[cell, right=of m2_4] (m2_5) {1}; + \node[cell, right=of m2_5] (m2_6) {0}; + \node[memory label, right=of m2_6] (dots2) {$\bullet$ $\bullet$ $\bullet$}; + \node[memory label, right=of dots2] {Memory after}; - \node[above=3pt of m2_3] {\textbf{Bit-Flip}}; - \end{scope} + \node[above=3pt of m2_3] {\textbf{Bit-Flip}}; +\end{scope} \end{tikzpicture} ``` ::: @@ -1107,63 +1108,66 @@ The [Intel FDIV bug](https://en.wikipedia.org/wiki/Pentium_FDIV_bug), discovered ::: {#fig-stuck-fault fig-env="figure" fig-pos="htb" fig-cap="**Stuck-at Fault Model**: Digital circuits can experience permanent faults where a signal line becomes fixed at a logical 0 or 1, regardless of input; this figure represents a simplified depiction of a stuck-at-0 fault, where a signal is persistently low, potentially leading to incorrect computations or system failures. *Source: [accendo reliability](HTTPS://accendoreliability.com/digital-circuits-stuck-fault-model/)*" fig-alt="Logic gate circuit diagram showing signal propagation through inverters and AND gates. 
One input line marked stuck-at-0 with X symbol, causing incorrect output regardless of input."} ```{.tikz} -\begin{tikzpicture}[line join=round, font=\small\usefont{T1}{phv}{m}{n}] - \tikzset{ - helvetica/.style={align=flush center, font=\small\usefont{T1}{phv}{m}{n}}, - Line/.style={line width=1.0pt, black!50, text=black}, - DLine/.style={draw=OrangeLine!40, line width=1.0pt, -{Triangle[length=3mm, bend]}, shorten >=1.1mm, shorten <=1.15mm}, - nodeStyle/.style={draw, fill=VioletL, line width=0.75pt, minimum width=1.5cm, minimum height=0.76cm, anchor=south west}, - circleStyle/.style={draw=black, fill=white, line width=1.5pt, circle, minimum size=4pt, inner sep=0pt} - } - \colorlet{VioletL}{GreenL!60} +\begin{tikzpicture}[line join=round,font=\small\usefont{T1}{phv}{m}{n}] +\useasboundingbox(-2,2.5) rectangle (15.7,-4.7); +\tikzset{% + helvetica/.style={align=flush center,font=\small\usefont{T1}{phv}{m}{n}}, + Line/.style={line width=1.0pt,black!50,text=black}, + DLine/.style={draw=OrangeLine!40, line width=1.0pt, -{Triangle[length=3mm, bend]}, + shorten >=1.1mm, shorten <=1.15mm}, + VioletL/.style={fill=GreenL!60}, + nodeStyle/.style={draw=black, fill=white, line width=1.5pt, circle, minimum size=4pt, inner sep=0pt}, + diagramNode/.style={line width=1pt, fill=VioletL, minimum width=1.5cm, minimum height=1.5cm, anchor=south west} +} - % Nodes - \node[nodeStyle] (D1) at (0, 0) {}; - \node[circleStyle] at ($(D1.south west)!0.5!(D1.north east)$) {}; - \node[nodeStyle, below=of D1] (D2) {}; - \node[circleStyle] at ($(D2.south west)!0.5!(D2.north east)$) {}; - \node[nodeStyle, right=of D1] (D3) {}; - \node[circleStyle] at ($(D3.south west)!0.5!(D3.north east)$) {}; - \node[nodeStyle, right=of D2] (D4) {}; - \node[circleStyle] at ($(D4.south west)!0.5!(D4.north east)$) {}; +\begin{scope}[scale=1.75, every node/.append style={transform shape}, local bounding box=D1] + \node[diagramNode] (D1) at (0,0) {}; + \node[nodeStyle] (IZD1) at (0.72,0.38) {}; + \coordinate (GD1) at 
(0,0.58); + \coordinate (DD1) at (0,0.18); +\end{scope} - % Coordinates - \coordinate (G1) at ($(D1.south west)!0.76!(D1.north east)$); - \coordinate (D1C) at ($(D1.south west)!0.18!(D1.north east)$); - \coordinate (IZ1) at ($(D1.south west)!0.8!(D1.north east)$); +\begin{scope}[scale=1.75, every node/.append style={transform shape}, local bounding box=D2, below=of D1] + \node[diagramNode] (D2) {}; + \node[nodeStyle] (IZD2) at (0.72,0.38) {}; + \coordinate (GD2) at (0,0.58); + \coordinate (DD2) at (0,0.18); +\end{scope} - \coordinate (G2) at ($(D2.south west)!0.76!(D2.north east)$); - \coordinate (D2C) at ($(D2.south west)!0.18!(D2.north east)$); - \coordinate (IZ2) at ($(D2.south west)!0.8!(D2.north east)$); +\begin{scope}[scale=1.75, every node/.append style={transform shape}, local bounding box=D3, right=of D1] + \node[diagramNode] (D3) {}; + \node[nodeStyle] (IZD3) at (0.72,0.38) {}; + \coordinate (GD3) at (0,0.58); + \coordinate (DD3) at (0,0.18); +\end{scope} - \coordinate (G3) at ($(D3.south west)!0.76!(D3.north east)$); - \coordinate (D3C) at ($(D3.south west)!0.18!(D3.north east)$); - \coordinate (IZ3) at ($(D3.south west)!0.8!(D3.north east)$); +\begin{scope}[scale=1.75, every node/.append style={transform shape}, local bounding box=D4, right=of D3] + \node[diagramNode] (D4) {}; + \node[nodeStyle] (IZD4) at (0.72,0.38) {}; + \coordinate (GD4) at (0,0.58); + \coordinate (DD4) at (0,0.18); +\end{scope} - \coordinate (G4) at ($(D4.south west)!0.76!(D4.north east)$); - \coordinate (D4C) at ($(D4.south west)!0.18!(D4.north east)$); - \coordinate (IZ4) at ($(D4.south west)!0.8!(D4.north east)$); +% Lines +\draw[Line] (IZD2) -- node[above,pos=0.3] (IIZD2) {SAO \textcolor{black}{SA1}} ++(0:3) |- node[below,pos=0.91] (ULDD4) {SAO \textcolor{red}{SA1}} (DD4); +\draw[Line] (IZD1) -- node[above,pos=0.5] (IIZD1) {SAO \textcolor{red}{SA1}} ++(0:2) |- (GD3); +\draw[Line] (IZD3) -- node[above,pos=0.9] (IIZD3) {SAO \textcolor{red}{SA1}} ++(0:1) |- (GD4); +\draw[Line] 
(DD3) -- node[above,pos=0.3] (ULDD3) {SAO \textcolor{red}{SA1}} ++(180:3.6) |- (IZD2); +\draw[Line] (GD1) -- node[above,pos=0.5] (ULGD1) {SAO \textcolor{red}{SA1}} ++(180:2); +\draw[Line] (DD1) -- node[above,pos=0.5] (ULDD1) {SAO \textcolor{red}{SA1}} ++(180:2); +\draw[Line] (GD2) -- node[above,pos=0.5] (ULGD2) {SAO \textcolor{red}{SA1}} ++(180:2); +\draw[Line] (DD2) -- node[above,pos=0.5] (ULDD2) {SAO \textcolor{red}{SA1}} ++(180:2); +\draw[Line] (IZD4) -- node[above,pos=0.5] (IIZD4) {SAO \textcolor{black}{SA1}} ++(0:2); - % Lines - \draw[Line] (IZ2) -- node[above, pos=0.3] {SAO \textcolor{black}{SA1}} ++(3, 0) |- node[below, pos=0.91] {SAO \textcolor{red}{SA1}} (D4C); - \draw[Line] (IZ1) -- node[above, pos=0.5] {SAO \textcolor{red}{SA1}} ++(2, 0) |- (G3); - \draw[Line] (IZ3) -- node[above, pos=0.9] {SAO \textcolor{red}{SA1}} ++(1, 0) |- (G4); - \draw[Line] (D3C) -- node[above, pos=0.3] {SAO \textcolor{red}{SA1}} ++(-3.6, 0) |- (IZ2); - \draw[Line] (G1) -- node[above, pos=0.5] {SAO \textcolor{red}{SA1}} ++(-2, 0); - \draw[Line] (D1C) -- node[above, pos=0.5] {SAO \textcolor{red}{SA1}} ++(-2, 0); - \draw[Line] (G2) -- node[above, pos=0.5] {SAO \textcolor{red}{SA1}} ++(-2, 0); - \draw[Line] (D2C) -- node[above, pos=0.5] {SAO \textcolor{red}{SA1}} ++(-2, 0); - \draw[Line] (IZ4) -- node[above, pos=0.5] {SAO \textcolor{black}{SA1}} ++(2, 0); - - % DLines - \draw[DLine, distance=40] (G1) to[out=50, in=120] (IZ1); - \draw[DLine, distance=44] (D1C) to[out=-50, in=-110] (IZ1); - \draw[DLine, distance=40] (G2) to[out=50, in=120] (IZ2); - \draw[DLine, distance=44] (D2C) to[out=-50, in=-110] (IZ2); - \draw[DLine, distance=50] (D3C) to[out=-50, in=-120] (IZ3); - \draw[DLine, distance=50] (D4C) to[out=-50, in=-100] (IZ4); - \draw[DLine, distance=63] (IZ3) to[out=50, in=90] (IZ4); - \draw[DLine, distance=80] (IZ1) to[out=50, in=90] (IZ3); +% DLines +\draw[DLine, distance=40] (ULGD1) to[out=50,in=120] (IIZD1); +\draw[DLine, distance=44] (ULDD1) to[out=-50,in=-110] (IIZD1); 
+\draw[DLine, distance=40] (ULGD2) to[out=50,in=120] (IIZD2); +\draw[DLine, distance=44] (ULDD2) to[out=-50,in=-110] (IIZD2); +\draw[DLine, distance=50] (ULDD3) to[out=-50,in=-120] (IIZD3); +\draw[DLine, distance=50] (ULDD4) to[out=-50,in=-100] (IIZD4); +\draw[DLine, distance=63] (IIZD3) to[out=50,in=90] (IIZD4); +\draw[DLine, distance=80] (IIZD1) to[out=50,in=90] (IIZD3); \end{tikzpicture} ``` ::: @@ -1203,98 +1207,99 @@ Error detection and correction codes [@hamming1950error][^fn-hamming-ecc-origin] \scalebox{0.8}{% \begin{tikzpicture}[line join=round,font=\small\usefont{T1}{phv}{m}{n}] \tikzset{% - helvetica/.style={align=flush center,font=\small\usefont{T1}{phv}{m}{n}}, - cell/.style={draw=none,line width=0.75pt, minimum width=8,inner xsep=0pt, - align=center,node distance=0,minimum height=22}, - parity/.style={draw=RedLine, fill=RedL, text=white}, - label/.style={align=center,text depth=0.7,font=\usefont{T1}{phv}{m}{n}\small}, - thickline/.style={thick,shorten >=-15,shorten <=-15} + helvetica/.style={align=flush center,font=\small\usefont{T1}{phv}{m}{n}}, + cell/.style={draw=none,line width=0.75pt, minimum width=8,inner xsep=0pt, + align=center,node distance=0,minimum height=22}, + paritybit/.style={draw=RedLine, fill=RedL, text=white} } \begin{scope}[local bounding box=M1] - \def\ma{M1} - \node[cell](B1\ma){0}; - \node[cell,right=of B1\ma](B2\ma){1}; - \node[cell,right=of B2\ma](B3\ma){0}; - \node[cell,right=of B3\ma](B4\ma){0}; - \node[cell,right=of B4\ma](B5\ma){0}; - \node[cell,right=of B5\ma](B6\ma){1}; - \node[cell,right=of B6\ma](B7\ma){0}; + \def\ma{M1} + \node[cell](B1\ma){0}; + \node[cell,right=of B1\ma](B2\ma){1}; + \node[cell,right=of B2\ma](B3\ma){0}; + \node[cell,right=of B3\ma](B4\ma){0}; + \node[cell,right=of B4\ma](B5\ma){0}; + \node[cell,right=of B5\ma](B6\ma){1}; + \node[cell,right=of B6\ma](B7\ma){0}; \end{scope} -\begin{scope}[local bounding box=M2, right=of M1] - \def\ma{M2} - \node[cell](B1\ma){0}; - \node[cell,right=of B1\ma](B2\ma){1}; 
- \node[cell,right=of B2\ma](B3\ma){0}; - \node[cell,right=of B3\ma](B4\ma){0}; - \node[cell,right=of B4\ma](B5\ma){0}; - \node[cell,right=of B5\ma](B6\ma){1}; - \node[cell,right=of B6\ma](B7\ma){0}; - \node[cell,right=of B7\ma,parity](B8\ma){0}; +\begin{scope}[local bounding box=M2, right=2.6 of M1] + \def\ma{M2} + \node[cell](B1\ma){0}; + \node[cell,right=of B1\ma](B2\ma){1}; + \node[cell,right=of B2\ma](B3\ma){0}; + \node[cell,right=of B3\ma](B4\ma){0}; + \node[cell,right=of B4\ma](B5\ma){0}; + \node[cell,right=of B5\ma](B6\ma){1}; + \node[cell,right=of B6\ma](B7\ma){0}; + \node[cell,right=of B7\ma,paritybit](B8\ma){0}; \end{scope} -\begin{scope}[local bounding box=M3, right=of M2] - \def\ma{M3} - \node[cell](B1\ma){0}; - \node[cell,right=of B1\ma](B2\ma){1}; - \node[cell,right=of B2\ma](B3\ma){0}; - \node[cell,right=of B3\ma](B4\ma){0}; - \node[cell,right=of B4\ma](B5\ma){0}; - \node[cell,right=of B5\ma](B6\ma){1}; - \node[cell,right=of B6\ma](B7\ma){0}; - \node[cell,right=of B7\ma,parity](B8\ma){1}; +\begin{scope}[local bounding box=M3, right=2.9 of M2] + \def\ma{M3} + \node[cell](B1\ma){0}; + \node[cell,right=of B1\ma](B2\ma){1}; + \node[cell,right=of B2\ma](B3\ma){0}; + \node[cell,right=of B3\ma](B4\ma){0}; + \node[cell,right=of B4\ma](B5\ma){0}; + \node[cell,right=of B5\ma](B6\ma){1}; + \node[cell,right=of B6\ma](B7\ma){0}; + \node[cell,right=of B7\ma,paritybit](B8\ma){1}; \end{scope} -\begin{scope}[local bounding box=M4, below=of M1] - \def\ma{M4} - \node[cell](B1\ma){1}; - \node[cell,right=of B1\ma](B2\ma){0}; - \node[cell,right=of B2\ma](B3\ma){0}; - \node[cell,right=of B3\ma](B4\ma){0}; - \node[cell,right=of B4\ma](B5\ma){0}; - \node[cell,right=of B5\ma](B6\ma){0}; - \node[cell,right=of B6\ma](B7\ma){0}; +\begin{scope}[local bounding box=M4, below=1 of M1] + \def\ma{M4} + \node[cell](B1\ma){1}; + \node[cell,right=of B1\ma](B2\ma){0}; + \node[cell,right=of B2\ma](B3\ma){0}; + \node[cell,right=of B3\ma](B4\ma){0}; + \node[cell,right=of B4\ma](B5\ma){0}; + 
\node[cell,right=of B5\ma](B6\ma){0}; + \node[cell,right=of B6\ma](B7\ma){0}; \end{scope} -\begin{scope}[local bounding box=M5, right=of M4] - \def\ma{M5} - \node[cell](B1\ma){1}; - \node[cell,right=of B1\ma](B2\ma){0}; - \node[cell,right=of B2\ma](B3\ma){0}; - \node[cell,right=of B3\ma](B4\ma){0}; - \node[cell,right=of B4\ma](B5\ma){0}; - \node[cell,right=of B5\ma](B6\ma){0}; - \node[cell,right=of B6\ma](B7\ma){0}; - \node[cell,right=of B7\ma,parity](B8\ma){1}; +\begin{scope}[local bounding box=M5, right=2.6 of M4] + \def\ma{M5} + \node[cell](B1\ma){1}; + \node[cell,right=of B1\ma](B2\ma){0}; + \node[cell,right=of B2\ma](B3\ma){0}; + \node[cell,right=of B3\ma](B4\ma){0}; + \node[cell,right=of B4\ma](B5\ma){0}; + \node[cell,right=of B5\ma](B6\ma){0}; + \node[cell,right=of B6\ma](B7\ma){0}; + \node[cell,right=of B7\ma,paritybit](B8\ma){1}; \end{scope} -\begin{scope}[local bounding box=M6, right=of M5] - \def\ma{M6} - \node[cell](B1\ma){1}; - \node[cell,right=of B1\ma](B2\ma){0}; - \node[cell,right=of B2\ma](B3\ma){0}; - \node[cell,right=of B3\ma](B4\ma){0}; - \node[cell,right=of B4\ma](B5\ma){0}; - \node[cell,right=of B5\ma](B6\ma){1}; - \node[cell,right=of B6\ma](B7\ma){0}; - \node[cell,right=of B7\ma,parity](B8\ma){0}; +\begin{scope}[local bounding box=M6, right=2.9 of M5] + \def\ma{M6} + \node[cell](B1\ma){1}; + \node[cell,right=of B1\ma](B2\ma){0}; + \node[cell,right=of B2\ma](B3\ma){0}; + \node[cell,right=of B3\ma](B4\ma){0}; + \node[cell,right=of B4\ma](B5\ma){0}; + \node[cell,right=of B5\ma](B6\ma){1}; + \node[cell,right=of B6\ma](B7\ma){0}; + \node[cell,right=of B7\ma,paritybit](B8\ma){0}; \end{scope} -\node[label,above=0.5 of $(B1M1)!0.5!(B7M1)$](SS){sequence of\\ seven bits}; -\node[label,above=0.5 of $(B1M2)!0.5!(B8M2)$](WE){with eighth\\ even parity bit}; -\node[label,above=0.5 of $(B1M3)!0.5!(B8M3)$](WO){with eighth\\ odd parity bit}; -\node[label,above=0.6 of $(SS)!0.5!(WO)$,bluegraph](PBE){Parity bit examples}; +\node[above=0.5 of 
$(B1M1)!0.5!(B7M1)$,align=center,text depth=0.7](SS){sequence of\\ seven bits}; +\node[above=0.5 of $(B1M2)!0.5!(B8M2)$,align=center,text depth=0.7](WE){with eighth\\ even parity bit}; +\node[above=0.5 of $(B1M3)!0.5!(B8M3)$,align=center,text depth=0.7](WO){with eighth\\ odd parity bit}; +\node[above=0.6 of $(SS)!0.5!(WO)$,align=center,text depth=0.7,bluegraph](PBE){Parity bit examples}; -\draw[thickline]($(B1M1)!0.5!(B1M4)$)coordinate(X0)--($(B8M3)!0.5!(B8M6)$)coordinate(X1); -\draw[thickline]([yshift=2pt]B1M1.north west)--([yshift=2pt]B8M3.north east); -\draw[thickline]($(B7M4)!0.5!(B1M5)$)--++(90:1.8); -\draw[thickline]($(B8M5)!0.5!(B1M6)$)--++(90:1.8); +\draw[thick,shorten >=-15,shorten <=-15]($(B1M1)!0.5!(B1M4)$)coordinate(X0)-- +($(B8M3)!0.5!(B8M6)$)coordinate(X1); +\draw[thick,shorten >=-15,shorten <=-15]([yshift=2pt]B1M1.north west)-- +([yshift=2pt]B8M3.north east); + +\draw[thick,shorten >=-15,shorten <=-10]($(B7M4)!0.5!(B1M5)$)--++(90:1.8); +\draw[thick,shorten >=-15,shorten <=-10]($(B8M5)!0.5!(B1M6)$)--++(90:1.8); \scoped[on background layer] \node[draw=BackLine,inner xsep=25,inner ysep=27,yshift=-8mm, fill=BackColor!20,fit=(PBE)(X0)(X1),line width=0.75pt](BB1){}; -\node[above=2pt of BB1.south east,anchor=south east, +\node[above=2pt of BB1.south east,anchor=south east, font=\usefont{T1}{phv}{m}{n}\footnotesize,black!30]{ComputerHope.com}; \end{tikzpicture}} ``` @@ -1359,93 +1364,348 @@ Addressing software faults requires an integrated strategy spanning development, Systematic testing --- unit, integration, and regression --- forms the first line of defense (@fig-regression-testing-ft). Automated CI/CD pipelines (@fig-CI-CD-procedure-ft) embed testing, validation, and monitoring directly into the software delivery process, with gates at each stage that reduce the risk of unnoticed regressions. 
:::: {#fig-regression-testing-ft fig-env="figure" fig-pos="htb" fig-cap="**Regression Test Automation**: Automated regression tests verify that new code changes do not introduce unintended errors into existing functionality, preserving system reliability throughout the development lifecycle. *Source: [UTOR](HTTPS://u-tor.com/topic/regression-vs-integration)*" fig-alt="Flowchart showing code commit triggering automated test suite. Tests run against existing functionality with pass/fail indicators before deployment approval."} -![](./images/png/regression_testing.png){width=75%} +![](../robust_ai/images/png/regression_testing.png){width=75%} :::: ::: {#fig-CI-CD-procedure-ft fig-env="figure" fig-pos="htb" fig-cap="**CI/CD Pipeline**: Automated CI/CD pipelines enforce fault-aware development by integrating testing and validation directly into the software delivery process, reducing the risk of regressions and ensuring only tested code reaches production. *Source: [geeksforgeeks](HTTPS://www.geeksforgeeks.org/ci-cd-continuous-integration-and-continuous-delivery/)*" fig-alt="Pipeline flowchart: developer commits code, triggering build and test stages in CI, then deploy and monitor stages in CD. 
Arrows show automated progression between stages."} ```{.tikz} -\begin{tikzpicture}[line join=round,font=\small\usefont{T1}{phv}{m}{n}, node distance=1cm, on grid] +\begin{tikzpicture}[line join=round,font=\small\usefont{T1}{phv}{m}{n}, node distance=2cm, on grid] \tikzset{ - LineA/.style={black!50, line width=1.0pt,{-{Triangle[width=0.9*6pt,length=1.2*6pt]}}}, - ALine/.style={black!50, line width=1.0pt,{{Triangle[width=0.9*6pt,length=1.2*6pt]}-}}, - Larrow/.style={fill=OrangeLine, single arrow, inner sep=2pt, single arrow head extend=3pt, - single arrow head indent=0pt, minimum height=6mm, minimum width=3pt}, - mycylinder/.style={cylinder, shape border rotate=90, aspect=1.3, draw, fill=white, - minimum width=25mm, minimum height=11mm, line width=0.75pt}, - person/.style={scale=0.35, tiecolor=GreenLine, bodycolor=RedL, stetcolor=VioletLine, Linewidth=1.0pt}, - data/.style={scalefac=0.6, filllcolor=BlueLine, Linewidth=1.0pt}, - package/.style={scalefac=1, filllcolor=RedL, Linewidth=0.5pt}, - display/.style={scalefac=1, filllcolor=BlueLine, Linewidth=1.0pt}, - autotext/.style={scalefac=1, drawcolor=green!70!black, filllcolor=green!70!black, Linewidth=1.0pt}, - server/.style={scalefac=1.1, drawcolor=BrownLine, filllcolor=BrownL, Linewidth=1.0pt}, - testing/.style={scalefac=0.75, drawcolor=OrangeLine, filllcolor=OrangeLine, Linewidth=1.0pt}, - pencil/.style={scalefac=0.35, filllcolor=RedL, Linewidth=1.0pt}, - square/.style={scalefac=0.46, filllcolor=RedL, Linewidth=0.5pt}, - globe/.style={scalefac=0.38, filllcolor=GreenL, Linewidth=1.2pt}, - nodeStyle/.style={draw, minimum width=15mm, minimum height=10mm, inner sep=0pt, rounded corners, line width=0.75pt}, - textStyle/.style={align=center, anchor=north, RedLine} + LineA/.style={black!50, line width=1.0pt,{-{Triangle[width=0.9*6pt,length=1.2*6pt]}}}, + ALine/.style={black!50, line width=1.0pt,{{Triangle[width=0.9*6pt,length=1.2*6pt]}-}}, + Larrow/.style={fill=OrangeLine, single arrow, inner sep=2pt, single arrow head 
extend=3pt, + single arrow head indent=0pt,minimum height=6mm, minimum width=3pt}, + mycylinder/.style={cylinder, shape border rotate=90, aspect=1.3, draw, fill=white, + minimum width=25mm,minimum height=11mm,line width=\Linewidth,node distance=-0.15}, + pics/man/.style = {code = {\pgfkeys{/man/.cd, #1} + \begin{scope}[local bounding box=PERSON,scale=\scalefac, every node/.append style={transform shape}] + % tie + \draw[draw=\tiecolor,fill=\tiecolor] (0.0,-1.1)--(0.16,-0.87)--(0.09,-0.46)--(0.13,-0.37)--(0.0,-0.28) + --(-0.13,-0.37)--(-0.09,-0.46)--(-0.16,-0.87)--cycle; + % ears + \draw[fill=black] (0.74,0.95) to[out=20,in=80](0.86,0.80) to[out=250,in=330](0.65,0.65) to[out=70,in=260] cycle; + \draw[fill=black] (-0.76,0.96) to[out=170,in=110](-0.85,0.80) to[out=290,in=190](-0.65,0.65) to[out=110,in=290] cycle; + % head + \draw[fill=black] (0,0) to[out=180,in=290](-0.72,0.84) to[out=110,in=190](-0.56,1.67) + to[out=70,in=110](0.68,1.58) to[out=320,in=80](0.72,0.84) to[out=250,in=0] cycle; + % face + \draw[fill=white] (0,0.11) to[out=175,in=290](-0.53,0.65) to[out=110,in=265](-0.61,1.22) + to[out=80,in=235](-0.50,1.45) to[out=340,in=215](0.50,1.47) + to[out=310,in=85](0.60,0.92) to[out=260,in=2] cycle; + \draw[fill=black] (-0.50,1.45) to[out=315,in=195](0.40,1.25) to[out=340,in=10](0.37,1.32) + to[out=190,in=310](-0.40,1.49) -- cycle; + % neck + \draw[line width=1.0pt] (-0.62,-0.2) to[out=50,in=290] (-0.5,0.42); + \draw[line width=1.0pt] (0.62,-0.2) to[out=130,in=250] (0.5,0.42); + % body + \draw[draw=\bodycolor,fill=\bodycolor,line width=\Linewidth] (0.0,-1.0) to[out=150,in=290](-0.48,-0.14) to[out=200,in=50](-1.28,-0.44) + to[out=240,in=80](-1.55,-2.06) -- (1.55,-2.06) + to[out=100,in=300](1.28,-0.44) to[out=130,in=340](0.49,-0.14) + to[out=245,in=30] cycle; + % right stet + \draw[line width=3pt,\stetcolor] (0.8,-0.21) to[bend left=7](0.78,-0.64) + to[out=350,in=80](0.98,-1.35) to[out=250,in=330](0.72,-1.60); + \draw[line width=3pt,\stetcolor] (0.43,-1.53) 
to[out=180,in=240](0.3,-1.15) + to[out=60,in=170](0.78,-0.64); + % left stet + \draw[line width=3pt,\stetcolor] (-0.75,-0.21) to[bend right=20](-0.65,-1.45); + \node[fill=\stetcolor,circle,minimum size=5pt] at (-0.65,-1.45) {}; + % eyes + \node[circle,fill=black,inner sep=2pt] at (0.28,0.94) {}; + \node[circle,fill=black,inner sep=2pt] at (-0.28,0.94) {}; + % mouth + \draw[line width=1.1pt] (-0.25,0.5) to[bend right=40](0.25,0.5); + \end{scope} + }}, + pics/data/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[local bounding box=STREAMING,scale=\scalefac, every node/.append style={transform shape}] + \node[mycylinder,fill=\filllcolor!50] (A) {}; + \node[mycylinder, above=of A,fill=\filllcolor!30] (B) {}; + \node[mycylinder, above=of B,fill=\filllcolor!10] (C) {}; + \end{scope} + }}, + pics/package/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[local bounding box=PACKAGE,scale=\scalefac,every node/.append style={transform shape}] + % Right Face + \draw[fill=\filllcolor!70,line width=\Linewidth] + (\Depth,0,0)coordinate(\picname-ZDD)--(\Depth,\Width,0)--(\Depth,\Width,\Height)--(\Depth,0,\Height)--cycle; + % Front Face + \draw[fill=\filllcolor!40,line width=\Linewidth] + (0,0,\Height)coordinate(\picname-DL)--(0,\Width,\Height)coordinate(\picname-GL)-- + (\Depth,\Width,\Height)coordinate(\picname-GD)--(\Depth,0,\Height)coordinate(\picname-DD)--(0,0,\Height); + % Top Face + \draw[fill=\filllcolor!20,line width=\Linewidth] + (0,\Width,0)coordinate(\picname-ZGL)--(0,\Width,\Height)-- + (\Depth,\Width,\Height)--(\Depth,\Width,0)coordinate(\picname-ZGD)--cycle; + % + \path[fill=white]($(\picname-ZGL)!0.35!(\picname-ZGD)$)coordinate(\picname-A)-- + ($(\picname-GL)!0.35!(\picname-GD)$)coordinate(\picname-B)--++(0,-0.22)coordinate(\picname-C)--++(0.33,0)coordinate(\picname-D)-- + ($(\picname-GL)!0.6!(\picname-GD)$)coordinate(\picname-E)--($(\picname-ZGL)!0.6!(\picname-ZGD)$)coordinate(\picname-F)-- + ++(0,0.02)coordinate(\picname-G)-|cycle; + 
\draw[fill=white](\picname-A)--(\picname-B)--(\picname-C)--(\picname-D)--(\picname-E)--(\picname-F); + \draw[](\picname-A)--++(0,-0.22)coordinate(\picname-Y)--(\picname-C); + \draw[](\picname-Y)--++(0.11,0); + \end{scope} + }}, + pics/display/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[scale=\scalefac,every node/.append style={transform shape}] + \newcommand{\tikzxmark}{% + \tikz[scale=0.18] { + \draw[line width=0.7,line cap=round,GreenLine] (0,0) to [bend left=6] (1,1); + \draw[line width=0.7,line cap=round,GreenLine] (0.2,0.95) to [bend right=3] (0.8,0.05); + }} + \newcommand{\tikzxcheck}{% + \tikz[scale=0.16] { + \draw[line width=0.7,line cap=round,GreenLine] (0.5,0.75)--(0.85,-0.1) to [bend left=16] (1.5,1.55); + }} + \node[draw, minimum width =15mm, minimum height = 10mm, inner sep=0pt, rounded corners, + draw = BlueLine, fill=cyan!10,line width=2.0pt](COM){}; + \draw[draw = BlueLine,line width=1.0pt] ($(COM.north west)!0.85!(COM.south west)$)-- ($(COM.north east)!0.85!(COM.south east)$); + \draw[draw=\drawcolor,line width=\Linewidth]($(COM.south west)!0.4!(COM.south east)$)--++(270:0.2)coordinate(DL); + \draw[draw=\drawcolor,line width=\Linewidth]($(COM.south west)!0.6!(COM.south east)$)--++(270:0.2)coordinate(DD); + \draw[draw=\drawcolor,line width=3*\Linewidth,shorten <=-3mm,shorten >=-3mm](DL)--(DD); + \node[draw=GreenLine,inner sep=3.85pt,fill=white](CB1) at ($(COM.north west)!0.25!(COM.south west)+(0.3,0)$){}; + \node[xshift=0pt]at(CB1){\tikzxcheck}; + \node[draw=GreenLine,inner sep=3.85pt,fill=white](CB2) at ($(COM.north west)!0.6!(COM.south west)+(0.3,0)$){}; + \node[xshift=0pt]at(CB2){\tikzxmark}; + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB1)+(0.3,0.05)$)--++(0:0.8); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB1)+(0.3,-0.12)$)--++(0:0.5); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB2)+(0.3,0.05)$)--++(0:0.8); + 
\draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB2)+(0.3,-0.12)$)--++(0:0.5); + \end{scope} + }}, + pics/displayE/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[scale=\scalefac,every node/.append style={transform shape}] + \node[draw, minimum width =12mm, minimum height = 10mm, inner sep=0pt, rounded corners, draw=\drawcolor, fill=\filllcolor!10,line width=2.0pt](COM){}; + \draw[draw = \drawcolor,line width=1.0pt]($(COM.north west)!0.85!(COM.south west)$)-- ($(COM.north east)!0.85!(COM.south east)$); + \draw[draw=\drawcolor,line width=\Linewidth]($(COM.south west)!0.4!(COM.south east)$)--++(270:0.2)coordinate(DL); + \draw[draw=\drawcolor,line width=\Linewidth]($(COM.south west)!0.6!(COM.south east)$)--++(270:0.2)coordinate(DD); + \draw[draw=\drawcolor,line width=3*\Linewidth,shorten <=-3mm,shorten >=-3mm](DL)--(DD); + \end{scope} + }}, + pics/autotext/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[scale=\scalefac,every node/.append style={transform shape}] + \node[draw, minimum width =12mm, minimum height = 5mm, inner sep=0pt, + draw = \drawcolor, fill=\filllcolor!10,line width=\Linewidth](AT\picname){\small AUTO}; + \end{scope} + }}, + pics/server/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[local bounding box=SERVER1,scale=\scalefac,every node/.append style={transform shape}] + \draw[draw = \drawcolor, fill=\filllcolor!10,line width=\Linewidth](-0.55,-0.5) rectangle (0.55,0.5); + \foreach \i in {-0.25,0,0.25} { + \draw[cyan,line width=1.25pt]( -0.55,\i) -- (0.55, \i); + } + \foreach \i in {-0.375, -0.125, 0.125, 0.375} { + \draw[cyan!50!black!90,line width=1.25pt](-0.45,\i)--(0,\i); + \fill[cyan!50!black!90](0.35,\i) circle (1.5pt); + } + \draw[draw = \drawcolor,line width=1.75pt](0,-0.53) |- (-0.55,-0.7); + \draw[draw = \drawcolor,line width=1.75pt](0,-0.53) |- (0.55,-0.7); + \end{scope} + }}, + pics/testing/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[local bounding 
box=TESTING1,shift={($(0,0)+(0,0)$)},scale=\scalefac,every node/.append style={transform shape}] + \newcommand{\tikzxmark}{% + \tikz[scale=0.18] { + \draw[line width=0.7,line cap=round,GreenLine] (0,0) to [bend left=6] (1,1); + \draw[line width=0.7,line cap=round,GreenLine] (0.2,0.95) to [bend right=3] (0.8,0.05); + }} + \newcommand{\tikzxcheck}{% + \tikz[scale=0.16] { + \draw[line width=0.7,line cap=round,GreenLine] (0.5,0.75)--(0.85,-0.1) to [bend left=16] (1.5,1.55); + }} + \node[draw, minimum width =15mm, minimum height = 20mm, inner sep = 0pt, + rounded corners,draw = \drawcolor, fill=\filllcolor!10, line width=\Linewidth](COM){}; + \node[draw=GreenLine,inner sep=4pt,fill=white](CB1) at ($(COM.north west)!0.25!(COM.south west)+(0.3,0)$){}; + \node[xshift=0pt]at(CB1){\tikzxcheck}; + \node[draw=GreenLine,inner sep=4pt,fill=white](CB2) at ($(COM.north west)!0.5!(COM.south west)+(0.3,0)$){}; + \node[xshift=0pt]at(CB2){\tikzxmark}; + \node[draw=GreenLine,inner sep=4pt,fill=white](CB3) at ($(COM.north west)!0.75!(COM.south west)+(0.3,0)$){}; + \node[xshift=0pt]at(CB3){\tikzxmark}; + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB1)+(0.3,0.05)$)--++(0:0.8); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB1)+(0.3,-0.12)$)--++(0:0.7); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB2)+(0.3,0.05)$)--++(0:0.8); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB2)+(0.3,-0.12)$)--++(0:0.6); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB3)+(0.3,0.05)$)--++(0:0.8); + \draw[GreenLine,decoration={zigzag,segment length=4pt, amplitude=0.5pt},decorate]($(CB3)+(0.3,-0.12)$)--++(0:0.6); + \end{scope} + }}, + pics/pencil/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[local bounding box=TESTING1,shift={($(0,0)+(0,0)$)},scale=\scalefac,every node/.append style={transform 
shape},rotate=340] + \fill[fill=\filllcolor!70] (0,4) -- (0.4,4) -- (0.4,0) --(0.3,-0.15) -- (0.2,0) -- (0.1,-0.14) -- (0,0) -- cycle; + \draw[color=white,thick] (0.2,4) -- (0.2,0); + \fill[black] (0,3.5) -- (0.2,3.47) -- (0.4,3.5) -- (0.4,4) arc(30:150:0.23cm); + \fill[fill=\filllcolor!40] (0,0) -- (0.2,-0.8)node[coordinate,pos=0.75](a){} -- (0.4,0)node[coordinate,pos=0.25](b){} -- (0.3,-0.15) -- (0.2,0) -- (0.1,-0.14) -- cycle; + \fill[fill=\filllcolor] (a) -- (0.2,-0.8) -- (b) -- cycle; + \end{scope} + }}, + pics/square/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[local bounding box=SQUARE,scale=\scalefac,every node/.append style={transform shape}] + % Right Face + \draw[fill=\filllcolor!70,line width=\Linewidth] + (\Depth,0,0)coordinate(\picname-ZDD)--(\Depth,\Width,0)--(\Depth,\Width,\Height)--(\Depth,0,\Height)--cycle; + % Front Face + \draw[fill=\filllcolor!40,line width=\Linewidth] + (0,0,\Height)coordinate(\picname-DL)--(0,\Width,\Height)coordinate(\picname-GL)-- + (\Depth,\Width,\Height)coordinate(\picname-GD)--(\Depth,0,\Height)coordinate(\picname-DD)--(0,0,\Height); + % Top Face + \draw[fill=\filllcolor!20,line width=\Linewidth] + (0,\Width,0)coordinate(\picname-ZGL)--(0,\Width,\Height)coordinate(\picname-ZGL)-- + (\Depth,\Width,\Height)--(\Depth,\Width,0)coordinate(\picname-ZGD)--cycle; + \end{scope} + }}, + pics/globe/.style = {code = {\pgfkeys{/channel/.cd, #1} + \begin{scope}[shift={($(0,0)+(0,0)$)},scale=\scalefac,every node/.append style={transform shape}] + \node[circle,minimum size=25mm,draw=\drawcolor, fill=\filllcolor!70,line width=\Linewidth](C\picname) at (0,0){}; + \draw[draw=\drawcolor,line width=\Linewidth](C\picname.north)to[bend left=65](C\picname.south); + \draw[draw=\drawcolor,line width=\Linewidth](C\picname.north)to[bend right=65](C\picname.south); + \draw[draw=\drawcolor,line width=\Linewidth](C\picname.north)to(C\picname.south); + \draw[draw=\drawcolor,line width=\Linewidth](C\picname.west)--(C\picname.east); + % + 
\draw[draw=\drawcolor,line width=\Linewidth](C\picname.130)to[bend right=35](C\picname.50); + \draw[draw=\drawcolor,line width=\Linewidth](C\picname.230)to[bend left=35](C\picname.310); + \end{scope} + }} } -% Define nodes -\node[person] (person1) at (0,0) {}; -\node[person, right=of person1] (person2) {}; -\node[person, above=of person1] (person3) {}; +\pgfkeys{ + /man/.cd, + Linewidth/.store in=\Linewidth, + scalefac/.store in=\scalefac, + tiecolor/.store in=\tiecolor, + bodycolor/.store in=\bodycolor, + stetcolor/.store in=\stetcolor, + tiecolor=RedLine, % default tie color + bodycolor=BlueL, % default body color + stetcolor=GreenLine, % default stet color + scalefac=1, + Linewidth=2.5pt, +} -\node[data, right=of person2] (data1) {}; -\node[package, right=of data1] (package1) {}; -\node[display, right=of package1] (display1) {}; -\node[autotext, above=of display1] (autotext1) {}; -\node[server, right=of display1] (server1) {}; -\node[autotext, above=of server1] (autotext2) {}; -\node[package, right=of server1] (package2) {}; -\node[testing, right=of package2] (testing1) {}; -\node[person, right=of testing1] (person4) {}; -\node[person, right=of person4] (person5) {}; -\node[person, above=of person4] (person6) {}; +\pgfkeys{ + /channel/.cd, + Depth/.store in=\Depth, + Height/.store in=\Height, + Width/.store in=\Width, + filllcirclecolor/.store in=\filllcirclecolor, + filllcolor/.store in=\filllcolor, + drawcolor/.store in=\drawcolor, + drawcircle/.store in=\drawcircle, + scalefac/.store in=\scalefac, + Linewidth/.store in=\Linewidth, + picname/.store in=\picname, + filllcolor=BrownL, + filllcirclecolor=VioletL2, + drawcolor=black, + drawcircle=VioletLine, + scalefac=1, + Linewidth=0.5pt, + Depth=1.3, + Height=0.8, + Width=1.1, + picname=C +} -\node[data, right=of person5] (data2) {}; -\node[display, right=of data2] (display2) {}; -\node[testing, right=of display2] (testing2) {}; -\node[pencil, below=of testing2] (pencil1) {}; -\node[display, right=of testing2] 
(display3) {}; -\node[globe, below=of display3] (globe1) {}; +% Persons 1 +\node (A1) [pics/man={scalefac=0.35,tiecolor=GreenLine, bodycolor=RedL,stetcolor=VioletLine, Linewidth=1.0pt}] {}; +\node (A2) [right=of A1, pics/man={scalefac=0.35,tiecolor=GreenLine, bodycolor=RedL,stetcolor=VioletLine, Linewidth=1.0pt}] {}; +\node (A3) [above=of $(A1)!0.5!(A2)$, pics/man={scalefac=0.43,tiecolor=OrangeLine, bodycolor=BlueL,stetcolor=BlueLine, Linewidth=1.0pt}] {}; -% Define arrows -\draw[Larrow] (person2.east) -- (data1.west); -\draw[Larrow] (data1.east) -- (package1.west); -\draw[Larrow] (package1.east) -- (display1.west); -\draw[Larrow] (display1.east) -- (server1.west); -\draw[Larrow] (server1.east) -- (package2.west); -\draw[Larrow] (package2.east) -- (testing1.west); -\draw[Larrow] (testing1.east) -- (person4.west); -\draw[Larrow] (person5.east) -- (data2.west); -\draw[Larrow] (data2.east) -- (display2.west); -\draw[Larrow] (display2.east) -- (testing2.west); -\draw[Larrow] (testing2.east) -- (display3.west); +% Data 1 +\node (B1) [right=of A2, pics/data={scalefac=0.6,picname=1,filllcolor=BlueLine, Linewidth=1.0pt}] {}; -% Define text -\node[textStyle] at ($(person1.south) + (0,-0.5)$) {Developers}; -\node[textStyle] at ($(data1.south) + (0,-0.5)$) {Version\\ Control (Master)}; -\node[textStyle] at ($(package1.south) + (0,-0.5)$) {Package}; -\node[textStyle] at ($(display1.south) + (0,-0.5)$) {Auto Unit\\Testing}; -\node[textStyle] at ($(server1.south) + (0,-0.5)$) {Auto UI\\Testing}; -\node[textStyle] at ($(package2.south) + (0,-0.5)$) {Package with\\ Instructions}; -\node[textStyle] at ($(person4.south) + (0,-0.5)$) {Operations\\Team}; -\node[textStyle] at ($(data2.south) + (0,-0.5)$) {Auto\\ Scripts}; -\node[textStyle] at ($(display2.south) + (0,-0.5)$) {Test\\Environment}; -\node[textStyle] at ($(testing2.south) + (0,-0.5)$) {Testing}; -\node[textStyle] at ($(display3.south) + (0,-0.5)$) {Public/\\General\\ Availability}; +% Gears +\node (G1) [right=of B1, 
scale=1.5] { + \fill[draw=none,fill=BrownLine,even odd rule,xshift=-2mm]coordinate(D)\gear{12}{0.4}{0.33}{10}{2}{0.1}; + \fill[draw=none,fill=BrownLine,even odd rule,xshift=3.8mm,yshift=2mm]\gear{11}{0.25}{0.21}{10}{1}{0.07}; + \fill[draw=none,fill=BrownLine,even odd rule,xshift=0.6mm,yshift=5.8mm]coordinate(F)\gear{11}{0.25}{0.21}{10}{1}{0.07}; +}; + +% Package 1 +\node (C1) [right=of G1, pics/package={scalefac=1,picname=1,filllcolor=RedL, Linewidth=0.5pt}] {}; + +% Display 1 +\node (D1) [right=of C1, pics/display={scalefac=1,picname=1,filllcolor=BlueLine, Linewidth=1.0pt}] {}; + +% Auto text 1 +\node (E1) [above=of D1, pics/autotext={scalefac=1,picname=1,drawcolor=green!70!black,filllcolor=green!70!black, Linewidth=1.0pt}] {}; + +% Server +\node (F1) [right=of D1, pics/server={scalefac=1.1,picname=1,drawcolor=BrownLine,filllcolor=BrownL, Linewidth=1.0pt}] {}; +\node (G2) [above=of F1, scale=1.5] { + \fill[draw=none,fill=BlueL,even odd rule,xshift=2.5mm,yshift=-2.8mm]\gear{11}{0.4}{0.34}{10}{1}{0.07}; +}; + +% Auto text 2 +\node (E2) [above=of F1, pics/autotext={scalefac=1,picname=1,drawcolor=green!70!black,filllcolor=green!70!black, Linewidth=1.0pt}] {}; + +% Package 2 +\node (C2) [right=of F1, pics/package={scalefac=1,picname=2,filllcolor=green!70!black, Linewidth=0.5pt}] {}; + +% Testing 1 +\node (T1) [above=of C2, pics/testing={scalefac=0.75,picname=1,drawcolor=OrangeLine,filllcolor=OrangeLine, Linewidth=1.0pt}] {}; + +% Persons 2 +\node (A4) [right=of C2, pics/man={scalefac=0.35,tiecolor=RedLine, bodycolor=BrownL,stetcolor=BrownLine, Linewidth=1.0pt}] {}; +\node (A5) [right=of A4, pics/man={scalefac=0.35,tiecolor=RedLine, bodycolor=BrownL,stetcolor=BrownLine, Linewidth=1.0pt}] {}; +\node (A6) [above=of $(A4)!0.5!(A5)$, pics/man={scalefac=0.43,tiecolor=GreenLine, bodycolor=RedL,stetcolor=RedLine, Linewidth=1.0pt}] {}; + +% Data 2 +\node (B2) [right=of A5, pics/data={scalefac=0.6,picname=1,filllcolor=RedL, Linewidth=1.0pt}] {}; +\node (T2) [right=of B2, 
pics/testing={scalefac=0.8,picname=2,drawcolor=BlueLine,filllcolor=BlueLine, Linewidth=1.0pt}] {}; + +% Auto text 3 +\node (E3) [above=of B2, pics/autotext={scalefac=1,picname=1,drawcolor=green!70!black,filllcolor=green!70!black, Linewidth=1.0pt}] {}; + +% Display 2 +\node (D2) [right=of T2, pics/displayE={scalefac=1.3,picname=1,drawcolor=BrownLine,filllcolor=BrownL, Linewidth=1.0pt}] {}; +\node (S1) [below=of D2, rotate=20, pics/square={scalefac=0.46,picname=1,filllcolor=RedL, Linewidth=0.5pt}] {}; + +% Testing 2 +\node (T3) [right=of D2, pics/testing={scalefac=0.85,picname=1,drawcolor=OrangeLine,filllcolor=OrangeLine, Linewidth=1.0pt}] {}; +\node (P1) [below=of T3, rotate=-15, pics/pencil={scalefac=0.35,picname=1,filllcolor=RedL, Linewidth=1.0pt}] {}; + +% Display 3 +\node (D3) [right=of T3, pics/displayE={scalefac=1.3,picname=1,drawcolor=RedLine,filllcolor=RedL, Linewidth=1.0pt}] {}; +\node (G3) [below=of D3, pics/globe={scalefac=0.38,picname=1,filllcolor=GreenL, Linewidth=1.2pt}] {}; + +% Arrows +\draw[Larrow] (A2.east) -- (B1.west); +\draw[Larrow] (B1.east) -- (G1.west); +\draw[Larrow] (G1.east) -- (C1.west); +\draw[Larrow] (C1.east) -- (D1.west); +\draw[Larrow] (D1.east) -- (F1.west); +\draw[Larrow] (F1.east) -- (C2.west); +\draw[Larrow] (C2.east) -- (A4.west); +\draw[Larrow] (A5.east) -- (B2.west); +\draw[Larrow] (B2.east) -- (D2.west); +\draw[Larrow] (D2.east) -- (T3.west); +\draw[Larrow] (T3.east) -- (D3.west); + +% Text +\node[align=center,anchor=north] at ($(A1.south)!0.5!(A2.south)$) {Developers}; +\node[align=center,anchor=north] at (B1.south) {Version\\ Control (Master)}; +\node[align=center,anchor=north] at (G1.south) {Compile}; +\node[align=center,anchor=north] at (C1.south) {Package}; +\node[align=center,anchor=north] at (D1.south) {Auto Unit\\Testing}; +\node[align=center,anchor=north] at (F1.south) {Auto UI\\Testing}; +\node[align=center,anchor=north] at (C2.south) {Package with\\ Instructions}; +\node[align=center,anchor=north] at 
($(A4.south)!0.5!(A5.south)$) {Operations\\Team}; +\node[align=center,anchor=north] at (B2.south) {Auto\\ Scripts}; +\node[align=center,anchor=north] at (D2.south) {Test\\Environment}; +\node[align=center,anchor=north] at (T3.south) {Testing}; +\node[align=center,anchor=north] at (D3.south) {Public/\\General\\ Availability}; + +% Fitting +\begin{scope}[on background layer] + \node[draw=none,inner xsep=3mm,inner ysep=16mm, yshift=-6mm,fill=BackColor!60,fit=(A1)(F1),line width=0.75pt](BB1){}; + \node[below=4pt of BB1.north,inner sep=0pt, anchor=north,GreenLine]{\textbf{Build Pipeline}}; + \node[above=4pt of BB1.south,inner sep=0pt, anchor=south,GreenLine]{\textbf{Continuous Integration}}; + \node[draw=none,inner xsep=3mm,inner ysep=16mm, yshift=-6mm,fill=cyan!10,fit=(A4)(D3),line width=0.75pt](BB2){}; + \node[below=4pt of BB2.north,inner sep=0pt, anchor=north,GreenLine]{\textbf{Release Pipeline}}; + \node[above=4pt of BB2.south,inner sep=0pt, anchor=south,GreenLine]{\textbf{Continuous Delivery}}; +\end{scope} -% Background layers -\begin{pgfonlayer}{background} - \node[draw=none, fill=BackColor!60, fit=(person1)(server1), inner xsep=3mm, inner ysep=16mm, yshift=-6mm, line width=0.75pt] {}; - \node[below=4pt of person1, inner sep=0pt, anchor=north, GreenLine] {\textbf{Build Pipeline}}; - \node[above=4pt of server1, inner sep=0pt, anchor=south, GreenLine] {\textbf{Continuous Integration}}; - - \node[draw=none, fill=cyan!10, fit=(person4)(display3), inner xsep=3mm, inner ysep=16mm, yshift=-6mm, line width=0.75pt] {}; - \node[below=4pt of person4, inner sep=0pt, anchor=north, GreenLine] {\textbf{Release Pipeline}}; - \node[above=4pt of display3, inner sep=0pt, anchor=south, GreenLine] {\textbf{Continuous Delivery}}; -\end{pgfonlayer} \end{tikzpicture} ``` ::: @@ -1473,13 +1733,13 @@ Certain fault behavior patterns remain consistent regardless of abstraction leve ::: {#fig-error-masking-ft fig-env="figure" fig-pos="htb" fig-cap="**Error Masking**: Microarchitectural 
redundancy can absorb single-bit faults before they propagate to observable system errors, highlighting a discrepancy between hardware-level and software-level fault models. This figure details how fault masking occurs within microarchitectural components, demonstrating that software-based error detection tools may underestimate the true resilience of a system to transient errors." fig-alt="Decision flowchart from soft error through two diamond questions: corrupted data read and incorrect output. No paths lead to masked states at microarchitecture or software level."} ```{.tikz} -\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}] -\tikzset{ - Line/.style={line width=1.0pt, black!50, text=black}, - Box/.style={inner xsep=2pt, node distance=0.6, draw=VioletLine, line width=0.75pt, fill=VioletL!40, align=flush center, minimum width=25mm, minimum height=9mm, rounded corners=9pt}, - Box2/.style={inner xsep=2pt, node distance=1.4, draw=BlueLine, line width=0.75pt, fill=BlueL!40, align=flush center, minimum width=29mm, minimum height=9mm}, - Box3/.style={inner xsep=2pt, node distance=1.1, draw=BrownLine, line width=0.75pt, fill=BrownL!40, align=flush left, minimum width=55mm, minimum height=9mm}, - decision/.style={diamond, minimum width=50mm, node distance=0.6, inner sep=-1ex, minimum height=25mm, align=flush center, draw=GreenLine, fill=GreenL} +\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}, node distance=1cm and 1.5cm] +\tikzset{% + Line/.style={line width=1.0pt,black!50,text=black}, + Box/.style={inner xsep=2pt, draw=VioletLine, line width=0.75pt, fill=VioletL!40, align=flush center, minimum width=25mm, minimum height=9mm, rounded corners=9pt}, + Box2/.style={inner xsep=2pt, draw=BlueLine, line width=0.75pt, fill=BlueL!40, align=flush center, minimum width=29mm, minimum height=9mm}, + Box3/.style={inner xsep=2pt, draw=BrownLine, line width=0.75pt, fill=BrownL!40, align=flush left, minimum width=55mm, minimum height=9mm}, + decision/.style={diamond, 
minimum width=50mm, inner sep=-1ex, minimum height=25mm, align=flush center, draw=GreenLine, fill=GreenL} } \node[Box] (B1) {Soft error}; @@ -1659,7 +1919,7 @@ tau_opt = np.sqrt(2 * t_write * mtbf) fig, ax = plt.subplots(figsize=(8, 5)) -ax.plot(tau * 60, over_ckpt, label='Checkpoint Overhead ($T_{\text{write}}/\\tau$)', color=COLORS['BlueLine'], linestyle='--') -ax.plot(tau * 60, over_rework, label='Expected Rework ($\\tau/2\\text{\text{MTBF}}$)', color=COLORS['RedLine'], linestyle='--') +ax.plot(tau * 60, over_ckpt, label=r'Checkpoint Overhead ($T_{\text{write}}/\tau$)', color=COLORS['BlueLine'], linestyle='--') +ax.plot(tau * 60, over_rework, label=r'Expected Rework ($\tau/2\text{MTBF}$)', color=COLORS['RedLine'], linestyle='--') ax.plot(tau * 60, total_waste, label='Total Wasted Work', color=COLORS['GreenLine'], linewidth=2.5) ax.scatter([tau_opt * 60], [np.sqrt(2 * t_write / mtbf)], color='black', zorder=5) @@ -1701,7 +1961,7 @@ When assumptions are violated, the optimal interval may shift significantly. As ::: {#fig-checkpoint-recovery-timeline fig-env="figure" fig-pos="htb" fig-cap="**Checkpoint-Recovery Timeline**. A training run proceeds through alternating phases of computation (green) and checkpoint writes (blue). When a failure occurs (red lightning bolt), all work since the last completed checkpoint is lost (hatched gray). Recovery involves job restart overhead, checkpoint loading, and pipeline warmup before productive training resumes. The total cost of a failure includes both the lost work and the recovery latency." fig-alt="Horizontal Gantt chart showing training phases in green, checkpoint writes in blue, a failure point with red marker, gray hatched lost work region, and orange recovery phase before training resumes."} ```{.tikz} -\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}] +\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}, node distance=0pt] \definecolor{BlueLine}{HTML}{006395} \definecolor{BlueL}{HTML}{D1E6F3} \definecolor{GreenLine}{HTML}{008F45} @@ -1713,24 +1973,34 @@ When assumptions are violated, the optimal interval may shift significantly.
As \definecolor{BrownLine}{HTML}{78492A} \tikzset{ - phase/.style={minimum height=0.7cm, font=\tiny, text=white}, + phase/.style={minimum height=0.7cm}, lbl/.style={font=\tiny, align=center}, - brace/.style={decorate, decoration={brace, amplitude=4pt}}, - phasebox/.style={minimum width=0.5cm, minimum height=0.7cm}, - legend/.style={minimum width=0.4cm, minimum height=0.3cm} + phaseNode/.style={phase, text=white, font=\tiny}, + braceStyle/.style={thick, decorate, decoration={brace, amplitude=4pt}}, + braceText/.style={font=\tiny, text=BlueLine}, + braceTextBold/.style={font=\tiny\bfseries, text=RedLine}, + braceTextOrange/.style={font=\tiny\bfseries, text=OrangeLine} } % Timeline bar height \def\barh{0.7} \def\bary{0} - % Phases - \node[phase, fill=GreenLine!70, phasebox, minimum width=2.5cm] (P1) at (0, \bary) {Training}; - \node[phase, fill=BlueLine, phasebox, right=0pt of P1] (P2) {\rotatebox{90}{Ckpt}}; - \node[phase, fill=GreenLine!70, phasebox, minimum width=2.5cm, right=0pt of P2] (P3) {Training}; - \node[phase, fill=BlueLine, phasebox, right=0pt of P3] (P4) {\rotatebox{90}{Ckpt}}; - \node[phase, fill=GreenLine!30, phasebox, minimum width=1.8cm, right=0pt of P4] (P5) {}; - \node[phase, pattern=north east lines, pattern color=black!30, phasebox, minimum width=1.8cm, right=0pt of P4] {}; + % Phase 1: Training + \node[phaseNode, fill=GreenLine!70, minimum width=2.5cm] (P1) {Training}; + + % Phase 2: Checkpoint 1 + \node[phaseNode, fill=BlueLine, minimum width=0.5cm, right=of P1] (P2) {\rotatebox{90}{Ckpt}}; + + % Phase 3: Training + \node[phaseNode, fill=GreenLine!70, minimum width=2.5cm, right=of P2] (P3) {Training}; + + % Phase 4: Checkpoint 2 + \node[phaseNode, fill=BlueLine, minimum width=0.5cm, right=of P3] (P4) {\rotatebox{90}{Ckpt}}; + + % Phase 5: Training (will be lost) + \node[phase, fill=GreenLine!30, minimum width=1.8cm, right=of P4] (P5) {}; + \node[phase, pattern=north east lines, pattern color=black!30, minimum width=1.8cm, right=of P4] {}; 
\node[lbl, text=black!60] at (P5.center) {Lost Work}; % Failure marker @@ -1738,35 +2008,46 @@ When assumptions are violated, the optimal interval may shift significantly. As \node[lbl, text=RedLine, font=\tiny\bfseries, above=0.1cm of FailIcon] {Failure}; \draw[RedLine, very thick] (P5.south east) -- (P5.north east); - % Recovery Phases - \node[phase, fill=OrangeL, phasebox, right=0pt of P5] (P6a) {\rotatebox{90}{\tiny Restart}}; - \node[phase, fill=OrangeLine!60, phasebox, right=0pt of P6a] (P6b) {\rotatebox{90}{\tiny Load}}; - \node[phase, fill=OrangeLine!30, phasebox, right=0pt of P6b] (P6c) {\rotatebox{90}{\tiny Warm}}; + % Phase 6: Recovery (restart + load + warmup) + \node[phaseNode, fill=OrangeL, minimum width=0.7cm, right=of P5] (P6a) {\rotatebox{90}{\tiny Restart}}; + \node[phaseNode, fill=OrangeLine!60, minimum width=0.8cm, right=of P6a] (P6b) {\rotatebox{90}{\tiny Load}}; + \node[phaseNode, fill=OrangeLine!30, minimum width=0.5cm, right=of P6b] (P6c) {\rotatebox{90}{\tiny Warm}}; - % Resumed Training - \node[phase, fill=GreenLine!70, phasebox, minimum width=2.2cm, right=0pt of P6c] (P7) {Training Resumes}; + % Phase 7: Training resumes + \node[phaseNode, fill=GreenLine!70, minimum width=2.2cm, right=of P6c] (P7) {Training Resumes}; % Time axis \draw[->, thick, black!60] (0, -0.5) -- (12.5, -0.5) node[right, font=\scriptsize] {Time}; % Annotations with braces - \draw[BlueLine, thick, brace, mirror] (P1.south west |- 0, -0.7) -- (P1.south east |- 0, -0.7) node[midway, below=5pt, font=\tiny, text=BlueLine] {$\tau_{\text{opt}}$}; - \draw[BlueLine, thick, brace, mirror] (P3.south west |- 0, -0.7) -- (P3.south east |- 0, -0.7) node[midway, below=5pt, font=\tiny, text=BlueLine] {$\tau_{\text{opt}}$}; - \draw[RedLine, thick, brace] (P5.north west |- 0, 0.8) -- (P5.north east |- 0, 0.8) node[midway, above=5pt, font=\tiny\bfseries, text=RedLine] {$\leq \tau_{\text{opt}}$}; - \draw[OrangeLine, thick, brace] (P6a.north west |- 0, 0.8) -- (P6c.north east |- 0, 0.8) 
node[midway, above=5pt, font=\tiny\bfseries, text=OrangeLine] {$T_{\text{restart}}$}; + % Checkpoint interval + \draw[BlueLine, braceStyle, mirror] + (P1.south west |- 0, -0.7) -- (P1.south east |- 0, -0.7) node[midway, below=5pt, braceText] {$\tau_{\text{opt}}$}; + \draw[BlueLine, braceStyle, mirror] + (P3.south west |- 0, -0.7) -- (P3.south east |- 0, -0.7) node[midway, below=5pt, braceText] {$\tau_{\text{opt}}$}; + + % Lost work brace + \draw[RedLine, braceStyle] + (P5.north west |- 0, 0.8) -- (P5.north east |- 0, 0.8) + node[midway, above=5pt, braceTextBold] {$\leq \tau_{\text{opt}}$}; + + % Recovery brace + \draw[OrangeLine, braceStyle] + (P6a.north west |- 0, 0.8) -- (P6c.north east |- 0, 0.8) + node[midway, above=5pt, braceTextOrange] {$T_{\text{restart}}$}; % Legend \begin{scope}[shift={(0,-1.5)}] - \node[legend, fill=GreenLine!70] (L1) at (0,0) {}; + \node[phase, fill=GreenLine!70, minimum width=0.4cm, minimum height=0.3cm] (L1) {}; \node[font=\tiny, right=0.1cm of L1, anchor=west] {Productive training}; - \node[legend, fill=BlueLine] (L2) at (3.0,0) {}; + \node[phase, fill=BlueLine, minimum width=0.4cm, minimum height=0.3cm, right=2.5cm of L1] (L2) {}; \node[font=\tiny, right=0.1cm of L2, anchor=west] {Checkpoint write ($T_{\text{save}}$)}; - \node[legend, fill=OrangeLine!50] (L3) at (6.5,0) {}; + \node[phase, fill=OrangeLine!50, minimum width=0.4cm, minimum height=0.3cm, right=2.5cm of L2] (L3) {}; \node[font=\tiny, right=0.1cm of L3, anchor=west] {Recovery}; - \node[legend, draw=black!30, pattern=north east lines, pattern color=black!30] (L4) at (9.5,0) {}; + \node[phase, draw=black!30, pattern=north east lines, pattern color=black!30, minimum width=0.4cm, minimum height=0.3cm, right=2.5cm of L3] (L4) {}; \node[font=\tiny, right=0.1cm of L4, anchor=west] {Wasted work}; \end{scope} @@ -1880,7 +2161,7 @@ class CheckpointDebug: # ┌── 3. 
GUARD (Invariants) ────────────────────────────────────────── check(total_ckpt_gb_val == 420, f"Expected 420 GB checkpoint, got {total_ckpt_gb_val}") - check(serialized_min_val > 5000, "Serialized time should be massive due to contention") + check(serialized_min_val > 300, "Serialized time should be large due to contention") # ┌── 4. OUTPUT (Formatting) ────────────────────────────────────────────── weights_gb_str = f"{weights_gb:.0f}" @@ -2197,16 +2478,15 @@ Suppose a 1,024-GPU training job loses an 8-GPU node to a hardware fault, but th \tikzset{ proc/.style={draw=BlueLine, line width=0.75pt, rectangle, rounded corners, minimum width=2.5cm, minimum height=1cm, align=center, fill=BlueL}, decision/.style={draw=OrangeLine, line width=0.75pt, diamond, aspect=2, minimum width=2.5cm, align=center, fill=OrangeL}, - procRed/.style={proc, draw=RedLine, fill=RedL}, - procGreen/.style={proc, draw=GreenLine, fill=GreenL}, - arrow/.style={->, >=stealth, line width=1.0pt} + arrow/.style={->, >=stealth, line width=1.0pt}, + note/.style={align=left, font=\footnotesize} } \node[proc] (Start) {Training (N GPUs)}; \node[decision, below=of Start] (Fail) {Monitor Alert:\\Node Failure?}; - \node[procRed, below=of Fail] (Pause) {Pause Training}; + \node[proc, below=of Fail, draw=RedLine, fill=RedL] (Pause) {Pause Training}; \node[proc, below=of Pause] (Rescale) {Rescale Batch/LR\\to N-1 GPUs}; - \node[procGreen, below=of Rescale] (Resume) {Resume Training}; + \node[proc, below=of Rescale, draw=GreenLine, fill=GreenL] (Resume) {Resume Training}; \draw[arrow] (Start) -- (Fail); \draw[arrow] (Fail) -- node[right] {Yes} (Pause); @@ -2215,7 +2495,7 @@ Suppose a 1,024-GPU training job loses an 8-GPU node to a hardware fault, but th \draw[arrow] (Rescale) -- (Resume); \draw[arrow] (Resume.east) -- ++(1.5,0) |- (Start); - \node[right=0.5cm of Rescale, align=left, font=\footnotesize] {Key Step:\\Adjust Global Batch\\or Gradient Accum}; + \node[note, right=0.5cm of Rescale] {Key Step:\\Adjust 
Global Batch\\or Gradient Accum}; \end{tikzpicture} ``` ::: @@ -2414,38 +2694,38 @@ In **active-active replication** (@fig-serving-redundancy, left), all replicas a ::: {#fig-serving-redundancy fig-env="figure" fig-pos="htb" fig-cap="**Serving Redundancy Strategies**. Comparison of Active-Active vs. Active-Passive replication. Active-Active (left) distributes load across all replicas, maximizing utilization but requiring capacity headroom to absorb failures. Active-Passive (right) keeps a standby replica idle and synchronized via heartbeat, simplifying failover logic at the cost of idle resource utilization." fig-alt="Two diagrams comparing replication strategies. Left: active-active with load balancer sending 50% to each of two green replicas. Right: active-passive with load balancer sending 100% to primary while dashed standby receives heartbeat sync."} ```{.tikz} -\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}, node distance=2cm and 1.5cm] +\begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n}, node distance=1.5cm and 2cm] + + % Styles \tikzset{ - box/.style={draw, line width=0.75pt, minimum width=2cm, minimum height=1cm, align=center}, - loadbalancer/.style={box, draw=GrayLine, fill=GrayL}, - replica/.style={box, draw=GreenLine, fill=GreenL}, - standby/.style={box, draw=GrayLine, fill=GrayL, dashed}, + loadbalancer/.style={draw=GrayLine, line width=0.75pt, fill=GrayL, align=center}, + replica/.style={draw=GreenLine, line width=0.75pt, fill=GreenL, align=center}, + standby/.style={draw=GrayLine, line width=0.75pt, fill=GrayL, dashed, align=center}, arrow/.style={->, line width=1.0pt}, dashedarrow/.style={->, line width=1.0pt, dashed, GrayLine}, - sync/.style={<->, dashed, RedLine, font=\scriptsize}, - label/.style={font=\scriptsize} + heartbeat/.style={<->, dashed, RedLine} } % Active-Active - \node[anchor=south] (title1) {\textbf{Active-Active}}; - \node[loadbalancer, below=of title1] (LB1) {Load Balancer}; - \node[replica, below left=of
LB1] (R1) {Replica 1}; - \node[replica, below right=of LB1] (R2) {Replica 2}; + \node[anchor=south] (AA_Title) at (2.5, 3.5) {\textbf{Active-Active}}; + \node[loadbalancer] (LB1) [below=of AA_Title] {Load Balancer}; + \node[replica] (R1) [below left=of LB1] {Replica 1}; + \node[replica] (R2) [below right=of LB1] {Replica 2}; - \draw[arrow] (LB1) -- node[label, left] {50\%} (R1); - \draw[arrow] (LB1) -- node[label, right] {50\%} (R2); + \draw[arrow] (LB1) -- node[left, font=\scriptsize] {50\%} (R1); + \draw[arrow] (LB1) -- node[right, font=\scriptsize] {50\%} (R2); % Active-Passive \begin{scope}[xshift=7cm] - \node[anchor=south] (title2) {\textbf{Active-Passive}}; - \node[loadbalancer, below=of title2] (LB2) {Load Balancer}; - \node[replica, below left=of LB2] (R3) {Primary}; - \node[standby, below right=of LB2] (R4) {Standby}; + \node[anchor=south] (AP_Title) at (2.5, 3.5) {\textbf{Active-Passive}}; + \node[loadbalancer] (LB2) [below=of AP_Title] {Load Balancer}; + \node[replica] (R3) [below left=of LB2] {Primary}; + \node[standby] (R4) [below right=of LB2] {Standby}; - \draw[arrow] (LB2) -- node[label, left] {100\%} (R3); + \draw[arrow] (LB2) -- node[left, font=\scriptsize] {100\%} (R3); \draw[dashedarrow] (LB2) -- (R4); - \draw[sync] (R3) -- node[below] {Heartbeat / Sync} (R4); + \draw[heartbeat] (R3) -- node[below, font=\scriptsize] {Heartbeat / Sync} (R4); \end{scope} \end{tikzpicture} ``` @@ -2664,13 +2944,13 @@ Circuit breakers operate in three states: closed (normal operation), open (faili arrow/.style={->, >=stealth, line width=1.0pt, bend left=45}, ClosedState/.style={state, draw=GreenLine, fill=GreenL}, OpenState/.style={state, draw=RedLine, fill=RedL}, - HalfState/.style={state, draw=OrangeLine, fill=OrangeL} + HalfOpenState/.style={state, draw=OrangeLine, fill=OrangeL} } % States \node[ClosedState] (Closed) {CLOSED\\(Normal)}; \node[OpenState, right=of Closed] (Open) {OPEN\\(Fail Fast)}; - \node[HalfState, below=of Open] (Half) {HALF-OPEN\\(Probing)}; + 
\node[HalfOpenState, below=of Open] (Half) {HALF-OPEN\\(Probing)}; % Transitions \draw[arrow] (Closed) edge node[above] {Errors > Threshold} (Open); diff --git a/book/quarto/mlsys/formatting.py b/book/quarto/mlsys/formatting.py index b26ff6975..9fdfbfdc2 100644 --- a/book/quarto/mlsys/formatting.py +++ b/book/quarto/mlsys/formatting.py @@ -58,6 +58,21 @@ def fmt(quantity, unit=None, precision=1, commas=True, allow_zero=False): return result +def fmt_percent(ratio, precision=1, commas=False): + """ + Format a ratio (0.0 to 1.0) as a percentage string for display. + Use this for compound fractions (e.g. effective utilization) to avoid + display bugs from Quantity or wrong scaling. + Accepts a Pint Quantity (converted to dimensionless, not its raw magnitude) or a plain float. + """ + if isinstance(ratio, ureg.Quantity): + # Crucial: convert to dimensionless first so units like flop/TFLOP cancel out! + ratio = float(ratio.m_as('')) + else: + ratio = float(ratio) + return fmt(ratio * 100, precision=precision, commas=commas) + + def sci(val, precision=2): """ Formats a number or Pint Quantity into scientific notation using Unicode. @@ -92,6 +107,11 @@ def display_percent(ratio, precision=0): """ ratio: 0.0 to 1.0 """ + if isinstance(ratio, ureg.Quantity): + ratio = float(ratio.m_as('')) + else: + ratio = float(ratio) + pct = ratio * 100 return { "value": ratio, diff --git a/book/quarto/tex/header-includes.tex b/book/quarto/tex/header-includes.tex index e5ec49875..313ed6fb3 100644 --- a/book/quarto/tex/header-includes.tex +++ b/book/quarto/tex/header-includes.tex @@ -80,6 +80,8 @@ labelformat=mylabel,justification=raggedright,singlelinecheck=false,font={ninept % Colors and visual elements \usepackage[dvipsnames]{xcolor} % Extended color support \usepackage{tikz} % Programmatic graphics +\usepackage{pgfplots} % Axis plots in TikZ (e.g.
fault_tolerance bathtub curve) +\pgfplotsset{compat=1.18} \usetikzlibrary{angles} \usetikzlibrary{arrows.meta} \usetikzlibrary{arrows} @@ -440,6 +442,22 @@ aboveskip=0pt \definecolor{BlueDD}{RGB}{62,100,125} \colorlet{BlueDD}{magenta} +% Diagram colors (used by inline TikZ in chapters; also in diagram.yml for SVG) +\definecolor{BlueLine}{HTML}{006395} +\definecolor{BlueL}{RGB}{209,243,255} +\definecolor{GreenLine}{HTML}{008F45} +\definecolor{GreenL}{RGB}{219,253,166} +\definecolor{OrangeLine}{HTML}{E67817} +\definecolor{OrangeL}{RGB}{250,212,175} +\definecolor{RedLine}{HTML}{D9534F} +\definecolor{RedL}{RGB}{253,226,240} +\definecolor{GrayLine}{HTML}{666666} +\definecolor{GrayL}{HTML}{E0E0E0} +\definecolor{VioletLine}{HTML}{7E317B} +\definecolor{VioletL}{RGB}{247,180,247} +\definecolor{BrownLine}{RGB}{143,120,116} +\definecolor{BrownL}{RGB}{233,222,220} + % =============================================================================== % PART STYLING SYSTEM % ===============================================================================