From c68ca02d9ea2822aed0c0c6cd962b766f3224333 Mon Sep 17 00:00:00 2001 From: Vijay Janapa Reddi Date: Sat, 21 Feb 2026 08:15:29 -0500 Subject: [PATCH] Enhances data pipeline debugging flowchart Improves the data pipeline debugging flowchart by adding visual cues. These cues help to highlight the type of data issue being investigated and make the flowchart easier to understand. --- .../data_engineering/data_engineering.qmd | 216 +++++++++++++++--- 1 file changed, 183 insertions(+), 33 deletions(-) diff --git a/book/quarto/contents/vol1/data_engineering/data_engineering.qmd b/book/quarto/contents/vol1/data_engineering/data_engineering.qmd index 136dc13c3..eeb80ada3 100644 --- a/book/quarto/contents/vol1/data_engineering/data_engineering.qmd +++ b/book/quarto/contents/vol1/data_engineering/data_engineering.qmd @@ -3265,78 +3265,228 @@ The key insight is that data debt,\index{Data Debt!strategic vs unconscious} lik ::: {#fig-debug-flowchart fig-env="figure" fig-pos="htb" fig-cap="**Data Pipeline Debugging Flowchart**: Four sequential decision nodes guide root cause diagnosis: (1) accuracy degrades over time leads to Data Drift, (2) training accuracy exceeds validation leads to Overfitting, (3) validation exceeds production accuracy leads to Training-Serving Skew, and (4) subgroup inconsistency leads to Bias. If all answers are no, the issue points to Model Architecture." fig-alt="Vertical flowchart with four blue diamond decision nodes and red result boxes. Top diamond asks if accuracy degrades over time, leading to Data Drift result. Second asks if training accuracy exceeds validation, leading to Overfitting. Third asks if validation exceeds production accuracy, leading to Training-Serving Skew. Fourth asks about subgroup inconsistency, leading to Bias. 
Gray box at bottom shows Model Architecture issue if all answers are no."} ```{.tikz} -\resizebox{.7\textwidth}{!}{% \begin{tikzpicture}[font=\small\usefont{T1}{phv}{m}{n},line width=0.75pt] \tikzset{ Diamond/.style={aspect=2, inner sep=2pt, draw=BlueLine, line width=0.65pt, - fill=BlueL, - text width=30mm,align=center + fill=BlueL, minimum height=11mm, + text width=40mm,align=flush center }, Box/.style={inner xsep=2pt, draw=GreenLine, line width=0.65pt, fill=GreenL, - text width=35mm,align=flush center, - minimum width=35mm, minimum height=9mm, - rounded corners=2pt + % text width=40mm, + align=flush center, + minimum width=40mm, minimum height=11mm, }, Result/.style={inner xsep=2pt, draw=RedLine, line width=0.65pt, fill=RedL!20, - text width=35mm,align=flush center, - minimum width=35mm, minimum height=9mm, - rounded corners=2pt + % text width=45mm, + align=flush center, + minimum width=55mm, minimum height=15mm, }, - Line/.style={line width=1.0pt,black!50,text=black,-latex}, + Line/.style={line width=1.0pt,black!50,text=black,->,>=Latex}, Text/.style={inner sep=1pt, font=\footnotesize\usefont{T1}{phv}{m}{n}, text=black } } +%target +\tikzset{ +pics/target/.style = { + code = { + \pgfkeys{/channel/.cd, #1} +\begin{scope}[shift={($(0,0)+(0,0)$)},scale=\scalefac,every node/.append style={transform shape}] +\definecolor{col1}{RGB}{62,100,125} +\definecolor{col2}{RGB}{219,253,166} +\colorlet{col1}{\filllcolor} +\colorlet{col2}{\filllcirclecolor} +\foreach\i/\col [count=\k]in {22mm/col1,17mm/col2,12mm/col1,7mm/col2,2.5mm/col1}{ +\node[circle,inner sep=0pt,draw=\drawcolor,fill=\col,minimum size=\i,line width=\Linewidth](C\k){}; +} +\draw[thick,fill=brown,xscale=-1](0,0)--++(111:0.13)--++(135:1)--++(225:0.1)--++(315:1)--cycle; +\path[green,xscale=-1](0,0)--(135:0.85)coordinate(XS1); +\draw[thick,fill=yellow,xscale=-1](XS1)--++(80:0.2)--++(135:0.37)--++(260:0.2)--++(190:0.2)--++(315:0.37)--cycle; +\end{scope} + } + } +} +%graph +\tikzset{pics/graph/.style = { + code = { 
+ \pgfkeys{/channel/.cd, #1} +\begin{scope}[local bounding box=GRAPH,scale=\scalefac, every node/.append style={transform shape}] +\draw[line width=1.5*\Linewidth,draw = \drawcolor](-0.20,0)--(2,0); +\draw[line width=1.5*\Linewidth,draw = \drawcolor](-0.20,0)--(-0.20,2); +\foreach \i/\vi in {0/10,0.5/17,1/9,1.5/5}{ +\node[draw, minimum width =4mm, minimum height = \vi mm, inner sep = 0pt, + draw = \filllcolor, fill=\filllcolor!20, line width=\Linewidth,anchor=south west](COM)at(\i,0.2){}; +} + \end{scope} + } + } +} +%square +\tikzset{ +pics/square/.style = { + code = { + \pgfkeys{/channel/.cd, #1} +\begin{scope}[local bounding box=SQUARE,scale=\scalefac,every node/.append style={transform shape}] +% Right Face +\draw[fill=\filllcolor!70,line width=\Linewidth] +(\Depth,0,0)coordinate(\picname-ZDD)--(\Depth,\Width,0)--(\Depth,\Width,\Height)--(\Depth,0,\Height)--cycle; +% Front Face +\draw[fill=\filllcolor!40,line width=\Linewidth] +(0,0,\Height)coordinate(\picname-DL)--(0,\Width,\Height)coordinate(\picname-GL)-- +(\Depth,\Width,\Height)coordinate(\picname-GD)--(\Depth,0,\Height)coordinate(\picname-DD)--(0,0,\Height); +% Top Face +\draw[fill=\filllcolor!20,line width=\Linewidth] +(0,\Width,0)coordinate(\picname-ZGL)--(0,\Width,\Height)-- +(\Depth,\Width,\Height)--(\Depth,\Width,0)coordinate(\picname-ZGD)--cycle; +\end{scope} + } + } +} +% #1 number of teeth +% #2 inner (root) radius of the teeth +% #3 outer (tip) radius of the teeth +% #4 angle from start to end of the first arc +% #5 angle to offset the second arc from the first +% #6 radius of the center hole to cut out; #7 /channel key overrides +\tikzset{ + pics/gear/.style args={#1/#2/#3/#4/#5/#6/#7}{ + code={ + \pgfkeys{/channel/.cd, #7} +\begin{scope}[shift={($(0,0)+(0,0)$)},scale=\scalefac,every node/.append style={transform shape}] + \pgfmathtruncatemacro{\N}{#1}% + \def\rin{#2}\def\rout{#3}\def\aA{#4}\def\aOff{#5}\def\rcut{#6}% + \path[rounded corners=1.5pt,draw=\drawcolor,fill=\filllcolor] + (0:\rin) + \foreach \i [evaluate=\i as \n using (\i-1)*360/\N] in 
{1,...,\N}{% + arc (\n:\n+\aA:\rin) + -- (\n+\aA+\aOff:\rout) + arc (\n+\aA+\aOff:\n+360/\N-\aOff:\rout) + -- (\n+360/\N:\rin) + } -- cycle; + \draw[draw=none,fill=white](0,0) circle[radius=\rcut]; +\end{scope} + }} +} +%puzzle +\tikzset{pics/puzzle/.style = { + code = { +\pgfkeys{/channel/.cd, #1} +\begin{scope}[scale=\scalefac, every node/.append style={transform shape}] +\fill[fill=\filllcolor] (-2,-0.35) to[out=90,in=135] (-1.5,-0.45) arc(-135:135:0.6 and +{0.45*sqrt(2)}) to[out=-135,in=-90] (-2,0.35) |- (-0.35,2) +to[out=0,in=-45] (-0.45,2.5) arc(225:-45:{0.45*sqrt(2)} and 0.6) +to[out=-135,in=180] (0.35,2) -| (2,0.35) +to[out=-90,in=225] (2.5,0.45) arc(135:-135:0.6 and {0.45*sqrt(2)}) +to[out=135,in=90] (2,-0.35) |- (0.35,-2) +to[out=180,in=-135] (0.45,-1.5) arc(-45:225:{0.45*sqrt(2)} and 0.6) +to[out=-45,in=0] (-0.35,-2) -| cycle; +\end{scope} +}}} +\pgfkeys{ + /channel/.cd, + Dual/.store in=\Dual, + Depth/.store in=\Depth, + Height/.store in=\Height, + Width/.store in=\Width, + Smile/.store in=\Smile, + Level/.store in=\Level, + filllcirclecolor/.store in=\filllcirclecolor, + filllcolor/.store in=\filllcolor, + drawcolor/.store in=\drawcolor, + drawcircle/.store in=\drawcircle, + scalefac/.store in=\scalefac, + Linewidth/.store in=\Linewidth, + picname/.store in=\picname, + filllcolor=BrownLine, + filllcirclecolor=cyan, + drawcolor=black, + drawcircle=violet, + scalefac=1, + Dual=adual, + Smile=smile, + Level=0.52, + Linewidth=0.5pt, + Depth=1.3, + Height=0.8, + Width=1.1, + picname=C +} + % Nodes \node[Diamond](D1){Accuracy degrading over time?}; - % Branch 1: Yes (Drift) -\node[Result, right=1.5 of D1](R1){\textbf{Data Drift}\\(Freshness Debt)\\ \footnotesize Check PSI, Distributions}; -\draw[Line] (D1.east) -- node[Text,above]{Yes} (R1.west); +\node[Result, right=1.5 of D1](R1){}; +\pic[shift={(0.37,-0.55)}] at (R1.west){graph={scalefac=0.55,picname=1,filllcolor=OrangeLine, Linewidth=0.7pt}}; +\node[Result,draw=none,anchor=south east,minimum width=42mm, 
fill=none](R1T) +at (R1.south east){\textbf{Data Drift}\\[-2pt]\footnotesize (Freshness Debt)\\[-2pt] \footnotesize Check PSI, Distributions}; +\draw[Line] (D1.east) -- node[Text,above=1pt]{Yes} (R1.west); % Branch 1: No -> D2 -\node[Diamond, below=1.5 of D1](D2){Training Acc $\gg$ Validation Acc?}; -\draw[Line] (D1.south) -- node[Text,right]{No/Constant} (D2.north); +\node[Diamond, below=1.15 of D1](D2){Training Acc $\gg$ Validation Acc?}; +\draw[Line] (D1.south) -- node[Text,right=1pt]{No/Constant} (D2.north); % Branch 2: Yes (Overfitting/Quality) -\node[Result, right=1.5 of D2](R2){\textbf{Overfitting/Artifacts}\\ \footnotesize Check Label Noise, Duplicates}; -\draw[Line] (D2.east) -- node[Text,above]{Yes} (R2.west); +\node[Result, right=1.5 of D2](R2){}; + +\begin{scope}[local bounding box=PUZZLE1,shift={($(0.42,-0.28)+(R2.west)$)}, +scale=0.58, every node/.append style={transform shape}] +\pic[shift={(0,0)}] at (0,0){puzzle={scalefac=0.2,picname=1,filllcolor=orange!80}}; +\pic[shift={(0,0)}] at (0,0.8){puzzle={scalefac=0.2,picname=1,filllcolor=red!80}}; +\pic[shift={(0,0)}] at (0.8,0){puzzle={scalefac=0.2,picname=1,filllcolor=green!60!black}}; +\pic[shift={(0,0)}] at (0.8,0.8){puzzle={scalefac=0.2,picname=1,filllcolor=cyan!70}}; +\end{scope} +\node[Result,draw=none,anchor=south east,minimum width=42mm, fill=none](R2T) +at (R2.south east){\textbf{Overfitting/Artifacts}\\[-1pt] \footnotesize Check Label Noise, +\\[-1pt] +Duplicates}; +\draw[Line] (D2.east) -- node[Text,above=1pt]{Yes} (R2.west); % Branch 2: No -> D3 -\node[Diamond, below=1.5 of D2](D3){Validation Acc $\gg$ Production Acc?}; -\draw[Line] (D2.south) -- node[Text,right]{No ($\approx$)} (D3.north); +\node[Diamond, below=1.15 of D2](D3){Validation Acc $\gg$ Production Acc?}; +\draw[Line] (D2.south) -- node[Text,right=1pt]{No ($\approx$)} (D3.north); % Branch 3: Yes (Skew) -\node[Result, right=1.5 of D3](R3){\textbf{Training-Serving Skew}\\ \footnotesize Check Transformations, Schema}; -\draw[Line] 
(D3.east) -- node[Text,above]{Yes} (R3.west); +\node[Result, right=1.5 of D3](R3){}; +\draw[Line] (D3.east) -- node[Text,above=1pt]{Yes} (R3.west); +\pic[shift={(0.65,0)}] at (R3.west) {gear={9/1.6/2.1/5/2/1.0/scalefac=0.25,drawcolor=GreenD,filllcolor=GreenD}}; +\node[Result,draw=none,anchor=south east,minimum width=45mm, fill=none](R3T) +at (R3.south east){\textbf{Training-Serving Skew}\\[-1pt] \footnotesize Check Transformations,\\[-1pt] Schema}; % Branch 3: No -> D4 -\node[Diamond, below=1.5 of D3](D4){Performance inconsistent across subgroups?}; -\draw[Line] (D3.south) -- node[Text,right]{No ($\approx$)} (D4.north); +\node[Diamond, below=1.15 of D3](D4){Performance inconsistent across subgroups?}; +\draw[Line] (D3.south) -- node[Text,right=1pt]{No ($\approx$)} (D4.north); % Branch 4: Yes (Bias) -\node[Result, right=1.5 of D4](R4){\textbf{Bias / Coverage Gap}\\ \footnotesize Check Slice Metrics}; -\draw[Line] (D4.east) -- node[Text,above]{Yes} (R4.west); +\node[Result, right=1.5 of D4](R4){}; +\pic[shift={(0.65,0)}] at (R4.west){target={scalefac=0.45,picname=1,drawcolor=BlueLine, +filllcolor=cyan!90!,Linewidth=0.7pt, filllcirclecolor=cyan!20}}; +\node[Result,draw=none,anchor=south east,minimum width=45mm, fill=none](R4T) +at (R4.south east){\textbf{Bias / Coverage Gap}\\ \footnotesize Check Slice Metrics}; +\draw[Line] (D4.east) -- node[Text,above=1pt]{Yes} (R4.west); % Branch 4: No -> Model Issue -\node[Box, below=1.5 of D4, fill=gray!20, draw=black!60](R5){\textbf{Model Architecture}\\or Capacity Issue}; -\draw[Line] (D4.south) -- node[Text,right]{No} (R5.north); +\node[Box, below=1.15 of D4,fill=gray!20, draw=black!60](R5){}; +\pic[rotate=20,shift={(0.27,-0.35)}] at (R5.west){square={scalefac=0.4,picname=1, +filllcolor=VioletLine, Linewidth=0.5pt}}; +\node[Box,draw=none,anchor=south east,minimum width=32mm, fill=none](R5T) +at (R5.south east){\textbf{Model Architecture}\\or Capacity Issue}; +\draw[Line] (D4.south) -- node[Text,right=1pt]{No} (R5.north); % 
Context Labels -\node[text width=4cm, align=center, font=\footnotesize, anchor=north] at (R1.south) {Fix: Retraining, Monitoring}; -\node[text width=4cm, align=center, font=\footnotesize, anchor=north] at (R2.south) {Fix: Deduplication, Audit}; -\node[text width=4cm, align=center, font=\footnotesize, anchor=north] at (R3.south) {Fix: Feature Store, Consistency}; -\node[text width=4cm, align=center, font=\footnotesize, anchor=north] at (R4.south) {Fix: Targeted Collection}; - -\end{tikzpicture}} +\node[text width=45mm, align=center, font=\footnotesize\usefont{T1}{phv}{m}{n}, anchor=north] at (R1.south) {Fix: Retraining, Monitoring}; +\node[text width=45mm, align=center, font=\footnotesize\usefont{T1}{phv}{m}{n}, anchor=north] at (R2.south) {Fix: Deduplication, Audit}; +\node[text width=45mm, align=center, font=\footnotesize\usefont{T1}{phv}{m}{n}, anchor=north] at (R3.south) {Fix: Feature Store, Consistency}; +\node[text width=45mm, align=center, font=\footnotesize\usefont{T1}{phv}{m}{n}, anchor=north] at (R4.south) {Fix: Targeted Collection}; +\end{tikzpicture} ``` + ::: Most production ML failures trace to data, not models.[^fn-failure-proportions] Industry experience suggests a consistent pattern: training-serving skew accounts for 30–40% of failures, data drift for another 20–30%, and label quality issues for 15–25%. Model architecture—the component engineers most naturally investigate first—accounts for only 10–15%. Debugging the model before verifying data consistency wastes engineering cycles on the wrong tenth of the problem space.