mirror of
https://github.com/harvard-edge/cs249r_book.git
synced 2026-04-29 00:59:07 -05:00
Updates AI Engineering definition and references
Updates the AI Engineering definition and corrects a typo. Updates broken cross-references to deployment paradigms. Standardizes the format of bibtex entries. Refactors a table in the robust AI section.
This commit is contained in:
369
docker/windows/Dockerfile.backup-2025-10-08
Normal file
369
docker/windows/Dockerfile.backup-2025-10-08
Normal file
@@ -0,0 +1,369 @@
|
||||
# escape=`
|
||||
# MLSysBook Windows Quarto Build Container (Windows Server 2022)
|
||||
# - PowerShell 7 via ZIP (no MSI)
|
||||
# - Quarto via Scoop (version not pinned)
|
||||
# - Python via Scoop (main bucket, version not pinned) + requirements.txt
|
||||
# - Ghostscript + Inkscape (Scoop)
|
||||
# - TeX Live via Chocolatey (not pinned; tlmgr update --self --all is run) + collections from tl_packages
|
||||
# - R via Scoop (main bucket, version not pinned) + packages via install_packages.R
|
||||
# - Verifications: tool version checks (Quarto, Python, R, lualatex, Ghostscript, Inkscape) and `quarto check`
|
||||
|
||||
FROM mcr.microsoft.com/windows/server:ltsc2022
|
||||
|
||||
# Use Windows PowerShell initially
|
||||
SHELL ["powershell.exe", "-NoLogo", "-ExecutionPolicy", "Bypass", "-Command"]
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 0: Base dirs and env (same as quarto-build workflow)
|
||||
# ------------------------------------------------------------
|
||||
ENV R_LIBS_USER="C:/r-lib"
|
||||
ENV QUARTO_LOG_LEVEL="INFO"
|
||||
ENV PYTHONIOENCODING="utf-8"
|
||||
ENV LANG="en_US.UTF-8"
|
||||
ENV LC_ALL="en_US.UTF-8"
|
||||
|
||||
RUN Write-Host '=== STARTING BASE SETUP ===' ; `
|
||||
Write-Host 'Creating base directories...' ; `
|
||||
New-Item -ItemType Directory -Force -Path 'C:\temp' | Out-Null ; `
|
||||
Write-Host '📁 Created C:\temp' ; `
|
||||
New-Item -ItemType Directory -Force -Path 'C:\r-lib' | Out-Null ; `
|
||||
Write-Host '📁 Created C:\r-lib' ; `
|
||||
Write-Host 'Environment variables set:' ; `
|
||||
Write-Host " R_LIBS_USER: $env:R_LIBS_USER" ; `
|
||||
Write-Host " QUARTO_LOG_LEVEL: $env:QUARTO_LOG_LEVEL" ; `
|
||||
Write-Host " PYTHONIOENCODING: $env:PYTHONIOENCODING" ; `
|
||||
Write-Host " LANG: $env:LANG" ; `
|
||||
Write-Host " LC_ALL: $env:LC_ALL" ; `
|
||||
Write-Host '✅ Base setup complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 1: PowerShell 7 (ZIP install, container-safe)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING POWERSHELL 7 INSTALLATION ===' ; `
|
||||
Write-Host 'Using ZIP install for container compatibility' ; `
|
||||
Write-Host 'Download URL: https://github.com/PowerShell/PowerShell/releases/download/v7.4.1/PowerShell-7.4.1-win-x64.zip' ; `
|
||||
$Url = 'https://github.com/PowerShell/PowerShell/releases/download/v7.4.1/PowerShell-7.4.1-win-x64.zip' ; `
|
||||
$Zip = 'C:\PowerShell-7.4.1.zip' ; `
|
||||
Write-Host "Downloading PowerShell 7 to: $Zip" ; `
|
||||
Invoke-WebRequest -Uri $Url -OutFile $Zip -UseBasicParsing ; `
|
||||
Write-Host '📥 Download completed' ; `
|
||||
Write-Host 'Creating PowerShell directory...' ; `
|
||||
New-Item -ItemType Directory -Force -Path 'C:\Program Files\PowerShell\7' | Out-Null ; `
|
||||
Write-Host '📁 Directory created' ; `
|
||||
Write-Host 'Extracting ZIP file...' ; `
|
||||
Expand-Archive -Path $Zip -DestinationPath 'C:\Program Files\PowerShell\7' -Force ; `
|
||||
Write-Host '📦 Extraction completed' ; `
|
||||
Write-Host 'Cleaning up ZIP file...' ; `
|
||||
Remove-Item $Zip -Force ; `
|
||||
Write-Host '🧹 Cleanup completed' ; `
|
||||
Write-Host 'Adding PowerShell to PATH...' ; `
|
||||
$mach = [Environment]::GetEnvironmentVariable('PATH','Machine') ; `
|
||||
Write-Host "Current PATH: $mach" ; `
|
||||
if ($mach -notmatch [regex]::Escape('C:\Program Files\PowerShell\7')) { `
|
||||
[Environment]::SetEnvironmentVariable('PATH', ('C:\Program Files\PowerShell\7;' + $mach), 'Machine') ; `
|
||||
Write-Host '🔗 PowerShell added to PATH' ; `
|
||||
} else { `
|
||||
Write-Host '⚠️ PowerShell already in PATH' ; `
|
||||
} ; `
|
||||
Write-Host 'Verifying PowerShell installation...' ; `
|
||||
& 'C:\Program Files\PowerShell\7\pwsh.exe' -NoLogo -Command '$PSVersionTable.PSVersion ; Write-Host ''PowerShell 7 installation verified ✅'''
|
||||
|
||||
# Switch to PowerShell 7 for subsequent layers
|
||||
SHELL ["C:\\Program Files\\PowerShell\\7\\pwsh.exe", "-NoLogo", "-ExecutionPolicy", "Bypass", "-Command"]
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 2: Chocolatey (package manager for Windows)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING CHOCOLATEY INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Chocolatey package manager...' ; `
|
||||
Write-Host 'Setting TLS 1.2 for download...' ; `
|
||||
[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 ; `
|
||||
Write-Host '🔒 TLS 1.2 enabled' ; `
|
||||
Write-Host 'Downloading and executing Chocolatey install script...' ; `
|
||||
iex ((New-Object Net.WebClient).DownloadString('https://chocolatey.org/install.ps1')) ; `
|
||||
Write-Host '📦 Chocolatey install script executed' ; `
|
||||
Write-Host 'Verifying Chocolatey installation...' ; `
|
||||
choco --version ; `
|
||||
Write-Host '✅ Chocolatey installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 3: Copy dependency files (same as quarto-build workflow)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING DEPENDENCY FILE COPY ==='
|
||||
COPY tools/dependencies/requirements.txt C:/temp/requirements.txt
|
||||
COPY tools/dependencies/install_packages.R C:/temp/install_packages.R
|
||||
COPY tools/dependencies/tl_packages C:/temp/tl_packages
|
||||
RUN Write-Host '✅ Dependency file copy complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 4: Install TeX Live via Chocolatey (Testing if upstream fixed)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING TEX LIVE INSTALLATION (2025) ===' ; `
|
||||
Write-Host '📦 Installing TeX Live via Chocolatey...' ; `
|
||||
choco install texlive -y ; `
|
||||
Write-Host '✅ TeX Live installed via Chocolatey' ; `
|
||||
`
|
||||
Write-Host '🔍 Finding TeX Live installation directory...' ; `
|
||||
$texRoot = Join-Path $env:SystemDrive 'texlive' ; `
|
||||
Write-Host "📁 TeX Live root: $texRoot" ; `
|
||||
`
|
||||
Write-Host '🔍 Looking for year-based directories...' ; `
|
||||
$texYearDir = Get-ChildItem $texRoot -Directory | `
|
||||
Where-Object { $_.Name -match '^\d{4}$' } | `
|
||||
Sort-Object Name -Descending | `
|
||||
Select-Object -First 1 ; `
|
||||
Write-Host "📁 Found year directory: $($texYearDir.FullName)" ; `
|
||||
`
|
||||
$texLiveBin = Join-Path $texYearDir.FullName 'bin\windows' ; `
|
||||
Write-Host "📁 TeX Live bin directory: $texLiveBin" ; `
|
||||
`
|
||||
Write-Host '🔧 Adding TeX Live to PATH...' ; `
|
||||
$env:PATH = "$texLiveBin;$env:PATH" ; `
|
||||
Write-Host "✅ PATH updated with: $texLiveBin" ; `
|
||||
`
|
||||
Write-Host '📋 Reading collections from tl_packages...' ; `
|
||||
if (Test-Path 'C:\temp\tl_packages') { `
|
||||
$collections = Get-Content 'C:\temp\tl_packages' | `
|
||||
Where-Object { $_.Trim() -ne '' -and -not $_.Trim().StartsWith('#') } ; `
|
||||
Write-Host "📦 Found $($collections.Count) collections to install" ; `
|
||||
Write-Host '📋 Collections:' ; `
|
||||
$collections | ForEach-Object { Write-Host " - $_" } ; `
|
||||
`
|
||||
Write-Host '🔄 Installing collections...' ; `
|
||||
$i = 1 ; `
|
||||
foreach ($collection in $collections) { `
|
||||
Write-Host "📦 [$i/$($collections.Count)] Installing $collection..." ; `
|
||||
& "$texLiveBin\tlmgr.bat" install $collection ; `
|
||||
if ($LASTEXITCODE -eq 0) { `
|
||||
Write-Host "✅ $collection installed successfully" ; `
|
||||
} else { `
|
||||
Write-Host "⚠️ Failed to install $collection, continuing..." ; `
|
||||
} ; `
|
||||
$i++ ; `
|
||||
} ; `
|
||||
Write-Host '✅ Collection installation complete' ; `
|
||||
} else { `
|
||||
Write-Host '⚠️ No tl_packages file found, skipping collection installation' ; `
|
||||
} ; `
|
||||
`
|
||||
Write-Host '🔄 Updating tlmgr...' ; `
|
||||
& "$texLiveBin\tlmgr.bat" update --self --all ; `
|
||||
Write-Host '✅ tlmgr updated' ; `
|
||||
`
|
||||
Write-Host '🔍 Verifying lualatex installation...' ; `
|
||||
& "$texLiveBin\lualatex.exe" --version ; `
|
||||
Write-Host '✅ TeX Live installation verified'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 5: Install Scoop (Package manager setup)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING SCOOP INSTALLATION ===' ; `
|
||||
Write-Host 'Setting UTF-8 encoding...' ; `
|
||||
[Console]::OutputEncoding = [System.Text.Encoding]::UTF8 ; `
|
||||
$OutputEncoding = [System.Text.Encoding]::UTF8 ; `
|
||||
Write-Host '🔤 UTF-8 encoding set' ; `
|
||||
Write-Host 'Setting execution policy...' ; `
|
||||
Set-ExecutionPolicy -ExecutionPolicy RemoteSigned -Scope CurrentUser -Force ; `
|
||||
Write-Host '🔐 Execution policy set' ; `
|
||||
Write-Host 'Installing Scoop package manager...' ; `
|
||||
Invoke-WebRequest -useb get.scoop.sh -outfile 'install.ps1' ; `
|
||||
Write-Host '📥 Scoop install script downloaded' ; `
|
||||
& .\install.ps1 -RunAsAdmin ; `
|
||||
Write-Host '📦 Scoop installed' ; `
|
||||
Write-Host 'Adding Scoop shims to PATH...' ; `
|
||||
$scoopShims = Join-Path (Resolve-Path ~).Path 'scoop\shims' ; `
|
||||
Write-Host "Scoop shims path: $scoopShims" ; `
|
||||
$mach = [Environment]::GetEnvironmentVariable('PATH','Machine') ; `
|
||||
[Environment]::SetEnvironmentVariable('PATH', ($scoopShims + ';' + $mach), 'Machine') ; `
|
||||
Write-Host '🔗 Added Scoop shims to PATH' ; `
|
||||
Write-Host 'Installing Git (required for buckets)...' ; `
|
||||
scoop install git ; `
|
||||
Write-Host '📦 Git installed' ; `
|
||||
Write-Host 'Adding r-bucket...' ; `
|
||||
scoop bucket add r-bucket https://github.com/cderv/r-bucket.git ; `
|
||||
Write-Host '📦 r-bucket added' ; `
|
||||
Write-Host 'Adding extras bucket...' ; `
|
||||
scoop bucket add extras ; `
|
||||
Write-Host '📦 extras bucket added' ; `
|
||||
Write-Host '✅ Scoop installation completed!'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 6: Install Quarto (Main tool)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING QUARTO INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Quarto via Scoop...' ; `
|
||||
scoop install quarto ; `
|
||||
Write-Host '📦 Quarto installed' ; `
|
||||
Write-Host 'Verifying Quarto installation...' ; `
|
||||
quarto --version ; `
|
||||
Write-Host '✅ Quarto installation completed!'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 7: Install Ghostscript (required for PDF generation)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING GHOSTSCRIPT INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Ghostscript via Scoop...' ; `
|
||||
scoop install main/ghostscript ; `
|
||||
Write-Host '📦 Ghostscript installed' ; `
|
||||
Write-Host 'Verifying Ghostscript installation...' ; `
|
||||
gs --version ; `
|
||||
Write-Host '✅ Ghostscript installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 8: Install Inkscape (required for SVG processing)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING INKSCAPE INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Inkscape via Scoop...' ; `
|
||||
scoop install inkscape ; `
|
||||
Write-Host '📦 Inkscape installed' ; `
|
||||
Write-Host 'Verifying Inkscape installation...' ; `
|
||||
inkscape --version ; `
|
||||
Write-Host '✅ Inkscape installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 9: Install Python (Medium complexity)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING PYTHON INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Python via Scoop (same as quarto-build workflow)...' ; `
|
||||
Write-Host 'Installing Python from main bucket...' ; `
|
||||
scoop install main/python ; `
|
||||
Write-Host '📦 Python installed' ; `
|
||||
Write-Host 'Verifying Python installation...' ; `
|
||||
python --version ; `
|
||||
Write-Host '✅ Python installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 10: Install Python packages (Medium complexity)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING PYTHON PACKAGE INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Python packages from requirements.txt (same as quarto-build workflow)...' ; `
|
||||
Write-Host 'Upgrading pip...' ; `
|
||||
python -m pip install --upgrade pip ; `
|
||||
Write-Host '📦 pip upgraded' ; `
|
||||
Write-Host 'Installing packages from requirements.txt...' ; `
|
||||
Write-Host 'Requirements file contents:' ; `
|
||||
Get-Content C:/temp/requirements.txt | Write-Host ; `
|
||||
python -m pip install -r C:/temp/requirements.txt ; `
|
||||
Write-Host '✅ Python package installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 11: Install Visual C++ Redistributable (Required for Quarto DLLs)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING VISUAL C++ REDISTRIBUTABLE INSTALLATION ===' ; `
|
||||
Write-Host 'Installing Microsoft Visual C++ Redistributable...' ; `
|
||||
Write-Host 'This is required for Quarto DLL dependencies on Windows' ; `
|
||||
choco install vcredist-all -y ; `
|
||||
Write-Host '📦 Visual C++ Redistributable installed' ; `
|
||||
Write-Host '✅ Visual C++ Redistributable installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 12: Install R (Medium complexity)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING R INSTALLATION ===' ; `
|
||||
Write-Host 'Installing R via Scoop (same as quarto-build workflow)...' ; `
|
||||
Write-Host 'Installing R from main bucket...' ; `
|
||||
scoop install main/r ; `
|
||||
Write-Host '📦 R installed' ; `
|
||||
Write-Host 'Verifying R installation...' ; `
|
||||
R --version ; `
|
||||
Write-Host '✅ R installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 13: Install R packages (Medium complexity)
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== INSTALLING R PACKAGES ===' ; `
|
||||
Write-Host 'Installing R packages from install_packages.R (same as quarto-build workflow)...' ; `
|
||||
Write-Host 'Setting up R environment...' ; `
|
||||
Write-Host "R_LIBS_USER: $env:R_LIBS_USER" ; `
|
||||
Write-Host 'Installing R packages...' ; `
|
||||
Rscript -e 'options(repos=c(CRAN=\"https://cran.rstudio.com\"))' ; `
|
||||
Rscript -e 'dir.create(Sys.getenv(\"R_LIBS_USER\"), recursive=TRUE, showWarnings=FALSE)' ; `
|
||||
Rscript -e '.libPaths(Sys.getenv(\"R_LIBS_USER\"))' ; `
|
||||
Rscript -e 'install.packages(\"remotes\")' ; `
|
||||
if (Test-Path 'C:/temp/install_packages.R') { `
|
||||
Write-Host 'Found install_packages.R, sourcing it...' ; `
|
||||
Rscript 'C:/temp/install_packages.R' ; `
|
||||
} else { `
|
||||
Write-Host 'No install_packages.R found, installing basic packages...' ; `
|
||||
Rscript -e 'install.packages(c(\"rmarkdown\",\"knitr\",\"ggplot2\"))' ; `
|
||||
} ; `
|
||||
Rscript -e 'for (p in c(\"rmarkdown\",\"knitr\")) if (!require(p, character.only=TRUE, quietly=TRUE)) stop(\"missing: \", p)' ; `
|
||||
Write-Host '📦 R packages installed' ; `
|
||||
Write-Host 'Verifying R packages...' ; `
|
||||
Rscript C:/temp/verify_r_packages.R ; `
|
||||
Write-Host '✅ R package installation complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# PHASE 14: Cleanup and Environment Setup
|
||||
# ------------------------------------------------------------
|
||||
RUN Write-Host '=== STARTING CLEANUP AND ENVIRONMENT SETUP ===' ; `
|
||||
Write-Host 'Cleaning temporary files and setting up environment...' ; `
|
||||
Write-Host 'Removing temporary files...' ; `
|
||||
Remove-Item C:/temp/requirements.txt -ErrorAction SilentlyContinue ; `
|
||||
Write-Host '🗑️ requirements.txt removed' ; `
|
||||
Remove-Item C:/temp/install_packages.R -ErrorAction SilentlyContinue ; `
|
||||
Write-Host '🗑️ install_packages.R removed' ; `
|
||||
Remove-Item C:/temp/verify_r_packages.R -ErrorAction SilentlyContinue ; `
|
||||
Write-Host '🗑️ verify_r_packages.R removed' ; `
|
||||
Remove-Item C:/temp/tl_packages -ErrorAction SilentlyContinue ; `
|
||||
Write-Host '🗑️ tl_packages removed' ; `
|
||||
Remove-Item C:/temp/requirements/ -Recurse -Force -ErrorAction SilentlyContinue ; `
|
||||
Write-Host '🗑️ requirements/ directory removed' ; `
|
||||
Write-Host 'Setting up environment variables for Quarto...' ; `
|
||||
$env:QUARTO_LOG_LEVEL = 'DEBUG' ; `
|
||||
[Environment]::SetEnvironmentVariable('QUARTO_LOG_LEVEL', 'DEBUG', 'Machine') ; `
|
||||
Write-Host '🔧 QUARTO_LOG_LEVEL set to DEBUG' ; `
|
||||
Write-Host '✅ Cleanup and environment setup complete'
|
||||
|
||||
# ------------------------------------------------------------
|
||||
# FINAL CHECKS: Comprehensive verification with diagnostics
|
||||
# ------------------------------------------------------------
|
||||
WORKDIR C:/workspace
|
||||
RUN Write-Host '=== FINAL VERIFICATION WITH ENHANCED DIAGNOSTICS ===' ; `
|
||||
Write-Host 'Verifying all installations with comprehensive checks...' ; `
|
||||
Write-Host '' ; `
|
||||
Write-Host '🔍 SYSTEM DIAGNOSTICS:' ; `
|
||||
Write-Host '----------------------' ; `
|
||||
Write-Host 'PATH environment variable:' ; `
|
||||
Write-Host $env:PATH ; `
|
||||
Write-Host '' ; `
|
||||
Write-Host 'Visual C++ Redistributable check:' ; `
|
||||
Get-ChildItem 'C:\Windows\System32' -Filter 'msvcp*.dll' | Select-Object Name, Length, LastWriteTime ; `
|
||||
Write-Host '' ; `
|
||||
Write-Host '📊 TOOL VERIFICATION:' ; `
|
||||
Write-Host '---------------------' ; `
|
||||
Write-Host 'Checking Quarto...' ; `
|
||||
try { `
|
||||
quarto --version ; `
|
||||
Write-Host '✅ Quarto version check: PASSED' ; `
|
||||
Write-Host 'Running Quarto check for comprehensive validation...' ; `
|
||||
& quarto check 2>&1 | Write-Host ; `
|
||||
if ($LASTEXITCODE -eq 0) { `
|
||||
Write-Host '✅ Quarto check: PASSED' ; `
|
||||
} else { `
|
||||
Write-Host '⚠️ Quarto check: ISSUES DETECTED' ; `
|
||||
Write-Host "Exit code: $LASTEXITCODE" ; `
|
||||
} ; `
|
||||
} catch { `
|
||||
Write-Host '❌ Quarto verification failed:' ; `
|
||||
Write-Host $_.Exception.Message ; `
|
||||
} ; `
|
||||
Write-Host 'Checking Python...' ; `
|
||||
python --version ; `
|
||||
Write-Host '✅ Python verified' ; `
|
||||
Write-Host 'Checking R...' ; `
|
||||
R --version ; `
|
||||
Write-Host '✅ R verified' ; `
|
||||
Write-Host 'Checking LaTeX...' ; `
|
||||
lualatex --version ; `
|
||||
Write-Host '✅ LaTeX verified' ; `
|
||||
Write-Host 'Checking Ghostscript...' ; `
|
||||
gs --version ; `
|
||||
Write-Host '✅ Ghostscript verified' ; `
|
||||
Write-Host 'Checking Inkscape...' ; `
|
||||
inkscape --version ; `
|
||||
Write-Host '✅ Inkscape verified' ; `
|
||||
Write-Host '' ; `
|
||||
Write-Host '🎯 FINAL STATUS:' ; `
|
||||
Write-Host '----------------' ; `
|
||||
Write-Host '✅ Windows container build completed with enhanced diagnostics'
|
||||
@@ -31,7 +31,7 @@ _DALL·E 3 Prompt: An image depicting a concluding chapter of an ML systems book
|
||||
|
||||
:::
|
||||
|
||||
## Overview {#sec-conclusion-overview-9b37}
|
||||
## Synthesizing ML Systems Engineering: From Components to Intelligence {#sec-conclusion-overview-9b37}
|
||||
|
||||
This chapter synthesizes machine learning systems engineering concepts from the preceding twenty chapters, establishing systems thinking as the fundamental paradigm for artificial intelligence development. Our progression from data engineering principles through model architectures, optimization techniques, and operational infrastructure has constructed a comprehensive knowledge foundation spanning ML systems engineering. This synthesis establishes theoretical and practical frameworks that define professional competency in machine learning systems engineering within computer systems research.
|
||||
|
||||
|
||||
@@ -43,7 +43,7 @@ Machine learning systems require unprecedented access to personal data, institut
|
||||
|
||||
:::
|
||||
|
||||
## Overview {#sec-security-privacy-overview-af7c}
|
||||
## Security and Privacy in ML Systems {#sec-security-privacy-overview-af7c}
|
||||
|
||||
The shift from centralized training architectures to distributed, adaptive machine learning systems has altered the threat landscape and security requirements for modern ML infrastructure. Contemporary machine learning systems, as examined in @sec-ondevice-learning, increasingly operate across heterogeneous computational environments spanning edge devices, federated networks, and hybrid cloud deployments. This architectural evolution enables new capabilities in adaptive intelligence but introduces attack vectors and privacy vulnerabilities that traditional cybersecurity frameworks cannot adequately address.
|
||||
|
||||
@@ -55,7 +55,7 @@ Addressing these challenges requires systematic approaches that integrate securi
|
||||
|
||||
Our investigation proceeds through four interconnected frameworks. We begin by establishing distinctions between security and privacy within machine learning contexts, then examine evidence from historical security incidents to inform contemporary threat assessment. We analyze vulnerabilities that emerge from the learning process itself, before presenting layered defense architectures that span cryptographic data protection, adversarial-robust model design, and hardware security mechanisms. Throughout this analysis, we emphasize implementation guidance that enables practitioners to develop systems meeting both technical performance requirements and the trust standards necessary for societal deployment.
|
||||
|
||||
## Definitions and Distinctions {#sec-security-privacy-definitions-distinctions-8f62}
|
||||
## Foundational Concepts and Definitions {#sec-security-privacy-definitions-distinctions-8f62}
|
||||
|
||||
Security and privacy are core concerns in machine learning system design, but they are often misunderstood or conflated. Both aim to protect systems and data, yet they do so in different ways, address different threat models, and require distinct technical responses. For ML systems, distinguishing between the two helps guide the design of robust and responsible infrastructure.
|
||||
|
||||
@@ -109,7 +109,7 @@ However, they can also be in tension. Techniques like differential privacy[^fn-d
|
||||
|
||||
[^fn-dp-origins]: **Differential Privacy Origins**: Cynthia Dwork coined the term differential privacy at Microsoft Research in 2006, but the concept emerged from her frustration with the "anonymization myth" (the false belief that removing names from data guaranteed privacy). Her groundbreaking insight was that privacy should be mathematically provable, not just plausible, leading to the rigorous framework that now protects billions of users' data in products from Apple to Google.
|
||||
|
||||
## Historical Incidents {#sec-security-privacy-historical-incidents-2c34}
|
||||
## Learning from Security Breaches {#sec-security-privacy-historical-incidents-2c34}
|
||||
|
||||
Having established the conceptual foundations of security and privacy, we now examine how these principles manifest in real-world systems through landmark security incidents. These historical cases provide concrete illustrations of the abstract concepts we've defined, showing how security vulnerabilities emerge and propagate through complex systems. More importantly, they reveal universal patterns (supply chain compromise, insufficient isolation, and weaponized endpoints) that directly apply to modern machine learning deployments.
|
||||
|
||||
@@ -181,7 +181,7 @@ Consider a concrete attack scenario where attackers compromise 50,000 smart secu
|
||||
|
||||
Comprehensive defense against such weaponization requires zero-trust edge security: (1) Secure manufacturing that eliminates default credentials, implements hardware security modules (HSMs) for device-unique keys, and enables secure boot with cryptographic verification; (2) Encrypted communications that mandate TLS 1.3+ for all ML API communications with certificate pinning and mutual authentication; (3) Behavioral monitoring that deploys anomaly detection systems to identify unusual inference patterns, unexpected network traffic, and suspicious computational loads; (4) Automated response that implements kill switches to disable compromised devices remotely and quarantine them from networks; (5) Update security that enforces cryptographically signed firmware updates with automatic security patching and version rollback capabilities.
|
||||
|
||||
## Threat Patterns and Prioritization Framework {#sec-security-privacy-threat-patterns-prioritization-framework-8c27}
|
||||
## Systematic Threat Analysis and Risk Assessment {#sec-security-privacy-threat-patterns-prioritization-framework-8c27}
|
||||
|
||||
The historical incidents demonstrate how fundamental security failures manifest across different computing paradigms. Supply chain vulnerabilities enable persistent compromise, insufficient isolation allows privilege escalation, and weaponized endpoints create attack infrastructure at scale. These patterns directly apply to machine learning deployments: compromised training pipelines and model repositories inherit supply chain risks, external interfaces to safety-critical ML components require strict isolation, and compromised ML edge devices can exfiltrate inference data or participate in coordinated attacks.
|
||||
|
||||
@@ -207,7 +207,7 @@ Consider these threat priority categories:
|
||||
|
||||
This framework guides resource allocation throughout this chapter. We begin with the most common and accessible threats (model theft, data poisoning, and adversarial attacks) before examining more specialized hardware and infrastructure vulnerabilities. Understanding these priority levels helps practitioners implement defenses in a logical sequence that maximizes security benefit per invested effort.
|
||||
|
||||
## Threats to ML Models {#sec-security-privacy-threats-ml-models-fbb8}
|
||||
## Model-Specific Attack Vectors {#sec-security-privacy-threats-ml-models-fbb8}
|
||||
|
||||
Machine learning systems face threats spanning the entire ML lifecycle, from training-time manipulations to inference-time evasion. These threats fall into three broad categories: threats to model confidentiality (model theft), threats to training integrity (data poisoning[^fn-data-poisoning]), and threats to inference robustness (adversarial examples[^fn-adversarial-examples]). Each category targets different vulnerabilities and requires distinct defensive strategies.
|
||||
|
||||
@@ -579,7 +579,7 @@ The specialized computing infrastructure that powers machine learning workloads
|
||||
|
||||
In the next section, we examine how adversaries can target the physical infrastructure that executes machine learning workloads through hardware bugs, physical tampering, side channels, and supply chain risks.
|
||||
|
||||
## Threats to ML Hardware {#sec-security-privacy-threats-ml-hardware-6c4a}
|
||||
## Hardware-Level Security Vulnerabilities {#sec-security-privacy-threats-ml-hardware-6c4a}
|
||||
|
||||
As machine learning systems move from research prototypes to large-scale, real-world deployments, their security depends on the hardware platforms they run on. Whether deployed in data centers, on edge devices, or in embedded systems, machine learning applications rely on a layered stack of processors, accelerators, memory, and communication interfaces. These hardware components, while essential for enabling efficient computation, introduce unique security risks that go beyond traditional software-based vulnerabilities.
|
||||
|
||||
@@ -787,7 +787,7 @@ In response to these risks, both industry and government stakeholders have begun
|
||||
|
||||
The Supermicro controversy serves as a cautionary tale for the machine learning community. It demonstrates that hardware security cannot be taken for granted, even when working with reputable suppliers. Ensuring the integrity of ML systems requires rigorous attention to the entire hardware lifecycle—from design and fabrication to deployment and maintenance. This case reinforces the need for organizations to adopt comprehensive supply chain security practices as a foundational element of trustworthy ML system design.
|
||||
|
||||
## ML-Enabled Attack Vectors {#sec-security-privacy-mlenabled-attack-vectors-880c}
|
||||
## When ML Systems Become Attack Tools {#sec-security-privacy-mlenabled-attack-vectors-880c}
|
||||
|
||||
The threats examined thus far—model theft, data poisoning, adversarial attacks, hardware vulnerabilities—represent attacks targeting machine learning systems. However, a complete threat model must also account for the inverse: machine learning as an attack amplifier. The same capabilities that make ML powerful for beneficial applications also enhance adversarial operations, transforming machine learning from passive target to active weapon.
|
||||
|
||||
@@ -923,7 +923,7 @@ Subsequent work expanded on this approach by introducing long-range models capab
|
||||
|
||||
The implications extend beyond academic interest. As deep learning models continue to scale, their application to side-channel contexts is likely to lower the cost, skill threshold, and trace requirements of hardware-level attacks—posing a growing challenge for the secure deployment of embedded machine learning systems, cryptographic modules, and trusted execution environments.
|
||||
|
||||
## Defensive Strategies {#sec-security-privacy-defensive-strategies-0844}
|
||||
## Comprehensive Defense Architectures {#sec-security-privacy-defensive-strategies-0844}
|
||||
|
||||
Having examined threats against ML systems and threats enabled by ML capabilities, we now turn to comprehensive defensive strategies. Designing secure and privacy-preserving machine learning systems requires more than identifying individual threats. It demands a layered defense strategy that integrates protections across multiple system levels to create comprehensive resilience.
|
||||
|
||||
@@ -1963,7 +1963,7 @@ Issue 5: Federated Learning Convergence Problems
|
||||
- Implement client reliability scoring and failover mechanisms
|
||||
::: -->
|
||||
|
||||
## Implementation Roadmap for ML Security {#sec-security-privacy-practical-roadmap-8f3a}
|
||||
## Practical Implementation Roadmap {#sec-security-privacy-practical-roadmap-8f3a}
|
||||
|
||||
The comprehensive security and privacy techniques covered in this chapter can seem overwhelming for organizations just beginning to secure their ML systems. Rather than implementing every defense simultaneously, a phased approach enables systematic security improvements while managing complexity and costs. This roadmap provides a practical sequence for building robust ML security, progressing from foundational controls to advanced defenses.
|
||||
|
||||
|
||||
@@ -3940,6 +3940,44 @@ def run_verify_mode_simple(file_path):
|
||||
if source_file:
|
||||
if os.path.exists(source_file):
|
||||
print(f"✅ Found corresponding QMD file: {source_file}")
|
||||
|
||||
# Validate section IDs match between quiz and QMD
|
||||
try:
|
||||
with open(source_file, 'r', encoding='utf-8') as qmd_f:
|
||||
qmd_content = qmd_f.read()
|
||||
|
||||
qmd_sections = extract_sections_with_ids(qmd_content)
|
||||
if qmd_sections:
|
||||
# Create sets of section IDs for comparison
|
||||
qmd_section_ids = {s['section_id'] for s in qmd_sections}
|
||||
quiz_section_ids = {s['section_id'] for s in sections}
|
||||
|
||||
# Check for mismatches
|
||||
quiz_not_in_qmd = quiz_section_ids - qmd_section_ids
|
||||
qmd_not_in_quiz = qmd_section_ids - quiz_section_ids
|
||||
|
||||
print(f"\n📋 Section ID Validation:")
|
||||
if not quiz_not_in_qmd and not qmd_not_in_quiz:
|
||||
print(f" ✅ All section IDs match perfectly")
|
||||
else:
|
||||
if quiz_not_in_qmd:
|
||||
print(f" ❌ Quiz sections NOT found in QMD file:")
|
||||
for section_id in sorted(quiz_not_in_qmd):
|
||||
# Find section title from quiz data
|
||||
section_title = next((s['section_title'] for s in sections if s['section_id'] == section_id), 'Unknown')
|
||||
print(f" - {section_id} ({section_title})")
|
||||
|
||||
if qmd_not_in_quiz:
|
||||
print(f" ⚠️ QMD sections NOT found in quiz file:")
|
||||
for section_id in sorted(qmd_not_in_quiz):
|
||||
# Find section title from QMD data
|
||||
section_title = next((s['section_title'] for s in qmd_sections if s['section_id'] == section_id), 'Unknown')
|
||||
print(f" - {section_id} ({section_title})")
|
||||
else:
|
||||
print(f"\n⚠️ Could not extract sections from QMD file for validation")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n⚠️ Could not validate section IDs: {str(e)}")
|
||||
else:
|
||||
print(f"⚠️ QMD file not found: {source_file}")
|
||||
else:
|
||||
@@ -3977,6 +4015,44 @@ def run_verify_mode_simple(file_path):
|
||||
quiz_path = os.path.join(os.path.dirname(file_path), quiz_file)
|
||||
if os.path.exists(quiz_path):
|
||||
print(f"✅ Found corresponding quiz file: {quiz_path}")
|
||||
|
||||
# Validate section IDs match between QMD and quiz
|
||||
try:
|
||||
with open(quiz_path, 'r', encoding='utf-8') as quiz_f:
|
||||
quiz_data = json.load(quiz_f)
|
||||
|
||||
quiz_sections = quiz_data.get('sections', [])
|
||||
if quiz_sections:
|
||||
# Create sets of section IDs for comparison
|
||||
qmd_section_ids = {s['section_id'] for s in sections}
|
||||
quiz_section_ids = {s['section_id'] for s in quiz_sections}
|
||||
|
||||
# Check for mismatches
|
||||
quiz_not_in_qmd = quiz_section_ids - qmd_section_ids
|
||||
qmd_not_in_quiz = qmd_section_ids - quiz_section_ids
|
||||
|
||||
print(f"\n📋 Section ID Validation:")
|
||||
if not quiz_not_in_qmd and not qmd_not_in_quiz:
|
||||
print(f" ✅ All section IDs match perfectly")
|
||||
else:
|
||||
if quiz_not_in_qmd:
|
||||
print(f" ❌ Quiz sections NOT found in QMD file:")
|
||||
for section_id in sorted(quiz_not_in_qmd):
|
||||
# Find section title from quiz data
|
||||
section_title = next((s['section_title'] for s in quiz_sections if s['section_id'] == section_id), 'Unknown')
|
||||
print(f" - {section_id} ({section_title})")
|
||||
|
||||
if qmd_not_in_quiz:
|
||||
print(f" ⚠️ QMD sections NOT found in quiz file:")
|
||||
for section_id in sorted(qmd_not_in_quiz):
|
||||
# Find section title from QMD data
|
||||
section_title = next((s['section_title'] for s in sections if s['section_id'] == section_id), 'Unknown')
|
||||
print(f" - {section_id} ({section_title})")
|
||||
else:
|
||||
print(f"\n⚠️ Could not extract sections from quiz file for validation")
|
||||
|
||||
except Exception as e:
|
||||
print(f"\n⚠️ Could not validate section IDs: {str(e)}")
|
||||
else:
|
||||
print(f"⚠️ The quiz file '{quiz_file}' referenced in the frontmatter of {file_path} does not exist.")
|
||||
else:
|
||||
|
||||
Reference in New Issue
Block a user