feat(glossary): add responsible engineering terms and update glossary tools

- Add responsible_engr concepts, glossary, and quizzes files
- Update serving glossary with new terms
- Update build_global_glossary.py script
- Add Clos, Marz, Pease to codespell ignore (proper names)
Author: Vijay Janapa Reddi
Date: 2026-01-07 13:36:55 -05:00
commit d43bf848c6 (parent ab5b180fc5)
6 changed files with 442 additions and 3 deletions


@@ -33,3 +33,6 @@ socio-economic
rin
rouge
FPR
Clos
Marz
Pease


@@ -0,0 +1,111 @@
concept_map:
  source: responsible_engr.qmd
  generated_date: 2026-01-07
  primary_concepts:
    - Responsible ML Systems Engineering
    - Silent Failure Modes
    - Engineering Responsibility Gap
    - Fairness Metrics
    - Environmental Impact of AI
    - Model Documentation Standards
    - Total Cost of Ownership (TCO)
  secondary_concepts:
    - Disaggregated Evaluation
    - Proxy Signals
    - Feedback Loops in Recommendation
    - Disparity in Error Rates
    - Technical vs Social Objectives
    - Green AI vs Red AI
    - Brain Energy Efficiency
    - Hierarchical Distributed Intelligence
  technical_terms:
    - Silent Bias
    - Demographic Parity
    - Equal Opportunity
    - Equalized Odds
    - False Positive Rate (FPR)
    - True Positive Rate (TPR)
    - Confusion Matrix
    - Model Cards
    - Datasheets for Datasets
    - TCO (Total Cost of Ownership)
    - Green AI
    - Carbon Footprint
    - Disaggregated Metrics
  methodologies:
    - Pre-Deployment Assessment
    - Incident Response Preparation
    - Continuous Fairness Monitoring
    - Stratified Evaluation
    - Intersectional Analysis
    - Carbon-Aware Training
    - TCO Calculation Methodology
  applications:
    - Amazon Recruiting Tool (Gender Bias Case)
    - COMPAS Recidivism Prediction
    - YouTube Recommendation Feedback Loops
    - Gender Shades Facial Recognition Study
    - Twitter Image Cropping Analysis
    - Loan Approval Fairness Analysis
    - Medical Diagnosis Screening
  keywords:
    - responsible engineering
    - AI ethics
    - machine learning fairness
    - silent failure
    - bias amplification
    - model cards
    - demographic parity
    - equal opportunity
    - sustainability
    - Green AI
    - environmental impact
    - carbon footprint
    - total cost of ownership
    - TCO
    - disaggregated evaluation
    - model documentation
    - incident response
    - feedback loops
    - AI democratization
  topics_covered:
    - topic: Foundations of Responsible Engineering
      subtopics:
        - Definition of responsible engineering
        - Difference between technical correctness and responsible outcomes
        - Why engineers must lead on responsibility
        - Silent failure modes in ML systems
    - topic: Case Studies in Engineering Failures
      subtopics:
        - Amazon's biased recruiting tool
        - COMPAS recidivism prediction disparities
        - YouTube's recommendation feedback loops
        - Gender Shades facial recognition disparities
    - topic: Frameworks for Responsibility
      subtopics:
        - Pre-deployment assessment checklists
        - Model cards for documentation
        - Datasheets for datasets
        - Incident response procedures
    - topic: Quantitative Fairness Measurement
      subtopics:
        - Demographic parity
        - Equality of opportunity
        - Equalized odds
        - Disaggregated metrics and relative disparity
    - topic: Environmental and Economic Sustainability
      subtopics:
        - Computational resource costs
        - Carbon footprint of training and inference
        - Total Cost of Ownership (TCO) methodology
        - Biological inspiration for energy efficiency (The Brain)


@@ -0,0 +1,117 @@
{
  "metadata": {
    "chapter": "responsible_engr",
    "version": "1.0.0",
    "generated": "2026-01-07T12:00:00.000000",
    "total_terms": 15,
    "standardized": true,
    "last_updated": "2026-01-07T12:00:00.000000"
  },
  "terms": [
    {
      "term": "silent bias",
      "definition": "Model unfairness that produces valid-looking but discriminatory outputs, evading traditional error monitoring and requiring disaggregated evaluation to detect.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": []
    },
    {
      "term": "demographic parity",
      "definition": "A fairness criterion requiring that the probability of receiving a positive prediction is independent of group membership across protected attributes.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["equalized odds"]
    },
    {
      "term": "equal opportunity",
      "definition": "A fairness criterion requiring equal true positive rates among qualified applicants across different demographic groups.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["equalized odds"]
    },
    {
      "term": "equalized odds",
      "definition": "A fairness criterion requiring that both true positive and false positive rates are equal across different demographic groups.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["demographic parity", "equal opportunity"]
    },
    {
      "term": "model cards",
      "definition": "A standardized format for documenting machine learning models, capturing information essential for responsible deployment, including intended use, performance factors, and ethical considerations.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["datasheets for datasets"]
    },
    {
      "term": "datasheets for datasets",
      "definition": "Documentation for training data that captures provenance, collection methodology, demographic composition, and known limitations affecting model behavior.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["model cards"]
    },
    {
      "term": "total cost of ownership (tco)",
      "definition": "A comprehensive financial metric for ML systems encompassing training, inference, and operational costs over the system's entire lifecycle.",
      "chapter_source": "responsible_engr",
      "aliases": ["tco"],
      "see_also": []
    },
    {
      "term": "green ai",
      "definition": "A movement in AI research and practice that prioritizes computational efficiency and energy consumption as primary metrics alongside traditional performance metrics like accuracy.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": []
    },
    {
      "term": "disaggregated evaluation",
      "definition": "The practice of breaking down model performance metrics by demographic groups or other factors to reveal disparities that are hidden by aggregate measures.",
      "chapter_source": "responsible_engr",
      "aliases": ["stratified evaluation"],
      "see_also": []
    },
    {
      "term": "silent failure",
      "definition": "A system failure mode where an ML model continues to produce plausible-looking outputs that are gradually less accurate or contextually relevant without triggering conventional error alerts.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["silent bias"]
    },
    {
      "term": "feedback loop",
      "definition": "A phenomenon where a model's outputs influence its own future training data, potentially reinforcing and amplifying initial biases over time.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": []
    },
    {
      "term": "intersectional analysis",
      "definition": "Evaluation that considers combinations of demographic attributes (e.g., race and gender simultaneously) to detect concentrated harms not visible in single-factor analysis.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["disaggregated evaluation"]
    },
    {
      "term": "red ai",
      "definition": "AI research and development that prioritizes maximizing accuracy or performance without regard for the increasing computational and environmental costs required.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["green ai"]
    },
    {
      "term": "carbon footprint",
      "definition": "The total greenhouse gas emissions, typically measured in CO2 equivalent, produced directly and indirectly by training and operating an ML system.",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": ["green ai"]
    },
    {
      "term": "responsible engineering gap",
      "definition": "The disparity between technical optimization success (e.g., high benchmark accuracy) and responsible deployment outcomes (e.g., fairness and safety in production).",
      "chapter_source": "responsible_engr",
      "aliases": [],
      "see_also": []
    }
  ]
}
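A chapter glossary in this shape can be sanity-checked mechanically, for example by verifying that every `see_also` entry resolves to a term defined in the same file. A minimal sketch (the inline JSON literal is a trimmed stand-in for the real glossary file, not its actual contents):

```python
import json

# Trimmed stand-in for a chapter *_glossary.json file.
glossary = json.loads("""
{
  "metadata": {"chapter": "responsible_engr", "total_terms": 2},
  "terms": [
    {"term": "demographic parity", "see_also": ["equalized odds"]},
    {"term": "equalized odds", "see_also": ["demographic parity"]}
  ]
}
""")

# Collect defined terms, then list any cross-reference with no target.
known = {entry["term"] for entry in glossary["terms"]}
dangling = [
    (entry["term"], ref)
    for entry in glossary["terms"]
    for ref in entry.get("see_also", [])
    if ref not in known
]
print(dangling)  # [] when every see_also resolves
```

The same loop could run inside a build script such as build_global_glossary.py as a lint step, though the commit above does not include one.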


@@ -0,0 +1,195 @@
{
  "metadata": {
    "source_file": "quarto/contents/vol1/responsible_engr/responsible_engr.qmd",
    "total_sections": 5,
    "sections_with_quizzes": 5,
    "sections_without_quizzes": 0
  },
  "sections": [
    {
      "section_id": "#sec-responsible-engineering-introduction-7a3f",
      "section_title": "Introduction",
      "quiz_data": {
        "quiz_needed": true,
        "rationale": {
          "focus_areas": [
            "Definition of responsible engineering",
            "Why engineers must lead on responsibility"
          ],
          "question_strategy": "Test foundational concepts and the engineering role in responsibility.",
          "difficulty_progression": "Start with definitions and move to the 'why'.",
          "integration": "Connect responsibility to technical decision-making.",
          "ranking_explanation": "Critical introduction to the shift in engineering mindset."
        },
        "questions": [
          {
            "question_type": "MCQ",
            "question": "Why is responsible engineering particularly critical for machine learning systems compared to traditional software?",
            "choices": [
              "ML systems are more expensive to develop.",
              "ML systems fail silently through biased outputs that appear normal.",
              "Traditional software does not require any testing.",
              "ML systems always produce deterministic results."
            ],
            "answer": "The correct answer is B. ML systems fail silently through biased outputs that appear normal. Unlike traditional software that crashes visibly, ML systems can produce discriminatory results for months without triggering conventional alerts, necessitating a proactive responsibility framework.",
            "learning_objective": "Contrast failure modes of ML systems and traditional software."
          },
          {
            "question_type": "SHORT",
            "question": "Why can responsibility not be delegated exclusively to ethics boards or legal departments in an ML project?",
            "answer": "Engineers occupy a critical position because technical decisions made during inception—such as model architecture, data pipeline design, and optimization objectives—define and constrain the space for all subsequent fairness interventions. Ethics boards often only see the system after these decisive foundational choices have been made.",
            "learning_objective": "Explain the engineer's role in proactive responsibility design."
          }
        ]
      }
    },
    {
      "section_id": "#sec-responsible-engineering-engineering-responsibility-gap-4d82",
      "section_title": "The Engineering Responsibility Gap",
      "quiz_data": {
        "quiz_needed": true,
        "rationale": {
          "focus_areas": [
            "Technical correctness vs responsible outcomes",
            "Amazon and COMPAS case studies"
          ],
          "question_strategy": "Use real-world examples to illustrate the gap between optimization and outcomes.",
          "difficulty_progression": "Analyze specific failures to identify systemic patterns.",
          "integration": "Connect optimization objectives to social consequences.",
          "ranking_explanation": "Case studies are essential for illustrating how 'correct' code can fail socially."
        },
        "questions": [
          {
            "question_type": "MCQ",
            "question": "In the Amazon recruiting tool case, why did removing explicit gender labels fail to eliminate bias?",
            "choices": [
              "The model was not trained for enough epochs.",
              "The model learned proxy signals (like college names) that correlated with gender.",
              "The engineers forgot to delete the gender column.",
              "The dataset was too small to be accurate."
            ],
            "answer": "The correct answer is B. The model learned proxy signals that correlated with gender. Even without direct labels, the model reconstructed protected attributes from other data features like school names and activity descriptions that encoded historical gender patterns.",
            "learning_objective": "Understand how models learn protected attributes through proxy variables."
          },
          {
            "question_type": "SHORT",
            "question": "Explain how a 'feedback loop' in a recommendation system can lead to bias amplification.",
            "answer": "Feedback loops occur when a model's predictions influence the data it later observes as training input. For example, if a system recommends provocative content to increase watch time, and users engage with it, the system interprets this as success and recommends even more extreme content, reinforcing and amplifying the initial algorithmic bias over time.",
            "learning_objective": "Analyze the mechanics of bias amplification in ML systems."
          }
        ]
      }
    },
    {
      "section_id": "#sec-responsible-engineering-checklist-5e2c",
      "section_title": "The Responsible Engineering Checklist",
      "quiz_data": {
        "quiz_needed": true,
        "rationale": {
          "focus_areas": [
            "Model cards and documentation",
            "Fairness metrics calculation"
          ],
          "question_strategy": "Focus on the practical application of documentation and measurement tools.",
          "difficulty_progression": "Move from documentation types to calculating specific disparities.",
          "integration": "Integrate quantitative metrics with qualitative documentation standards.",
          "ranking_explanation": "These are the core tools a practitioner uses to implement responsibility."
        },
        "questions": [
          {
            "question_type": "MCQ",
            "question": "What is the primary purpose of a 'Model Card' in responsible engineering?",
            "choices": [
              "To act as a warranty for the model software.",
              "To provide a standardized format for documenting intended use, performance factors, and ethical considerations.",
              "To store the binary weights of the trained neural network.",
              "To list all the developers who worked on the project."
            ],
            "answer": "The correct answer is B. To provide a standardized format for documenting intended use, performance factors, and ethical considerations. Model cards ensure that essential context and limitations are communicated to users and auditors, preventing inappropriate model reuse.",
            "learning_objective": "Explain the role of standardized documentation in ML accountability."
          },
          {
            "question_type": "SHORT",
            "question": "Define 'Disaggregated Evaluation' and explain why aggregate accuracy metrics can be misleading.",
            "answer": "Disaggregated evaluation is the practice of breaking down performance metrics by demographic subgroups. Aggregate metrics can be misleading because a high overall accuracy (e.g., 95%) can conceal severe failures in a minority subgroup (e.g., 65% accuracy), a disparity that only becomes visible when evaluating groups separately.",
            "learning_objective": "Apply disaggregated evaluation concepts to detect performance disparities."
          }
        ]
      }
    },
    {
      "section_id": "#sec-responsible-engineering-fairness-worked-example",
      "section_title": "Fairness Worked Example",
      "quiz_data": {
        "quiz_needed": true,
        "rationale": {
          "focus_areas": [
            "Calculation of fairness metrics",
            "Demographic parity vs Equal opportunity"
          ],
          "question_strategy": "Quantitative assessment of fairness definitions.",
          "difficulty_progression": "Requires calculation based on provided confusion matrix data.",
          "integration": "Links mathematical definitions to social outcomes.",
          "ranking_explanation": "Crucial for ensuring students can quantitatively verify fairness claims."
        },
        "questions": [
          {
            "question_type": "MCQ",
            "question": "If Group A has an approval rate of 55% and Group B has an approval rate of 40%, which fairness criterion is being directly violated?",
            "choices": [
              "Equalized Odds",
              "Demographic Parity",
              "Numerical Precision",
              "Model Lineage"
            ],
            "answer": "The correct answer is B. Demographic Parity. Demographic parity requires that the probability of a positive outcome (approval) is independent of group membership. A 15 percentage point difference indicates a violation of this parity.",
            "learning_objective": "Distinguish between different mathematical definitions of fairness."
          },
          {
            "question_type": "SHORT",
            "question": "Explain the difference between 'Equal Opportunity' and 'Demographic Parity'.",
            "answer": "Demographic Parity requires equal outcome rates for all groups regardless of qualifications. Equal Opportunity instead conditions on qualification: it requires equal true positive rates (TPR), ensuring that those who would have a positive outcome (like repaying a loan) have an equal chance of being correctly identified by the model across all groups.",
            "learning_objective": "Compare demographic parity and equal opportunity metrics."
          }
        ]
      }
    },
    {
      "section_id": "#sec-responsible-engineering-environmental-cost-awareness-8f4d",
      "section_title": "Environmental and Cost Awareness",
      "quiz_data": {
        "quiz_needed": true,
        "rationale": {
          "focus_areas": [
            "Total Cost of Ownership (TCO)",
            "Environmental impact of ML"
          ],
          "question_strategy": "Focus on the economic and environmental dimensions of responsibility.",
          "difficulty_progression": "Analyze the components of TCO and their relative weights.",
          "integration": "Connect system efficiency to sustainability and business value.",
          "ranking_explanation": "Sustainability is a core component of modern engineering responsibility."
        },
        "questions": [
          {
            "question_type": "MCQ",
            "question": "For a successful production ML system, which cost component typically dominates the Total Cost of Ownership (TCO)?",
            "choices": [
              "Initial data labeling",
              "Hyperparameter search",
              "Inference costs",
              "Academic research grants"
            ],
            "answer": "The correct answer is C. Inference costs. For high-volume production systems, inference costs can be 10x to 1000x higher than training costs, as they compound across every query served to users over the system's lifetime.",
            "learning_objective": "Analyze the components of TCO in production ML systems."
          },
          {
            "question_type": "SHORT",
            "question": "How do model optimization techniques like quantization support both financial and environmental responsibility?",
            "answer": "Optimization techniques reduce the computational resources required per inference. Quantization (e.g., FP32 to INT8) typically reduces memory and compute needs by 2-4x. This lowers the electricity consumption (reducing carbon footprint) and the hardware requirements (reducing operational expenses and TCO) simultaneously.",
            "learning_objective": "Connect model optimization to environmental and economic outcomes."
          }
        ]
      }
    }
  ]
}
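The fairness criteria exercised in the quizzes above (demographic parity, equal opportunity, equalized odds) all reduce to rate comparisons over per-group confusion matrices. A minimal sketch, with invented counts chosen to reproduce the 55% vs 40% approval rates from the worked-example quiz:

```python
def rates(tp, fp, fn, tn):
    """Return (positive-prediction rate, TPR, FPR) from confusion counts."""
    total = tp + fp + fn + tn
    ppr = (tp + fp) / total  # share of the group receiving a positive prediction
    tpr = tp / (tp + fn)     # true positive rate (equal opportunity)
    fpr = fp / (fp + tn)     # false positive rate (with TPR: equalized odds)
    return ppr, tpr, fpr

# Hypothetical confusion-matrix counts for two groups (illustrative only).
group_a = rates(tp=40, fp=15, fn=10, tn=35)  # approval rate 55%
group_b = rates(tp=30, fp=10, fn=20, tn=40)  # approval rate 40%

ppr_gap = abs(group_a[0] - group_b[0])  # demographic parity gap
tpr_gap = abs(group_a[1] - group_b[1])  # equal opportunity gap
fpr_gap = abs(group_a[2] - group_b[2])  # equalized odds needs this gap too

print(f"demographic parity gap: {ppr_gap:.2f}")  # 0.15
print(f"equal opportunity gap:  {tpr_gap:.2f}")  # 0.20
print(f"FPR gap:                {fpr_gap:.2f}")  # 0.10
```

Note how the two groups here violate all three criteria at once, while a model could satisfy demographic parity yet still fail equal opportunity; which gap matters depends on the deployment context.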


@@ -1,3 +1,11 @@
{
  "metadata": {
    "chapter": "serving",
    "version": "1.0.0",
    "generated": "2026-01-07T12:00:00.000000",
    "total_terms": 0,
    "standardized": true,
    "last_updated": "2026-01-07T12:00:00.000000"
  },
  "terms": []
}


@@ -22,8 +22,13 @@ def load_chapter_glossaries():
     """Load all individual chapter glossary files."""
     # Get project root (4 levels up from this script)
     project_root = Path(__file__).parent.parent.parent.parent
-    base_dir = project_root / "quarto/contents/vol1"
-    json_files = list(base_dir.glob("**/*_glossary.json"))
+    # Scan both vol1 and vol2
+    json_files = []
+    for vol in ["vol1", "vol2"]:
+        base_dir = project_root / f"quarto/contents/{vol}"
+        if base_dir.exists():
+            json_files.extend(list(base_dir.glob("**/*_glossary.json")))
     print(f"📚 Found {len(json_files)} chapter glossary files")
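The revised scan can be exercised in isolation. A self-contained sketch of the same logic (the temporary directory layout below is fabricated for the demo and does not mirror the real repository):

```python
import tempfile
from pathlib import Path

def find_glossaries(project_root):
    """Collect chapter glossary JSON files from vol1 and vol2."""
    json_files = []
    for vol in ["vol1", "vol2"]:
        base_dir = project_root / f"quarto/contents/{vol}"
        if base_dir.exists():  # a volume may be absent in a partial checkout
            json_files.extend(base_dir.glob("**/*_glossary.json"))
    return json_files

# Build a throwaway tree with one glossary file per volume.
with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    for vol, chapter in [("vol1", "serving"), ("vol2", "responsible_engr")]:
        chap_dir = root / "quarto/contents" / vol / chapter
        chap_dir.mkdir(parents=True)
        (chap_dir / f"{chapter}_glossary.json").write_text("{}")
    found = find_glossaries(root)
    print(len(found))  # 2
```

The `base_dir.exists()` guard is what lets the same script run on checkouts that only contain vol1, which the single-volume version of the code implicitly assumed.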