From 1c299cddb0c22dddaaf60c381e999c8eea9a1c34 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Sun, 9 Nov 2025 14:38:44 -0500
Subject: [PATCH] docs: add comprehensive docstrings to optimization modules
 16-19

- Add Args/Returns/Example/Hints to key functions
- Improve documentation for compare_model_sizes (16)
- Enhance function documentation in compression (17)
- Add docstring details for acceleration (18)
- Improve benchmarking function docs (19)
---
 .../16_quantization/quantization_dev.py      | 19 ++++++++++++
 .../source/17_compression/compression_dev.py |  6 ++++
 .../18_acceleration/acceleration_dev.py      | 29 +++++++++++++++++++
 .../19_benchmarking/benchmarking_dev.py      | 11 +++++++
 4 files changed, 65 insertions(+)

diff --git a/modules/source/16_quantization/quantization_dev.py b/modules/source/16_quantization/quantization_dev.py
index 5bfc9874..27a4b8b0 100644
--- a/modules/source/16_quantization/quantization_dev.py
+++ b/modules/source/16_quantization/quantization_dev.py
@@ -1231,6 +1231,25 @@ def compare_model_sizes(original_model, quantized_model) -> Dict[str, float]:
     2. Calculate bytes used (FP32 vs INT8)
     3. Include quantization overhead
     4. Return comparison metrics
+
+    Args:
+        original_model: Model before quantization
+        quantized_model: Model after quantization
+
+    Returns:
+        Dictionary with 'original_mb', 'quantized_mb', 'reduction_ratio', 'memory_saved_mb'
+
+    EXAMPLE:
+    >>> model = Sequential(Linear(100, 50), Linear(50, 10))
+    >>> quantize_model(model)
+    >>> stats = compare_model_sizes(model, model)  # Same model after in-place quantization
+    >>> print(f"Model is {stats['reduction_ratio']:.1f}x smaller")
+    Model is 4.0x smaller
+
+    HINTS:
+    - FP32 uses 4 bytes per parameter, INT8 uses 1 byte
+    - Include scale/zero_point overhead (2 values per quantized layer)
+    - Expected ratio: ~4x for INT8 quantization
     """
     ### BEGIN SOLUTION
     # Count original model parameters
diff --git a/modules/source/17_compression/compression_dev.py b/modules/source/17_compression/compression_dev.py
index 5d863303..f72d1521 100644
--- a/modules/source/17_compression/compression_dev.py
+++ b/modules/source/17_compression/compression_dev.py
@@ -331,6 +331,12 @@ def measure_sparsity(model) -> float:
     3. Count total parameters
     4. Return percentage: zeros / total * 100
 
+    Args:
+        model: Model with .parameters() method
+
+    Returns:
+        Sparsity percentage (0.0-100.0)
+
     EXAMPLE:
     >>> model = Sequential(Linear(10, 5), Linear(5, 2))
     >>> sparsity = measure_sparsity(model)
diff --git a/modules/source/18_acceleration/acceleration_dev.py b/modules/source/18_acceleration/acceleration_dev.py
index 734fee2d..3d25f8cf 100644
--- a/modules/source/18_acceleration/acceleration_dev.py
+++ b/modules/source/18_acceleration/acceleration_dev.py
@@ -267,6 +267,13 @@ def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor:
     2. Use NumPy's optimized dot product (calls BLAS GEMM)
     3. Return result wrapped in Tensor
 
+    Args:
+        a: First tensor for multiplication (M×K or batch×M×K)
+        b: Second tensor for multiplication (K×N or batch×K×N)
+
+    Returns:
+        Result tensor of shape (M×N or batch×M×N)
+
     EXAMPLE:
     Matrix multiplication visualization:
     >>> a = Tensor([[1, 2], [3, 4]])  # 2×2
@@ -443,6 +450,12 @@ def fused_gelu(x: Tensor) -> Tensor:
     2. Avoid creating temporary arrays
     3. Let NumPy's broadcasting handle vectorization
 
+    Args:
+        x: Input tensor for GELU activation
+
+    Returns:
+        GELU-activated tensor (same shape as input)
+
     EXAMPLE:
     >>> x = Tensor([-2, -1, 0, 1, 2])
     >>> result = fused_gelu(x)
@@ -538,11 +551,27 @@ def unfused_gelu(x: Tensor) -> Tensor:
     2. Create temporary Tensor objects for each step
     3. This simulates real memory allocation overhead
 
+    Args:
+        x: Input tensor
+
+    Returns:
+        GELU-activated tensor (same shape as input)
+
+    EXAMPLE:
+    >>> x = Tensor([0.5, 1.0, -0.5])
+    >>> result = unfused_gelu(x)
+    >>> print(result.shape)
+    (3,)  # Same as input
+
     PERFORMANCE IMPACT:
     - Creates 7 temporary arrays
     - Each array allocation/deallocation has overhead
     - More memory bandwidth usage
     - Potential cache misses between operations
+
+    HINTS:
+    - Create each step as: temp = Tensor(operation)
+    - This forces memory allocation for educational comparison
     """
     ### BEGIN SOLUTION
     # Unfused version - creates many intermediate arrays
diff --git a/modules/source/19_benchmarking/benchmarking_dev.py b/modules/source/19_benchmarking/benchmarking_dev.py
index d96b4b6b..29632fd5 100644
--- a/modules/source/19_benchmarking/benchmarking_dev.py
+++ b/modules/source/19_benchmarking/benchmarking_dev.py
@@ -402,6 +402,9 @@ def precise_timer():
     3. Return elapsed time when context exits
     4. Provide warmup capability for JIT compilation
 
+    Yields:
+        Timer object with .elapsed attribute (set after context exits)
+
     EXAMPLE:
     >>> with precise_timer() as timer:
     ...     time.sleep(0.1)  # Some operation
@@ -1721,6 +1724,14 @@ def compare_optimization_techniques(base_model: Any, optimized_models: List[Any]
     3. Generate insights about which optimizations work best
     4. Create recommendation matrix for different use cases
 
+    Args:
+        base_model: Baseline model (unoptimized)
+        optimized_models: List of models with different optimizations applied
+        datasets: List of datasets for evaluation
+
+    Returns:
+        Dictionary with 'base_metrics', 'optimized_results', 'improvements', 'recommendations'
+
     EXAMPLE:
     >>> models = [base_model, quantized_model, pruned_model, distilled_model]
     >>> results = compare_optimization_techniques(base_model, models[1:], datasets)
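
For review context, the size arithmetic behind the new compare_model_sizes HINTS can be
sketched in a few lines. This is a minimal sketch assuming FP32 weights cost 4 bytes per
parameter, INT8 weights cost 1 byte, and each quantized layer adds two float32 overhead
values (scale, zero_point); sketch_model_size_stats and its inputs are hypothetical names,
not the module's API.

import numpy as np

def sketch_model_size_stats(param_arrays, num_quantized_layers):
    # Hypothetical helper: FP32 size vs. INT8 size for the same parameters.
    n_params = sum(p.size for p in param_arrays)
    original_bytes = n_params * 4                       # FP32: 4 bytes/param
    quantized_bytes = n_params * 1                      # INT8: 1 byte/param
    quantized_bytes += num_quantized_layers * 2 * 4     # scale + zero_point (float32) per layer
    return {
        "original_mb": original_bytes / 1e6,
        "quantized_mb": quantized_bytes / 1e6,
        "reduction_ratio": original_bytes / quantized_bytes,
        "memory_saved_mb": (original_bytes - quantized_bytes) / 1e6,
    }

# Two Linear layers as in the docstring example: 100->50 and 50->10 (weights + biases).
params = [np.zeros((100, 50)), np.zeros(50), np.zeros((50, 10)), np.zeros(10)]
stats = sketch_model_size_stats(params, num_quantized_layers=2)
print(f"Model is {stats['reduction_ratio']:.1f}x smaller")  # -> Model is 4.0x smaller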
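
The measure_sparsity contract (zeros / total * 100) likewise reduces to a couple of NumPy
calls; in this sketch a plain list of arrays stands in for the model's .parameters().

def sketch_measure_sparsity(param_arrays):
    # Sparsity = percentage of exactly-zero weights across all parameters.
    zeros = sum(int(np.count_nonzero(p == 0)) for p in param_arrays)
    total = sum(p.size for p in param_arrays)
    return zeros / total * 100.0

w = np.array([[0.0, 1.5], [0.0, -2.0]])
print(sketch_measure_sparsity([w]))  # -> 50.0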
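
The fused vs. unfused GELU pair in module 18 contrasts one NumPy expression against a
chain of materialized temporaries. A sketch assuming the common tanh approximation of
GELU (the module's exact formula may differ); the seven t1..t7 temporaries mirror the
"Creates 7 temporary arrays" note in the unfused docstring.

import numpy as np

C = np.sqrt(2.0 / np.pi)

def sketch_fused_gelu(x: np.ndarray) -> np.ndarray:
    # One expression: NumPy evaluates it with a minimum of intermediate arrays.
    return 0.5 * x * (1.0 + np.tanh(C * (x + 0.044715 * x**3)))

def sketch_unfused_gelu(x: np.ndarray) -> np.ndarray:
    # Each step materializes a full intermediate array (7 temporaries).
    t1 = x**3
    t2 = 0.044715 * t1
    t3 = x + t2
    t4 = C * t3
    t5 = np.tanh(t4)
    t6 = 1.0 + t5
    t7 = 0.5 * x
    return t7 * t6

x = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])
assert np.allclose(sketch_fused_gelu(x), sketch_unfused_gelu(x))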
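
Finally, the precise_timer contract documented in module 19 (a context manager whose
yielded object gains an .elapsed attribute on exit) can be sketched with contextlib and
time.perf_counter; the _Timer class is illustrative, not the module's implementation,
and the warmup capability mentioned in the docstring is omitted here.

import time
from contextlib import contextmanager

class _Timer:
    elapsed: float = 0.0  # set when the context exits

@contextmanager
def sketch_precise_timer():
    timer = _Timer()
    start = time.perf_counter()  # monotonic, high-resolution clock
    try:
        yield timer
    finally:
        timer.elapsed = time.perf_counter() - start

with sketch_precise_timer() as timer:
    time.sleep(0.1)
print(f"{timer.elapsed:.3f}s")  # ~0.100s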