From 1c299cddb0c22dddaaf60c381e999c8eea9a1c34 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Sun, 9 Nov 2025 14:38:44 -0500
Subject: [PATCH] docs: add comprehensive docstrings to optimization modules
 16-19

- Add Args/Returns/Example/Hints to key functions
- Improve documentation for compare_model_sizes (16)
- Enhance function documentation in compression (17)
- Add docstring details for acceleration (18)
- Improve benchmarking function docs (19)
---
 .../16_quantization/quantization_dev.py      | 19 ++++++++++++
 .../source/17_compression/compression_dev.py |  6 ++++
 .../18_acceleration/acceleration_dev.py      | 29 +++++++++++++++++++
 .../19_benchmarking/benchmarking_dev.py      | 11 +++++++
 4 files changed, 65 insertions(+)

diff --git a/modules/source/16_quantization/quantization_dev.py b/modules/source/16_quantization/quantization_dev.py
index 5bfc9874..27a4b8b0 100644
--- a/modules/source/16_quantization/quantization_dev.py
+++ b/modules/source/16_quantization/quantization_dev.py
@@ -1231,6 +1231,25 @@ def compare_model_sizes(original_model, quantized_model) -> Dict[str, float]:
     2. Calculate bytes used (FP32 vs INT8)
     3. Include quantization overhead
     4. Return comparison metrics
+
+    Args:
+        original_model: Model before quantization
+        quantized_model: Model after quantization
+
+    Returns:
+        Dictionary with 'original_mb', 'quantized_mb', 'reduction_ratio', 'memory_saved_mb'
+
+    EXAMPLE:
+    >>> model = Sequential(Linear(100, 50), Linear(50, 10))
+    >>> quantize_model(model)
+    >>> stats = compare_model_sizes(model, model)  # Same model after in-place quantization
+    >>> print(f"Model is {stats['reduction_ratio']:.1f}x smaller")
+    Model is 4.0x smaller
+
+    HINTS:
+    - FP32 uses 4 bytes per parameter, INT8 uses 1 byte
+    - Include scale/zero_point overhead (2 values per quantized layer)
+    - Expected ratio: ~4x for INT8 quantization
     """
     ### BEGIN SOLUTION
     # Count original model parameters
diff --git a/modules/source/17_compression/compression_dev.py b/modules/source/17_compression/compression_dev.py
index 5d863303..f72d1521 100644
--- a/modules/source/17_compression/compression_dev.py
+++ b/modules/source/17_compression/compression_dev.py
@@ -331,6 +331,12 @@ def measure_sparsity(model) -> float:
     3. Count total parameters
     4. Return percentage: zeros / total * 100
 
+    Args:
+        model: Model with .parameters() method
+
+    Returns:
+        Sparsity percentage (0.0-100.0)
+
     EXAMPLE:
     >>> model = Sequential(Linear(10, 5), Linear(5, 2))
     >>> sparsity = measure_sparsity(model)
diff --git a/modules/source/18_acceleration/acceleration_dev.py b/modules/source/18_acceleration/acceleration_dev.py
index 734fee2d..3d25f8cf 100644
--- a/modules/source/18_acceleration/acceleration_dev.py
+++ b/modules/source/18_acceleration/acceleration_dev.py
@@ -267,6 +267,13 @@ def vectorized_matmul(a: Tensor, b: Tensor) -> Tensor:
     2. Use NumPy's optimized dot product (calls BLAS GEMM)
     3. Return result wrapped in Tensor
 
+    Args:
+        a: First tensor for multiplication (M×K or batch×M×K)
+        b: Second tensor for multiplication (K×N or batch×K×N)
+
+    Returns:
+        Result tensor of shape (M×N or batch×M×N)
+
     EXAMPLE:
     Matrix multiplication visualization:
     >>> a = Tensor([[1, 2], [3, 4]])  # 2×2
@@ -443,6 +450,12 @@ def fused_gelu(x: Tensor) -> Tensor:
     2. Avoid creating temporary arrays
     3. Let NumPy's broadcasting handle vectorization
 
+    Args:
+        x: Input tensor for GELU activation
+
+    Returns:
+        GELU-activated tensor (same shape as input)
+
     EXAMPLE:
     >>> x = Tensor([-2, -1, 0, 1, 2])
     >>> result = fused_gelu(x)
@@ -538,11 +551,27 @@ def unfused_gelu(x: Tensor) -> Tensor:
     2. Create temporary Tensor objects for each step
     3. This simulates real memory allocation overhead
 
+    Args:
+        x: Input tensor
+
+    Returns:
+        GELU-activated tensor (same shape as input)
+
+    EXAMPLE:
+    >>> x = Tensor([0.5, 1.0, -0.5])
+    >>> result = unfused_gelu(x)
+    >>> print(result.shape)
+    (3,)  # Same as input
+
     PERFORMANCE IMPACT:
     - Creates 7 temporary arrays
     - Each array allocation/deallocation has overhead
     - More memory bandwidth usage
     - Potential cache misses between operations
+
+    HINTS:
+    - Create each step as: temp = Tensor(operation)
+    - This forces memory allocation for educational comparison
     """
     ### BEGIN SOLUTION
     # Unfused version - creates many intermediate arrays
diff --git a/modules/source/19_benchmarking/benchmarking_dev.py b/modules/source/19_benchmarking/benchmarking_dev.py
index d96b4b6b..29632fd5 100644
--- a/modules/source/19_benchmarking/benchmarking_dev.py
+++ b/modules/source/19_benchmarking/benchmarking_dev.py
@@ -402,6 +402,9 @@ def precise_timer():
     3. Return elapsed time when context exits
     4. Provide warmup capability for JIT compilation
 
+    Yields:
+        Timer object with .elapsed attribute (set after context exits)
+
     EXAMPLE:
     >>> with precise_timer() as timer:
     ...     time.sleep(0.1)  # Some operation
@@ -1721,6 +1724,14 @@ def compare_optimization_techniques(base_model: Any, optimized_models: List[Any]
     3. Generate insights about which optimizations work best
     4. Create recommendation matrix for different use cases
 
+    Args:
+        base_model: Baseline model (unoptimized)
+        optimized_models: List of models with different optimizations applied
+        datasets: List of datasets for evaluation
+
+    Returns:
+        Dictionary with 'base_metrics', 'optimized_results', 'improvements', 'recommendations'
+
     EXAMPLE:
     >>> models = [base_model, quantized_model, pruned_model, distilled_model]
     >>> results = compare_optimization_techniques(base_model, models[1:], datasets)
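
For review context, the size arithmetic behind the new compare_model_sizes HINTS can be
sketched in a few lines. This is a minimal sketch assuming FP32 weights cost 4 bytes per
parameter, INT8 weights cost 1 byte, and each quantized layer adds two float32 overhead
values (scale, zero_point); sketch_model_size_stats and its inputs are hypothetical names,
not the module's API.

import numpy as np

def sketch_model_size_stats(param_arrays, num_quantized_layers):
    # Hypothetical helper: FP32 size vs. INT8 size for the same parameters.
    n_params = sum(p.size for p in param_arrays)
    original_bytes = n_params * 4                       # FP32: 4 bytes/param
    quantized_bytes = n_params * 1                      # INT8: 1 byte/param
    quantized_bytes += num_quantized_layers * 2 * 4     # scale + zero_point (float32) per layer
    return {
        "original_mb": original_bytes / 1e6,
        "quantized_mb": quantized_bytes / 1e6,
        "reduction_ratio": original_bytes / quantized_bytes,
        "memory_saved_mb": (original_bytes - quantized_bytes) / 1e6,
    }

# Two Linear layers as in the docstring example: 100->50 and 50->10 (weights + biases).
params = [np.zeros((100, 50)), np.zeros(50), np.zeros((50, 10)), np.zeros(10)]
stats = sketch_model_size_stats(params, num_quantized_layers=2)
print(f"Model is {stats['reduction_ratio']:.1f}x smaller")  # -> Model is 4.0x smaller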
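
The measure_sparsity contract (zeros / total * 100) likewise reduces to a couple of NumPy
calls; in this sketch a plain list of arrays stands in for the model's .parameters().

def sketch_measure_sparsity(param_arrays):
    # Sparsity = percentage of exactly-zero weights across all parameters.
    zeros = sum(int(np.count_nonzero(p == 0)) for p in param_arrays)
    total = sum(p.size for p in param_arrays)
    return zeros / total * 100.0

w = np.array([[0.0, 1.5], [0.0, -2.0]])
print(sketch_measure_sparsity([w]))  # -> 50.0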
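
The fused vs. unfused GELU pair in module 18 contrasts one NumPy expression against a
chain of materialized temporaries. A sketch assuming the common tanh approximation of
GELU (the module's exact formula may differ); the seven t1..t7 temporaries mirror the
"Creates 7 temporary arrays" note in the unfused docstring.

import numpy as np

C = np.sqrt(2.0 / np.pi)

def sketch_fused_gelu(x: np.ndarray) -> np.ndarray:
    # One expression: NumPy evaluates it with a minimum of intermediate arrays.
    return 0.5 * x * (1.0 + np.tanh(C * (x + 0.044715 * x**3)))

def sketch_unfused_gelu(x: np.ndarray) -> np.ndarray:
    # Each step materializes a full intermediate array (7 temporaries).
    t1 = x**3
    t2 = 0.044715 * t1
    t3 = x + t2
    t4 = C * t3
    t5 = np.tanh(t4)
    t6 = 1.0 + t5
    t7 = 0.5 * x
    return t7 * t6

x = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])
assert np.allclose(sketch_fused_gelu(x), sketch_unfused_gelu(x))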
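
Finally, the precise_timer contract documented in module 19 (a context manager whose
yielded object gains an .elapsed attribute on exit) can be sketched with contextlib and
time.perf_counter; the _Timer class is illustrative, not the module's implementation,
and the warmup capability mentioned in the docstring is omitted here.

import time
from contextlib import contextmanager

class _Timer:
    elapsed: float = 0.0  # set when the context exits

@contextmanager
def sketch_precise_timer():
    timer = _Timer()
    start = time.perf_counter()  # monotonic, high-resolution clock
    try:
        yield timer
    finally:
        timer.elapsed = time.perf_counter() - start

with sketch_precise_timer() as timer:
    time.sleep(0.1)
print(f"{timer.elapsed:.3f}s")  # ~0.100s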