From d43b6c5e33d0558376ea032b1a0e749dd04094c7 Mon Sep 17 00:00:00 2001
From: Vijay Janapa Reddi
Date: Sun, 14 Dec 2025 13:38:00 -0500
Subject: [PATCH] fix: align test directory numbering with module numbering (15-17)

- Renamed tests/15_memoization -> tests/17_memoization
- Renamed tests/16_quantization -> tests/15_quantization
- Renamed tests/17_compression -> tests/16_compression

Also expanded test coverage:
- Module 15 (Quantization): Added 4 new tests for edge cases, ordering, negative values, QuantizedLinear
- Module 16 (Compression): Added 5 new tests for target sparsity, large weight preservation, structured pruning
- Module 17 (Memoization): Added 4 new tests for multiple tokens, multiple layers, seq_pos tracking, error handling

All 24 tests for modules 15-17 now pass.
---
 .../15_memoization/test_kv_cache_core.py    |  91 ------
 .../run_all_tests.py                        |   0
 .../test_quantization_integration.py        |   0
 .../15_quantization/test_quantizer_core.py  | 206 ++++++++++++++
 .../run_all_tests.py                        |   0
 .../test_compression_integration.py         |   0
 .../16_compression/test_compressor_core.py  | 262 ++++++++++++++++++
 .../16_quantization/test_quantizer_core.py  |  94 -------
 .../17_compression/test_compressor_core.py  | 111 --------
 .../run_all_tests.py                        |   0
 .../17_memoization/test_kv_cache_core.py    | 212 ++++++++++++++
 .../test_progressive_integration.py         |   0
 .../test_tinygpt_integration.py             |   0
 13 files changed, 680 insertions(+), 296 deletions(-)
 delete mode 100644 tinytorch/tests/15_memoization/test_kv_cache_core.py
 rename tinytorch/tests/{16_quantization => 15_quantization}/run_all_tests.py (100%)
 rename tinytorch/tests/{16_quantization => 15_quantization}/test_quantization_integration.py (100%)
 create mode 100644 tinytorch/tests/15_quantization/test_quantizer_core.py
 rename tinytorch/tests/{17_compression => 16_compression}/run_all_tests.py (100%)
 rename tinytorch/tests/{17_compression => 16_compression}/test_compression_integration.py (100%)
 create mode 100644 tinytorch/tests/16_compression/test_compressor_core.py
 delete mode 100644 tinytorch/tests/16_quantization/test_quantizer_core.py
 delete mode 100644 tinytorch/tests/17_compression/test_compressor_core.py
 rename tinytorch/tests/{15_memoization => 17_memoization}/run_all_tests.py (100%)
 create mode 100644 tinytorch/tests/17_memoization/test_kv_cache_core.py
 rename tinytorch/tests/{15_memoization => 17_memoization}/test_progressive_integration.py (100%)
 rename tinytorch/tests/{15_memoization => 17_memoization}/test_tinygpt_integration.py (100%)

diff --git a/tinytorch/tests/15_memoization/test_kv_cache_core.py b/tinytorch/tests/15_memoization/test_kv_cache_core.py
deleted file mode 100644
index e3ea85836..000000000
--- a/tinytorch/tests/15_memoization/test_kv_cache_core.py
+++ /dev/null
@@ -1,91 +0,0 @@
-"""
-Module 15: KV Cache (Memoization) Core Tests
-=============================================
-
-These tests verify that KV caching works for efficient inference.
-
-WHY THESE TESTS MATTER:
------------------------
-KV caching is essential for efficient text generation:
-- Without cache: O(n²) per token (recompute all attention)
-- With cache: O(n) per token (reuse previous K,V)
-
-For generating 100 tokens, that's 100x speedup!
-
-WHAT WE TEST:
--------------
-1. KVCache can store key-value pairs
-2. Cache retrieval returns stored values
-3. Cache works across multiple layers
-"""
-
-import pytest
-import numpy as np
-import sys
-from pathlib import Path
-
-sys.path.insert(0, str(Path(__file__).parent.parent.parent))
-
-from tinytorch.core.tensor import Tensor
-
-
-class TestKVCacheBasics:
-    """Test basic KV cache functionality."""
-
-    def test_kv_cache_import(self):
-        """
-        WHAT: Verify KVCache can be imported.
-
-        WHY: Basic sanity check.
-        """
-        try:
-            from tinytorch.perf.memoization import KVCache
-            assert KVCache is not None
-        except ImportError as e:
-            pytest.skip(f"KVCache not yet exported: {e}")
-
-    def test_kv_cache_can_instantiate(self):
-        """
-        WHAT: Verify KVCache can be created.
-        """
-        try:
-            from tinytorch.perf.memoization import KVCache
-            # KVCache needs: batch_size, max_seq_len, num_layers, num_heads, head_dim
-            cache = KVCache(batch_size=1, max_seq_len=128, num_layers=2, num_heads=4, head_dim=16)
-            assert cache is not None
-        except ImportError:
-            pytest.skip("KVCache not yet exported")
-
-    def test_kv_cache_stores_and_retrieves(self):
-        """
-        WHAT: Verify cache can store and retrieve K,V tensors.
-
-        WHY: The whole point of the cache is to reuse computed values.
-        If storage/retrieval doesn't work, there's no speedup.
-        """
-        try:
-            from tinytorch.perf.memoization import KVCache
-        except ImportError:
-            pytest.skip("KVCache not yet exported")
-
-        # Create cache with proper dimensions
-        cache = KVCache(batch_size=1, max_seq_len=128, num_layers=2, num_heads=4, head_dim=16)
-
-        # Store some K,V pairs (cache expects one token at a time during generation)
-        layer_idx = 0
-        K = Tensor(np.random.randn(1, 4, 1, 16))  # (batch, heads, 1, dim) - one new token
-        V = Tensor(np.random.randn(1, 4, 1, 16))
-
-        cache.update(layer_idx, K, V)
-
-        # Retrieve
-        cached_K, cached_V = cache.get(layer_idx)
-
-        assert cached_K is not None, "Cache didn't store K"
-        assert cached_V is not None, "Cache didn't store V"
-        assert np.allclose(cached_K.data, K.data), "Retrieved K doesn't match stored"
-        assert np.allclose(cached_V.data, V.data), "Retrieved V doesn't match stored"
-
-
-if __name__ == "__main__":
-    pytest.main([__file__, "-v"])
diff --git a/tinytorch/tests/16_quantization/run_all_tests.py b/tinytorch/tests/15_quantization/run_all_tests.py
similarity index 100%
rename from tinytorch/tests/16_quantization/run_all_tests.py
rename to tinytorch/tests/15_quantization/run_all_tests.py
diff --git a/tinytorch/tests/16_quantization/test_quantization_integration.py b/tinytorch/tests/15_quantization/test_quantization_integration.py
similarity index 100%
rename from tinytorch/tests/16_quantization/test_quantization_integration.py
rename to tinytorch/tests/15_quantization/test_quantization_integration.py
diff --git a/tinytorch/tests/15_quantization/test_quantizer_core.py b/tinytorch/tests/15_quantization/test_quantizer_core.py
new file mode 100644
index 000000000..9a6a9061b
--- /dev/null
+++ b/tinytorch/tests/15_quantization/test_quantizer_core.py
@@ -0,0 +1,206 @@
+"""
+Module 15: Quantization Core Tests
+===================================
+
+These tests verify that quantization reduces model size correctly.
+
+WHY THESE TESTS MATTER:
+-----------------------
+Quantization converts FP32 (4 bytes) to INT8 (1 byte) = 4x smaller model.
+If quantization is broken:
+- Model stays big (defeats the purpose)
+- Accuracy drops too much (unusable)
+- Values overflow (numerical errors)
+
+WHAT WE TEST:
+-------------
+1. Quantization produces INT8 values
+2. Dequantization recovers approximate original values
+3. Model size actually decreases
+"""
+
+import pytest
+import numpy as np
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+from tinytorch.core.tensor import Tensor
+
+
+class TestQuantizationBasics:
+    """Test basic quantization functionality."""
+
+    def test_quantizer_import(self):
+        """Verify Quantizer can be imported."""
+        try:
+            from tinytorch.perf.quantization import Quantizer
+            assert Quantizer is not None
+        except ImportError as e:
+            pytest.skip(f"Quantizer not yet exported: {e}")
+
+    def test_quantize_produces_int8(self):
+        """
+        WHAT: Verify quantization produces INT8 values in [-128, 127].
+
+        WHY: INT8 is the target representation. Values outside this
+        range would overflow and produce garbage.
+        """
+        try:
+            from tinytorch.perf.quantization import Quantizer
+        except ImportError:
+            pytest.skip("Quantizer not yet exported")
+
+        # Create FP32 tensor
+        fp32_tensor = Tensor(np.random.randn(10, 10).astype(np.float32))
+
+        # Quantize
+        q_tensor, scale, zero_point = Quantizer.quantize_tensor(fp32_tensor)
+
+        # Check INT8 range
+        assert q_tensor.data.min() >= -128, "Quantized values below INT8 min"
+        assert q_tensor.data.max() <= 127, "Quantized values above INT8 max"
+
+    def test_dequantize_recovers_approximate_values(self):
+        """
+        WHAT: Verify dequantization recovers values close to original.
+
+        WHY: Quantization is lossy, but should be approximately reversible.
+        Large errors would destroy model accuracy.
+        """
+        try:
+            from tinytorch.perf.quantization import Quantizer
+        except ImportError:
+            pytest.skip("Quantizer not yet exported")
+
+        # Create FP32 tensor with known values
+        original = Tensor(np.array([0.5, -0.5, 1.0, -1.0]).astype(np.float32))
+
+        # Round trip: quantize then dequantize
+        q_tensor, scale, zero_point = Quantizer.quantize_tensor(original)
+        recovered = Quantizer.dequantize_tensor(q_tensor, scale, zero_point)
+
+        # Should be close (within ~1% for typical values)
+        max_error = np.max(np.abs(original.data - recovered.data))
+        assert max_error < 0.1, (
+            f"Dequantization error too large: {max_error}\n"
+            f"  Original: {original.data}\n"
+            f"  Recovered: {recovered.data}"
+        )
+
+
+class TestQuantizationAdvanced:
+    """Advanced quantization tests for edge cases and accuracy."""
+
+    def test_quantize_constant_tensor(self):
+        """
+        WHAT: Verify quantization handles constant tensors (all same value).
+
+        WHY: Constant tensors are an edge case where min=max. The algorithm
+        must handle this gracefully without division by zero.
+        """
+        try:
+            from tinytorch.perf.quantization import quantize_int8
+        except ImportError:
+            pytest.skip("quantize_int8 not yet exported")
+
+        # All zeros
+        constant = Tensor(np.zeros((4, 4), dtype=np.float32))
+        q_tensor, scale, zero_point = quantize_int8(constant)
+
+        # Should produce valid output without errors
+        assert q_tensor.data.shape == constant.data.shape, "Shape changed"
+
+    def test_quantize_preserves_relative_ordering(self):
+        """
+        WHAT: Verify quantization preserves relative ordering of values.
+
+        WHY: If [0.1, 0.2, 0.3] becomes [5, 4, 6], the model's predictions
+        would be garbage. Relative ordering must be preserved.
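+
+        ILLUSTRATIVE SKETCH (assumes a standard affine INT8 scheme; the
+        module's internals may differ):
+
+            scale = (x_max - x_min) / 255            # one step per INT8 level
+            q = round(x / scale) + zero_point        # then clip to [-128, 127]
+
+        Dividing by a positive scale, rounding, and clipping are all
+        monotone, so x_i <= x_j implies q_i <= q_j, which is exactly
+        what this test checks.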
+ """ + try: + from tinytorch.perf.quantization import quantize_int8 + except ImportError: + pytest.skip("quantize_int8 not yet exported") + + # Strictly increasing values + original = Tensor(np.array([0.1, 0.2, 0.3, 0.4, 0.5], dtype=np.float32)) + q_tensor, _, _ = quantize_int8(original) + + # Quantized values should be monotonically non-decreasing + q_data = q_tensor.data.astype(np.float32) + for i in range(len(q_data) - 1): + assert q_data[i] <= q_data[i + 1], ( + f"Ordering not preserved: q[{i}]={q_data[i]} > q[{i+1}]={q_data[i+1]}" + ) + + def test_quantize_negative_values(self): + """ + WHAT: Verify quantization handles negative values correctly. + + WHY: Neural network weights are typically centered around zero + with both positive and negative values. + """ + try: + from tinytorch.perf.quantization import quantize_int8, dequantize_int8 + except ImportError: + pytest.skip("Quantization functions not yet exported") + + # Mixed positive and negative + original = Tensor(np.array([-2.0, -1.0, 0.0, 1.0, 2.0], dtype=np.float32)) + q_tensor, scale, zero_point = quantize_int8(original) + recovered = dequantize_int8(q_tensor, scale, zero_point) + + # Original signs should be preserved after round-trip + for i in range(len(original.data)): + orig_sign = np.sign(original.data[i]) + rec_sign = np.sign(recovered.data[i]) + # Zero can go either way due to quantization noise + if orig_sign != 0: + assert orig_sign == rec_sign, ( + f"Sign not preserved for value {original.data[i]}: " + f"recovered {recovered.data[i]}" + ) + + +class TestQuantizedLinear: + """Test the QuantizedLinear layer implementation.""" + + def test_quantized_linear_forward(self): + """ + WHAT: Verify QuantizedLinear produces similar output to regular Linear. + + WHY: Quantized layers should approximate the original behavior. + Large deviations indicate incorrect implementation. 
+ """ + try: + from tinytorch.perf.quantization import QuantizedLinear + from tinytorch.core.layers import Linear + except ImportError: + pytest.skip("QuantizedLinear not yet exported") + + # Create and quantize a linear layer + linear = Linear(4, 3) + q_linear = QuantizedLinear(linear) + + # Forward pass + input_tensor = Tensor(np.random.randn(2, 4).astype(np.float32)) + original_output = linear.forward(input_tensor) + quantized_output = q_linear.forward(input_tensor) + + # Outputs should be similar (within quantization error) + # For INT8, typical error is ~1-5% of the output range + max_error = np.max(np.abs(original_output.data - quantized_output.data)) + output_range = np.max(original_output.data) - np.min(original_output.data) + + # Allow up to 10% relative error for educational implementation + assert max_error < 0.1 * output_range + 0.1, ( + f"QuantizedLinear output differs too much from Linear:\n" + f" Max error: {max_error:.4f}\n" + f" Output range: {output_range:.4f}" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tinytorch/tests/17_compression/run_all_tests.py b/tinytorch/tests/16_compression/run_all_tests.py similarity index 100% rename from tinytorch/tests/17_compression/run_all_tests.py rename to tinytorch/tests/16_compression/run_all_tests.py diff --git a/tinytorch/tests/17_compression/test_compression_integration.py b/tinytorch/tests/16_compression/test_compression_integration.py similarity index 100% rename from tinytorch/tests/17_compression/test_compression_integration.py rename to tinytorch/tests/16_compression/test_compression_integration.py diff --git a/tinytorch/tests/16_compression/test_compressor_core.py b/tinytorch/tests/16_compression/test_compressor_core.py new file mode 100644 index 000000000..f122b1c07 --- /dev/null +++ b/tinytorch/tests/16_compression/test_compressor_core.py @@ -0,0 +1,262 @@ +""" +Module 16: Compression Core Tests +=================================== + +These tests verify that model compression (pruning) works correctly. + +WHY THESE TESTS MATTER: +----------------------- +Pruning removes unnecessary weights, making models smaller and faster. +If compression is broken: +- Model doesn't get smaller (no benefit) +- Important weights get removed (accuracy crashes) +- Sparsity calculations are wrong (can't measure compression) +""" + +import pytest +import numpy as np +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from tinytorch.core.tensor import Tensor +from tinytorch.core.layers import Linear + + +class TestCompressionBasics: + """Test basic compression/pruning functionality.""" + + def test_compressor_import(self): + """Verify Compressor can be imported.""" + try: + from tinytorch.perf.compression import Compressor + assert Compressor is not None + except ImportError as e: + pytest.skip(f"Compressor not yet exported: {e}") + + def test_measure_sparsity(self): + """ + WHAT: Verify sparsity measurement works correctly. + + WHY: Sparsity = fraction of zeros. This is how we measure compression. + 50% sparsity means half the weights are zero. 
+ """ + try: + from tinytorch.perf.compression import Compressor + except ImportError: + pytest.skip("Compressor not yet exported") + + # Create a simple model with known sparsity + class SimpleModel: + def __init__(self): + # Half zeros, half ones = 50% sparsity + self.layer = Linear(4, 4, bias=False) + self.layer.weight.data = np.array([ + [0, 0, 1, 1], + [0, 0, 1, 1], + [0, 0, 1, 1], + [0, 0, 1, 1] + ], dtype=np.float32) + + def parameters(self): + return self.layer.parameters() + + model = SimpleModel() + sparsity = Compressor.measure_sparsity(model) + + # Should be ~50% + assert 0.4 < sparsity < 0.6, ( + f"Sparsity measurement wrong!\n" + f" Expected: ~0.5 (50% zeros)\n" + f" Got: {sparsity}" + ) + + def test_magnitude_prune_increases_sparsity(self): + """ + WHAT: Verify pruning increases the number of zeros. + + WHY: Pruning should set small weights to zero. + After pruning, sparsity should increase. + """ + try: + from tinytorch.perf.compression import Compressor + except ImportError: + pytest.skip("Compressor not yet exported") + + # Create model with random weights (low sparsity) + class SimpleModel: + def __init__(self): + self.layer = Linear(10, 10, bias=False) + + def parameters(self): + return self.layer.parameters() + + model = SimpleModel() + initial_sparsity = Compressor.measure_sparsity(model) + + # Apply pruning + Compressor.magnitude_prune(model, sparsity=0.5) + + final_sparsity = Compressor.measure_sparsity(model) + + assert final_sparsity > initial_sparsity, ( + f"Pruning didn't increase sparsity!\n" + f" Before: {initial_sparsity}\n" + f" After: {final_sparsity}" + ) + + +class TestCompressionAdvanced: + """Advanced compression tests for accuracy and edge cases.""" + + def test_sparsity_achieves_target(self): + """ + WHAT: Verify magnitude pruning achieves approximately target sparsity. + + WHY: If we request 80% sparsity, we should get close to 80% zeros. + Large deviations indicate the pruning algorithm is broken. + """ + try: + from tinytorch.perf.compression import measure_sparsity, magnitude_prune + except ImportError: + pytest.skip("Compression functions not yet exported") + + # Create model + class SimpleModel: + def __init__(self): + self.layer1 = Linear(100, 50, bias=False) + self.layer2 = Linear(50, 25, bias=False) + + def parameters(self): + return self.layer1.parameters() + self.layer2.parameters() + + model = SimpleModel() + target_sparsity = 0.8 # 80% + + # Apply pruning + magnitude_prune(model, sparsity=target_sparsity) + achieved_sparsity = measure_sparsity(model) + + # Should be within 5% of target (sparsity is in percentage) + assert abs(achieved_sparsity - target_sparsity * 100) < 5, ( + f"Sparsity target not achieved!\n" + f" Target: {target_sparsity * 100}%\n" + f" Achieved: {achieved_sparsity:.1f}%" + ) + + def test_pruning_preserves_large_weights(self): + """ + WHAT: Verify that large magnitude weights are preserved during pruning. + + WHY: Magnitude pruning should keep the largest weights. If large + weights are removed, model accuracy would collapse. 
+ """ + try: + from tinytorch.perf.compression import magnitude_prune + except ImportError: + pytest.skip("magnitude_prune not yet exported") + + # Create model with one very large weight + class SimpleModel: + def __init__(self): + self.layer = Linear(4, 4, bias=False) + # Set one weight to be much larger than others + self.layer.weight.data = np.array([ + [0.01, 0.02, 0.01, 0.02], + [0.01, 100.0, 0.01, 0.02], # 100.0 is the largest + [0.01, 0.02, 0.01, 0.02], + [0.01, 0.02, 0.01, 0.02] + ], dtype=np.float32) + + def parameters(self): + return self.layer.parameters() + + model = SimpleModel() + + # Prune 90% of weights + magnitude_prune(model, sparsity=0.9) + + # The largest weight should still be there + assert model.layer.weight.data[1, 1] == 100.0, ( + "Large weight was incorrectly pruned!\n" + f" Expected: 100.0\n" + f" Got: {model.layer.weight.data[1, 1]}" + ) + + def test_zero_sparsity_no_change(self): + """ + WHAT: Verify that 0% sparsity doesn't change the model. + + WHY: This is an edge case - requesting no pruning should leave + all weights unchanged. + """ + try: + from tinytorch.perf.compression import magnitude_prune + except ImportError: + pytest.skip("magnitude_prune not yet exported") + + class SimpleModel: + def __init__(self): + self.layer = Linear(4, 4, bias=False) + + def parameters(self): + return self.layer.parameters() + + model = SimpleModel() + original_weights = model.layer.weight.data.copy() + + # Prune 0% (should be no change) + magnitude_prune(model, sparsity=0.0) + + assert np.allclose(model.layer.weight.data, original_weights), ( + "0% sparsity changed weights when it shouldn't!" + ) + + +class TestStructuredPruning: + """Test structured pruning (removing entire neurons/channels).""" + + def test_structured_prune_import(self): + """Verify structured_prune function can be imported.""" + try: + from tinytorch.perf.compression import structured_prune + assert structured_prune is not None + except ImportError: + pytest.skip("structured_prune not yet exported") + + def test_structured_prune_reduces_effective_neurons(self): + """ + WHAT: Verify structured pruning removes entire rows/columns. + + WHY: Unlike magnitude pruning which creates sparse matrices, + structured pruning removes whole neurons for actual speedups. 
+ """ + try: + from tinytorch.perf.compression import structured_prune + from tinytorch.core.layers import Sequential + except ImportError: + pytest.skip("structured_prune not yet exported") + + # Create model using Sequential (required for structured_prune) + layer = Linear(10, 10, bias=False) + model = Sequential(layer) + + # Apply 50% structured pruning + structured_prune(model, prune_ratio=0.5) + + # Check that some entire columns are now all zeros + # structured_prune zeros out columns (output channels) + weights = layer.weight.data + zero_cols = np.sum(np.all(weights == 0, axis=0)) + + # At least some columns should be completely zeroed + assert zero_cols >= 1, ( + f"Structured pruning didn't zero out entire columns!\n" + f" Expected: At least 1 zero column\n" + f" Got: {zero_cols} zero columns" + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tinytorch/tests/16_quantization/test_quantizer_core.py b/tinytorch/tests/16_quantization/test_quantizer_core.py deleted file mode 100644 index 41d52b2c7..000000000 --- a/tinytorch/tests/16_quantization/test_quantizer_core.py +++ /dev/null @@ -1,94 +0,0 @@ -""" -Module 16: Quantization Core Tests -=================================== - -These tests verify that quantization reduces model size correctly. - -WHY THESE TESTS MATTER: ------------------------ -Quantization converts FP32 (4 bytes) to INT8 (1 byte) = 4x smaller model. -If quantization is broken: -- Model stays big (defeats the purpose) -- Accuracy drops too much (unusable) -- Values overflow (numerical errors) - -WHAT WE TEST: -------------- -1. Quantization produces INT8 values -2. Dequantization recovers approximate original values -3. Model size actually decreases -""" - -import pytest -import numpy as np -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -from tinytorch.core.tensor import Tensor - - -class TestQuantizationBasics: - """Test basic quantization functionality.""" - - def test_quantizer_import(self): - """Verify Quantizer can be imported.""" - try: - from tinytorch.perf.quantization import Quantizer - assert Quantizer is not None - except ImportError as e: - pytest.skip(f"Quantizer not yet exported: {e}") - - def test_quantize_produces_int8(self): - """ - WHAT: Verify quantization produces INT8 values in [-128, 127]. - - WHY: INT8 is the target representation. Values outside this - range would overflow and produce garbage. - """ - try: - from tinytorch.perf.quantization import Quantizer - except ImportError: - pytest.skip("Quantizer not yet exported") - - # Create FP32 tensor - fp32_tensor = Tensor(np.random.randn(10, 10).astype(np.float32)) - - # Quantize - q_tensor, scale, zero_point = Quantizer.quantize_tensor(fp32_tensor) - - # Check INT8 range - assert q_tensor.data.min() >= -128, "Quantized values below INT8 min" - assert q_tensor.data.max() <= 127, "Quantized values above INT8 max" - - def test_dequantize_recovers_approximate_values(self): - """ - WHAT: Verify dequantization recovers values close to original. - - WHY: Quantization is lossy, but should be approximately reversible. - Large errors would destroy model accuracy. 
- """ - try: - from tinytorch.perf.quantization import Quantizer - except ImportError: - pytest.skip("Quantizer not yet exported") - - # Create FP32 tensor with known values - original = Tensor(np.array([0.5, -0.5, 1.0, -1.0]).astype(np.float32)) - - # Round trip: quantize then dequantize - q_tensor, scale, zero_point = Quantizer.quantize_tensor(original) - recovered = Quantizer.dequantize_tensor(q_tensor, scale, zero_point) - - # Should be close (within ~1% for typical values) - max_error = np.max(np.abs(original.data - recovered.data)) - assert max_error < 0.1, ( - f"Dequantization error too large: {max_error}\n" - f" Original: {original.data}\n" - f" Recovered: {recovered.data}" - ) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tinytorch/tests/17_compression/test_compressor_core.py b/tinytorch/tests/17_compression/test_compressor_core.py deleted file mode 100644 index 9b9fc0bca..000000000 --- a/tinytorch/tests/17_compression/test_compressor_core.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Module 17: Compression Core Tests -=================================== - -These tests verify that model compression (pruning) works correctly. - -WHY THESE TESTS MATTER: ------------------------ -Pruning removes unnecessary weights, making models smaller and faster. -If compression is broken: -- Model doesn't get smaller (no benefit) -- Important weights get removed (accuracy crashes) -- Sparsity calculations are wrong (can't measure compression) -""" - -import pytest -import numpy as np -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - -from tinytorch.core.tensor import Tensor -from tinytorch.core.layers import Linear - - -class TestCompressionBasics: - """Test basic compression/pruning functionality.""" - - def test_compressor_import(self): - """Verify Compressor can be imported.""" - try: - from tinytorch.perf.compression import Compressor - assert Compressor is not None - except ImportError as e: - pytest.skip(f"Compressor not yet exported: {e}") - - def test_measure_sparsity(self): - """ - WHAT: Verify sparsity measurement works correctly. - - WHY: Sparsity = fraction of zeros. This is how we measure compression. - 50% sparsity means half the weights are zero. - """ - try: - from tinytorch.perf.compression import Compressor - except ImportError: - pytest.skip("Compressor not yet exported") - - # Create a simple model with known sparsity - class SimpleModel: - def __init__(self): - # Half zeros, half ones = 50% sparsity - self.layer = Linear(4, 4, bias=False) - self.layer.weight.data = np.array([ - [0, 0, 1, 1], - [0, 0, 1, 1], - [0, 0, 1, 1], - [0, 0, 1, 1] - ], dtype=np.float32) - - def parameters(self): - return self.layer.parameters() - - model = SimpleModel() - sparsity = Compressor.measure_sparsity(model) - - # Should be ~50% - assert 0.4 < sparsity < 0.6, ( - f"Sparsity measurement wrong!\n" - f" Expected: ~0.5 (50% zeros)\n" - f" Got: {sparsity}" - ) - - def test_magnitude_prune_increases_sparsity(self): - """ - WHAT: Verify pruning increases the number of zeros. - - WHY: Pruning should set small weights to zero. - After pruning, sparsity should increase. 
- """ - try: - from tinytorch.perf.compression import Compressor - except ImportError: - pytest.skip("Compressor not yet exported") - - # Create model with random weights (low sparsity) - class SimpleModel: - def __init__(self): - self.layer = Linear(10, 10, bias=False) - - def parameters(self): - return self.layer.parameters() - - model = SimpleModel() - initial_sparsity = Compressor.measure_sparsity(model) - - # Apply pruning - Compressor.magnitude_prune(model, sparsity=0.5) - - final_sparsity = Compressor.measure_sparsity(model) - - assert final_sparsity > initial_sparsity, ( - f"Pruning didn't increase sparsity!\n" - f" Before: {initial_sparsity}\n" - f" After: {final_sparsity}" - ) - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) diff --git a/tinytorch/tests/15_memoization/run_all_tests.py b/tinytorch/tests/17_memoization/run_all_tests.py similarity index 100% rename from tinytorch/tests/15_memoization/run_all_tests.py rename to tinytorch/tests/17_memoization/run_all_tests.py diff --git a/tinytorch/tests/17_memoization/test_kv_cache_core.py b/tinytorch/tests/17_memoization/test_kv_cache_core.py new file mode 100644 index 000000000..c767e5988 --- /dev/null +++ b/tinytorch/tests/17_memoization/test_kv_cache_core.py @@ -0,0 +1,212 @@ +""" +Module 17: KV Cache (Memoization) Core Tests +============================================= + +These tests verify that KV caching works for efficient inference. + +WHY THESE TESTS MATTER: +----------------------- +KV caching is essential for efficient text generation: +- Without cache: O(n²) per token (recompute all attention) +- With cache: O(n) per token (reuse previous K,V) + +For generating 100 tokens, that's 100x speedup! + +WHAT WE TEST: +------------- +1. KVCache can store key-value pairs +2. Cache retrieval returns stored values +3. Cache works across multiple layers +""" + +import pytest +import numpy as np +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +from tinytorch.core.tensor import Tensor + + +class TestKVCacheBasics: + """Test basic KV cache functionality.""" + + def test_kv_cache_import(self): + """ + WHAT: Verify KVCache can be imported. + + WHY: Basic sanity check. + """ + try: + from tinytorch.perf.memoization import KVCache + assert KVCache is not None + except ImportError as e: + pytest.skip(f"KVCache not yet exported: {e}") + + def test_kv_cache_can_instantiate(self): + """ + WHAT: Verify KVCache can be created. + """ + try: + from tinytorch.perf.memoization import KVCache + # KVCache needs: batch_size, max_seq_len, num_layers, num_heads, head_dim + cache = KVCache(batch_size=1, max_seq_len=128, num_layers=2, num_heads=4, head_dim=16) + assert cache is not None + except ImportError: + pytest.skip("KVCache not yet exported") + + def test_kv_cache_stores_and_retrieves(self): + """ + WHAT: Verify cache can store and retrieve K,V tensors. + + WHY: The whole point of the cache is to reuse computed values. + If storage/retrieval doesn't work, there's no speedup. 
+ """ + try: + from tinytorch.perf.memoization import KVCache + except ImportError: + pytest.skip("KVCache not yet exported") + + # Create cache with proper dimensions + cache = KVCache(batch_size=1, max_seq_len=128, num_layers=2, num_heads=4, head_dim=16) + + # Store some K,V pairs (cache expects one token at a time during generation) + layer_idx = 0 + K = Tensor(np.random.randn(1, 4, 1, 16)) # (batch, heads, 1, dim) - one new token + V = Tensor(np.random.randn(1, 4, 1, 16)) + + cache.update(layer_idx, K, V) + cache.advance() # Must advance after update to make values retrievable + + # Retrieve + cached_K, cached_V = cache.get(layer_idx) + + assert cached_K is not None, "Cache didn't store K" + assert cached_V is not None, "Cache didn't store V" + assert cached_K.shape == K.shape, f"K shape mismatch: {cached_K.shape} vs {K.shape}" + assert np.allclose(cached_K.data, K.data), "Retrieved K doesn't match stored" + assert np.allclose(cached_V.data, V.data), "Retrieved V doesn't match stored" + + +class TestKVCacheAdvanced: + """Advanced KV cache tests for multiple tokens and layers.""" + + def test_kv_cache_multiple_tokens(self): + """ + WHAT: Verify cache can accumulate multiple tokens. + + WHY: During generation, we add one token at a time. The cache must + correctly accumulate all previous K,V pairs. + """ + try: + from tinytorch.perf.memoization import KVCache + except ImportError: + pytest.skip("KVCache not yet exported") + + cache = KVCache(batch_size=1, max_seq_len=10, num_layers=1, num_heads=2, head_dim=8) + + # Add 3 tokens + for token_idx in range(3): + K = Tensor(np.full((1, 2, 1, 8), token_idx, dtype=np.float32)) + V = Tensor(np.full((1, 2, 1, 8), token_idx + 10, dtype=np.float32)) + cache.update(layer_idx=0, key=K, value=V) + cache.advance() + + # Retrieve should give all 3 tokens + cached_K, cached_V = cache.get(layer_idx=0) + + assert cached_K.shape == (1, 2, 3, 8), f"Expected (1,2,3,8), got {cached_K.shape}" + assert cached_V.shape == (1, 2, 3, 8), f"Expected (1,2,3,8), got {cached_V.shape}" + + # Verify values are in order + assert cached_K.data[0, 0, 0, 0] == 0, "First token K wrong" + assert cached_K.data[0, 0, 1, 0] == 1, "Second token K wrong" + assert cached_K.data[0, 0, 2, 0] == 2, "Third token K wrong" + + def test_kv_cache_multiple_layers(self): + """ + WHAT: Verify cache works correctly across multiple transformer layers. + + WHY: Real transformers have multiple layers, each with its own K,V cache. 
+ """ + try: + from tinytorch.perf.memoization import KVCache + except ImportError: + pytest.skip("KVCache not yet exported") + + num_layers = 4 + cache = KVCache(batch_size=1, max_seq_len=10, num_layers=num_layers, num_heads=2, head_dim=8) + + # Update each layer with different values + for layer_idx in range(num_layers): + K = Tensor(np.full((1, 2, 1, 8), layer_idx * 10, dtype=np.float32)) + V = Tensor(np.full((1, 2, 1, 8), layer_idx * 10 + 1, dtype=np.float32)) + cache.update(layer_idx, K, V) + + cache.advance() + + # Verify each layer has correct values + for layer_idx in range(num_layers): + cached_K, cached_V = cache.get(layer_idx) + expected_k_val = layer_idx * 10 + expected_v_val = layer_idx * 10 + 1 + + assert cached_K.data[0, 0, 0, 0] == expected_k_val, ( + f"Layer {layer_idx} K wrong: expected {expected_k_val}, got {cached_K.data[0,0,0,0]}" + ) + assert cached_V.data[0, 0, 0, 0] == expected_v_val, ( + f"Layer {layer_idx} V wrong: expected {expected_v_val}, got {cached_V.data[0,0,0,0]}" + ) + + def test_kv_cache_seq_pos_tracking(self): + """ + WHAT: Verify seq_pos counter tracks correctly. + + WHY: seq_pos determines where in the cache to write next and how + much valid data to return. Incorrect tracking breaks generation. + """ + try: + from tinytorch.perf.memoization import KVCache + except ImportError: + pytest.skip("KVCache not yet exported") + + cache = KVCache(batch_size=1, max_seq_len=100, num_layers=1, num_heads=2, head_dim=8) + + # Initially at 0 + assert cache.seq_pos == 0, "Initial seq_pos should be 0" + + # Add tokens and check seq_pos + for expected_pos in range(1, 6): + K = Tensor(np.zeros((1, 2, 1, 8))) + V = Tensor(np.zeros((1, 2, 1, 8))) + cache.update(0, K, V) + cache.advance() + assert cache.seq_pos == expected_pos, ( + f"seq_pos should be {expected_pos}, got {cache.seq_pos}" + ) + + def test_kv_cache_raises_on_invalid_layer(self): + """ + WHAT: Verify cache raises error for invalid layer index. + + WHY: Trying to access a non-existent layer is a programming error + that should be caught early. + """ + try: + from tinytorch.perf.memoization import KVCache + except ImportError: + pytest.skip("KVCache not yet exported") + + cache = KVCache(batch_size=1, max_seq_len=10, num_layers=2, num_heads=2, head_dim=8) + + K = Tensor(np.zeros((1, 2, 1, 8))) + V = Tensor(np.zeros((1, 2, 1, 8))) + + # Valid layers are 0 and 1 + with pytest.raises(ValueError): + cache.update(layer_idx=5, key=K, value=V) # Invalid layer + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tinytorch/tests/15_memoization/test_progressive_integration.py b/tinytorch/tests/17_memoization/test_progressive_integration.py similarity index 100% rename from tinytorch/tests/15_memoization/test_progressive_integration.py rename to tinytorch/tests/17_memoization/test_progressive_integration.py diff --git a/tinytorch/tests/15_memoization/test_tinygpt_integration.py b/tinytorch/tests/17_memoization/test_tinygpt_integration.py similarity index 100% rename from tinytorch/tests/15_memoization/test_tinygpt_integration.py rename to tinytorch/tests/17_memoization/test_tinygpt_integration.py