# AUTOGENERATED! DO NOT EDIT! File to edit: ../../modules/source/11_kernels/kernels_dev.ipynb.

# %% auto 0
__all__ = ['time_kernel', 'matmul_baseline', 'vectorized_relu', 'vectorized_operations', 'cache_friendly_matmul', 'parallel_relu',
           'parallel_batch_processing', 'quantized_matmul', 'quantized_relu']

# %% ../../modules/source/11_kernels/kernels_dev.ipynb 1
import numpy as np
import sys
import os
import time
import tracemalloc
import psutil
from typing import Callable, Dict, Any, Optional, Tuple, List
from functools import wraps
from pathlib import Path

# Import our existing components
try:
    from tinytorch.core.tensor import Tensor
    from tinytorch.core.layers import matmul_naive as matmul
    from tinytorch.core.activations import ReLU, Sigmoid, Tanh
    from tinytorch.core.cnn import Conv2D
except ImportError:
    # For development, import from local modules
    base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    sys.path.extend([
        os.path.join(base_dir, '01_tensor'),
        os.path.join(base_dir, '02_activations'),
        os.path.join(base_dir, '03_layers'),
        os.path.join(base_dir, '05_cnn'),
        os.path.join(base_dir, 'utils')
    ])

    try:
        from tensor_dev import Tensor
        from layers_dev import matmul_naive as matmul
        from activations_dev import ReLU, Sigmoid, Tanh
        from cnn_dev import Conv2D
    except ImportError:
        # Create minimal mock for development
        class Tensor:
            def __init__(self, data):
                self.data = np.array(data)
                self.shape = self.data.shape

            def __str__(self):
                return f"Tensor({self.data})"

# Simple timing utility for kernel performance measurement
def time_kernel(func, *args, **kwargs):
    """
    Simple timing function for measuring kernel performance.

    Returns:
        tuple: (result, time_in_microseconds)
    """
    start = time.perf_counter()
    result = func(*args, **kwargs)
    end = time.perf_counter()
    microseconds = (end - start) * 1_000_000
    return result, microseconds

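# A minimal usage sketch for time_kernel (illustrative only; `_demo_time_kernel` is a
# hypothetical helper, not part of the exported __all__). It times NumPy's np.dot directly.
def _demo_time_kernel():
    A = np.random.randn(128, 128).astype(np.float32)
    B = np.random.randn(128, 128).astype(np.float32)
    result, us = time_kernel(np.dot, A, B)
    print(f"np.dot on 128x128 float32: {us:.1f} us, output shape {result.shape}")
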
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 6
def matmul_baseline(A: Tensor, B: Tensor) -> Tensor:
    """
    Baseline matrix multiplication using a proven implementation (NumPy's np.dot).

    This function demonstrates how to build on existing, well-tested components
    rather than reinventing the wheel. We use np.dot as our baseline for
    comparison with the optimized kernels in this module.

    This is NOT a custom implementation - it's a proven matmul wrapped for use
    in kernel comparisons and benchmarking.

    TODO: Wrap a proven matmul implementation (np.dot) as the baseline.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy arrays from input Tensors
    2. Use the proven implementation (np.dot) for the multiplication
    3. Wrap the result back in Tensor format
    4. Return the result

    CODE REUSE PRINCIPLES:
    1. Always use the packaged version for reliability
    2. Don't duplicate working code - reference the source
    3. Use descriptive names that indicate what the function actually does
    4. Keep dependencies simple and reliable

    EXAMPLE USAGE:
    ```python
    A = Tensor([[1, 2], [3, 4]])
    B = Tensor([[5, 6], [7, 8]])
    C = matmul_baseline(A, B)
    # Expected: [[19, 22], [43, 50]]
    ```

    LEARNING CONNECTIONS:
    - This shows how to wrap proven primitives behind a TinyTorch interface
    - Demonstrates reliable dependency management
    - Serves as baseline for kernel performance comparisons
    - Shows proper software engineering practices
    """
    ### BEGIN SOLUTION
    # Extract numpy arrays from Tensors
    A_data = A.data if hasattr(A, 'data') else A
    B_data = B.data if hasattr(B, 'data') else B

    # Use NumPy's matrix multiplication as our baseline:
    # reliable, tested, and consistent
    result_data = np.dot(A_data, B_data)

    # Wrap the result back in a Tensor for consistency
    result = Tensor(result_data)

    return result
    ### END SOLUTION

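# A minimal sanity check for matmul_baseline (illustrative only; `_demo_matmul_baseline`
# is a hypothetical helper reproducing the docstring example above).
def _demo_matmul_baseline():
    A = Tensor([[1, 2], [3, 4]])
    B = Tensor([[5, 6], [7, 8]])
    C = matmul_baseline(A, B)
    assert np.allclose(C.data, [[19, 22], [43, 50]])
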
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 9
def vectorized_relu(x: Tensor) -> Tensor:
    """
    Vectorized ReLU implementation demonstrating SIMD principles.

    This function shows how to write operations that take advantage of
    CPU vectorization capabilities for better performance.

    TODO: Implement a vectorized ReLU that's optimized for performance.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy array from Tensor
    2. Use NumPy's vectorized operations (these compile to SIMD instructions)
    3. Apply ReLU: f(x) = max(0, x) for all elements simultaneously
    4. Return result as Tensor

    VECTORIZATION TECHNIQUES:
    1. Use np.maximum instead of loops - this is vectorized
    2. Ensure input is contiguous in memory for better SIMD performance
    3. Consider using specific dtypes (float32 vs float64) for SIMD alignment
    4. Avoid conditional operations that break vectorization

    EXAMPLE USAGE:
    ```python
    x = Tensor([-2, -1, 0, 1, 2])
    y = vectorized_relu(x)
    # Expected: [0, 0, 0, 1, 2]
    ```

    PERFORMANCE CONSIDERATIONS:
    - np.maximum is vectorized and uses SIMD instructions
    - Memory layout matters: contiguous arrays are faster
    - Data type matters: float32 allows more SIMD parallelism than float64
    - Avoid Python loops - they can't be vectorized

    LEARNING CONNECTIONS:
    - This is how PyTorch's ReLU is implemented under the hood
    - GPU kernels use similar principles with thousands of parallel threads
    - Modern CPUs can process 4-16 floats simultaneously with SIMD
    """
    ### BEGIN SOLUTION
    # Extract numpy array
    x_data = x.data if hasattr(x, 'data') else x

    # Ensure contiguous memory layout for better SIMD performance
    if not x_data.flags.c_contiguous:
        x_data = np.ascontiguousarray(x_data)

    # Vectorized ReLU using NumPy's maximum function
    # This compiles to SIMD instructions on modern CPUs
    result = np.maximum(0, x_data)

    return Tensor(result)
    ### END SOLUTION

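# A minimal speed comparison (illustrative only; `_demo_vectorized_relu_speed` is a
# hypothetical helper). It contrasts a pure-Python loop with the vectorized kernel,
# making the vectorization claims above measurable on your own hardware.
def _demo_vectorized_relu_speed():
    x = Tensor(np.random.randn(100_000).astype(np.float32))

    def loop_relu(t):
        # Element-by-element Python loop: NumPy cannot vectorize this
        out = np.empty_like(t.data)
        for i, v in enumerate(t.data):
            out[i] = v if v > 0 else 0.0
        return Tensor(out)

    _, loop_us = time_kernel(loop_relu, x)
    _, vec_us = time_kernel(vectorized_relu, x)
    print(f"loop: {loop_us:.0f} us, vectorized: {vec_us:.0f} us")
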
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 10
def vectorized_operations(x: Tensor, y: Tensor) -> Dict[str, Tensor]:
    """
    Demonstration of various vectorized operations.

    Shows how multiple operations can be vectorized for better performance.

    TODO: Implement a collection of vectorized operations.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy arrays from input Tensors
    2. Implement vectorized versions of common operations
    3. Use NumPy's built-in vectorized functions
    4. Return dictionary of results

    OPERATIONS TO IMPLEMENT:
    - element_wise_multiply: x * y (element-wise)
    - element_wise_add: x + y (element-wise)
    - squared_difference: (x - y)^2
    - euclidean_distance: sqrt(sum((x - y)^2))
    - dot_product: sum(x * y)

    VECTORIZATION PRINCIPLES:
    - Use NumPy operations instead of Python loops
    - Combine operations when possible: (x - y)**2 instead of subtract then square
    - Consider memory layout and data types
    - Measure performance improvements

    EXAMPLE USAGE:
    ```python
    x = Tensor([1, 2, 3, 4])
    y = Tensor([2, 3, 4, 5])
    results = vectorized_operations(x, y)
    # Returns dict with all vectorized operation results
    ```
    """
    ### BEGIN SOLUTION
    # Extract numpy arrays
    x_data = x.data if hasattr(x, 'data') else x
    y_data = y.data if hasattr(y, 'data') else y

    # Ensure arrays are the same shape for element-wise operations
    assert x_data.shape == y_data.shape, f"Shape mismatch: {x_data.shape} vs {y_data.shape}"

    # Vectorized operations
    results = {
        'element_wise_multiply': Tensor(x_data * y_data),
        'element_wise_add': Tensor(x_data + y_data),
        'squared_difference': Tensor((x_data - y_data) ** 2),
        'euclidean_distance': Tensor(np.sqrt(np.sum((x_data - y_data) ** 2))),
        'dot_product': Tensor(np.dot(x_data.flatten(), y_data.flatten()))
    }

    return results
    ### END SOLUTION

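# A minimal usage sketch (illustrative only; `_demo_vectorized_operations` is a
# hypothetical helper matching the docstring example above).
def _demo_vectorized_operations():
    x = Tensor([1, 2, 3, 4])
    y = Tensor([2, 3, 4, 5])
    results = vectorized_operations(x, y)
    # dot_product = 1*2 + 2*3 + 3*4 + 4*5 = 40
    assert np.isclose(results['dot_product'].data, 40)
    # euclidean_distance = sqrt(4 * 1^2) = 2
    assert np.isclose(results['euclidean_distance'].data, 2.0)
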
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 13
def cache_friendly_matmul(A: Tensor, B: Tensor, block_size: int = 32) -> Tensor:
    """
    Cache-friendly matrix multiplication using blocking technique.

    This implementation uses cache blocking to improve memory access patterns
    and achieve better performance on modern CPUs.

    TODO: Implement cache-friendly matrix multiplication using blocking.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy arrays and get dimensions
    2. Pre-allocate output matrix
    3. Use three nested loops for blocks: block_i, block_j, block_k
    4. Within each block, use three nested loops for elements: i, j, k
    5. Process data in cache-sized blocks for better locality

    BLOCKING ALGORITHM:
    1. Divide matrices into blocks of size block_size x block_size
    2. For each block of C, compute contribution from corresponding A and B blocks
    3. This keeps data in cache longer, reducing memory access time

    CACHE OPTIMIZATION PRINCIPLES:
    - Process data in small blocks that fit in cache
    - Reuse data as much as possible while it's in cache
    - Access memory in predictable patterns
    - Minimize cache misses

    EXAMPLE USAGE:
    ```python
    A = Tensor([[1, 2], [3, 4]])
    B = Tensor([[5, 6], [7, 8]])
    C = cache_friendly_matmul(A, B, block_size=2)
    # Expected: [[19, 22], [43, 50]]
    ```

    PERFORMANCE HINTS:
    - block_size should be chosen based on cache size
    - Typical L1 cache: 32KB, so block_size=32 for float32 matrices
    - Experiment with different block sizes for your hardware
    - This algorithm is O(n^3) but with much better constants

    LEARNING CONNECTIONS:
    - This is how BLAS libraries achieve high performance
    - GPUs use similar tiling strategies for shared memory
    - Modern compilers can sometimes do this automatically
    """
    ### BEGIN SOLUTION
    # Extract numpy arrays
    A_data = A.data if hasattr(A, 'data') else A
    B_data = B.data if hasattr(B, 'data') else B

    # Get dimensions
    m, k = A_data.shape
    k2, n = B_data.shape
    assert k == k2, f"Cannot multiply {A_data.shape} and {B_data.shape}"

    # Pre-allocate output matrix
    C = np.zeros((m, n), dtype=A_data.dtype)

    # Cache-friendly blocked matrix multiplication
    for block_i in range(0, m, block_size):
        for block_j in range(0, n, block_size):
            for block_k in range(0, k, block_size):
                # Define block boundaries
                end_i = min(block_i + block_size, m)
                end_j = min(block_j + block_size, n)
                end_k = min(block_k + block_size, k)

                # Process block - good cache locality
                for i in range(block_i, end_i):
                    for j in range(block_j, end_j):
                        for k_idx in range(block_k, end_k):
                            C[i, j] += A_data[i, k_idx] * B_data[k_idx, j]

    return Tensor(C)
    ### END SOLUTION

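# A minimal block-size experiment (illustrative only; `_demo_block_sizes` is a
# hypothetical helper). Correctness is checked against np.dot; timings will vary
# with your cache hierarchy, and the pure-Python loops keep absolute times slow.
def _demo_block_sizes():
    A = Tensor(np.random.randn(64, 64).astype(np.float32))
    B = Tensor(np.random.randn(64, 64).astype(np.float32))
    expected = np.dot(A.data, B.data)
    for bs in (8, 16, 32):
        C, us = time_kernel(cache_friendly_matmul, A, B, block_size=bs)
        assert np.allclose(C.data, expected, rtol=1e-3, atol=1e-3)
        print(f"block_size={bs}: {us:.0f} us")
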
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 16
def parallel_relu(x: Tensor, num_workers: int = 4) -> Tensor:
    """
    Parallel ReLU implementation using multiple CPU cores.

    This function demonstrates data parallelism by splitting the input
    across multiple workers.

    TODO: Implement parallel ReLU using multiprocessing or threading.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy array from Tensor
    2. Split array into chunks for parallel processing
    3. Define worker function that applies ReLU to a chunk
    4. Use ThreadPoolExecutor to process chunks in parallel
    5. Combine results from all workers
    6. Return result as Tensor

    PARALLELIZATION STRATEGY:
    1. Split input into num_workers chunks
    2. Each worker processes its chunk independently
    3. Apply ReLU: max(0, x) to each chunk
    4. Combine results preserving original order

    EXAMPLE USAGE:
    ```python
    x = Tensor(np.random.randn(1000))
    y = parallel_relu(x, num_workers=4)
    # Processes data using 4 parallel workers
    ```

    PERFORMANCE CONSIDERATIONS:
    - Overhead of parallel processing may not be worth it for small arrays
    - Threading vs multiprocessing trade-offs
    - Chunk size should be large enough to amortize overhead
    - Consider memory bandwidth limitations

    LEARNING CONNECTIONS:
    - This is how PyTorch processes batches in parallel
    - GPUs naturally do this with thousands of parallel threads
    - Modern deep learning frameworks heavily use parallelism
    """
    ### BEGIN SOLUTION
    from concurrent.futures import ThreadPoolExecutor

    # Extract numpy array
    x_data = x.data if hasattr(x, 'data') else x

    # For small arrays, parallel processing isn't worth the overhead
    if x_data.size < 1000:
        return Tensor(np.maximum(0, x_data))

    # Split array into chunks
    chunk_size = max(1, x_data.size // num_workers)
    chunks = []
    flat_data = x_data.flatten()

    for i in range(0, len(flat_data), chunk_size):
        chunks.append(flat_data[i:i + chunk_size])

    # Worker function
    def relu_chunk(chunk):
        return np.maximum(0, chunk)

    # Process chunks in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        future_to_chunk = {executor.submit(relu_chunk, chunk): i for i, chunk in enumerate(chunks)}
        results = [None] * len(chunks)

        for future in future_to_chunk:
            chunk_idx = future_to_chunk[future]
            results[chunk_idx] = future.result()

    # Combine results
    combined_result = np.concatenate(results)

    # Reshape back to original shape
    result = combined_result.reshape(x_data.shape)

    return Tensor(result)
    ### END SOLUTION

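# A minimal sequential-vs-parallel comparison (illustrative only; `_demo_parallel_relu`
# is a hypothetical helper). NumPy releases the GIL for many ufuncs, but ReLU is
# memory-bound, so threading overhead can outweigh any speedup on small inputs.
def _demo_parallel_relu():
    x = Tensor(np.random.randn(1_000_000).astype(np.float32))
    _, seq_us = time_kernel(vectorized_relu, x)
    _, par_us = time_kernel(parallel_relu, x, num_workers=4)
    print(f"sequential: {seq_us:.0f} us, parallel (4 workers): {par_us:.0f} us")
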
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 17
def parallel_batch_processing(batch_data: List[Tensor], operation: Callable, num_workers: int = 4) -> List[Tensor]:
    """
    Process a batch of tensors in parallel using multiple workers.

    This function demonstrates how to parallelize operations across
    multiple data samples, similar to how modern ML frameworks work.

    TODO: Implement parallel batch processing.

    STEP-BY-STEP IMPLEMENTATION:
    1. Take a list of Tensors and an operation function
    2. Use ThreadPoolExecutor to process multiple tensors simultaneously
    3. Apply the operation to each tensor in parallel
    4. Return list of results in original order

    PARALLELIZATION STRATEGY:
    1. Each worker processes one tensor at a time
    2. Multiple workers can process different tensors simultaneously
    3. Preserve order of results to match input order

    EXAMPLE USAGE:
    ```python
    batch = [Tensor(np.random.randn(100, 100)) for _ in range(8)]
    relu_op = lambda x: vectorized_relu(x)
    results = parallel_batch_processing(batch, relu_op, num_workers=4)
    # Processes 8 tensors using 4 parallel workers
    ```

    PERFORMANCE CONSIDERATIONS:
    - Each tensor should be large enough to justify parallel overhead
    - Balance number of workers with available CPU cores
    - Consider memory usage with multiple workers
    - Thread vs process pool trade-offs

    LEARNING CONNECTIONS:
    - This is how PyTorch's DataLoader processes batches
    - Similar to how GPUs process multiple samples simultaneously
    - Foundation for distributed training across multiple nodes
    """
    ### BEGIN SOLUTION
    from concurrent.futures import ThreadPoolExecutor

    # For small batches, parallel processing might not be worth it
    if len(batch_data) < num_workers:
        return [operation(tensor) for tensor in batch_data]

    # Process batch in parallel
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        # Submit all tasks
        future_to_index = {executor.submit(operation, tensor): i for i, tensor in enumerate(batch_data)}

        # Collect results in original order
        results = [None] * len(batch_data)
        for future in future_to_index:
            index = future_to_index[future]
            results[index] = future.result()

    return results
    ### END SOLUTION

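# A minimal usage sketch (illustrative only; `_demo_parallel_batch` is a hypothetical
# helper based on the docstring example: 8 tensors processed by 4 workers).
def _demo_parallel_batch():
    batch = [Tensor(np.random.randn(100, 100)) for _ in range(8)]
    results = parallel_batch_processing(batch, vectorized_relu, num_workers=4)
    assert len(results) == len(batch)
    # ReLU output is non-negative everywhere
    assert all((r.data >= 0).all() for r in results)
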
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 22
def quantized_matmul(A: Tensor, B: Tensor, scale_A: float = 1.0, scale_B: float = 1.0) -> Tensor:
    """
    Quantized matrix multiplication kernel for compressed models.

    This function demonstrates how to perform matrix multiplication
    with quantized (int8) weights while maintaining numerical accuracy.

    TODO: Implement quantized matrix multiplication.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy arrays from Tensors
    2. Quantize inputs to int8 using provided scales, clipping to [-128, 127]
    3. Perform integer matrix multiplication with int32 accumulation
    4. Rescale result back to appropriate range
    5. Return result as Tensor

    QUANTIZATION PROCESS:
    1. Quantize: int8_value = clip(round(float_value / scale), -128, 127)
    2. Compute: int32_result = int8_A @ int8_B
    3. Rescale: float_result = int32_result * scale_A * scale_B

    EXAMPLE USAGE:
    ```python
    A = Tensor([[1.0, 2.0], [3.0, 4.0]])
    B = Tensor([[0.5, 1.5], [2.5, 3.5]])
    # Choose each scale as max|value| / 127 so quantized values fit in int8
    C = quantized_matmul(A, B, scale_A=4.0/127, scale_B=3.5/127)
    # Should approximate regular matrix multiplication
    ```

    PERFORMANCE CONSIDERATIONS:
    - int8 operations are often faster than float32
    - Memory usage is 4x lower than float32
    - Accumulation in int32 to prevent overflow
    - Careful handling of scales to maintain precision

    LEARNING CONNECTIONS:
    - This is how TensorFlow Lite performs quantized inference
    - Similar to how mobile ML accelerators work
    - Foundation for edge deployment of neural networks
    """
    ### BEGIN SOLUTION
    # Extract numpy arrays
    A_data = A.data if hasattr(A, 'data') else A
    B_data = B.data if hasattr(B, 'data') else B

    # Quantize inputs to int8, clipping to the representable range
    # so out-of-range values saturate instead of wrapping around
    A_int8 = np.clip(np.round(A_data / scale_A), -128, 127).astype(np.int8)
    B_int8 = np.clip(np.round(B_data / scale_B), -128, 127).astype(np.int8)

    # Perform integer matrix multiplication
    # Use int32 for accumulation to prevent overflow
    C_int32 = np.dot(A_int8.astype(np.int32), B_int8.astype(np.int32))

    # Rescale result back to float
    C_float = C_int32 * scale_A * scale_B

    return Tensor(C_float)
    ### END SOLUTION

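# A minimal accuracy check (illustrative only; `_demo_quantized_matmul_error` is a
# hypothetical helper). Per-tensor scales are derived as max|value|/127, a common
# symmetric quantization choice; the error should be small but nonzero.
def _demo_quantized_matmul_error():
    A = Tensor(np.random.randn(32, 32).astype(np.float32))
    B = Tensor(np.random.randn(32, 32).astype(np.float32))
    scale_A = float(np.max(np.abs(A.data))) / 127
    scale_B = float(np.max(np.abs(B.data))) / 127
    C_q = quantized_matmul(A, B, scale_A=scale_A, scale_B=scale_B)
    C_f = np.dot(A.data, B.data)
    rel_err = np.max(np.abs(C_q.data - C_f)) / np.max(np.abs(C_f))
    print(f"max relative error: {rel_err:.4f}")
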
# %% ../../modules/source/11_kernels/kernels_dev.ipynb 23
def quantized_relu(x: Tensor, scale: float = 1.0) -> Tensor:
    """
    Quantized ReLU implementation for compressed models.

    This function shows how to apply ReLU activation to quantized values
    while maintaining the quantization format.

    TODO: Implement quantized ReLU activation.

    STEP-BY-STEP IMPLEMENTATION:
    1. Extract numpy array from Tensor
    2. Quantize input to int8 using provided scale, clipping to [-128, 127]
    3. Apply ReLU in integer domain: max(0, x)
    4. Keep result in int8 format (no rescaling needed for ReLU)
    5. Convert back to float using scale
    6. Return result as Tensor

    QUANTIZED RELU PROCESS:
    1. Quantize: int8_value = clip(round(float_value / scale), -128, 127)
    2. Apply ReLU: int8_result = max(0, int8_value)
    3. Dequantize: float_result = int8_result * scale

    EXAMPLE USAGE:
    ```python
    x = Tensor([-1.0, 0.0, 1.0, 2.0])
    # Choose scale as max|value| / 127 so quantized values fit in int8
    y = quantized_relu(x, scale=2.0/127)
    # Should produce [0.0, 0.0, 1.0, 2.0] (approximately)
    ```

    OPTIMIZATION NOTES:
    - ReLU in int8 is just max(0, x) - very fast
    - No floating-point operations needed during activation
    - Maintains quantization format throughout
    - Can be vectorized efficiently

    LEARNING CONNECTIONS:
    - This is how quantized neural networks maintain speed
    - Similar to how mobile processors optimize ML inference
    - Foundation for real-time edge computing applications
    """
    ### BEGIN SOLUTION
    # Extract numpy array
    x_data = x.data if hasattr(x, 'data') else x

    # Quantize input to int8, clipping so out-of-range values saturate
    # instead of wrapping around
    x_int8 = np.clip(np.round(x_data / scale), -128, 127).astype(np.int8)

    # Apply ReLU in integer domain
    x_relu_int8 = np.maximum(0, x_int8)

    # Convert back to float
    x_relu_float = x_relu_int8 * scale

    return Tensor(x_relu_float)
    ### END SOLUTION
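
# A minimal usage sketch (illustrative only; `_demo_quantized_relu` is a hypothetical
# helper matching the docstring example, with scale = max|value|/127 = 2.0/127).
def _demo_quantized_relu():
    x = Tensor([-1.0, 0.0, 1.0, 2.0])
    y = quantized_relu(x, scale=2.0/127)
    # Small quantization error is expected, hence the loose tolerance
    assert np.allclose(y.data, [0.0, 0.0, 1.0, 2.0], atol=0.02)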