"""
Example 04: The Data Wall
-------------------------
This script demonstrates the "Data Wall" concept from Volume 2, Lab 4.
It shows how faster GPUs can actually result in lower utilization if the 
storage bandwidth cannot keep up with the compute demand.
"""
import mlsysim
from mlsysim.core.constants import Q_

def main():
    """Print a per-GPU comparison of compute time vs. storage I/O time.

    For a ResNet-50 training step at batch size 256, the compute latency on
    V100, A100 and H100 is obtained from ``mlsysim.Engine.solve`` and compared
    with the time needed to read one batch from storage at a fixed 3.0 GB/s.

    Utilization is reported as ``t_compute / max(t_compute, t_io)``: the
    pipelined model, in which data loading overlaps with compute and the
    slower of the two sets the step time.
    """
    print("Evaluating ResNet-50 Training Data Pipeline...\n")

    # 1. Define the Workload
    model = mlsysim.Models.Vision.ResNet50
    batch_size = 256

    # Data demand per step: one 224x224x3 input per sample, 2 bytes per
    # element (FP16).
    bytes_per_sample = 224 * 224 * 3 * 2
    batch_bytes = bytes_per_sample * batch_size * Q_("1 byte")

    # 2. Define the Hardware Targets (ordered oldest to newest)
    targets = [
        mlsysim.Hardware.Cloud.V100,
        mlsysim.Hardware.Cloud.A100,
        mlsysim.Hardware.Cloud.H100,
    ]

    # 3. Define the Storage (A standard NVMe SSD)
    storage_bw = Q_("3.0 GB/s")

    print(f"Storage Bandwidth: {storage_bw}")
    print(f"Batch Data Size: {batch_bytes.to('MB'):.2f}\n")

    print(f"{'GPU':<10} | {'Compute Time':<15} | {'I/O Time':<15} | {'GPU Utilization':<15}")
    print("-" * 65)

    # I/O time depends only on the batch size and the storage bandwidth, not
    # on the GPU, so it is computed once instead of on every loop iteration.
    t_io = (batch_bytes / storage_bw).to("ms")

    for hw in targets:
        # Compute time for one training step on this GPU.
        prof = mlsysim.Engine.solve(
            model=model,
            hardware=hw,
            batch_size=batch_size,
            precision="fp16",
            is_training=True
        )
        t_compute = prof.latency

        # Pipelined model: loading the next batch overlaps with compute, so
        # the step takes max(t_compute, t_io). When I/O is the slower side the
        # GPU idles waiting for data and utilization drops below 100%.
        # (A purely sequential model would instead use
        # t_compute / (t_compute + t_io).)
        utilization = t_compute / max(t_compute, t_io)

        # NOTE: the name column is padded to 10 characters, so longer names
        # (e.g. "NVIDIA V100") push a row out of line with the header. The row
        # format is left unchanged so the output keeps matching the expected
        # output recorded at the bottom of this file.
        print(f"{hw.name:<10} | {t_compute.m_as('ms'):.1f} ms        | {t_io.m_as('ms'):.1f} ms        | {utilization.m_as('dimensionless') * 100:.1f}%")

    print("\nConclusion: As GPUs get faster (V100 -> H100), the compute time drops.")
    print("But because storage bandwidth is fixed, the GPU spends more time waiting for data,")
    print("causing utilization to plummet. This is the Data Wall.")

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

# Expected output (mlsysim v0.1.1):
# Evaluating ResNet-50 Training Data Pipeline...
#
# Storage Bandwidth: 3.0 GB/s
# Batch Data Size: 77.07 MB
#
# GPU        | Compute Time    | I/O Time        | GPU Utilization
# -----------------------------------------------------------------
# NVIDIA V100 | 51.9 ms        | 25.7 ms        | 100.0%
# NVIDIA A100 | 41.9 ms        | 25.7 ms        | 100.0%
# NVIDIA H100 | 7.9 ms        | 25.7 ms        | 30.7%
#
# Conclusion: As GPUs get faster (V100 -> H100), the compute time drops.
# But because storage bandwidth is fixed, the GPU spends more time waiting for data,
# causing utilization to plummet. This is the Data Wall.