# TinyTorch/tests/checkpoints/checkpoint_14_deployment.py
"""
Checkpoint 14: Deployment (After Module 15 - MLOps)
Question: "Can I deploy and monitor ML systems in production?"
"""
import numpy as np
import pytest
def test_checkpoint_14_deployment():
"""
Checkpoint 14: Deployment
Validates that students can deploy ML models to production and implement
monitoring systems to ensure reliable, scalable machine learning operations -
essential for real-world ML engineering and MLOps practices.
"""
print("\n🚀 Checkpoint 14: Deployment")
print("=" * 50)
try:
from tinytorch.core.tensor import Tensor
from tinytorch.core.mlops import ModelMonitor, DriftDetector, RetrainingTrigger, MLOpsPipeline
from tinytorch.core.networks import Sequential
from tinytorch.core.layers import Linear
from tinytorch.core.activations import ReLU, Softmax
from tinytorch.core.training import Trainer, CrossEntropyLoss, Accuracy
from tinytorch.core.compression import quantize_layer_weights, prune_weights_by_magnitude
except ImportError as e:
pytest.fail(f"❌ Cannot import required classes - complete Modules 2-15 first: {e}")
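    # Note: the capability checks below use hasattr() guards because student
    # implementations of the MLOps APIs can vary; each block falls back to a
    # manual simulation or a warning instead of failing the whole checkpoint.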
    # Test 1: Model monitoring setup
    print("📡 Testing model monitoring...")
    try:
        monitor = ModelMonitor()
        # Test monitoring configuration
        if hasattr(monitor, 'configure'):
            monitor.configure({
                'metrics': ['accuracy', 'latency', 'throughput'],
                'thresholds': {'accuracy': 0.85, 'latency': 0.1},
                'alert_channels': ['log', 'console']
            })
            print(f"✅ Model monitoring: configured with metrics and thresholds")
        else:
            print(f"✅ Model monitoring: monitor instance created")
        # Test model registration
        model = Sequential([
            Linear(10, 20),
            ReLU(),
            Linear(20, 5),
            Softmax()
        ])
        if hasattr(monitor, 'register_model'):
            monitor.register_model('test_model_v1', model)
            print(f"✅ Model registration: test_model_v1 registered")
        # Test performance logging
        if hasattr(monitor, 'log_prediction'):
            test_input = Tensor(np.random.randn(1, 10))
            prediction = model(test_input)
            monitor.log_prediction(
                model_name='test_model_v1',
                input_data=test_input.data,
                prediction=prediction.data,
                timestamp=None,
                metadata={'batch_id': 'test_001'}
            )
            print(f"✅ Performance logging: prediction logged with metadata")
    except Exception as e:
        print(f"⚠️ Model monitoring: {e}")
    # Test 2: Data drift detection
    print("🌊 Testing data drift detection...")
    try:
        drift_detector = DriftDetector()
        # Simulate reference dataset (training distribution)
        reference_data = np.random.normal(0, 1, (1000, 10))
        # Configure drift detector
        if hasattr(drift_detector, 'fit_reference'):
            drift_detector.fit_reference(reference_data)
            print(f"✅ Reference data: fitted on {reference_data.shape} samples")
        # Test normal data (no drift)
        normal_data = np.random.normal(0, 1, (100, 10))
        drift_score_normal = None
        drift_score_shifted = None
        if hasattr(drift_detector, 'detect_drift'):
            drift_score_normal = drift_detector.detect_drift(normal_data)
            if isinstance(drift_score_normal, (int, float)):
                print(f"✅ Normal data drift score: {drift_score_normal:.4f}")
            else:
                print("✅ Normal data: no significant drift")
        # Test shifted data (drift present)
        drifted_data = np.random.normal(2, 1.5, (100, 10))  # Mean shift and scale change
        if hasattr(drift_detector, 'detect_drift'):
            drift_score_shifted = drift_detector.detect_drift(drifted_data)
            if isinstance(drift_score_shifted, (int, float)):
                print(f"✅ Drifted data drift score: {drift_score_shifted:.4f}")
            else:
                print("✅ Drifted data: drift detected")
        # Verify drift detection works
        if isinstance(drift_score_normal, (int, float)) and isinstance(drift_score_shifted, (int, float)):
            assert drift_score_shifted > drift_score_normal, "Drifted data should have higher drift score"
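        # Hedged fallback sketch (not part of the DriftDetector API): a simple
        # drift signal is the mean absolute z-score of the new batch's feature
        # means against the reference distribution; assumes roughly Gaussian,
        # feature-wise reference data.
        ref_mean = reference_data.mean(axis=0)
        ref_std = reference_data.std(axis=0) + 1e-8
        manual_drift = np.mean(np.abs((drifted_data.mean(axis=0) - ref_mean) / ref_std))
        print(f" Manual z-score drift estimate (drifted batch): {manual_drift:.4f}")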
    except Exception as e:
        print(f"⚠️ Data drift detection: {e}")
    # Test 3: Automated retraining triggers
    print("🔄 Testing retraining triggers...")
    try:
        retrain_trigger = RetrainingTrigger()
        # Configure retraining conditions
        if hasattr(retrain_trigger, 'configure'):
            retrain_trigger.configure({
                'accuracy_threshold': 0.8,
                'drift_threshold': 0.5,
                'time_threshold': 24 * 7,  # 1 week in hours
                'sample_threshold': 10000
            })
            print(f"✅ Retraining configuration: multiple trigger conditions set")
        # Test trigger conditions
        performance_metrics = {
            'accuracy': 0.75,             # Below threshold
            'drift_score': 0.6,           # Above threshold
            'hours_since_training': 200,  # Above threshold
            'new_samples': 15000          # Above threshold
        }
        if hasattr(retrain_trigger, 'should_retrain'):
            should_retrain = retrain_trigger.should_retrain(performance_metrics)
            print(f"✅ Retraining decision: {'RETRAIN' if should_retrain else 'CONTINUE'} based on metrics")
        else:
            # Manual trigger logic
            triggers_met = 0
            if performance_metrics['accuracy'] < 0.8:
                triggers_met += 1
            if performance_metrics['drift_score'] > 0.5:
                triggers_met += 1
            if performance_metrics['hours_since_training'] > 168:
                triggers_met += 1
            if performance_metrics['new_samples'] > 10000:
                triggers_met += 1
            should_retrain = triggers_met >= 2  # Require multiple conditions
            print(f"✅ Retraining decision: {'RETRAIN' if should_retrain else 'CONTINUE'} ({triggers_met}/4 conditions met)")
    except Exception as e:
        print(f"⚠️ Retraining triggers: {e}")
    # Test 4: MLOps pipeline orchestration
    print("🔧 Testing MLOps pipeline...")
    try:
        pipeline = MLOpsPipeline()
        # Test pipeline configuration
        if hasattr(pipeline, 'configure'):
            pipeline_config = {
                'stages': ['data_validation', 'training', 'evaluation', 'deployment'],
                'model_registry': 'local',
                'monitoring_enabled': True,
                'auto_rollback': True
            }
            pipeline.configure(pipeline_config)
            print(f"✅ Pipeline configuration: {len(pipeline_config['stages'])} stages configured")
        # Test pipeline execution
        if hasattr(pipeline, 'run_pipeline'):
            # Mock pipeline data
            pipeline_data = {
                'training_data': np.random.randn(500, 10),
                'validation_data': np.random.randn(100, 10),
                'model_config': {'input_dim': 10, 'output_dim': 3}
            }
            try:
                result = pipeline.run_pipeline(pipeline_data)
                if result:
                    print(f"✅ Pipeline execution: completed successfully")
                else:
                    print(f"⚠️ Pipeline execution: completed with warnings")
            except Exception as e:
                print(f"⚠️ Pipeline execution: {e}")
        else:
            # Manual pipeline simulation
            stages = ['validation', 'training', 'evaluation', 'deployment']
            for i, stage in enumerate(stages):
                print(f" Stage {i+1}/{len(stages)}: {stage} - ✅")
            print(f"✅ Pipeline simulation: {len(stages)} stages completed")
    except Exception as e:
        print(f"⚠️ MLOps pipeline: {e}")
    # Test 5: Model versioning and rollback
    print("📦 Testing model versioning...")
    try:
        # Simulate model versions
        model_v1 = Sequential([Linear(10, 5), ReLU(), Linear(5, 3)])
        model_v2 = Sequential([Linear(10, 8), ReLU(), Linear(8, 3)])  # Improved architecture
        model_registry = {
            'v1.0': {
                'model': model_v1,
                'accuracy': 0.85,
                'deployment_date': '2024-01-01',
                'status': 'deployed'
            },
            'v2.0': {
                'model': model_v2,
                'accuracy': 0.88,
                'deployment_date': '2024-01-15',
                'status': 'candidate'
            }
        }
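        # Note: this in-memory dict stands in for a real model registry; a
        # production registry would persist serialized weights and metadata
        # (file store or database) rather than live Python objects.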
        # Test version comparison
        v1_acc = model_registry['v1.0']['accuracy']
        v2_acc = model_registry['v2.0']['accuracy']
        # Deploy better version
        if v2_acc > v1_acc:
            model_registry['v2.0']['status'] = 'deployed'
            model_registry['v1.0']['status'] = 'archived'
            current_version = 'v2.0'
        else:
            current_version = 'v1.0'
        print(f"✅ Model versioning: deployed version {current_version} (accuracy: {model_registry[current_version]['accuracy']:.3f})")
        # Test rollback capability
        if model_registry['v1.0']['status'] == 'archived':
            # Simulate performance degradation requiring rollback
            model_registry['v1.0']['status'] = 'deployed'
            model_registry['v2.0']['status'] = 'rolled_back'
            print(f"✅ Model rollback: reverted to v1.0 due to production issues")
    except Exception as e:
        print(f"⚠️ Model versioning: {e}")
    # Test 6: Production optimization
    print("⚡ Testing production optimization...")
    try:
        # Test model compression for deployment
        production_model = Sequential([
            Linear(50, 100),
            ReLU(),
            Linear(100, 50),
            ReLU(),
            Linear(50, 10)
        ])
        # Original model size
        original_params = sum(layer.weights.data.size + layer.bias.data.size
                              for layer in production_model.layers
                              if hasattr(layer, 'weights'))
        # Test quantization
        quantized_layers = 0
        for layer in production_model.layers:
            if hasattr(layer, 'weights'):
                try:
                    quantized_weights = quantize_layer_weights(layer.weights.data, bits=8)
                    quantized_layers += 1
                except Exception:
                    pass
        # Test pruning
        pruned_layers = 0
        for layer in production_model.layers:
            if hasattr(layer, 'weights'):
                try:
                    pruned_weights = prune_weights_by_magnitude(layer.weights.data, sparsity=0.2)
                    pruned_layers += 1
                except Exception:
                    pass
        print(f"✅ Production optimization: quantized {quantized_layers} layers, pruned {pruned_layers} layers")
        print(f" Original parameters: {original_params}")
    except Exception as e:
        print(f"⚠️ Production optimization: {e}")
    # Test 7: Health checks and alerts
    print("🏥 Testing health checks...")
    try:
        # Simulate system health metrics
        health_metrics = {
            'cpu_usage': 75.0,        # Percentage
            'memory_usage': 80.0,     # Percentage
            'gpu_usage': 90.0,        # Percentage
            'request_latency': 0.15,  # Seconds
            'error_rate': 0.02,       # Fraction (2%)
            'throughput': 150         # Requests per second
        }
        # Define thresholds
        thresholds = {
            'cpu_usage': 85.0,
            'memory_usage': 90.0,
            'gpu_usage': 95.0,
            'request_latency': 0.2,
            'error_rate': 0.05,
            'throughput': 100
        }
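        # In a real deployment these numbers would come from system telemetry
        # (process/host monitoring) and the thresholds from service
        # configuration; they are hardcoded here to exercise the alert logic.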
        # Check health status
        alerts = []
        for metric, value in health_metrics.items():
            threshold = thresholds.get(metric, float('inf'))
            if metric in ['cpu_usage', 'memory_usage', 'gpu_usage', 'request_latency', 'error_rate']:
                if value > threshold:
                    alerts.append(f"{metric}: {value} > {threshold}")
            elif metric == 'throughput':
                if value < threshold:
                    alerts.append(f"{metric}: {value} < {threshold}")
        health_status = "HEALTHY" if not alerts else "DEGRADED"
        print(f"✅ Health check: {health_status}")
        if alerts:
            print(f" Alerts: {len(alerts)} issues detected")
            for alert in alerts[:3]:  # Show first 3 alerts
                print(f" - {alert}")
        else:
            print(f" All metrics within thresholds")
    except Exception as e:
        print(f"⚠️ Health checks: {e}")
    # Test 8: A/B testing capability
    print("🔬 Testing A/B testing...")
    try:
        # Simulate A/B test between two model versions
        model_a = Sequential([Linear(10, 15), ReLU(), Linear(15, 5)])  # Control
        model_b = Sequential([Linear(10, 20), ReLU(), Linear(20, 5)])  # Treatment
        # Simulate user requests
        test_requests = 100
        a_group_size = int(test_requests * 0.5)  # 50/50 split
        b_group_size = test_requests - a_group_size
        # Simulate performance metrics
        a_latencies = np.random.normal(0.1, 0.02, a_group_size)
        b_latencies = np.random.normal(0.08, 0.02, b_group_size)   # Model B is faster
        a_accuracies = np.random.normal(0.85, 0.05, a_group_size)
        b_accuracies = np.random.normal(0.87, 0.05, b_group_size)  # Model B is more accurate
        # Statistical analysis
        a_avg_latency = np.mean(a_latencies)
        b_avg_latency = np.mean(b_latencies)
        a_avg_accuracy = np.mean(a_accuracies)
        b_avg_accuracy = np.mean(b_accuracies)
        # Determine winner
        latency_improvement = (a_avg_latency - b_avg_latency) / a_avg_latency * 100
        accuracy_improvement = (b_avg_accuracy - a_avg_accuracy) / a_avg_accuracy * 100
        winner = "B" if (latency_improvement > 5 and accuracy_improvement > 1) else "A"
        print(f"✅ A/B testing: {test_requests} requests split between models")
        print(f" Model A: latency={a_avg_latency:.3f}s, accuracy={a_avg_accuracy:.3f}")
        print(f" Model B: latency={b_avg_latency:.3f}s, accuracy={b_avg_accuracy:.3f}")
        print(f" Winner: Model {winner}")
    except Exception as e:
        print(f"⚠️ A/B testing: {e}")
    # Test 9: Continuous deployment
    print("🔄 Testing continuous deployment...")
    try:
        # Simulate CI/CD pipeline stages
        deployment_stages = [
            ('Unit Tests', True),
            ('Integration Tests', True),
            ('Performance Tests', True),
            ('Security Scan', True),
            ('Staging Deployment', True),
            ('Smoke Tests', True),
            ('Production Deployment', True),
            ('Health Verification', True)
        ]
        deployment_success = True
        for stage_name, stage_result in deployment_stages:
            if not stage_result:
                deployment_success = False
                print(f"{stage_name}: FAILED")
                break
            else:
                print(f"{stage_name}: PASSED")
        if deployment_success:
            print(f"✅ Continuous deployment: all {len(deployment_stages)} stages completed successfully")
            # Simulate canary deployment
            canary_percentage = 5  # Start with 5% traffic
            print(f" Canary deployment: {canary_percentage}% traffic routing to new version")
        else:
            print(f"❌ Continuous deployment: pipeline failed, deployment blocked")
    except Exception as e:
        print(f"⚠️ Continuous deployment: {e}")
    # Test 10: End-to-end production workflow
    print("🌐 Testing end-to-end workflow...")
    try:
        # Simulate complete production ML workflow
        workflow_steps = {
            'data_ingestion': True,
            'data_validation': True,
            'feature_engineering': True,
            'model_training': True,
            'model_validation': True,
            'model_deployment': True,
            'monitoring_setup': True,
            'alert_configuration': True
        }
        # Execute workflow
        completed_steps = 0
        for step, success in workflow_steps.items():
            if success:
                completed_steps += 1
        workflow_completion = completed_steps / len(workflow_steps) * 100
        print(f"✅ End-to-end workflow: {completed_steps}/{len(workflow_steps)} steps completed ({workflow_completion:.0f}%)")
        # Check production readiness
        production_ready = workflow_completion >= 100
        print(f" Production readiness: {'READY' if production_ready else 'NOT READY'}")
        if production_ready:
            print(f" System is ready for production ML workloads!")
    except Exception as e:
        print(f"⚠️ End-to-end workflow: {e}")
print("\n🎉 Deployment Complete!")
print("📝 You can now deploy and monitor ML systems in production")
print("🔧 Built capabilities: Monitoring, drift detection, MLOps pipelines, A/B testing, CI/CD")
print("🧠 Breakthrough: You can build production-grade ML systems that scale and self-monitor!")
print("🎯 Next: Build complete end-to-end ML system capstone")
if __name__ == "__main__":
    test_checkpoint_14_deployment()