#!/usr/bin/env python
"""
Working Training Example - Proper Solution
===========================================
This shows how to make training work with the current architecture.
The key: ensure Variables maintain connection to Parameters.
"""

import numpy as np
import sys
sys.path.insert(0, '.')

from tinytorch.core.tensor import Tensor, Parameter
from tinytorch.core.autograd import Variable


class WorkingLinear:
    """Affine layer computing ``x @ weights + bias`` on autograd Variables.

    The layer owns two ``Parameter`` objects (handed to the optimizer through
    :meth:`parameters`) and one long-lived ``Variable`` wrapper around each of
    them. The wrappers, not the raw parameters, take part in the forward
    computation. NOTE(review): the file's design notes say the wrappers route
    gradients back to the wrapped Parameter via ``_source_tensor``; that
    mechanism lives in ``tinytorch`` and is not visible here.
    """

    def __init__(self, in_features, out_features):
        """Create randomly initialised parameters and their Variable wrappers.

        Args:
            in_features: Size of the last axis of the expected input.
            out_features: Size of the last axis of the produced output.
        """
        # Standard-normal draws scaled by 0.1; weights are drawn before the
        # bias so the RNG consumption order is fixed.
        initial_weights = np.random.randn(in_features, out_features) * 0.1
        self.weights = Parameter(initial_weights)
        initial_bias = np.random.randn(out_features) * 0.1
        self.bias = Parameter(initial_bias)

        # Wrappers are built once and reused by every forward pass.
        self._weight_var = Variable(self.weights)
        self._bias_var = Variable(self.bias)

    def forward(self, x):
        """Apply the affine map to ``x`` and return the resulting Variable.

        Args:
            x: A ``Variable``, or any value ``Variable(...)`` accepts; a
                non-Variable input is wrapped with ``requires_grad=False``.

        Returns:
            The result of ``x @ weights + bias`` as produced by the Variable
            operators.
        """
        inputs = x if isinstance(x, Variable) else Variable(x, requires_grad=False)
        projected = inputs @ self._weight_var
        return projected + self._bias_var

    def parameters(self):
        """Return ``[weights, bias]`` — the raw Parameters, not the wrappers."""
        return [self.weights, self.bias]

    def __call__(self, x):
        """Make the layer callable; equivalent to :meth:`forward`."""
        return self.forward(x)


def sigmoid_variable(x):
    """Apply the logistic sigmoid element-wise and return a new Variable.

    Args:
        x: A ``Variable``, or any value ``Variable(...)`` accepts (it is
            wrapped with the constructor's defaults).

    Returns:
        A ``Variable`` holding ``1 / (1 + exp(-x))`` whose ``requires_grad``
        mirrors the input's and whose ``grad_fn`` pushes the gradient back
        into the input.
    """
    source = x if isinstance(x, Variable) else Variable(x)

    # Forward value; presumably ``source.data.data`` is the raw numpy array
    # (Variable -> Tensor -> ndarray) — confirm against tinytorch.
    activation = 1.0 / (1.0 + np.exp(-source.data.data))

    def _backward(grad_output):
        # d(sigmoid)/dx = s * (1 - s), reusing the cached forward value.
        local_slope = activation * (1 - activation)
        source.backward(Variable(local_slope * grad_output.data.data))

    return Variable(
        activation,
        requires_grad=source.requires_grad,
        grad_fn=_backward,
    )


def relu_variable(x):
    """Apply ReLU (``max(0, x)``) element-wise and return a new Variable.

    Args:
        x: A ``Variable``, or any value ``Variable(...)`` accepts (it is
            wrapped with the constructor's defaults).

    Returns:
        A ``Variable`` holding the rectified values whose ``requires_grad``
        mirrors the input's and whose ``grad_fn`` pushes the gradient back
        into the input.
    """
    source = x if isinstance(x, Variable) else Variable(x)

    rectified = np.maximum(0, source.data.data)

    def _backward(grad_output):
        # The gradient passes through only where the input is strictly
        # positive. The mask is read from the input at backward time.
        passthrough = source.data.data > 0
        source.backward(Variable(passthrough * grad_output.data.data))

    return Variable(
        rectified,
        requires_grad=source.requires_grad,
        grad_fn=_backward,
    )


class WorkingMSE:
    """Mean-squared-error loss whose backward pass honours the chain rule.

    Calling an instance returns a scalar-valued ``Variable`` holding
    ``mean((pred - target) ** 2)``. Its ``grad_fn`` sends
    ``upstream * 2 * (pred - target) / n`` into ``pred.backward``, where
    ``n`` is the number of elements in the squared difference.
    """

    def __call__(self, pred, target):
        """Compute the loss and attach its backward function.

        Args:
            pred: Predictions; a ``Variable`` or anything ``Variable(...)``
                accepts.
            target: Targets; a ``Variable`` or anything ``Variable(...)``
                accepts. A non-Variable target is wrapped with
                ``requires_grad=False``.

        Returns:
            A ``Variable`` with ``requires_grad=True`` holding the mean
            squared error.
        """
        # Convert to Variables
        if not isinstance(pred, Variable):
            pred = Variable(pred)
        if not isinstance(target, Variable):
            target = Variable(target, requires_grad=False)

        # Forward: MSE = mean((pred - target)^2)
        diff = pred - target
        squared = diff * diff

        # Manual mean
        n = squared.data.data.size
        loss_val = np.mean(squared.data.data)

        def grad_fn(grad_output=None):
            """Send d(loss)/d(pred), scaled by the upstream gradient, to pred."""
            # Local gradient: 2 * (pred - target) / n
            grad = 2.0 * (pred.data.data - target.data.data) / n

            if grad_output is not None:
                # Unwrap Variable -> Tensor -> ndarray without assuming how
                # many wrapper layers the caller supplied.
                upstream = grad_output
                while hasattr(upstream, "data") and not isinstance(
                    upstream, (np.ndarray, np.generic)
                ):
                    upstream = upstream.data

                # Cast to the local gradient's dtype so a float64 "ones"
                # seed does not silently promote a float32 gradient.
                scale = np.asarray(upstream, dtype=np.result_type(grad))
                if scale.size == 1:
                    # The loss is a scalar: collapse (1,)/(1, 1) seeds so the
                    # gradient keeps exactly the shape it had before scaling.
                    scale = scale.reshape(())
                grad = grad * scale

            pred.backward(Variable(grad))

        return Variable(loss_val, requires_grad=True, grad_fn=grad_fn)


class WorkingSGD:
    """Plain stochastic gradient descent: ``p.data -= lr * p.grad.data``.

    Each parameter must expose a ``data`` attribute and a ``grad`` attribute;
    ``grad`` is either ``None`` or an object with its own ``data`` attribute.
    """

    def __init__(self, params, lr=0.01):
        """Store the parameters to optimise and the learning rate.

        Args:
            params: Any iterable of parameters. It is copied into a list so
                one-shot iterables (generators, ``chain`` objects) survive
                the repeated passes made by :meth:`zero_grad` and
                :meth:`step`.
            lr: Step size applied to every gradient.
        """
        # A generator stored as-is would be exhausted by the first
        # zero_grad() call, leaving step() with nothing to update.
        self.params = list(params)
        self.lr = lr

    def zero_grad(self):
        """Clear every parameter's gradient by setting it to ``None``."""
        for p in self.params:
            p.grad = None

    def step(self):
        """Apply one descent step to every parameter that has a gradient."""
        for p in self.params:
            if p.grad is not None:
                # Rebinds ``p.data`` rather than mutating it in place.
                p.data = p.data - self.lr * p.grad.data


def train_xor_working():
    """Train a 2-8-1 ReLU/sigmoid network on XOR and print a report.

    Runs 1000 full-batch SGD epochs (lr=0.5) with an MSE loss. It prints
    per-parameter gradient norms after the first backward pass, the loss
    every 200 epochs, then the final predictions and thresholded accuracy.
    Returns ``None``.
    """
    banner = "=" * 60
    print(banner)
    print("WORKING XOR TRAINING")
    print(banner)

    # The four XOR input rows and their targets.
    X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
    y = np.array([[0], [1], [1], [0]], dtype=np.float32)

    layer1 = WorkingLinear(2, 8)
    layer2 = WorkingLinear(8, 1)

    def run_network(inputs):
        # Linear -> ReLU -> Linear -> sigmoid, shared by training and eval.
        hidden = relu_variable(layer1(Tensor(inputs)))
        return sigmoid_variable(layer2(hidden))

    params = layer1.parameters() + layer2.parameters()
    optimizer = WorkingSGD(params, lr=0.5)
    criterion = WorkingMSE()

    losses = []
    for epoch in range(1000):
        output = run_network(X)

        loss = criterion(output, Tensor(y))
        loss_val = float(loss.data.data)
        losses.append(loss_val)

        # Gradients are cleared immediately before they are recomputed.
        optimizer.zero_grad()
        loss.backward()

        # One-off sanity check that every parameter received a gradient.
        if epoch == 0:
            print("Gradient check:")
            for i, p in enumerate(params):
                if p.grad is None:
                    print(f"  Param {i}: NO GRADIENT!")
                    continue
                grad_norm = np.linalg.norm(p.grad.data)
                print(f"  Param {i}: gradient norm = {grad_norm:.4f}")

        optimizer.step()

        if epoch % 200 == 0:
            print(f"Epoch {epoch:4d}: Loss = {loss_val:.4f}")

    print("\nFinal predictions:")
    predictions = run_network(X).data.data
    for x_val, pred, target in zip(X, predictions, y):
        print(f"  {x_val} → {pred[0]:.3f} (target: {target[0]})")

    # Threshold at 0.5 and compare against the targets.
    binary_preds = (predictions > 0.5).astype(int)
    accuracy = np.mean(binary_preds == y)
    print(f"\nAccuracy: {accuracy*100:.0f}%")

    if accuracy == 1.0:
        verdict = "✅ XOR learned perfectly!"
    elif accuracy >= 0.75:
        verdict = "✅ XOR learned well!"
    else:
        verdict = "⚠️ XOR partially learned"
    print(verdict)


def train_linear_regression_working():
    """Fit ``y = 2x + 1`` with a single 1-in/1-out linear layer and report.

    Runs 200 full-batch SGD epochs (lr=0.01) with an MSE loss. It prints the
    initial and final weight/bias and the loss every 50 epochs, plus a
    success line when both parameters land within 0.1 of their targets.
    Returns ``None``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("WORKING LINEAR REGRESSION")
    print(rule)

    # Four samples drawn from y = 2x + 1.
    X = np.array([[1], [2], [3], [4]], dtype=np.float32)
    y = np.array([[3], [5], [7], [9]], dtype=np.float32)

    model = WorkingLinear(1, 1)
    print(f"Initial: weight={model.weights.data[0,0]:.3f}, bias={model.bias.data[0]:.3f}")

    optimizer = WorkingSGD(model.parameters(), lr=0.01)
    criterion = WorkingMSE()

    for epoch in range(200):
        output = model(Tensor(X))
        loss = criterion(output, Tensor(y))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress is reported on every 50th epoch only.
        if epoch % 50 != 0:
            continue
        loss_val = float(loss.data.data)
        print(f"Epoch {epoch:3d}: Loss = {loss_val:.4f}")

    print(f"Final: weight={model.weights.data[0,0]:.3f}, bias={model.bias.data[0]:.3f}")
    print("Target: weight=2.000, bias=1.000")

    # Absolute distance of each learned parameter from its true value.
    w_err = abs(model.weights.data[0,0] - 2.0)
    b_err = abs(model.bias.data[0] - 1.0)

    if all(err < 0.1 for err in (w_err, b_err)):
        print("✅ Linear regression learned perfectly!")


if __name__ == "__main__":
    # Run the easy regression demo first, then the XOR demo, separated by a
    # blank line.
    train_linear_regression_working()
    print()
    train_xor_working()

    # Closing summary banner followed by the explanatory text.
    divider = "=" * 60
    print("\n" + divider)
    print("KEY INSIGHT")
    print(divider)
    print("""
The working solution shows that we need:

1. Variables that maintain connection to source Parameters (_source_tensor)
2. Operations between Variables that create new Variables with grad_fn
3. Backward pass that propagates gradients back to original Parameters

The current TinyTorch architecture CAN work, but layers need to:
- Keep Variable versions of parameters that maintain connections
- Use these Variables in forward passes
- Return Variables, not Tensors

This is why PyTorch unified Tensor and Variable - to avoid this complexity!
""")