"""
MLPerf EDU: Micro-DLRM (Cloud Division)

A scaled-down Deep Learning Recommendation Model for click-through rate
prediction, mapping the MLPerf Training DLRM benchmark to laptop scale.

Architecture:
    Dense features → Bottom MLP → dense embedding
    Sparse features → EmbeddingBag tables → sparse embeddings
    [dense_emb, sparse_embs] → concatenate → Top MLP → sigmoid → CTR

The model demonstrates the unique memory access pattern of recommendation:
- Sparse embeddings → memory bandwidth bound (random lookups)
- Dense MLP → compute bound (matrix multiplications)

Dataset: MovieLens-100K (Harper & Konstan, 2015)
    - 100,000 ratings from 943 users on 1,682 movies
    - Binarized at threshold 4: rating >= 4 → positive click
    - Ships locally in data/movielens/ml-100k/ (5 MB)

Quality Target: Acc > 0.70 on MovieLens binary click prediction (best val ~71%)

Provenance: Naumov et al. 2019, "Deep Learning Recommendation Model"
"""

import torch
import torch.nn as nn


class MicroDLRMWhiteBox(nn.Module):
    """
    Micro-scale DLRM for MovieLens-100K recommendation.

    Implements the core DLRM pattern: separate processing of dense
    (continuous) and sparse (categorical) features, followed by
    feature interaction and CTR prediction.

    Default embedding sizes match MovieLens-100K:
    - user_id: 943 users
    - item_id: 1682 items
    - occupation: 21 categories

    Submodules:
        emb_l: one ``nn.EmbeddingBag`` (sum mode) per categorical feature.
        bot_l: bottom MLP applied to the dense features.
        top_l: top MLP ending in a sigmoid, applied to the concatenated
            dense and sparse representations.
    """

    def __init__(self,
                 m_spa=8,
                 num_embeddings=(943, 1682, 21),
                 ln_bot=(16, 8, 8),
                 ln_top=(32, 16, 1)):
        """
        Args:
            m_spa: width of every sparse embedding vector.
            num_embeddings: number of rows in each embedding table, one
                entry per categorical feature.
            ln_bot: layer widths of the bottom MLP; ``ln_bot[0]`` is the
                dense input width and ``ln_bot[-1]`` its output width.
                A single entry yields an identity bottom MLP.
            ln_top: layer widths of the top MLP *after* the interaction
                layer; the input width is derived automatically and
                ``ln_top[-1]`` is the output width.

        Raises:
            ValueError: if ``ln_bot`` or ``ln_top`` is empty.
        """
        super().__init__()

        # Snapshot the specs as tuples: the defaults are immutable and a
        # caller-owned list can be mutated later without affecting us.
        num_embeddings = tuple(num_embeddings)
        ln_bot = tuple(ln_bot)
        ln_top = tuple(ln_top)
        if not ln_bot:
            raise ValueError("ln_bot must contain at least one layer width")
        if not ln_top:
            raise ValueError("ln_top must contain at least one layer width")

        # Sparse: embedding tables for categorical features
        self.emb_l = nn.ModuleList([
            nn.EmbeddingBag(n, m_spa, mode="sum", sparse=False)
            for n in num_embeddings
        ])

        # Dense: bottom MLP for continuous features (Linear + ReLU per layer)
        bot_layers = []
        for fan_in, fan_out in zip(ln_bot[:-1], ln_bot[1:]):
            bot_layers.append(nn.Linear(fan_in, fan_out))
            bot_layers.append(nn.ReLU())
        self.bot_l = nn.Sequential(*bot_layers)

        # Feature interaction: concat dense output + all sparse embeddings
        cross_dim = ln_bot[-1] + len(num_embeddings) * m_spa

        # Top MLP: CTR prediction. Hidden layers use ReLU; the final layer
        # uses a sigmoid so the output is a probability.
        top_dims = (cross_dim,) + ln_top
        last = len(ln_top) - 1
        top_layers = []
        for k, (fan_in, fan_out) in enumerate(zip(top_dims[:-1], top_dims[1:])):
            top_layers.append(nn.Linear(fan_in, fan_out))
            top_layers.append(nn.Sigmoid() if k == last else nn.ReLU())
        self.top_l = nn.Sequential(*top_layers)

    @staticmethod
    def _default_offsets(indices):
        """Return offsets for ``indices`` when the caller supplied none.

        1-D indices are treated as one index per bag (offsets 0..N-1).
        For any other rank ``None`` is returned, which is what
        ``nn.EmbeddingBag`` expects for 2-D ``(B, L)`` input.
        """
        if indices.dim() == 1:
            # Match dtype/device of the indices so EmbeddingBag accepts them.
            return torch.arange(indices.numel(),
                                dtype=indices.dtype,
                                device=indices.device)
        return None

    def forward(self, dense_x, sparse_indices, sparse_offsets=None):
        """
        Args:
            dense_x: (B, ln_bot[0]) continuous features ((B, 16) by default)
            sparse_indices: sequence with one index tensor per embedding
                table; (B,) for one index per sample, or (B, L) for
                fixed-size bags
            sparse_offsets: sequence with one offset tensor per embedding
                table, as expected by EmbeddingBag. Optional: when
                omitted, 1-D indices are treated as one index per sample
                and 2-D indices as one bag per row.

        Returns:
            (B, ln_top[-1]) click-through probability ((B, 1) by default)
        """
        # Process dense features through bottom MLP
        x_dense = self.bot_l(dense_x)

        # Lookup sparse embeddings, one table at a time
        x_sparse = []
        for i, emb in enumerate(self.emb_l):
            indices = sparse_indices[i]
            if sparse_offsets is None:
                offsets = self._default_offsets(indices)
            else:
                offsets = sparse_offsets[i]
            x_sparse.append(emb(indices, offsets))

        # Feature interaction: concatenate dense + sparse
        # NOTE: The official DLRM uses dot-product interaction:
        #   T = stack([x_dense] + x_sparse)  # (B, n_features, embed_dim)
        #   Z = bmm(T, T.transpose(1,2))     # (B, n, n) pairwise interactions
        #   flat = Z[triu_indices]            # upper triangle features
        # We use concat for simplicity. Switching to dot-product interaction
        # is a pedagogical exercise that exposes feature crossing and the
        # compute vs. memory tradeoff in sparse-dense architectures.
        interaction = torch.cat([x_dense] + x_sparse, dim=1)

        # Predict CTR
        return self.top_l(interaction)