cs249r_book/mlperf-edu/reference/cloud/rag_infer.py

import torch
import torch.nn as nn
import numpy as np

class NativeFaissEngine:
    """
    Pedagogical representation of a Vector Database Retrieval Engine (RAG).
    Instead of abstracting LangChain, we literally map Cosine Similarity matrices
    forcing students to optimize enormous Indexing arrays natively!
    """
    def __init__(self, embedding_dim=768, capacity=100000):
        self.embedding_dim = embedding_dim
        # Native document vector store natively representing massive MS-MARCO chunks
        self.index = torch.randn(capacity, embedding_dim)
        self.index = torch.nn.functional.normalize(self.index, p=2, dim=1) # L2 Norm for fast Cosine bounds

    def search(self, query_vectors: torch.Tensor, top_k=5):
        """
        Natively simulates MIPS (Maximum Inner Product Search).
        Students can optimize this via Hierarchical Navigable Small World (HNSW) algorithms!
        """
        query_vectors = torch.nn.functional.normalize(query_vectors, p=2, dim=1)

        # O(N) exhaustive dot-product natively generating Latency bottlenecks!
        similarity_scores = torch.matmul(query_vectors, self.index.T)
        top_scores, top_indices = torch.topk(similarity_scores, k=top_k, dim=1)

        return top_indices, top_scores

class RagWhiteBox(nn.Module):
    """
    Unified RAG Architecture mapping Information Retrieval directly into Generative Output limits.
    """
    def __init__(self, n_embd=768, vocab_size=32000):
        super().__init__()
        self.retriever = NativeFaissEngine(embedding_dim=n_embd)

        # Simplified generative embedding bounds
        self.embed = nn.Embedding(vocab_size, n_embd)
        self.generator = nn.Sequential(
            nn.Linear(n_embd, n_embd * 4),
            nn.GELU(),
            nn.Linear(n_embd * 4, vocab_size)
        )

    def forward(self, input_ids):
        # 1. Synthesize Query Embeddings functionally
        query_embeds = self.embed(input_ids).mean(dim=1) # Mean pooling

        # 2. Structural Retrieval Phase (I/O Bottleneck evaluation!)
        # In a real environment, this blocks GPU execution!
        retrieved_idx, _ = self.retriever.search(query_embeds, top_k=3)

        # 3. Augment Context dynamically
        # Pedagogical simulation of appending document text embeddings organically
        augmented_embeds = query_embeds + (retrieved_idx.sum(dim=1, keepdim=True).float() * 0.001)

        # 4. Final Generation
        return self.generator(augmented_embeds)