mlxrunner: Enforce model context limit
Currently, context length is unbounded: the cache will keep growing forever, independent of the model's trained context length. This caps it and enforces semantics similar to most cloud services:

- Long prompts will result in an error, not truncation.
- Generation that exceeds the context will be stopped.
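In effect, the change reduces context enforcement to two rules: a prompt at or past the limit is rejected outright, and the generation budget is clamped to whatever context the prompt leaves over. A minimal standalone sketch of those semantics (enforceContextLimit is an illustrative name, not a function in this patch):

// Sketch of the enforced semantics; enforceContextLimit is a hypothetical
// helper, not part of the patch below.
package main

import "fmt"

// enforceContextLimit returns the effective generation budget for a prompt
// of promptTokens tokens, or an error if the prompt alone already fills the
// model's trained context.
func enforceContextLimit(contextLength, promptTokens, maxTokens int) (int, error) {
	// Long prompts are an error, not a truncation.
	if promptTokens >= contextLength {
		return 0, fmt.Errorf("input length (%d tokens) exceeds the model's maximum context length (%d tokens)", promptTokens, contextLength)
	}
	// Generation may use at most the context the prompt left over.
	maxGenerate := contextLength - promptTokens
	if maxTokens <= 0 || maxTokens > maxGenerate {
		return maxGenerate, nil
	}
	return maxTokens, nil
}

func main() {
	if _, err := enforceContextLimit(4096, 5000, 512); err != nil {
		fmt.Println(err) // over-long prompt: rejected, not truncated
	}
	n, _ := enforceContextLimit(4096, 3900, 512)
	fmt.Println(n) // 196: generation stops at the context boundary
}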
@@ -8,7 +8,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"math"
 	"math/rand"
 	"net"
 	"net/http"

@@ -30,15 +29,16 @@ import (
 
 // Client wraps an MLX runner subprocess to implement llm.LlamaServer for LLM models.
 type Client struct {
-	port        int
-	modelName   string
-	memory      atomic.Uint64
-	done        chan error
-	client      *http.Client
-	lastErr     string
-	lastErrLock sync.Mutex
-	mu          sync.Mutex
-	cmd         *exec.Cmd
+	port          int
+	modelName     string
+	contextLength atomic.Int64
+	memory        atomic.Uint64
+	done          chan error
+	client        *http.Client
+	lastErr       string
+	lastErrLock   sync.Mutex
+	mu            sync.Mutex
+	cmd           *exec.Cmd
 }
 
 // NewClient spawns a new MLX runner subprocess for LLM models and waits until it's ready.

@@ -297,7 +297,7 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 }
 
 func (c *Client) ContextLength() int {
-	return math.MaxInt
+	return int(c.contextLength.Load())
 }
 
 // Detokenize implements llm.LlamaServer.

@@ -351,9 +351,10 @@ func (c *Client) Pid() int {
 }
 
 type statusResponse struct {
-	Status   int
-	Progress int
-	Memory   uint64
+	Status        int
+	Progress      int
+	ContextLength int
+	Memory        uint64
 }
 
 // Ping implements llm.LlamaServer.

@@ -376,7 +377,10 @@ func (c *Client) Ping(ctx context.Context) error {
 	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
 		return err
 	}
+
+	c.contextLength.Store(int64(status.ContextLength))
 	c.memory.Store(status.Memory)
+
 	return nil
 }

@@ -20,6 +20,7 @@ type Model interface {
 	Unembed(x *mlx.Array) *mlx.Array
 	NumLayers() int
 	Tokenizer() *tokenizer.Tokenizer
+	MaxContextLength() int
 
 	// LoadWeights receives all tensors loaded from the manifest and assigns
 	// them to model fields. Model-specific logic (MLA absorption, expert

@@ -6,9 +6,12 @@ import (
 	"bytes"
 	"context"
 	"errors"
+	"fmt"
 	"log/slog"
+	"net/http"
 	"time"
 
+	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/x/mlxrunner/mlx"
 )

@@ -51,9 +54,23 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		return errors.New("empty prompt")
 	}
 
+	if len(inputs) >= r.contextLength {
+		return api.StatusError{
+			StatusCode:   http.StatusBadRequest,
+			ErrorMessage: fmt.Sprintf("input length (%d tokens) exceeds the model's maximum context length (%d tokens)", len(inputs), r.contextLength),
+		}
+	}
+
+	// Cap generation to stay within the model's context length
+	maxGenerate := r.contextLength - len(inputs)
+	if request.Options.MaxTokens <= 0 {
+		request.Options.MaxTokens = maxGenerate
+	} else {
+		request.Options.MaxTokens = min(request.Options.MaxTokens, maxGenerate)
+	}
+
 	session := r.cache.begin(r.Model, inputs)
 	defer session.close()
 
 	caches := session.caches
 	tokens := session.remaining

@@ -45,10 +45,11 @@ type TextCompletionsRequest struct {
 }
 
 type Runner struct {
-	Model     base.Model
-	Tokenizer *tokenizer.Tokenizer
-	Requests  chan Request
-	cache     kvCache
+	Model         base.Model
+	Tokenizer     *tokenizer.Tokenizer
+	Requests      chan Request
+	cache         kvCache
+	contextLength int
 }
 
 func (r *Runner) Load(modelName string) error {

@@ -77,6 +78,7 @@ func (r *Runner) Load(modelName string) error {
 
 	r.Model = m
 	r.Tokenizer = m.Tokenizer()
+	r.contextLength = m.MaxContextLength()
 	return nil
 }

@@ -51,9 +51,10 @@ func Execute(args []string) error {
 	mux := http.NewServeMux()
 	mux.HandleFunc("GET /v1/status", func(w http.ResponseWriter, r *http.Request) {
 		if err := json.NewEncoder(w).Encode(statusResponse{
-			Status:   0,
-			Progress: 100,
-			Memory:   uint64(mlx.ActiveMemory() + mlx.CacheMemory()),
+			Status:        0,
+			Progress:      100,
+			ContextLength: runner.contextLength,
+			Memory:        uint64(mlx.ActiveMemory() + mlx.CacheMemory()),
 		}); err != nil {
 			slog.Error("Failed to encode response", "error", err)
 			http.Error(w, "Internal Server Error", http.StatusInternalServerError)

@@ -88,9 +89,6 @@ func Execute(args []string) error {
 	}
 
 	request.Options.MaxTokens = cmp.Or(request.Options.MaxTokens, request.Options.NumPredict)
-	if request.Options.MaxTokens < 1 {
-		request.Options.MaxTokens = 16 << 10
-	}
 
 	request.Pipeline = runner.TextGenerationPipeline
 	request.Sampler = sample.New(

@@ -430,6 +430,10 @@ func (m *Model) NumLayers() int {
 	return len(m.Layers)
 }
 
+func (m *Model) MaxContextLength() int {
+	return int(m.MaxPositionEmbeddings)
+}
+
 func (m *Model) Tokenizer() *tokenizer.Tokenizer {
 	return m.tok
 }

@@ -733,7 +733,7 @@ func (m *Model) Unembed(x *mlx.Array) *mlx.Array {
 func (m *Model) NumLayers() int { return len(m.Layers) }
 
 // MaxContextLength returns the maximum context length
-func (m *Model) MaxContextLength() int32 { return m.MaxPositionEmbeddings }
+func (m *Model) MaxContextLength() int { return int(m.MaxPositionEmbeddings) }
 
 // VocabSize returns the vocabulary size
 func (m *Model) VocabSize() int32 { return m.Config.VocabSize }

@@ -262,6 +262,10 @@ func (m *Model) NumLayers() int {
 	return len(m.Layers)
 }
 
+func (m *Model) MaxContextLength() int {
+	return int(m.MaxPositionEmbeddings)
+}
+
 func (m *Model) Tokenizer() *tokenizer.Tokenizer {
 	return m.tok
 }

@@ -279,6 +279,10 @@ func (m *Model) NumLayers() int {
 	return len(m.Layers)
 }
 
+func (m *Model) MaxContextLength() int {
+	return int(m.MaxPositionEmbeddings)
+}
+
 func (m *Model) Tokenizer() *tokenizer.Tokenizer {
 	return m.tok
 }