From dd5eb6337dab84d76d7edec2c102064504d75378 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 25 Feb 2026 17:03:31 -0800 Subject: [PATCH] mlxrunner: Fix panic on full KV cache hit When the entire prompt was already cached (e.g. repeated prompt), findRemaining returned an empty slice, causing FromValues to panic on an index-out-of-range accessing a zero-length byte slice. Fix by always keeping at least one token to re-evaluate so the pipeline can seed token generation. Also reject empty prompts early rather than panicking. --- x/mlxrunner/cache.go | 6 ++++++ x/mlxrunner/pipeline.go | 6 ++++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/x/mlxrunner/cache.go b/x/mlxrunner/cache.go index 0d858d91b..48f953b2c 100644 --- a/x/mlxrunner/cache.go +++ b/x/mlxrunner/cache.go @@ -78,6 +78,12 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 { prefix++ } + // Always keep at least one token to re-evaluate so the + // pipeline can seed token generation from it. + if prefix == len(tokens) && prefix > 0 { + prefix-- + } + if prefix < len(c.tokens) { trim := len(c.tokens) - prefix for _, kv := range c.caches { diff --git a/x/mlxrunner/pipeline.go b/x/mlxrunner/pipeline.go index 945f94755..ed068ed6b 100644 --- a/x/mlxrunner/pipeline.go +++ b/x/mlxrunner/pipeline.go @@ -47,6 +47,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error { mlx.ResetPeakMemory() inputs := r.Tokenizer.Encode(request.Prompt, true) + if len(inputs) == 0 { + return errors.New("empty prompt") + } + session := r.cache.begin(r.Model, inputs) defer session.close() @@ -54,7 +58,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error { tokens := session.remaining total, processed := len(tokens), 0 - slog.Info("Prompt processing progress", "processed", processed, "total", total) for total-processed > 1 { if err := request.Ctx.Err(); err != nil { return err @@ -104,7 +107,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error { nextSample, nextLogprobs = step(sample) if i == 0 { - slog.Info("Prompt processing progress", "processed", total, "total", total) mlx.Eval(sample) final.PromptTokensDuration = time.Since(now) now = time.Now()