diff --git a/x/mlxrunner/cache.go b/x/mlxrunner/cache.go index 0d858d91b..eb8b4b32e 100644 --- a/x/mlxrunner/cache.go +++ b/x/mlxrunner/cache.go @@ -78,6 +78,11 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 { prefix++ } + if prefix == len(tokens) && prefix > 0 { + // Leave one token to run through the model so we can sample a response. + prefix-- + } + if prefix < len(c.tokens) { trim := len(c.tokens) - prefix for _, kv := range c.caches {