mirror of
https://github.com/ollama/ollama.git
synced 2026-03-09 03:12:11 -05:00
mlxrunner: Fix panic on full KV cache hit
When the entire prompt was already cached (e.g. a repeated prompt), findRemaining returned an empty slice, causing FromValues to panic with an index-out-of-range error when accessing a zero-length byte slice. Fix by always keeping at least one token to re-evaluate so the pipeline can seed token generation. Also reject empty prompts early rather than panicking.
This commit is contained in:
@@ -78,6 +78,12 @@ func (c *kvCache) findRemaining(tokens []int32) []int32 {
|
||||
prefix++
|
||||
}
|
||||
|
||||
// Always keep at least one token to re-evaluate so the
|
||||
// pipeline can seed token generation from it.
|
||||
if prefix == len(tokens) && prefix > 0 {
|
||||
prefix--
|
||||
}
|
||||
|
||||
if prefix < len(c.tokens) {
|
||||
trim := len(c.tokens) - prefix
|
||||
for _, kv := range c.caches {
|
||||
|
||||
@@ -47,6 +47,10 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
mlx.ResetPeakMemory()
|
||||
|
||||
inputs := r.Tokenizer.Encode(request.Prompt, true)
|
||||
if len(inputs) == 0 {
|
||||
return errors.New("empty prompt")
|
||||
}
|
||||
|
||||
session := r.cache.begin(r.Model, inputs)
|
||||
defer session.close()
|
||||
|
||||
@@ -54,7 +58,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
tokens := session.remaining
|
||||
|
||||
total, processed := len(tokens), 0
|
||||
slog.Info("Prompt processing progress", "processed", processed, "total", total)
|
||||
for total-processed > 1 {
|
||||
if err := request.Ctx.Err(); err != nil {
|
||||
return err
|
||||
@@ -104,7 +107,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
nextSample, nextLogprobs = step(sample)
|
||||
|
||||
if i == 0 {
|
||||
slog.Info("Prompt processing progress", "processed", total, "total", total)
|
||||
mlx.Eval(sample)
|
||||
final.PromptTokensDuration = time.Since(now)
|
||||
now = time.Now()
|
||||
|
||||
Reference in New Issue
Block a user