From e36f389e8281da2532074ae3af53d3a7b1ca6ce5 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan
Date: Thu, 5 Feb 2026 12:48:25 -0800
Subject: [PATCH] scheduler: default parallel=1 for qwen3next/lfm (#14103)

---
 server/sched.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 3aa9969a0..67af5e8ce 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -417,9 +417,9 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
 		numParallel = 1
 	}
 
-	// `mllama`, `qwen3vl`, and `qwen3vlmoe` are snowflakes and uses an encoder cache which cannot be used with num_parallel > 1
+	// Some architectures are not safe with num_parallel > 1.
 	// ref: https://github.com/ollama/ollama/issues/4165
-	if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe"}, req.model.Config.ModelFamily) && numParallel != 1 {
+	if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen3next", "lfm2", "lfm2moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
 		numParallel = 1
 		slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
 	}