model: support for qwen3.5 architecture (#14378)

This commit is contained in:
Jeffrey Morgan
2026-02-24 20:08:05 -08:00
committed by GitHub
parent 9d902d63ce
commit da70c3222e
31 changed files with 1902 additions and 1628 deletions

View File

@@ -86,6 +86,11 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
ID: len(images),
Data: i,
}
images = append(images, imgData)
if m.Config.Renderer != "" {
continue
}
imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
if !strings.Contains(prompt, "[img]") {
@@ -93,8 +98,6 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
} else {
prompt = strings.Replace(prompt, "[img]", imgTag, 1)
}
images = append(images, imgData)
}
msgs[currMsgIdx+cnt].Content = prefix + prompt
}

View File

@@ -9,6 +9,7 @@ import (
"github.com/ollama/ollama/api"
"github.com/ollama/ollama/template"
"github.com/ollama/ollama/types/model"
)
func TestChatPrompt(t *testing.T) {
@@ -330,3 +331,38 @@ func TestChatPromptTokenizeCalls(t *testing.T) {
})
}
}
// TestChatPromptRendererDoesNotRewriteMessageContent calls chatPrompt with a
// model whose config names a renderer and a single user message carrying three
// images. It asserts that the caller's message content is left untouched, that
// all three images are returned, and that the resulting prompt is non-empty.
func TestChatPromptRendererDoesNotRewriteMessageContent(t *testing.T) {
	const userContent = "what do these photos have in common?"

	messages := []api.Message{{
		Role:    "user",
		Content: userContent,
		Images:  []api.ImageData{[]byte("img-1"), []byte("img-2"), []byte("img-3")},
	}}

	rendererModel := Model{
		Config:         model.ConfigV2{Renderer: "qwen3-vl-instruct"},
		ProjectorPaths: []string{"vision"},
	}
	options := api.Options{Runner: api.Runner{NumCtx: 8192}}
	thinkValue := &api.ThinkValue{Value: false}

	prompt, images, err := chatPrompt(t.Context(), &rendererModel, mockRunner{}.Tokenize, &options, messages, nil, thinkValue, true)
	if err != nil {
		t.Fatal(err)
	}

	// The message slice is shared with the caller, so any rewrite of the
	// content by chatPrompt would be visible here.
	if current := messages[0].Content; current != userContent {
		t.Fatalf("renderer path should not mutate message content: got %q, want %q", current, userContent)
	}

	if n := len(images); n != 3 {
		t.Fatalf("len(images) = %d, want %d", n, 3)
	}

	if len(prompt) == 0 {
		t.Fatal("prompt is empty")
	}
}

View File

@@ -6,6 +6,7 @@ import (
"log/slog"
"maps"
"os"
"slices"
"strings"
"unsafe"
@@ -61,7 +62,7 @@ func useMoreBits(iLayer, nLayers int) bool {
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}
func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
func qwen3LinearAttnQuantType(name string) (fsggml.TensorType, bool) {
switch {
// Full attention
case strings.HasSuffix(name, ".attn_q.weight"):
@@ -82,6 +83,10 @@ func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
// SSM
case strings.HasSuffix(name, ".ssm_ba.weight"):
return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_beta.weight"):
return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_alpha.weight"):
return fsggml.TensorTypeQ4_K, true
case strings.HasSuffix(name, ".ssm_out.weight"):
return fsggml.TensorTypeQ4_K, true
@@ -290,8 +295,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
newType := fsggml.TensorType(t.Kind)
if quantize {
if kv.Architecture() == "qwen3next" && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
if qt, ok := qwen3nextQuantType(name); ok {
if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
if qt, ok := qwen3LinearAttnQuantType(name); ok {
return qt
}
}

View File

@@ -166,6 +166,60 @@ func TestGetTensorNewType(t *testing.T) {
}
}
// TestQwen3LinearAttentionQuantOverride is a table-driven check of newType for
// an F16 tensor of shape 256x256 quantized with FileTypeQ4_K_M. The qwen35 and
// qwen35moe rows expect Q4_K for the listed tensors; the row with an unrelated
// architecture expects Q5_K for attn_qkv.
func TestQwen3LinearAttentionQuantOverride(t *testing.T) {
	type testCase struct {
		name     string
		arch     string
		tensor   string
		fileType fsggml.FileType
		want     fsggml.TensorType
	}

	tests := []testCase{
		{name: "qwen35_beta", arch: "qwen35", tensor: "blk.0.ssm_beta.weight", fileType: fsggml.FileTypeQ4_K_M, want: fsggml.TensorTypeQ4_K},
		{name: "qwen35_alpha", arch: "qwen35", tensor: "blk.0.ssm_alpha.weight", fileType: fsggml.FileTypeQ4_K_M, want: fsggml.TensorTypeQ4_K},
		{name: "qwen35moe_attn_qkv", arch: "qwen35moe", tensor: "blk.0.attn_qkv.weight", fileType: fsggml.FileTypeQ4_K_M, want: fsggml.TensorTypeQ4_K},
		{name: "non_qwen35_falls_back", arch: "foo", tensor: "blk.0.attn_qkv.weight", fileType: fsggml.FileTypeQ4_K_M, want: fsggml.TensorTypeQ5_K},
	}

	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			// Only the architecture key is set; every other KV lookup made by
			// newType sees an otherwise empty map.
			metadata := fsggml.KV{"general.architecture": tc.arch}
			input := &fsggml.Tensor{
				Name:  tc.tensor,
				Shape: []uint64{256, 256},
				Kind:  uint32(fsggml.TensorTypeF16),
			}

			got := newType(input, metadata, &quantizeState{}, tc.fileType)
			if got == tc.want {
				return
			}
			t.Fatalf("unexpected tensor type for %s (%s): got %s want %s", tc.tensor, tc.arch, got, tc.want)
		})
	}
}
func TestQuantizeModel(t *testing.T) {
cases := []struct {
name string

View File

@@ -447,7 +447,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
// Some architectures are not safe with num_parallel > 1.
// ref: https://github.com/ollama/ollama/issues/4165
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen3next", "lfm2", "lfm2moe", "nemotron_h", "nemotron_h_moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen35", "qwen35moe", "qwen3next", "lfm2", "lfm2moe", "nemotron_h", "nemotron_h_moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
numParallel = 1
slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
}