mirror of
https://github.com/ollama/ollama.git
synced 2026-04-29 15:38:27 -05:00
model: support for qwen3.5 architecture (#14378)
This commit is contained in:
@@ -86,6 +86,11 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
ID: len(images),
|
||||
Data: i,
|
||||
}
|
||||
images = append(images, imgData)
|
||||
|
||||
if m.Config.Renderer != "" {
|
||||
continue
|
||||
}
|
||||
|
||||
imgTag := fmt.Sprintf("[img-%d]", imgData.ID)
|
||||
if !strings.Contains(prompt, "[img]") {
|
||||
@@ -93,8 +98,6 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
} else {
|
||||
prompt = strings.Replace(prompt, "[img]", imgTag, 1)
|
||||
}
|
||||
|
||||
images = append(images, imgData)
|
||||
}
|
||||
msgs[currMsgIdx+cnt].Content = prefix + prompt
|
||||
}
|
||||
|
||||
@@ -9,6 +9,7 @@ import (
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/template"
|
||||
"github.com/ollama/ollama/types/model"
|
||||
)
|
||||
|
||||
func TestChatPrompt(t *testing.T) {
|
||||
@@ -330,3 +331,38 @@ func TestChatPromptTokenizeCalls(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestChatPromptRendererDoesNotRewriteMessageContent calls chatPrompt with a
// model whose Config.Renderer is set and a single user message carrying three
// images. It checks that the message's Content is left exactly as supplied,
// that all three images are returned, and that the returned prompt is not empty.
func TestChatPromptRendererDoesNotRewriteMessageContent(t *testing.T) {
|
||||
msgs := []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "what do these photos have in common?",
|
||||
Images: []api.ImageData{[]byte("img-1"), []byte("img-2"), []byte("img-3")},
|
||||
},
|
||||
}
|
||||
// Snapshot the content before the call so any in-place rewrite is detectable.
originalContent := msgs[0].Content
|
||||
|
||||
// A non-empty Renderer together with a projector path is the configuration under test.
m := Model{
|
||||
Config: model.ConfigV2{Renderer: "qwen3-vl-instruct"},
|
||||
ProjectorPaths: []string{"vision"},
|
||||
}
|
||||
opts := api.Options{Runner: api.Runner{NumCtx: 8192}}
|
||||
think := false
|
||||
|
||||
prompt, images, err := chatPrompt(t.Context(), &m, mockRunner{}.Tokenize, &opts, msgs, nil, &api.ThinkValue{Value: think}, true)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// The caller's message slice must come back untouched.
if msgs[0].Content != originalContent {
|
||||
t.Fatalf("renderer path should not mutate message content: got %q, want %q", msgs[0].Content, originalContent)
|
||||
}
|
||||
|
||||
// One returned image is expected per image attached to the message.
if got, want := len(images), 3; got != want {
|
||||
t.Fatalf("len(images) = %d, want %d", got, want)
|
||||
}
|
||||
|
||||
if prompt == "" {
|
||||
t.Fatal("prompt is empty")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"log/slog"
|
||||
"maps"
|
||||
"os"
|
||||
"slices"
|
||||
"strings"
|
||||
"unsafe"
|
||||
|
||||
@@ -61,7 +62,7 @@ func useMoreBits(iLayer, nLayers int) bool {
|
||||
return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
|
||||
}
|
||||
|
||||
func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
|
||||
func qwen3LinearAttnQuantType(name string) (fsggml.TensorType, bool) {
|
||||
switch {
|
||||
// Full attention
|
||||
case strings.HasSuffix(name, ".attn_q.weight"):
|
||||
@@ -82,6 +83,10 @@ func qwen3nextQuantType(name string) (fsggml.TensorType, bool) {
|
||||
// SSM
|
||||
case strings.HasSuffix(name, ".ssm_ba.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ssm_beta.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ssm_alpha.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
case strings.HasSuffix(name, ".ssm_out.weight"):
|
||||
return fsggml.TensorTypeQ4_K, true
|
||||
|
||||
@@ -290,8 +295,8 @@ func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.Fil
|
||||
|
||||
newType := fsggml.TensorType(t.Kind)
|
||||
if quantize {
|
||||
if kv.Architecture() == "qwen3next" && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
|
||||
if qt, ok := qwen3nextQuantType(name); ok {
|
||||
if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
|
||||
if qt, ok := qwen3LinearAttnQuantType(name); ok {
|
||||
return qt
|
||||
}
|
||||
}
|
||||
|
||||
@@ -166,6 +166,60 @@ func TestGetTensorNewType(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// TestQwen3LinearAttentionQuantOverride is a table-driven test of newType.
// Each case names an architecture, a tensor name and a file type, and states
// the tensor type newType is expected to return for an F16 input tensor.
func TestQwen3LinearAttentionQuantOverride(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
arch string
|
||||
tensor string
|
||||
fileType fsggml.FileType
|
||||
expected fsggml.TensorType
|
||||
}{
|
||||
{
|
||||
name: "qwen35_beta",
|
||||
arch: "qwen35",
|
||||
tensor: "blk.0.ssm_beta.weight",
|
||||
fileType: fsggml.FileTypeQ4_K_M,
|
||||
expected: fsggml.TensorTypeQ4_K,
|
||||
},
|
||||
{
|
||||
name: "qwen35_alpha",
|
||||
arch: "qwen35",
|
||||
tensor: "blk.0.ssm_alpha.weight",
|
||||
fileType: fsggml.FileTypeQ4_K_M,
|
||||
expected: fsggml.TensorTypeQ4_K,
|
||||
},
|
||||
{
|
||||
name: "qwen35moe_attn_qkv",
|
||||
arch: "qwen35moe",
|
||||
tensor: "blk.0.attn_qkv.weight",
|
||||
fileType: fsggml.FileTypeQ4_K_M,
|
||||
expected: fsggml.TensorTypeQ4_K,
|
||||
},
|
||||
// Control case: same tensor name and file type as the case above, but an
// architecture string ("foo") that is not one of the qwen names; a different
// type (Q5_K) is expected.
{
|
||||
name: "non_qwen35_falls_back",
|
||||
arch: "foo",
|
||||
tensor: "blk.0.attn_qkv.weight",
|
||||
fileType: fsggml.FileTypeQ4_K_M,
|
||||
expected: fsggml.TensorTypeQ5_K,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range cases {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
// Only the architecture key is set in the KV metadata for each case.
kv := fsggml.KV{"general.architecture": tt.arch}
|
||||
// Every case uses the same 256x256 F16 tensor and a zero-value quantizeState.
got := newType(&fsggml.Tensor{
|
||||
Name: tt.tensor,
|
||||
Shape: []uint64{256, 256},
|
||||
Kind: uint32(fsggml.TensorTypeF16),
|
||||
}, kv, &quantizeState{}, tt.fileType)
|
||||
|
||||
if got != tt.expected {
|
||||
t.Fatalf("unexpected tensor type for %s (%s): got %s want %s", tt.tensor, tt.arch, got, tt.expected)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestQuantizeModel(t *testing.T) {
|
||||
cases := []struct {
|
||||
name string
|
||||
|
||||
@@ -447,7 +447,7 @@ func (s *Scheduler) load(req *LlmRequest, f *ggml.GGML, systemInfo ml.SystemInfo
|
||||
|
||||
// Some architectures are not safe with num_parallel > 1.
|
||||
// ref: https://github.com/ollama/ollama/issues/4165
|
||||
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen3next", "lfm2", "lfm2moe", "nemotron_h", "nemotron_h_moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
|
||||
if slices.Contains([]string{"mllama", "qwen3vl", "qwen3vlmoe", "qwen35", "qwen35moe", "qwen3next", "lfm2", "lfm2moe", "nemotron_h", "nemotron_h_moe"}, req.model.Config.ModelFamily) && numParallel != 1 {
|
||||
numParallel = 1
|
||||
slog.Warn("model architecture does not currently support parallel requests", "architecture", req.model.Config.ModelFamily)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user