ollama/server/laguna_quantization_test.go
Daniel Hiltgen 87288ced4f New models (#15861)
* mlx: add laguna model support

* convert: support fp8 safetensors import

Decode HF F8_E4M3 safetensors with block scale companions into GGUF-supported tensor types, and record which output tensors came from FP8 source weights.
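
For reference, a minimal standalone sketch of decoding a single F8_E4M3 byte (1 sign bit, 4 exponent bits, 3 mantissa bits, bias 7, no infinities) and applying a per-block scale. The function name decodeF8E4M3 and its signature are illustrative assumptions, not the converter's actual code:

package main

import (
    "fmt"
    "math"
)

// decodeF8E4M3 expands one F8_E4M3 byte and applies the block scale taken
// from the companion scale tensor. Illustrative sketch only.
func decodeF8E4M3(b byte, blockScale float32) float32 {
    sign := float32(1)
    if b&0x80 != 0 {
        sign = -1
    }
    exp := int((b >> 3) & 0x0F)
    mant := float64(b & 0x07)

    var v float64
    switch {
    case exp == 0x0F && mant == 7:
        return float32(math.NaN()) // the only NaN encoding; E4M3 has no +/-Inf
    case exp == 0:
        v = mant / 8 * math.Pow(2, -6) // subnormal
    default:
        v = (1 + mant/8) * math.Pow(2, float64(exp-7))
    }
    return sign * float32(v) * blockScale
}

func main() {
    // 0x40 = 0.1000.000 -> exponent 8-7=1, mantissa 0 -> 2.0 before scaling
    fmt.Println(decodeF8E4M3(0x40, 0.5)) // prints 1
}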

Use that source-precision metadata during create quantization: default FP8-sourced GGUFs to Q8_0, keep non-FP8 tensors at their original precision for Q8_0, and promote non-FP8 quantizable tensors to Q8_0 for Q4_K requests.
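
One hypothetical per-tensor reading of that rule, with illustrative names (chooseTargetType, string-typed tensor kinds) standing in for whatever the create path actually uses:

package main

import "fmt"

// chooseTargetType sketches the described rule: FP8-sourced tensors follow
// the (Q8_0-defaulted) request, while non-FP8 tensors keep their original
// precision for Q8_0 and quantizable ones are promoted to Q8_0 for Q4_K.
func chooseTargetType(requested, original string, fromFP8, quantizable bool) string {
    if !fromFP8 {
        switch requested {
        case "Q8_0":
            return original // non-FP8 tensors keep their original precision
        case "Q4_K":
            if quantizable {
                return "Q8_0" // promoted rather than dropped to Q4_K
            }
            return original
        }
    }
    return requested
}

func main() {
    fmt.Println(chooseTargetType("Q8_0", "BF16", true, true))  // Q8_0
    fmt.Println(chooseTargetType("Q8_0", "BF16", false, true)) // BF16
    fmt.Println(chooseTargetType("Q4_K", "BF16", false, true)) // Q8_0
}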

* ggml: add laguna model support

* server: preserve generate logprobs with builtin parsers

Generate requests were dropping logprob-only chunks whenever a builtin parser buffered visible content. Chat already handled this case, but generate only forwarded chunks with visible response, thinking, or tool-call output.

Keep generate chunks that carry logprobs even when the builtin parser has not flushed visible content yet, and add a regression test that exercises the behavior with a generic thinking parser.
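
The fix amounts to widening the forwarding condition; a sketch of that check follows, with the chunk type and field names as illustrative stand-ins rather than the server's actual structs:

package main

import "fmt"

// chunk is an illustrative stand-in for a streamed generate response chunk.
type chunk struct {
    Response  string
    Thinking  string
    ToolCalls []string
    Logprobs  []float64
}

// shouldForward mirrors the described behavior: chunks used to be forwarded
// only with visible response, thinking, or tool-call output; now a chunk that
// carries logprobs is kept even while the builtin parser is still buffering.
func shouldForward(c chunk) bool {
    return c.Response != "" || c.Thinking != "" || len(c.ToolCalls) > 0 || len(c.Logprobs) > 0
}

func main() {
    fmt.Println(shouldForward(chunk{Logprobs: []float64{-0.12}})) // true: logprob-only chunk survives
    fmt.Println(shouldForward(chunk{}))                           // false: nothing to send
}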

* review comments - perf improvements

* ggml: implement nemotron 3 nano omni

* add poolside integration

* update poolside doc

* adapt to new cache setup

* fix test

* fix test

---------

Co-authored-by: Eva Ho <hoyyeva@gmail.com>
2026-04-28 11:50:12 -07:00


package server

import (
    "testing"

    fsggml "github.com/ollama/ollama/fs/ggml"
)

// TestLagunaGGUFQuantization checks the per-tensor quantization decisions for
// Laguna GGUFs: non-routed and shared-expert weights keep their original
// precision, routed expert weights are quantized, and routed ffn_down expert
// weights are promoted above the requested Q4_K type depending on file type.
func TestLagunaGGUFQuantization(t *testing.T) {
    cases := []struct {
        name          string
        tensor        string
        originalType  fsggml.TensorType
        requestedType fsggml.TensorType
        fileType      fsggml.FileType
        blockCount    int
        wantType      fsggml.TensorType
        wantQuantize  bool
    }{
        {
            name:          "non_routed_weights_preserved",
            tensor:        "blk.1.attn_q.weight",
            originalType:  fsggml.TensorTypeBF16,
            requestedType: fsggml.TensorTypeQ8_0,
            fileType:      fsggml.FileTypeQ8_0,
            blockCount:    2,
            wantType:      fsggml.TensorTypeBF16,
            wantQuantize:  false,
        },
        {
            name:          "shared_expert_weights_preserved",
            tensor:        "blk.1.ffn_gate_shexp.weight",
            originalType:  fsggml.TensorTypeBF16,
            requestedType: fsggml.TensorTypeQ4_K,
            fileType:      fsggml.FileTypeQ4_K_M,
            blockCount:    2,
            wantType:      fsggml.TensorTypeBF16,
            wantQuantize:  false,
        },
        {
            name:          "routed_gate_q8",
            tensor:        "blk.1.ffn_gate_exps.weight",
            originalType:  fsggml.TensorTypeBF16,
            requestedType: fsggml.TensorTypeQ8_0,
            fileType:      fsggml.FileTypeQ8_0,
            blockCount:    2,
            wantType:      fsggml.TensorTypeQ8_0,
            wantQuantize:  true,
        },
        {
            name:          "routed_down_q4_promoted",
            tensor:        "blk.1.ffn_down_exps.weight",
            originalType:  fsggml.TensorTypeBF16,
            requestedType: fsggml.TensorTypeQ4_K,
            fileType:      fsggml.FileTypeQ4_K_M,
            blockCount:    2,
            wantType:      fsggml.TensorTypeQ6_K,
            wantQuantize:  true,
        },
        {
            name:          "routed_down_q4_not_promoted_when_q8_requested",
            tensor:        "blk.1.ffn_down_exps.weight",
            originalType:  fsggml.TensorTypeBF16,
            requestedType: fsggml.TensorTypeQ8_0,
            fileType:      fsggml.FileTypeQ4_K_M,
            blockCount:    2,
            wantType:      fsggml.TensorTypeQ8_0,
            wantQuantize:  true,
        },
        {
            name:          "routed_down_q4_k_s_promoted",
            tensor:        "blk.0.ffn_down_exps.weight",
            originalType:  fsggml.TensorTypeBF16,
            requestedType: fsggml.TensorTypeQ4_K,
            fileType:      fsggml.FileTypeQ4_K_S,
            blockCount:    8,
            wantType:      fsggml.TensorTypeQ5_K,
            wantQuantize:  true,
        },
    }

    for _, tt := range cases {
        t.Run(tt.name, func(t *testing.T) {
            gotType, gotQuantize := lagunaGGUFQuantization(tt.tensor, tt.originalType, tt.requestedType, tt.fileType, tt.blockCount)
            if gotType != tt.wantType || gotQuantize != tt.wantQuantize {
                t.Fatalf("lagunaGGUFQuantization(%q) = (%s, %v), want (%s, %v)", tt.tensor, gotType, gotQuantize, tt.wantType, tt.wantQuantize)
            }
        })
    }
}