* mlx: add laguna model support

* convert: support fp8 safetensors import

  Decode HF F8_E4M3 safetensors with block scale companions into
  GGUF-supported tensor types, and record which output tensors came from
  FP8 source weights. Use that source-precision metadata during create
  quantization: default FP8-sourced GGUFs to Q8_0, keep non-FP8 tensors at
  their original precision for Q8_0, and promote non-FP8 quantizable
  tensors to Q8_0 for Q4_K requests.

* ggml: add laguna model support

* server: preserve generate logprobs with builtin parsers

  Generate requests were dropping logprob-only chunks whenever a builtin
  parser buffered visible content. Chat already handled this case, but
  generate only forwarded chunks with visible response, thinking, or
  tool-call output. Keep generate chunks that carry logprobs even when the
  builtin parser has not flushed visible content yet, and add a regression
  test that exercises the behavior with a generic thinking parser.

* review comments - perf improvements
* ggml: implement nemotron 3 nano omni
* add poolside integration
* update poolside doc
* adapt to new cache setup
* fix test
* fix test

---------

Co-authored-by: Eva Ho <hoyyeva@gmail.com>
package server

import (
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"slices"
	"strconv"
	"strings"
	"unsafe"

	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml/backend/ggml"
)
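// quantizer describes a single tensor conversion: it reads the source tensor
// at offset from the underlying file and, via WriteTo, emits it either
// verbatim or re-encoded as the target tensor's type.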
type quantizer struct {
	*os.File
	offset     uint64
	from, to   *fsggml.Tensor
	progressFn func(n uint64)
}
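// WriteTo implements io.WriterTo. When the source and target kinds match it
// streams the tensor bytes through unchanged; otherwise it reads the whole
// tensor, widens it to float32 (reinterpreting the buffer in place when the
// source is already F32), and re-quantizes to the target type. progressFn is
// reported in source-size units on both paths so progress totals stay
// consistent.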
func (q quantizer) WriteTo(w io.Writer) (int64, error) {
	quantize := q.from.Kind != q.to.Kind
	sr := io.NewSectionReader(q, int64(q.offset), int64(q.from.Size()))
	if !quantize {
		n, err := io.Copy(w, sr)
		q.progressFn(q.from.Size())
		return n, err
	}
	data, err := io.ReadAll(sr)
	if err != nil {
		slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
		return 0, fmt.Errorf("unable to read tensor %s from %s: %s", q.from.Name, q.Name(), err)
	}
	if uint64(len(data)) < q.from.Size() {
		return 0, fmt.Errorf("tensor %s data size %d is less than expected %d from shape %v", q.from.Name, len(data), q.from.Size(), q.from.Shape)
	}
	var f32s []float32
	newType := fsggml.TensorType(q.to.Kind)
	if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
		f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), q.from.Elements())
	} else {
		f32s = ggml.ConvertToF32(data, q.from.Kind, q.from.Elements())
	}
	data = ggml.Quantize(newType, f32s, q.from.Shape)
	n, err := w.Write(data)
	q.progressFn(q.from.Size())
	return int64(n), err
}
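// quantizeState carries per-model bookkeeping across newType calls: tensor
// counts and running indices for the useMoreBits heuristics, whether the
// model has a dedicated output.weight, and the FP8 source-precision metadata
// recorded at convert time that steers Q8_0 and Q4_K requests.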
type quantizeState struct {
	nAttnV                int  // Number of attn_*v* weight tensors
	nFfnDown              int  // Number of ffn_down tensors
	iAttnV                int  // Running counter of number of attn_v tensors that have been processed
	iFfnDown              int  // Running counter of number of ffn_down tensors that have been processed
	hasOutput             bool // used to figure out if a model shares tok_embd with the output weight
	preserveSourceFP8ToQ8 bool
	preserveSourceQ4      bool
	sourceFP8Tensors      map[string]struct{}
}
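// useMoreBits selects the layers that receive a higher-precision
// quantization: the first eighth of layers, the last eighth, and every third
// layer in between. For nLayers = 32 it returns true for layers 0-3,
// 6, 9, ..., 27, and 28-31.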
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}
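// qwen3LinearAttnQuantType returns an explicit quantization type for the
// attention, linear-attention (Gated Delta Net), SSM, and MoE expert tensors
// of the qwen3next/qwen35/qwen35moe architectures: attn_v, ffn_down_exps,
// and ffn_down_shexp stay at Q6_K while the other matched weights use Q4_K.
// The second return value reports whether the tensor name matched.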
func qwen3LinearAttnQuantType(name string) (fsggml.TensorType, bool) {
	switch {
	// Full attention
	case strings.HasSuffix(name, ".attn_q.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".attn_k.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".attn_v.weight"):
		return fsggml.TensorTypeQ6_K, true
	case strings.HasSuffix(name, ".attn_output.weight"):
		return fsggml.TensorTypeQ4_K, true

	// Linear attention (Gated Delta Net) after split
	case strings.HasSuffix(name, ".attn_qkv.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".attn_gate.weight"):
		return fsggml.TensorTypeQ4_K, true

	// SSM
	case strings.HasSuffix(name, ".ssm_ba.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ssm_beta.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ssm_alpha.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ssm_out.weight"):
		return fsggml.TensorTypeQ4_K, true

	// MoE experts + shared experts
	case strings.HasSuffix(name, ".ffn_down_exps.weight"):
		return fsggml.TensorTypeQ6_K, true
	case strings.HasSuffix(name, ".ffn_down_shexp.weight"):
		return fsggml.TensorTypeQ6_K, true
	case strings.HasSuffix(name, ".ffn_gate_exps.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ffn_gate_shexp.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ffn_up_exps.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ffn_up_shexp.weight"):
		return fsggml.TensorTypeQ4_K, true
	}

	return 0, false
}
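// isLagunaGGUFRoutedExpertWeight reports whether name is one of laguna's
// routed-expert projection weights, e.g. "blk.3.ffn_gate_exps.weight".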
func isLagunaGGUFRoutedExpertWeight(name string) bool {
	return strings.HasSuffix(name, ".weight") && (strings.Contains(name, "ffn_gate_exps") ||
		strings.Contains(name, "ffn_up_exps") ||
		strings.Contains(name, "ffn_down_exps"))
}
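// lagunaGGUFBlockIndex extracts the layer number from a "blk.N." tensor name:
// "blk.12.ffn_down_exps.weight" yields (12, true); any other form yields
// (0, false).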
func lagunaGGUFBlockIndex(name string) (int, bool) {
	if !strings.HasPrefix(name, "blk.") {
		return 0, false
	}

	parts := strings.SplitN(strings.TrimPrefix(name, "blk."), ".", 2)
	if len(parts) != 2 {
		return 0, false
	}

	i, err := strconv.Atoi(parts[0])
	if err != nil {
		return 0, false
	}

	return i, true
}
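// lagunaGGUFQuantization decides the type for laguna's routed expert
// weights. Non-expert tensors return (originalType, false) so the caller can
// leave them untouched. Expert tensors keep the requested type, except
// ffn_down_exps, which follows the useMoreBits layer schedule: Q6_K on the
// selected layers for Q4_K_M and Q5_K on the first eighth of layers for
// Q4_K_S, unless Q8_0 was requested.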
func lagunaGGUFQuantization(name string, originalType, requestedType fsggml.TensorType, ftype fsggml.FileType, blockCount int) (fsggml.TensorType, bool) {
	if !isLagunaGGUFRoutedExpertWeight(name) {
		return originalType, false
	}

	if strings.HasSuffix(name, ".ffn_down_exps.weight") {
		if i, ok := lagunaGGUFBlockIndex(name); ok && blockCount > 0 {
			switch ftype {
			case fsggml.FileTypeQ4_K_M:
				if requestedType != fsggml.TensorTypeQ8_0 && useMoreBits(i, blockCount) {
					return fsggml.TensorTypeQ6_K, true
				}
			case fsggml.FileTypeQ4_K_S:
				if requestedType != fsggml.TensorTypeQ8_0 && i < blockCount/8 {
					return fsggml.TensorTypeQ5_K, true
				}
			}
		}
	}

	return requestedType, true
}
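// getTensorNewType applies the shape- and layer-aware type overrides ported
// from llama.cpp's llama_tensor_get_type, then, when a tensor's first
// dimension is not divisible by the chosen type's block size, falls back to
// a compatible type (Q5_0, Q5_1, Q8_0, or ultimately F16).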
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
	// Ported from llama_tensor_get_type, removed unsupported quantization types
	nExperts := max(1, kv.Uint("expert_count", 0))
	if name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight") {
		nx := shape[0]
		qk_k := newType.BlockSize()
		if nx%qk_k != 0 {
			newType = fsggml.TensorTypeQ8_0
		} else if newType != fsggml.TensorTypeQ8_0 {
			newType = fsggml.TensorTypeQ6_K
		}
	} else if strings.Contains(name, "attn_v.weight") {
		if newType != fsggml.TensorTypeQ8_0 && (ftype == fsggml.FileTypeQ4_K_M) &&
			useMoreBits(qs.iAttnV, qs.nAttnV) {
			newType = fsggml.TensorTypeQ6_K
		} else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
			newType = fsggml.TensorTypeQ5_K
		}

		// TODO
		// if (qs.model.type == LLM_TYPE_70B) {
		// 	// In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
		// 	// 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
		// 	// nearly negligible increase in model size by quantizing this tensor with more bits:
		// 	if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
		// }

		if nExperts == 8 {
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}
		qs.iAttnV++
	} else if strings.Contains(name, "attn_k.weight") {
		if nExperts == 8 {
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}
	} else if strings.Contains(name, "attn_k_b.weight") ||
		strings.Contains(name, "attn_v_b.weight") ||
		strings.Contains(name, "attn_kv_a_mqa.weight") ||
		strings.Contains(name, "attn_q_a.weight") ||
		strings.Contains(name, "attn_q_b.weight") {
		// MLA tensors need higher precision to avoid quality degradation
		newType = fsggml.TensorTypeQ8_0
	} else if strings.Contains(name, "ffn_down") {
		// For MoE models, ffn_down.weight (dense) and ffn_down_exps.weight (expert) both
		// exist per layer and should get the same useMoreBits treatment. Dense sorts before
		// expert alphabetically, so dense increments the counter and expert uses counter-1.
		var iLayer int
		if strings.Contains(name, "_exps") {
			if kv.Architecture() == "laguna" {
				goto finalize
			}
			iLayer = max(0, qs.iFfnDown-1)
		} else {
			iLayer = qs.iFfnDown
			qs.iFfnDown++
		}
		n_layer := qs.nFfnDown
		if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M {
			if useMoreBits(iLayer, n_layer) {
				newType = fsggml.TensorTypeQ6_K
			}
		} else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && iLayer < n_layer/8 {
			newType = fsggml.TensorTypeQ5_K
		}
	} else if strings.Contains(name, "attn_output.weight") {
		if newType != fsggml.TensorTypeQ8_0 && nExperts == 8 {
			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
				newType = fsggml.TensorTypeQ5_K
			}
		}
	} else if strings.Contains(name, "attn_qkv.weight") {
		if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M {
			newType = fsggml.TensorTypeQ5_K
		}
	}

finalize:
	if newType.IsQuantized() {
		nx := shape[0]
		qk_k := newType.BlockSize()

		// Check if first dimension is divisible by block size
		if nx%qk_k != 0 {
			// Store the original type for logging
			originalType := newType

			// Select appropriate fallback based on original type
			switch newType {
			case fsggml.TensorTypeQ4_K:
				newType = fsggml.TensorTypeQ5_0
			case fsggml.TensorTypeQ5_K:
				newType = fsggml.TensorTypeQ5_1
			case fsggml.TensorTypeQ6_K:
				newType = fsggml.TensorTypeQ8_0
			}

			// Final check - if still incompatible, fall back to F16
			if nx%newType.BlockSize() != 0 {
				newType = fsggml.TensorTypeF16
			}

			slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
				nx, qk_k, originalType.String(), newType.String()))
		}
	}
	return newType
}
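// quantize rewrites orig, whose tensor data is backed by in, to out at
// newFileType. It clones the source key/value metadata, stamps the new file
// type, pre-scans the tensors to size the quantizeState counters, then
// attaches one quantizer per output tensor so data is only converted as
// fsggml.WriteGGUF streams it out. A call site might look like the following
// (hypothetical sketch; the real invocation lives elsewhere in the server
// package):
//
//	err := quantize(in, out, orig, fsggml.FileTypeQ4_K_M, func(n uint64) {
//		processed += n // update progress reporting
//	})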
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
	kv := maps.Clone(orig.KV())
	kv["general.file_type"] = newFileType
	// kv["general.quantization_version"] = ggml.QuantizationVersion()
	qs := &quantizeState{
		sourceFP8Tensors: sourceFP8TensorSet(kv),
	}
	hasSourceFP8 := hasSourceFP8Tensors(kv)
	qs.preserveSourceFP8ToQ8 = hasSourceFP8 && newFileType == fsggml.FileTypeQ8_0
	qs.preserveSourceQ4 = hasSourceFP8 && slices.Contains([]fsggml.FileType{fsggml.FileTypeQ4_K_M, fsggml.FileTypeQ4_K_S}, newFileType)
	// Build up the quantize state so newType can adjust types
	layerCount := 0
	for k, l := range orig.Tensors().GroupLayers() {
		if strings.HasPrefix(k, "blk.") {
			layerCount++
		}
		for _, tensor := range l {
			if strings.Contains(tensor.Name, "attn_v.weight") ||
				strings.Contains(tensor.Name, "attn_qkv.weight") ||
				strings.Contains(tensor.Name, "attn_kv_b.weight") {
				qs.nAttnV++
			} else if tensor.Name == "output.weight" {
				qs.hasOutput = true
			}
		}
	}
	qs.nFfnDown = layerCount

	origTensors := orig.Tensors().Items()
	outputTensors := make([]*fsggml.Tensor, len(origTensors))
	for i, tensor := range origTensors {
		newType := newType(tensor, kv, qs, newFileType)
		newTensor := &fsggml.Tensor{
			Name:  tensor.Name,
			Shape: tensor.Shape,
			Kind:  uint32(newType),
		}
		outputTensors[i] = newTensor
		outputTensors[i].WriterTo = quantizer{
			File:       in,
			offset:     orig.Tensors().Offset + tensor.Offset,
			from:       tensor,
			to:         newTensor,
			progressFn: progressFn,
		}
	}
	return fsggml.WriteGGUF(out, kv, outputTensors)
}
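// newType picks the target type for a single tensor. Only 2D and 3D weight
// tensors are candidates; vision and audio encoder tensors, norms, expert
// gating, positional embeddings, and the various small recurrent-model
// weights listed below pass through at their original precision. For
// FP8-sourced models the source-precision rules run first, then the
// architecture-specific and generic overrides.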
func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType {
	defaultType := ftype.ToTensorType()
	name := t.Name
	quantize := strings.HasSuffix(name, "weight")

	// don't quantize vision or audio encoder tensors
	quantize = quantize && !strings.HasPrefix(name, "v.")
	quantize = quantize && !strings.HasPrefix(name, "a.")
	quantize = quantize && !strings.Contains(name, "mm.")

	// quantize only 2D and 3D tensors (experts)
	quantize = quantize && (len(t.Shape) >= 2)

	// do not quantize norm tensors
	quantize = quantize && !strings.Contains(name, "_norm.weight")

	// do not quantize expert gating tensors
	quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
	quantize = quantize && !strings.Contains(name, "ffn_gate_inp_shexp.weight")

	// do not quantize positional embeddings and token types (BERT)
	quantize = quantize && (name != "position_embd.weight")
	quantize = quantize && (name != "token_types.weight")

	// do not quantize Mamba's small yet 2D weights
	// NOTE: can't use LLM_TN here because the layer number is not known
	quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")

	// do not quantize LFM2's shortconv kernel weights
	quantize = quantize && !strings.Contains(name, "shortconv.conv.weight")

	// do not quantize RWKV's time_mix_first tensors
	quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_w2.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight")

	// do not quantize relative position bias (T5)
	quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")

	quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")

	newType := fsggml.TensorType(t.Kind)
	if quantize {
		if qs.preserveSourceFP8ToQ8 {
			if _, ok := qs.sourceFP8Tensors[name]; !ok {
				return newType
			}
		}

		if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) && (ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
			if qt, ok := qwen3LinearAttnQuantType(name); ok {
				return qt
			}
		}

		// TODO: Consider extracting architecture-specific GGUF quantization policy
		// from server so different quantization backends can share one source of
		// truth for model-family specializations.

		// get more optimal quantization type based on the tensor shape, layer, etc.
		if qs.preserveSourceQ4 {
			if _, ok := qs.sourceFP8Tensors[name]; !ok {
				defaultType = fsggml.TensorTypeQ8_0
			}
		}
		if kv.Architecture() == "laguna" {
			var ok bool
			defaultType, ok = lagunaGGUFQuantization(name, newType, defaultType, ftype, int(kv.Uint("block_count", 0)))
			if !ok {
				return newType
			}
		}
		newType = getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
		if newType != defaultType {
			slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", newType)
		}
	}
	return newType
}
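// sourceFP8TensorSet builds a lookup set from the "source_fp8_tensors"
// metadata recorded at convert time for tensors derived from FP8 source
// weights. It returns nil when the model carries no such metadata.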
func sourceFP8TensorSet(kv fsggml.KV) map[string]struct{} {
	names := kv.Strings("source_fp8_tensors")
	if len(names) == 0 {
		return nil
	}

	out := make(map[string]struct{}, len(names))
	for _, name := range names {
		out[name] = struct{}{}
	}
	return out
}