package server

import (
	"fmt"
	"io"
	"log/slog"
	"maps"
	"os"
	"slices"
	"strconv"
	"strings"
	"unsafe"

	fsggml "github.com/ollama/ollama/fs/ggml"
	"github.com/ollama/ollama/ml/backend/ggml"
)

// quantizer is the io.WriterTo attached to every output tensor. It reads the
// source tensor's bytes from the embedded file at offset and writes them out,
// converting to the target tensor kind when source and target kinds differ.
type quantizer struct {
	*os.File
	offset     uint64
	from, to   *fsggml.Tensor
	progressFn func(n uint64)
}

// WriteTo writes the tensor described by q.from to w.
//
// When the source and target kinds are equal the bytes are streamed through
// unchanged. Otherwise the whole tensor is read into memory, converted to
// float32 and quantized to the target kind before being written. progressFn is
// called with the source tensor's byte size after a copy or a quantized write
// (even if that write returned an error), but not when reading the source
// fails or comes up short.
//
// It returns the number of bytes written to w and any read or write error.
func (q quantizer) WriteTo(w io.Writer) (int64, error) {
	size := q.from.Size()
	sr := io.NewSectionReader(q, int64(q.offset), int64(size))

	if q.from.Kind == q.to.Kind {
		n, err := io.Copy(w, sr)
		q.progressFn(size)
		return n, err
	}

	data, err := io.ReadAll(sr)
	if err != nil {
		slog.Warn("file read error", "tensor", q.from.Name, "file", q.Name(), "error", err)
		// %w keeps the underlying error reachable through errors.Is / errors.As.
		return 0, fmt.Errorf("unable to read tensor %s from %s: %w", q.from.Name, q.Name(), err)
	}
	if uint64(len(data)) < size {
		return 0, fmt.Errorf("tensor %s data size %d is less than expected %d from shape %v", q.from.Name, len(data), size, q.from.Shape)
	}

	elements := q.from.Elements()
	if elements == 0 {
		// Nothing to convert; taking &data[0] below would panic on an empty buffer.
		q.progressFn(size)
		return 0, nil
	}

	var f32s []float32
	if fsggml.TensorType(q.from.Kind) == fsggml.TensorTypeF32 {
		// Reinterpret the raw bytes as float32 in place instead of copying.
		// NOTE(review): assumes the file's float byte order matches the host - confirm.
		f32s = unsafe.Slice((*float32)(unsafe.Pointer(&data[0])), elements)
	} else {
		f32s = ggml.ConvertToF32(data, q.from.Kind, elements)
	}

	data = ggml.Quantize(fsggml.TensorType(q.to.Kind), f32s, q.from.Shape)
	n, err := w.Write(data)
	q.progressFn(size)
	return int64(n), err
}

// quantizeState carries the counters and flags shared across all tensors of
// one quantize call so that per-tensor type selection can depend on position.
type quantizeState struct {
	nAttnV    int  // Number of attn_*v* weight tensors
	nFfnDown  int  // Number of ffn_down tensors
	iAttnV    int  // Running counter of number of attn_v tensors that have been processed
	iFfnDown  int  // Running counter of number of ffn_down tensors that have been processed
	hasOutput bool // used to figure out if a model shares tok_embd with the output weight

	// preserveSourceFP8ToQ8, when set, makes newType keep the source type for
	// every tensor that is not listed in sourceFP8Tensors.
	preserveSourceFP8ToQ8 bool
	// preserveSourceQ4, when set, makes newType request Q8_0 for every tensor
	// that is not listed in sourceFP8Tensors.
	preserveSourceQ4 bool
	// sourceFP8Tensors is the set of tensor names read from the
	// "source_fp8_tensors" metadata key; nil when the key is empty.
	sourceFP8Tensors map[string]struct{}
}

// useMoreBits reports whether layer iLayer out of nLayers is selected for a
// higher-precision type: layers below nLayers/8, layers at or above
// 7*nLayers/8, and every third layer counted from nLayers/8.
func useMoreBits(iLayer, nLayers int) bool {
	return iLayer < (nLayers/8) || iLayer >= 7*nLayers/8 || (iLayer-nLayers/8)%3 == 2
}

// qwen3LinearAttnQuantType maps a tensor name to a fixed tensor type based on
// its suffix. The boolean is false when the name matches none of the suffixes.
func qwen3LinearAttnQuantType(name string) (fsggml.TensorType, bool) {
	switch {
	// Full attention
	case strings.HasSuffix(name, ".attn_q.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".attn_k.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".attn_v.weight"):
		return fsggml.TensorTypeQ6_K, true
	case strings.HasSuffix(name, ".attn_output.weight"):
		return fsggml.TensorTypeQ4_K, true

	// Linear attention (Gated Delta Net) after split
	case strings.HasSuffix(name, ".attn_qkv.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".attn_gate.weight"):
		return fsggml.TensorTypeQ4_K, true

	// SSM
	case strings.HasSuffix(name, ".ssm_ba.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ssm_beta.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ssm_alpha.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ssm_out.weight"):
		return fsggml.TensorTypeQ4_K, true

	// MoE experts + shared experts
	case strings.HasSuffix(name, ".ffn_down_exps.weight"):
		return fsggml.TensorTypeQ6_K, true
	case strings.HasSuffix(name, ".ffn_down_shexp.weight"):
		return fsggml.TensorTypeQ6_K, true
	case strings.HasSuffix(name, ".ffn_gate_exps.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ffn_gate_shexp.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ffn_up_exps.weight"):
		return fsggml.TensorTypeQ4_K, true
	case strings.HasSuffix(name, ".ffn_up_shexp.weight"):
		return fsggml.TensorTypeQ4_K, true
	}

	return 0, false
}

// isLagunaGGUFRoutedExpertWeight reports whether name ends in ".weight" and
// contains one of ffn_gate_exps, ffn_up_exps or ffn_down_exps.
func isLagunaGGUFRoutedExpertWeight(name string) bool {
	return strings.HasSuffix(name, ".weight") &&
		(strings.Contains(name, "ffn_gate_exps") ||
			strings.Contains(name, "ffn_up_exps") ||
			strings.Contains(name, "ffn_down_exps"))
}

// lagunaGGUFBlockIndex extracts N from a tensor name of the form
// "blk.N.<rest>". The boolean is false when the name lacks the "blk." prefix,
// has no "." after the index, or the index is not an integer.
func lagunaGGUFBlockIndex(name string) (int, bool) {
	rest, ok := strings.CutPrefix(name, "blk.")
	if !ok {
		return 0, false
	}

	idx, _, ok := strings.Cut(rest, ".")
	if !ok {
		return 0, false
	}

	i, err := strconv.Atoi(idx)
	if err != nil {
		return 0, false
	}
	return i, true
}

// lagunaGGUFQuantization picks the tensor type for a laguna tensor.
//
// For names that are not routed expert weights it returns (originalType,
// false). For routed expert weights it returns (requestedType, true), except
// that an ffn_down_exps weight with a parseable block index and a positive
// blockCount may be raised to Q6_K (Q4_K_M file type, useMoreBits layers) or
// Q5_K (Q4_K_S file type, first blockCount/8 layers) unless Q8_0 was requested.
func lagunaGGUFQuantization(name string, originalType, requestedType fsggml.TensorType, ftype fsggml.FileType, blockCount int) (fsggml.TensorType, bool) {
	if !isLagunaGGUFRoutedExpertWeight(name) {
		return originalType, false
	}

	if strings.HasSuffix(name, ".ffn_down_exps.weight") {
		if i, ok := lagunaGGUFBlockIndex(name); ok && blockCount > 0 {
			switch ftype {
			case fsggml.FileTypeQ4_K_M:
				if requestedType != fsggml.TensorTypeQ8_0 && useMoreBits(i, blockCount) {
					return fsggml.TensorTypeQ6_K, true
				}
			case fsggml.FileTypeQ4_K_S:
				if requestedType != fsggml.TensorTypeQ8_0 && i < blockCount/8 {
					return fsggml.TensorTypeQ5_K, true
				}
			}
		}
	}

	return requestedType, true
}

// getTensorNewType adjusts the requested tensor type newType for the tensor
// called name, based on the tensor's role, its position among similar tensors
// (tracked in qs, which this function advances), the expert count in kv and
// the target file type. If the resulting type is quantized and shape[0] is not
// a multiple of its block size, a compatible fallback type is chosen and a
// warning is logged.
//
// shape must have at least one dimension.
func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType, name string, shape []uint64, ftype fsggml.FileType) fsggml.TensorType {
	// Ported from llama_tensor_get_type, removed unsupported quantization types
	nExperts := max(1, kv.Uint("expert_count", 0))

	switch {
	case name == "output.weight" || name == "output_norm.weight" || (!qs.hasOutput && name == "token_embd.weight"):
		nx := shape[0]
		if nx%newType.BlockSize() != 0 {
			newType = fsggml.TensorTypeQ8_0
		} else if newType != fsggml.TensorTypeQ8_0 {
			newType = fsggml.TensorTypeQ6_K
		}

	case strings.Contains(name, "attn_v.weight"):
		if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M && useMoreBits(qs.iAttnV, qs.nAttnV) {
			newType = fsggml.TensorTypeQ6_K
		} else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && qs.iAttnV < 4 {
			newType = fsggml.TensorTypeQ5_K
		}

		// TODO
		// if (qs.model.type == LLM_TYPE_70B) {
		// // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
		// // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
		// // nearly negligible increase in model size by quantizing this tensor with more bits:
		// if (newType == GGML_TYPE_Q3_K || newType == GGML_TYPE_Q4_K) newType = GGML_TYPE_Q5_K;
		// }

		if nExperts == 8 {
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}
		qs.iAttnV++

	case strings.Contains(name, "attn_k.weight"):
		if nExperts == 8 {
			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
			newType = fsggml.TensorTypeQ8_0
		}

	case strings.Contains(name, "attn_k_b.weight") ||
		strings.Contains(name, "attn_v_b.weight") ||
		strings.Contains(name, "attn_kv_a_mqa.weight") ||
		strings.Contains(name, "attn_q_a.weight") ||
		strings.Contains(name, "attn_q_b.weight"):
		// MLA tensors need higher precision to avoid quality degradation
		newType = fsggml.TensorTypeQ8_0

	case strings.Contains(name, "ffn_down"):
		isExpert := strings.Contains(name, "_exps")
		if isExpert && kv.Architecture() == "laguna" {
			// Laguna expert tensors keep the type that was passed in and do
			// not touch the ffn_down counter; only the block-size fallback
			// below still applies to them.
			break
		}

		// For MoE models, ffn_down.weight (dense) and ffn_down_exps.weight (expert) both
		// exist per layer and should get the same useMoreBits treatment. Dense sorts before
		// expert alphabetically, so dense increments the counter and expert uses counter-1.
		var iLayer int
		if isExpert {
			iLayer = max(0, qs.iFfnDown-1)
		} else {
			iLayer = qs.iFfnDown
			qs.iFfnDown++
		}

		nLayer := qs.nFfnDown
		if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M {
			if useMoreBits(iLayer, nLayer) {
				newType = fsggml.TensorTypeQ6_K
			}
		} else if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_S && iLayer < nLayer/8 {
			newType = fsggml.TensorTypeQ5_K
		}

	case strings.Contains(name, "attn_output.weight"):
		if newType != fsggml.TensorTypeQ8_0 && nExperts == 8 {
			if ftype == fsggml.FileTypeQ4_K_S || ftype == fsggml.FileTypeQ4_K_M {
				newType = fsggml.TensorTypeQ5_K
			}
		}

	case strings.Contains(name, "attn_qkv.weight"):
		if newType != fsggml.TensorTypeQ8_0 && ftype == fsggml.FileTypeQ4_K_M {
			newType = fsggml.TensorTypeQ5_K
		}
	}

	if newType.IsQuantized() {
		nx := shape[0]
		blockSize := newType.BlockSize()

		// Check if first dimension is divisible by block size
		if nx%blockSize != 0 {
			// Store the original type for logging
			originalType := newType

			// Select appropriate fallback based on original type
			switch newType {
			case fsggml.TensorTypeQ4_K:
				newType = fsggml.TensorTypeQ5_0
			case fsggml.TensorTypeQ5_K:
				newType = fsggml.TensorTypeQ5_1
			case fsggml.TensorTypeQ6_K:
				newType = fsggml.TensorTypeQ8_0
			}

			// Final check - if still incompatible, fall back to F16
			if nx%newType.BlockSize() != 0 {
				newType = fsggml.TensorTypeF16
			}

			slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
				nx, blockSize, originalType.String(), newType.String()))
		}
	}

	return newType
}

// quantize writes a GGUF file to out containing the tensors of orig converted
// for newFileType. The metadata is a clone of orig's with "general.file_type"
// replaced. Tensor data is not read here: each output tensor is given a
// quantizer that reads from in when the tensor is written.
//
// progressFn receives the byte size of each source tensor as it is processed;
// a nil progressFn is treated as a no-op. The returned error is whatever
// fsggml.WriteGGUF reports.
func quantize(in, out *os.File, orig *fsggml.GGML, newFileType fsggml.FileType, progressFn func(n uint64)) error {
	if progressFn == nil {
		// quantizer.WriteTo calls progressFn unconditionally.
		progressFn = func(uint64) {}
	}

	kv := maps.Clone(orig.KV())
	kv["general.file_type"] = newFileType
	// kv["general.quantization_version"] = ggml.QuantizationVersion()

	qs := &quantizeState{
		sourceFP8Tensors: sourceFP8TensorSet(kv),
	}
	hasSourceFP8 := hasSourceFP8Tensors(kv)
	qs.preserveSourceFP8ToQ8 = hasSourceFP8 && newFileType == fsggml.FileTypeQ8_0
	qs.preserveSourceQ4 = hasSourceFP8 && slices.Contains([]fsggml.FileType{fsggml.FileTypeQ4_K_M, fsggml.FileTypeQ4_K_S}, newFileType)

	// Build up the quantize state so newType can adjust types
	layerCount := 0
	for k, l := range orig.Tensors().GroupLayers() {
		if strings.HasPrefix(k, "blk.") {
			layerCount++
		}

		for _, tensor := range l {
			if strings.Contains(tensor.Name, "attn_v.weight") ||
				strings.Contains(tensor.Name, "attn_qkv.weight") ||
				strings.Contains(tensor.Name, "attn_kv_b.weight") {
				qs.nAttnV++
			} else if tensor.Name == "output.weight" {
				qs.hasOutput = true
			}
		}
	}
	qs.nFfnDown = layerCount

	origTensors := orig.Tensors().Items()
	outputTensors := make([]*fsggml.Tensor, len(origTensors))
	for i, tensor := range origTensors {
		targetType := newType(tensor, kv, qs, newFileType)
		newTensor := &fsggml.Tensor{
			Name:  tensor.Name,
			Shape: tensor.Shape,
			Kind:  uint32(targetType),
		}
		newTensor.WriterTo = quantizer{
			File:       in,
			offset:     orig.Tensors().Offset + tensor.Offset,
			from:       tensor,
			to:         newTensor,
			progressFn: progressFn,
		}
		outputTensors[i] = newTensor
	}

	return fsggml.WriteGGUF(out, kv, outputTensors)
}

// newType returns the tensor type that tensor t should have in a file of type
// ftype. Tensors excluded by the name and shape rules below keep their source
// type; all others start from ftype's default tensor type and are refined by
// the architecture-specific and per-tensor rules.
func newType(t *fsggml.Tensor, kv fsggml.KV, qs *quantizeState, ftype fsggml.FileType) fsggml.TensorType {
	defaultType := ftype.ToTensorType()
	name := t.Name
	quantize := strings.HasSuffix(name, "weight")

	// don't quantize vision or audio encoder tensors
	quantize = quantize && !strings.HasPrefix(name, "v.")
	quantize = quantize && !strings.HasPrefix(name, "a.")
	quantize = quantize && !strings.Contains(name, "mm.")

	// quantize only 2D and 3D tensors (experts)
	quantize = quantize && (len(t.Shape) >= 2)

	// do not quantize norm tensors
	quantize = quantize && !strings.Contains(name, "_norm.weight")

	// do not quantize expert gating tensors
	quantize = quantize && !strings.Contains(name, "ffn_gate_inp.weight")
	quantize = quantize && !strings.Contains(name, "ffn_gate_inp_shexp.weight")

	// do not quantize positional embeddings and token types (BERT)
	quantize = quantize && (name != "position_embd.weight")
	quantize = quantize && (name != "token_types.weight")

	// do not quantize Mamba's small yet 2D weights
	// NOTE: can't use LLM_TN here because the layer number is not known
	quantize = quantize && !strings.Contains(name, "ssm_conv1d.weight")

	// do not quantize LFM2's shortconv kernel weights
	quantize = quantize && !strings.Contains(name, "shortconv.conv.weight")

	// do not quantize RWKV's time_mix_first tensors
	quantize = quantize && !strings.Contains(name, "time_mix_first.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_w1.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_w2.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_decay_w1.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_decay_w2.weight")
	quantize = quantize && !strings.Contains(name, "time_mix_lerp_fused.weight")

	// do not quantize relative position bias (T5)
	quantize = quantize && !strings.Contains(name, "attn_rel_b.weight")

	quantize = quantize && !strings.Contains(name, "per_layer_token_embd.weight")

	sourceType := fsggml.TensorType(t.Kind)
	if !quantize {
		return sourceType
	}

	_, isSourceFP8 := qs.sourceFP8Tensors[name]

	if qs.preserveSourceFP8ToQ8 && !isSourceFP8 {
		return sourceType
	}

	if slices.Contains([]string{"qwen3next", "qwen35", "qwen35moe"}, kv.Architecture()) &&
		(ftype == fsggml.FileTypeQ4_K_M || ftype == fsggml.FileTypeQ4_K_S) {
		// NOTE(review): this path returns before getTensorNewType, so its
		// block-size fallback is not applied to these tensors - confirm intended.
		if qt, ok := qwen3LinearAttnQuantType(name); ok {
			return qt
		}
	}

	// TODO: Consider extracting architecture-specific GGUF quantization policy
	// from server so different quantization backends can share one source of
	// truth for model-family specializations.

	// get more optimal quantization type based on the tensor shape, layer, etc.
	if qs.preserveSourceQ4 && !isSourceFP8 {
		defaultType = fsggml.TensorTypeQ8_0
	}

	if kv.Architecture() == "laguna" {
		var ok bool
		defaultType, ok = lagunaGGUFQuantization(name, sourceType, defaultType, ftype, int(kv.Uint("block_count", 0)))
		if !ok {
			return sourceType
		}
	}

	targetType := getTensorNewType(kv, qs, defaultType, t.Name, t.Shape, ftype)
	if targetType != defaultType {
		slog.Debug("tensor quantization adjusted for better quality", "name", t.Name, "requested", defaultType, "quantization", targetType)
	}
	return targetType
}

// sourceFP8TensorSet returns the names stored under the "source_fp8_tensors"
// metadata key as a set, or nil when the key holds no names.
func sourceFP8TensorSet(kv fsggml.KV) map[string]struct{} {
	names := kv.Strings("source_fp8_tensors")
	if len(names) == 0 {
		return nil
	}

	out := make(map[string]struct{}, len(names))
	for _, name := range names {
		out[name] = struct{}{}
	}
	return out
}