glm4moelite: quantize more tensors to q8_0 and avoid double BOS token (#13891)

2026-03-09 07:16:38 -05:00 · 2026-01-24 16:33:54 -08:00
parent f3b476c592
commit 16750865d1
2 changed files with 8 additions and 1 deletions
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -95,6 +95,13 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType
 			// for the 8-expert model, bumping this to Q8_0 trades just ~128MB
 			newType = fsggml.TensorTypeQ8_0
 		}
+	} else if strings.Contains(name, "attn_k_b.weight") ||
+		strings.Contains(name, "attn_v_b.weight") ||
+		strings.Contains(name, "attn_kv_a_mqa.weight") ||
+		strings.Contains(name, "attn_q_a.weight") ||
+		strings.Contains(name, "attn_q_b.weight") {
+		// MLA tensors need higher precision to avoid quality degradation
+		newType = fsggml.TensorTypeQ8_0
 	} else if strings.Contains(name, "ffn_down") {
 		iLayer := qs.iFfnDown
 		n_layer := qs.nFfnDown