mlx: int4 groupsize 64 (#14682)

Change affine 4-bit integer (int4) quantization to use a group size of 64 instead of 32.
This commit is contained in:
Patrick Devine
2026-03-06 16:39:47 -08:00
committed by GitHub
parent 288077c3a3
commit e790dc435b
2 changed files with 3 additions and 3 deletions

View File

@@ -21,7 +21,7 @@ var quantizeParams = map[string]struct {
bits int
mode string
}{
-"int4": {32, 4, "affine"},
+"int4": {64, 4, "affine"},
"nvfp4": {16, 4, "nvfp4"},
"int8": {64, 8, "affine"},
"mxfp8": {32, 8, "mxfp8"},

View File

@@ -334,12 +334,12 @@ func GetTensorQuantization(name string, shape []int32, quantize string) string {
quantNorm := normalizeQuantType(quantize)
// MLX quantization requires last dimension to be divisible by group size
-// nvfp4: 16, int4/mxfp8: 32, int8: 64
+// nvfp4: 16, mxfp8: 32, int4/int8: 64
groupSize := int32(32)
switch quantNorm {
case "nvfp4":
groupSize = 16
-case "int8":
+case "int4", "int8":
groupSize = 64
}
if shape[len(shape)-1]%groupSize != 0 {