safetensors quantization for mlx (#14184)

This change includes:
- changes to the safetensors metadata format
- changes to the create command so that it properly creates the blobs with the new format
- changes to load the new format
- fixes to ollama show so that it properly shows each tensor
2026-03-11 17:34:04 -05:00 · 2026-02-10 11:29:17 -08:00
parent 9ec733e527
commit a0407d07fa
14 changed files with 1640 additions and 461 deletions
--- a/x/server/show.go
+++ b/x/server/show.go
@@ -6,6 +6,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"sort"
 	"strings"

 	"github.com/ollama/ollama/api"
@@ -105,9 +106,9 @@ func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map
 		bytesPerParam = 1
 	}

-	// Subtract safetensors header overhead (88 bytes per tensor file)
-	// Each tensor is stored as a minimal safetensors file
-	totalBytes := totalTensorBytes - tensorCount*88
+	// Subtract safetensors header overhead per tensor blob.
+	// Headers include __metadata__ with the tensor name, so overhead is ~150 bytes on average.
+	totalBytes := totalTensorBytes - tensorCount*150

 	paramCount := totalBytes / bytesPerParam

@@ -163,24 +164,103 @@ func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {

 // getTensorInfoFromManifest extracts tensor info from a manifest.
 // This is separated for testability.
-// For quantized models, groups weight/scale/qbias into single entries with detected quantization type.
+// For quantized tensors, uses the quant type from the blob header (__metadata__, or inferred per tensor) and reports the unpacked shape.
+// For packed blobs (multiple tensors per blob), enumerates all tensors in the blob. Unreadable blobs are skipped; the result is sorted by name.
 func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
 	var tensors []api.Tensor

-	// First pass: collect all tensor info and identify scale tensors
-	type tensorData struct {
-		info   *safetensorsTensorInfo
-		digest string
-	}
-	tensorMap := make(map[string]*tensorData)
-	scaleMap := make(map[string]*tensorData) // base name -> scale tensor info
-
 	for _, layer := range mf.Layers {
 		if layer.MediaType != manifest.MediaTypeImageTensor {
 			continue
 		}

-		// Read the safetensors header from the blob
+		// Read all tensor entries from the safetensors header (only the header is parsed, not the tensor data)
+		blobPath, err := manifest.BlobsPath(layer.Digest)
+		if err != nil {
+			continue // best-effort: a layer whose blob path cannot be resolved is left out
+		}
+
+		f, err := os.Open(blobPath)
+		if err != nil {
+			continue // best-effort: a blob that cannot be opened is left out
+		}
+
+		allInfos, err := parseSafetensorsAllHeaders(f)
+		f.Close()
+		if err != nil {
+			continue // best-effort: a blob with an unparsable header is left out
+		}
+
+		// Determine if this is a packed blob (multiple main tensors)
+		isPacked := len(allInfos) > 1
+
+		for _, info := range allInfos {
+			tensorName := layer.Name
+			if isPacked {
+				// For packed blobs, use the tensor name from the header; single-tensor blobs keep the layer name
+				tensorName = info.Name
+			}
+
+			if info.QuantType != "" {
+				quantType := strings.ToUpper(info.QuantType)
+
+				shape := make([]uint64, len(info.Shape))
+				for i, s := range info.Shape {
+					shape[i] = uint64(s)
+				}
+
+				var packFactor int64 // multiplier that unpacks the last dimension; 0 leaves the stored shape untouched
+				switch strings.ToLower(info.QuantType) {
+				case "int4", "nvfp4":
+					packFactor = 8
+				case "int8", "mxfp8":
+					packFactor = 4
+				}
+				if packFactor > 0 && len(shape) >= 2 {
+					shape[len(shape)-1] = uint64(info.Shape[len(info.Shape)-1] * packFactor)
+				}
+
+				tensors = append(tensors, api.Tensor{
+					Name:  tensorName,
+					Type:  quantType,
+					Shape: shape,
+				})
+			} else {
+				shape := make([]uint64, len(info.Shape))
+				for i, s := range info.Shape {
+					shape[i] = uint64(s)
+				}
+
+				tensors = append(tensors, api.Tensor{
+					Name:  tensorName,
+					Type:  info.Dtype,
+					Shape: shape,
+				})
+			}
+		}
+	}
+
+	sort.Slice(tensors, func(i, j int) bool {
+		return tensors[i].Name < tensors[j].Name
+	})
+
+	return tensors, nil
+}
+
+// GetSafetensorsDtype returns the quantization type for a safetensors model.
+// Reads quant_type from the first tensor blob's __metadata__.
+// Falls back to torch_dtype from config.json if no quant metadata.
+func GetSafetensorsDtype(name model.Name) (string, error) {
+	mf, err := manifest.ParseNamedManifest(name)
+	if err != nil {
+		return "", fmt.Errorf("failed to load manifest: %w", err)
+	}
+
+	// Check first tensor blob for quant_type metadata
+	for _, layer := range mf.Layers {
+		if layer.MediaType != manifest.MediaTypeImageTensor {
+			continue
+		}
 		blobPath, err := manifest.BlobsPath(layer.Digest)
 		if err != nil {
 			continue
@@ -189,131 +269,11 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
 		if err != nil {
 			continue
 		}
-
-		td := &tensorData{info: info, digest: layer.Digest}
-
-		if strings.HasSuffix(layer.Name, "_scale") {
-			baseName := strings.TrimSuffix(layer.Name, "_scale")
-			scaleMap[baseName] = td
-		} else if strings.HasSuffix(layer.Name, "_qbias") {
-			// Skip qbias tensors - they're included with the quantized weight
-			continue
-		} else {
-			tensorMap[layer.Name] = td
+		if info.QuantType != "" {
+			return strings.ToUpper(info.QuantType), nil
 		}
-	}
-
-	// Second pass: build tensor list with quantization info
-	for _, layer := range mf.Layers {
-		if layer.MediaType != manifest.MediaTypeImageTensor {
-			continue
-		}
-
-		// Skip scale and qbias tensors
-		if strings.HasSuffix(layer.Name, "_scale") || strings.HasSuffix(layer.Name, "_qbias") {
-			continue
-		}
-
-		td := tensorMap[layer.Name]
-		if td == nil {
-			continue
-		}
-
-		// Check if this tensor has a corresponding scale tensor (quantized)
-		scaleTd := scaleMap[layer.Name]
-		if scaleTd != nil && len(td.info.Shape) >= 2 && len(scaleTd.info.Shape) >= 2 {
-			// Quantized tensor - detect bits from shapes
-			weightCols := td.info.Shape[len(td.info.Shape)-1]
-			scaleCols := scaleTd.info.Shape[len(scaleTd.info.Shape)-1]
-
-			// Detect quantization: Q4 has pack_factor=8, Q8 has pack_factor=4
-			// Q4 uses group_size=32: weightCols * 8 / scaleCols = 32
-			// Q8 uses group_size=64: weightCols * 4 / scaleCols = 64
-			var bits int
-			var quantType string
-			if weightCols*8/scaleCols == 32 {
-				bits = 4
-				quantType = "Q4"
-			} else if weightCols*4/scaleCols == 64 {
-				bits = 8
-				quantType = "Q8"
-			} else {
-				// Unknown quantization, show raw
-				quantType = td.info.Dtype
-			}
-
-			// Calculate unpacked shape
-			shape := make([]uint64, len(td.info.Shape))
-			for i, s := range td.info.Shape {
-				shape[i] = uint64(s)
-			}
-			if bits > 0 {
-				packFactor := int64(32 / bits)
-				shape[len(shape)-1] = uint64(td.info.Shape[len(td.info.Shape)-1] * packFactor)
-			}
-
-			tensors = append(tensors, api.Tensor{
-				Name:  layer.Name,
-				Type:  quantType,
-				Shape: shape,
-			})
-		} else {
-			// Non-quantized tensor
-			shape := make([]uint64, len(td.info.Shape))
-			for i, s := range td.info.Shape {
-				shape[i] = uint64(s)
-			}
-
-			tensors = append(tensors, api.Tensor{
-				Name:  layer.Name,
-				Type:  td.info.Dtype,
-				Shape: shape,
-			})
-		}
-	}
-
-	return tensors, nil
-}
-
-// GetSafetensorsDtype returns the quantization type for a safetensors model.
-// Reads from model_index.json first, falls back to detection from tensor names.
-// Otherwise returns the torch_dtype from config.json.
-func GetSafetensorsDtype(name model.Name) (string, error) {
-	mf, err := manifest.ParseNamedManifest(name)
-	if err != nil {
-		return "", fmt.Errorf("failed to load manifest: %w", err)
-	}
-
-	// First try to read quantization from model_index.json
-	var modelIndex struct {
-		Quantization string `json:"quantization"`
-	}
-	if err := mf.ReadConfigJSON("model_index.json", &modelIndex); err == nil && modelIndex.Quantization != "" {
-		return modelIndex.Quantization, nil
-	}
-
-	// Fallback: detect from tensor names
-	hasScales := false
-	hasQBias := false
-	for _, layer := range mf.Layers {
-		if layer.MediaType == manifest.MediaTypeImageTensor {
-			if strings.HasSuffix(layer.Name, "_scale") {
-				hasScales = true
-			}
-			if strings.HasSuffix(layer.Name, "_qbias") {
-				hasQBias = true
-			}
-		}
-	}
-
-	if hasScales {
-		if hasQBias {
-			// Affine mode (has scale + qbias) - could be Q4 or Q8
-			// Default to Q4 as it's more common
-			return "Q4", nil
-		}
-		// No qbias = NVFP4
-		return "NVFP4", nil
+		// Only check the first tensor blob
+		break
 	}

 	// Not quantized - return torch_dtype from config.json
@@ -329,8 +289,11 @@ func GetSafetensorsDtype(name model.Name) (string, error) {

 // safetensorsTensorInfo holds metadata about a tensor from a safetensors header
 type safetensorsTensorInfo struct {
-	Dtype string  `json:"dtype"`
-	Shape []int64 `json:"shape"`
+	Name      string  // tensor name from the header key; filled in by parseSafetensorsAllHeaders
+	Dtype     string  `json:"dtype"`
+	Shape     []int64 `json:"shape"`
+	QuantType string  // from __metadata__.quant_type (e.g., "int4", "int8", "nvfp4", "mxfp8") or inferred from shapes; "" when none was found
+	GroupSize string  // from __metadata__.group_size (e.g., "32", "64"), kept as the raw string; "" when not in metadata
 }

 // readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
@@ -347,6 +310,7 @@ func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {

 // parseSafetensorsHeader parses a safetensors header from a reader.
 // This is separated for testability.
+// Parses __metadata__ for quant_type and group_size if present.
 func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
 	// Read header size (8 bytes, little endian)
 	var headerSize uint64
@@ -371,7 +335,31 @@ func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
 		return nil, fmt.Errorf("failed to parse header: %w", err)
 	}

-	// Find the first (and should be only) tensor entry
+	// Parse metadata if present
+	var quantType, groupSize string
+	if metaRaw, ok := header["__metadata__"]; ok {
+		var meta map[string]string
+		if json.Unmarshal(metaRaw, &meta) == nil {
+			quantType = meta["quant_type"]
+			groupSize = meta["group_size"]
+		}
+	}
+
+	// Find the main tensor entry (not __metadata__, .scale, or .bias)
+	for name, raw := range header {
+		if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
+			continue
+		}
+		var info safetensorsTensorInfo
+		if err := json.Unmarshal(raw, &info); err != nil {
+			return nil, fmt.Errorf("failed to parse tensor info: %w", err)
+		}
+		info.QuantType = quantType
+		info.GroupSize = groupSize
+		return &info, nil
+	}
+
+	// Fall back to first non-metadata tensor entry
 	for name, raw := range header {
 		if name == "__metadata__" {
 			continue
@@ -380,8 +368,134 @@ func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
 		if err := json.Unmarshal(raw, &info); err != nil {
 			return nil, fmt.Errorf("failed to parse tensor info: %w", err)
 		}
+		info.QuantType = quantType
+		info.GroupSize = groupSize
 		return &info, nil
 	}

 	return nil, fmt.Errorf("no tensor found in header")
 }
+
+// parseSafetensorsAllHeaders reads the 8-byte little-endian header length and the JSON header from r.
+// Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias), sorted by name.
+// For packed blobs this returns multiple entries; for single-tensor blobs, one entry.
+// If __metadata__ carries a quant_type, it (with group_size) is applied to every entry; otherwise a tensor that
+// has a "<name>.scale" entry gets its quant type from inferQuantType. Returns an error if no main tensor is found.
+func parseSafetensorsAllHeaders(r io.Reader) ([]safetensorsTensorInfo, error) {
+	var headerSize uint64
+	if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
+		return nil, fmt.Errorf("failed to read header size: %w", err)
+	}
+
+	if headerSize > 100*1024*1024 { // 100MB limit for packed blob headers
+		return nil, fmt.Errorf("header size too large: %d", headerSize)
+	}
+
+	headerBytes := make([]byte, headerSize)
+	if _, err := io.ReadFull(r, headerBytes); err != nil {
+		return nil, fmt.Errorf("failed to read header: %w", err)
+	}
+
+	var header map[string]json.RawMessage
+	if err := json.Unmarshal(headerBytes, &header); err != nil {
+		return nil, fmt.Errorf("failed to parse header: %w", err)
+	}
+
+	// Parse global metadata if present; a __metadata__ value that is not a string-to-string map is ignored
+	var globalQuantType, globalGroupSize string
+	if metaRaw, ok := header["__metadata__"]; ok {
+		var meta map[string]string
+		if json.Unmarshal(metaRaw, &meta) == nil {
+			globalQuantType = meta["quant_type"]
+			globalGroupSize = meta["group_size"]
+		}
+	}
+
+	// Build a set of all keys; used below to check whether a tensor has a "<name>.scale" entry
+	headerKeys := make(map[string]bool, len(header))
+	for k := range header {
+		headerKeys[k] = true
+	}
+
+	// Collect all main tensor entries (sorted for deterministic output)
+	var mainNames []string
+	for name := range header {
+		if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
+			continue // NOTE(review): the ".bias" suffix also matches ordinary bias parameters (e.g. "x.bias") — confirm none are stored in these blobs
+		}
+		mainNames = append(mainNames, name)
+	}
+	sort.Strings(mainNames)
+
+	var results []safetensorsTensorInfo
+	for _, name := range mainNames {
+		var info safetensorsTensorInfo
+		if err := json.Unmarshal(header[name], &info); err != nil {
+			return nil, fmt.Errorf("failed to parse tensor info for %s: %w", name, err)
+		}
+		info.Name = name
+
+		if globalQuantType != "" {
+			// Use global metadata: it applies to every main tensor in the blob
+			info.QuantType = globalQuantType
+			info.GroupSize = globalGroupSize
+		} else if headerKeys[name+".scale"] {
+			// No global metadata, but has .scale - infer quant type from shape (GroupSize stays empty on this path)
+			info.QuantType = inferQuantType(header, name)
+		}
+
+		results = append(results, info)
+	}
+
+	if len(results) == 0 {
+		return nil, fmt.Errorf("no tensor found in header")
+	}
+
+	return results, nil
+}
+
+// inferQuantType infers the quantization type of tensor name from the last dimension of its shape and of its ".scale" shape.
+// Returns "int4" or "int8", or "" when either entry is missing, unparsable, has fewer than 2 dims, or the ratio matches neither layout.
+func inferQuantType(header map[string]json.RawMessage, name string) string {
+	// Parse the main tensor shape
+	var mainInfo struct {
+		Shape []int64 `json:"shape"`
+	}
+	if json.Unmarshal(header[name], &mainInfo) != nil || len(mainInfo.Shape) < 2 {
+		return ""
+	}
+
+	// Parse scale shape to determine group size
+	scaleRaw, ok := header[name+".scale"]
+	if !ok {
+		return ""
+	}
+	var scaleInfo struct {
+		Shape []int64 `json:"shape"`
+	}
+	if json.Unmarshal(scaleRaw, &scaleInfo) != nil || len(scaleInfo.Shape) < 2 {
+		return ""
+	}
+
+	// The group size follows from: main_cols * pack_factor / scale_cols
+	// Main is presumably stored as U32 words (the dtype is not checked here), which fixes the pack factor per bit width
+	// For int4: pack=8, group=32. scale_cols = original_cols / 32 = main_cols * 8 / 32 = main_cols / 4
+	// For int8: pack=4, group=64. scale_cols = original_cols / 64 = main_cols * 4 / 64 = main_cols / 16
+	mainCols := mainInfo.Shape[len(mainInfo.Shape)-1]
+	scaleCols := scaleInfo.Shape[len(scaleInfo.Shape)-1]
+	if scaleCols == 0 {
+		return ""
+	}
+
+	ratio := mainCols / scaleCols // main_packed_cols / scale_cols; integer division, so a non-exact multiple is truncated
+	// int4: ratio = (orig/8) / (orig/32) = 32/8 = 4
+	// int8: ratio = (orig/4) / (orig/64) = 64/4 = 16. NOTE(review): int4 with group_size 128 would also give 16 — confirm only these group sizes occur
+	switch ratio {
+	case 4:
+		return "int4"
+	case 16:
+		return "int8"
+	default:
+		return ""
+	}
+}
--- a/x/server/show_test.go
+++ b/x/server/show_test.go
@@ -36,7 +36,7 @@ func TestBuildModelInfo(t *testing.T) {
 				VocabSize:             262144,
 				TorchDtype:            "bfloat16",
 			},
-			totalTensorBytes: 8_600_000_088, // ~4.3B params * 2 bytes + 88 bytes header
+			totalTensorBytes: 8_600_000_150, // ~4.3B params * 2 bytes + 150 bytes header
 			tensorCount:      1,
 			wantArch:         "gemma3",
 			wantContextLen:   131072,
@@ -57,7 +57,7 @@ func TestBuildModelInfo(t *testing.T) {
 				VocabSize:             32000,
 				TorchDtype:            "float16",
 			},
-			totalTensorBytes: 14_000_000_088, // ~7B params * 2 bytes + 88 bytes header
+			totalTensorBytes: 14_000_000_150, // ~7B params * 2 bytes + 150 bytes header
 			tensorCount:      1,
 			wantArch:         "llama",
 			wantContextLen:   4096,
@@ -84,7 +84,7 @@ func TestBuildModelInfo(t *testing.T) {
 				VocabSize:         262144,
 				TorchDtype:        "bfloat16",
 			},
-			totalTensorBytes: 8_600_000_088,
+			totalTensorBytes: 8_600_000_150,
 			tensorCount:      1,
 			wantArch:         "gemma3",
 			wantContextLen:   131072,
@@ -101,7 +101,7 @@ func TestBuildModelInfo(t *testing.T) {
 				MaxPositionEmbeddings: 2048,
 				TorchDtype:            "float32",
 			},
-			totalTensorBytes: 400_000_088, // 100M params * 4 bytes + 88 bytes header
+			totalTensorBytes: 400_000_150, // 100M params * 4 bytes + 150 bytes header
 			tensorCount:      1,
 			wantArch:         "test",
 			wantContextLen:   2048,
@@ -118,7 +118,7 @@ func TestBuildModelInfo(t *testing.T) {
 				MaxPositionEmbeddings: 1024,
 				TorchDtype:            "bfloat16",
 			},
-			totalTensorBytes: 2_000_880, // 1M params * 2 bytes + 10 tensors * 88 bytes
+			totalTensorBytes: 2_001_500, // 1M params * 2 bytes + 10 tensors * 150 bytes
 			tensorCount:      10,
 			wantArch:         "test",
 			wantContextLen:   1024,
@@ -230,42 +230,42 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {
 		{
 			name:           "bfloat16",
 			dtype:          "bfloat16",
-			totalBytes:     2_000_088, // 1M * 2 + 88
+			totalBytes:     2_000_150, // 1M * 2 + 150
 			tensorCount:    1,
 			wantParamCount: 1_000_000,
 		},
 		{
 			name:           "float16",
 			dtype:          "float16",
-			totalBytes:     2_000_088,
+			totalBytes:     2_000_150,
 			tensorCount:    1,
 			wantParamCount: 1_000_000,
 		},
 		{
 			name:           "float32",
 			dtype:          "float32",
-			totalBytes:     4_000_088, // 1M * 4 + 88
+			totalBytes:     4_000_150, // 1M * 4 + 150
 			tensorCount:    1,
 			wantParamCount: 1_000_000,
 		},
 		{
 			name:           "int8",
 			dtype:          "int8",
-			totalBytes:     1_000_088, // 1M * 1 + 88
+			totalBytes:     1_000_150, // 1M * 1 + 150
 			tensorCount:    1,
 			wantParamCount: 1_000_000,
 		},
 		{
 			name:           "unknown dtype defaults to 2 bytes",
 			dtype:          "unknown",
-			totalBytes:     2_000_088,
+			totalBytes:     2_000_150,
 			tensorCount:    1,
 			wantParamCount: 1_000_000,
 		},
 		{
 			name:           "empty dtype defaults to 2 bytes",
 			dtype:          "",
-			totalBytes:     2_000_088,
+			totalBytes:     2_000_150,
 			tensorCount:    1,
 			wantParamCount: 1_000_000,
 		},
@@ -288,11 +288,13 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {

 func TestParseSafetensorsHeader(t *testing.T) {
 	tests := []struct {
-		name      string
-		header    map[string]any
-		wantDtype string
-		wantShape []int64
-		wantErr   bool
+		name          string
+		header        map[string]any
+		wantDtype     string
+		wantShape     []int64
+		wantQuantType string
+		wantGroupSize string
+		wantErr       bool
 	}{
 		{
 			name: "simple tensor",
@@ -307,7 +309,70 @@ func TestParseSafetensorsHeader(t *testing.T) {
 			wantShape: []int64{2560, 262144},
 		},
 		{
-			name: "with metadata",
+			name: "tensor keyed by name",
+			header: map[string]any{
+				"model.layers.0.weight": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 2560},
+					"data_offsets": []int64{0, 13107200},
+				},
+			},
+			wantDtype: "BF16",
+			wantShape: []int64{2560, 2560},
+		},
+		{
+			name: "with int4 quant metadata",
+			header: map[string]any{
+				"__metadata__": map[string]any{
+					"quant_type": "int4",
+					"group_size": "32",
+				},
+				"model.layers.0.mlp.up_proj.weight": map[string]any{
+					"dtype":        "U32",
+					"shape":        []int64{2560, 320},
+					"data_offsets": []int64{0, 3276800},
+				},
+				"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 80},
+					"data_offsets": []int64{3276800, 3686400},
+				},
+				"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 80},
+					"data_offsets": []int64{3686400, 4096000},
+				},
+			},
+			wantDtype:     "U32",
+			wantShape:     []int64{2560, 320},
+			wantQuantType: "int4",
+			wantGroupSize: "32",
+		},
+		{
+			name: "int8 quant metadata",
+			header: map[string]any{
+				"__metadata__": map[string]any{
+					"quant_type": "int8",
+					"group_size": "64",
+				},
+				"model.layers.0.mlp.down_proj.weight": map[string]any{
+					"dtype":        "U32",
+					"shape":        []int64{2560, 640},
+					"data_offsets": []int64{0, 6553600},
+				},
+				"model.layers.0.mlp.down_proj.weight.scale": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 40},
+					"data_offsets": []int64{6553600, 6963200},
+				},
+			},
+			wantDtype:     "U32",
+			wantShape:     []int64{2560, 640},
+			wantQuantType: "int8",
+			wantGroupSize: "64",
+		},
+		{
+			name: "with old-style format metadata",
 			header: map[string]any{
 				"__metadata__": map[string]any{
 					"format": "pt",
@@ -371,6 +436,13 @@ func TestParseSafetensorsHeader(t *testing.T) {
 					}
 				}
 			}
+
+			if info.QuantType != tt.wantQuantType {
+				t.Errorf("QuantType = %v, want %v", info.QuantType, tt.wantQuantType)
+			}
+			if info.GroupSize != tt.wantGroupSize {
+				t.Errorf("GroupSize = %v, want %v", info.GroupSize, tt.wantGroupSize)
+			}
 		})
 	}
 }
@@ -460,7 +532,7 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
 		t.Fatalf("failed to create blobs dir: %v", err)
 	}

-	// Create test tensor blobs
+	// Create test tensor blobs with __metadata__
 	tensors := []struct {
 		name   string
 		digest string
@@ -487,10 +559,9 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
 		},
 	}

-	// Create blob files
+	// Create blob files with tensor keyed by name
 	var layers []manifest.Layer
 	for _, tensor := range tensors {
-		// Create safetensors blob
 		header := map[string]any{
 			tensor.name: map[string]any{
 				"dtype":        tensor.dtype,
@@ -561,6 +632,391 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
 	}
 }

+func TestGetTensorInfoFromManifest_Quantized(t *testing.T) {
+	// Single int4 blob: expect one entry with the layer name, type INT4 and an unpacked shape. OLLAMA_MODELS is pointed at a temp dir for the blobs.
+	tempDir := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", tempDir)
+
+	blobDir := filepath.Join(tempDir, "blobs")
+	if err := os.MkdirAll(blobDir, 0o755); err != nil {
+		t.Fatalf("failed to create blobs dir: %v", err)
+	}
+
+	// Header of a combined quantized blob: the weight plus its .scale and .bias entries, with quant info in __metadata__
+	header := map[string]any{
+		"__metadata__": map[string]string{
+			"quant_type": "int4",
+			"group_size": "32",
+		},
+		"model.layers.0.mlp.up_proj.weight": map[string]any{
+			"dtype":        "U32",
+			"shape":        []int64{2560, 320}, // packed: 2560 / 8 = 320
+			"data_offsets": []int64{0, 3276800},
+		},
+		"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
+			"dtype":        "BF16",
+			"shape":        []int64{2560, 80}, // 2560 / 32 = 80
+			"data_offsets": []int64{3276800, 3686400},
+		},
+		"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
+			"dtype":        "BF16",
+			"shape":        []int64{2560, 80},
+			"data_offsets": []int64{3686400, 4096000},
+		},
+	}
+	headerJSON, _ := json.Marshal(header)
+
+	var buf bytes.Buffer
+	binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))) // 8-byte little-endian header length prefix
+	buf.Write(headerJSON)
+
+	digest := "sha256:aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb"
+	blobPath, err := manifest.BlobsPath(digest)
+	if err != nil {
+		t.Fatalf("failed to get blob path: %v", err)
+	}
+	if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
+		t.Fatalf("failed to write blob: %v", err)
+	}
+
+	mf := &manifest.Manifest{
+		SchemaVersion: 2,
+		MediaType:     "application/vnd.docker.distribution.manifest.v2+json",
+		Layers: []manifest.Layer{
+			{
+				MediaType: manifest.MediaTypeImageTensor,
+				Digest:    digest,
+				Size:      int64(buf.Len() + 4096000), // header bytes plus the declared data length; the data itself is not written to the blob
+				Name:      "model.layers.0.mlp.up_proj.weight",
+			},
+		},
+	}
+
+	result, err := getTensorInfoFromManifest(mf)
+	if err != nil {
+		t.Fatalf("getTensorInfoFromManifest() error = %v", err)
+	}
+
+	if len(result) != 1 {
+		t.Fatalf("got %d tensors, want 1", len(result))
+	}
+
+	tensor := result[0]
+	if tensor.Name != "model.layers.0.mlp.up_proj.weight" {
+		t.Errorf("Name = %v, want model.layers.0.mlp.up_proj.weight", tensor.Name)
+	}
+	if tensor.Type != "INT4" {
+		t.Errorf("Type = %v, want INT4", tensor.Type)
+	}
+	// Shape should be unpacked: 320 * 8 = 2560 in the last dimension
+	if len(tensor.Shape) != 2 || tensor.Shape[0] != 2560 || tensor.Shape[1] != 2560 {
+		t.Errorf("Shape = %v, want [2560, 2560]", tensor.Shape)
+	}
+}
+
+func TestParseSafetensorsAllHeaders(t *testing.T) {
+	tests := []struct {
+		name       string
+		header     map[string]any
+		wantCount  int
+		wantNames  []string // expected tensor names, in the sorted order the parser returns
+		wantDtypes []string // expected Dtype per entry, same order as wantNames
+		wantQuants []string // expected QuantType per entry, same order as wantNames
+		wantErr    bool
+	}{
+		{
+			name: "single tensor blob",
+			header: map[string]any{
+				"model.layers.0.weight": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 2560},
+					"data_offsets": []int64{0, 13107200},
+				},
+			},
+			wantCount:  1,
+			wantNames:  []string{"model.layers.0.weight"},
+			wantDtypes: []string{"BF16"},
+			wantQuants: []string{""},
+		},
+		{
+			name: "packed unquantized blob",
+			header: map[string]any{
+				"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 10240},
+					"data_offsets": []int64{0, 52428800},
+				},
+				"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 2560},
+					"data_offsets": []int64{52428800, 104857600},
+				},
+				"model.layers.0.mlp.experts.0.up_proj.weight": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 2560},
+					"data_offsets": []int64{104857600, 157286400},
+				},
+			},
+			wantCount: 3,
+			wantNames: []string{
+				"model.layers.0.mlp.experts.0.down_proj.weight",
+				"model.layers.0.mlp.experts.0.gate_proj.weight",
+				"model.layers.0.mlp.experts.0.up_proj.weight",
+			},
+			wantDtypes: []string{"BF16", "BF16", "BF16"},
+			wantQuants: []string{"", "", ""},
+		},
+		{
+			name: "packed quantized blob with global metadata",
+			header: map[string]any{
+				"__metadata__": map[string]any{
+					"quant_type": "int4",
+					"group_size": "32",
+				},
+				"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
+					"dtype":        "U32",
+					"shape":        []int64{10240, 320},
+					"data_offsets": []int64{0, 13107200},
+				},
+				"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 80},
+					"data_offsets": []int64{13107200, 14745600},
+				},
+				"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 80},
+					"data_offsets": []int64{14745600, 16384000},
+				},
+				"model.layers.0.mlp.experts.0.up_proj.weight": map[string]any{
+					"dtype":        "U32",
+					"shape":        []int64{10240, 320},
+					"data_offsets": []int64{16384000, 29491200},
+				},
+				"model.layers.0.mlp.experts.0.up_proj.weight.scale": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 80},
+					"data_offsets": []int64{29491200, 31129600},
+				},
+				"model.layers.0.mlp.experts.0.up_proj.weight.bias": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 80},
+					"data_offsets": []int64{31129600, 32768000},
+				},
+			},
+			wantCount: 2,
+			wantNames: []string{
+				"model.layers.0.mlp.experts.0.gate_proj.weight",
+				"model.layers.0.mlp.experts.0.up_proj.weight",
+			},
+			wantDtypes: []string{"U32", "U32"},
+			wantQuants: []string{"int4", "int4"},
+		},
+		{
+			name: "packed mixed-precision blob (no global metadata)",
+			header: map[string]any{
+				"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
+					"dtype":        "U32",
+					"shape":        []int64{10240, 320}, // 320 / 80 scale cols = 4 -> inferred int4
+					"data_offsets": []int64{0, 13107200},
+				},
+				"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 80},
+					"data_offsets": []int64{13107200, 14745600},
+				},
+				"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{10240, 80},
+					"data_offsets": []int64{14745600, 16384000},
+				},
+				"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
+					"dtype":        "U32",
+					"shape":        []int64{2560, 2560}, // 2560 / 160 scale cols = 16 -> inferred int8
+					"data_offsets": []int64{16384000, 42598400},
+				},
+				"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
+					"dtype":        "BF16",
+					"shape":        []int64{2560, 160},
+					"data_offsets": []int64{42598400, 43417600},
+				},
+			},
+			wantCount: 2,
+			wantNames: []string{
+				"model.layers.0.mlp.experts.0.down_proj.weight",
+				"model.layers.0.mlp.experts.0.gate_proj.weight",
+			},
+			wantDtypes: []string{"U32", "U32"},
+			wantQuants: []string{"int8", "int4"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			headerJSON, err := json.Marshal(tt.header)
+			if err != nil {
+				t.Fatalf("failed to marshal header: %v", err)
+			}
+
+			var buf bytes.Buffer
+			if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil {
+				t.Fatalf("failed to write header size: %v", err)
+			}
+			buf.Write(headerJSON)
+
+			results, err := parseSafetensorsAllHeaders(&buf)
+			if (err != nil) != tt.wantErr {
+				t.Errorf("parseSafetensorsAllHeaders() error = %v, wantErr %v", err, tt.wantErr)
+				return
+			}
+			if tt.wantErr {
+				return
+			}
+
+			if len(results) != tt.wantCount {
+				t.Fatalf("got %d tensors, want %d", len(results), tt.wantCount)
+			}
+
+			for i, info := range results {
+				if info.Name != tt.wantNames[i] {
+					t.Errorf("tensor[%d].Name = %v, want %v", i, info.Name, tt.wantNames[i])
+				}
+				if info.Dtype != tt.wantDtypes[i] {
+					t.Errorf("tensor[%d].Dtype = %v, want %v", i, info.Dtype, tt.wantDtypes[i])
+				}
+				if info.QuantType != tt.wantQuants[i] {
+					t.Errorf("tensor[%d].QuantType = %v, want %v", i, info.QuantType, tt.wantQuants[i])
+				}
+			}
+		})
+	}
+}
+
+func TestGetTensorInfoFromManifest_Packed(t *testing.T) {
+	// One single-tensor blob plus one packed blob: expect three entries. OLLAMA_MODELS is pointed at a temp dir for the blobs.
+	tempDir := t.TempDir()
+	t.Setenv("OLLAMA_MODELS", tempDir)
+
+	blobDir := filepath.Join(tempDir, "blobs")
+	if err := os.MkdirAll(blobDir, 0o755); err != nil {
+		t.Fatalf("failed to create blobs dir: %v", err)
+	}
+
+	// Create a packed blob with multiple expert tensors (mixed quantization, no __metadata__)
+	header := map[string]any{
+		"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
+			"dtype":        "U32",
+			"shape":        []int64{10240, 320},
+			"data_offsets": []int64{0, 13107200},
+		},
+		"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
+			"dtype":        "BF16",
+			"shape":        []int64{10240, 80},
+			"data_offsets": []int64{13107200, 14745600},
+		},
+		"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
+			"dtype":        "BF16",
+			"shape":        []int64{10240, 80},
+			"data_offsets": []int64{14745600, 16384000},
+		},
+		"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
+			"dtype":        "U32",
+			"shape":        []int64{2560, 2560},
+			"data_offsets": []int64{16384000, 42598400},
+		},
+		"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
+			"dtype":        "BF16",
+			"shape":        []int64{2560, 160},
+			"data_offsets": []int64{42598400, 43417600},
+		},
+	}
+	headerJSON, _ := json.Marshal(header)
+
+	var buf bytes.Buffer
+	binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
+	buf.Write(headerJSON)
+
+	packedDigest := "sha256:aaaa000000000000000000000000000000000000000000000000000000000001"
+	blobPath, err := manifest.BlobsPath(packedDigest)
+	if err != nil {
+		t.Fatalf("failed to get blob path: %v", err)
+	}
+	if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
+		t.Fatalf("failed to write packed blob: %v", err)
+	}
+
+	// Also create a regular (single-tensor) blob
+	singleHeader := map[string]any{
+		"model.embed_tokens.weight": map[string]any{
+			"dtype":        "BF16",
+			"shape":        []int64{262144, 2560},
+			"data_offsets": []int64{0, 1342177280},
+		},
+	}
+	singleHeaderJSON, _ := json.Marshal(singleHeader)
+	var singleBuf bytes.Buffer
+	binary.Write(&singleBuf, binary.LittleEndian, uint64(len(singleHeaderJSON)))
+	singleBuf.Write(singleHeaderJSON)
+
+	singleDigest := "sha256:bbbb000000000000000000000000000000000000000000000000000000000002"
+	singleBlobPath, err := manifest.BlobsPath(singleDigest)
+	if err != nil {
+		t.Fatalf("failed to get blob path: %v", err)
+	}
+	if err := os.WriteFile(singleBlobPath, singleBuf.Bytes(), 0o644); err != nil {
+		t.Fatalf("failed to write single blob: %v", err)
+	}
+
+	mf := &manifest.Manifest{
+		SchemaVersion: 2,
+		MediaType:     "application/vnd.docker.distribution.manifest.v2+json",
+		Layers: []manifest.Layer{
+			{
+				MediaType: manifest.MediaTypeImageTensor,
+				Digest:    singleDigest,
+				Size:      int64(singleBuf.Len()),
+				Name:      "model.embed_tokens.weight",
+			},
+			{
+				MediaType: manifest.MediaTypeImageTensor,
+				Digest:    packedDigest,
+				Size:      int64(buf.Len()),
+				Name:      "model.layers.0.mlp.experts", // group prefix
+			},
+		},
+	}
+
+	result, err := getTensorInfoFromManifest(mf)
+	if err != nil {
+		t.Fatalf("getTensorInfoFromManifest() error = %v", err)
+	}
+
+	// Should have 3 tensors: 1 single + 2 packed main tensors (.scale/.bias entries are not listed)
+	if len(result) != 3 {
+		t.Fatalf("got %d tensors, want 3. Tensors: %v", len(result), result)
+	}
+
+	// First tensor should be the single blob (results are sorted by name, and "model.embed..." sorts before "model.layers...")
+	if result[0].Name != "model.embed_tokens.weight" {
+		t.Errorf("tensor[0].Name = %v, want model.embed_tokens.weight", result[0].Name)
+	}
+	if result[0].Type != "BF16" {
+		t.Errorf("tensor[0].Type = %v, want BF16", result[0].Type)
+	}
+
+	// Packed tensors should be reported under their header names, not the layer's group prefix (order is not asserted here)
+	packedNames := make(map[string]bool)
+	for _, r := range result[1:] {
+		packedNames[r.Name] = true
+	}
+	if !packedNames["model.layers.0.mlp.experts.0.down_proj.weight"] {
+		t.Error("missing packed tensor: model.layers.0.mlp.experts.0.down_proj.weight")
+	}
+	if !packedNames["model.layers.0.mlp.experts.0.gate_proj.weight"] {
+		t.Error("missing packed tensor: model.layers.0.mlp.experts.0.gate_proj.weight")
+	}
+}
+
 func TestReadSafetensorsHeader(t *testing.T) {
 	// Create a temp file with a valid safetensors header
 	tempDir := t.TempDir()