mirror of
https://github.com/ollama/ollama.git
synced 2026-03-11 17:34:04 -05:00
safetensors quantization for mlx (#14184)
This change updates the safetensors metadata format, updates the create command so that it writes blobs in the new format, adds support for loading the new format, and fixes `ollama show` so that it lists each tensor correctly.
This commit is contained in:
394
x/server/show.go
394
x/server/show.go
@@ -6,6 +6,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/api"
|
||||
@@ -105,9 +106,9 @@ func buildModelInfo(config modelConfig, totalTensorBytes, tensorCount int64) map
|
||||
bytesPerParam = 1
|
||||
}
|
||||
|
||||
// Subtract safetensors header overhead (88 bytes per tensor file)
|
||||
// Each tensor is stored as a minimal safetensors file
|
||||
totalBytes := totalTensorBytes - tensorCount*88
|
||||
// Subtract safetensors header overhead per tensor blob.
|
||||
// Headers include __metadata__ with the tensor name, so overhead is ~150 bytes on average.
|
||||
totalBytes := totalTensorBytes - tensorCount*150
|
||||
|
||||
paramCount := totalBytes / bytesPerParam
|
||||
|
||||
@@ -163,24 +164,103 @@ func GetSafetensorsTensorInfo(name model.Name) ([]api.Tensor, error) {
|
||||
|
||||
// getTensorInfoFromManifest extracts tensor info from a manifest.
|
||||
// This is separated for testability.
|
||||
// For quantized models, groups weight/scale/qbias into single entries with detected quantization type.
|
||||
// For quantized tensors, reads quant_type from blob __metadata__.
|
||||
// For packed blobs (multiple tensors per blob), enumerates all tensors in the blob.
|
||||
func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
|
||||
var tensors []api.Tensor
|
||||
|
||||
// First pass: collect all tensor info and identify scale tensors
|
||||
type tensorData struct {
|
||||
info *safetensorsTensorInfo
|
||||
digest string
|
||||
}
|
||||
tensorMap := make(map[string]*tensorData)
|
||||
scaleMap := make(map[string]*tensorData) // base name -> scale tensor info
|
||||
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType != manifest.MediaTypeImageTensor {
|
||||
continue
|
||||
}
|
||||
|
||||
// Read the safetensors header from the blob
|
||||
// Read all tensor entries from the safetensors header
|
||||
blobPath, err := manifest.BlobsPath(layer.Digest)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
f, err := os.Open(blobPath)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
allInfos, err := parseSafetensorsAllHeaders(f)
|
||||
f.Close()
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Determine if this is a packed blob (multiple main tensors)
|
||||
isPacked := len(allInfos) > 1
|
||||
|
||||
for _, info := range allInfos {
|
||||
tensorName := layer.Name
|
||||
if isPacked {
|
||||
// For packed blobs, use the tensor name from the header
|
||||
tensorName = info.Name
|
||||
}
|
||||
|
||||
if info.QuantType != "" {
|
||||
quantType := strings.ToUpper(info.QuantType)
|
||||
|
||||
shape := make([]uint64, len(info.Shape))
|
||||
for i, s := range info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
}
|
||||
|
||||
var packFactor int64
|
||||
switch strings.ToLower(info.QuantType) {
|
||||
case "int4", "nvfp4":
|
||||
packFactor = 8
|
||||
case "int8", "mxfp8":
|
||||
packFactor = 4
|
||||
}
|
||||
if packFactor > 0 && len(shape) >= 2 {
|
||||
shape[len(shape)-1] = uint64(info.Shape[len(info.Shape)-1] * packFactor)
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: tensorName,
|
||||
Type: quantType,
|
||||
Shape: shape,
|
||||
})
|
||||
} else {
|
||||
shape := make([]uint64, len(info.Shape))
|
||||
for i, s := range info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: tensorName,
|
||||
Type: info.Dtype,
|
||||
Shape: shape,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sort.Slice(tensors, func(i, j int) bool {
|
||||
return tensors[i].Name < tensors[j].Name
|
||||
})
|
||||
|
||||
return tensors, nil
|
||||
}
|
||||
|
||||
// GetSafetensorsDtype returns the quantization type for a safetensors model.
|
||||
// Reads quant_type from the first tensor blob's __metadata__.
|
||||
// Falls back to torch_dtype from config.json if no quant metadata.
|
||||
func GetSafetensorsDtype(name model.Name) (string, error) {
|
||||
mf, err := manifest.ParseNamedManifest(name)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to load manifest: %w", err)
|
||||
}
|
||||
|
||||
// Check first tensor blob for quant_type metadata
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType != manifest.MediaTypeImageTensor {
|
||||
continue
|
||||
}
|
||||
blobPath, err := manifest.BlobsPath(layer.Digest)
|
||||
if err != nil {
|
||||
continue
|
||||
@@ -189,131 +269,11 @@ func getTensorInfoFromManifest(mf *manifest.Manifest) ([]api.Tensor, error) {
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
td := &tensorData{info: info, digest: layer.Digest}
|
||||
|
||||
if strings.HasSuffix(layer.Name, "_scale") {
|
||||
baseName := strings.TrimSuffix(layer.Name, "_scale")
|
||||
scaleMap[baseName] = td
|
||||
} else if strings.HasSuffix(layer.Name, "_qbias") {
|
||||
// Skip qbias tensors - they're included with the quantized weight
|
||||
continue
|
||||
} else {
|
||||
tensorMap[layer.Name] = td
|
||||
if info.QuantType != "" {
|
||||
return strings.ToUpper(info.QuantType), nil
|
||||
}
|
||||
}
|
||||
|
||||
// Second pass: build tensor list with quantization info
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType != manifest.MediaTypeImageTensor {
|
||||
continue
|
||||
}
|
||||
|
||||
// Skip scale and qbias tensors
|
||||
if strings.HasSuffix(layer.Name, "_scale") || strings.HasSuffix(layer.Name, "_qbias") {
|
||||
continue
|
||||
}
|
||||
|
||||
td := tensorMap[layer.Name]
|
||||
if td == nil {
|
||||
continue
|
||||
}
|
||||
|
||||
// Check if this tensor has a corresponding scale tensor (quantized)
|
||||
scaleTd := scaleMap[layer.Name]
|
||||
if scaleTd != nil && len(td.info.Shape) >= 2 && len(scaleTd.info.Shape) >= 2 {
|
||||
// Quantized tensor - detect bits from shapes
|
||||
weightCols := td.info.Shape[len(td.info.Shape)-1]
|
||||
scaleCols := scaleTd.info.Shape[len(scaleTd.info.Shape)-1]
|
||||
|
||||
// Detect quantization: Q4 has pack_factor=8, Q8 has pack_factor=4
|
||||
// Q4 uses group_size=32: weightCols * 8 / scaleCols = 32
|
||||
// Q8 uses group_size=64: weightCols * 4 / scaleCols = 64
|
||||
var bits int
|
||||
var quantType string
|
||||
if weightCols*8/scaleCols == 32 {
|
||||
bits = 4
|
||||
quantType = "Q4"
|
||||
} else if weightCols*4/scaleCols == 64 {
|
||||
bits = 8
|
||||
quantType = "Q8"
|
||||
} else {
|
||||
// Unknown quantization, show raw
|
||||
quantType = td.info.Dtype
|
||||
}
|
||||
|
||||
// Calculate unpacked shape
|
||||
shape := make([]uint64, len(td.info.Shape))
|
||||
for i, s := range td.info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
}
|
||||
if bits > 0 {
|
||||
packFactor := int64(32 / bits)
|
||||
shape[len(shape)-1] = uint64(td.info.Shape[len(td.info.Shape)-1] * packFactor)
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: layer.Name,
|
||||
Type: quantType,
|
||||
Shape: shape,
|
||||
})
|
||||
} else {
|
||||
// Non-quantized tensor
|
||||
shape := make([]uint64, len(td.info.Shape))
|
||||
for i, s := range td.info.Shape {
|
||||
shape[i] = uint64(s)
|
||||
}
|
||||
|
||||
tensors = append(tensors, api.Tensor{
|
||||
Name: layer.Name,
|
||||
Type: td.info.Dtype,
|
||||
Shape: shape,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return tensors, nil
|
||||
}
|
||||
|
||||
// GetSafetensorsDtype returns the quantization type for a safetensors model.
|
||||
// Reads from model_index.json first, falls back to detection from tensor names.
|
||||
// Otherwise returns the torch_dtype from config.json.
|
||||
func GetSafetensorsDtype(name model.Name) (string, error) {
|
||||
mf, err := manifest.ParseNamedManifest(name)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to load manifest: %w", err)
|
||||
}
|
||||
|
||||
// First try to read quantization from model_index.json
|
||||
var modelIndex struct {
|
||||
Quantization string `json:"quantization"`
|
||||
}
|
||||
if err := mf.ReadConfigJSON("model_index.json", &modelIndex); err == nil && modelIndex.Quantization != "" {
|
||||
return modelIndex.Quantization, nil
|
||||
}
|
||||
|
||||
// Fallback: detect from tensor names
|
||||
hasScales := false
|
||||
hasQBias := false
|
||||
for _, layer := range mf.Layers {
|
||||
if layer.MediaType == manifest.MediaTypeImageTensor {
|
||||
if strings.HasSuffix(layer.Name, "_scale") {
|
||||
hasScales = true
|
||||
}
|
||||
if strings.HasSuffix(layer.Name, "_qbias") {
|
||||
hasQBias = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if hasScales {
|
||||
if hasQBias {
|
||||
// Affine mode (has scale + qbias) - could be Q4 or Q8
|
||||
// Default to Q4 as it's more common
|
||||
return "Q4", nil
|
||||
}
|
||||
// No qbias = NVFP4
|
||||
return "NVFP4", nil
|
||||
// Only check the first tensor blob
|
||||
break
|
||||
}
|
||||
|
||||
// Not quantized - return torch_dtype from config.json
|
||||
@@ -329,8 +289,11 @@ func GetSafetensorsDtype(name model.Name) (string, error) {
|
||||
|
||||
// safetensorsTensorInfo holds metadata about a tensor from a safetensors header.
//
// Only Dtype and Shape are decoded from the tensor's JSON header entry. The
// remaining fields are filled in by the header parsers and are excluded from
// JSON decoding so that a stray key in an entry cannot populate them.
type safetensorsTensorInfo struct {
	// Name is the tensor name taken from the header key.
	Name string `json:"-"`
	// Dtype is the stored element type as written in the header (e.g. "BF16", "U32").
	Dtype string `json:"dtype"`
	// Shape is the stored shape as written in the header.
	Shape []int64 `json:"shape"`
	// QuantType comes from __metadata__.quant_type (e.g., "int4", "int8", "nvfp4", "mxfp8")
	// or is inferred by the parser; empty when the tensor is not quantized.
	QuantType string `json:"-"`
	// GroupSize comes from __metadata__.group_size (e.g., "32", "64").
	GroupSize string `json:"-"`
}
|
||||
|
||||
// readSafetensorsHeader reads the JSON header from a safetensors file to get tensor metadata.
|
||||
@@ -347,6 +310,7 @@ func readSafetensorsHeader(path string) (*safetensorsTensorInfo, error) {
|
||||
|
||||
// parseSafetensorsHeader parses a safetensors header from a reader.
|
||||
// This is separated for testability.
|
||||
// Parses __metadata__ for quant_type and group_size if present.
|
||||
func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
|
||||
// Read header size (8 bytes, little endian)
|
||||
var headerSize uint64
|
||||
@@ -371,7 +335,31 @@ func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
|
||||
return nil, fmt.Errorf("failed to parse header: %w", err)
|
||||
}
|
||||
|
||||
// Find the first (and should be only) tensor entry
|
||||
// Parse metadata if present
|
||||
var quantType, groupSize string
|
||||
if metaRaw, ok := header["__metadata__"]; ok {
|
||||
var meta map[string]string
|
||||
if json.Unmarshal(metaRaw, &meta) == nil {
|
||||
quantType = meta["quant_type"]
|
||||
groupSize = meta["group_size"]
|
||||
}
|
||||
}
|
||||
|
||||
// Find the main tensor entry (not __metadata__, .scale, or .bias)
|
||||
for name, raw := range header {
|
||||
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
|
||||
continue
|
||||
}
|
||||
var info safetensorsTensorInfo
|
||||
if err := json.Unmarshal(raw, &info); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
|
||||
}
|
||||
info.QuantType = quantType
|
||||
info.GroupSize = groupSize
|
||||
return &info, nil
|
||||
}
|
||||
|
||||
// Fall back to first non-metadata tensor entry
|
||||
for name, raw := range header {
|
||||
if name == "__metadata__" {
|
||||
continue
|
||||
@@ -380,8 +368,134 @@ func parseSafetensorsHeader(r io.Reader) (*safetensorsTensorInfo, error) {
|
||||
if err := json.Unmarshal(raw, &info); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse tensor info: %w", err)
|
||||
}
|
||||
info.QuantType = quantType
|
||||
info.GroupSize = groupSize
|
||||
return &info, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("no tensor found in header")
|
||||
}
|
||||
|
||||
// parseSafetensorsAllHeaders parses all tensor entries from a safetensors header.
|
||||
// Returns one safetensorsTensorInfo per main tensor (skipping __metadata__, .scale, .bias).
|
||||
// For packed blobs this returns multiple entries; for single-tensor blobs, one entry.
|
||||
// Each tensor's quant type is inferred from its shape and the presence of .scale/.bias entries
|
||||
// when no global __metadata__ quant_type is present.
|
||||
func parseSafetensorsAllHeaders(r io.Reader) ([]safetensorsTensorInfo, error) {
|
||||
var headerSize uint64
|
||||
if err := binary.Read(r, binary.LittleEndian, &headerSize); err != nil {
|
||||
return nil, fmt.Errorf("failed to read header size: %w", err)
|
||||
}
|
||||
|
||||
if headerSize > 100*1024*1024 { // 100MB limit for packed blob headers
|
||||
return nil, fmt.Errorf("header size too large: %d", headerSize)
|
||||
}
|
||||
|
||||
headerBytes := make([]byte, headerSize)
|
||||
if _, err := io.ReadFull(r, headerBytes); err != nil {
|
||||
return nil, fmt.Errorf("failed to read header: %w", err)
|
||||
}
|
||||
|
||||
var header map[string]json.RawMessage
|
||||
if err := json.Unmarshal(headerBytes, &header); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse header: %w", err)
|
||||
}
|
||||
|
||||
// Parse global metadata if present
|
||||
var globalQuantType, globalGroupSize string
|
||||
if metaRaw, ok := header["__metadata__"]; ok {
|
||||
var meta map[string]string
|
||||
if json.Unmarshal(metaRaw, &meta) == nil {
|
||||
globalQuantType = meta["quant_type"]
|
||||
globalGroupSize = meta["group_size"]
|
||||
}
|
||||
}
|
||||
|
||||
// Build a set of all keys for checking .scale/.bias presence
|
||||
headerKeys := make(map[string]bool, len(header))
|
||||
for k := range header {
|
||||
headerKeys[k] = true
|
||||
}
|
||||
|
||||
// Collect all main tensor entries (sorted for deterministic output)
|
||||
var mainNames []string
|
||||
for name := range header {
|
||||
if name == "__metadata__" || strings.HasSuffix(name, ".scale") || strings.HasSuffix(name, ".bias") {
|
||||
continue
|
||||
}
|
||||
mainNames = append(mainNames, name)
|
||||
}
|
||||
sort.Strings(mainNames)
|
||||
|
||||
var results []safetensorsTensorInfo
|
||||
for _, name := range mainNames {
|
||||
var info safetensorsTensorInfo
|
||||
if err := json.Unmarshal(header[name], &info); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse tensor info for %s: %w", name, err)
|
||||
}
|
||||
info.Name = name
|
||||
|
||||
if globalQuantType != "" {
|
||||
// Use global metadata
|
||||
info.QuantType = globalQuantType
|
||||
info.GroupSize = globalGroupSize
|
||||
} else if headerKeys[name+".scale"] {
|
||||
// No global metadata, but has .scale - infer quant type from shape
|
||||
info.QuantType = inferQuantType(header, name)
|
||||
}
|
||||
|
||||
results = append(results, info)
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
return nil, fmt.Errorf("no tensor found in header")
|
||||
}
|
||||
|
||||
return results, nil
|
||||
}
|
||||
|
||||
// inferQuantType infers the quantization type of the tensor called name from
// the last dimension of its stored shape and of its "<name>.scale" companion.
//
// The packed tensor stores original_cols/pack_factor columns and the scale
// tensor stores original_cols/group_size columns, so the ratio of the two is
// group_size/pack_factor:
//
//	int4: pack factor 8, group size 32 -> ratio 4
//	int8: pack factor 4, group size 64 -> ratio 16
//
// It returns "int4" or "int8", or "" when the tensor or its scale entry is
// missing or undecodable, when either shape has fewer than two dimensions, when
// either column count is not positive, when the main column count is not an
// exact multiple of the scale column count, or when the ratio matches no known
// layout.
func inferQuantType(header map[string]json.RawMessage, name string) string {
	type shapeOnly struct {
		Shape []int64 `json:"shape"`
	}

	// Stored (packed) shape of the main tensor.
	var mainInfo shapeOnly
	if json.Unmarshal(header[name], &mainInfo) != nil || len(mainInfo.Shape) < 2 {
		return ""
	}

	// Shape of the companion scale tensor.
	scaleRaw, ok := header[name+".scale"]
	if !ok {
		return ""
	}
	var scaleInfo shapeOnly
	if json.Unmarshal(scaleRaw, &scaleInfo) != nil || len(scaleInfo.Shape) < 2 {
		return ""
	}

	mainCols := mainInfo.Shape[len(mainInfo.Shape)-1]
	scaleCols := scaleInfo.Shape[len(scaleInfo.Shape)-1]

	// Reject non-positive sizes (this also guards the division) and column
	// counts that do not divide evenly: integer division would otherwise
	// truncate an unrelated ratio such as 9/2 down to 4 and misreport int4.
	if mainCols <= 0 || scaleCols <= 0 || mainCols%scaleCols != 0 {
		return ""
	}

	switch mainCols / scaleCols {
	case 4:
		return "int4"
	case 16:
		return "int8"
	default:
		return ""
	}
}
|
||||
|
||||
@@ -36,7 +36,7 @@ func TestBuildModelInfo(t *testing.T) {
|
||||
VocabSize: 262144,
|
||||
TorchDtype: "bfloat16",
|
||||
},
|
||||
totalTensorBytes: 8_600_000_088, // ~4.3B params * 2 bytes + 88 bytes header
|
||||
totalTensorBytes: 8_600_000_150, // ~4.3B params * 2 bytes + 150 bytes header
|
||||
tensorCount: 1,
|
||||
wantArch: "gemma3",
|
||||
wantContextLen: 131072,
|
||||
@@ -57,7 +57,7 @@ func TestBuildModelInfo(t *testing.T) {
|
||||
VocabSize: 32000,
|
||||
TorchDtype: "float16",
|
||||
},
|
||||
totalTensorBytes: 14_000_000_088, // ~7B params * 2 bytes + 88 bytes header
|
||||
totalTensorBytes: 14_000_000_150, // ~7B params * 2 bytes + 150 bytes header
|
||||
tensorCount: 1,
|
||||
wantArch: "llama",
|
||||
wantContextLen: 4096,
|
||||
@@ -84,7 +84,7 @@ func TestBuildModelInfo(t *testing.T) {
|
||||
VocabSize: 262144,
|
||||
TorchDtype: "bfloat16",
|
||||
},
|
||||
totalTensorBytes: 8_600_000_088,
|
||||
totalTensorBytes: 8_600_000_150,
|
||||
tensorCount: 1,
|
||||
wantArch: "gemma3",
|
||||
wantContextLen: 131072,
|
||||
@@ -101,7 +101,7 @@ func TestBuildModelInfo(t *testing.T) {
|
||||
MaxPositionEmbeddings: 2048,
|
||||
TorchDtype: "float32",
|
||||
},
|
||||
totalTensorBytes: 400_000_088, // 100M params * 4 bytes + 88 bytes header
|
||||
totalTensorBytes: 400_000_150, // 100M params * 4 bytes + 150 bytes header
|
||||
tensorCount: 1,
|
||||
wantArch: "test",
|
||||
wantContextLen: 2048,
|
||||
@@ -118,7 +118,7 @@ func TestBuildModelInfo(t *testing.T) {
|
||||
MaxPositionEmbeddings: 1024,
|
||||
TorchDtype: "bfloat16",
|
||||
},
|
||||
totalTensorBytes: 2_000_880, // 1M params * 2 bytes + 10 tensors * 88 bytes
|
||||
totalTensorBytes: 2_001_500, // 1M params * 2 bytes + 10 tensors * 150 bytes
|
||||
tensorCount: 10,
|
||||
wantArch: "test",
|
||||
wantContextLen: 1024,
|
||||
@@ -230,42 +230,42 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {
|
||||
{
|
||||
name: "bfloat16",
|
||||
dtype: "bfloat16",
|
||||
totalBytes: 2_000_088, // 1M * 2 + 88
|
||||
totalBytes: 2_000_150, // 1M * 2 + 150
|
||||
tensorCount: 1,
|
||||
wantParamCount: 1_000_000,
|
||||
},
|
||||
{
|
||||
name: "float16",
|
||||
dtype: "float16",
|
||||
totalBytes: 2_000_088,
|
||||
totalBytes: 2_000_150,
|
||||
tensorCount: 1,
|
||||
wantParamCount: 1_000_000,
|
||||
},
|
||||
{
|
||||
name: "float32",
|
||||
dtype: "float32",
|
||||
totalBytes: 4_000_088, // 1M * 4 + 88
|
||||
totalBytes: 4_000_150, // 1M * 4 + 150
|
||||
tensorCount: 1,
|
||||
wantParamCount: 1_000_000,
|
||||
},
|
||||
{
|
||||
name: "int8",
|
||||
dtype: "int8",
|
||||
totalBytes: 1_000_088, // 1M * 1 + 88
|
||||
totalBytes: 1_000_150, // 1M * 1 + 150
|
||||
tensorCount: 1,
|
||||
wantParamCount: 1_000_000,
|
||||
},
|
||||
{
|
||||
name: "unknown dtype defaults to 2 bytes",
|
||||
dtype: "unknown",
|
||||
totalBytes: 2_000_088,
|
||||
totalBytes: 2_000_150,
|
||||
tensorCount: 1,
|
||||
wantParamCount: 1_000_000,
|
||||
},
|
||||
{
|
||||
name: "empty dtype defaults to 2 bytes",
|
||||
dtype: "",
|
||||
totalBytes: 2_000_088,
|
||||
totalBytes: 2_000_150,
|
||||
tensorCount: 1,
|
||||
wantParamCount: 1_000_000,
|
||||
},
|
||||
@@ -288,11 +288,13 @@ func TestBuildModelInfo_BytesPerParam(t *testing.T) {
|
||||
|
||||
func TestParseSafetensorsHeader(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
header map[string]any
|
||||
wantDtype string
|
||||
wantShape []int64
|
||||
wantErr bool
|
||||
name string
|
||||
header map[string]any
|
||||
wantDtype string
|
||||
wantShape []int64
|
||||
wantQuantType string
|
||||
wantGroupSize string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "simple tensor",
|
||||
@@ -307,7 +309,70 @@ func TestParseSafetensorsHeader(t *testing.T) {
|
||||
wantShape: []int64{2560, 262144},
|
||||
},
|
||||
{
|
||||
name: "with metadata",
|
||||
name: "tensor keyed by name",
|
||||
header: map[string]any{
|
||||
"model.layers.0.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 2560},
|
||||
"data_offsets": []int64{0, 13107200},
|
||||
},
|
||||
},
|
||||
wantDtype: "BF16",
|
||||
wantShape: []int64{2560, 2560},
|
||||
},
|
||||
{
|
||||
name: "with int4 quant metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"quant_type": "int4",
|
||||
"group_size": "32",
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 320},
|
||||
"data_offsets": []int64{0, 3276800},
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 80},
|
||||
"data_offsets": []int64{3276800, 3686400},
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 80},
|
||||
"data_offsets": []int64{3686400, 4096000},
|
||||
},
|
||||
},
|
||||
wantDtype: "U32",
|
||||
wantShape: []int64{2560, 320},
|
||||
wantQuantType: "int4",
|
||||
wantGroupSize: "32",
|
||||
},
|
||||
{
|
||||
name: "int8 quant metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"quant_type": "int8",
|
||||
"group_size": "64",
|
||||
},
|
||||
"model.layers.0.mlp.down_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 640},
|
||||
"data_offsets": []int64{0, 6553600},
|
||||
},
|
||||
"model.layers.0.mlp.down_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 40},
|
||||
"data_offsets": []int64{6553600, 6963200},
|
||||
},
|
||||
},
|
||||
wantDtype: "U32",
|
||||
wantShape: []int64{2560, 640},
|
||||
wantQuantType: "int8",
|
||||
wantGroupSize: "64",
|
||||
},
|
||||
{
|
||||
name: "with old-style format metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"format": "pt",
|
||||
@@ -371,6 +436,13 @@ func TestParseSafetensorsHeader(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if info.QuantType != tt.wantQuantType {
|
||||
t.Errorf("QuantType = %v, want %v", info.QuantType, tt.wantQuantType)
|
||||
}
|
||||
if info.GroupSize != tt.wantGroupSize {
|
||||
t.Errorf("GroupSize = %v, want %v", info.GroupSize, tt.wantGroupSize)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -460,7 +532,7 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
|
||||
t.Fatalf("failed to create blobs dir: %v", err)
|
||||
}
|
||||
|
||||
// Create test tensor blobs
|
||||
// Create test tensor blobs with __metadata__
|
||||
tensors := []struct {
|
||||
name string
|
||||
digest string
|
||||
@@ -487,10 +559,9 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
|
||||
},
|
||||
}
|
||||
|
||||
// Create blob files
|
||||
// Create blob files with tensor keyed by name
|
||||
var layers []manifest.Layer
|
||||
for _, tensor := range tensors {
|
||||
// Create safetensors blob
|
||||
header := map[string]any{
|
||||
tensor.name: map[string]any{
|
||||
"dtype": tensor.dtype,
|
||||
@@ -561,6 +632,391 @@ func TestGetTensorInfoFromManifest(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetTensorInfoFromManifest_Quantized(t *testing.T) {
|
||||
// Create a temp directory for blobs and set OLLAMA_MODELS
|
||||
tempDir := t.TempDir()
|
||||
t.Setenv("OLLAMA_MODELS", tempDir)
|
||||
|
||||
blobDir := filepath.Join(tempDir, "blobs")
|
||||
if err := os.MkdirAll(blobDir, 0o755); err != nil {
|
||||
t.Fatalf("failed to create blobs dir: %v", err)
|
||||
}
|
||||
|
||||
// Create a combined quantized blob with __metadata__
|
||||
header := map[string]any{
|
||||
"__metadata__": map[string]string{
|
||||
"quant_type": "int4",
|
||||
"group_size": "32",
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 320}, // packed: 2560 / 8 = 320
|
||||
"data_offsets": []int64{0, 3276800},
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 80}, // 2560 / 32 = 80
|
||||
"data_offsets": []int64{3276800, 3686400},
|
||||
},
|
||||
"model.layers.0.mlp.up_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 80},
|
||||
"data_offsets": []int64{3686400, 4096000},
|
||||
},
|
||||
}
|
||||
headerJSON, _ := json.Marshal(header)
|
||||
|
||||
var buf bytes.Buffer
|
||||
binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
|
||||
buf.Write(headerJSON)
|
||||
|
||||
digest := "sha256:aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb11aabb"
|
||||
blobPath, err := manifest.BlobsPath(digest)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get blob path: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
|
||||
t.Fatalf("failed to write blob: %v", err)
|
||||
}
|
||||
|
||||
mf := &manifest.Manifest{
|
||||
SchemaVersion: 2,
|
||||
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
|
||||
Layers: []manifest.Layer{
|
||||
{
|
||||
MediaType: manifest.MediaTypeImageTensor,
|
||||
Digest: digest,
|
||||
Size: int64(buf.Len() + 4096000),
|
||||
Name: "model.layers.0.mlp.up_proj.weight",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := getTensorInfoFromManifest(mf)
|
||||
if err != nil {
|
||||
t.Fatalf("getTensorInfoFromManifest() error = %v", err)
|
||||
}
|
||||
|
||||
if len(result) != 1 {
|
||||
t.Fatalf("got %d tensors, want 1", len(result))
|
||||
}
|
||||
|
||||
tensor := result[0]
|
||||
if tensor.Name != "model.layers.0.mlp.up_proj.weight" {
|
||||
t.Errorf("Name = %v, want model.layers.0.mlp.up_proj.weight", tensor.Name)
|
||||
}
|
||||
if tensor.Type != "INT4" {
|
||||
t.Errorf("Type = %v, want INT4", tensor.Type)
|
||||
}
|
||||
// Shape should be unpacked: 320 * 8 = 2560
|
||||
if len(tensor.Shape) != 2 || tensor.Shape[0] != 2560 || tensor.Shape[1] != 2560 {
|
||||
t.Errorf("Shape = %v, want [2560, 2560]", tensor.Shape)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseSafetensorsAllHeaders(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
header map[string]any
|
||||
wantCount int
|
||||
wantNames []string
|
||||
wantDtypes []string
|
||||
wantQuants []string
|
||||
wantErr bool
|
||||
}{
|
||||
{
|
||||
name: "single tensor blob",
|
||||
header: map[string]any{
|
||||
"model.layers.0.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 2560},
|
||||
"data_offsets": []int64{0, 13107200},
|
||||
},
|
||||
},
|
||||
wantCount: 1,
|
||||
wantNames: []string{"model.layers.0.weight"},
|
||||
wantDtypes: []string{"BF16"},
|
||||
wantQuants: []string{""},
|
||||
},
|
||||
{
|
||||
name: "packed unquantized blob",
|
||||
header: map[string]any{
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 10240},
|
||||
"data_offsets": []int64{0, 52428800},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 2560},
|
||||
"data_offsets": []int64{52428800, 104857600},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.up_proj.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 2560},
|
||||
"data_offsets": []int64{104857600, 157286400},
|
||||
},
|
||||
},
|
||||
wantCount: 3,
|
||||
wantNames: []string{
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight",
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight",
|
||||
"model.layers.0.mlp.experts.0.up_proj.weight",
|
||||
},
|
||||
wantDtypes: []string{"BF16", "BF16", "BF16"},
|
||||
wantQuants: []string{"", "", ""},
|
||||
},
|
||||
{
|
||||
name: "packed quantized blob with global metadata",
|
||||
header: map[string]any{
|
||||
"__metadata__": map[string]any{
|
||||
"quant_type": "int4",
|
||||
"group_size": "32",
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{10240, 320},
|
||||
"data_offsets": []int64{0, 13107200},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{13107200, 14745600},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{14745600, 16384000},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.up_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{10240, 320},
|
||||
"data_offsets": []int64{16384000, 29491200},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.up_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{29491200, 31129600},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.up_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{31129600, 32768000},
|
||||
},
|
||||
},
|
||||
wantCount: 2,
|
||||
wantNames: []string{
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight",
|
||||
"model.layers.0.mlp.experts.0.up_proj.weight",
|
||||
},
|
||||
wantDtypes: []string{"U32", "U32"},
|
||||
wantQuants: []string{"int4", "int4"},
|
||||
},
|
||||
{
|
||||
name: "packed mixed-precision blob (no global metadata)",
|
||||
header: map[string]any{
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{10240, 320},
|
||||
"data_offsets": []int64{0, 13107200},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{13107200, 14745600},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{14745600, 16384000},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 2560},
|
||||
"data_offsets": []int64{16384000, 42598400},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 160},
|
||||
"data_offsets": []int64{42598400, 43417600},
|
||||
},
|
||||
},
|
||||
wantCount: 2,
|
||||
wantNames: []string{
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight",
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight",
|
||||
},
|
||||
wantDtypes: []string{"U32", "U32"},
|
||||
wantQuants: []string{"int8", "int4"},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
headerJSON, err := json.Marshal(tt.header)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to marshal header: %v", err)
|
||||
}
|
||||
|
||||
var buf bytes.Buffer
|
||||
if err := binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON))); err != nil {
|
||||
t.Fatalf("failed to write header size: %v", err)
|
||||
}
|
||||
buf.Write(headerJSON)
|
||||
|
||||
results, err := parseSafetensorsAllHeaders(&buf)
|
||||
if (err != nil) != tt.wantErr {
|
||||
t.Errorf("parseSafetensorsAllHeaders() error = %v, wantErr %v", err, tt.wantErr)
|
||||
return
|
||||
}
|
||||
if tt.wantErr {
|
||||
return
|
||||
}
|
||||
|
||||
if len(results) != tt.wantCount {
|
||||
t.Fatalf("got %d tensors, want %d", len(results), tt.wantCount)
|
||||
}
|
||||
|
||||
for i, info := range results {
|
||||
if info.Name != tt.wantNames[i] {
|
||||
t.Errorf("tensor[%d].Name = %v, want %v", i, info.Name, tt.wantNames[i])
|
||||
}
|
||||
if info.Dtype != tt.wantDtypes[i] {
|
||||
t.Errorf("tensor[%d].Dtype = %v, want %v", i, info.Dtype, tt.wantDtypes[i])
|
||||
}
|
||||
if info.QuantType != tt.wantQuants[i] {
|
||||
t.Errorf("tensor[%d].QuantType = %v, want %v", i, info.QuantType, tt.wantQuants[i])
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetTensorInfoFromManifest_Packed(t *testing.T) {
|
||||
// Create a temp directory for blobs and set OLLAMA_MODELS
|
||||
tempDir := t.TempDir()
|
||||
t.Setenv("OLLAMA_MODELS", tempDir)
|
||||
|
||||
blobDir := filepath.Join(tempDir, "blobs")
|
||||
if err := os.MkdirAll(blobDir, 0o755); err != nil {
|
||||
t.Fatalf("failed to create blobs dir: %v", err)
|
||||
}
|
||||
|
||||
// Create a packed blob with multiple expert tensors (mixed quantization)
|
||||
header := map[string]any{
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{10240, 320},
|
||||
"data_offsets": []int64{0, 13107200},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{13107200, 14745600},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.gate_proj.weight.bias": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{10240, 80},
|
||||
"data_offsets": []int64{14745600, 16384000},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight": map[string]any{
|
||||
"dtype": "U32",
|
||||
"shape": []int64{2560, 2560},
|
||||
"data_offsets": []int64{16384000, 42598400},
|
||||
},
|
||||
"model.layers.0.mlp.experts.0.down_proj.weight.scale": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{2560, 160},
|
||||
"data_offsets": []int64{42598400, 43417600},
|
||||
},
|
||||
}
|
||||
headerJSON, _ := json.Marshal(header)
|
||||
|
||||
var buf bytes.Buffer
|
||||
binary.Write(&buf, binary.LittleEndian, uint64(len(headerJSON)))
|
||||
buf.Write(headerJSON)
|
||||
|
||||
packedDigest := "sha256:aaaa000000000000000000000000000000000000000000000000000000000001"
|
||||
blobPath, err := manifest.BlobsPath(packedDigest)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get blob path: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(blobPath, buf.Bytes(), 0o644); err != nil {
|
||||
t.Fatalf("failed to write packed blob: %v", err)
|
||||
}
|
||||
|
||||
// Also create a regular (single-tensor) blob
|
||||
singleHeader := map[string]any{
|
||||
"model.embed_tokens.weight": map[string]any{
|
||||
"dtype": "BF16",
|
||||
"shape": []int64{262144, 2560},
|
||||
"data_offsets": []int64{0, 1342177280},
|
||||
},
|
||||
}
|
||||
singleHeaderJSON, _ := json.Marshal(singleHeader)
|
||||
var singleBuf bytes.Buffer
|
||||
binary.Write(&singleBuf, binary.LittleEndian, uint64(len(singleHeaderJSON)))
|
||||
singleBuf.Write(singleHeaderJSON)
|
||||
|
||||
singleDigest := "sha256:bbbb000000000000000000000000000000000000000000000000000000000002"
|
||||
singleBlobPath, err := manifest.BlobsPath(singleDigest)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to get blob path: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(singleBlobPath, singleBuf.Bytes(), 0o644); err != nil {
|
||||
t.Fatalf("failed to write single blob: %v", err)
|
||||
}
|
||||
|
||||
mf := &manifest.Manifest{
|
||||
SchemaVersion: 2,
|
||||
MediaType: "application/vnd.docker.distribution.manifest.v2+json",
|
||||
Layers: []manifest.Layer{
|
||||
{
|
||||
MediaType: manifest.MediaTypeImageTensor,
|
||||
Digest: singleDigest,
|
||||
Size: int64(singleBuf.Len()),
|
||||
Name: "model.embed_tokens.weight",
|
||||
},
|
||||
{
|
||||
MediaType: manifest.MediaTypeImageTensor,
|
||||
Digest: packedDigest,
|
||||
Size: int64(buf.Len()),
|
||||
Name: "model.layers.0.mlp.experts", // group prefix
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
result, err := getTensorInfoFromManifest(mf)
|
||||
if err != nil {
|
||||
t.Fatalf("getTensorInfoFromManifest() error = %v", err)
|
||||
}
|
||||
|
||||
// Should have 3 tensors: 1 single + 2 packed main tensors
|
||||
if len(result) != 3 {
|
||||
t.Fatalf("got %d tensors, want 3. Tensors: %v", len(result), result)
|
||||
}
|
||||
|
||||
// First tensor should be the single blob
|
||||
if result[0].Name != "model.embed_tokens.weight" {
|
||||
t.Errorf("tensor[0].Name = %v, want model.embed_tokens.weight", result[0].Name)
|
||||
}
|
||||
if result[0].Type != "BF16" {
|
||||
t.Errorf("tensor[0].Type = %v, want BF16", result[0].Type)
|
||||
}
|
||||
|
||||
// Packed tensors should have their actual names (sorted)
|
||||
packedNames := make(map[string]bool)
|
||||
for _, r := range result[1:] {
|
||||
packedNames[r.Name] = true
|
||||
}
|
||||
if !packedNames["model.layers.0.mlp.experts.0.down_proj.weight"] {
|
||||
t.Error("missing packed tensor: model.layers.0.mlp.experts.0.down_proj.weight")
|
||||
}
|
||||
if !packedNames["model.layers.0.mlp.experts.0.gate_proj.weight"] {
|
||||
t.Error("missing packed tensor: model.layers.0.mlp.experts.0.gate_proj.weight")
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadSafetensorsHeader(t *testing.T) {
|
||||
// Create a temp file with a valid safetensors header
|
||||
tempDir := t.TempDir()
|
||||
|
||||
Reference in New Issue
Block a user