Add GLM-OCR vision model support (#14024)

2026-04-30 07:57:51 -05:00 · 2026-02-02 15:39:18 -08:00
parent d8cc798c2b
commit 8f4a008139
15 changed files with 1553 additions and 0 deletions
--- a/model/models/glmocr/imageprocessor.go
+++ b/model/models/glmocr/imageprocessor.go
@@ -0,0 +1,174 @@
+package glmocr
+
+import (
+	"image"
+	"log/slog"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/model/imageproc"
+)
+
+// ImageProcessor holds the preprocessing parameters used to resize, normalize
+// and patchify an image before it is handed to the vision encoder.
+type ImageProcessor struct {
+	// imageSize is read from "vision.image_size"; it is not consulted by the
+	// resize or patch code in this file.
+	imageSize         int
+	// patchSize is the side length, in pixels, of one square patch.
+	patchSize         int
+	// temporalPatchSize is the number of frame slots per patch; a still image
+	// is replicated into every slot by createPatches.
+	temporalPatchSize int
+	// spatialMergeSize is the side length, in patches, of one merge block.
+	spatialMergeSize  int
+	// minPixels and maxPixels bound temporal*height*width in SmartResize.
+	minPixels         int
+	maxPixels         int
+	// factor is patchSize * spatialMergeSize; SmartResize returns multiples of it.
+	factor            int
+	// imageMean and imageStd are the per-channel normalization constants
+	// passed to imageproc.Normalize.
+	imageMean         [3]float32
+	imageStd          [3]float32
+}
+
+// newImageProcessor builds an ImageProcessor from the "vision.*" keys of c,
+// using built-in defaults for any key that is absent.
+//
+// The normalization mean and std fall back to the CLIP defaults both when the
+// key is missing and when the configured list has fewer than three entries,
+// so a malformed config cannot trigger an out-of-range index here.
+func newImageProcessor(c fs.Config) ImageProcessor {
+	patchSize := int(c.Uint("vision.patch_size", 14))
+	spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
+	temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
+
+	// Read normalization values from config if available, otherwise use CLIP defaults.
+	imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
+	if len(imageMean) < 3 {
+		slog.Warn("vision.image_mean needs 3 values, using CLIP defaults", "got", len(imageMean))
+		imageMean = imageproc.ClipDefaultMean[:]
+	}
+	imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
+	if len(imageStd) < 3 {
+		slog.Warn("vision.image_std needs 3 values, using CLIP defaults", "got", len(imageStd))
+		imageStd = imageproc.ClipDefaultSTD[:]
+	}
+
+	// Default max_pixels: 2048 * patchSize^2 * mergeSize^2 * temporal (~3.2M with
+	// the default sizes). SmartResize compares this budget against
+	// temporal*height*width, which keeps memory stable without flash attention.
+	defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
+
+	return ImageProcessor{
+		imageSize:         int(c.Uint("vision.image_size", 336)),
+		patchSize:         patchSize,
+		temporalPatchSize: temporalPatchSize,
+		spatialMergeSize:  spatialMergeSize,
+		minPixels:         int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
+		maxPixels:         int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
+		factor:            patchSize * spatialMergeSize,
+		imageMean:         [3]float32{imageMean[0], imageMean[1], imageMean[2]},
+		imageStd:          [3]float32{imageStd[0], imageStd[1], imageStd[2]},
+	}
+}
+
+// SmartResize returns the (height, width) an image should be resized to so
+// that both sides are multiples of p.factor and the pixel budget
+// temporal*height*width lies between p.minPixels and p.maxPixels.
+//
+// Images with a side shorter than p.factor are first scaled up so the shorter
+// side reaches p.factor. When the budget forces a shrink, each side is clamped
+// to at least p.factor: for extreme aspect ratios the floor would otherwise
+// produce a zero-sized dimension and therefore an empty patch grid.
+func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
+	factor := p.factor
+	temporalFactor := p.temporalPatchSize
+	numFrames := temporalFactor // single image
+
+	if height < factor || width < factor {
+		// Scale up small images
+		scale := float64(factor) / float64(min(height, width))
+		height = int(math.Ceil(float64(height) * scale))
+		width = int(math.Ceil(float64(width) * scale))
+	}
+
+	if temporalFactor <= 0 {
+		slog.Warn("temporal_patch_size must be > 0, defaulting to 1")
+		temporalFactor = 1
+	}
+	if numFrames < temporalFactor {
+		slog.Warn("num_frames must be >= temporal_patch_size, adjusting num_frames", "num_frames", numFrames, "temporal_patch_size", temporalFactor)
+		numFrames = temporalFactor
+	}
+	if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
+		slog.Warn("aspect ratio exceeds 200, image quality may be affected", "aspect_ratio", aspectRatio)
+	}
+
+	round := func(x float64) int { return int(math.RoundToEven(x)) }
+
+	hBar := round(float64(height)/float64(factor)) * factor
+	wBar := round(float64(width)/float64(factor)) * factor
+	tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
+
+	if tBar*hBar*wBar > p.maxPixels {
+		// Shrink both sides by the same ratio, rounding down to a multiple of
+		// factor but never below one factor per side.
+		beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
+		hBar = max(factor, int(math.Floor(float64(height)/beta/float64(factor)))*factor)
+		wBar = max(factor, int(math.Floor(float64(width)/beta/float64(factor)))*factor)
+	} else if tBar*hBar*wBar < p.minPixels {
+		// Grow both sides by the same ratio, rounding up to a multiple of factor.
+		beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
+		hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
+		wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
+	}
+
+	return hBar, wBar
+}
+
+// ProcessImage composites, resizes and normalizes img, then rearranges the
+// pixels into flattened patches. It returns the patch data together with the
+// Grid describing how many patches the resized image contains.
+func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
+	img = imageproc.Composite(img)
+
+	// Target size is derived from the composited image's bounds.
+	targetH, targetW := p.SmartResize(img.Bounds().Dy(), img.Bounds().Dx())
+	target := image.Point{X: targetW, Y: targetH}
+
+	// Normalize with rescale and channel-first output ([C, H, W]), which is the
+	// layout createPatches indexes into.
+	pixels := imageproc.Normalize(
+		imageproc.Resize(img, target, imageproc.ResizeCatmullrom),
+		p.imageMean, p.imageStd, true, true,
+	)
+
+	// Patch counts per side; a still image has a single temporal step.
+	grid := &Grid{
+		Height:      targetH / p.patchSize,
+		Width:       targetW / p.patchSize,
+		Temporal:    1,
+		ImageHeight: targetH,
+		ImageWidth:  targetW,
+	}
+
+	patches, err := p.createPatches(pixels, targetH, targetW, grid)
+	if err != nil {
+		return nil, nil, err
+	}
+	return patches, grid, nil
+}
+
+func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
+	channels := 3
+	patchSize := p.patchSize
+	mergeSize := p.spatialMergeSize
+	temporalPatchSize := p.temporalPatchSize
+
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	patchDim := channels * temporalPatchSize * patchSize * patchSize
+	result := make([]float32, numPatches*patchDim)
+	patchIndex := 0
+
+	// Single temporal frame handling (copies to all frames)
+	for range grid.Temporal {
+		for h := 0; h < grid.Height; h += mergeSize {
+			for w := 0; w < grid.Width; w += mergeSize {
+				for mh := range mergeSize {
+					for mw := range mergeSize {
+						baseOffset := patchIndex * patchDim
+						for c := range channels {
+							channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
+							for py := range patchSize {
+								for px := range patchSize {
+									y := (h+mh)*patchSize + py
+									x := (w+mw)*patchSize + px
+									srcIdx := c*height*width + y*width + x
+									dstIdx := channelOffset + (py * patchSize) + px
+									result[dstIdx] = pixels[srcIdx]
+								}
+							}
+
+							if temporalPatchSize > 1 {
+								frameSize := patchSize * patchSize
+								for tp := 1; tp < temporalPatchSize; tp++ {
+									currentFrameOffset := channelOffset + (tp * frameSize)
+									copy(result[currentFrameOffset:currentFrameOffset+frameSize],
+										result[channelOffset:channelOffset+frameSize])
+								}
+							}
+						}
+
+						patchIndex++
+					}
+				}
+			}
+		}
+	}
+
+	return result, nil
+}
--- a/model/models/glmocr/model.go
+++ b/model/models/glmocr/model.go
@@ -0,0 +1,235 @@
+package glmocr
+
+import (
+	"bytes"
+	"errors"
+	"image"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+// Model is the GLM-OCR multimodal model. It combines a text decoder, a vision
+// encoder, a convolutional downsampler and a patch merger, plus the image
+// preprocessing settings and the special token ids used to splice image
+// embeddings into the token stream.
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	// Text decoder and vision encoder; vision tensors are loaded under "v".
+	*TextModel
+	*VisionModel     `gguf:"v"`
+	// Projector stages applied, in this order, to the vision encoder output
+	// in EncodeMultimodal.
+	VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
+	PatchMerger      *PatchMerger      `gguf:"mm"`
+
+	ImageProcessor
+
+	// Token ids emitted by PostTokenize around and for image embeddings.
+	imageTokenID      int32
+	imageStartTokenID int32
+	imageEndTokenID   int32
+}
+
+// Compile-time check that *Model implements model.MultimodalProcessor.
+var _ model.MultimodalProcessor = (*Model)(nil)
+
+// New constructs a GLM-OCR Model from configuration c: it sets up the BPE
+// tokenizer, the text and vision sub-models, the image processor, the image
+// marker token ids and a causal KV cache. It always returns a nil error.
+func New(c fs.Config) (model.Model, error) {
+	// The single EOS id comes first, followed by any additional EOS ids.
+	eos := []int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))}
+	eos = append(eos, c.Ints("tokenizer.ggml.eos_token_ids")...)
+
+	vocab := &model.Vocabulary{
+		Values: c.Strings("tokenizer.ggml.tokens"),
+		Types:  c.Ints("tokenizer.ggml.token_type"),
+		Merges: c.Strings("tokenizer.ggml.merges"),
+		AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
+		BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+		AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+		EOS:    eos,
+	}
+
+	// Pre-tokenization pattern handed to the BPE tokenizer.
+	pretokenizer := `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`
+
+	m := &Model{
+		BytePairEncoding:  model.NewBytePairEncoding(vocab, pretokenizer),
+		TextModel:         newTextModel(c),
+		VisionModel:       newVisionModel(c),
+		ImageProcessor:    newImageProcessor(c),
+		imageTokenID:      int32(c.Uint("image_token_id", 59280)),
+		imageStartTokenID: int32(c.Uint("image_start_token_id", 59256)),
+		imageEndTokenID:   int32(c.Uint("image_end_token_id", 59257)),
+	}
+	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
+
+	return m, nil
+}
+
+// EncodeMultimodal decodes multimodalData as an image and runs it through the
+// image processor, the vision encoder, the convolutional downsampler and the
+// patch merger. It returns one input.Multimodal whose Tensor holds the image
+// embeddings and whose Data is the *Grid describing the patch layout.
+//
+// It returns model.ErrNoVisionModel when no vision blocks are present, and an
+// error when the projector weights are missing, the image cannot be decoded,
+// or preprocessing yields an empty patch grid.
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
+	if m.VisionModel == nil || len(m.VisionModel.Blocks) == 0 {
+		return nil, model.ErrNoVisionModel
+	}
+
+	// Validate the projector weights before doing any work. The embedded
+	// *nn.Conv2D must be checked too: reading Weight through a nil embedded
+	// pointer would panic instead of reporting the missing weights.
+	if m.VisionDownsample == nil || m.VisionDownsample.Conv2D == nil || m.VisionDownsample.Weight == nil {
+		return nil, errors.New("glmocr: missing vision downsample weights")
+	}
+	if m.PatchMerger == nil {
+		return nil, errors.New("glmocr: missing patch merger weights")
+	}
+
+	img, _, err := image.Decode(bytes.NewReader(multimodalData))
+	if err != nil {
+		return nil, err
+	}
+
+	f32s, grid, err := m.ImageProcessor.ProcessImage(img)
+	if err != nil {
+		return nil, err
+	}
+	if grid.Height <= 0 || grid.Width <= 0 {
+		return nil, errors.New("glmocr: image produced an empty patch grid")
+	}
+
+	// Create pixel values tensor from flattened patches
+	// Shape: [patchDim, numPatches]
+	patchDim := m.VisionModel.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
+	numPatches := grid.Temporal * grid.Height * grid.Width
+	pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
+
+	// Vision encoder -> downsample -> patch merger.
+	visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
+	visionOutputs = m.VisionDownsample.Forward(ctx, visionOutputs, grid, m.VisionModel.VisionModelOptions)
+	visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.VisionModelOptions)
+
+	return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
+}
+
+// PostTokenize expands every image input into
+// <image start> + one image token per embedding column + <image end>
+// and rebuilds the text model's position cache for the whole prompt.
+//
+// Text tokens advance the position by one. All tokens of one image share the
+// same base position; after the image the position jumps by
+// max(mergedH, mergedW). Finally ropeDelta is set so that positions past the
+// cached prefix continue from the last cached position.
+//
+// It returns an error if a multimodal input does not carry a *Grid.
+func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
+	var result []*input.Input
+
+	// Reset position cache
+	m.TextModel.positionCache = m.TextModel.positionCache[:0]
+	m.TextModel.ropeDelta = 0
+
+	pos := int32(0)
+	for _, inp := range inputs {
+		// A nil or empty Multimodal slice means a plain text token.
+		if len(inp.Multimodal) == 0 {
+			result = append(result, inp)
+			m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
+			pos++
+			continue
+		}
+
+		// Get grid info for position calculation. Use a checked assertion so a
+		// malformed input is reported as an error instead of a panic.
+		grid, ok := inp.Multimodal[0].Data.(*Grid)
+		if !ok || grid == nil {
+			return nil, errors.New("glmocr: multimodal input is missing image grid data")
+		}
+		mergedH := grid.Height / m.VisionModel.spatialMergeSize
+		mergedW := grid.Width / m.VisionModel.spatialMergeSize
+
+		// Add image start token
+		result = append(result, &input.Input{Token: m.imageStartTokenID})
+		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
+		pos++
+
+		// Add image tokens with multimodal data
+		// All image tokens share the same base position for temporal dimension
+		tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
+		basePos := pos
+		result = append(result, &input.Input{
+			Token:          m.imageTokenID,
+			Multimodal:     inp.Multimodal,
+			MultimodalHash: inp.MultimodalHash,
+			SameBatch:      max(tokensPerGrid-1, 0),
+		})
+		m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
+
+		// Add placeholder tokens for remaining positions
+		// All image tokens use the same base position (temporal stays constant)
+		for range tokensPerGrid - 1 {
+			result = append(result, &input.Input{Token: m.imageTokenID})
+			m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
+		}
+
+		// Advance position by max(mergedH, mergedW) after image tokens
+		pos = basePos + int32(max(mergedH, mergedW))
+
+		// Add image end token
+		result = append(result, &input.Input{Token: m.imageEndTokenID})
+		m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
+		pos++
+	}
+
+	// Compute rope delta for continuation after the prefill segment:
+	// delta = (max_position_id + 1) - sequence_length
+	if len(m.TextModel.positionCache) > 0 {
+		last := m.TextModel.positionCache[len(m.TextModel.positionCache)-1]
+		m.TextModel.ropeDelta = last + 1 - int32(len(m.TextModel.positionCache))
+	}
+
+	return result, nil
+}
+
+// Forward runs the text decoder over batch. Token embeddings are looked up,
+// image embeddings are copied over their placeholder tokens, M-RoPE positions
+// are assembled as four planes (temporal, height, width, unused), and the
+// decoder layers, output norm and output projection are applied.
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	text := m.TextModel
+
+	// Initial token embedding
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
+	ctx.Forward(hiddenStates)
+
+	// Four position planes; the last one stays zero.
+	count := len(batch.Positions)
+	planes := make([][]int32, 4)
+	for i := range planes {
+		planes[i] = make([]int32, count)
+	}
+
+	cached := int32(len(text.positionCache))
+	for i, pos := range batch.Positions {
+		switch {
+		case pos < cached:
+			// Inside the prefill segment: translate through the position cache.
+			pos = text.positionCache[pos]
+		case cached > 0:
+			// Past the cached prefix: continue the sequence using ropeDelta.
+			pos += text.ropeDelta
+		}
+		planes[0][i], planes[1][i], planes[2][i] = pos, pos, pos
+	}
+
+	// Inject vision embeddings and offset the height/width planes per image token.
+	for _, mi := range batch.Multimodal {
+		embeds := mi.Multimodal[0].Tensor
+		target := hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), embeds.Dim(0)*embeds.Dim(1))
+		ctx.Forward(embeds.Copy(ctx, target))
+
+		grid, ok := mi.Multimodal[0].Data.(*Grid)
+		if !ok {
+			continue
+		}
+		cols := grid.Width / m.VisionModel.spatialMergeSize
+		for i := range embeds.Dim(1) {
+			planes[1][mi.Index+i] += int32(i / cols)
+			planes[2][mi.Index+i] += int32(i % cols)
+		}
+	}
+
+	positions := ctx.Input().FromInts(slices.Concat(planes...), len(planes[0])*len(planes))
+
+	// Decoder layers; only the final layer receives the output row selection.
+	final := len(text.Layers) - 1
+	for i := range text.Layers {
+		m.Cache.SetLayer(i)
+
+		var outputs ml.Tensor
+		if i == final {
+			outputs = batch.Outputs
+		}
+
+		hiddenStates = text.Layers[i].Forward(ctx, hiddenStates, positions, outputs, m.Cache, text.TextModelOptions)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, text.eps)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+// init registers the constructor New under the architecture name "glmocr".
+func init() {
+	model.Register("glmocr", New)
+}
--- a/model/models/glmocr/model_text.go
+++ b/model/models/glmocr/model_text.go
@@ -0,0 +1,190 @@
+package glmocr
+
+import (
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
+
+// TextModelOptions carries the text decoder hyperparameters shared by every
+// layer.
+type TextModelOptions struct {
+	hiddenSize, numHeads, numKVHeads int
+	headDim, rotaryDim               int
+	intermediateSize                 int
+	eps, ropeBase                    float32
+	// mropeSections is passed to rope.WithMRoPE by applyMRoPE.
+	mropeSections []int
+}
+
+// applyMRoPE rotates states by positions using M-RoPE with the configured
+// rotary dimension, frequency base and sections; the scale is fixed at 1.
+func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
+	const ropeScale = 1.0
+	sections := rope.WithMRoPE(o.mropeSections)
+	return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, ropeScale, sections)
+}
+
+// TextSelfAttention holds the separate query, key, value and output
+// projections of one decoder attention block.
+type TextSelfAttention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_out"`
+}
+
+// Forward computes grouped-query self-attention over hiddenStates: project to
+// Q/K/V, split into heads, apply M-RoPE to Q and K, attend through the KV
+// cache and project the result back with Output.
+func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
+	tokens := hiddenStates.Dim(1)
+
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+
+	// Split into heads: queries use numHeads, keys and values use numKVHeads.
+	query = query.Reshape(ctx, opts.headDim, opts.numHeads, tokens)
+	key = key.Reshape(ctx, opts.headDim, opts.numKVHeads, tokens)
+	value = value.Reshape(ctx, opts.headDim, opts.numKVHeads, tokens)
+
+	// Rotary position embedding on queries and keys only.
+	query = opts.applyMRoPE(ctx, query, positions)
+	key = opts.applyMRoPE(ctx, key, positions)
+
+	// Scaled dot-product attention with KV cache, scaled by 1/sqrt(headDim).
+	attended := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(opts.headDim)), cache)
+
+	// Merge heads: [headDim, numHeads, tokens] -> [numHeads*headDim, tokens].
+	merged := attended.Reshape(ctx, opts.numHeads*opts.headDim, tokens)
+	return sa.Output.Forward(ctx, merged)
+}
+
+// TextMLP is the gated feed-forward block of a decoder layer.
+type TextMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward applies SwiGLU: down(silu(gate(x)) * up(x)). opts is unused.
+func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
+	gated := mlp.Gate.Forward(ctx, hiddenStates)
+	up := mlp.Up.Forward(ctx, hiddenStates)
+	return mlp.Down.Forward(ctx, gated.SILU(ctx, up))
+}
+
+// TextDecoderLayer is one decoder layer. Both the attention and the MLP
+// sub-blocks are wrapped in an RMSNorm before and after the sub-block, with
+// the residual added after the second norm.
+type TextDecoderLayer struct {
+	// Norm applied to the layer input before attention.
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	SelfAttention *TextSelfAttention
+	// Norm applied to the attention output before the residual add.
+	PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"`
+
+	// Norm applied after the first residual, before the MLP.
+	FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP     *TextMLP
+	// Norm applied to the MLP output before the residual add.
+	PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
+}
+
+// Forward runs the layer over hiddenStates. When outputs is non-nil, both the
+// attention result and its residual are reduced to the rows selected by
+// outputs before the MLP block runs.
+func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
+	// Attention block
+	skip := hiddenStates
+	attn := l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	attn = l.SelfAttention.Forward(ctx, attn, positions, cache, opts)
+	attn = l.PostAttnNorm.Forward(ctx, attn, opts.eps)
+
+	// Keep only the requested output rows (used for the final layer).
+	if outputs != nil {
+		attn = attn.Rows(ctx, outputs)
+		skip = skip.Rows(ctx, outputs)
+	}
+	afterAttn := attn.Add(ctx, skip)
+
+	// MLP block
+	ffn := l.FFNNorm.Forward(ctx, afterAttn, opts.eps)
+	ffn = l.MLP.Forward(ctx, ffn, opts)
+	ffn = l.PostFFNNorm.Forward(ctx, ffn, opts.eps)
+
+	return ffn.Add(ctx, afterAttn)
+}
+
+// TextModel is the text decoder: token embedding, decoder layers, final norm
+// and output projection (which may fall back to the token embedding weights
+// via the "alt" tag).
+type TextModel struct {
+	TokenEmbedding *nn.Embedding      `gguf:"token_embd"`
+	Layers         []TextDecoderLayer `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm        `gguf:"output_norm"`
+	Output         *nn.Linear         `gguf:"output,alt:token_embd"`
+
+	*TextModelOptions
+
+	// positionCache holds the M-RoPE base position of every prompt token.
+	// Image tokens share one base position and differ only in their
+	// height/width offsets, and the position after an image depends on the
+	// image grid, so sequence indices cannot be used directly.
+	positionCache []int32
+	// ropeDelta is added to sequence indices beyond the cached prefix.
+	ropeDelta int32
+}
+
+// Shift re-applies M-RoPE to key with the given shift and drops the position
+// cache and ropeDelta, since they no longer describe the shifted KV cache.
+func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	m.positionCache, m.ropeDelta = nil, 0
+
+	shifted := m.applyMRoPE(ctx, key, shift)
+	return shifted, nil
+}
+
+// newTextModel reads the text decoder hyperparameters from c and allocates
+// the layer slice. When "rope.mrope_section" is absent, the M-RoPE sections
+// default to a 2:3:3 split of rotaryDim/2.
+func newTextModel(c fs.Config) *TextModel {
+	hiddenSize := int(c.Uint("embedding_length", 1536))
+	numHeads := int(c.Uint("attention.head_count", 16))
+	numKVHeads := int(c.Uint("attention.head_count_kv", 8))
+	intermediateSize := int(c.Uint("feed_forward_length", 4608))
+	eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
+	ropeBase := c.Float("rope.freq_base", 10000)
+
+	headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
+
+	// The rotary dimension defaults to the head dimension.
+	rotaryDim := int(c.Uint("rope.dimension_count", uint32(headDim)))
+	if rotaryDim <= 0 {
+		rotaryDim = headDim
+	}
+
+	var sections []int
+	if configured := c.Ints("rope.mrope_section"); len(configured) > 0 {
+		sections = make([]int, len(configured))
+		for i := range configured {
+			sections[i] = int(configured[i])
+		}
+	} else {
+		// 2:3:3 split of rotaryDim/2; for rotaryDim=64 this yields [8, 12, 12].
+		half := rotaryDim / 2
+		if half <= 0 {
+			half = 32
+		}
+		first := half * 2 / 8
+		second := half * 3 / 8
+		sections = []int{first, second, half - first - second}
+	}
+
+	opts := &TextModelOptions{
+		hiddenSize:       hiddenSize,
+		numHeads:         numHeads,
+		numKVHeads:       numKVHeads,
+		headDim:          headDim,
+		rotaryDim:        rotaryDim,
+		intermediateSize: intermediateSize,
+		eps:              eps,
+		ropeBase:         ropeBase,
+		mropeSections:    sections,
+	}
+
+	return &TextModel{
+		Layers:           make([]TextDecoderLayer, c.Uint("block_count", 16)),
+		TextModelOptions: opts,
+	}
+}
--- a/model/models/glmocr/model_vision.go
+++ b/model/models/glmocr/model_vision.go
@@ -0,0 +1,355 @@
+package glmocr
+
+import (
+	"log/slog"
+	"math"
+	"slices"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
+
+// Grid describes the patch layout of one preprocessed image.
+type Grid struct {
+	Height      int // Number of patches in height direction
+	Width       int // Number of patches in width direction
+	Temporal    int // Number of temporal steps; the image processor sets 1 for a still image
+	ImageHeight int // Full image height in pixels
+	ImageWidth  int // Full image width in pixels
+}
+
+// VisionModelOptions carries the vision encoder hyperparameters populated by
+// newVisionModel from the "vision.*" configuration keys.
+type VisionModelOptions struct {
+	hiddenSize        int
+	numHeads          int
+	// headDim is hiddenSize / numHeads.
+	headDim           int
+	numChannels       int
+	patchSize         int
+	temporalPatchSize int
+	imageSize         int
+	// spatialMergeSize is the side length, in patches, of one merge block.
+	spatialMergeSize  int
+	// outHiddenSize is the width of the downsample output.
+	outHiddenSize     int
+	intermediateSize  int
+	eps               float32
+}
+
+// VisionPatchEmbed turns flattened pixel patches into patch embeddings. Two
+// Conv2D kernels, one per temporal frame, stand in for a single Conv3D.
+type VisionPatchEmbed struct {
+	Proj  *nn.Conv2D `gguf:"patch_embd_0"`
+	Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
+	Bias  ml.Tensor  `gguf:"patch_embd.bias"`
+}
+
+// Forward embeds pixelValues ([patchDim, numPatches]) and returns a tensor
+// reshaped to [hiddenSize, numPatches]. Proj is applied to the first temporal
+// frame; when Proj1 is loaded and temporalPatchSize > 1, Proj1 is applied to
+// the second frame and the two results are summed. Bias, if present, is added
+// last. grid is accepted but not used.
+func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
+	_ = grid // patches are already in merge-block order
+
+	// pixelValues shape: [patchDim, numPatches]
+	numPatches := pixelValues.Shape()[1]
+
+	// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
+	pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
+	// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
+	pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
+
+	// Slice temporal frames for Conv2D (simulate Conv3D)
+	// in0 is the view at offset 0 with size 1 along the first dimension,
+	// i.e. the first temporal frame of every patch.
+	in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
+	in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
+
+	// Stride equals the patch size, with no padding and no dilation.
+	s0, s1 := opts.patchSize, opts.patchSize
+	p0, p1 := 0, 0
+	d0, d1 := 1, 1
+	hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
+
+	if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
+		// Same view shifted by one element stride: the second temporal frame.
+		in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
+		in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
+		out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
+		hiddenStates = hiddenStates.Add(ctx, out1)
+	}
+
+	// Flatten to [hidden_size, num_patches]
+	hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)
+
+	// Add patch bias - reshape from [hidden_size] to [hidden_size, 1] for broadcasting
+	if pe.Bias != nil {
+		hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
+	}
+
+	return hiddenStates
+}
+
+// VisionSelfAttention is the attention block of a vision encoder layer, with a
+// fused QKV projection and RMSNorm on queries and keys.
+type VisionSelfAttention struct {
+	QKV    *nn.Linear  `gguf:"attn_qkv"`
+	QNorm  *nn.RMSNorm `gguf:"attn_q_norm"`
+	KNorm  *nn.RMSNorm `gguf:"attn_k_norm"`
+	Output *nn.Linear  `gguf:"attn_out"`
+}
+
+// Forward computes non-causal, unmasked self-attention over all patches. It
+// uses the backend's ScaledDotProductAttention when the query tensor supports
+// it and otherwise falls back to explicit matmul/softmax attention.
+func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	numPatches := hiddenStates.Dim(1)
+
+	// Fused projection, then three equal sections along dim 0.
+	fused := sa.QKV.Forward(ctx, hiddenStates)
+	parts := fused.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
+
+	// The sections are views: make each contiguous, then split into heads
+	// ([hiddenSize, N] -> [headDim, numHeads, N]).
+	splitHeads := func(t ml.Tensor) ml.Tensor {
+		return t.Contiguous(ctx).Reshape(ctx, opts.headDim, opts.numHeads, numPatches)
+	}
+	q, k, v := splitHeads(parts[0]), splitHeads(parts[1]), splitHeads(parts[2])
+
+	// Per-head normalization of queries and keys.
+	q = sa.QNorm.Forward(ctx, q, opts.eps)
+	k = sa.KNorm.Forward(ctx, k, opts.eps)
+
+	// Vision RoPE over two position dimensions (H/W); the other two sections
+	// are left empty.
+	const ropeFreqBase float32 = 10000.0
+	quarter := opts.headDim / 4
+	if quarter <= 0 {
+		quarter = 1
+	}
+	sections := []int{quarter, quarter, 0, 0}
+	rotate := func(t ml.Tensor) ml.Tensor {
+		return nn.RoPE(ctx, t, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
+	}
+	q = rotate(q)
+	k = rotate(k)
+
+	scale := 1.0 / math.Sqrt(float64(opts.headDim))
+
+	// Merge heads back to [hiddenSize, N] and apply the output projection.
+	project := func(attn ml.Tensor) ml.Tensor {
+		return sa.Output.Forward(ctx, attn.Reshape(ctx, opts.hiddenSize, numPatches))
+	}
+
+	// Preferred path: backend-provided attention.
+	if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
+		return project(sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false))
+	}
+
+	slog.Warn("glmocr: vision attention falling back to manual attention",
+		"batchSize", numPatches, "numHeads", opts.numHeads,
+		"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")
+
+	// Manual fallback: swap the head and patch dims so each head attends
+	// independently, then softmax(k.q * scale) applied to v.
+	q = q.Permute(ctx, 0, 2, 1, 3)
+	k = k.Permute(ctx, 0, 2, 1, 3)
+	v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
+
+	scores := k.MulmatFullPrec(ctx, q).Scale(ctx, scale).Softmax(ctx)
+	weighted := v.Mulmat(ctx, scores)
+
+	return project(weighted.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx))
+}
+
+// VisionMLP is the gated feed-forward block of a vision encoder layer.
+type VisionMLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+// Forward applies SwiGLU: down(silu(gate(x)) * up(x)).
+func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
+	gated := mlp.Gate.Forward(ctx, hiddenStates)
+	up := mlp.Up.Forward(ctx, hiddenStates)
+	return mlp.Down.Forward(ctx, gated.SILU(ctx, up))
+}
+
+// VisionBlock is one pre-norm vision encoder layer: attention and MLP, each
+// preceded by an RMSNorm and followed by a residual add.
+type VisionBlock struct {
+	Norm1         *nn.RMSNorm `gguf:"ln1"`
+	SelfAttention *VisionSelfAttention
+	Norm2         *nn.RMSNorm `gguf:"ln2"`
+	MLP           *VisionMLP
+}
+
+// Forward applies x + attn(norm1(x)) followed by y + mlp(norm2(y)).
+func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	// Attention sub-block with residual.
+	normed := b.Norm1.Forward(ctx, hiddenStates, opts.eps)
+	attn := b.SelfAttention.Forward(ctx, normed, positions, opts)
+	afterAttn := attn.Add(ctx, hiddenStates)
+
+	// MLP sub-block with residual.
+	normed = b.Norm2.Forward(ctx, afterAttn, opts.eps)
+	ffn := b.MLP.Forward(ctx, normed)
+
+	return ffn.Add(ctx, afterAttn)
+}
+
+// VisionDownsample merges each spatialMergeSize x spatialMergeSize block of
+// patch embeddings into a single token with a strided Conv2D.
+type VisionDownsample struct {
+	*nn.Conv2D
+}
+
+// Forward takes [hidden_size, num_patches] embeddings in merge-block order and
+// returns [out_hidden_size, num_output_tokens]. If the convolution weights are
+// not loaded it logs an error and returns the input unchanged.
+func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
+	if d.Conv2D == nil || d.Weight == nil {
+		slog.Error("VisionDownsample weights not loaded - model may be corrupted or incompatible")
+		return hiddenStates
+	}
+
+	merge := opts.spatialMergeSize
+	outTokens := (grid.Height / merge) * (grid.Width / merge)
+
+	// [hidden, merge, merge, tokens] -> [merge, merge, hidden, tokens].
+	// ggml permute semantics: result.ne[perm[i]] = input.ne[i].
+	blocks := hiddenStates.
+		Reshape(ctx, opts.hiddenSize, merge, merge, outTokens).
+		Permute(ctx, 2, 0, 1, 3).
+		Contiguous(ctx)
+
+	// Strided convolution over each merge block; the kernel is the receiver.
+	// The bias is added separately after flattening.
+	const padding, dilation = 0, 1
+	merged := d.Weight.Conv2D(ctx, blocks, merge, merge, padding, padding, dilation, dilation)
+	merged = merged.Reshape(ctx, opts.outHiddenSize, outTokens)
+
+	if d.Bias == nil {
+		return merged
+	}
+	// Bias is reshaped to [out_hidden_size, 1] so it broadcasts over tokens.
+	return merged.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
+}
+
+// PatchMerger projects downsampled vision tokens for the text model. The GGUF
+// tags are relative to the "mm" prefix declared on Model.
+type PatchMerger struct {
+	Proj     *nn.Linear    `gguf:"model.fc"`  // mm.model.fc.weight
+	PostLN   *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
+	GateProj *nn.Linear    `gguf:"gate"`      // mm.gate.weight
+	UpProj   *nn.Linear    `gguf:"up"`        // mm.up.weight
+	DownProj *nn.Linear    `gguf:"down"`      // mm.down.weight
+}
+
+// Forward applies linear projection -> LayerNorm -> GELU (erf) -> SwiGLU MLP.
+func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
+	projected := m.Proj.Forward(ctx, hiddenStates)
+
+	// Contiguous forces a copy of the GELU result before it feeds both the
+	// gate and the up projections.
+	activated := m.PostLN.Forward(ctx, projected, opts.eps).GELU_ERF(ctx).Contiguous(ctx)
+
+	// SwiGLU MLP: down(silu(gate(x)) * up(x))
+	gate := m.GateProj.Forward(ctx, activated)
+	up := m.UpProj.Forward(ctx, activated)
+	return m.DownProj.Forward(ctx, gate.SILU(ctx, up))
+}
+
+// VisionModel is the vision encoder: patch embedding, a stack of blocks and a
+// final RMSNorm. The downsample stage lives on Model so that mm.patch_merger
+// stays a separate module.
+type VisionModel struct {
+	PatchEmbed *VisionPatchEmbed
+	Blocks     []VisionBlock `gguf:"blk"`
+	PostLN     *nn.RMSNorm   `gguf:"post_ln"`
+
+	*VisionModelOptions
+}
+
+// Forward embeds the flattened patches in pixelValues, runs every block with
+// the 2D positions derived from grid, and applies the final norm. The
+// downsample and patch merger are applied by the caller.
+func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
+	opts := m.VisionModelOptions
+
+	states := m.PatchEmbed.Forward(ctx, pixelValues, grid, opts)
+
+	// Position ids follow the same merge-block order as the patches.
+	positions := m.createPositions(ctx, grid)
+
+	for i := range m.Blocks {
+		states = m.Blocks[i].Forward(ctx, states, positions, opts)
+	}
+
+	return m.PostLN.Forward(ctx, states, opts.eps)
+}
+
+// createPositions builds the position ids for vision RoPE as four
+// concatenated planes of grid.Height*grid.Width entries each: patch row,
+// patch column, and two all-zero planes. Entries are listed in merge-block
+// order: blocks row-major, and patches row-major inside each block.
+func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
+	total := grid.Height * grid.Width
+	merge := m.spatialMergeSize
+
+	rows := make([]int32, total)
+	cols := make([]int32, total)
+
+	next := 0
+	for blockY := 0; blockY < grid.Height; blockY += merge {
+		for blockX := 0; blockX < grid.Width; blockX += merge {
+			for dy := 0; dy < merge; dy++ {
+				for dx := 0; dx < merge; dx++ {
+					rows[next], cols[next] = int32(blockY+dy), int32(blockX+dx)
+					next++
+				}
+			}
+		}
+	}
+
+	// Only the height and width planes carry data; the last two stay zero.
+	unused := make([]int32, total)
+	planes := slices.Concat(rows, cols, unused, unused)
+
+	return ctx.Input().FromInts(planes, total*4)
+}
+
+// newVisionModel reads the vision encoder hyperparameters from the "vision.*"
+// keys of c and allocates the block slice. headDim is derived as
+// hiddenSize / numHeads.
+func newVisionModel(c fs.Config) *VisionModel {
+	opts := &VisionModelOptions{
+		hiddenSize:        int(c.Uint("vision.embedding_length", 1024)),
+		numHeads:          int(c.Uint("vision.attention.head_count", 16)),
+		numChannels:       int(c.Uint("vision.num_channels", 3)),
+		patchSize:         int(c.Uint("vision.patch_size", 14)),
+		temporalPatchSize: int(c.Uint("vision.temporal_patch_size", 2)),
+		imageSize:         int(c.Uint("vision.image_size", 336)),
+		spatialMergeSize:  int(c.Uint("vision.spatial_merge_size", 2)),
+		outHiddenSize:     int(c.Uint("vision.out_hidden_size", 1536)),
+		intermediateSize:  int(c.Uint("vision.intermediate_size", 4096)),
+		eps:               c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5),
+	}
+	opts.headDim = opts.hiddenSize / opts.numHeads
+
+	return &VisionModel{
+		Blocks:             make([]VisionBlock, c.Uint("vision.block_count", 24)),
+		VisionModelOptions: opts,
+	}
+}
--- a/model/models/models.go
+++ b/model/models/models.go
@@ -8,6 +8,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/gemma3"
 	_ "github.com/ollama/ollama/model/models/gemma3n"
 	_ "github.com/ollama/ollama/model/models/glm4moelite"
+	_ "github.com/ollama/ollama/model/models/glmocr"
 	_ "github.com/ollama/ollama/model/models/gptoss"
 	_ "github.com/ollama/ollama/model/models/lfm2"
 	_ "github.com/ollama/ollama/model/models/llama"