mirror of
https://github.com/ollama/ollama.git
synced 2026-04-30 07:57:51 -05:00
Add GLM-OCR vision model support (#14024)
This commit is contained in:
174
model/models/glmocr/imageprocessor.go
Normal file
174
model/models/glmocr/imageprocessor.go
Normal file
@@ -0,0 +1,174 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"image"
|
||||
"log/slog"
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/model/imageproc"
|
||||
)
|
||||
|
||||
type ImageProcessor struct {
|
||||
imageSize int
|
||||
patchSize int
|
||||
temporalPatchSize int
|
||||
spatialMergeSize int
|
||||
minPixels int
|
||||
maxPixels int
|
||||
factor int
|
||||
imageMean [3]float32
|
||||
imageStd [3]float32
|
||||
}
|
||||
|
||||
func newImageProcessor(c fs.Config) ImageProcessor {
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
|
||||
|
||||
// Read normalization values from config if available, otherwise use CLIP defaults
|
||||
imageMean := c.Floats("vision.image_mean", imageproc.ClipDefaultMean[:])
|
||||
imageStd := c.Floats("vision.image_std", imageproc.ClipDefaultSTD[:])
|
||||
|
||||
// Default max_pixels: 2048 * patchSize^2 * mergeSize^2 * temporal = ~3.2M pixels
|
||||
// This limits to ~16k patches (4k output tokens) to keep memory stable without flash attention
|
||||
defaultMaxPixels := 2048 * patchSize * patchSize * spatialMergeSize * spatialMergeSize * temporalPatchSize
|
||||
|
||||
return ImageProcessor{
|
||||
imageSize: int(c.Uint("vision.image_size", 336)),
|
||||
patchSize: patchSize,
|
||||
temporalPatchSize: temporalPatchSize,
|
||||
spatialMergeSize: spatialMergeSize,
|
||||
minPixels: int(c.Uint("vision.min_pixels", uint32(8*patchSize*patchSize*spatialMergeSize*spatialMergeSize*temporalPatchSize))),
|
||||
maxPixels: int(c.Uint("vision.max_pixels", uint32(defaultMaxPixels))),
|
||||
factor: patchSize * spatialMergeSize,
|
||||
imageMean: [3]float32{imageMean[0], imageMean[1], imageMean[2]},
|
||||
imageStd: [3]float32{imageStd[0], imageStd[1], imageStd[2]},
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) SmartResize(height, width int) (int, int) {
|
||||
factor := p.factor
|
||||
temporalFactor := p.temporalPatchSize
|
||||
numFrames := temporalFactor // single image
|
||||
|
||||
if height < factor || width < factor {
|
||||
// Scale up small images
|
||||
scale := float64(factor) / float64(min(height, width))
|
||||
height = int(math.Ceil(float64(height) * scale))
|
||||
width = int(math.Ceil(float64(width) * scale))
|
||||
}
|
||||
|
||||
if temporalFactor <= 0 {
|
||||
slog.Warn("temporal_patch_size must be > 0, defaulting to 1")
|
||||
temporalFactor = 1
|
||||
}
|
||||
if numFrames < temporalFactor {
|
||||
slog.Warn("num_frames must be >= temporal_patch_size, adjusting num_frames", "num_frames", numFrames, "temporal_patch_size", temporalFactor)
|
||||
numFrames = temporalFactor
|
||||
}
|
||||
if aspectRatio := float64(max(height, width)) / float64(min(height, width)); aspectRatio > 200 {
|
||||
slog.Warn("aspect ratio exceeds 200, image quality may be affected", "aspect_ratio", aspectRatio)
|
||||
}
|
||||
|
||||
round := func(x float64) int { return int(math.RoundToEven(x)) }
|
||||
|
||||
hBar := round(float64(height)/float64(factor)) * factor
|
||||
wBar := round(float64(width)/float64(factor)) * factor
|
||||
tBar := round(float64(numFrames)/float64(temporalFactor)) * temporalFactor
|
||||
|
||||
if tBar*hBar*wBar > p.maxPixels {
|
||||
beta := math.Sqrt(float64(numFrames*height*width) / float64(p.maxPixels))
|
||||
hBar = int(math.Floor(float64(height)/beta/float64(factor))) * factor
|
||||
wBar = int(math.Floor(float64(width)/beta/float64(factor))) * factor
|
||||
} else if tBar*hBar*wBar < p.minPixels {
|
||||
beta := math.Sqrt(float64(p.minPixels) / float64(numFrames*height*width))
|
||||
hBar = int(math.Ceil(float64(height)*beta/float64(factor))) * factor
|
||||
wBar = int(math.Ceil(float64(width)*beta/float64(factor))) * factor
|
||||
}
|
||||
|
||||
return hBar, wBar
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) ProcessImage(img image.Image) ([]float32, *Grid, error) {
|
||||
img = imageproc.Composite(img)
|
||||
|
||||
origWidth := img.Bounds().Dx()
|
||||
origHeight := img.Bounds().Dy()
|
||||
|
||||
// Calculate smart resize dimensions
|
||||
resizedHeight, resizedWidth := p.SmartResize(origHeight, origWidth)
|
||||
|
||||
// Resize image
|
||||
resizedImg := imageproc.Resize(img, image.Point{X: resizedWidth, Y: resizedHeight}, imageproc.ResizeCatmullrom)
|
||||
|
||||
// Normalize pixels - output format is [C, H, W] with rescale and channelFirst
|
||||
// We keep [C, H, W] for patch extraction
|
||||
normalizedPixels := imageproc.Normalize(resizedImg, p.imageMean, p.imageStd, true, true)
|
||||
|
||||
// Calculate grid dimensions (after Conv2D patching)
|
||||
grid := &Grid{
|
||||
Height: resizedHeight / p.patchSize,
|
||||
Width: resizedWidth / p.patchSize,
|
||||
Temporal: 1, // Single image
|
||||
ImageHeight: resizedHeight,
|
||||
ImageWidth: resizedWidth,
|
||||
}
|
||||
|
||||
patches, err := p.createPatches(normalizedPixels, resizedHeight, resizedWidth, grid)
|
||||
if err != nil {
|
||||
return nil, nil, err
|
||||
}
|
||||
|
||||
return patches, grid, nil
|
||||
}
|
||||
|
||||
func (p *ImageProcessor) createPatches(pixels []float32, height, width int, grid *Grid) ([]float32, error) {
|
||||
channels := 3
|
||||
patchSize := p.patchSize
|
||||
mergeSize := p.spatialMergeSize
|
||||
temporalPatchSize := p.temporalPatchSize
|
||||
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
patchDim := channels * temporalPatchSize * patchSize * patchSize
|
||||
result := make([]float32, numPatches*patchDim)
|
||||
patchIndex := 0
|
||||
|
||||
// Single temporal frame handling (copies to all frames)
|
||||
for range grid.Temporal {
|
||||
for h := 0; h < grid.Height; h += mergeSize {
|
||||
for w := 0; w < grid.Width; w += mergeSize {
|
||||
for mh := range mergeSize {
|
||||
for mw := range mergeSize {
|
||||
baseOffset := patchIndex * patchDim
|
||||
for c := range channels {
|
||||
channelOffset := baseOffset + (c * temporalPatchSize * patchSize * patchSize)
|
||||
for py := range patchSize {
|
||||
for px := range patchSize {
|
||||
y := (h+mh)*patchSize + py
|
||||
x := (w+mw)*patchSize + px
|
||||
srcIdx := c*height*width + y*width + x
|
||||
dstIdx := channelOffset + (py * patchSize) + px
|
||||
result[dstIdx] = pixels[srcIdx]
|
||||
}
|
||||
}
|
||||
|
||||
if temporalPatchSize > 1 {
|
||||
frameSize := patchSize * patchSize
|
||||
for tp := 1; tp < temporalPatchSize; tp++ {
|
||||
currentFrameOffset := channelOffset + (tp * frameSize)
|
||||
copy(result[currentFrameOffset:currentFrameOffset+frameSize],
|
||||
result[channelOffset:channelOffset+frameSize])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
patchIndex++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
235
model/models/glmocr/model.go
Normal file
235
model/models/glmocr/model.go
Normal file
@@ -0,0 +1,235 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"image"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/model"
|
||||
"github.com/ollama/ollama/model/input"
|
||||
)
|
||||
|
||||
type Model struct {
|
||||
model.Base
|
||||
model.BytePairEncoding
|
||||
|
||||
*TextModel
|
||||
*VisionModel `gguf:"v"`
|
||||
VisionDownsample *VisionDownsample `gguf:"mm.patch_merger"`
|
||||
PatchMerger *PatchMerger `gguf:"mm"`
|
||||
|
||||
ImageProcessor
|
||||
|
||||
imageTokenID int32
|
||||
imageStartTokenID int32
|
||||
imageEndTokenID int32
|
||||
}
|
||||
|
||||
var _ model.MultimodalProcessor = (*Model)(nil)
|
||||
|
||||
func New(c fs.Config) (model.Model, error) {
|
||||
eosTokenID := int32(c.Uint("tokenizer.ggml.eos_token_id"))
|
||||
eosTokenIDs := c.Ints("tokenizer.ggml.eos_token_ids")
|
||||
allEOS := append([]int32{eosTokenID}, eosTokenIDs...)
|
||||
|
||||
m := &Model{
|
||||
BytePairEncoding: model.NewBytePairEncoding(
|
||||
&model.Vocabulary{
|
||||
Values: c.Strings("tokenizer.ggml.tokens"),
|
||||
Types: c.Ints("tokenizer.ggml.token_type"),
|
||||
Merges: c.Strings("tokenizer.ggml.merges"),
|
||||
AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
|
||||
BOS: []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
|
||||
AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
|
||||
EOS: allEOS,
|
||||
},
|
||||
`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
|
||||
),
|
||||
TextModel: newTextModel(c),
|
||||
VisionModel: newVisionModel(c),
|
||||
ImageProcessor: newImageProcessor(c),
|
||||
imageTokenID: int32(c.Uint("image_token_id", 59280)),
|
||||
imageStartTokenID: int32(c.Uint("image_start_token_id", 59256)),
|
||||
imageEndTokenID: int32(c.Uint("image_end_token_id", 59257)),
|
||||
}
|
||||
|
||||
m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
|
||||
|
||||
return m, nil
|
||||
}
|
||||
|
||||
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
|
||||
if len(m.VisionModel.Blocks) == 0 {
|
||||
return nil, model.ErrNoVisionModel
|
||||
}
|
||||
|
||||
img, _, err := image.Decode(bytes.NewReader(multimodalData))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
f32s, grid, err := m.ImageProcessor.ProcessImage(img)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Create pixel values tensor from flattened patches
|
||||
// Shape: [patchDim, numPatches]
|
||||
patchDim := m.VisionModel.numChannels * m.temporalPatchSize * m.patchSize * m.patchSize
|
||||
numPatches := grid.Temporal * grid.Height * grid.Width
|
||||
pixelValues := ctx.Input().FromFloats(f32s, patchDim, numPatches)
|
||||
|
||||
// Forward through vision encoder
|
||||
visionOutputs := m.VisionModel.Forward(ctx, pixelValues, grid)
|
||||
|
||||
// Forward through downsample (patch merger)
|
||||
if m.VisionDownsample == nil || m.VisionDownsample.Weight == nil {
|
||||
return nil, errors.New("glmocr: missing vision downsample weights")
|
||||
}
|
||||
visionOutputs = m.VisionDownsample.Forward(ctx, visionOutputs, grid, m.VisionModel.VisionModelOptions)
|
||||
|
||||
// Forward through patch merger (FC + LayerNorm + GELU + SwiGLU FFN)
|
||||
if m.PatchMerger == nil {
|
||||
return nil, errors.New("glmocr: missing patch merger weights")
|
||||
}
|
||||
visionOutputs = m.PatchMerger.Forward(ctx, visionOutputs, m.VisionModel.VisionModelOptions)
|
||||
|
||||
return []input.Multimodal{{Tensor: visionOutputs, Data: grid}}, nil
|
||||
}
|
||||
|
||||
func (m *Model) PostTokenize(inputs []*input.Input) ([]*input.Input, error) {
|
||||
var result []*input.Input
|
||||
|
||||
// Reset position cache
|
||||
m.TextModel.positionCache = m.TextModel.positionCache[:0]
|
||||
m.TextModel.ropeDelta = 0
|
||||
|
||||
pos := int32(0)
|
||||
for _, inp := range inputs {
|
||||
if inp.Multimodal == nil {
|
||||
result = append(result, inp)
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
|
||||
pos++
|
||||
continue
|
||||
}
|
||||
|
||||
// Get grid info for position calculation
|
||||
grid := inp.Multimodal[0].Data.(*Grid)
|
||||
mergedH := grid.Height / m.VisionModel.spatialMergeSize
|
||||
mergedW := grid.Width / m.VisionModel.spatialMergeSize
|
||||
|
||||
// Add image start token
|
||||
result = append(result, &input.Input{Token: m.imageStartTokenID})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
|
||||
pos++
|
||||
|
||||
// Add image tokens with multimodal data
|
||||
// All image tokens share the same base position for temporal dimension
|
||||
tokensPerGrid := inp.Multimodal[0].Tensor.Dim(1)
|
||||
basePos := pos
|
||||
sameBatch := tokensPerGrid - 1
|
||||
if sameBatch < 0 {
|
||||
sameBatch = 0
|
||||
}
|
||||
result = append(result, &input.Input{
|
||||
Token: m.imageTokenID,
|
||||
Multimodal: inp.Multimodal,
|
||||
MultimodalHash: inp.MultimodalHash,
|
||||
SameBatch: sameBatch,
|
||||
})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
|
||||
|
||||
// Add placeholder tokens for remaining positions
|
||||
// All image tokens use the same base position (temporal stays constant)
|
||||
for range tokensPerGrid - 1 {
|
||||
result = append(result, &input.Input{Token: m.imageTokenID})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, basePos)
|
||||
}
|
||||
|
||||
// Advance position by max(mergedH, mergedW) after image tokens
|
||||
pos = basePos + int32(max(mergedH, mergedW))
|
||||
|
||||
// Add image end token
|
||||
result = append(result, &input.Input{Token: m.imageEndTokenID})
|
||||
m.TextModel.positionCache = append(m.TextModel.positionCache, pos)
|
||||
pos++
|
||||
}
|
||||
|
||||
// Compute rope delta for continuation after the prefill segment:
|
||||
// delta = (max_position_id + 1) - sequence_length
|
||||
if len(m.TextModel.positionCache) > 0 {
|
||||
last := m.TextModel.positionCache[len(m.TextModel.positionCache)-1]
|
||||
m.TextModel.ropeDelta = last + 1 - int32(len(m.TextModel.positionCache))
|
||||
}
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
|
||||
// Initial token embedding
|
||||
hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs).Duplicate(ctx)
|
||||
ctx.Forward(hiddenStates)
|
||||
|
||||
// Build position slices for M-RoPE
|
||||
positionSlice := func() [][]int32 {
|
||||
s := [][]int32{
|
||||
make([]int32, len(batch.Positions)), // temporal
|
||||
make([]int32, len(batch.Positions)), // height
|
||||
make([]int32, len(batch.Positions)), // width
|
||||
make([]int32, len(batch.Positions)), // unused (zeros)
|
||||
}
|
||||
for i, position := range batch.Positions {
|
||||
// Translate through position cache or continue sequence
|
||||
if position < int32(len(m.TextModel.positionCache)) {
|
||||
position = m.TextModel.positionCache[position]
|
||||
} else if len(m.TextModel.positionCache) > 0 {
|
||||
// Continue sequence after cached positions using ropeDelta
|
||||
position = position + m.TextModel.ropeDelta
|
||||
}
|
||||
|
||||
s[0][i] = position
|
||||
s[1][i] = position
|
||||
s[2][i] = position
|
||||
}
|
||||
return s
|
||||
}()
|
||||
|
||||
// Inject vision embeddings and adjust positions for image tokens
|
||||
for _, mi := range batch.Multimodal {
|
||||
img := mi.Multimodal[0].Tensor
|
||||
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
|
||||
|
||||
if grid, ok := mi.Multimodal[0].Data.(*Grid); ok {
|
||||
w := grid.Width / m.VisionModel.spatialMergeSize
|
||||
for i := range img.Dim(1) {
|
||||
positionSlice[1][mi.Index+i] += int32(i / w)
|
||||
positionSlice[2][mi.Index+i] += int32(i % w)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
positions := ctx.Input().FromInts(slices.Concat(positionSlice...), len(positionSlice[0])*len(positionSlice))
|
||||
|
||||
// Process through transformer layers
|
||||
for i, layer := range m.TextModel.Layers {
|
||||
m.Cache.SetLayer(i)
|
||||
|
||||
var lastLayerOutputs ml.Tensor
|
||||
if i == len(m.TextModel.Layers)-1 {
|
||||
lastLayerOutputs = batch.Outputs
|
||||
}
|
||||
|
||||
hiddenStates = layer.Forward(ctx, hiddenStates, positions, lastLayerOutputs, m.Cache, m.TextModel.TextModelOptions)
|
||||
}
|
||||
|
||||
hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.TextModel.eps)
|
||||
return m.Output.Forward(ctx, hiddenStates), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
model.Register("glmocr", New)
|
||||
}
|
||||
190
model/models/glmocr/model_text.go
Normal file
190
model/models/glmocr/model_text.go
Normal file
@@ -0,0 +1,190 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/kvcache"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
)
|
||||
|
||||
type TextModelOptions struct {
|
||||
hiddenSize int
|
||||
numHeads int
|
||||
numKVHeads int
|
||||
headDim int
|
||||
rotaryDim int
|
||||
intermediateSize int
|
||||
eps float32
|
||||
ropeBase float32
|
||||
mropeSections []int
|
||||
}
|
||||
|
||||
func (o *TextModelOptions) applyMRoPE(ctx ml.Context, states, positions ml.Tensor) ml.Tensor {
|
||||
// With 4 sections for [temporal, height, width, unused]
|
||||
return nn.RoPE(ctx, states, positions, o.rotaryDim, o.ropeBase, 1.0, rope.WithMRoPE(o.mropeSections))
|
||||
}
|
||||
|
||||
type TextSelfAttention struct {
|
||||
Query *nn.Linear `gguf:"attn_q"`
|
||||
Key *nn.Linear `gguf:"attn_k"`
|
||||
Value *nn.Linear `gguf:"attn_v"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
}
|
||||
|
||||
func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
batchSize := hiddenStates.Dim(1)
|
||||
|
||||
// Separate Q, K, V projections
|
||||
q := sa.Query.Forward(ctx, hiddenStates)
|
||||
k := sa.Key.Forward(ctx, hiddenStates)
|
||||
v := sa.Value.Forward(ctx, hiddenStates)
|
||||
|
||||
// Reshape for GQA
|
||||
q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
k = k.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
|
||||
v = v.Reshape(ctx, opts.headDim, opts.numKVHeads, batchSize)
|
||||
|
||||
// Apply M-RoPE (multi-resolution rotary position embeddings)
|
||||
q = opts.applyMRoPE(ctx, q, positions)
|
||||
k = opts.applyMRoPE(ctx, k, positions)
|
||||
|
||||
// Scaled dot-product attention with KV cache
|
||||
scaleFactor := 1.0 / math.Sqrt(float64(opts.headDim))
|
||||
kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
|
||||
// Reshape attention output: [headDim, numHeads, batchSize] -> [numHeads*headDim, batchSize]
|
||||
// Note: numHeads * headDim = 16 * 128 = 2048, which is the attention hidden size
|
||||
kqv = kqv.Reshape(ctx, opts.numHeads*opts.headDim, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, kqv)
|
||||
}
|
||||
|
||||
type TextMLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *TextMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *TextModelOptions) ml.Tensor {
|
||||
// SwiGLU: down(silu(gate(x)) * up(x))
|
||||
gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, gate)
|
||||
}
|
||||
|
||||
type TextDecoderLayer struct {
|
||||
// Input layernorm (before attention)
|
||||
AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
|
||||
SelfAttention *TextSelfAttention
|
||||
// Post self-attention layernorm (after attention, before residual add)
|
||||
PostAttnNorm *nn.RMSNorm `gguf:"post_attn_norm"`
|
||||
|
||||
// FFN input layernorm (after first residual, before MLP)
|
||||
FFNNorm *nn.RMSNorm `gguf:"ffn_norm"`
|
||||
MLP *TextMLP
|
||||
// Post MLP layernorm (after MLP, before residual add)
|
||||
PostFFNNorm *nn.RMSNorm `gguf:"post_ffn_norm"`
|
||||
}
|
||||
|
||||
func (l *TextDecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *TextModelOptions) ml.Tensor {
|
||||
// Attention block
|
||||
residual := hiddenStates
|
||||
hiddenStates = l.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = l.SelfAttention.Forward(ctx, hiddenStates, positions, cache, opts)
|
||||
hiddenStates = l.PostAttnNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
|
||||
// Prune to output positions in final layer
|
||||
if outputs != nil {
|
||||
hiddenStates = hiddenStates.Rows(ctx, outputs)
|
||||
residual = residual.Rows(ctx, outputs)
|
||||
}
|
||||
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
// MLP block
|
||||
residual = hiddenStates
|
||||
hiddenStates = l.FFNNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = l.MLP.Forward(ctx, hiddenStates, opts)
|
||||
hiddenStates = l.PostFFNNorm.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type TextModel struct {
|
||||
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
|
||||
Layers []TextDecoderLayer `gguf:"blk"`
|
||||
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
|
||||
Output *nn.Linear `gguf:"output,alt:token_embd"`
|
||||
|
||||
*TextModelOptions
|
||||
|
||||
// positionCache stores the M-RoPE position for each token in the sequence.
|
||||
// This is needed because image tokens share the same base position but have
|
||||
// different height/width offsets, and the end token position depends on the
|
||||
// image grid dimensions.
|
||||
positionCache []int32
|
||||
ropeDelta int32
|
||||
}
|
||||
|
||||
func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
|
||||
// Clear position cache when KV cache shifts
|
||||
m.positionCache = nil
|
||||
m.ropeDelta = 0
|
||||
return m.applyMRoPE(ctx, key, shift), nil
|
||||
}
|
||||
|
||||
func newTextModel(c fs.Config) *TextModel {
|
||||
hiddenSize := int(c.Uint("embedding_length", 1536))
|
||||
numHeads := int(c.Uint("attention.head_count", 16))
|
||||
numKVHeads := int(c.Uint("attention.head_count_kv", 8))
|
||||
intermediateSize := int(c.Uint("feed_forward_length", 4608))
|
||||
eps := c.Float("attention.layer_norm_rms_epsilon", 1e-5)
|
||||
ropeBase := c.Float("rope.freq_base", 10000)
|
||||
|
||||
headDim := int(c.Uint("attention.key_length", uint32(hiddenSize/numHeads)))
|
||||
ropeDim := int(c.Uint("rope.dimension_count", uint32(headDim)))
|
||||
if ropeDim <= 0 {
|
||||
ropeDim = headDim
|
||||
}
|
||||
|
||||
mropeSections := c.Ints("rope.mrope_section")
|
||||
var sectionInts []int
|
||||
|
||||
if len(mropeSections) > 0 {
|
||||
sectionInts = make([]int, len(mropeSections))
|
||||
for i, section := range mropeSections {
|
||||
sectionInts[i] = int(section)
|
||||
}
|
||||
} else {
|
||||
// Default to GLM-OCR's HF ratio (2:3:3) scaled to rotaryDim/2.
|
||||
// For rotaryDim=64 this yields [8, 12, 12].
|
||||
total := ropeDim / 2
|
||||
if total <= 0 {
|
||||
total = 32
|
||||
}
|
||||
s0 := total * 2 / 8
|
||||
s1 := total * 3 / 8
|
||||
s2 := total - s0 - s1
|
||||
sectionInts = []int{s0, s1, s2}
|
||||
}
|
||||
|
||||
// GGML rope_multi: sector = (dim_pair) % sum(sections), mapping each pair to its position dim
|
||||
rotaryDim := ropeDim
|
||||
|
||||
return &TextModel{
|
||||
Layers: make([]TextDecoderLayer, c.Uint("block_count", 16)),
|
||||
TextModelOptions: &TextModelOptions{
|
||||
hiddenSize: hiddenSize,
|
||||
numHeads: numHeads,
|
||||
numKVHeads: numKVHeads,
|
||||
headDim: headDim,
|
||||
rotaryDim: rotaryDim,
|
||||
intermediateSize: intermediateSize,
|
||||
eps: eps,
|
||||
ropeBase: ropeBase,
|
||||
mropeSections: sectionInts,
|
||||
},
|
||||
}
|
||||
}
|
||||
355
model/models/glmocr/model_vision.go
Normal file
355
model/models/glmocr/model_vision.go
Normal file
@@ -0,0 +1,355 @@
|
||||
package glmocr
|
||||
|
||||
import (
|
||||
"log/slog"
|
||||
"math"
|
||||
"slices"
|
||||
|
||||
"github.com/ollama/ollama/fs"
|
||||
"github.com/ollama/ollama/ml"
|
||||
"github.com/ollama/ollama/ml/nn"
|
||||
"github.com/ollama/ollama/ml/nn/rope"
|
||||
)
|
||||
|
||||
type Grid struct {
|
||||
Height int // Number of patches in height direction
|
||||
Width int // Number of patches in width direction
|
||||
Temporal int
|
||||
ImageHeight int // Full image height in pixels
|
||||
ImageWidth int // Full image width in pixels
|
||||
}
|
||||
|
||||
type VisionModelOptions struct {
|
||||
hiddenSize int
|
||||
numHeads int
|
||||
headDim int
|
||||
numChannels int
|
||||
patchSize int
|
||||
temporalPatchSize int
|
||||
imageSize int
|
||||
spatialMergeSize int
|
||||
outHiddenSize int
|
||||
intermediateSize int
|
||||
eps float32
|
||||
}
|
||||
|
||||
type VisionPatchEmbed struct {
|
||||
Proj *nn.Conv2D `gguf:"patch_embd_0"`
|
||||
Proj1 *nn.Conv2D `gguf:"patch_embd_1"`
|
||||
Bias ml.Tensor `gguf:"patch_embd.bias"`
|
||||
}
|
||||
|
||||
func (pe *VisionPatchEmbed) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
|
||||
_ = grid // patches are already in merge-block order
|
||||
|
||||
// pixelValues shape: [patchDim, numPatches]
|
||||
numPatches := pixelValues.Shape()[1]
|
||||
|
||||
// Reshape to [patchSize*patchSize, temporalPatchSize, numChannels, numPatches]
|
||||
pixelValues = pixelValues.Reshape(ctx, opts.patchSize*opts.patchSize, opts.temporalPatchSize, opts.numChannels, numPatches)
|
||||
// Permute to [temporalPatchSize, patchSize*patchSize, numChannels, numPatches]
|
||||
pixelValues = pixelValues.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
|
||||
|
||||
// Slice temporal frames for Conv2D (simulate Conv3D)
|
||||
in0 := pixelValues.View(ctx, 0, 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
|
||||
in0 = in0.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
|
||||
|
||||
s0, s1 := opts.patchSize, opts.patchSize
|
||||
p0, p1 := 0, 0
|
||||
d0, d1 := 1, 1
|
||||
hiddenStates := pe.Proj.Forward(ctx, in0, s0, s1, p0, p1, d0, d1)
|
||||
|
||||
if pe.Proj1 != nil && opts.temporalPatchSize > 1 {
|
||||
in1 := pixelValues.View(ctx, pixelValues.Stride(0), 1, pixelValues.Stride(1), pixelValues.Dim(1), pixelValues.Stride(2), pixelValues.Dim(2), pixelValues.Stride(3), pixelValues.Dim(3)).Contiguous(ctx)
|
||||
in1 = in1.Reshape(ctx, opts.patchSize, opts.patchSize, opts.numChannels, numPatches)
|
||||
out1 := pe.Proj1.Forward(ctx, in1, s0, s1, p0, p1, d0, d1)
|
||||
hiddenStates = hiddenStates.Add(ctx, out1)
|
||||
}
|
||||
|
||||
// Flatten to [hidden_size, num_patches]
|
||||
hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, numPatches)
|
||||
|
||||
// Add patch bias - reshape from [hidden_size] to [hidden_size, 1] for broadcasting
|
||||
if pe.Bias != nil {
|
||||
hiddenStates = hiddenStates.Add(ctx, pe.Bias.Reshape(ctx, opts.hiddenSize, 1))
|
||||
}
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type VisionSelfAttention struct {
|
||||
QKV *nn.Linear `gguf:"attn_qkv"`
|
||||
QNorm *nn.RMSNorm `gguf:"attn_q_norm"`
|
||||
KNorm *nn.RMSNorm `gguf:"attn_k_norm"`
|
||||
Output *nn.Linear `gguf:"attn_out"`
|
||||
}
|
||||
|
||||
func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
batchSize := hiddenStates.Dim(1)
|
||||
|
||||
// Combined QKV projection: [3*hidden_size, batch_size]
|
||||
qkv := sa.QKV.Forward(ctx, hiddenStates)
|
||||
|
||||
// Split using ChunkSections along dim 0 (handles byte offsets correctly)
|
||||
// ChunkSections returns views - must make contiguous before further operations
|
||||
chunks := qkv.ChunkSections(ctx, 0, opts.hiddenSize, opts.hiddenSize, opts.hiddenSize)
|
||||
q := chunks[0].Contiguous(ctx)
|
||||
k := chunks[1].Contiguous(ctx)
|
||||
v := chunks[2].Contiguous(ctx)
|
||||
|
||||
// Reshape for multi-head attention: [hiddenSize, N] -> [headDim, numHeads, N]
|
||||
q = q.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
k = k.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
v = v.Reshape(ctx, opts.headDim, opts.numHeads, batchSize)
|
||||
|
||||
// Apply Q-norm and K-norm after head reshape
|
||||
// Weights are [headDim]=64, tensor is [headDim, numHeads, N]
|
||||
q = sa.QNorm.Forward(ctx, q, opts.eps)
|
||||
k = sa.KNorm.Forward(ctx, k, opts.eps)
|
||||
|
||||
// Apply rotary position embeddings with vision-style 2D positions.
|
||||
// ggml's vision RoPE uses two position dimensions (H/W) with half-rotation pairs.
|
||||
// We provide H/W sections and leave the remaining sections empty.
|
||||
ropeFreqBase := float32(10000.0)
|
||||
section := opts.headDim / 4
|
||||
if section <= 0 {
|
||||
section = 1
|
||||
}
|
||||
sections := []int{section, section, 0, 0}
|
||||
q = nn.RoPE(ctx, q, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
|
||||
k = nn.RoPE(ctx, k, positions, opts.headDim/2, ropeFreqBase, 1.0, rope.WithVision(sections))
|
||||
|
||||
// Scale factor for scaled dot-product attention
|
||||
scale := 1.0 / math.Sqrt(float64(opts.headDim))
|
||||
|
||||
// Try flash attention first (ScaledDotProductAttention), fall back to manual
|
||||
if sdpa, ok := q.(ml.ScaledDotProductAttention); ok {
|
||||
attention := sdpa.ScaledDotProductAttention(ctx, k, v, nil, nil, nil, scale, false)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
slog.Warn("glmocr: vision attention falling back to manual attention",
|
||||
"batchSize", batchSize, "numHeads", opts.numHeads,
|
||||
"hint", "set OLLAMA_FLASH_ATTENTION=1 to enable flash attention")
|
||||
|
||||
// Manual attention fallback
|
||||
// q, k, v are [headDim, numHeads, batchSize] - GGML treats as 4D with implicit dim 3 = 1
|
||||
q = q.Permute(ctx, 0, 2, 1, 3)
|
||||
k = k.Permute(ctx, 0, 2, 1, 3)
|
||||
v = v.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)
|
||||
|
||||
// Attention scores
|
||||
kq := k.MulmatFullPrec(ctx, q)
|
||||
kq = kq.Scale(ctx, scale)
|
||||
kq = kq.Softmax(ctx)
|
||||
|
||||
// Attention output: v @ kq (note: v first)
|
||||
kqv := v.Mulmat(ctx, kq)
|
||||
attention := kqv.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
|
||||
attention = attention.Reshape(ctx, opts.hiddenSize, batchSize)
|
||||
|
||||
return sa.Output.Forward(ctx, attention)
|
||||
}
|
||||
|
||||
type VisionMLP struct {
|
||||
Gate *nn.Linear `gguf:"ffn_gate"`
|
||||
Up *nn.Linear `gguf:"ffn_up"`
|
||||
Down *nn.Linear `gguf:"ffn_down"`
|
||||
}
|
||||
|
||||
func (mlp *VisionMLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
|
||||
// SwiGLU: down(silu(gate(x)) * up(x))
|
||||
gate := mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx, mlp.Up.Forward(ctx, hiddenStates))
|
||||
return mlp.Down.Forward(ctx, gate)
|
||||
}
|
||||
|
||||
type VisionBlock struct {
|
||||
Norm1 *nn.RMSNorm `gguf:"ln1"`
|
||||
SelfAttention *VisionSelfAttention
|
||||
Norm2 *nn.RMSNorm `gguf:"ln2"`
|
||||
MLP *VisionMLP
|
||||
}
|
||||
|
||||
func (b *VisionBlock) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
// Pre-norm architecture
|
||||
residual := hiddenStates
|
||||
hiddenStates = b.Norm1.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = b.SelfAttention.Forward(ctx, hiddenStates, positions, opts)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
residual = hiddenStates
|
||||
hiddenStates = b.Norm2.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = b.MLP.Forward(ctx, hiddenStates)
|
||||
hiddenStates = hiddenStates.Add(ctx, residual)
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type VisionDownsample struct {
|
||||
*nn.Conv2D
|
||||
}
|
||||
|
||||
func (d *VisionDownsample) Forward(ctx ml.Context, hiddenStates ml.Tensor, grid *Grid, opts *VisionModelOptions) ml.Tensor {
|
||||
// Apply spatial downsampling via Conv2D
|
||||
// Input: [hidden_size, num_patches] where patches are in merge-block order
|
||||
|
||||
if d.Conv2D == nil || d.Weight == nil {
|
||||
slog.Error("VisionDownsample weights not loaded - model may be corrupted or incompatible")
|
||||
return hiddenStates // Return input unchanged as fallback
|
||||
}
|
||||
|
||||
merge := opts.spatialMergeSize
|
||||
numOutputTokens := (grid.Height / merge) * (grid.Width / merge)
|
||||
|
||||
// Step 1: Reshape to [hidden_size, merge, merge, num_output_tokens]
|
||||
hiddenStates = hiddenStates.Reshape(ctx, opts.hiddenSize, merge, merge, numOutputTokens)
|
||||
|
||||
// Step 2: Permute to [merge, merge, hidden_size, num_output_tokens]
|
||||
// ggml semantics: result.ne[perm[i]] = input.ne[i]
|
||||
// So permute(2,0,1,3) on [1024,2,2,N] gives: ne[2]=1024, ne[0]=2, ne[1]=2, ne[3]=N -> [2,2,1024,N]
|
||||
hiddenStates = hiddenStates.Permute(ctx, 2, 0, 1, 3).Contiguous(ctx)
|
||||
|
||||
// Step 3: Apply Conv2D without bias (bias added after reshape)
|
||||
// Note: ggml_conv_2d takes (kernel, input) - kernel must be receiver in ollama
|
||||
s0, s1 := merge, merge
|
||||
p0, p1 := 0, 0
|
||||
d0, d1 := 1, 1
|
||||
hiddenStates = d.Weight.Conv2D(ctx, hiddenStates, s0, s1, p0, p1, d0, d1)
|
||||
|
||||
// Step 4: Reshape to [out_hidden_size, num_output_tokens]
|
||||
hiddenStates = hiddenStates.Reshape(ctx, opts.outHiddenSize, numOutputTokens)
|
||||
|
||||
// Step 5: Add bias after reshape
|
||||
// Reshape bias from [out_hidden_size] to [out_hidden_size, 1] for proper broadcasting
|
||||
if d.Bias != nil {
|
||||
hiddenStates = hiddenStates.Add(ctx, d.Bias.Reshape(ctx, opts.outHiddenSize, 1))
|
||||
}
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
type PatchMerger struct {
|
||||
// GGUF tags align with mm.* keys used by the model
|
||||
Proj *nn.Linear `gguf:"model.fc"` // mm.model.fc.weight
|
||||
PostLN *nn.LayerNorm `gguf:"post_norm"` // mm.post_norm.weight/bias
|
||||
GateProj *nn.Linear `gguf:"gate"` // mm.gate.weight
|
||||
UpProj *nn.Linear `gguf:"up"` // mm.up.weight
|
||||
DownProj *nn.Linear `gguf:"down"` // mm.down.weight
|
||||
}
|
||||
|
||||
func (m *PatchMerger) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *VisionModelOptions) ml.Tensor {
|
||||
// Linear projection
|
||||
hiddenStates = m.Proj.Forward(ctx, hiddenStates)
|
||||
|
||||
// Post-projection layer norm + GELU ERF
|
||||
hiddenStates = m.PostLN.Forward(ctx, hiddenStates, opts.eps)
|
||||
hiddenStates = hiddenStates.GELU_ERF(ctx)
|
||||
// Force a copy to avoid in-place mutation issues with GELU_ERF
|
||||
hiddenStates = hiddenStates.Contiguous(ctx)
|
||||
|
||||
// SwiGLU MLP: down(silu(gate(x)) * up(x))
|
||||
gateOut := m.GateProj.Forward(ctx, hiddenStates)
|
||||
upOut := m.UpProj.Forward(ctx, hiddenStates)
|
||||
gate := gateOut.SILU(ctx, upOut)
|
||||
return m.DownProj.Forward(ctx, gate)
|
||||
}
|
||||
|
||||
type VisionModel struct {
|
||||
PatchEmbed *VisionPatchEmbed
|
||||
Blocks []VisionBlock `gguf:"blk"`
|
||||
PostLN *nn.RMSNorm `gguf:"post_ln"`
|
||||
// Note: Downsample is applied at the model level so mm.patch_merger stays separate
|
||||
|
||||
*VisionModelOptions
|
||||
}
|
||||
|
||||
func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor, grid *Grid) ml.Tensor {
|
||||
// Extract patch embeddings from flattened patches
|
||||
hiddenStates := m.PatchEmbed.Forward(ctx, pixelValues, grid, m.VisionModelOptions)
|
||||
|
||||
// Create position IDs for RoPE (spatial grid)
|
||||
// Patches are already in merge-block order from preprocessing
|
||||
positions := m.createPositions(ctx, grid)
|
||||
|
||||
// Process through vision blocks
|
||||
for _, block := range m.Blocks {
|
||||
hiddenStates = block.Forward(ctx, hiddenStates, positions, m.VisionModelOptions)
|
||||
}
|
||||
|
||||
// Post-layernorm
|
||||
hiddenStates = m.PostLN.Forward(ctx, hiddenStates, m.eps)
|
||||
|
||||
// Note: Downsample is now applied separately in Model.EncodeMultimodal
|
||||
// so mm.patch_merger remains a distinct module
|
||||
|
||||
return hiddenStates
|
||||
}
|
||||
|
||||
func (m *VisionModel) createPositions(ctx ml.Context, grid *Grid) ml.Tensor {
|
||||
// Create spatial position IDs for vision RoPE
|
||||
// Position layout: [height, width, height, width] - 4 sections for mrope
|
||||
// Patches are in MERGE-BLOCK order after VisionPatchEmbed interleaving
|
||||
// This follows the GLM-OCR rot_pos_emb layout
|
||||
numPatches := grid.Height * grid.Width
|
||||
mergeRatio := m.spatialMergeSize
|
||||
|
||||
// Build position arrays in merge-block order
|
||||
// Each merge_ratio x merge_ratio block of patches is grouped together
|
||||
hpos := make([]int32, numPatches)
|
||||
wpos := make([]int32, numPatches)
|
||||
ptr := 0
|
||||
for y := 0; y < grid.Height; y += mergeRatio {
|
||||
for x := 0; x < grid.Width; x += mergeRatio {
|
||||
for dy := range mergeRatio {
|
||||
for dx := range mergeRatio {
|
||||
hpos[ptr] = int32(y + dy)
|
||||
wpos[ptr] = int32(x + dx)
|
||||
ptr++
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build position arrays for 4 sections (mrope). ggml vision RoPE uses only H/W;
|
||||
// keep remaining sections zeroed to match its conventions.
|
||||
zeros := make([]int32, numPatches)
|
||||
s := [][]int32{
|
||||
hpos, // Section 0: height
|
||||
wpos, // Section 1: width
|
||||
zeros, // Section 2: unused
|
||||
zeros, // Section 3: unused
|
||||
}
|
||||
|
||||
return ctx.Input().FromInts(slices.Concat(s...), numPatches*4)
|
||||
}
|
||||
|
||||
func newVisionModel(c fs.Config) *VisionModel {
|
||||
hiddenSize := int(c.Uint("vision.embedding_length", 1024))
|
||||
numHeads := int(c.Uint("vision.attention.head_count", 16))
|
||||
numChannels := int(c.Uint("vision.num_channels", 3))
|
||||
patchSize := int(c.Uint("vision.patch_size", 14))
|
||||
temporalPatchSize := int(c.Uint("vision.temporal_patch_size", 2))
|
||||
imageSize := int(c.Uint("vision.image_size", 336))
|
||||
spatialMergeSize := int(c.Uint("vision.spatial_merge_size", 2))
|
||||
outHiddenSize := int(c.Uint("vision.out_hidden_size", 1536))
|
||||
intermediateSize := int(c.Uint("vision.intermediate_size", 4096))
|
||||
eps := c.Float("vision.attention.layer_norm_rms_epsilon", 1e-5)
|
||||
|
||||
return &VisionModel{
|
||||
Blocks: make([]VisionBlock, c.Uint("vision.block_count", 24)),
|
||||
VisionModelOptions: &VisionModelOptions{
|
||||
hiddenSize: hiddenSize,
|
||||
numHeads: numHeads,
|
||||
headDim: hiddenSize / numHeads,
|
||||
numChannels: numChannels,
|
||||
patchSize: patchSize,
|
||||
temporalPatchSize: temporalPatchSize,
|
||||
imageSize: imageSize,
|
||||
spatialMergeSize: spatialMergeSize,
|
||||
outHiddenSize: outHiddenSize,
|
||||
intermediateSize: intermediateSize,
|
||||
eps: eps,
|
||||
},
|
||||
}
|
||||
}
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
_ "github.com/ollama/ollama/model/models/gemma3"
|
||||
_ "github.com/ollama/ollama/model/models/gemma3n"
|
||||
_ "github.com/ollama/ollama/model/models/glm4moelite"
|
||||
_ "github.com/ollama/ollama/model/models/glmocr"
|
||||
_ "github.com/ollama/ollama/model/models/gptoss"
|
||||
_ "github.com/ollama/ollama/model/models/lfm2"
|
||||
_ "github.com/ollama/ollama/model/models/llama"
|
||||
|
||||
Reference in New Issue
Block a user