mirror of
https://github.com/ollama/ollama.git
synced 2026-03-09 07:16:38 -05:00
model: improvements to LFM architectures (#14368)
This commit is contained in:
@@ -316,8 +316,10 @@ func LoadModelMetadata(fsys fs.FS) (ModelKV, *Tokenizer, error) {
|
||||
conv = &glm4MoeLiteModel{}
|
||||
case "GlmOcrForConditionalGeneration":
|
||||
conv = &glmOcrModel{}
|
||||
case "Lfm2ForCausalLM":
|
||||
case "Lfm2ForCausalLM", "Lfm2MoeForCausalLM":
|
||||
conv = &lfm2Model{}
|
||||
case "Lfm2VlForConditionalGeneration":
|
||||
conv = &lfm2VLTextModel{}
|
||||
case "Qwen3NextForCausalLM":
|
||||
conv = &qwen3NextModel{}
|
||||
case "NemotronHForCausalLM":
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"fmt"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
@@ -13,42 +15,149 @@ type lfm2Model struct {
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
MaxPositionEmbeddings uint32 `json:"max_position_embeddings"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
BlockFFDim uint32 `json:"block_ff_dim"`
|
||||
BlockMultipleOf uint32 `json:"block_multiple_of"`
|
||||
BlockAutoAdjustFFDim bool `json:"block_auto_adjust_ff_dim"`
|
||||
BlockFFNDimMultiplier float32 `json:"block_ffn_dim_multiplier"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumKeyValueHeads uint32 `json:"num_key_value_heads"`
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
NormEps float32 `json:"norm_eps"`
|
||||
ConvLCache uint32 `json:"conv_L_cache"`
|
||||
MoEIntermediateSize uint32 `json:"moe_intermediate_size"`
|
||||
NumExperts uint32 `json:"num_experts"`
|
||||
NumLocalExperts uint32 `json:"num_local_experts"`
|
||||
NumExpertsPerToken uint32 `json:"num_experts_per_tok"`
|
||||
NumDenseLayers uint32 `json:"num_dense_layers"`
|
||||
RoutedScalingFactor float32 `json:"routed_scaling_factor"`
|
||||
LayerTypes []string `json:"layer_types"`
|
||||
TieEmbedding bool `json:"tie_embedding"`
|
||||
RopeParameters struct {
|
||||
RopeTheta float32 `json:"rope_theta"`
|
||||
} `json:"rope_parameters"`
|
||||
}
|
||||
|
||||
var _ ModelConverter = (*lfm2Model)(nil)
|
||||
|
||||
const (
|
||||
defaultMaxPositionEmbeddings = uint32(128_000)
|
||||
fallbackContextLength = uint32(32_768)
|
||||
)
|
||||
|
||||
func (p *lfm2Model) isMoE() bool {
|
||||
return p.ModelType == "lfm2_moe" || p.expertCount() > 0
|
||||
}
|
||||
|
||||
func (p *lfm2Model) ropeFreqBase() float32 {
|
||||
if p.RopeTheta != 0 {
|
||||
return p.RopeTheta
|
||||
}
|
||||
|
||||
return p.RopeParameters.RopeTheta
|
||||
}
|
||||
|
||||
func (p *lfm2Model) expertCount() uint32 {
|
||||
if p.NumLocalExperts > 0 {
|
||||
return p.NumLocalExperts
|
||||
}
|
||||
return p.NumExperts
|
||||
}
|
||||
|
||||
func (p *lfm2Model) feedForwardLength() uint32 {
|
||||
ff := p.IntermediateSize
|
||||
if p.BlockFFDim != 0 {
|
||||
ff = p.BlockFFDim
|
||||
}
|
||||
|
||||
if !p.BlockAutoAdjustFFDim || p.BlockMultipleOf == 0 {
|
||||
return ff
|
||||
}
|
||||
|
||||
ff = (2 * ff) / 3
|
||||
|
||||
// Keep default multiplier behavior consistent with llama.cpp conversion.
|
||||
if p.BlockFFNDimMultiplier != 0 {
|
||||
ff = uint32(float32(ff) * p.BlockFFNDimMultiplier)
|
||||
}
|
||||
|
||||
m := p.BlockMultipleOf
|
||||
return m * ((ff + m - 1) / m)
|
||||
}
|
||||
|
||||
func (p *lfm2Model) hasKnownContextLengthFallbackSignature() bool {
|
||||
return p.isMoE() &&
|
||||
p.VocabSize == 65536 &&
|
||||
p.HiddenSize == 2048 &&
|
||||
p.NumHiddenLayers == 40 &&
|
||||
p.IntermediateSize == 11776 &&
|
||||
p.NumAttentionHeads == 32 &&
|
||||
p.NumKeyValueHeads == 8 &&
|
||||
p.NumDenseLayers == 2 &&
|
||||
p.expertCount() == 64 &&
|
||||
p.NumExpertsPerToken == 4 &&
|
||||
p.MoEIntermediateSize == 1536
|
||||
}
|
||||
|
||||
func (p *lfm2Model) contextLength() uint32 {
|
||||
if p.MaxPositionEmbeddings == defaultMaxPositionEmbeddings && p.hasKnownContextLengthFallbackSignature() {
|
||||
return fallbackContextLength
|
||||
}
|
||||
|
||||
return p.MaxPositionEmbeddings
|
||||
}
|
||||
|
||||
func (p *lfm2Model) KV(t *Tokenizer) KV {
|
||||
architecture := "lfm2"
|
||||
if p.isMoE() {
|
||||
architecture = "lfm2moe"
|
||||
}
|
||||
|
||||
kv := p.ModelParameters.KV(t)
|
||||
kv["general.architecture"] = "lfm2"
|
||||
kv["lfm2.vocab_size"] = p.VocabSize
|
||||
kv["lfm2.block_count"] = p.NumHiddenLayers
|
||||
kv["lfm2.embedding_length"] = p.HiddenSize
|
||||
kv["lfm2.feed_forward_length"] = p.IntermediateSize
|
||||
kv["lfm2.context_length"] = p.MaxPositionEmbeddings
|
||||
kv["general.architecture"] = architecture
|
||||
kv["tokenizer.ggml.pre"] = "lfm2"
|
||||
kv["vocab_size"] = p.VocabSize
|
||||
kv["block_count"] = p.NumHiddenLayers
|
||||
kv["embedding_length"] = p.HiddenSize
|
||||
kv["feed_forward_length"] = p.feedForwardLength()
|
||||
kv["context_length"] = p.contextLength()
|
||||
|
||||
// Build per-layer KV head count array based on layer_types
|
||||
// (0 = shortconv layer, non-zero = attention layer with that many KV heads)
|
||||
// (0 = shortconv layer, non-zero = attention layer with that many KV heads).
|
||||
//
|
||||
// Dense LFM2 in HF defaults to all attention layers when layer_types is absent.
|
||||
// Preserve that behavior to avoid accidentally emitting all-conv metadata.
|
||||
kvHeadCounts := make([]uint32, p.NumHiddenLayers)
|
||||
for i := range p.NumHiddenLayers {
|
||||
if int(i) < len(p.LayerTypes) && p.LayerTypes[i] == "full_attention" {
|
||||
if len(p.LayerTypes) == 0 {
|
||||
for i := range p.NumHiddenLayers {
|
||||
kvHeadCounts[i] = p.NumKeyValueHeads
|
||||
}
|
||||
} else {
|
||||
for i := range p.NumHiddenLayers {
|
||||
if int(i) < len(p.LayerTypes) && p.LayerTypes[i] == "full_attention" {
|
||||
kvHeadCounts[i] = p.NumKeyValueHeads
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kv["lfm2.attention.head_count"] = p.NumAttentionHeads
|
||||
kv["lfm2.attention.head_count_kv"] = kvHeadCounts
|
||||
kv["lfm2.attention.key_length"] = p.HiddenSize / p.NumAttentionHeads
|
||||
kv["lfm2.attention.value_length"] = p.HiddenSize / p.NumAttentionHeads
|
||||
kv["lfm2.attention.layer_norm_rms_epsilon"] = p.NormEps
|
||||
kv["lfm2.rope.freq_base"] = p.RopeTheta
|
||||
kv["lfm2.shortconv.l_cache"] = p.ConvLCache
|
||||
kv["attention.head_count"] = p.NumAttentionHeads
|
||||
kv["attention.head_count_kv"] = kvHeadCounts
|
||||
kv["attention.key_length"] = p.HiddenSize / p.NumAttentionHeads
|
||||
kv["attention.value_length"] = p.HiddenSize / p.NumAttentionHeads
|
||||
kv["attention.layer_norm_rms_epsilon"] = p.NormEps
|
||||
kv["shortconv.l_cache"] = p.ConvLCache
|
||||
|
||||
if ropeFreqBase := p.ropeFreqBase(); ropeFreqBase != 0 {
|
||||
kv["rope.freq_base"] = ropeFreqBase
|
||||
}
|
||||
|
||||
if p.isMoE() {
|
||||
kv["expert_count"] = p.expertCount()
|
||||
kv["expert_used_count"] = p.NumExpertsPerToken
|
||||
kv["expert_feed_forward_length"] = p.MoEIntermediateSize
|
||||
kv["leading_dense_block_count"] = p.NumDenseLayers
|
||||
kv["expert_gating_func"] = uint32(2) // sigmoid
|
||||
kv["expert_weights_scale"] = cmp.Or(p.RoutedScalingFactor, float32(1.0))
|
||||
}
|
||||
|
||||
return kv
|
||||
}
|
||||
@@ -56,6 +165,30 @@ func (p *lfm2Model) KV(t *Tokenizer) KV {
|
||||
func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
if p.isMoE() {
|
||||
merges := make([]merge, 0, p.NumHiddenLayers*3)
|
||||
for i := range p.NumHiddenLayers {
|
||||
if i < p.NumDenseLayers {
|
||||
continue
|
||||
}
|
||||
|
||||
merges = append(merges, merge{
|
||||
fmt.Sprintf("blk.%d.feed_forward.experts.*.w1.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_gate_exps.weight", i),
|
||||
}, merge{
|
||||
fmt.Sprintf("blk.%d.feed_forward.experts.*.w2.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_down_exps.weight", i),
|
||||
}, merge{
|
||||
fmt.Sprintf("blk.%d.feed_forward.experts.*.w3.weight", i),
|
||||
fmt.Sprintf("blk.%d.ffn_up_exps.weight", i),
|
||||
})
|
||||
}
|
||||
|
||||
merged, remaining := mergeTensors(ts, merges...)
|
||||
out = append(out, merged...)
|
||||
ts = remaining
|
||||
}
|
||||
|
||||
for _, t := range ts {
|
||||
shape := t.Shape()
|
||||
|
||||
@@ -80,7 +213,7 @@ func (p *lfm2Model) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
func (p *lfm2Model) Replacements() []string {
|
||||
return []string{
|
||||
"model.embed_tokens", "token_embd",
|
||||
"model.embedding_norm", "output_norm",
|
||||
"model.embedding_norm", "token_embd_norm",
|
||||
"model.layers", "blk",
|
||||
"operator_norm", "attn_norm",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
@@ -92,6 +225,8 @@ func (p *lfm2Model) Replacements() []string {
|
||||
"conv.conv", "shortconv.conv",
|
||||
"conv.in_proj", "shortconv.in_proj",
|
||||
"conv.out_proj", "shortconv.out_proj",
|
||||
"feed_forward.gate", "ffn_gate_inp",
|
||||
"feed_forward.expert_bias", "exp_probs_b.bias",
|
||||
"feed_forward.w1", "ffn_gate",
|
||||
"feed_forward.w2", "ffn_down",
|
||||
"feed_forward.w3", "ffn_up",
|
||||
|
||||
271
convert/convert_lfm2_test.go
Normal file
271
convert/convert_lfm2_test.go
Normal file
@@ -0,0 +1,271 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"io"
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
type lfm2StubTensor struct {
|
||||
tensorBase
|
||||
}
|
||||
|
||||
func newLFM2StubTensor(name string, shape []uint64) *lfm2StubTensor {
|
||||
return &lfm2StubTensor{
|
||||
tensorBase: tensorBase{
|
||||
name: name,
|
||||
shape: shape,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func (t *lfm2StubTensor) WriteTo(io.Writer) (int64, error) {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
func (t *lfm2StubTensor) Clone() Tensor {
|
||||
return &lfm2StubTensor{
|
||||
tensorBase: tensorBase{
|
||||
name: t.name,
|
||||
shape: slices.Clone(t.shape),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2MoEKV(t *testing.T) {
|
||||
var p lfm2Model
|
||||
p.ModelParameters.ModelType = "lfm2_moe"
|
||||
p.VocabSize = 65536
|
||||
p.HiddenSize = 2048
|
||||
p.NumHiddenLayers = 4
|
||||
p.MaxPositionEmbeddings = 128000
|
||||
p.IntermediateSize = 11776
|
||||
p.NumAttentionHeads = 32
|
||||
p.NumKeyValueHeads = 8
|
||||
p.LayerTypes = []string{"conv", "full_attention", "conv", "full_attention"}
|
||||
p.NormEps = 1e-5
|
||||
p.ConvLCache = 3
|
||||
p.MoEIntermediateSize = 1536
|
||||
p.NumExperts = 64
|
||||
p.NumExpertsPerToken = 4
|
||||
p.NumDenseLayers = 2
|
||||
p.RopeParameters.RopeTheta = 1_000_000
|
||||
|
||||
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
|
||||
|
||||
if got, want := kv["general.architecture"], "lfm2moe"; got != want {
|
||||
t.Fatalf("general.architecture = %v, want %v", got, want)
|
||||
}
|
||||
if got, want := kv["tokenizer.ggml.pre"], "lfm2"; got != want {
|
||||
t.Fatalf("tokenizer.ggml.pre = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
if got, want := kv["expert_count"], uint32(64); got != want {
|
||||
t.Fatalf("expert_count = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
if got, want := kv["expert_used_count"], uint32(4); got != want {
|
||||
t.Fatalf("expert_used_count = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
if got, want := kv["expert_feed_forward_length"], uint32(1536); got != want {
|
||||
t.Fatalf("expert_feed_forward_length = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
if got, want := kv["leading_dense_block_count"], uint32(2); got != want {
|
||||
t.Fatalf("leading_dense_block_count = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
if got, want := kv["expert_gating_func"], uint32(2); got != want {
|
||||
t.Fatalf("expert_gating_func = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
gotHeadCounts, ok := kv["attention.head_count_kv"].([]uint32)
|
||||
if !ok {
|
||||
t.Fatalf("attention.head_count_kv has unexpected type %T", kv["attention.head_count_kv"])
|
||||
}
|
||||
|
||||
wantHeadCounts := []uint32{0, 8, 0, 8}
|
||||
if !slices.Equal(gotHeadCounts, wantHeadCounts) {
|
||||
t.Fatalf("attention.head_count_kv = %v, want %v", gotHeadCounts, wantHeadCounts)
|
||||
}
|
||||
|
||||
if got, want := kv["rope.freq_base"], float32(1_000_000); got != want {
|
||||
t.Fatalf("rope.freq_base = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2DenseKV(t *testing.T) {
|
||||
p := lfm2Model{
|
||||
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 32000},
|
||||
HiddenSize: 1024,
|
||||
NumHiddenLayers: 2,
|
||||
MaxPositionEmbeddings: 32768,
|
||||
IntermediateSize: 4096,
|
||||
NumAttentionHeads: 16,
|
||||
NumKeyValueHeads: 4,
|
||||
LayerTypes: []string{"conv", "full_attention"},
|
||||
NormEps: 1e-5,
|
||||
ConvLCache: 3,
|
||||
RopeTheta: 10000,
|
||||
}
|
||||
|
||||
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
|
||||
|
||||
if got, want := kv["general.architecture"], "lfm2"; got != want {
|
||||
t.Fatalf("general.architecture = %v, want %v", got, want)
|
||||
}
|
||||
if got, want := kv["tokenizer.ggml.pre"], "lfm2"; got != want {
|
||||
t.Fatalf("tokenizer.ggml.pre = %v, want %v", got, want)
|
||||
}
|
||||
|
||||
if _, ok := kv["expert_count"]; ok {
|
||||
t.Fatalf("expert_count should not be set for dense lfm2")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2MoETensors(t *testing.T) {
|
||||
p := lfm2Model{
|
||||
ModelParameters: ModelParameters{ModelType: "lfm2_moe"},
|
||||
NumHiddenLayers: 4,
|
||||
NumDenseLayers: 2,
|
||||
}
|
||||
|
||||
in := []Tensor{
|
||||
newLFM2StubTensor("blk.2.feed_forward.experts.0.w1.weight", []uint64{1536, 2048}),
|
||||
newLFM2StubTensor("blk.2.feed_forward.experts.1.w1.weight", []uint64{1536, 2048}),
|
||||
newLFM2StubTensor("blk.2.feed_forward.experts.0.w2.weight", []uint64{2048, 1536}),
|
||||
newLFM2StubTensor("blk.2.feed_forward.experts.1.w2.weight", []uint64{2048, 1536}),
|
||||
newLFM2StubTensor("blk.2.feed_forward.experts.0.w3.weight", []uint64{1536, 2048}),
|
||||
newLFM2StubTensor("blk.2.feed_forward.experts.1.w3.weight", []uint64{1536, 2048}),
|
||||
newLFM2StubTensor("blk.0.shortconv.conv.weight", []uint64{2048, 1, 3}),
|
||||
}
|
||||
|
||||
out := p.Tensors(in)
|
||||
|
||||
byName := make(map[string][]uint64, len(out))
|
||||
for _, tns := range out {
|
||||
byName[tns.Name] = tns.Shape
|
||||
}
|
||||
|
||||
if got, ok := byName["blk.2.ffn_gate_exps.weight"]; !ok {
|
||||
t.Fatalf("missing merged tensor blk.2.ffn_gate_exps.weight")
|
||||
} else if !slices.Equal(got, []uint64{2, 1536, 2048}) {
|
||||
t.Fatalf("blk.2.ffn_gate_exps.weight shape = %v, want [2 1536 2048]", got)
|
||||
}
|
||||
|
||||
if got, ok := byName["blk.2.ffn_down_exps.weight"]; !ok {
|
||||
t.Fatalf("missing merged tensor blk.2.ffn_down_exps.weight")
|
||||
} else if !slices.Equal(got, []uint64{2, 2048, 1536}) {
|
||||
t.Fatalf("blk.2.ffn_down_exps.weight shape = %v, want [2 2048 1536]", got)
|
||||
}
|
||||
|
||||
if got, ok := byName["blk.2.ffn_up_exps.weight"]; !ok {
|
||||
t.Fatalf("missing merged tensor blk.2.ffn_up_exps.weight")
|
||||
} else if !slices.Equal(got, []uint64{2, 1536, 2048}) {
|
||||
t.Fatalf("blk.2.ffn_up_exps.weight shape = %v, want [2 1536 2048]", got)
|
||||
}
|
||||
|
||||
if got, ok := byName["blk.0.shortconv.conv.weight"]; !ok {
|
||||
t.Fatalf("missing shortconv tensor")
|
||||
} else if !slices.Equal(got, []uint64{2048, 3}) {
|
||||
t.Fatalf("blk.0.shortconv.conv.weight shape = %v, want [2048 3]", got)
|
||||
}
|
||||
|
||||
if _, ok := byName["blk.2.feed_forward.experts.0.w1.weight"]; ok {
|
||||
t.Fatalf("unmerged expert tensor should not be present")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2MoEReplacements(t *testing.T) {
|
||||
p := lfm2Model{}
|
||||
replacer := strings.NewReplacer(p.Replacements()...)
|
||||
|
||||
if got, want := replacer.Replace("model.layers.2.feed_forward.expert_bias"), "blk.2.exp_probs_b.bias"; got != want {
|
||||
t.Fatalf("expert bias replacement = %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if got, want := replacer.Replace("model.layers.2.feed_forward.gate.weight"), "blk.2.ffn_gate_inp.weight"; got != want {
|
||||
t.Fatalf("gate replacement = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2KVContextLengthEdgeCaseFallbackOverride(t *testing.T) {
|
||||
p := lfm2Model{
|
||||
ModelParameters: ModelParameters{ModelType: "lfm2_moe", VocabSize: 65536},
|
||||
HiddenSize: 2048,
|
||||
NumHiddenLayers: 40,
|
||||
MaxPositionEmbeddings: 128000,
|
||||
IntermediateSize: 11776,
|
||||
NumAttentionHeads: 32,
|
||||
NumKeyValueHeads: 8,
|
||||
LayerTypes: make([]string, 40),
|
||||
NormEps: 1e-5,
|
||||
ConvLCache: 3,
|
||||
MoEIntermediateSize: 1536,
|
||||
NumExperts: 64,
|
||||
NumExpertsPerToken: 4,
|
||||
NumDenseLayers: 2,
|
||||
}
|
||||
for i := 0; i < len(p.LayerTypes); i++ {
|
||||
p.LayerTypes[i] = "conv"
|
||||
}
|
||||
p.LayerTypes[2] = "full_attention"
|
||||
|
||||
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
|
||||
|
||||
if got, want := kv["context_length"], uint32(32768); got != want {
|
||||
t.Fatalf("context_length = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2KVContextLengthNoOverride(t *testing.T) {
|
||||
p := lfm2Model{
|
||||
ModelParameters: ModelParameters{ModelType: "lfm2_moe", VocabSize: 65536},
|
||||
HiddenSize: 2048,
|
||||
NumHiddenLayers: 39, // mismatch: should not trigger edge case
|
||||
MaxPositionEmbeddings: 128000,
|
||||
IntermediateSize: 11776,
|
||||
NumAttentionHeads: 32,
|
||||
NumKeyValueHeads: 8,
|
||||
LayerTypes: []string{"conv", "full_attention"},
|
||||
NormEps: 1e-5,
|
||||
ConvLCache: 3,
|
||||
MoEIntermediateSize: 1536,
|
||||
NumExperts: 64,
|
||||
NumExpertsPerToken: 4,
|
||||
NumDenseLayers: 2,
|
||||
}
|
||||
|
||||
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
|
||||
|
||||
if got, want := kv["context_length"], uint32(128000); got != want {
|
||||
t.Fatalf("context_length = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2KVFeedForwardLengthAutoAdjust(t *testing.T) {
|
||||
p := lfm2Model{
|
||||
ModelParameters: ModelParameters{ModelType: "lfm2", VocabSize: 65536},
|
||||
HiddenSize: 2048,
|
||||
NumHiddenLayers: 16,
|
||||
MaxPositionEmbeddings: 128000,
|
||||
IntermediateSize: 12288, // should be ignored when block_ff_dim is set
|
||||
BlockFFDim: 12288,
|
||||
BlockAutoAdjustFFDim: true,
|
||||
BlockMultipleOf: 256,
|
||||
BlockFFNDimMultiplier: 1.0,
|
||||
NumAttentionHeads: 32,
|
||||
NumKeyValueHeads: 8,
|
||||
LayerTypes: []string{"conv", "full_attention"},
|
||||
NormEps: 1e-5,
|
||||
ConvLCache: 3,
|
||||
}
|
||||
|
||||
kv := p.KV(&Tokenizer{Vocabulary: &Vocabulary{Model: "gpt2"}})
|
||||
|
||||
if got, want := kv["feed_forward_length"], uint32(8192); got != want {
|
||||
t.Fatalf("feed_forward_length = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
417
convert/convert_lfm2_vl.go
Normal file
417
convert/convert_lfm2_vl.go
Normal file
@@ -0,0 +1,417 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"cmp"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io/fs"
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
)
|
||||
|
||||
// lfm2VLTextModel converts the language model component of LFM2 VL checkpoints.
|
||||
type lfm2VLTextModel struct {
|
||||
TextConfig lfm2Model `json:"text_config"`
|
||||
DoImageSplitting *bool `json:"do_image_splitting"`
|
||||
DownsampleFactor uint32 `json:"downsample_factor"`
|
||||
EncoderPatchSize uint32 `json:"encoder_patch_size"`
|
||||
ImageTokenID uint32 `json:"image_token_id"`
|
||||
MaxImageTokens uint32 `json:"max_image_tokens"`
|
||||
MinImageTokens uint32 `json:"min_image_tokens"`
|
||||
MaxTiles uint32 `json:"max_tiles"`
|
||||
MinTiles uint32 `json:"min_tiles"`
|
||||
TileSize uint32 `json:"tile_size"`
|
||||
MaxPixelsTolerance float32 `json:"max_pixels_tolerance"`
|
||||
ProjectorUseLayernorm bool `json:"projector_use_layernorm"`
|
||||
ProjectorHiddenSize uint32 `json:"projector_hidden_size"`
|
||||
ProjectorHiddenAct string `json:"projector_hidden_act"`
|
||||
UseImageSpecialTokens *bool `json:"use_image_special_tokens"`
|
||||
UseThumbnail *bool `json:"use_thumbnail"`
|
||||
VisionConfig struct {
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NumChannels uint32 `json:"num_channels"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
LayerNormEpsilon float32 `json:"layer_norm_eps"`
|
||||
} `json:"vision_config"`
|
||||
Processor struct {
|
||||
ImageProcessor struct {
|
||||
DoImageSplitting *bool `json:"do_image_splitting"`
|
||||
DownsampleFactor uint32 `json:"downsample_factor"`
|
||||
MaxImageTokens uint32 `json:"max_image_tokens"`
|
||||
MinImageTokens uint32 `json:"min_image_tokens"`
|
||||
MaxTiles uint32 `json:"max_tiles"`
|
||||
MinTiles uint32 `json:"min_tiles"`
|
||||
MaxPixelsTol float32 `json:"max_pixels_tolerance"`
|
||||
TileSize uint32 `json:"tile_size"`
|
||||
UseThumbnail *bool `json:"use_thumbnail"`
|
||||
ImageMean []float32 `json:"image_mean"`
|
||||
ImageStd []float32 `json:"image_std"`
|
||||
Size struct {
|
||||
Height uint32 `json:"height"`
|
||||
Width uint32 `json:"width"`
|
||||
} `json:"size"`
|
||||
} `json:"image_processor"`
|
||||
}
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) textModel() *lfm2Model {
|
||||
return &p.TextConfig
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) specialTokenTypes() []string {
|
||||
return p.textModel().specialTokenTypes()
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "processor_config.json")
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrNotExist) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
return json.Unmarshal(bts, &p.Processor)
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) visionImageSize() uint32 {
|
||||
// LFM2-VL image processor operates on 512 tiles and downsamples by factor 2
|
||||
// before projection. Keep a fixed square image size compatible with position
|
||||
// embeddings and the simplified runtime image pipeline.
|
||||
tile := cmp.Or(
|
||||
p.Processor.ImageProcessor.TileSize,
|
||||
p.Processor.ImageProcessor.Size.Height,
|
||||
p.Processor.ImageProcessor.Size.Width,
|
||||
uint32(512),
|
||||
)
|
||||
downsample := cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
|
||||
if downsample == 0 {
|
||||
return tile
|
||||
}
|
||||
|
||||
return max(uint32(1), tile/downsample)
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) KV(t *Tokenizer) KV {
|
||||
kv := p.textModel().KV(t)
|
||||
|
||||
boolOr := func(defaultValue bool, values ...*bool) bool {
|
||||
for _, v := range values {
|
||||
if v != nil {
|
||||
return *v
|
||||
}
|
||||
}
|
||||
return defaultValue
|
||||
}
|
||||
|
||||
kv["vision.block_count"] = cmp.Or(p.VisionConfig.NumHiddenLayers, uint32(27))
|
||||
kv["vision.embedding_length"] = cmp.Or(p.VisionConfig.HiddenSize, uint32(1152))
|
||||
kv["vision.feed_forward_length"] = cmp.Or(p.VisionConfig.IntermediateSize, uint32(4304))
|
||||
kv["vision.attention.head_count"] = cmp.Or(p.VisionConfig.NumAttentionHeads, uint32(16))
|
||||
kv["vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionConfig.LayerNormEpsilon, float32(1e-6))
|
||||
kv["vision.patch_size"] = cmp.Or(p.VisionConfig.PatchSize, p.EncoderPatchSize, uint32(16))
|
||||
kv["vision.num_channels"] = cmp.Or(p.VisionConfig.NumChannels, uint32(3))
|
||||
kv["vision.image_size"] = p.visionImageSize()
|
||||
kv["vision.projector.scale_factor"] = cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
|
||||
kv["vision.projector.use_layernorm"] = p.ProjectorUseLayernorm
|
||||
kv["vision.do_image_splitting"] = boolOr(true, p.DoImageSplitting, p.Processor.ImageProcessor.DoImageSplitting)
|
||||
kv["vision.min_tiles"] = cmp.Or(p.MinTiles, p.Processor.ImageProcessor.MinTiles, uint32(2))
|
||||
kv["vision.max_tiles"] = cmp.Or(p.MaxTiles, p.Processor.ImageProcessor.MaxTiles, uint32(10))
|
||||
kv["vision.tile_size"] = cmp.Or(p.TileSize, p.Processor.ImageProcessor.TileSize, uint32(512))
|
||||
kv["vision.min_image_tokens"] = cmp.Or(p.MinImageTokens, p.Processor.ImageProcessor.MinImageTokens, uint32(64))
|
||||
kv["vision.max_image_tokens"] = cmp.Or(p.MaxImageTokens, p.Processor.ImageProcessor.MaxImageTokens, uint32(256))
|
||||
kv["vision.max_pixels_tolerance"] = cmp.Or(p.MaxPixelsTolerance, p.Processor.ImageProcessor.MaxPixelsTol, float32(2.0))
|
||||
kv["vision.use_thumbnail"] = boolOr(true, p.UseThumbnail, p.Processor.ImageProcessor.UseThumbnail)
|
||||
kv["vision.use_image_special_tokens"] = boolOr(true, p.UseImageSpecialTokens)
|
||||
kv["vision.image_mean"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageMean, []float32{0.5, 0.5, 0.5}))
|
||||
kv["vision.image_std"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageStd, []float32{0.5, 0.5, 0.5}))
|
||||
kv["vision.image_token_id"] = cmp.Or(p.ImageTokenID, uint32(396))
|
||||
|
||||
setVisionTokenID := func(k, token string) {
|
||||
if t == nil || t.Vocabulary == nil {
|
||||
return
|
||||
}
|
||||
for i, v := range t.Vocabulary.Tokens {
|
||||
if v == token {
|
||||
kv[k] = uint32(i)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
setVisionTokenID("vision.image_start_token_id", "<|image_start|>")
|
||||
setVisionTokenID("vision.image_end_token_id", "<|image_end|>")
|
||||
setVisionTokenID("vision.image_thumbnail_token_id", "<|img_thumbnail|>")
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
patchSize := int(cmp.Or(p.VisionConfig.PatchSize, p.EncoderPatchSize, uint32(16)))
|
||||
numChannels := int(cmp.Or(p.VisionConfig.NumChannels, uint32(3)))
|
||||
|
||||
for _, t := range ts {
|
||||
if t.Name() == "v.patch_embd.weight" {
|
||||
shape := t.Shape()
|
||||
if len(shape) == 2 {
|
||||
inputDim := uint64(numChannels * patchSize * patchSize)
|
||||
if shape[1] == inputDim {
|
||||
channels := numChannels
|
||||
patch := patchSize
|
||||
t.SetRepacker(func(_ string, data []float32, srcShape []uint64) ([]float32, error) {
|
||||
return repackPatchEmbeddingWeight(data, srcShape, channels, patch)
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
out := p.textModel().Tensors(ts)
|
||||
for _, t := range out {
|
||||
if t.Name == "v.patch_embd.weight" && len(t.Shape) == 2 {
|
||||
t.Shape = []uint64{t.Shape[0], uint64(numChannels), uint64(patchSize), uint64(patchSize)}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func (p *lfm2VLTextModel) Replacements() []string {
|
||||
out := make([]string, 0, 96)
|
||||
|
||||
addText := func(from, to string) {
|
||||
out = append(out, from, to)
|
||||
if strings.HasPrefix(from, "model.") {
|
||||
suffix := strings.TrimPrefix(from, "model.")
|
||||
out = append(out,
|
||||
"model.language_model."+suffix, to,
|
||||
"model.language_model.model."+suffix, to,
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
base := p.textModel().Replacements()
|
||||
for i := 0; i+1 < len(base); i += 2 {
|
||||
addText(base[i], base[i+1])
|
||||
}
|
||||
|
||||
// Vision tower + multimodal projector tensors (single-file conversion).
|
||||
out = append(out,
|
||||
"model.vision_tower.vision_model.embeddings.patch_embedding", "v.patch_embd",
|
||||
"model.vision_tower.vision_model.embeddings.position_embedding", "v.position_embd",
|
||||
"model.vision_tower.vision_model.encoder.layers", "v.blk",
|
||||
"model.vision_tower.vision_model.post_layernorm", "v.post_ln",
|
||||
"model.multi_modal_projector.layer_norm", "mm.layer_norm",
|
||||
"model.multi_modal_projector.linear_1", "mm.1",
|
||||
"model.multi_modal_projector.linear_2", "mm.2",
|
||||
"self_attn.q_proj", "attn_q",
|
||||
"self_attn.k_proj", "attn_k",
|
||||
"self_attn.v_proj", "attn_v",
|
||||
"self_attn.out_proj", "attn_out",
|
||||
"layer_norm1", "ln1",
|
||||
"layer_norm2", "ln2",
|
||||
"mlp.fc1", "ffn_up",
|
||||
"mlp.fc2", "ffn_down",
|
||||
)
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// lfm2VLProjectorModel converts the vision encoder + projector component of LFM2 VL checkpoints.
|
||||
type lfm2VLProjectorModel struct {
|
||||
ModelParameters
|
||||
DownsampleFactor uint32 `json:"downsample_factor"`
|
||||
ProjectorHiddenDim uint32 `json:"projector_hidden_size"`
|
||||
VisionModel struct {
|
||||
HiddenSize uint32 `json:"hidden_size"`
|
||||
IntermediateSize uint32 `json:"intermediate_size"`
|
||||
NumAttentionHeads uint32 `json:"num_attention_heads"`
|
||||
NumHiddenLayers uint32 `json:"num_hidden_layers"`
|
||||
NumChannels uint32 `json:"num_channels"`
|
||||
PatchSize uint32 `json:"patch_size"`
|
||||
LayerNormEpsilon float32 `json:"layer_norm_eps"`
|
||||
ImageSize uint32 `json:"image_size"`
|
||||
} `json:"vision_config"`
|
||||
Processor struct {
|
||||
ImageProcessor struct {
|
||||
DownsampleFactor uint32 `json:"downsample_factor"`
|
||||
TileSize uint32 `json:"tile_size"`
|
||||
ImageMean []float32 `json:"image_mean"`
|
||||
ImageStd []float32 `json:"image_std"`
|
||||
Size struct {
|
||||
Height uint32 `json:"height"`
|
||||
Width uint32 `json:"width"`
|
||||
} `json:"size"`
|
||||
} `json:"image_processor"`
|
||||
}
|
||||
}
|
||||
|
||||
var (
|
||||
_ ModelConverter = (*lfm2VLTextModel)(nil)
|
||||
_ ModelConverter = (*lfm2VLProjectorModel)(nil)
|
||||
_ moreParser = (*lfm2VLTextModel)(nil)
|
||||
_ moreParser = (*lfm2VLProjectorModel)(nil)
|
||||
)
|
||||
|
||||
func (p *lfm2VLProjectorModel) parseMore(fsys fs.FS) error {
|
||||
bts, err := fs.ReadFile(fsys, "processor_config.json")
|
||||
if err != nil {
|
||||
if errors.Is(err, fs.ErrNotExist) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
return json.Unmarshal(bts, &p.Processor)
|
||||
}
|
||||
|
||||
func (p *lfm2VLProjectorModel) imageSize() uint32 {
|
||||
if p.VisionModel.ImageSize > 0 {
|
||||
return p.VisionModel.ImageSize
|
||||
}
|
||||
|
||||
downsample := cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
|
||||
baseSize := cmp.Or(
|
||||
p.Processor.ImageProcessor.TileSize,
|
||||
p.Processor.ImageProcessor.Size.Height,
|
||||
p.Processor.ImageProcessor.Size.Width,
|
||||
uint32(256),
|
||||
)
|
||||
if downsample == 0 {
|
||||
return baseSize
|
||||
}
|
||||
|
||||
return max(uint32(1), baseSize/downsample)
|
||||
}
|
||||
|
||||
func (p *lfm2VLProjectorModel) KV(_ *Tokenizer) KV {
|
||||
kv := KV{
|
||||
"general.architecture": "clip",
|
||||
"general.type": "mmproj",
|
||||
"general.file_type": uint32(1),
|
||||
"general.quantization_version": uint32(2),
|
||||
"clip.has_vision_encoder": true,
|
||||
"clip.projector_type": "lfm2",
|
||||
"clip.use_gelu": true,
|
||||
}
|
||||
|
||||
kv["clip.vision.block_count"] = cmp.Or(p.VisionModel.NumHiddenLayers, uint32(27))
|
||||
kv["clip.vision.embedding_length"] = cmp.Or(p.VisionModel.HiddenSize, uint32(1152))
|
||||
kv["clip.vision.feed_forward_length"] = cmp.Or(p.VisionModel.IntermediateSize, uint32(4304))
|
||||
kv["clip.vision.attention.head_count"] = cmp.Or(p.VisionModel.NumAttentionHeads, uint32(16))
|
||||
kv["clip.vision.attention.layer_norm_epsilon"] = cmp.Or(p.VisionModel.LayerNormEpsilon, float32(1e-6))
|
||||
kv["clip.vision.patch_size"] = cmp.Or(p.VisionModel.PatchSize, uint32(16))
|
||||
kv["clip.vision.image_size"] = p.imageSize()
|
||||
kv["clip.vision.projection_dim"] = cmp.Or(p.ProjectorHiddenDim, uint32(2048))
|
||||
kv["clip.vision.projector.scale_factor"] = cmp.Or(p.DownsampleFactor, p.Processor.ImageProcessor.DownsampleFactor, uint32(2))
|
||||
kv["clip.vision.image_mean"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageMean, []float32{0.5, 0.5, 0.5}))
|
||||
kv["clip.vision.image_std"] = slices.Clone(defaultFloat32Slice(p.Processor.ImageProcessor.ImageStd, []float32{0.5, 0.5, 0.5}))
|
||||
|
||||
return kv
|
||||
}
|
||||
|
||||
// defaultFloat32Slice returns v when it holds at least one element and
// fallback otherwise. (cmp.Or cannot be used here: slices aren't comparable.)
func defaultFloat32Slice(v, fallback []float32) []float32 {
	if len(v) == 0 {
		return fallback
	}
	return v
}
|
||||
|
||||
func (p *lfm2VLProjectorModel) Tensors(ts []Tensor) []*ggml.Tensor {
|
||||
var out []*ggml.Tensor
|
||||
|
||||
numChannels := cmp.Or(p.VisionModel.NumChannels, uint32(3))
|
||||
patchSize := cmp.Or(p.VisionModel.PatchSize, uint32(16))
|
||||
|
||||
for _, t := range ts {
|
||||
name := t.Name()
|
||||
if !(strings.HasPrefix(name, "v.") || strings.HasPrefix(name, "mm.")) {
|
||||
continue
|
||||
}
|
||||
|
||||
shape := t.Shape()
|
||||
if name == "v.patch_embd.weight" && len(shape) == 2 {
|
||||
inputDim := uint64(numChannels * patchSize * patchSize)
|
||||
if shape[1] == inputDim {
|
||||
shape = []uint64{shape[0], uint64(numChannels), uint64(patchSize), uint64(patchSize)}
|
||||
channels := int(numChannels)
|
||||
patch := int(patchSize)
|
||||
t.SetRepacker(func(_ string, data []float32, srcShape []uint64) ([]float32, error) {
|
||||
return repackPatchEmbeddingWeight(data, srcShape, channels, patch)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
out = append(out, &ggml.Tensor{
|
||||
Name: name,
|
||||
Kind: t.Kind(),
|
||||
Shape: slices.Clone(shape),
|
||||
WriterTo: t,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
// Replacements maps Hugging Face tensor-name fragments for the LFM2-VL
// vision tower and multimodal projector to their GGUF equivalents. The
// pairs are (old, new) and are consumed by strings.NewReplacer elsewhere
// in the converter, so more specific prefixes must precede shorter ones
// they contain — do not reorder casually.
func (p *lfm2VLProjectorModel) Replacements() []string {
	return []string{
		// Projector MLP.
		"model.multi_modal_projector.linear_1", "mm.1",
		"model.multi_modal_projector.linear_2", "mm.2",
		// Vision tower embeddings and encoder blocks.
		"model.vision_tower.vision_model.embeddings.patch_embedding", "v.patch_embd",
		"model.vision_tower.vision_model.embeddings.position_embedding", "v.position_embd",
		"model.vision_tower.vision_model.encoder.layers", "v.blk",
		// Per-block attention / norm / MLP sub-names.
		"self_attn.q_proj", "attn_q",
		"self_attn.k_proj", "attn_k",
		"self_attn.v_proj", "attn_v",
		"self_attn.out_proj", "attn_out",
		"layer_norm1", "ln1",
		"layer_norm2", "ln2",
		"mlp.fc1", "ffn_up",
		"mlp.fc2", "ffn_down",
		"model.vision_tower.vision_model.post_layernorm", "v.post_ln",
	}
}
|
||||
|
||||
func repackPatchEmbeddingWeight(data []float32, srcShape []uint64, channels, patch int) ([]float32, error) {
|
||||
if len(srcShape) != 2 {
|
||||
return nil, fmt.Errorf("invalid patch embedding shape rank: %d", len(srcShape))
|
||||
}
|
||||
|
||||
outDim := int(srcShape[0])
|
||||
flatInputDim := int(srcShape[1])
|
||||
expectedInputDim := channels * patch * patch
|
||||
if flatInputDim != expectedInputDim {
|
||||
return nil, fmt.Errorf("invalid patch embedding input dim: got %d, want %d", flatInputDim, expectedInputDim)
|
||||
}
|
||||
|
||||
expectedSize := outDim * flatInputDim
|
||||
if len(data) != expectedSize {
|
||||
return nil, fmt.Errorf("invalid patch embedding data size: got %d, want %d", len(data), expectedSize)
|
||||
}
|
||||
|
||||
repacked := make([]float32, len(data))
|
||||
perChannel := patch * patch
|
||||
|
||||
for o := range outDim {
|
||||
inBase := o * flatInputDim
|
||||
outBase := o * flatInputDim
|
||||
|
||||
for y := range patch {
|
||||
for x := range patch {
|
||||
inPixelBase := inBase + (y*patch+x)*channels
|
||||
for c := range channels {
|
||||
src := inPixelBase + c
|
||||
dst := outBase + c*perChannel + y*patch + x
|
||||
repacked[dst] = data[src]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return repacked, nil
|
||||
}
|
||||
249
convert/convert_lfm2_vl_test.go
Normal file
249
convert/convert_lfm2_vl_test.go
Normal file
@@ -0,0 +1,249 @@
|
||||
package convert
|
||||
|
||||
import (
|
||||
"slices"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestLFM2VLTextModelKVUsesTextConfig verifies that lfm2VLTextModel.KV
// sources the language-model keys from TextConfig, the vision keys from
// VisionConfig/Processor, and the image special-token ids from the
// tokenizer vocabulary.
func TestLFM2VLTextModelKVUsesTextConfig(t *testing.T) {
	p := lfm2VLTextModel{
		TextConfig: lfm2Model{
			ModelParameters:       ModelParameters{ModelType: "lfm2", VocabSize: 65536},
			HiddenSize:            2048,
			NumHiddenLayers:       16,
			MaxPositionEmbeddings: 128000,
			IntermediateSize:      12288,
			BlockFFDim:            12288,
			BlockAutoAdjustFFDim:  true,
			BlockMultipleOf:       256,
			BlockFFNDimMultiplier: 1.0,
			NumAttentionHeads:     32,
			NumKeyValueHeads:      8,
			LayerTypes:            []string{"conv", "full_attention"},
			NormEps:               1e-5,
			ConvLCache:            3,
		},
		DownsampleFactor: 2,
		// Anonymous struct literal: the field set must mirror the
		// VisionConfig declaration on lfm2VLTextModel exactly.
		VisionConfig: struct {
			HiddenSize        uint32  `json:"hidden_size"`
			IntermediateSize  uint32  `json:"intermediate_size"`
			NumAttentionHeads uint32  `json:"num_attention_heads"`
			NumHiddenLayers   uint32  `json:"num_hidden_layers"`
			NumChannels       uint32  `json:"num_channels"`
			PatchSize         uint32  `json:"patch_size"`
			LayerNormEpsilon  float32 `json:"layer_norm_eps"`
		}{
			HiddenSize:        1152,
			IntermediateSize:  4304,
			NumAttentionHeads: 16,
			NumHiddenLayers:   27,
			NumChannels:       3,
			PatchSize:         16,
			LayerNormEpsilon:  1e-6,
		},
	}
	p.Processor.ImageProcessor.TileSize = 512
	p.Processor.ImageProcessor.ImageMean = []float32{0.5, 0.5, 0.5}
	p.Processor.ImageProcessor.ImageStd = []float32{0.5, 0.5, 0.5}

	kv := p.KV(&Tokenizer{
		Vocabulary: &Vocabulary{
			Model:  "gpt2",
			Tokens: []string{"<|pad|>", "<image>", "<|image_start|>", "<|image_end|>", "<|img_thumbnail|>"},
		},
	})

	if got, want := kv["general.architecture"], "lfm2"; got != want {
		t.Fatalf("general.architecture = %v, want %v", got, want)
	}

	// NOTE(review): 8192 != IntermediateSize (12288); presumably KV
	// recomputes the FFN width from BlockAutoAdjustFFDim/BlockMultipleOf —
	// confirm against lfm2Model.KV.
	if got, want := kv["feed_forward_length"], uint32(8192); got != want {
		t.Fatalf("feed_forward_length = %v, want %v", got, want)
	}

	if got, want := kv["vision.block_count"], uint32(27); got != want {
		t.Fatalf("vision.block_count = %v, want %v", got, want)
	}

	// 512-pixel tiles / downsample factor 2.
	if got, want := kv["vision.image_size"], uint32(256); got != want {
		t.Fatalf("vision.image_size = %v, want %v", got, want)
	}

	// NOTE(review): 396 does not match the toy vocabulary index of
	// "<image>" (1); the converter appears to resolve the id another way —
	// confirm against the KV implementation.
	if got, want := kv["vision.image_token_id"], uint32(396); got != want {
		t.Fatalf("vision.image_token_id = %v, want %v", got, want)
	}

	if got, want := kv["vision.image_start_token_id"], uint32(2); got != want {
		t.Fatalf("vision.image_start_token_id = %v, want %v", got, want)
	}

	// Tiling/thumbnail defaults emitted when the config does not override them.
	if got, want := kv["vision.do_image_splitting"], true; got != want {
		t.Fatalf("vision.do_image_splitting = %v, want %v", got, want)
	}
	if got, want := kv["vision.min_tiles"], uint32(2); got != want {
		t.Fatalf("vision.min_tiles = %v, want %v", got, want)
	}
	if got, want := kv["vision.max_tiles"], uint32(10); got != want {
		t.Fatalf("vision.max_tiles = %v, want %v", got, want)
	}
	if got, want := kv["vision.tile_size"], uint32(512); got != want {
		t.Fatalf("vision.tile_size = %v, want %v", got, want)
	}
	if got, want := kv["vision.use_thumbnail"], true; got != want {
		t.Fatalf("vision.use_thumbnail = %v, want %v", got, want)
	}
	if got, want := kv["vision.use_image_special_tokens"], true; got != want {
		t.Fatalf("vision.use_image_special_tokens = %v, want %v", got, want)
	}
}
|
||||
|
||||
func TestLFM2VLTextModelTensorsIncludeVision(t *testing.T) {
|
||||
p := lfm2VLTextModel{}
|
||||
p.VisionConfig.PatchSize = 16
|
||||
p.VisionConfig.NumChannels = 3
|
||||
input := []Tensor{
|
||||
newLFM2StubTensor("model.embed_tokens.weight", []uint64{65536, 2048}),
|
||||
newLFM2StubTensor("model.layers.0.ffn_norm.weight", []uint64{2048}),
|
||||
newLFM2StubTensor("v.patch_embd.weight", []uint64{1152, 768}),
|
||||
newLFM2StubTensor("v.blk.0.attn_q.weight", []uint64{1152, 1152}),
|
||||
newLFM2StubTensor("mm.1.weight", []uint64{2048, 4608}),
|
||||
}
|
||||
|
||||
out := p.Tensors(input)
|
||||
if len(out) == 0 {
|
||||
t.Fatal("expected non-empty tensor list")
|
||||
}
|
||||
|
||||
foundPatch := false
|
||||
foundVision := false
|
||||
for _, tns := range out {
|
||||
if tns.Name == "v.patch_embd.weight" {
|
||||
foundPatch = true
|
||||
if !slices.Equal(tns.Shape, []uint64{1152, 3, 16, 16}) {
|
||||
t.Fatalf("v.patch_embd.weight shape = %v, want [1152 3 16 16]", tns.Shape)
|
||||
}
|
||||
}
|
||||
if strings.HasPrefix(tns.Name, "v.") || strings.HasPrefix(tns.Name, "mm.") {
|
||||
foundVision = true
|
||||
}
|
||||
}
|
||||
|
||||
if !foundPatch {
|
||||
t.Fatal("expected v.patch_embd.weight in output tensors")
|
||||
}
|
||||
if !foundVision {
|
||||
t.Fatal("expected at least one vision/projector tensor in output")
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2VLTextModelReplacements(t *testing.T) {
|
||||
p := lfm2VLTextModel{}
|
||||
r := strings.NewReplacer(p.Replacements()...)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
in string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "language_model_embed_tokens",
|
||||
in: "model.language_model.embed_tokens.weight",
|
||||
want: "token_embd.weight",
|
||||
},
|
||||
{
|
||||
name: "language_model_layers",
|
||||
in: "model.language_model.layers.2.self_attn.q_proj.weight",
|
||||
want: "blk.2.attn_q.weight",
|
||||
},
|
||||
{
|
||||
name: "nested_language_model_prefix",
|
||||
in: "model.language_model.model.embedding_norm.weight",
|
||||
want: "token_embd_norm.weight",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if got := r.Replace(tt.in); got != tt.want {
|
||||
t.Fatalf("replacement(%q) = %q, want %q", tt.in, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2VLProjectorKV(t *testing.T) {
|
||||
p := lfm2VLProjectorModel{
|
||||
DownsampleFactor: 2,
|
||||
ProjectorHiddenDim: 2048,
|
||||
}
|
||||
p.VisionModel.NumHiddenLayers = 27
|
||||
p.VisionModel.HiddenSize = 1152
|
||||
p.VisionModel.IntermediateSize = 4304
|
||||
p.VisionModel.NumAttentionHeads = 16
|
||||
p.VisionModel.PatchSize = 16
|
||||
p.VisionModel.LayerNormEpsilon = 1e-6
|
||||
p.Processor.ImageProcessor.TileSize = 512
|
||||
p.Processor.ImageProcessor.ImageMean = []float32{0.5, 0.5, 0.5}
|
||||
p.Processor.ImageProcessor.ImageStd = []float32{0.5, 0.5, 0.5}
|
||||
|
||||
kv := p.KV(nil)
|
||||
|
||||
if got, want := kv["general.architecture"], "clip"; got != want {
|
||||
t.Fatalf("general.architecture = %v, want %v", got, want)
|
||||
}
|
||||
if got, want := kv["clip.projector_type"], "lfm2"; got != want {
|
||||
t.Fatalf("clip.projector_type = %v, want %v", got, want)
|
||||
}
|
||||
if got, want := kv["clip.vision.image_size"], uint32(256); got != want {
|
||||
t.Fatalf("clip.vision.image_size = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestLFM2VLProjectorTensorsPatchReshape(t *testing.T) {
|
||||
p := lfm2VLProjectorModel{}
|
||||
p.VisionModel.NumChannels = 3
|
||||
p.VisionModel.PatchSize = 16
|
||||
|
||||
input := []Tensor{
|
||||
newLFM2StubTensor("v.patch_embd.weight", []uint64{1152, 768}),
|
||||
newLFM2StubTensor("mm.1.weight", []uint64{2048, 4608}),
|
||||
newLFM2StubTensor("model.embed_tokens.weight", []uint64{65536, 2048}),
|
||||
}
|
||||
|
||||
out := p.Tensors(input)
|
||||
if len(out) != 2 {
|
||||
t.Fatalf("expected 2 tensors, got %d", len(out))
|
||||
}
|
||||
|
||||
var patchShape []uint64
|
||||
for _, tns := range out {
|
||||
if tns.Name == "v.patch_embd.weight" {
|
||||
patchShape = tns.Shape
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if !slices.Equal(patchShape, []uint64{1152, 3, 16, 16}) {
|
||||
t.Fatalf("v.patch_embd.weight shape = %v, want [1152 3 16 16]", patchShape)
|
||||
}
|
||||
}
|
||||
|
||||
func TestRepackPatchEmbeddingWeight(t *testing.T) {
|
||||
data := []float32{
|
||||
0, 1, // y=0,x=0
|
||||
2, 3, // y=0,x=1
|
||||
4, 5, // y=1,x=0
|
||||
6, 7, // y=1,x=1
|
||||
}
|
||||
|
||||
got, err := repackPatchEmbeddingWeight(data, []uint64{1, 8}, 2, 2)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
|
||||
want := []float32{0, 2, 4, 6, 1, 3, 5, 7}
|
||||
if !slices.Equal(got, want) {
|
||||
t.Fatalf("repacked data = %v, want %v", got, want)
|
||||
}
|
||||
}
|
||||
@@ -212,8 +212,13 @@ type tokenizer struct {
|
||||
|
||||
PreTokenizer struct {
|
||||
PreTokenizers []struct {
|
||||
Type string `json:"type"`
|
||||
Pattern struct {
|
||||
Type string `json:"type"`
|
||||
Behavior string `json:"behavior"`
|
||||
Invert bool `json:"invert"`
|
||||
AddPrefixSpace bool `json:"add_prefix_space"`
|
||||
TrimOffsets bool `json:"trim_offsets"`
|
||||
UseRegex bool `json:"use_regex"`
|
||||
Pattern struct {
|
||||
Regex string `json:"Regex"`
|
||||
} `json:"pattern"`
|
||||
} `json:"pretokenizers"`
|
||||
|
||||
@@ -191,6 +191,84 @@ func TestParseTokenizer(t *testing.T) {
|
||||
Pre: "default",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "llama-bpe pretokenizer and control tokens",
|
||||
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
|
||||
"tokenizer.json": strings.NewReader(`{
|
||||
"added_tokens": [
|
||||
{"id": 1, "content": "<|startoftext|>", "special": true},
|
||||
{"id": 6, "content": "<|im_start|>", "special": true},
|
||||
{"id": 7, "content": "<|im_end|>", "special": true},
|
||||
{"id": 8, "content": "<|tool_list_start|>", "special": true},
|
||||
{"id": 9, "content": "<|tool_list_end|>", "special": true},
|
||||
{"id": 10, "content": "<|tool_call_start|>", "special": true},
|
||||
{"id": 11, "content": "<|tool_call_end|>", "special": true},
|
||||
{"id": 12, "content": "<|tool_response_start|>", "special": true},
|
||||
{"id": 13, "content": "<|tool_response_end|>", "special": true},
|
||||
{"id": 396, "content": "<image>", "special": true},
|
||||
{"id": 64400, "content": "<think>", "special": true},
|
||||
{"id": 64401, "content": "</think>", "special": true}
|
||||
],
|
||||
"model": {
|
||||
"vocab": {
|
||||
"<|startoftext|>": 1,
|
||||
"<|im_start|>": 6,
|
||||
"<|im_end|>": 7,
|
||||
"<|tool_list_start|>": 8,
|
||||
"<|tool_list_end|>": 9,
|
||||
"<|tool_call_start|>": 10,
|
||||
"<|tool_call_end|>": 11,
|
||||
"<|tool_response_start|>": 12,
|
||||
"<|tool_response_end|>": 13,
|
||||
"<image>": 396,
|
||||
"<think>": 64400,
|
||||
"</think>": 64401
|
||||
}
|
||||
},
|
||||
"pre_tokenizer": {
|
||||
"type": "Sequence",
|
||||
"pretokenizers": [
|
||||
{
|
||||
"type": "Split",
|
||||
"pattern": {
|
||||
"Regex": "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||
},
|
||||
"behavior": "Isolated",
|
||||
"invert": false
|
||||
},
|
||||
{
|
||||
"type": "ByteLevel",
|
||||
"add_prefix_space": false,
|
||||
"trim_offsets": true,
|
||||
"use_regex": false
|
||||
}
|
||||
]
|
||||
}
|
||||
}`),
|
||||
}),
|
||||
want: &Tokenizer{
|
||||
Vocabulary: &Vocabulary{
|
||||
Model: "gpt2",
|
||||
Tokens: []string{
|
||||
"<|startoftext|>",
|
||||
"<|im_start|>",
|
||||
"<|im_end|>",
|
||||
"<|tool_list_start|>",
|
||||
"<|tool_list_end|>",
|
||||
"<|tool_call_start|>",
|
||||
"<|tool_call_end|>",
|
||||
"<|tool_response_start|>",
|
||||
"<|tool_response_end|>",
|
||||
"<image>",
|
||||
"<think>",
|
||||
"</think>",
|
||||
},
|
||||
Scores: []float32{1, 6, 7, 8, 9, 10, 11, 12, 13, 396, 64400, 64401},
|
||||
Types: []int32{3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
|
||||
},
|
||||
Pre: "llama-bpe",
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "list string merges",
|
||||
fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
|
||||
|
||||
Reference in New Issue
Block a user