package convert import ( "bytes" "encoding/binary" "encoding/json" "io" "os" "path/filepath" "slices" "strings" "testing" ) func TestHybridPatternUnmarshal(t *testing.T) { t.Run("string", func(t *testing.T) { var p hybridPattern if err := json.Unmarshal([]byte(`"MEM*"`), &p); err != nil { t.Fatal(err) } if got, want := string(p), "MEM*"; got != want { t.Fatalf("unexpected pattern: got %q want %q", got, want) } }) t.Run("array", func(t *testing.T) { var p hybridPattern if err := json.Unmarshal([]byte(`["M","E","M","*"]`), &p); err != nil { t.Fatal(err) } if got, want := string(p), "MEM*"; got != want { t.Fatalf("unexpected pattern: got %q want %q", got, want) } }) } func TestNemotronHLayerArrays(t *testing.T) { m := &nemotronHModel{ NumHiddenLayers: 5, NumAttentionHeads: 32, NumKeyValueHeads: 8, HybridOverridePattern: "MEM*E", NRoutedExperts: 128, NumExpertsPerTok: 6, MoEIntermediateSize: 1856, } headsKV, ffn, err := m.layerArrays() if err != nil { t.Fatal(err) } if got, want := headsKV, []uint32{0, 0, 0, 8, 0}; !slices.Equal(got, want) { t.Fatalf("unexpected head_count_kv: got %v want %v", got, want) } if got, want := ffn, []uint32{0, 1856, 0, 0, 1856}; !slices.Equal(got, want) { t.Fatalf("unexpected feed_forward_length: got %v want %v", got, want) } } func TestNemotronHKV(t *testing.T) { m := &nemotronHModel{ MaxPositionEmbeddings: 1048576, HiddenSize: 2688, NumHiddenLayers: 5, NumAttentionHeads: 32, NumKeyValueHeads: 2, HeadDim: 128, LayerNormEpsilon: 1e-5, RopeTheta: 10000, PartialRotaryFactor: 0.5, ConvKernel: 4, SSMStateSize: 128, MambaNumHeads: 64, MambaHeadDim: 64, NGroups: 8, HybridOverridePattern: "MEM*E", NRoutedExperts: 128, NSharedExperts: 1, NumExpertsPerTok: 6, MoEIntermediateSize: 1856, MoESharedExpertIntermediate: 3712, NormTopKProb: true, RoutedScalingFactor: 2.5, } if err := m.parseMore(nil); err != nil { t.Fatal(err) } kv := m.KV(&Tokenizer{Vocabulary: &Vocabulary{}}) if got, want := kv["general.architecture"], "nemotron_h_moe"; got != want { t.Fatalf("unexpected architecture: got %v want %v", got, want) } headCountKV, ok := kv["attention.head_count_kv"].([]uint32) if !ok { t.Fatalf("attention.head_count_kv has unexpected type: %T", kv["attention.head_count_kv"]) } if got, want := headCountKV, []uint32{0, 0, 0, 2, 0}; !slices.Equal(got, want) { t.Fatalf("unexpected attention.head_count_kv: got %v want %v", got, want) } ffnLength, ok := kv["feed_forward_length"].([]uint32) if !ok { t.Fatalf("feed_forward_length has unexpected type: %T", kv["feed_forward_length"]) } if got, want := ffnLength, []uint32{0, 1856, 0, 0, 1856}; !slices.Equal(got, want) { t.Fatalf("unexpected feed_forward_length: got %v want %v", got, want) } } func TestNemotronHTensorsTransforms(t *testing.T) { m := &nemotronHModel{NGroups: 8} in := []Tensor{ &fakeTensor{ name: "blk.0.ssm_a", shape: []uint64{4}, data: []float32{0, 1, 2, 3}, }, &fakeTensor{ name: "blk.0.ssm_d", shape: []uint64{4}, data: []float32{0, 1, 2, 3}, }, &fakeTensor{ name: "blk.0.ssm_norm.weight", shape: []uint64{16}, data: make([]float32, 16), }, &fakeTensor{ name: "blk.0.ssm_conv1d.weight", shape: []uint64{10, 1, 4}, data: make([]float32, 40), }, } out := m.Tensors(in) if len(out) != len(in) { t.Fatalf("unexpected output tensor count: got %d want %d", len(out), len(in)) } got := map[string]struct { shape []uint64 writer io.WriterTo }{} for _, t := range out { got[t.Name] = struct { shape []uint64 writer io.WriterTo }{shape: t.Shape, writer: t.WriterTo} } if shape := got["blk.0.ssm_a"].shape; !slices.Equal(shape, []uint64{4, 1}) { t.Fatalf("unexpected ssm_a shape: %v", shape) } if shape := got["blk.0.ssm_d"].shape; !slices.Equal(shape, []uint64{4, 1}) { t.Fatalf("unexpected ssm_d shape: %v", shape) } if shape := got["blk.0.ssm_norm.weight"].shape; !slices.Equal(shape, []uint64{8, 2}) { t.Fatalf("unexpected ssm_norm shape: %v", shape) } if shape := got["blk.0.ssm_conv1d.weight"].shape; !slices.Equal(shape, []uint64{10, 4}) { t.Fatalf("unexpected ssm_conv1d shape: %v", shape) } var b bytes.Buffer if _, err := got["blk.0.ssm_a"].writer.WriteTo(&b); err != nil { t.Fatal(err) } values := make([]float32, 4) if err := binary.Read(&b, binary.LittleEndian, &values); err != nil { t.Fatal(err) } // 0 -> -exp(0) == -1 if values[0] != -1 { t.Fatalf("unexpected transformed ssm_a[0]: got %v want -1", values[0]) } } func TestNemotronHLoadModelMetadata(t *testing.T) { tempDir := t.TempDir() config := `{ "architectures": ["NemotronHForCausalLM"], "model_type": "nemotron_h", "num_hidden_layers": 4, "hidden_size": 512, "max_position_embeddings": 32768, "num_attention_heads": 8, "num_key_value_heads": 2, "head_dim": 64, "layer_norm_epsilon": 1e-5, "conv_kernel": 4, "ssm_state_size": 128, "mamba_num_heads": 16, "mamba_head_dim": 32, "n_groups": 8, "hybrid_override_pattern": "ME*M", "n_routed_experts": 16, "num_experts_per_tok": 4, "moe_intermediate_size": 256 }` if err := os.WriteFile(filepath.Join(tempDir, "config.json"), []byte(config), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(tempDir, "tokenizer.json"), []byte(`{}`), 0o644); err != nil { t.Fatal(err) } kv, _, err := LoadModelMetadata(os.DirFS(tempDir)) if err != nil { t.Fatal(err) } if _, ok := kv.(*nemotronHModel); !ok { t.Fatalf("unexpected converter type: %T", kv) } } func TestNemotronHNanoVLLoadModelMetadata(t *testing.T) { tempDir := t.TempDir() config := `{ "architectures": ["NemotronH_Nano_VL_V2"], "model_type": "NemotronH_Nano_VL_V2", "max_sequence_length": 131072, "force_image_size": 512, "downsample_ratio": 0.5, "patch_size": 16, "use_thumbnail": true, "img_context_token_id": 18, "img_context_token": "", "img_start_token": "", "img_end_token": "", "sound_context_token_id": 27, "sound_context_token": "", "vit_hidden_size": 1280, "projector_hidden_size": 20480, "norm_mean": [0.48145466, 0.4578275, 0.40821073], "norm_std": [0.26862954, 0.26130258, 0.27577711], "vision_config": { "version": "radio_v2.5-h", "patch_size": 16, "max_resolution": 2048, "separate_video_embedder": true }, "sound_config": { "model_type": "parakeet", "hidden_size": 1024, "num_attention_heads": 8, "num_hidden_layers": 24, "intermediate_size": 4096, "conv_kernel_size": 9, "subsampling_conv_channels": 256, "subsampling_conv_kernel_size": 3, "subsampling_conv_stride": 2, "subsampling_factor": 8, "num_mel_bins": 128, "projection_hidden_size": 4096, "sampling_rate": 16000 }, "llm_config": { "architectures": ["NemotronHForCausalLM"], "model_type": "nemotron_h", "num_hidden_layers": 4, "hidden_size": 512, "max_position_embeddings": 262144, "num_attention_heads": 8, "num_key_value_heads": 2, "head_dim": 64, "layer_norm_epsilon": 1e-5, "conv_kernel": 4, "ssm_state_size": 128, "mamba_num_heads": 16, "mamba_head_dim": 32, "n_groups": 8, "hybrid_override_pattern": "ME*M", "n_routed_experts": 16, "num_experts_per_tok": 4, "moe_intermediate_size": 256 } }` if err := os.WriteFile(filepath.Join(tempDir, "config.json"), []byte(config), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(tempDir, "preprocessor_config.json"), []byte(`{ "image_size": 512, "patch_size": 16, "downsample_ratio": 0.5, "max_num_tiles": 12, "use_thumbnail": true, "norm_mean": [0.48145466, 0.4578275, 0.40821073], "norm_std": [0.26862954, 0.26130258, 0.27577711] }`), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(tempDir, "tokenizer.json"), []byte(`{}`), 0o644); err != nil { t.Fatal(err) } conv, tokenizer, err := LoadModelMetadata(os.DirFS(tempDir)) if err != nil { t.Fatal(err) } if _, ok := conv.(*nemotronHNanoVLModel); !ok { t.Fatalf("unexpected converter type: %T", conv) } kv := conv.KV(tokenizer) if got, want := kv["general.architecture"], "nemotron_h_omni"; got != want { t.Fatalf("unexpected architecture: got %v want %v", got, want) } if got, want := kv["context_length"], uint32(131072); got != want { t.Fatalf("unexpected context length: got %v want %v", got, want) } if got, want := kv["vision.block_count"], uint32(32); got != want { t.Fatalf("unexpected vision block count: got %v want %v", got, want) } if got, want := kv["vision.image_size"], uint32(512); got != want { t.Fatalf("unexpected vision image size: got %v want %v", got, want) } if got, want := kv["vision.projector.scale_factor"], uint32(2); got != want { t.Fatalf("unexpected projector scale factor: got %v want %v", got, want) } if got, want := kv["audio.block_count"], uint32(24); got != want { t.Fatalf("unexpected audio block count: got %v want %v", got, want) } if got, want := kv["audio.sound_token_id"], uint32(27); got != want { t.Fatalf("unexpected audio token id: got %v want %v", got, want) } if got, want := kv["audio.subsampling_factor"], uint32(8); got != want { t.Fatalf("unexpected audio subsampling factor: got %v want %v", got, want) } } func TestNemotronHNanoOmniReasoningV3LoadModelMetadata(t *testing.T) { tempDir := t.TempDir() config := `{ "architectures": ["NemotronH_Nano_Omni_Reasoning_V3"], "model_type": "NemotronH_Nano_Omni_Reasoning_V3", "max_sequence_length": 131072, "force_image_size": 512, "downsample_ratio": 0.5, "patch_size": 16, "img_context_token_id": 18, "img_context_token": "", "img_start_token": "", "img_end_token": "", "sound_context_token_id": 27, "sound_context_token": "", "vit_hidden_size": 1280, "projector_hidden_size": 4096, "vision_config": { "version": "radio_v2.5-h", "patch_size": 16, "min_num_patches": 1024, "max_num_patches": 13312, "args": { "min_num_patches": 1024, "max_num_patches": 13312 } }, "sound_config": { "model_type": "parakeet", "hidden_size": 1024, "num_attention_heads": 8, "num_hidden_layers": 24, "intermediate_size": 4096, "conv_kernel_size": 9, "subsampling_conv_channels": 256, "subsampling_conv_kernel_size": 3, "subsampling_conv_stride": 2, "subsampling_factor": 8, "num_mel_bins": 128, "projection_hidden_size": 4096, "sampling_rate": 16000 }, "llm_config": { "architectures": ["NemotronHForCausalLM"], "model_type": "nemotron_h", "num_hidden_layers": 4, "hidden_size": 512, "max_position_embeddings": 262144, "num_attention_heads": 8, "num_key_value_heads": 2, "head_dim": 64, "layer_norm_epsilon": 1e-5, "conv_kernel": 4, "ssm_state_size": 128, "mamba_num_heads": 16, "mamba_head_dim": 32, "n_groups": 8, "hybrid_override_pattern": "ME*M", "n_routed_experts": 16, "num_experts_per_tok": 4, "moe_intermediate_size": 256 } }` if err := os.WriteFile(filepath.Join(tempDir, "config.json"), []byte(config), 0o644); err != nil { t.Fatal(err) } if err := os.WriteFile(filepath.Join(tempDir, "tokenizer.json"), []byte(`{}`), 0o644); err != nil { t.Fatal(err) } conv, tokenizer, err := LoadModelMetadata(os.DirFS(tempDir)) if err != nil { t.Fatal(err) } if _, ok := conv.(*nemotronHNanoVLModel); !ok { t.Fatalf("unexpected converter type: %T", conv) } kv := conv.KV(tokenizer) if got, want := kv["general.architecture"], "nemotron_h_omni"; got != want { t.Fatalf("unexpected architecture: got %v want %v", got, want) } if got, want := kv["vision.block_count"], uint32(32); got != want { t.Fatalf("unexpected vision block count: got %v want %v", got, want) } if got, want := kv["vision.min_num_patches"], uint32(1024); got != want { t.Fatalf("unexpected vision min patches: got %v want %v", got, want) } if got, want := kv["vision.max_num_patches"], uint32(13312); got != want { t.Fatalf("unexpected vision max patches: got %v want %v", got, want) } if got, want := kv["audio.block_count"], uint32(24); got != want { t.Fatalf("unexpected audio block count: got %v want %v", got, want) } if got, want := kv["audio.sound_token_id"], uint32(27); got != want { t.Fatalf("unexpected audio token id: got %v want %v", got, want) } } func TestNemotronHNanoVLTensorsRetainVisionAndAudio(t *testing.T) { m := &nemotronHNanoVLModel{ LLMConfig: nemotronHModel{NGroups: 8}, } in := []Tensor{ &fakeTensor{ name: "blk.0.ssm_a", shape: []uint64{4}, data: []float32{0, 1, 2, 3}, }, &fakeTensor{name: "v.blk.0.attn_qkv.weight", shape: []uint64{3840, 1280}}, &fakeTensor{name: "v.position_embd", shape: []uint64{1, 16384, 1280}}, &fakeTensor{name: "v.cls_embd", shape: []uint64{10, 1280}}, &fakeTensor{name: "mm.norm.weight", shape: []uint64{5120}}, &fakeTensor{name: "a.feature_extractor.fb", shape: []uint64{1, 128, 257}}, &fakeTensor{name: "a.subsampling.dw1.weight", shape: []uint64{256, 1, 3, 3}}, &fakeTensor{name: "a.blk.0.conv_dw.weight", shape: []uint64{1024, 1, 9}}, &fakeTensor{name: "a.blk.0.conv_pw1.weight", shape: []uint64{2048, 1024, 1}}, &fakeTensor{name: "a.blk.0.conv_bn.num_batches_tracked", shape: []uint64{1}}, &fakeTensor{name: "mm.a.1.weight", shape: []uint64{4096, 1024}}, } out := m.Tensors(in) got := map[string][]uint64{} for _, tns := range out { got[tns.Name] = tns.Shape } for _, name := range []string{ "blk.0.ssm_a", "v.blk.0.attn_q.weight", "v.blk.0.attn_k.weight", "v.blk.0.attn_v.weight", "v.position_embd", "v.cls_embd", "mm.norm.weight", "a.feature_extractor.fb", "a.subsampling.dw1.weight", "a.blk.0.conv_dw.weight", "a.blk.0.conv_pw1.weight", "mm.a.1.weight", } { if _, ok := got[name]; !ok { t.Fatalf("expected tensor %q in output", name) } } if gotShape, want := got["blk.0.ssm_a"], []uint64{4, 1}; !slices.Equal(gotShape, want) { t.Fatalf("unexpected ssm_a shape: got %v want %v", gotShape, want) } if gotShape, want := got["v.position_embd"], []uint64{16384, 1280}; !slices.Equal(gotShape, want) { t.Fatalf("unexpected position embedding shape: got %v want %v", gotShape, want) } if gotShape, want := got["a.blk.0.conv_dw.weight"], []uint64{1024, 9}; !slices.Equal(gotShape, want) { t.Fatalf("unexpected audio conv_dw shape: got %v want %v", gotShape, want) } if gotShape, want := got["a.blk.0.conv_pw1.weight"], []uint64{2048, 1024}; !slices.Equal(gotShape, want) { t.Fatalf("unexpected audio conv_pw1 shape: got %v want %v", gotShape, want) } if _, ok := got["a.blk.0.conv_bn.num_batches_tracked"]; ok { t.Fatal("audio batchnorm num_batches_tracked should be omitted") } } func TestNemotronHNanoVLReplacements(t *testing.T) { m := &nemotronHNanoVLModel{} r := strings.NewReplacer(m.Replacements()...) if got, want := r.Replace("language_model.backbone.layers.1.mixer.fc1_latent_proj.weight"), "blk.1.ffn_latent_in.weight"; got != want { t.Fatalf("unexpected fc1 replacement: got %q want %q", got, want) } if got, want := r.Replace("language_model.lm_head.weight"), "output.weight"; got != want { t.Fatalf("unexpected lm_head replacement: got %q want %q", got, want) } if got, want := r.Replace("vision_model.radio_model.model.blocks.0.attn.qkv.weight"), "v.blk.0.attn_qkv.weight"; got != want { t.Fatalf("unexpected vision replacement: got %q want %q", got, want) } if got, want := r.Replace("mlp1.1.weight"), "mm.1.weight"; got != want { t.Fatalf("unexpected projector replacement: got %q want %q", got, want) } if got, want := r.Replace("sound_encoder.encoder.layers.0.self_attn.q_proj.weight"), "a.blk.0.attn_q.weight"; got != want { t.Fatalf("unexpected audio q_proj replacement: got %q want %q", got, want) } if got, want := r.Replace("sound_encoder.encoder.layers.0.conv.pointwise_conv1.weight"), "a.blk.0.conv_pw1.weight"; got != want { t.Fatalf("unexpected audio conv replacement: got %q want %q", got, want) } if got, want := r.Replace("sound_projection.linear2.weight"), "mm.a.2.weight"; got != want { t.Fatalf("unexpected audio projector replacement: got %q want %q", got, want) } } func TestNemotronHReplacementsLatentProjections(t *testing.T) { m := &nemotronHModel{} r := strings.NewReplacer(m.Replacements()...) if got, want := r.Replace("backbone.layers.1.mixer.fc1_latent_proj.weight"), "blk.1.ffn_latent_in.weight"; got != want { t.Fatalf("unexpected fc1 replacement: got %q want %q", got, want) } if got, want := r.Replace("backbone.layers.1.mixer.fc2_latent_proj.weight"), "blk.1.ffn_latent_out.weight"; got != want { t.Fatalf("unexpected fc2 replacement: got %q want %q", got, want) } }