glm4moelite: quantize more tensors to q8_0 and avoid double BOS token (#13891)

2026-03-09 07:16:38 -05:00 · 2026-01-24 16:33:54 -08:00
parent f3b476c592
commit 16750865d1
2 changed files with 8 additions and 1 deletion
--- a/model/models/glm4moelite/model.go
+++ b/model/models/glm4moelite/model.go
@@ -246,7 +246,7 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
 				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
 				EOS: append(