Mirror of https://github.com/ollama/ollama.git, synced 2026-03-11 17:34:04 -05:00.
This change adds a new x/tokenizer package which includes: * New BPE and SentencePiece tokenizers * Removing the dependency on the imagegen tokenizers * Fixes to multibyte decoding in the pipeline * Various correctness and benchmark tests Not included in this PR is the WordPiece tokenizer for BERT models which will be added when we add embedding models. The imagegen tokenizers will also be removed in a follow-up PR.
57 lines
1.2 KiB
Go
57 lines
1.2 KiB
Go
//go:build mlx
|
|
|
|
package tokenizer
|
|
|
|
import (
|
|
"strconv"
|
|
"strings"
|
|
)
|
|
|
|
// Decode converts token IDs back to text
|
|
func (t *Tokenizer) Decode(ids []int32) string {
|
|
var sb strings.Builder
|
|
|
|
for _, id := range ids {
|
|
if int(id) >= len(t.vocab.Values) {
|
|
continue
|
|
}
|
|
|
|
token := t.vocab.Values[id]
|
|
|
|
switch t.typ {
|
|
case TokenizerSentencePiece:
|
|
// SentencePiece style: replace ▁ with space, decode byte tokens
|
|
token = strings.ReplaceAll(token, "▁", " ")
|
|
// Handle byte fallback tokens like <0x0D>
|
|
if len(token) == 6 && token[0] == '<' && token[1] == '0' && token[2] == 'x' && token[5] == '>' {
|
|
if v, err := strconv.ParseUint(token[3:5], 16, 8); err == nil {
|
|
sb.WriteByte(byte(v))
|
|
continue
|
|
}
|
|
}
|
|
sb.WriteString(token)
|
|
default:
|
|
// GPT-2 BPE style: decode byte-level encoding
|
|
for _, r := range token {
|
|
switch {
|
|
case r == 0x0100:
|
|
// Mirror GGML tokenizer behavior for NULL byte.
|
|
// 0x00 is omitted during decode.
|
|
continue
|
|
case r == 0x0143:
|
|
r = 0x00ad
|
|
case r > 0x0100 && r <= 0x0120:
|
|
r = r - 0x0100
|
|
case r > 0x0120 && r <= 0x0142:
|
|
r = r - 0x00a2
|
|
}
|
|
|
|
// Write as byte, not UTF-8 encoded rune
|
|
sb.WriteByte(byte(r))
|
|
}
|
|
}
|
|
}
|
|
|
|
return sb.String()
|
|
}
|