This change adds a new x/tokenizer package, which includes:

* New BPE and SentencePiece tokenizers
* Removal of the dependency on the imagegen tokenizers
* Fixes to multibyte decoding in the pipeline
* Various correctness and benchmark tests

Not included in this PR is the WordPiece tokenizer for BERT models, which will be added when we add embedding models. The imagegen tokenizers will also be removed in a follow-up PR.
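
For orientation, here is a rough usage sketch of the accessors this file defines; the constructor name and file path are hypothetical and not part of this change:

    // Hedged sketch: tokenizer.Load is an assumed constructor, not shown here.
    tok, err := tokenizer.Load("path/to/tokenizer.json")
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(tok.VocabSize()) // size of the loaded vocabulary
    fmt.Println(tok.BOS())       // beginning-of-sequence token ID
    fmt.Println(tok.EOSTokens()) // all configured end-of-sequence IDs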

//go:build mlx

// tokenizer.go - BPE and SentencePiece tokenizer for HuggingFace models
//
// Based on standard BPE algorithm (Sennrich et al. 2015) with:
// - GPT-2 byte-level encoding (OpenAI tiktoken)
// - HuggingFace tokenizer.json pretokenizer patterns
// - SentencePiece ▁-style space handling

package tokenizer

import "regexp"

// TokenizerType identifies the tokenization algorithm
type TokenizerType int

const (
    TokenizerBPE           TokenizerType = iota // GPT-2 style byte-level BPE
    TokenizerSentencePiece                      // SentencePiece with ▁ for spaces
)

// Vocabulary holds the tokenizer vocabulary and merges
type Vocabulary struct {
    Values  []string
    Reverse map[string]int32
    Merges  map[string]int

    BOS    int32
    EOS    []int32 // Multiple EOS tokens supported (e.g., Gemma has <eos> and <end_of_turn>)
    PAD    int32   // Padding token (often <|endoftext|> or <pad>)
    AddBOS bool
    AddEOS bool

    // Precomputed byte token IDs for <0xNN> fallback (256 entries, -1 if not found)
    byteTokens [256]int32
}
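
// Illustrative sketch (not part of the original file): how byteTokens can
// back the <0xNN> fallback. The method name here is hypothetical:
//
//     func (v *Vocabulary) byteToken(b byte) (int32, bool) {
//         id := v.byteTokens[b] // e.g. byte 0x41 looks up the token "<0x41>"
//         return id, id >= 0    // -1 means the vocab has no token for this byte
//     }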

// Tokenizer handles BPE and SentencePiece tokenization
type Tokenizer struct {
    vocab               *Vocabulary
    pretokenizer        *regexp.Regexp
    specialTokens       map[string]int32 // Special tokens for direct lookup
    sortedSpecialTokens []string         // Special tokens sorted by length, longest first
    typ                 TokenizerType    // Algorithm type
}
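
// Illustrative sketch (not part of the original file): because
// sortedSpecialTokens is ordered longest-first, a greedy prefix scan always
// prefers the longest matching special token. Roughly:
//
//     for _, s := range t.sortedSpecialTokens {
//         if strings.HasPrefix(input, s) {
//             return t.specialTokens[s], len(s) // matched ID and bytes consumed
//         }
//     }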

// Precomputed GPT-2 byte-level encoding table
// Maps byte values to their encoded rune equivalents
var byteToRune [256]rune

func init() {
    for b := 0; b < 256; b++ {
        r := rune(b)
        switch {
        case r == 0x00ad: // soft hyphen
            r = 0x0143
        case r <= 0x0020: // control characters and space
            r = r + 0x0100
        case r >= 0x007f && r <= 0x00a0: // DEL, C1 controls, and NBSP
            r = r + 0x00a2
        }
        byteToRune[b] = r
    }
}
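
// Illustrative sketch (not part of the original file): decoding reverses
// byteToRune. An inverse table could be built from the same array:
//
//     var runeToByte = make(map[rune]byte, 256)
//
//     func init() {
//         for b, r := range byteToRune {
//             runeToByte[r] = byte(b)
//         }
//     }
//
// Token strings decode rune-by-rune back to raw bytes, and only then is the
// byte stream interpreted as UTF-8, so multibyte characters split across
// tokens can reassemble correctly.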

// VocabSize returns the vocabulary size
func (t *Tokenizer) VocabSize() int {
    return len(t.vocab.Values)
}

// BOS returns the beginning of sequence token ID
func (t *Tokenizer) BOS() int32 {
    return t.vocab.BOS
}

// EOS returns the first end of sequence token ID (for backwards compatibility)
func (t *Tokenizer) EOS() int32 {
    if len(t.vocab.EOS) > 0 {
        return t.vocab.EOS[0]
    }
    return -1
}

// EOSTokens returns all end of sequence token IDs
func (t *Tokenizer) EOSTokens() []int32 {
    return t.vocab.EOS
}

// PAD returns the padding token ID, or -1 if not set
func (t *Tokenizer) PAD() int32 {
    return t.vocab.PAD
}

// IsEOS returns true if the token ID is an end of sequence token
func (t *Tokenizer) IsEOS(id int32) bool {
    for _, eos := range t.vocab.EOS {
        if id == eos {
            return true
        }
    }
    return false
}
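
// Illustrative sketch (not part of the original file): a generation loop can
// use IsEOS to stop on any configured end token, e.g. both <eos> and
// <end_of_turn> on Gemma. sampleNext and emit are hypothetical helpers:
//
//     for {
//         next := sampleNext()
//         if tok.IsEOS(next) {
//             break
//         }
//         emit(next)
//     }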

// GetSpecialToken returns the token ID for a special token string
func (t *Tokenizer) GetSpecialToken(name string) (int32, bool) {
    id, ok := t.specialTokens[name]
    return id, ok
}
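
// Illustrative sketch (not part of the original file): looking up a chat
// control token by its literal text. The token name is an example; real
// names depend on the model's tokenizer.json:
//
//     if id, ok := tok.GetSpecialToken("<|im_start|>"); ok {
//         fmt.Println("control token ID:", id)
//     }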