Mirror of https://github.com/ollama/ollama.git, synced 2026-05-10 15:44:38 -05:00
* mlx: add laguna model support
* convert: support fp8 safetensors import

  Decode HF F8_E4M3 safetensors with block scale companions into
  GGUF-supported tensor types, and record which output tensors came from FP8
  source weights. Use that source-precision metadata during create
  quantization: default FP8-sourced GGUFs to Q8_0, keep non-FP8 tensors at
  their original precision for Q8_0, and promote non-FP8 quantizable tensors
  to Q8_0 for Q4_K requests.

* ggml: add laguna model support
* server: preserve generate logprobs with builtin parsers

  Generate requests were dropping logprob-only chunks whenever a builtin
  parser buffered visible content. Chat already handled this case, but
  generate only forwarded chunks with visible response, thinking, or
  tool-call output. Keep generate chunks that carry logprobs even when the
  builtin parser has not flushed visible content yet, and add a regression
  test that exercises the behavior with a generic thinking parser.

* review comments - perf improvements
* ggml: implement nemotron 3 nano omni
* add poolside integration
* update poolside doc
* adapt to new cache setup
* fix test
* fix test

---------

Co-authored-by: Eva Ho <hoyyeva@gmail.com>
100 lines
2.7 KiB
Go
package mlx

import "math"

var geluCoeff = float32(math.Sqrt(2 / math.Pi))

// GELUApprox returns 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// as a fused kernel.
var GELUApprox = Compile1(
	"GELUApprox",
	func(x *Array) *Array {
		// Dtype-matched scalars avoid implicit upcasts on bf16 inputs.
		dt := x.DType()
		half := FromValue[float32](0.5).AsType(dt)
		coeff := FromValue(geluCoeff).AsType(dt)
		c := FromValue[float32](0.044715).AsType(dt)
		one := FromValue[float32](1.0).AsType(dt)

		// x^3 via x*x*x (avoids general Power which is slower).
		x3 := x.Multiply(x).Multiply(x)
		inner := x.Add(c.Multiply(x3))
		tanh := coeff.Multiply(inner).Tanh()
		return half.Multiply(x).Multiply(one.Add(tanh))
	},
	Shapeless(),
)
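
// geluApproxRef is a hypothetical plain-Go scalar reference for GELUApprox
// (not part of the mlx API): a minimal sketch of the documented formula,
// handy for spot-checking the fused kernel in tests.
func geluApproxRef(x float64) float64 {
	coeff := math.Sqrt(2 / math.Pi)
	return 0.5 * x * (1 + math.Tanh(coeff*(x+0.044715*x*x*x)))
}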

// SiLU returns a * sigmoid(a) as a fused kernel.
var SiLU = Compile1(
	"SiLU",
	func(a *Array) *Array {
		return a.Multiply(a.Sigmoid())
	},
	Shapeless(),
)
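
// siluRef is a hypothetical scalar reference for SiLU (not part of the mlx
// API): silu(x) = x * sigmoid(x) = x / (1 + e^-x).
func siluRef(x float64) float64 {
	return x / (1 + math.Exp(-x))
}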

// SoftplusF32 returns softplus(x) computed in float32 precision and cast back
// to x's original dtype, as a fused kernel. Matches the laguna attention
// output-gate formula: softplus(cast_f32(x)).cast(orig_dtype).
var SoftplusF32 = Compile1(
	"SoftplusF32",
	func(x *Array) *Array {
		dt := x.DType()
		zero := FromValue[float32](0)
		return Logaddexp(x.AsType(DTypeFloat32), zero).AsType(dt)
	},
	Shapeless(),
)
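
// softplusRef is a hypothetical scalar reference for SoftplusF32 (not part
// of the mlx API). softplus(x) = log(1 + e^x) = logaddexp(x, 0); branching
// on the sign keeps this reference numerically stable for large |x|,
// mirroring the stability Logaddexp provides in the fused kernel.
func softplusRef(x float64) float64 {
	if x > 0 {
		return x + math.Log1p(math.Exp(-x)) // log(1+e^x) = x + log(1+e^-x)
	}
	return math.Log1p(math.Exp(x))
}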

// SwiGLU returns silu(gate) * up as a fused kernel.
var SwiGLU = Compile2(
	"SwiGLU",
	func(gate, up *Array) *Array {
		return SiLU(gate).Multiply(up)
	},
	Shapeless(),
)
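
// swigluRef is a hypothetical scalar reference for SwiGLU (not part of the
// mlx API): silu(gate) scales the up projection elementwise.
func swigluRef(gate, up float64) float64 {
	return siluRef(gate) * up
}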

// GeGLU returns gelu_approx(gate) * up as a fused kernel. Matches mlx_lm's
// geglu, used by Gemma-family MLP and MoE paths.
var GeGLU = Compile2(
	"GeGLU",
	func(gate, up *Array) *Array {
		return GELUApprox(gate).Multiply(up)
	},
	Shapeless(),
)
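
// gegluRef is a hypothetical scalar reference for GeGLU (not part of the
// mlx API): approximate GELU gates the up projection, as in Gemma-family
// MLPs.
func gegluRef(gate, up float64) float64 {
	return geluApproxRef(gate) * up
}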

// LogitSoftcap returns tanh(x / cap) * cap as a fused kernel. Matches
// mlx_lm's logit_softcap. cap must have the same dtype as x.
var LogitSoftcap = Compile2(
	"LogitSoftcap",
	func(x, cap *Array) *Array {
		return x.Divide(cap).Tanh().Multiply(cap)
	},
	Shapeless(),
)
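
// logitSoftcapRef is a hypothetical scalar reference for LogitSoftcap (not
// part of the mlx API). tanh(x/cap) * cap squashes logits smoothly into
// (-cap, cap) while staying near-identity for |x| much smaller than cap.
func logitSoftcapRef(x, cap float64) float64 {
	return math.Tanh(x/cap) * cap
}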

// sigmoidRouterFused traces the DeepSeek-V2 / GLM-MoE aux-loss-free router
// head. Two outputs are returned so the pre-bias sigmoid (used to gather
// per-expert scores after top-k) and the post-bias negation (used as the
// argpartition key for top-k) share a single kernel.
var sigmoidRouterFused = Compile(
	"SigmoidRouter",
	func(in ...*Array) []*Array {
		gates, bias := in[0], in[1]
		orig := gates.Sigmoid()
		neg := orig.Add(bias).Negative()
		return []*Array{orig, neg}
	},
	Shapeless(),
)

// SigmoidRouter returns (sigmoid(gates), -(sigmoid(gates)+bias)) as a fused
// kernel: the DeepSeek-V2 / GLM-MoE aux-loss-free router head.
func SigmoidRouter(gates, bias *Array) (origScores, negScores *Array) {
	out := sigmoidRouterFused(gates, bias)
	return out[0], out[1]
}
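
// sigmoidRouterRef is a hypothetical plain-Go reference for SigmoidRouter
// over one row of expert logits (not part of the mlx API). orig holds the
// pre-bias sigmoid scores used to weight selected experts; neg holds the
// argpartition keys, so the k smallest neg values pick the top-k experts
// by biased score. Assumes len(bias) == len(gates).
func sigmoidRouterRef(gates, bias []float64) (orig, neg []float64) {
	orig = make([]float64, len(gates))
	neg = make([]float64, len(gates))
	for i, g := range gates {
		orig[i] = 1 / (1 + math.Exp(-g))
		neg[i] = -(orig[i] + bias[i])
	}
	return orig, neg
}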