From a60b9adcce8fb40b1ad3678d992de5969692a9ea Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Thu, 26 Feb 2026 14:45:14 -0800 Subject: [PATCH] mlxrunner: Fix prompt eval timing and count metrics Only the last token's processing time is included in prompt processing, giving an artificially high rate. In addition, the number of tokens only included the tokens that missed the cache, instead of our historic total tokens. --- x/mlxrunner/pipeline.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/x/mlxrunner/pipeline.go b/x/mlxrunner/pipeline.go index 405225c45..73b485358 100644 --- a/x/mlxrunner/pipeline.go +++ b/x/mlxrunner/pipeline.go @@ -74,6 +74,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error { caches := session.caches tokens := session.remaining + now := time.Now() total, processed := len(tokens), 0 for total-processed > 1 { if err := request.Ctx.Err(); err != nil { @@ -114,8 +115,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error { var b bytes.Buffer - now := time.Now() - final := CompletionResponse{Done: true, PromptEvalCount: total, EvalCount: request.Options.MaxTokens, DoneReason: 1} + final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.MaxTokens, DoneReason: 1} for i := range request.Options.MaxTokens { if err := request.Ctx.Err(); err != nil { return err