mirror of
https://github.com/ollama/ollama.git
synced 2026-03-08 23:04:13 -05:00
mlxrunner: Fix prompt eval timing and count metrics
Only the last token's processing time was included in prompt processing, giving an artificially high rate. In addition, the reported token count included only the tokens that missed the cache, rather than the historical total token count.
This commit is contained in:
@@ -74,6 +74,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
caches := session.caches
|
||||
tokens := session.remaining
|
||||
|
||||
now := time.Now()
|
||||
total, processed := len(tokens), 0
|
||||
for total-processed > 1 {
|
||||
if err := request.Ctx.Err(); err != nil {
|
||||
@@ -114,8 +115,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
|
||||
var b bytes.Buffer
|
||||
|
||||
now := time.Now()
|
||||
final := CompletionResponse{Done: true, PromptEvalCount: total, EvalCount: request.Options.MaxTokens, DoneReason: 1}
|
||||
final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.MaxTokens, DoneReason: 1}
|
||||
for i := range request.Options.MaxTokens {
|
||||
if err := request.Ctx.Err(); err != nil {
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user