mlxrunner: Fix prompt eval timing and count metrics

Only the last token's processing time was included in prompt processing,
giving an artificially high rate. In addition, the reported token count
only included tokens that missed the cache, rather than the total token
count we have historically reported.
This commit is contained in:
Jesse Gross
2026-02-26 14:45:14 -08:00
parent a16f96658b
commit a60b9adcce

View File

@@ -74,6 +74,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
caches := session.caches
tokens := session.remaining
now := time.Now()
total, processed := len(tokens), 0
for total-processed > 1 {
if err := request.Ctx.Err(); err != nil {
@@ -114,8 +115,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
var b bytes.Buffer
now := time.Now()
final := CompletionResponse{Done: true, PromptEvalCount: total, EvalCount: request.Options.MaxTokens, DoneReason: 1}
final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.MaxTokens, DoneReason: 1}
for i := range request.Options.MaxTokens {
if err := request.Ctx.Err(); err != nil {
return err