mirror of
https://github.com/ollama/ollama.git
synced 2026-03-08 23:04:13 -05:00
mlxrunner: Fix prompt eval timing and count metrics
Only the last token's processing time was included in prompt processing, giving an artificially high rate. In addition, the reported token count included only the tokens that missed the cache, rather than the historical total token count.
This commit is contained in:
@@ -74,6 +74,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
caches := session.caches
|
||||
tokens := session.remaining
|
||||
|
||||
now := time.Now()
|
||||
total, processed := len(tokens), 0
|
||||
for total-processed > 1 {
|
||||
if err := request.Ctx.Err(); err != nil {
|
||||
@@ -114,8 +115,7 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
|
||||
|
||||
var b bytes.Buffer
|
||||
|
||||
now := time.Now()
|
||||
final := CompletionResponse{Done: true, PromptEvalCount: total, EvalCount: request.Options.MaxTokens, DoneReason: 1}
|
||||
final := CompletionResponse{Done: true, PromptEvalCount: len(inputs), EvalCount: request.Options.MaxTokens, DoneReason: 1}
|
||||
for i := range request.Options.MaxTokens {
|
||||
if err := request.Ctx.Err(); err != nil {
|
||||
return err
|
||||
|
||||
Reference in New Issue
Block a user