mlx: Remove peak memory from the API

This is still in flux so it is better to just log it for now.
2026-03-08 23:04:13 -05:00 · 2026-03-02 11:37:52 -08:00
parent c1e3ef4bcc
commit ad16bffc7d
5 changed files with 12 additions and 31 deletions
--- a/api/types.go
+++ b/api/types.go
@@ -15,7 +15,6 @@ import (
 	"github.com/google/uuid"

 	"github.com/ollama/ollama/envconfig"
-	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/internal/orderedmap"
 	"github.com/ollama/ollama/types/model"
 )
@@ -570,7 +569,6 @@ type DebugInfo struct {

 type Metrics struct {
 	TotalDuration      time.Duration `json:"total_duration,omitempty"`
-	PeakMemory         uint64        `json:"peak_memory,omitempty"`
 	LoadDuration       time.Duration `json:"load_duration,omitempty"`
 	PromptEvalCount    int           `json:"prompt_eval_count,omitempty"`
 	PromptEvalDuration time.Duration `json:"prompt_eval_duration,omitempty"`
@@ -936,10 +934,6 @@ func (m *Metrics) Summary() {
 		fmt.Fprintf(os.Stderr, "total duration:       %v\n", m.TotalDuration)
 	}

-	if m.PeakMemory > 0 {
-		fmt.Fprintf(os.Stderr, "peak memory:          %s\n", formatPeakMemory(m.PeakMemory))
-	}
-
 	if m.LoadDuration > 0 {
 		fmt.Fprintf(os.Stderr, "load duration:        %v\n", m.LoadDuration)
 	}
@@ -963,14 +957,6 @@ func (m *Metrics) Summary() {
 	}
 }

-func formatPeakMemory(b uint64) string {
-	if b >= format.GibiByte {
-		return fmt.Sprintf("%.3f GiB", float64(b)/float64(format.GibiByte))
-	}
-
-	return format.HumanBytes2(b)
-}
-
 func (opts *Options) FromMap(m map[string]any) error {
 	valueOpts := reflect.ValueOf(opts).Elem() // names of the fields in the options struct
 	typeOpts := reflect.TypeOf(opts).Elem()   // types of the fields in the options struct
--- a/llm/server.go
+++ b/llm/server.go
@@ -1518,7 +1518,6 @@ type CompletionResponse struct {
 	PromptEvalDuration time.Duration `json:"prompt_eval_duration"`
 	EvalCount          int           `json:"eval_count"`
 	EvalDuration       time.Duration `json:"eval_duration"`
-	PeakMemory         uint64        `json:"peak_memory,omitempty"`

 	// Logprobs contains log probability information if requested
 	Logprobs []Logprob `json:"logprobs,omitempty"`
--- a/server/routes.go
+++ b/server/routes.go
@@ -558,7 +558,6 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 					PromptEvalDuration: cr.PromptEvalDuration,
 					EvalCount:          cr.EvalCount,
 					EvalDuration:       cr.EvalDuration,
-					PeakMemory:         cr.PeakMemory,
 				},
 				Logprobs: toAPILogprobs(cr.Logprobs),
 			}
@@ -2317,7 +2316,6 @@ func (s *Server) ChatHandler(c *gin.Context) {
 						PromptEvalDuration: r.PromptEvalDuration,
 						EvalCount:          r.EvalCount,
 						EvalDuration:       r.EvalDuration,
-						PeakMemory:         r.PeakMemory,
 					},
 					Logprobs: toAPILogprobs(r.Logprobs),
 				}
--- a/x/mlxrunner/client.go
+++ b/x/mlxrunner/client.go
@@ -202,7 +202,6 @@ type CompletionResponse struct {
 	PromptEvalDuration time.Duration
 	EvalCount          int
 	EvalDuration       time.Duration
-	PeakMemory         uint64

 	Error *api.StatusError
 }
@@ -284,7 +283,6 @@ func (c *Client) Completion(ctx context.Context, req llm.CompletionRequest, fn f
 			PromptEvalDuration: raw.PromptEvalDuration,
 			EvalCount:          raw.EvalCount,
 			EvalDuration:       raw.EvalDuration,
-			PeakMemory:         raw.PeakMemory,
 		}

 		fn(cresp)
--- a/x/mlxrunner/pipeline.go
+++ b/x/mlxrunner/pipeline.go
@@ -21,6 +21,17 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 		return errors.New("model not loaded")
 	}

+	enableCompile := true
+	if modelCompile, ok := r.Model.(interface{ EnableCompile() bool }); ok {
+		enableCompile = modelCompile.EnableCompile()
+	}
+	if enableCompile {
+		mlx.EnableCompile()
+	} else {
+		mlx.DisableCompile()
+	}
+	mlx.ResetPeakMemory()
+
 	var (
 		sample, logprobs         *mlx.Array
 		nextSample, nextLogprobs *mlx.Array
@@ -36,19 +47,9 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 			mlx.LogArrays()
 			r.cache.log()
 		}
+		slog.Info("peak memory", "size", mlx.PrettyBytes(mlx.PeakMemory()))
 	}()

-	enableCompile := true
-	if modelCompile, ok := r.Model.(interface{ EnableCompile() bool }); ok {
-		enableCompile = modelCompile.EnableCompile()
-	}
-	if enableCompile {
-		mlx.EnableCompile()
-	} else {
-		mlx.DisableCompile()
-	}
-	mlx.ResetPeakMemory()
-
 	inputs := r.Tokenizer.Encode(request.Prompt, true)
 	if len(inputs) == 0 {
 		return errors.New("empty prompt")
@@ -156,7 +157,6 @@ func (r *Runner) TextGenerationPipeline(request Request) error {
 	}

 	final.EvalDuration = time.Since(now)
-	final.PeakMemory = uint64(mlx.PeakMemory())
 	select {
 	case <-request.Ctx.Done():
 		return request.Ctx.Err()