From d319227df01254ba375dbabd1d42e851465e4476 Mon Sep 17 00:00:00 2001 From: Parth Sareen Date: Tue, 5 May 2026 14:40:18 -0700 Subject: [PATCH] server: cache show responses (#15967) --- server/model_caches.go | 27 + server/model_recommendations_test.go | 4 +- server/model_show_cache.go | 705 +++++++++++++++++++++++++++ server/model_show_cache_test.go | 501 +++++++++++++++++++ server/routes.go | 45 +- 5 files changed, 1266 insertions(+), 16 deletions(-) create mode 100644 server/model_caches.go create mode 100644 server/model_show_cache.go create mode 100644 server/model_show_cache_test.go diff --git a/server/model_caches.go b/server/model_caches.go new file mode 100644 index 000000000..720fa703d --- /dev/null +++ b/server/model_caches.go @@ -0,0 +1,27 @@ +package server + +import "context" + +type modelCaches struct { + recommendations *modelRecommendationsCache + show *modelShowCache +} + +func newModelCaches() *modelCaches { + return &modelCaches{ + recommendations: newModelRecommendationsCache(), + show: newModelShowCache(), + } +} + +func (c *modelCaches) Start(ctx context.Context) { + if c == nil { + return + } + if c.recommendations != nil { + c.recommendations.Start(ctx) + } + if c.show != nil { + c.show.Start(ctx) + } +} diff --git a/server/model_recommendations_test.go b/server/model_recommendations_test.go index 346462fc4..1c7b389ef 100644 --- a/server/model_recommendations_test.go +++ b/server/model_recommendations_test.go @@ -306,7 +306,7 @@ func TestModelRecommendationsHandlerUsesCache(t *testing.T) { ctx, _ := gin.CreateTestContext(w) ctx.Request = httptest.NewRequest(http.MethodGet, "/api/experimental/model-recommendations", nil) - s := &Server{modelRecommendations: cache} + s := &Server{modelCaches: &modelCaches{recommendations: cache}} s.ModelRecommendationsExperimentalHandler(ctx) if w.Code != http.StatusOK { @@ -326,7 +326,7 @@ func TestModelRecommendationsRouteRegistration(t *testing.T) { cache := newModelRecommendationsCache() cache.set([]api.ModelRecommendation{{Model: "route-model", Description: "route description"}}) - s := &Server{modelRecommendations: cache} + s := &Server{modelCaches: &modelCaches{recommendations: cache}} router, err := s.GenerateRoutes(nil) if err != nil { diff --git a/server/model_show_cache.go b/server/model_show_cache.go new file mode 100644 index 000000000..b32e025e8 --- /dev/null +++ b/server/model_show_cache.go @@ -0,0 +1,705 @@ +package server + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "log/slog" + "net/http" + "net/url" + "slices" + "strings" + "sync" + "time" + + "github.com/ollama/ollama/api" + internalcloud "github.com/ollama/ollama/internal/cloud" + "github.com/ollama/ollama/internal/modelref" + "github.com/ollama/ollama/manifest" + "github.com/ollama/ollama/types/model" + "github.com/ollama/ollama/version" +) + +/* +The /api/show cache stores full api.ShowResponse values because callers use +more than capabilities: launch flows also need context length, embeddings +metadata, quantization details, remote metadata, and model-specific fields. + +Local model entries are stored by canonical model name and verbose flag, with +the manifest digest recorded in the entry. The manifest digest is the freshness +boundary: if the model content changes, the digest changes, so the previous +response is replaced instead of accumulating under an old digest key. Requests +with System or Options overlays bypass the cache because those overlays mutate +the effective show response. + +Cloud model entries are keyed by normalized cloud base model name and verbose. +They use stale-while-revalidate behavior: a warm read returns the cached +response immediately and starts a throttled background refresh for that model. +Cold cloud reads preserve existing proxy behavior. Local and cloud entries live +in separate maps, so a local "qwen3.5" and an explicit "qwen3.5:cloud" cannot +collide. The cloud suffix is request routing intent; api.ShowResponse does not +carry a model-name field to reconstruct on the way out. + +The cache is process-local. Startup hydration runs asynchronously from the +current local manifests and cloud tags; no show responses are written to or read +from ~/.ollama/cache/show. That keeps cache lifetime tied to the server process +and avoids snapshot freshness and invalidation cases for this iteration. +*/ + +const ( + modelShowCloudFetchTimeout = 3 * time.Second + modelShowCloudReadRefreshCooldown = 5 * time.Second + modelShowCloudHydrationConcurrency = 4 +) + +var errModelShowNoCloud = errors.New("cloud disabled") + +// modelShowCache owns process-local show response caches for local and cloud +// models. All cached responses are cloned at read/write boundaries so +// handler-specific mutations, such as user-agent compatibility tweaks, cannot +// leak back into the cache. +type modelShowCache struct { + mu sync.RWMutex + + local map[modelShowLocalKey]modelShowLocalEntry + cloud map[modelShowCloudKey]*api.ShowResponse + + cloudRefreshing map[modelShowCloudKey]bool + cloudNextReadRefreshAfter map[modelShowCloudKey]time.Time + + once sync.Once + client *http.Client + getModelInfo func(api.ShowRequest) (*api.ShowResponse, error) +} + +// modelShowLocalKey describes the local cache slot for a model response. The +// manifest digest is stored in the entry instead of the key so a pulled or +// recreated model overwrites the previous response for the same model/verbose +// variant instead of leaving stale digest-keyed entries behind. +// +// Deleted models are not eagerly pruned from this process-local cache. Manifest +// resolution happens before local cache lookup, so stale delete entries are not +// served and disappear on process restart. +type modelShowLocalKey struct { + Model string + Verbose bool +} + +type modelShowLocalEntry struct { + Digest string + Response *api.ShowResponse +} + +// modelShowCloudKey intentionally excludes any local digest because cloud +// models are refreshed through SWR and normalized by cloud base model name. +type modelShowCloudKey struct { + Model string + Verbose bool +} + +func newModelShowCache() *modelShowCache { + return &modelShowCache{ + local: make(map[modelShowLocalKey]modelShowLocalEntry), + cloud: make(map[modelShowCloudKey]*api.ShowResponse), + cloudRefreshing: make(map[modelShowCloudKey]bool), + cloudNextReadRefreshAfter: make(map[modelShowCloudKey]time.Time), + client: http.DefaultClient, + getModelInfo: GetModelInfo, + } +} + +// modelShowCacheable returns whether a request can use the shared show cache. +// System and Options overlays are request-specific response variants, so v1 +// bypasses caching for those rather than expanding the key space. +func modelShowCacheable(req api.ShowRequest) bool { + return req.System == "" && len(req.Options) == 0 +} + +// Start kicks off non-blocking startup hydration. The cache remains +// process-local; warm entries appear as the background local and cloud scans +// populate the maps. +func (c *modelShowCache) Start(ctx context.Context) { + c.once.Do(func() { + slog.Debug("starting model show cache") + go c.runStartup(ctx) + }) +} + +// runStartup hydrates local and cloud caches concurrently. It is only called in +// a goroutine from Start, so manifest scans and cloud requests cannot delay the +// listener from accepting traffic. +func (c *modelShowCache) runStartup(ctx context.Context) { + var wg sync.WaitGroup + wg.Add(2) + + go func() { + defer wg.Done() + if err := c.hydrateLocal(ctx); err != nil && !errors.Is(err, context.Canceled) { + slog.Warn("model show local cache hydration failed", "error", err) + } + }() + + go func() { + defer wg.Done() + if err := c.hydrateCloud(ctx); err != nil { + switch { + case errors.Is(err, context.Canceled): + case errors.Is(err, errModelShowNoCloud): + slog.Debug("skipping model show cloud cache hydration because cloud is disabled") + default: + slog.Warn("model show cloud cache hydration failed", "error", err) + } + } + }() + + wg.Wait() +} + +// GetLocal returns a cached local show response when the current manifest +// digest matches. On a miss, it falls back to GetModelInfo, stores non-remote +// local responses, and returns a clone to the caller. +func (c *modelShowCache) GetLocal(req api.ShowRequest) (*api.ShowResponse, error) { + key, digest, err := modelShowLocalKeyForRequest(req) + if err != nil { + return nil, err + } + + if resp, ok := c.getLocal(key, digest); ok { + return resp, nil + } + + req.Model = key.Model + resp, err := c.getModelInfo(req) + if err != nil { + return nil, err + } + + if resp.RemoteHost == "" { + c.setLocal(key, digest, resp) + } + + return cloneShowResponse(resp), nil +} + +// GetCloudSWR returns a cached cloud show response and triggers a throttled +// background refresh. The boolean is false on a cold miss so callers can +// preserve existing synchronous proxy behavior. +func (c *modelShowCache) GetCloudSWR(ctx context.Context, req api.ShowRequest) (*api.ShowResponse, bool) { + key := modelShowCloudKeyForModel(req.Model, req.Verbose) + resp, ok := c.getCloud(key) + if !ok { + return nil, false + } + + c.triggerCloudRefreshOnRead(ctx, key) + return resp, true +} + +func (c *modelShowCache) getLocal(key modelShowLocalKey, digest string) (*api.ShowResponse, bool) { + c.mu.RLock() + entry, ok := c.local[key] + c.mu.RUnlock() + if !ok || entry.Digest != digest || entry.Response == nil { + return nil, false + } + return cloneShowResponse(entry.Response), true +} + +func (c *modelShowCache) setLocal(key modelShowLocalKey, digest string, resp *api.ShowResponse) { + c.mu.Lock() + c.local[key] = modelShowLocalEntry{ + Digest: digest, + Response: cloneShowResponse(resp), + } + c.mu.Unlock() +} + +func (c *modelShowCache) hasLocal(key modelShowLocalKey, digest string) bool { + c.mu.RLock() + entry, ok := c.local[key] + c.mu.RUnlock() + return ok && entry.Digest == digest && entry.Response != nil +} + +func (c *modelShowCache) getCloud(key modelShowCloudKey) (*api.ShowResponse, bool) { + c.mu.RLock() + resp, ok := c.cloud[key] + c.mu.RUnlock() + if !ok || resp == nil { + return nil, false + } + return cloneShowResponse(resp), true +} + +func (c *modelShowCache) setCloud(key modelShowCloudKey, resp *api.ShowResponse) { + c.mu.Lock() + c.cloud[key] = cloneShowResponse(resp) + c.mu.Unlock() +} + +func (c *modelShowCache) beginCloudReadRefresh(key modelShowCloudKey) bool { + c.mu.Lock() + defer c.mu.Unlock() + + now := time.Now() + if c.cloudRefreshing[key] || now.Before(c.cloudNextReadRefreshAfter[key]) { + return false + } + + c.cloudRefreshing[key] = true + return true +} + +func (c *modelShowCache) endCloudReadRefresh(key modelShowCloudKey) { + c.mu.Lock() + c.cloudRefreshing[key] = false + c.cloudNextReadRefreshAfter[key] = time.Now().Add(modelShowCloudReadRefreshCooldown) + c.mu.Unlock() +} + +// triggerCloudRefreshOnRead starts the revalidation side of SWR. The refresh +// uses context.WithoutCancel so a completed client request does not cancel the +// cache update it initiated. +func (c *modelShowCache) triggerCloudRefreshOnRead(ctx context.Context, key modelShowCloudKey) { + if !c.beginCloudReadRefresh(key) { + return + } + if ctx == nil { + ctx = context.Background() + } + ctx = context.WithoutCancel(ctx) + + slog.Debug("triggering model show cloud refresh on read", "model", key.Model, "verbose", key.Verbose) + go func() { + defer c.endCloudReadRefresh(key) + + if err := c.refreshCloud(ctx, key); err != nil { + switch { + case errors.Is(err, errModelShowNoCloud): + slog.Debug("skipping model show cloud read refresh because cloud is disabled", "model", key.Model) + default: + slog.Warn("model show cloud read refresh failed", "model", key.Model, "error", err) + } + } + }() +} + +// refreshCloud fetches and stores one cloud show response. Refresh failures are +// returned without touching the existing cached entry, which preserves stale +// data for future reads. +func (c *modelShowCache) refreshCloud(ctx context.Context, key modelShowCloudKey) error { + if disabled, _ := internalcloud.Status(); disabled { + return errModelShowNoCloud + } + + resp, err := c.fetchCloudShow(ctx, key.Model, key.Verbose) + if err != nil { + return err + } + + c.setCloud(key, resp) + return nil +} + +// hydrateLocal scans manifests at startup and refreshes only entries missing +// for the current digest. It hydrates non-verbose responses only, avoiding an +// expensive tensor walk for users who have never asked for verbose show data. +func (c *modelShowCache) hydrateLocal(ctx context.Context) error { + manifests, err := manifest.Manifests(true) + if err != nil { + return err + } + + for name, mf := range manifests { + if err := ctx.Err(); err != nil { + return err + } + + if modelShowManifestIsRemote(mf) { + continue + } + + modelName := name.String() + digest := mf.Digest() + key := modelShowLocalKey{ + Model: modelName, + Verbose: false, + } + if c.hasLocal(key, digest) { + continue + } + + resp, err := c.getModelInfo(api.ShowRequest{Model: modelName}) + if err != nil { + slog.Warn("failed to hydrate local model show cache", "model", modelName, "error", err) + continue + } + if resp.RemoteHost != "" { + continue + } + + c.setLocal(key, digest, resp) + } + return nil +} + +// hydrateCloud refreshes cloud show entries by listing cloud tags and fetching +// /api/show for each returned model with bounded concurrency. Per-model show +// failures are logged and skipped so one bad cloud entry does not prevent the +// rest of the cache from warming. +func (c *modelShowCache) hydrateCloud(ctx context.Context) error { + if disabled, _ := internalcloud.Status(); disabled { + return errModelShowNoCloud + } + + models, err := c.fetchCloudTags(ctx) + if err != nil { + return err + } + + jobs := make(chan string) + var wg sync.WaitGroup + + worker := func() { + defer wg.Done() + for modelName := range jobs { + if ctx.Err() != nil { + continue + } + + key := modelShowCloudKeyForModel(modelName, false) + resp, err := c.fetchCloudShow(ctx, key.Model, key.Verbose) + if err != nil { + slog.Warn("failed to hydrate cloud model show cache", "model", key.Model, "error", err) + continue + } + + c.setCloud(key, resp) + } + } + + workers := min(modelShowCloudHydrationConcurrency, max(1, len(models))) + for range workers { + wg.Add(1) + go worker() + } + +sendLoop: + for _, modelName := range models { + select { + case <-ctx.Done(): + break sendLoop + case jobs <- modelName: + } + } + close(jobs) + wg.Wait() + + if err := ctx.Err(); err != nil { + return err + } + + return nil +} + +// fetchCloudTags returns de-duplicated cloud model names normalized to their +// show-cache key form. It accepts either ListModelResponse.Model or the legacy +// Name field because /api/tags responses may contain both. +func (c *modelShowCache) fetchCloudTags(ctx context.Context) ([]string, error) { + var payload api.ListResponse + if err := c.doCloudJSON(ctx, http.MethodGet, "/api/tags", nil, &payload); err != nil { + return nil, err + } + + seen := make(map[string]struct{}, len(payload.Models)) + models := make([]string, 0, len(payload.Models)) + for _, item := range payload.Models { + name := strings.TrimSpace(item.Model) + if name == "" { + name = strings.TrimSpace(item.Name) + } + name = modelShowNormalizeCloudModel(name) + if name == "" { + continue + } + if _, ok := seen[name]; ok { + continue + } + seen[name] = struct{}{} + models = append(models, name) + } + + return models, nil +} + +func (c *modelShowCache) fetchCloudShow(ctx context.Context, modelName string, verbose bool) (*api.ShowResponse, error) { + payload := api.ShowRequest{ + Model: modelShowNormalizeCloudModel(modelName), + Verbose: verbose, + } + + var resp api.ShowResponse + if err := c.doCloudJSON(ctx, http.MethodPost, "/api/show", payload, &resp); err != nil { + return nil, err + } + + if resp.ModelInfo == nil { + resp.ModelInfo = map[string]any{} + } + return &resp, nil +} + +// doCloudJSON is the cache's direct cloud client. It mirrors the cloud proxy's +// signing and client-version behavior but uses an internal timeout because +// hydration and refreshes must not hang indefinitely. +func (c *modelShowCache) doCloudJSON(ctx context.Context, method, path string, payload any, out any) error { + reqCtx, cancel := context.WithTimeout(ctx, modelShowCloudFetchTimeout) + defer cancel() + + baseURL, err := url.Parse(cloudProxyBaseURL) + if err != nil { + return err + } + targetURL := baseURL.ResolveReference(&url.URL{Path: path}) + + var body io.Reader + if payload != nil { + data, err := json.Marshal(payload) + if err != nil { + return err + } + body = bytes.NewReader(data) + } + + req, err := http.NewRequestWithContext(reqCtx, method, targetURL.String(), body) + if err != nil { + return err + } + req.Header.Set("Accept", "application/json") + if payload != nil { + req.Header.Set("Content-Type", "application/json") + } + if clientVersion := strings.TrimSpace(version.Version); clientVersion != "" { + req.Header.Set(cloudProxyClientVersionHeader, clientVersion) + } + + if err := cloudProxySignRequest(req.Context(), req); err != nil { + return err + } + + resp, err := c.client.Do(req) + if err != nil { + return err + } + defer resp.Body.Close() + + data, err := io.ReadAll(resp.Body) + if err != nil { + return err + } + if resp.StatusCode >= http.StatusBadRequest { + return modelShowStatusError(resp, data) + } + + if out == nil { + return nil + } + return json.Unmarshal(data, out) +} + +// modelShowStatusError preserves the important error shape from cloud +// responses, including AuthorizationError for 401s and StatusError otherwise. +func modelShowStatusError(resp *http.Response, body []byte) error { + if resp.StatusCode == http.StatusUnauthorized { + err := api.AuthorizationError{ + StatusCode: resp.StatusCode, + Status: resp.Status, + } + _ = json.Unmarshal(body, &err) + if err.Status == "" { + err.Status = resp.Status + } + return err + } + + statusErr := api.StatusError{ + StatusCode: resp.StatusCode, + Status: resp.Status, + } + if err := json.Unmarshal(body, &statusErr); err != nil || statusErr.ErrorMessage == "" { + statusErr.ErrorMessage = strings.TrimSpace(string(body)) + } + return statusErr +} + +// modelShowLocalKeyForRequest normalizes a local show request to the canonical +// on-disk model name and returns the current manifest digest used to validate +// the cached entry. +func modelShowLocalKeyForRequest(req api.ShowRequest) (modelShowLocalKey, string, error) { + name := model.ParseName(req.Model) + if !name.IsValid() { + return modelShowLocalKey{}, "", model.Unqualified(name) + } + name, err := getExistingName(name) + if err != nil { + return modelShowLocalKey{}, "", err + } + + mf, err := manifest.ParseNamedManifest(name) + if err != nil { + return modelShowLocalKey{}, "", err + } + + return modelShowLocalKey{ + Model: name.String(), + Verbose: req.Verbose, + }, mf.Digest(), nil +} + +func modelShowCloudKeyForModel(modelName string, verbose bool) modelShowCloudKey { + return modelShowCloudKey{ + Model: modelShowNormalizeCloudModel(modelName), + Verbose: verbose, + } +} + +// modelShowNormalizeCloudModel strips explicit cloud source syntax, including +// legacy "-cloud" tags, so :cloud and -cloud forms share a cache entry. +func modelShowNormalizeCloudModel(modelName string) string { + modelName = strings.TrimSpace(modelName) + if modelName == "" { + return "" + } + if base, stripped := modelref.StripCloudSourceTag(modelName); stripped { + return strings.TrimSpace(base) + } + return modelName +} + +// modelShowManifestIsRemote checks whether a manifest represents a local stub +// for a remote model. Startup hydration skips these so the local content cache +// does not store entries whose freshness is governed by cloud state. +func modelShowManifestIsRemote(mf *manifest.Manifest) bool { + if mf == nil || mf.Config.Digest == "" { + return false + } + + f, err := mf.Config.Open() + if err != nil { + slog.Warn("failed to open manifest config while checking model show cache eligibility", "error", err) + return false + } + defer f.Close() + + var cfg model.ConfigV2 + if err := json.NewDecoder(f).Decode(&cfg); err != nil { + slog.Warn("failed to decode manifest config while checking model show cache eligibility", "error", err) + return false + } + + return cfg.RemoteHost != "" || cfg.RemoteModel != "" +} + +// cloneShowResponse deep-copies mutable fields of api.ShowResponse before +// storing or returning cached entries. The response contains maps and slices, +// and some handlers mutate ModelInfo before writing JSON. +func cloneShowResponse(in *api.ShowResponse) *api.ShowResponse { + if in == nil { + return nil + } + + out := *in + out.Details.Families = slices.Clone(in.Details.Families) + out.Messages = cloneMessages(in.Messages) + out.Capabilities = slices.Clone(in.Capabilities) + out.ModelInfo = cloneAnyMap(in.ModelInfo) + out.ProjectorInfo = cloneAnyMap(in.ProjectorInfo) + out.Tensors = cloneTensors(in.Tensors) + return &out +} + +func cloneMessages(in []api.Message) []api.Message { + if in == nil { + return nil + } + out := make([]api.Message, len(in)) + for i, msg := range in { + out[i] = msg + if msg.Images != nil { + out[i].Images = make([]api.ImageData, len(msg.Images)) + for j, image := range msg.Images { + out[i].Images[j] = slices.Clone(image) + } + } + out[i].ToolCalls = slices.Clone(msg.ToolCalls) + } + return out +} + +func cloneTensors(in []api.Tensor) []api.Tensor { + if in == nil { + return nil + } + out := make([]api.Tensor, len(in)) + for i, tensor := range in { + out[i] = tensor + out[i].Shape = slices.Clone(tensor.Shape) + } + return out +} + +func cloneAnyMap(in map[string]any) map[string]any { + if in == nil { + return nil + } + out := make(map[string]any, len(in)) + for k, v := range in { + out[k] = cloneAny(v) + } + return out +} + +func cloneAny(v any) any { + switch v := v.(type) { + case map[string]any: + return cloneAnyMap(v) + case []any: + out := make([]any, len(v)) + for i, item := range v { + out[i] = cloneAny(item) + } + return out + case []string: + return slices.Clone(v) + case []bool: + return slices.Clone(v) + case []int: + return slices.Clone(v) + case []int8: + return slices.Clone(v) + case []int16: + return slices.Clone(v) + case []int32: + return slices.Clone(v) + case []int64: + return slices.Clone(v) + case []uint: + return slices.Clone(v) + case []uint8: + return slices.Clone(v) + case []uint16: + return slices.Clone(v) + case []uint32: + return slices.Clone(v) + case []uint64: + return slices.Clone(v) + case []float32: + return slices.Clone(v) + case []float64: + return slices.Clone(v) + default: + return v + } +} diff --git a/server/model_show_cache_test.go b/server/model_show_cache_test.go new file mode 100644 index 000000000..8b8175c66 --- /dev/null +++ b/server/model_show_cache_test.go @@ -0,0 +1,501 @@ +package server + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "net/http" + "net/http/httptest" + "slices" + "strings" + "sync" + "testing" + "time" + + "github.com/gin-gonic/gin" + + "github.com/ollama/ollama/api" + "github.com/ollama/ollama/envconfig" + internalcloud "github.com/ollama/ollama/internal/cloud" + "github.com/ollama/ollama/manifest" + modelpkg "github.com/ollama/ollama/types/model" +) + +func TestModelShowCacheLocalHitUsesManifestDigest(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + createShowCacheModel(t, "show-cache-local", map[string]any{"test.context_length": uint32(1024)}) + + cache := newModelShowCache() + calls := 0 + cache.getModelInfo = func(req api.ShowRequest) (*api.ShowResponse, error) { + calls++ + return showCacheTestResponse(calls, req.Verbose), nil + } + + first, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-local"}) + if err != nil { + t.Fatalf("first GetLocal failed: %v", err) + } + second, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-local"}) + if err != nil { + t.Fatalf("second GetLocal failed: %v", err) + } + + if calls != 1 { + t.Fatalf("getModelInfo calls = %d, want 1", calls) + } + if first.ModelInfo["call"] != 1 || second.ModelInfo["call"] != 1 { + t.Fatalf("cached call markers = %v / %v, want both 1", first.ModelInfo["call"], second.ModelInfo["call"]) + } +} + +func TestModelShowCacheLocalManifestDigestChangeRefreshes(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + createShowCacheModel(t, "show-cache-refresh", map[string]any{"test.context_length": uint32(1024)}) + + cache := newModelShowCache() + calls := 0 + cache.getModelInfo = func(req api.ShowRequest) (*api.ShowResponse, error) { + calls++ + return showCacheTestResponse(calls, req.Verbose), nil + } + + if _, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-refresh"}); err != nil { + t.Fatalf("first GetLocal failed: %v", err) + } + changeShowCacheManifest(t, "show-cache-refresh") + refreshed, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-refresh"}) + if err != nil { + t.Fatalf("refreshed GetLocal failed: %v", err) + } + + if calls != 2 { + t.Fatalf("getModelInfo calls = %d, want 2", calls) + } + if refreshed.ModelInfo["call"] != 2 { + t.Fatalf("refreshed call marker = %v, want 2", refreshed.ModelInfo["call"]) + } + if len(cache.local) != 1 { + t.Fatalf("local cache entries = %d, want 1", len(cache.local)) + } +} + +func TestModelShowCacheLocalVerboseVariantsAreSeparate(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + createShowCacheModel(t, "show-cache-verbose", map[string]any{"test.context_length": uint32(1024)}) + + cache := newModelShowCache() + calls := 0 + cache.getModelInfo = func(req api.ShowRequest) (*api.ShowResponse, error) { + calls++ + return showCacheTestResponse(calls, req.Verbose), nil + } + + plain, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-verbose"}) + if err != nil { + t.Fatalf("plain GetLocal failed: %v", err) + } + verbose, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-verbose", Verbose: true}) + if err != nil { + t.Fatalf("verbose GetLocal failed: %v", err) + } + plainAgain, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-verbose"}) + if err != nil { + t.Fatalf("plain repeat GetLocal failed: %v", err) + } + + if calls != 2 { + t.Fatalf("getModelInfo calls = %d, want 2", calls) + } + if plain.ModelInfo["verbose"] != false || verbose.ModelInfo["verbose"] != true || plainAgain.ModelInfo["call"] != 1 { + t.Fatalf("unexpected verbose cache markers: plain=%v verbose=%v plainAgainCall=%v", plain.ModelInfo, verbose.ModelInfo, plainAgain.ModelInfo["call"]) + } +} + +func TestModelShowCacheLocalHydrationSkipsUnchangedInMemory(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + createShowCacheModel(t, "show-cache-hydrate", map[string]any{"test.context_length": uint32(1024)}) + + cache := newModelShowCache() + calls := 0 + cache.getModelInfo = func(req api.ShowRequest) (*api.ShowResponse, error) { + calls++ + return showCacheTestResponse(calls, req.Verbose), nil + } + + if err := cache.hydrateLocal(context.Background()); err != nil { + t.Fatalf("first hydrateLocal failed: %v", err) + } + if err := cache.hydrateLocal(context.Background()); err != nil { + t.Fatalf("second hydrateLocal failed: %v", err) + } + resp, err := cache.GetLocal(api.ShowRequest{Model: "show-cache-hydrate"}) + if err != nil { + t.Fatalf("GetLocal after hydration failed: %v", err) + } + + if calls != 1 { + t.Fatalf("getModelInfo calls after unchanged in-memory hydration = %d, want 1", calls) + } + if resp.ModelInfo["call"] != 1 { + t.Fatalf("hydrated call marker = %v, want 1", resp.ModelInfo["call"]) + } +} + +func TestModelShowCacheBypassesSystemAndOptionsOverlays(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + createShowCacheModel(t, "show-cache-overlay", map[string]any{"test.context_length": uint32(1024)}) + + cache := newModelShowCache() + key, digest, err := modelShowLocalKeyForRequest(api.ShowRequest{Model: "show-cache-overlay"}) + if err != nil { + t.Fatalf("local key failed: %v", err) + } + cache.setLocal(key, digest, &api.ShowResponse{System: "cached", ModelInfo: map[string]any{}}) + + s := Server{modelCaches: &modelCaches{show: cache}} + w := createRequest(t, s.ShowHandler, api.ShowRequest{ + Model: "show-cache-overlay", + System: "overlay-system", + }) + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want 200: %s", w.Code, w.Body.String()) + } + + var resp api.ShowResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatalf("decode response failed: %v", err) + } + if resp.System != "overlay-system" { + t.Fatalf("system = %q, want overlay-system", resp.System) + } + + w = createRequest(t, s.ShowHandler, api.ShowRequest{ + Model: "show-cache-overlay", + Options: map[string]any{"num_ctx": float64(8192)}, + }) + if w.Code != http.StatusOK { + t.Fatalf("options overlay status = %d, want 200: %s", w.Code, w.Body.String()) + } + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatalf("decode options response failed: %v", err) + } + if resp.System == "cached" { + t.Fatalf("options overlay unexpectedly returned cached response") + } +} + +func TestModelShowCacheLocalAndCloudSameBaseDoNotCollide(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + createShowCacheModel(t, "show-cache-dual", map[string]any{"test.context_length": uint32(1024)}) + + cache := newModelShowCache() + cache.getModelInfo = func(req api.ShowRequest) (*api.ShowResponse, error) { + return &api.ShowResponse{ + Details: api.ModelDetails{Format: "local"}, + ModelInfo: map[string]any{}, + }, nil + } + cloudKey := modelShowCloudKeyForModel("show-cache-dual:cloud", false) + cache.setCloud(cloudKey, &api.ShowResponse{ + Details: api.ModelDetails{Format: "cloud"}, + ModelInfo: map[string]any{}, + }) + cache.mu.Lock() + cache.cloudNextReadRefreshAfter[cloudKey] = time.Now().Add(time.Hour) + cache.mu.Unlock() + + s := Server{modelCaches: &modelCaches{show: cache}} + + w := createRequest(t, s.ShowHandler, api.ShowRequest{Model: "show-cache-dual:cloud"}) + if w.Code != http.StatusOK { + t.Fatalf("cloud status = %d, want 200: %s", w.Code, w.Body.String()) + } + var cloudResp api.ShowResponse + if err := json.NewDecoder(w.Body).Decode(&cloudResp); err != nil { + t.Fatalf("decode cloud response failed: %v", err) + } + if cloudResp.Details.Format != "cloud" { + t.Fatalf("cloud format = %q, want cloud", cloudResp.Details.Format) + } + + w = createRequest(t, s.ShowHandler, api.ShowRequest{Model: "show-cache-dual"}) + if w.Code != http.StatusOK { + t.Fatalf("local status = %d, want 200: %s", w.Code, w.Body.String()) + } + var localResp api.ShowResponse + if err := json.NewDecoder(w.Body).Decode(&localResp); err != nil { + t.Fatalf("decode local response failed: %v", err) + } + if localResp.Details.Format != "local" { + t.Fatalf("local format = %q, want local", localResp.Details.Format) + } +} + +func TestModelShowCacheCloudWarmHitReturnsStaleAndRefreshes(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + + refreshDone := make(chan struct{}) + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/api/show" { + t.Fatalf("unexpected upstream path %q", r.URL.Path) + } + defer close(refreshDone) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"details":{"format":"updated"},"model_info":{"source":"fresh"}}`)) + })) + defer upstream.Close() + withCloudProxyBaseURL(t, upstream.URL) + + cache := newModelShowCache() + cache.client = upstream.Client() + cache.setCloud(modelShowCloudKeyForModel("kimi-k2.5:cloud", false), &api.ShowResponse{ + Details: api.ModelDetails{Format: "cached"}, + ModelInfo: map[string]any{"source": "stale"}, + }) + + s := Server{modelCaches: &modelCaches{show: cache}} + w := createRequest(t, s.ShowHandler, api.ShowRequest{Model: "kimi-k2.5:cloud"}) + if w.Code != http.StatusOK { + t.Fatalf("status = %d, want 200: %s", w.Code, w.Body.String()) + } + + var resp api.ShowResponse + if err := json.NewDecoder(w.Body).Decode(&resp); err != nil { + t.Fatalf("decode response failed: %v", err) + } + if resp.Details.Format != "cached" { + t.Fatalf("format = %q, want cached", resp.Details.Format) + } + + select { + case <-refreshDone: + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for cloud refresh") + } + waitForCondition(t, 2*time.Second, func() bool { + resp, ok := cache.getCloud(modelShowCloudKeyForModel("kimi-k2.5:cloud", false)) + return ok && resp.Details.Format == "updated" + }) +} + +func TestModelShowCacheCloudColdMissFallsBackToProxy(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + + var capturedPath, capturedBody string + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + capturedPath = r.URL.Path + body, _ := io.ReadAll(r.Body) + capturedBody = string(body) + w.Header().Set("Content-Type", "application/json") + _, _ = w.Write([]byte(`{"details":{"format":"cold"},"model_info":{}}`)) + })) + defer upstream.Close() + withCloudProxyBaseURL(t, upstream.URL) + + s := &Server{modelCaches: &modelCaches{show: newModelShowCache()}} + router, err := s.GenerateRoutes(nil) + if err != nil { + t.Fatalf("GenerateRoutes failed: %v", err) + } + local := httptest.NewServer(router) + defer local.Close() + + req, err := http.NewRequestWithContext(t.Context(), http.MethodPost, local.URL+"/api/show", bytes.NewBufferString(`{"model":"kimi-k2.5:cloud"}`)) + if err != nil { + t.Fatal(err) + } + req.Header.Set("Content-Type", "application/json") + resp, err := local.Client().Do(req) + if err != nil { + t.Fatal(err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + t.Fatalf("status = %d, want 200: %s", resp.StatusCode, string(body)) + } + if capturedPath != "/api/show" { + t.Fatalf("upstream path = %q, want /api/show", capturedPath) + } + if !strings.Contains(capturedBody, `"model":"kimi-k2.5"`) { + t.Fatalf("expected normalized model in upstream body, got %q", capturedBody) + } +} + +func TestModelShowCacheCloudHydrationUsesTagsAndShow(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + + var mu sync.Mutex + var showModels []string + var tagsCalled bool + upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json") + switch r.URL.Path { + case "/api/tags": + if r.Method != http.MethodGet { + t.Fatalf("tags method = %s, want GET", r.Method) + } + mu.Lock() + tagsCalled = true + mu.Unlock() + _, _ = w.Write([]byte(`{"models":[{"name":"alpha:cloud"},{"model":"beta"}]}`)) + case "/api/show": + var req api.ShowRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode show request failed: %v", err) + } + mu.Lock() + showModels = append(showModels, req.Model) + mu.Unlock() + _ = json.NewEncoder(w).Encode(api.ShowResponse{ + Details: api.ModelDetails{Format: req.Model}, + ModelInfo: map[string]any{"model": req.Model}, + }) + default: + t.Fatalf("unexpected upstream path %q", r.URL.Path) + } + })) + defer upstream.Close() + withCloudProxyBaseURL(t, upstream.URL) + + cache := newModelShowCache() + cache.client = upstream.Client() + if err := cache.hydrateCloud(context.Background()); err != nil { + t.Fatalf("hydrateCloud failed: %v", err) + } + + mu.Lock() + gotTagsCalled := tagsCalled + gotShowModels := slices.Clone(showModels) + mu.Unlock() + slices.Sort(gotShowModels) + + if !gotTagsCalled { + t.Fatal("expected /api/tags to be called") + } + if !slices.Equal(gotShowModels, []string{"alpha", "beta"}) { + t.Fatalf("show models = %v, want [alpha beta]", gotShowModels) + } + for _, modelName := range gotShowModels { + resp, ok := cache.getCloud(modelShowCloudKeyForModel(modelName, false)) + if !ok { + t.Fatalf("missing cached cloud show response for %s", modelName) + } + if resp.Details.Format != modelName { + t.Fatalf("cached format for %s = %q", modelName, resp.Details.Format) + } + } +} + +func TestModelShowCacheCloudKeyNormalizesSourceTags(t *testing.T) { + tests := map[string]string{ + " kimi-k2.5:cloud ": "kimi-k2.5", + "gpt-oss:20b-cloud": "gpt-oss:20b", + "qwen3": "qwen3", + } + + for input, want := range tests { + if got := modelShowCloudKeyForModel(input, false).Model; got != want { + t.Fatalf("cloud key model for %q = %q, want %q", input, got, want) + } + } +} + +func TestModelShowCacheCloudDisabledDoesNotServeStale(t *testing.T) { + gin.SetMode(gin.TestMode) + setTestHome(t, t.TempDir()) + + t.Cleanup(envconfig.ReloadServerConfig) + t.Setenv("OLLAMA_NO_CLOUD", "1") + envconfig.ReloadServerConfig() + + cache := newModelShowCache() + cache.setCloud(modelShowCloudKeyForModel("kimi-k2.5:cloud", false), &api.ShowResponse{ + Details: api.ModelDetails{Format: "cached"}, + ModelInfo: map[string]any{}, + }) + + if err := cache.hydrateCloud(context.Background()); !errors.Is(err, errModelShowNoCloud) { + t.Fatalf("hydrateCloud error = %v, want %v", err, errModelShowNoCloud) + } + + s := Server{modelCaches: &modelCaches{show: cache}} + w := createRequest(t, s.ShowHandler, api.ShowRequest{Model: "kimi-k2.5:cloud"}) + if w.Code != http.StatusForbidden { + t.Fatalf("status = %d, want 403: %s", w.Code, w.Body.String()) + } + if !strings.Contains(w.Body.String(), internalcloud.DisabledError(cloudErrRemoteModelDetailsUnavailable)) { + t.Fatalf("unexpected disabled response: %s", w.Body.String()) + } +} + +func createShowCacheModel(t *testing.T, name string, kv map[string]any) { + t.Helper() + _, digest := createBinFile(t, kv, nil) + var s Server + w := createRequest(t, s.CreateHandler, api.CreateRequest{ + Model: name, + Files: map[string]string{"model.gguf": digest}, + Stream: &stream, + }) + if w.Code != http.StatusOK { + t.Fatalf("create model status = %d, want 200: %s", w.Code, w.Body.String()) + } +} + +func changeShowCacheManifest(t *testing.T, name string) { + t.Helper() + n, err := getExistingName(modelpkg.ParseName(name)) + if err != nil { + t.Fatalf("get existing name: %v", err) + } + mf, err := manifest.ParseNamedManifest(n) + if err != nil { + t.Fatalf("parse manifest: %v", err) + } + layer, err := manifest.NewLayer(strings.NewReader("changed"), "application/vnd.ollama.image.system") + if err != nil { + t.Fatalf("new layer: %v", err) + } + layers := append([]manifest.Layer(nil), mf.Layers...) + layers = append(layers, layer) + if err := manifest.WriteManifest(n, mf.Config, layers); err != nil { + t.Fatalf("write manifest: %v", err) + } +} + +func showCacheTestResponse(call int, verbose bool) *api.ShowResponse { + return &api.ShowResponse{ + Details: api.ModelDetails{ + Format: "gguf", + Family: "test", + }, + Capabilities: []modelpkg.Capability{modelpkg.CapabilityCompletion}, + ModelInfo: map[string]any{ + "call": call, + "verbose": verbose, + }, + } +} + +func withCloudProxyBaseURL(t *testing.T, url string) { + t.Helper() + original := cloudProxyBaseURL + cloudProxyBaseURL = url + t.Cleanup(func() { + cloudProxyBaseURL = original + }) +} diff --git a/server/routes.go b/server/routes.go index 32cd99cd0..646142d8b 100644 --- a/server/routes.go +++ b/server/routes.go @@ -98,11 +98,11 @@ var useClient2 = experimentEnabled("client2") var mode string = gin.DebugMode type Server struct { - addr net.Addr - sched *Scheduler - defaultNumCtx int - requestLogger *inferenceRequestLogger - modelRecommendations *modelRecommendationsCache + addr net.Addr + sched *Scheduler + defaultNumCtx int + requestLogger *inferenceRequestLogger + modelCaches *modelCaches } func init() { @@ -1143,13 +1143,33 @@ func (s *Server) ShowHandler(c *gin.Context) { if modelRef.Source == modelSourceCloud { req.Model = modelRef.Base + if modelShowCacheable(req) && s.modelCaches != nil && s.modelCaches.show != nil { + if disabled, _ := internalcloud.Status(); disabled { + c.JSON(http.StatusForbidden, gin.H{"error": internalcloud.DisabledError(cloudErrRemoteModelDetailsUnavailable)}) + return + } + + ctx := context.Background() + if c.Request != nil { + ctx = c.Request.Context() + } + if resp, ok := s.modelCaches.show.GetCloudSWR(ctx, req); ok { + c.JSON(http.StatusOK, resp) + return + } + } proxyCloudJSONRequest(c, req, cloudErrRemoteModelDetailsUnavailable) return } req.Model = modelRef.Base - resp, err := GetModelInfo(req) + var resp *api.ShowResponse + if modelShowCacheable(req) && s.modelCaches != nil && s.modelCaches.show != nil { + resp, err = s.modelCaches.show.GetLocal(req) + } else { + resp, err = GetModelInfo(req) + } if err != nil { var statusErr api.StatusError switch { @@ -1762,12 +1782,12 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { func (s *Server) ModelRecommendationsExperimentalHandler(c *gin.Context) { recs := defaultModelRecommendations source := "default" - if s.modelRecommendations != nil { + if s.modelCaches != nil && s.modelCaches.recommendations != nil { ctx := context.Background() if c.Request != nil { ctx = c.Request.Context() } - recs = s.modelRecommendations.GetSWR(ctx) + recs = s.modelCaches.recommendations.GetSWR(ctx) source = "cache" } @@ -1811,12 +1831,9 @@ func Serve(ln net.Listener) error { } } - // TODO(parthsareen): If we add more runtime caches, prefer introducing a - // small cache manager owned by Server (for shared start/stop/health wiring) - // instead of adding one top-level field per cache here in Serve. s := &Server{ - addr: ln.Addr(), - modelRecommendations: newModelRecommendationsCache(), + addr: ln.Addr(), + modelCaches: newModelCaches(), } if err := s.initRequestLogging(); err != nil { return err @@ -1842,7 +1859,7 @@ func Serve(ln net.Listener) error { schedCtx, schedDone := context.WithCancel(ctx) sched := InitScheduler(schedCtx) s.sched = sched - s.modelRecommendations.Start(ctx) + s.modelCaches.Start(ctx) slog.Info(fmt.Sprintf("Listening on %s (version %s)", ln.Addr(), version.Version)) srvr := &http.Server{