llm: Don't always evict models on CPU-only systems

Model eviction happens when at least one other model is loaded and
we are unable to load all layers into VRAM. However, on CPU-only
systems we can never load layers into VRAM, so this condition
constantly triggered eviction even though evicting could not free
any VRAM.

Fixes #13227
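
The change below gates eviction on the presence of a GPU. As a rough illustration, here is a minimal, self-contained sketch of that gating idea; the names (gpuInfo, shouldEvict) are hypothetical and not the actual ollama types or functions:

```go
package main

import "fmt"

// gpuInfo is a stand-in for a detected GPU device (hypothetical type).
type gpuInfo struct{ Name string }

// shouldEvict reports whether a load should fail with a "need more VRAM,
// evict another model" error. On CPU-only systems (no GPUs) eviction can
// never free usable VRAM, so it is skipped.
func shouldEvict(gpus []gpuInfo, loadedLayers, totalLayers, numGPU int) bool {
	if len(gpus) == 0 {
		return false // CPU-only: evicting other models cannot help
	}
	// Evict only if we could not place every requested layer on the GPUs.
	return loadedLayers < totalLayers && (numGPU < 0 || loadedLayers < numGPU)
}

func main() {
	fmt.Println(shouldEvict(nil, 0, 50, -1))                  // false: CPU-only system
	fmt.Println(shouldEvict([]gpuInfo{{"gpu0"}}, 10, 50, -1)) // true: partial offload with a GPU present
}
```

With no GPUs present, the load simply proceeds on the CPU instead of evicting another model.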
Jesse Gross
2025-11-25 14:51:02 -08:00
committed by Jesse Gross
parent d771043e88
commit 5317202c38
2 changed files with 10 additions and 9 deletions


@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
}}
}
gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
if err != nil {
return nil, err
}
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
}
// verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
// These sizes will only increase as we go through additional iterations and get additional information.
cpuSize := memory.InputWeights + memory.CPU.Graph
var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
}
if requireFull {
-if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
return ErrLoadRequiredFull
}
@@ -998,7 +998,7 @@ nextLayer:
}
}
-if gpuLayers.Sum() == 0 {
+if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
slog.Debug("insufficient VRAM to load any model layers")
}


@@ -26,10 +26,11 @@ func TestLLMServerFitGPU(t *testing.T) {
expectedErr error
}{
{
-name:     "No GPU",
-layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-numGPU:   -1,
-expected: ml.GPULayersList{},
+name:        "No GPU",
+layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+numGPU:      -1,
+expected:    ml.GPULayersList{},
+requireFull: true, // Should not try to evict even though we can't load any layers
},
{
name: "Full single GPU",