llm: Don't always evict models on CPU-only systems
Model eviction happens when we have at least one other model loaded and are unable to load all layers into VRAM. However, on CPU-only systems we can never load layers into VRAM, so this check constantly triggered eviction.

Fixes #13227
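In rough terms, the decision this commit changes can be sketched as the standalone Go snippet below. shouldEvict and its parameters are illustrative stand-ins, not the actual (*llmServer).verifyLayout signature; only the condition mirrors the diff that follows.

package main

import "fmt"

// shouldEvict sketches the post-patch eviction decision: ask the scheduler to
// evict another model only when at least one GPU exists, the model does not
// fully fit in VRAM, and the user has not capped the layer count below what
// was offloaded. Before this patch the GPU-count guard was missing, so a
// CPU-only host (no GPUs, zero offloaded layers) always requested eviction
// even though freeing VRAM could never help it.
func shouldEvict(numGPUs, offloadedLayers, totalLayers, requestedNumGPU int, requireFull bool) bool {
	if !requireFull {
		return false
	}
	return numGPUs > 0 &&
		offloadedLayers < totalLayers &&
		(requestedNumGPU < 0 || offloadedLayers < requestedNumGPU)
}

func main() {
	fmt.Println(shouldEvict(0, 0, 32, -1, true))  // CPU-only: false, no pointless eviction
	fmt.Println(shouldEvict(1, 10, 32, -1, true)) // GPU present but model doesn't fit: true
}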
@@ -874,7 +874,7 @@ func (s *llmServer) createLayout(systemInfo ml.SystemInfo, systemGPUs []ml.Devic
         }}
     }
     gpuLayers, layers := s.buildLayout(systemGPUs, memory, requireFull, backoff)
-    err := s.verifyLayout(systemInfo, memory, requireFull, gpuLayers, layers)
+    err := s.verifyLayout(systemInfo, systemGPUs, memory, requireFull, gpuLayers, layers)
     if err != nil {
         return nil, err
     }
@@ -943,7 +943,7 @@ func (s *llmServer) buildLayout(systemGPUs []ml.DeviceInfo, memory *ml.BackendMe
 }

 // verifyLayout ensures that we don't exceed limits, such as requirements about partial offloading or system memory
-func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
+func (s *llmServer) verifyLayout(systemInfo ml.SystemInfo, systemGPUs []ml.DeviceInfo, memory *ml.BackendMemory, requireFull bool, gpuLayers ml.GPULayersList, layers []uint64) error {
     // These sizes will only increase as we go through additional iterations and get additional information.
     cpuSize := memory.InputWeights + memory.CPU.Graph
     var vramSize uint64
@@ -970,8 +970,8 @@ nextLayer:
     }

     if requireFull {
-        if gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
-            slog.Info("model requires more memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
+        if len(systemGPUs) > 0 && gpuLayers.Sum() < len(layers) && (s.options.NumGPU < 0 || gpuLayers.Sum() < s.options.NumGPU) {
+            slog.Info("model requires more gpu memory than is currently available, evicting a model to make space", "loaded layers", gpuLayers.Sum())
             return ErrLoadRequiredFull
         }

@@ -998,7 +998,7 @@ nextLayer:
         }
     }

-    if gpuLayers.Sum() == 0 {
+    if len(systemGPUs) > 0 && gpuLayers.Sum() == 0 {
         slog.Debug("insufficient VRAM to load any model layers")
     }

@@ -26,10 +26,11 @@ func TestLLMServerFitGPU(t *testing.T) {
         expectedErr error
     }{
         {
-            name:     "No GPU",
-            layers:   []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
-            numGPU:   -1,
-            expected: ml.GPULayersList{},
+            name:        "No GPU",
+            layers:      []int{50 * format.MebiByte, 50 * format.MebiByte, 50 * format.MebiByte},
+            numGPU:      -1,
+            expected:    ml.GPULayersList{},
+            requireFull: true, // Should not try to evict even though we can't load any layers
         },
         {
             name: "Full single GPU",
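The added "No GPU" test case pins down the new behavior: with requireFull set and no GPUs, fitting must succeed with an empty GPU layer list rather than returning the error that triggers eviction. A self-contained sketch of that expectation follows; fitGPULayers and errLoadRequiredFull are illustrative stand-ins, not ollama's real test harness or API.

package llm_test

import (
	"errors"
	"testing"
)

// errLoadRequiredFull stands in for the error a server returns when it wants
// the scheduler to evict another model to free VRAM.
var errLoadRequiredFull = errors.New("model needs more GPU memory; evict another model")

// fitGPULayers is a stand-in for the layer-fitting logic under test: with zero
// GPUs it reports zero offloaded layers and no error, which is the behavior
// the new "No GPU" case asserts.
func fitGPULayers(numGPUs, totalLayers int, requireFull bool) (int, error) {
	if numGPUs == 0 {
		return 0, nil // nothing to offload, and nothing eviction could free
	}
	// Real fitting logic elided; pretend nothing fits on the GPU.
	if requireFull {
		return 0, errLoadRequiredFull
	}
	return 0, nil
}

func TestNoGPUDoesNotRequestEviction(t *testing.T) {
	gpuLayers, err := fitGPULayers(0, 3, true)
	if err != nil {
		t.Fatalf("CPU-only fit returned %v; expected success without eviction", err)
	}
	if gpuLayers != 0 {
		t.Fatalf("expected 0 GPU layers on a CPU-only system, got %d", gpuLayers)
	}
}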