From 2e9d9acf18d24f42357b87bd34bba2ba8cdb211e Mon Sep 17 00:00:00 2001 From: Devon Rifkin Date: Thu, 5 Feb 2026 14:57:04 -0800 Subject: [PATCH] add ability to turn on debug request logging If `OLLAMA_DEBUG_LOG_REQUESTS` is set, then on server startup a temp folder will be created. Upon any inference request, the body will be logged to a file in this folder, as well as a small shell script to "replay" the request using cURL. This is just intended for debugging scenarios, not as something to turn on normally. --- envconfig/config.go | 43 ++++----- server/inference_request_log.go | 144 ++++++++++++++++++++++++++++++ server/routes.go | 16 ++-- server/routes_request_log_test.go | 128 ++++++++++++++++++++++++++ 4 files changed, 305 insertions(+), 26 deletions(-) create mode 100644 server/inference_request_log.go create mode 100644 server/routes_request_log_test.go diff --git a/envconfig/config.go b/envconfig/config.go index a7466a84a..72622ef05 100644 --- a/envconfig/config.go +++ b/envconfig/config.go @@ -188,6 +188,8 @@ func LogLevel() slog.Level { var ( // FlashAttention enables the experimental flash attention feature. FlashAttention = BoolWithDefault("OLLAMA_FLASH_ATTENTION") + // DebugLogRequests logs inference requests to disk for replay/debugging. + DebugLogRequests = Bool("OLLAMA_DEBUG_LOG_REQUESTS") // KvCacheType is the quantization type for the K/V cache. KvCacheType = String("OLLAMA_KV_CACHE_TYPE") // NoHistory disables readline history. @@ -273,26 +275,27 @@ type EnvVar struct { func AsMap() map[string]EnvVar { ret := map[string]EnvVar{ - "OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, - "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(false), "Enabled flash attention"}, - "OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"}, - "OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"}, - "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, - "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, - "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"}, - "OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"}, - "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"}, - "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"}, - "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, - "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"}, - "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, - "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, - "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, - "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, - "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, - "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"}, - "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, - "OLLAMA_REMOTES": {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"}, + "OLLAMA_DEBUG": {"OLLAMA_DEBUG", LogLevel(), "Show additional debug information (e.g. OLLAMA_DEBUG=1)"}, + "OLLAMA_DEBUG_LOG_REQUESTS": {"OLLAMA_DEBUG_LOG_REQUESTS", DebugLogRequests(), "Log inference request bodies and replay curl commands to a temp directory"}, + "OLLAMA_FLASH_ATTENTION": {"OLLAMA_FLASH_ATTENTION", FlashAttention(false), "Enabled flash attention"}, + "OLLAMA_KV_CACHE_TYPE": {"OLLAMA_KV_CACHE_TYPE", KvCacheType(), "Quantization type for the K/V cache (default: f16)"}, + "OLLAMA_GPU_OVERHEAD": {"OLLAMA_GPU_OVERHEAD", GpuOverhead(), "Reserve a portion of VRAM per GPU (bytes)"}, + "OLLAMA_HOST": {"OLLAMA_HOST", Host(), "IP Address for the ollama server (default 127.0.0.1:11434)"}, + "OLLAMA_KEEP_ALIVE": {"OLLAMA_KEEP_ALIVE", KeepAlive(), "The duration that models stay loaded in memory (default \"5m\")"}, + "OLLAMA_LLM_LIBRARY": {"OLLAMA_LLM_LIBRARY", LLMLibrary(), "Set LLM library to bypass autodetection"}, + "OLLAMA_LOAD_TIMEOUT": {"OLLAMA_LOAD_TIMEOUT", LoadTimeout(), "How long to allow model loads to stall before giving up (default \"5m\")"}, + "OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners(), "Maximum number of loaded models per GPU"}, + "OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueue(), "Maximum number of queued requests"}, + "OLLAMA_MODELS": {"OLLAMA_MODELS", Models(), "The path to the models directory"}, + "OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"}, + "OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"}, + "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"}, + "OLLAMA_ORIGINS": {"OLLAMA_ORIGINS", AllowedOrigins(), "A comma separated list of allowed origins"}, + "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"}, + "OLLAMA_MULTIUSER_CACHE": {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"}, + "OLLAMA_CONTEXT_LENGTH": {"OLLAMA_CONTEXT_LENGTH", ContextLength(), "Context length to use unless otherwise specified (default: 4k/32k/256k based on VRAM)"}, + "OLLAMA_NEW_ENGINE": {"OLLAMA_NEW_ENGINE", NewEngine(), "Enable the new Ollama engine"}, + "OLLAMA_REMOTES": {"OLLAMA_REMOTES", Remotes(), "Allowed hosts for remote models (default \"ollama.com\")"}, // Informational "HTTP_PROXY": {"HTTP_PROXY", String("HTTP_PROXY")(), "HTTP proxy"}, diff --git a/server/inference_request_log.go b/server/inference_request_log.go new file mode 100644 index 000000000..0b382e577 --- /dev/null +++ b/server/inference_request_log.go @@ -0,0 +1,144 @@ +package server + +import ( + "bytes" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "strings" + "sync/atomic" + "time" + + "github.com/gin-gonic/gin" + + "github.com/ollama/ollama/envconfig" +) + +type inferenceRequestLogger struct { + dir string + counter uint64 +} + +func newInferenceRequestLogger() (*inferenceRequestLogger, error) { + dir, err := os.MkdirTemp("", "ollama-request-logs-*") + if err != nil { + return nil, err + } + + return &inferenceRequestLogger{dir: dir}, nil +} + +func (s *Server) initRequestLogging() error { + if !envconfig.DebugLogRequests() { + return nil + } + + requestLogger, err := newInferenceRequestLogger() + if err != nil { + return fmt.Errorf("enable OLLAMA_DEBUG_LOG_REQUESTS: %w", err) + } + + s.requestLogger = requestLogger + slog.Info(fmt.Sprintf("request debug logging enabled; inference request logs will be stored in %s and include request bodies and replay curl commands", requestLogger.dir)) + + return nil +} + +func (s *Server) withInferenceRequestLogging(route string, handlers ...gin.HandlerFunc) []gin.HandlerFunc { + if s.requestLogger == nil { + return handlers + } + + return append([]gin.HandlerFunc{s.requestLogger.middleware(route)}, handlers...) +} + +func (l *inferenceRequestLogger) middleware(route string) gin.HandlerFunc { + return func(c *gin.Context) { + if c.Request == nil { + c.Next() + return + } + + method := c.Request.Method + host := c.Request.Host + scheme := "http" + if c.Request.TLS != nil { + scheme = "https" + } + contentType := c.GetHeader("Content-Type") + + var body []byte + if c.Request.Body != nil { + var err error + body, err = io.ReadAll(c.Request.Body) + c.Request.Body = io.NopCloser(bytes.NewReader(body)) + if err != nil { + slog.Warn("failed to read request body for debug logging", "route", route, "error", err) + } + } + + c.Next() + l.log(route, method, scheme, host, contentType, body) + } +} + +func (l *inferenceRequestLogger) log(route, method, scheme, host, contentType string, body []byte) { + if l == nil || l.dir == "" { + return + } + + if contentType == "" { + contentType = "application/json" + } + if host == "" || scheme == "" { + base := envconfig.Host() + if host == "" { + host = base.Host + } + if scheme == "" { + scheme = base.Scheme + } + } + + routeForFilename := sanitizeRouteForFilename(route) + timestamp := fmt.Sprintf("%s-%06d", time.Now().UTC().Format("20060102T150405.000000000Z"), atomic.AddUint64(&l.counter, 1)) + bodyFilename := fmt.Sprintf("%s_%s_body.json", timestamp, routeForFilename) + curlFilename := fmt.Sprintf("%s_%s_request.sh", timestamp, routeForFilename) + bodyPath := filepath.Join(l.dir, bodyFilename) + curlPath := filepath.Join(l.dir, curlFilename) + + if err := os.WriteFile(bodyPath, body, 0o600); err != nil { + slog.Warn("failed to write debug request body", "route", route, "error", err) + return + } + + url := fmt.Sprintf("%s://%s%s", scheme, host, route) + curl := fmt.Sprintf("#!/bin/sh\nSCRIPT_DIR=\"$(CDPATH= cd -- \"$(dirname -- \"$0\")\" && pwd)\"\ncurl --request %s --url %q --header %q --data-binary @\"${SCRIPT_DIR}/%s\"\n", method, url, "Content-Type: "+contentType, bodyFilename) + if err := os.WriteFile(curlPath, []byte(curl), 0o600); err != nil { + slog.Warn("failed to write debug request replay command", "route", route, "error", err) + return + } + + slog.Info(fmt.Sprintf("logged to %s, replay using curl with `sh %s`", bodyPath, curlPath)) +} + +func sanitizeRouteForFilename(route string) string { + route = strings.TrimPrefix(route, "/") + if route == "" { + return "root" + } + + var b strings.Builder + b.Grow(len(route)) + for _, r := range route { + if ('a' <= r && r <= 'z') || ('A' <= r && r <= 'Z') || ('0' <= r && r <= '9') { + b.WriteRune(r) + } else { + b.WriteByte('_') + } + } + + return b.String() +} diff --git a/server/routes.go b/server/routes.go index 910b8e954..5fbe520d6 100644 --- a/server/routes.go +++ b/server/routes.go @@ -81,6 +81,7 @@ type Server struct { addr net.Addr sched *Scheduler defaultNumCtx int + requestLogger *inferenceRequestLogger } func init() { @@ -1583,24 +1584,24 @@ func (s *Server) GenerateRoutes(rc *ollama.Registry) (http.Handler, error) { // Inference r.GET("/api/ps", s.PsHandler) - r.POST("/api/generate", s.GenerateHandler) - r.POST("/api/chat", s.ChatHandler) + r.POST("/api/generate", s.withInferenceRequestLogging("/api/generate", s.GenerateHandler)...) + r.POST("/api/chat", s.withInferenceRequestLogging("/api/chat", s.ChatHandler)...) r.POST("/api/embed", s.EmbedHandler) r.POST("/api/embeddings", s.EmbeddingsHandler) // Inference (OpenAI compatibility) - r.POST("/v1/chat/completions", middleware.ChatMiddleware(), s.ChatHandler) - r.POST("/v1/completions", middleware.CompletionsMiddleware(), s.GenerateHandler) + r.POST("/v1/chat/completions", s.withInferenceRequestLogging("/v1/chat/completions", middleware.ChatMiddleware(), s.ChatHandler)...) + r.POST("/v1/completions", s.withInferenceRequestLogging("/v1/completions", middleware.CompletionsMiddleware(), s.GenerateHandler)...) r.POST("/v1/embeddings", middleware.EmbeddingsMiddleware(), s.EmbedHandler) r.GET("/v1/models", middleware.ListMiddleware(), s.ListHandler) r.GET("/v1/models/:model", middleware.RetrieveMiddleware(), s.ShowHandler) - r.POST("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler) + r.POST("/v1/responses", s.withInferenceRequestLogging("/v1/responses", middleware.ResponsesMiddleware(), s.ChatHandler)...) // OpenAI-compatible image generation endpoints r.POST("/v1/images/generations", middleware.ImageGenerationsMiddleware(), s.GenerateHandler) r.POST("/v1/images/edits", middleware.ImageEditsMiddleware(), s.GenerateHandler) // Inference (Anthropic compatibility) - r.POST("/v1/messages", middleware.AnthropicMessagesMiddleware(), s.ChatHandler) + r.POST("/v1/messages", s.withInferenceRequestLogging("/v1/messages", middleware.AnthropicMessagesMiddleware(), s.ChatHandler)...) if rc != nil { // wrap old with new @@ -1650,6 +1651,9 @@ func Serve(ln net.Listener) error { } s := &Server{addr: ln.Addr()} + if err := s.initRequestLogging(); err != nil { + return err + } var rc *ollama.Registry if useClient2 { diff --git a/server/routes_request_log_test.go b/server/routes_request_log_test.go new file mode 100644 index 000000000..c3148847e --- /dev/null +++ b/server/routes_request_log_test.go @@ -0,0 +1,128 @@ +package server + +import ( + "io" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/gin-gonic/gin" +) + +func TestInferenceRequestLoggerMiddlewareWritesReplayArtifacts(t *testing.T) { + gin.SetMode(gin.TestMode) + + logDir := t.TempDir() + requestLogger := &inferenceRequestLogger{dir: logDir} + + const route = "/v1/chat/completions" + const requestBody = `{"model":"test-model","messages":[{"role":"user","content":"hello"}]}` + + var bodySeenByHandler string + + r := gin.New() + r.POST(route, requestLogger.middleware(route), func(c *gin.Context) { + body, err := io.ReadAll(c.Request.Body) + if err != nil { + t.Fatalf("failed to read body in handler: %v", err) + } + + bodySeenByHandler = string(body) + c.Status(http.StatusOK) + }) + + req := httptest.NewRequest(http.MethodPost, route, strings.NewReader(requestBody)) + req.Host = "127.0.0.1:11434" + req.Header.Set("Content-Type", "application/json") + w := httptest.NewRecorder() + r.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected status 200, got %d", w.Code) + } + + if bodySeenByHandler != requestBody { + t.Fatalf("handler body mismatch:\nexpected: %s\ngot: %s", requestBody, bodySeenByHandler) + } + + bodyFiles, err := filepath.Glob(filepath.Join(logDir, "*_v1_chat_completions_body.json")) + if err != nil { + t.Fatalf("failed to glob body logs: %v", err) + } + if len(bodyFiles) != 1 { + t.Fatalf("expected 1 body log, got %d (%v)", len(bodyFiles), bodyFiles) + } + + curlFiles, err := filepath.Glob(filepath.Join(logDir, "*_v1_chat_completions_request.sh")) + if err != nil { + t.Fatalf("failed to glob curl logs: %v", err) + } + if len(curlFiles) != 1 { + t.Fatalf("expected 1 curl log, got %d (%v)", len(curlFiles), curlFiles) + } + + bodyData, err := os.ReadFile(bodyFiles[0]) + if err != nil { + t.Fatalf("failed to read body log: %v", err) + } + if string(bodyData) != requestBody { + t.Fatalf("body log mismatch:\nexpected: %s\ngot: %s", requestBody, string(bodyData)) + } + + curlData, err := os.ReadFile(curlFiles[0]) + if err != nil { + t.Fatalf("failed to read curl log: %v", err) + } + + curlString := string(curlData) + if !strings.Contains(curlString, "http://127.0.0.1:11434"+route) { + t.Fatalf("curl log does not contain expected route URL: %s", curlString) + } + + bodyFileName := filepath.Base(bodyFiles[0]) + if !strings.Contains(curlString, "@\"${SCRIPT_DIR}/"+bodyFileName+"\"") { + t.Fatalf("curl log does not reference sibling body file: %s", curlString) + } +} + +func TestNewInferenceRequestLoggerCreatesDirectory(t *testing.T) { + requestLogger, err := newInferenceRequestLogger() + if err != nil { + t.Fatalf("expected no error creating request logger: %v", err) + } + t.Cleanup(func() { + _ = os.RemoveAll(requestLogger.dir) + }) + + if requestLogger == nil || requestLogger.dir == "" { + t.Fatalf("expected request logger directory to be set") + } + + info, err := os.Stat(requestLogger.dir) + if err != nil { + t.Fatalf("expected directory to exist: %v", err) + } + if !info.IsDir() { + t.Fatalf("expected %q to be a directory", requestLogger.dir) + } +} + +func TestSanitizeRouteForFilename(t *testing.T) { + tests := []struct { + route string + want string + }{ + {route: "/api/generate", want: "api_generate"}, + {route: "/v1/chat/completions", want: "v1_chat_completions"}, + {route: "/v1/messages", want: "v1_messages"}, + } + + for _, tt := range tests { + if got := sanitizeRouteForFilename(tt.route); got != tt.want { + t.Fatalf("sanitizeRouteForFilename(%q) = %q, want %q", tt.route, got, tt.want) + } + } +}