Files
newt/healthcheck/healthcheck.go
Varun Narravula 8a45f6fd63 feat(healthcheck): add TLS SNI header to request when needed
Add the Server Name Indication (SNI) field to healthcheck requests, if
present in the target config.

SNI handling is already present for proxying resources, but this has
not been implemented for healthcheck requests yet until this commit.

In order to facilitate this, this commit moves the client instantiation
to when the healthcheck is performed, rather than as a part of the
monitor init.
2025-12-04 14:27:04 -05:00

518 lines
15 KiB
Go

package healthcheck
import (
"context"
"crypto/tls"
"encoding/json"
"fmt"
"net/http"
"strings"
"sync"
"time"
"github.com/fosrl/newt/logger"
)
// Health represents the health status of a target
type Health int
const (
StatusUnknown Health = iota
StatusHealthy
StatusUnhealthy
)
func (s Health) String() string {
switch s {
case StatusHealthy:
return "healthy"
case StatusUnhealthy:
return "unhealthy"
default:
return "unknown"
}
}
// Config holds the health check configuration for a target
type Config struct {
ID int `json:"id"`
Enabled bool `json:"hcEnabled"`
Path string `json:"hcPath"`
Scheme string `json:"hcScheme"`
Mode string `json:"hcMode"`
Hostname string `json:"hcHostname"`
Port int `json:"hcPort"`
Interval int `json:"hcInterval"` // in seconds
UnhealthyInterval int `json:"hcUnhealthyInterval"` // in seconds
Timeout int `json:"hcTimeout"` // in seconds
Headers map[string]string `json:"hcHeaders"`
Method string `json:"hcMethod"`
Status int `json:"hcStatus"` // HTTP status code
TLSServerName string `json:"hcTlsServerName"`
}
// Target represents a health check target with its current status
type Target struct {
Config Config `json:"config"`
Status Health `json:"status"`
LastCheck time.Time `json:"lastCheck"`
LastError string `json:"lastError,omitempty"`
CheckCount int `json:"checkCount"`
ticker *time.Ticker
ctx context.Context
cancel context.CancelFunc
}
// StatusChangeCallback is called when any target's status changes
type StatusChangeCallback func(targets map[int]*Target)
// Monitor manages health check targets and their monitoring
type Monitor struct {
targets map[int]*Target
mutex sync.RWMutex
callback StatusChangeCallback
enforceCert bool
}
// NewMonitor creates a new health check monitor
func NewMonitor(callback StatusChangeCallback, enforceCert bool) *Monitor {
logger.Debug("Creating new health check monitor with certificate enforcement: %t", enforceCert)
return &Monitor{
targets: make(map[int]*Target),
callback: callback,
enforceCert: enforceCert,
}
}
// parseHeaders parses the headers string into a map
func parseHeaders(headersStr string) map[string]string {
headers := make(map[string]string)
if headersStr == "" {
return headers
}
// Try to parse as JSON first
if err := json.Unmarshal([]byte(headersStr), &headers); err == nil {
return headers
}
// Fallback to simple key:value parsing
pairs := strings.Split(headersStr, ",")
for _, pair := range pairs {
kv := strings.SplitN(strings.TrimSpace(pair), ":", 2)
if len(kv) == 2 {
headers[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
}
}
return headers
}
// AddTarget adds a new health check target
func (m *Monitor) AddTarget(config Config) error {
m.mutex.Lock()
defer m.mutex.Unlock()
logger.Info("Adding health check target: ID=%d, hostname=%s, port=%d, enabled=%t",
config.ID, config.Hostname, config.Port, config.Enabled)
return m.addTargetUnsafe(config)
}
// AddTargets adds multiple health check targets in bulk
func (m *Monitor) AddTargets(configs []Config) error {
m.mutex.Lock()
defer m.mutex.Unlock()
logger.Debug("Adding %d health check targets in bulk", len(configs))
for _, config := range configs {
if err := m.addTargetUnsafe(config); err != nil {
logger.Error("Failed to add target %d: %v", config.ID, err)
return fmt.Errorf("failed to add target %d: %v", config.ID, err)
}
logger.Debug("Successfully added target: ID=%d, hostname=%s", config.ID, config.Hostname)
}
// Don't notify callback immediately - let the initial health checks complete first
// The callback will be triggered when the first health check results are available
logger.Debug("Successfully added all %d health check targets", len(configs))
return nil
}
// addTargetUnsafe adds a target without acquiring the mutex (internal method)
func (m *Monitor) addTargetUnsafe(config Config) error {
// Set defaults
if config.Scheme == "" {
config.Scheme = "http"
}
if config.Mode == "" {
config.Mode = "http"
}
if config.Method == "" {
config.Method = "GET"
}
if config.Interval == 0 {
config.Interval = 30
}
if config.UnhealthyInterval == 0 {
config.UnhealthyInterval = 30
}
if config.Timeout == 0 {
config.Timeout = 5
}
logger.Debug("Target %d configuration: scheme=%s, method=%s, interval=%ds, timeout=%ds",
config.ID, config.Scheme, config.Method, config.Interval, config.Timeout)
// Parse headers if provided as string
if len(config.Headers) == 0 && config.Path != "" {
// This is a simplified header parsing - in real use you might want more robust parsing
config.Headers = make(map[string]string)
}
// Remove existing target if it exists
if existing, exists := m.targets[config.ID]; exists {
logger.Info("Replacing existing target with ID %d", config.ID)
existing.cancel()
}
// Create new target
ctx, cancel := context.WithCancel(context.Background())
target := &Target{
Config: config,
Status: StatusUnknown,
ctx: ctx,
cancel: cancel,
}
m.targets[config.ID] = target
// Start monitoring if enabled
if config.Enabled {
logger.Info("Starting monitoring for target %d (%s:%d)", config.ID, config.Hostname, config.Port)
go m.monitorTarget(target)
} else {
logger.Debug("Target %d added but monitoring is disabled", config.ID)
}
return nil
}
// RemoveTarget removes a health check target
func (m *Monitor) RemoveTarget(id int) error {
m.mutex.Lock()
defer m.mutex.Unlock()
target, exists := m.targets[id]
if !exists {
logger.Warn("Attempted to remove non-existent target with ID %d", id)
return fmt.Errorf("target with id %d not found", id)
}
logger.Info("Removing health check target: ID=%d", id)
target.cancel()
delete(m.targets, id)
// Notify callback of status change
if m.callback != nil {
go m.callback(m.GetTargets())
}
logger.Info("Successfully removed target %d", id)
return nil
}
// RemoveTargets removes multiple health check targets
func (m *Monitor) RemoveTargets(ids []int) error {
m.mutex.Lock()
defer m.mutex.Unlock()
logger.Info("Removing %d health check targets", len(ids))
var notFound []int
for _, id := range ids {
target, exists := m.targets[id]
if !exists {
notFound = append(notFound, id)
logger.Warn("Target with ID %d not found during bulk removal", id)
continue
}
logger.Debug("Removing target %d", id)
target.cancel()
delete(m.targets, id)
}
removedCount := len(ids) - len(notFound)
logger.Info("Successfully removed %d targets", removedCount)
// Notify callback of status change if any targets were removed
if len(notFound) != len(ids) && m.callback != nil {
go m.callback(m.GetTargets())
}
if len(notFound) > 0 {
logger.Error("Some targets not found during removal: %v", notFound)
return fmt.Errorf("targets not found: %v", notFound)
}
return nil
}
// RemoveTargetsByID is a convenience method that accepts either a single ID or multiple IDs
func (m *Monitor) RemoveTargetsByID(ids ...int) error {
return m.RemoveTargets(ids)
}
// GetTargets returns a copy of all targets
func (m *Monitor) GetTargets() map[int]*Target {
m.mutex.RLock()
defer m.mutex.RUnlock()
return m.getAllTargetsUnsafe()
}
// getAllTargetsUnsafe returns a copy of all targets without acquiring the mutex (internal method)
func (m *Monitor) getAllTargetsUnsafe() map[int]*Target {
targets := make(map[int]*Target)
for id, target := range m.targets {
// Create a copy to avoid race conditions
targetCopy := *target
targets[id] = &targetCopy
}
return targets
}
// getAllTargets returns a copy of all targets (deprecated, use GetTargets)
func (m *Monitor) getAllTargets() map[int]*Target {
return m.GetTargets()
}
// monitorTarget monitors a single target
func (m *Monitor) monitorTarget(target *Target) {
logger.Info("Starting health check monitoring for target %d (%s:%d)",
target.Config.ID, target.Config.Hostname, target.Config.Port)
// Initial check
oldStatus := target.Status
m.performHealthCheck(target)
// Notify callback after initial check if status changed or if it's the first check
if (oldStatus != target.Status || oldStatus == StatusUnknown) && m.callback != nil {
logger.Info("Target %d initial status: %s", target.Config.ID, target.Status.String())
go m.callback(m.GetTargets())
}
// Set up ticker based on current status
interval := time.Duration(target.Config.Interval) * time.Second
if target.Status == StatusUnhealthy {
interval = time.Duration(target.Config.UnhealthyInterval) * time.Second
}
logger.Debug("Target %d: initial check interval set to %v", target.Config.ID, interval)
target.ticker = time.NewTicker(interval)
defer target.ticker.Stop()
for {
select {
case <-target.ctx.Done():
logger.Info("Stopping health check monitoring for target %d", target.Config.ID)
return
case <-target.ticker.C:
oldStatus := target.Status
m.performHealthCheck(target)
// Update ticker interval if status changed
newInterval := time.Duration(target.Config.Interval) * time.Second
if target.Status == StatusUnhealthy {
newInterval = time.Duration(target.Config.UnhealthyInterval) * time.Second
}
if newInterval != interval {
logger.Debug("Target %d: updating check interval from %v to %v due to status change",
target.Config.ID, interval, newInterval)
target.ticker.Stop()
target.ticker = time.NewTicker(newInterval)
interval = newInterval
}
// Notify callback if status changed
if oldStatus != target.Status && m.callback != nil {
logger.Info("Target %d status changed: %s -> %s",
target.Config.ID, oldStatus.String(), target.Status.String())
go m.callback(m.GetTargets())
}
}
}
}
// performHealthCheck performs a health check on a target
func (m *Monitor) performHealthCheck(target *Target) {
target.CheckCount++
target.LastCheck = time.Now()
target.LastError = ""
// Build URL
url := fmt.Sprintf("%s://%s", target.Config.Scheme, target.Config.Hostname)
if target.Config.Port > 0 {
url = fmt.Sprintf("%s:%d", url, target.Config.Port)
}
if target.Config.Path != "" {
if !strings.HasPrefix(target.Config.Path, "/") {
url += "/"
}
url += target.Config.Path
}
logger.Debug("Target %d: performing health check %d to %s",
target.Config.ID, target.CheckCount, url)
if target.Config.Scheme == "https" {
logger.Debug("Target %d: HTTPS health check with certificate enforcement: %t",
target.Config.ID, m.enforceCert)
}
// Create request
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(target.Config.Timeout)*time.Second)
defer cancel()
client := &http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{
// Configure TLS settings based on certificate enforcement
InsecureSkipVerify: !m.enforceCert,
// Use SNI TLS header if present
ServerName: target.Config.TLSServerName,
},
},
}
req, err := http.NewRequestWithContext(ctx, target.Config.Method, url, nil)
if err != nil {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("failed to create request: %v", err)
logger.Warn("Target %d: failed to create request: %v", target.Config.ID, err)
return
}
// Add headers
for key, value := range target.Config.Headers {
req.Header.Set(key, value)
}
// Perform request
resp, err := client.Do(req)
if err != nil {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("request failed: %v", err)
logger.Warn("Target %d: health check failed: %v", target.Config.ID, err)
return
}
defer resp.Body.Close()
// Check response status
var expectedStatus int
if target.Config.Status > 0 {
expectedStatus = target.Config.Status
} else {
expectedStatus = 0 // Use range check for 200-299
}
if expectedStatus > 0 {
logger.Debug("Target %d: checking health status against expected code %d", target.Config.ID, expectedStatus)
// Check for specific status code
if resp.StatusCode == expectedStatus {
target.Status = StatusHealthy
logger.Debug("Target %d: health check passed (status: %d, expected: %d)", target.Config.ID, resp.StatusCode, expectedStatus)
} else {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("unexpected status code: %d (expected: %d)", resp.StatusCode, expectedStatus)
logger.Warn("Target %d: health check failed with status code %d (expected: %d)", target.Config.ID, resp.StatusCode, expectedStatus)
}
} else {
// Check for 2xx range
if resp.StatusCode >= 200 && resp.StatusCode < 300 {
target.Status = StatusHealthy
logger.Debug("Target %d: health check passed (status: %d)", target.Config.ID, resp.StatusCode)
} else {
target.Status = StatusUnhealthy
target.LastError = fmt.Sprintf("unhealthy status code: %d", resp.StatusCode)
logger.Warn("Target %d: health check failed with status code %d", target.Config.ID, resp.StatusCode)
}
}
}
// Stop stops monitoring all targets
func (m *Monitor) Stop() {
m.mutex.Lock()
defer m.mutex.Unlock()
targetCount := len(m.targets)
logger.Info("Stopping health check monitor with %d targets", targetCount)
for id, target := range m.targets {
logger.Debug("Stopping monitoring for target %d", id)
target.cancel()
}
m.targets = make(map[int]*Target)
logger.Info("Health check monitor stopped")
}
// EnableTarget enables monitoring for a specific target
func (m *Monitor) EnableTarget(id int) error {
m.mutex.Lock()
defer m.mutex.Unlock()
target, exists := m.targets[id]
if !exists {
logger.Warn("Attempted to enable non-existent target with ID %d", id)
return fmt.Errorf("target with id %d not found", id)
}
if !target.Config.Enabled {
logger.Info("Enabling health check monitoring for target %d", id)
target.Config.Enabled = true
target.cancel() // Stop existing monitoring
ctx, cancel := context.WithCancel(context.Background())
target.ctx = ctx
target.cancel = cancel
go m.monitorTarget(target)
} else {
logger.Debug("Target %d is already enabled", id)
}
return nil
}
// DisableTarget disables monitoring for a specific target
func (m *Monitor) DisableTarget(id int) error {
m.mutex.Lock()
defer m.mutex.Unlock()
target, exists := m.targets[id]
if !exists {
logger.Warn("Attempted to disable non-existent target with ID %d", id)
return fmt.Errorf("target with id %d not found", id)
}
if target.Config.Enabled {
logger.Info("Disabling health check monitoring for target %d", id)
target.Config.Enabled = false
target.cancel()
target.Status = StatusUnknown
// Notify callback of status change
if m.callback != nil {
go m.callback(m.GetTargets())
}
} else {
logger.Debug("Target %d is already disabled", id)
}
return nil
}