diff --git a/common.go b/common.go
index bcbd968..8fd89f1 100644
--- a/common.go
+++ b/common.go
@@ -221,6 +221,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
 	return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
 }
 
+// shouldFireRecovery decides whether the data-plane recovery flow in
+// startPingCheck should run on this tick. Recovery fires once when the
+// consecutive-failure counter first crosses the threshold; the connectionLost
+// flag prevents re-firing until a successful ping resets the state.
+//
+// This condition was previously inlined into startPingCheck and AND-ed with
+// `currentInterval < maxInterval`, which silently broke recovery once
+// pingInterval's default was bumped to 15s while maxInterval stayed at 6s
+// (commit 8161fa6, March 2026): the gate became permanently false on default
+// settings, so the recovery code never executed and ping failures climbed
+// forever — the proximate cause of fosrl/newt#284, #310 and pangolin#1004.
+//
+// Recovery and backoff are independent concerns; the backoff ramp is now
+// computed separately in the caller. Do not re-introduce currentInterval
+// here.
+func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
+	return consecutiveFailures >= failureThreshold && !connectionLost
+}
+
 func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
 	maxInterval := 6 * time.Second
 	currentInterval := pingInterval
@@ -280,43 +299,45 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
 			// More lenient threshold for declaring connection lost under load
 			failureThreshold := 4
 
-			if consecutiveFailures >= failureThreshold {
-				if !connectionLost {
-					connectionLost = true
-					logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
-					if tunnelID != "" {
-						telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
-					}
-					pingChainId := generateChainId()
-					pendingPingChainId = pingChainId
-					stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
-						"chainId": pingChainId,
-					}, 3*time.Second)
-					// Send registration message to the server for backward compatibility
-					bcChainId := generateChainId()
-					pendingRegisterChainId = bcChainId
-					err := client.SendMessage("newt/wg/register", map[string]interface{}{
-						"publicKey":           publicKey.String(),
-						"backwardsCompatible": true,
-						"chainId":             bcChainId,
-					})
+			if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
+				connectionLost = true
+				logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
+				if tunnelID != "" {
+					telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
+				}
+				pingChainId := generateChainId()
+				pendingPingChainId = pingChainId
+				stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
+					"chainId": pingChainId,
+				}, 3*time.Second)
+				// Send registration message to the server for backward compatibility
+				bcChainId := generateChainId()
+				pendingRegisterChainId = bcChainId
+				err := client.SendMessage("newt/wg/register", map[string]interface{}{
+					"publicKey":           publicKey.String(),
+					"backwardsCompatible": true,
+					"chainId":             bcChainId,
+				})
+				if err != nil {
+					logger.Error("Failed to send registration message: %v", err)
+				}
+				if healthFile != "" {
+					err = os.Remove(healthFile)
 					if err != nil {
-						logger.Error("Failed to send registration message: %v", err)
-					}
-					if healthFile != "" {
-						err = os.Remove(healthFile)
-						if err != nil {
-							logger.Error("Failed to remove health file: %v", err)
-						}
+						logger.Error("Failed to remove health file: %v", err)
 					}
 				}
-				if currentInterval < maxInterval {
-					currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase
-					if currentInterval > maxInterval {
-						currentInterval = maxInterval
-					}
-					ticker.Reset(currentInterval)
-					logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval)
+			}
+			// Backoff: ramp the periodic-ping interval up while we are
+			// past the failure threshold, capped at maxInterval. Kept
+			// independent of the recovery trigger above so the trigger
+			// fires on every outage regardless of pingInterval.
+			if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
+				currentInterval = time.Duration(float64(currentInterval) * 1.3)
+				if currentInterval > maxInterval {
+					currentInterval = maxInterval
 				}
+				ticker.Reset(currentInterval)
+				logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval)
 			}
 		} else {
diff --git a/common_test.go b/common_test.go
index a7e659a..67c02cf 100644
--- a/common_test.go
+++ b/common_test.go
@@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) {
 		})
 	}
 }
+
+// TestShouldFireRecovery is the regression guard for the broken trigger gate
+// that prevented data-plane recovery from ever firing under default settings
+// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
+//
+//	consecutiveFailures >= failureThreshold && currentInterval < maxInterval
+//
+// which became permanently false once pingInterval's default was bumped from
+// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s,
+// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
+// executed.
+//
+// The fix is to drop currentInterval from the trigger condition entirely;
+// backoff is a separate concern computed in the caller. The cases below
+// exercise the documented contract.
+func TestShouldFireRecovery(t *testing.T) {
+	const threshold = 4
+	cases := []struct {
+		name           string
+		failures       int
+		connectionLost bool
+		want           bool
+	}{
+		{"below threshold, fresh", 3, false, false},
+		{"below threshold, already lost", 3, true, false},
+		{"at threshold, fresh — recovery must fire", threshold, false, true},
+		{"at threshold, already lost — gate prevents re-fire", threshold, true, false},
+		{"far above threshold, fresh", 100, false, true},
+		{"far above threshold, already lost", 100, true, false},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want {
+				t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v",
+					c.failures, threshold, c.connectionLost, got, c.want)
+			}
+		})
+	}
+}
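
For review, here is the trigger contract from the shouldFireRecovery godoc as a
standalone, runnable sketch. It is not part of the patch; the main package and
the simulated tick outcomes below are illustrative only, with the success path
condensed to the counter/flag reset that startPingCheck's else-branch performs.

package main

import "fmt"

// Same contract as the helper added to common.go above.
func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
	return consecutiveFailures >= failureThreshold && !connectionLost
}

func main() {
	const failureThreshold = 4 // the value startPingCheck uses
	consecutiveFailures := 0
	connectionLost := false

	// Simulated tick outcomes: six failed pings, one success, four failures.
	pingOK := []bool{false, false, false, false, false, false, true, false, false, false, false}
	for tick, ok := range pingOK {
		if ok {
			// Success path: reset the counter and re-arm the trigger.
			consecutiveFailures = 0
			connectionLost = false
			fmt.Printf("tick %2d: ping ok, state reset\n", tick)
			continue
		}
		consecutiveFailures++
		if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
			connectionLost = true // latch: no re-fire until a ping succeeds
			fmt.Printf("tick %2d: recovery fired (failures=%d)\n", tick, consecutiveFailures)
		} else {
			fmt.Printf("tick %2d: no fire (failures=%d)\n", tick, consecutiveFailures)
		}
	}
	// Exactly two "recovery fired" lines: tick 3 (first outage crosses the
	// threshold) and tick 10 (second outage, re-armed by the success at tick 6).
}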
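
The backoff ramp is mechanical enough to show the same way. Again a runnable
sketch, not part of the patch: the 3s starting interval is hypothetical (the
pre-8161fa6 default). With the current 15s default, the currentInterval <
maxInterval guard is false on the first tick, so the ramp is inert and the
interval simply stays at 15s, which is exactly why the recovery trigger above
must not depend on it.

package main

import (
	"fmt"
	"time"
)

func main() {
	maxInterval := 6 * time.Second
	currentInterval := 3 * time.Second // hypothetical; see note above

	for tick := 0; tick < 5; tick++ { // ticks already past failureThreshold
		if currentInterval < maxInterval {
			currentInterval = time.Duration(float64(currentInterval) * 1.3)
			if currentInterval > maxInterval {
				currentInterval = maxInterval
			}
		}
		fmt.Printf("tick %d: interval=%v\n", tick, currentInterval)
	}
	// Output: 3.9s, 5.07s, 6s, 6s, 6s. The ramp caps after three ticks;
	// in the patched code each step is also applied via ticker.Reset.
}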