Merge branch 'main' into dev

Owen committed 2026-05-07 17:24:34 -07:00
2 changed files with 92 additions and 34 deletions

View File

@@ -221,6 +221,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
 	return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
 }
+
+// shouldFireRecovery decides whether the data-plane recovery flow in
+// startPingCheck should run on this tick. Recovery fires once when the
+// consecutive-failure counter first crosses the threshold; the connectionLost
+// flag prevents re-firing until a successful ping resets the state.
+//
+// This condition was previously inlined into startPingCheck and AND-ed with
+// `currentInterval < maxInterval`, which silently broke recovery once
+// pingInterval's default was bumped to 15s while maxInterval stayed at 6s
+// (commit 8161fa6, March 2026): the gate became permanently false on default
+// settings, so the recovery code never executed and ping failures climbed
+// forever, the proximate cause of fosrl/newt#284, #310 and pangolin#1004.
+//
+// Recovery and backoff are independent concerns; the backoff ramp is now
+// computed separately in the caller. Do not re-introduce currentInterval
+// here.
+func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
+	return consecutiveFailures >= failureThreshold && !connectionLost
+}
 
 func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
 	maxInterval := 6 * time.Second
 	currentInterval := pingInterval
@@ -280,43 +299,43 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
 			// More lenient threshold for declaring connection lost under load
 			failureThreshold := 4
-			if consecutiveFailures >= failureThreshold {
-				if !connectionLost {
-					connectionLost = true
-					logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
-					if tunnelID != "" {
-						telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
-					}
-					pingChainId := generateChainId()
-					pendingPingChainId = pingChainId
-					stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
-						"chainId": pingChainId,
-					}, 3*time.Second)
-					// Send registration message to the server for backward compatibility
-					bcChainId := generateChainId()
-					pendingRegisterChainId = bcChainId
-					err := client.SendMessage("newt/wg/register", map[string]interface{}{
-						"publicKey":           publicKey.String(),
-						"backwardsCompatible": true,
-						"chainId":             bcChainId,
-					})
-					if err != nil {
-						logger.Error("Failed to send registration message: %v", err)
-					}
-					if healthFile != "" {
-						err = os.Remove(healthFile)
-						if err != nil {
-							logger.Error("Failed to remove health file: %v", err)
-						}
-					}
-				}
-				if currentInterval < maxInterval {
-					currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase
-					if currentInterval > maxInterval {
-						currentInterval = maxInterval
-					}
-					ticker.Reset(currentInterval)
-					logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval)
-				}
+			if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
+				connectionLost = true
+				logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
+				if tunnelID != "" {
+					telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
+				}
+				pingChainId := generateChainId()
+				pendingPingChainId = pingChainId
+				stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
+					"chainId": pingChainId,
+				}, 3*time.Second)
+				// Send registration message to the server for backward compatibility
+				bcChainId := generateChainId()
+				pendingRegisterChainId = bcChainId
+				err := client.SendMessage("newt/wg/register", map[string]interface{}{
+					"publicKey":           publicKey.String(),
+					"backwardsCompatible": true,
+					"chainId":             bcChainId,
+				})
+				if err != nil {
+					logger.Error("Failed to send registration message: %v", err)
+				}
+				if healthFile != "" {
+					err = os.Remove(healthFile)
+					if err != nil {
+						logger.Error("Failed to remove health file: %v", err)
+					}
+				}
+			}
+			// Backoff: ramp the periodic-ping interval up while we are
+			// past the failure threshold, capped at maxInterval. Kept
+			// independent of the recovery trigger above so the trigger
+			// fires on every outage regardless of pingInterval.
+			if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
+				currentInterval = time.Duration(float64(currentInterval) * 1.3)
+				if currentInterval > maxInterval {
+					currentInterval = maxInterval
+				}
+				ticker.Reset(currentInterval)
+				logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval)
+			}
 			} else {

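For context, here is a minimal standalone sketch of the arithmetic described in the shouldFireRecovery comment, showing why the pre-fix gate could never pass on default settings. The predicate and the 15s/6s values come from the diff above; the package main scaffolding is illustrative only.

package main

import (
	"fmt"
	"time"
)

// Copied from the diff above.
func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
	return consecutiveFailures >= failureThreshold && !connectionLost
}

func main() {
	// Defaults cited in the comment: pingInterval was bumped to 15s while
	// maxInterval stayed at 6s (commit 8161fa6).
	currentInterval := 15 * time.Second // starts at pingInterval
	maxInterval := 6 * time.Second
	failures, threshold := 4, 4 // first tick at the failure threshold
	connectionLost := false

	// Pre-fix gate, as previously inlined in startPingCheck.
	oldGate := failures >= threshold && currentInterval < maxInterval
	// Post-fix gate.
	newGate := shouldFireRecovery(failures, threshold, connectionLost)

	fmt.Println(oldGate) // false: 15s < 6s can never hold, so recovery never ran
	fmt.Println(newGate) // true: recovery fires on the first threshold crossing
}

Note that on the same defaults the backoff ramp's own bound (currentInterval < maxInterval) is false from the first tick as well, which is why the fix moves the ramp out of the trigger rather than the trigger into the ramp.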
View File

@@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) {
 		})
 	}
 }
+
+// TestShouldFireRecovery is the regression guard for the broken trigger gate
+// that prevented data-plane recovery from ever firing under default settings
+// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
+//
+//	consecutiveFailures >= failureThreshold && currentInterval < maxInterval
+//
+// which became permanently false once pingInterval's default was bumped from
+// 3s to 15s in commit 8161fa6: currentInterval starts at pingInterval=15s,
+// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
+// executed.
+//
+// The fix is to drop currentInterval from the trigger condition entirely;
+// backoff is a separate concern computed in the caller. The cases below
+// exercise the documented contract.
+func TestShouldFireRecovery(t *testing.T) {
+	const threshold = 4
+	cases := []struct {
+		name           string
+		failures       int
+		connectionLost bool
+		want           bool
+	}{
+		{"below threshold, fresh", 3, false, false},
+		{"below threshold, already lost", 3, true, false},
+		{"at threshold, fresh: recovery must fire", threshold, false, true},
+		{"at threshold, already lost: gate prevents re-fire", threshold, true, false},
+		{"far above threshold, fresh", 100, false, true},
+		{"far above threshold, already lost", 100, true, false},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want {
+				t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v",
+					c.failures, threshold, c.connectionLost, got, c.want)
+			}
+		})
+	}
+}
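Assuming the test file sits in the same package as shouldFireRecovery (the file path is not shown in this view), the regression guard can be run in isolation with the standard Go test runner:

go test -run 'TestShouldFireRecovery$' -v

The "already lost" cases pin the latch behavior: once connectionLost is set, the trigger stays quiet until a successful ping resets the state, so the reconnection side effects (telemetry increment, re-registration) run once per outage rather than on every tick.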