mirror of
https://github.com/fosrl/newt.git
synced 2026-05-10 14:53:29 -05:00
Merge branch 'main' into dev
This commit is contained in:
87
common.go
87
common.go
@@ -221,6 +221,25 @@ func pingWithRetry(tnet *netstack.Net, dst string, timeout time.Duration) (stopC
|
||||
return stopChan, fmt.Errorf("initial ping attempts failed, continuing in background")
|
||||
}
|
||||
|
||||
// shouldFireRecovery decides whether the data-plane recovery flow in
|
||||
// startPingCheck should run on this tick. Recovery fires once when the
|
||||
// consecutive-failure counter first crosses the threshold; the connectionLost
|
||||
// flag prevents re-firing until a successful ping resets the state.
|
||||
//
|
||||
// This condition was previously inlined into startPingCheck and AND-ed with
|
||||
// `currentInterval < maxInterval`, which silently broke recovery once
|
||||
// pingInterval's default was bumped to 15s while maxInterval stayed at 6s
|
||||
// (commit 8161fa6, March 2026): the gate became permanently false on default
|
||||
// settings, so the recovery code never executed and ping failures climbed
|
||||
// forever — the proximate cause of fosrl/newt#284, #310 and pangolin#1004.
|
||||
//
|
||||
// Recovery and backoff are independent concerns; the backoff ramp is now
|
||||
// computed separately in the caller. Do not re-introduce currentInterval
|
||||
// here.
|
||||
func shouldFireRecovery(consecutiveFailures, failureThreshold int, connectionLost bool) bool {
|
||||
return consecutiveFailures >= failureThreshold && !connectionLost
|
||||
}
|
||||
|
||||
func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Client, tunnelID string) chan struct{} {
|
||||
maxInterval := 6 * time.Second
|
||||
currentInterval := pingInterval
|
||||
@@ -280,43 +299,43 @@ func startPingCheck(tnet *netstack.Net, serverIP string, client *websocket.Clien
|
||||
|
||||
// More lenient threshold for declaring connection lost under load
|
||||
failureThreshold := 4
|
||||
if consecutiveFailures >= failureThreshold {
|
||||
if !connectionLost {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
if tunnelID != "" {
|
||||
telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
|
||||
}
|
||||
pingChainId := generateChainId()
|
||||
pendingPingChainId = pingChainId
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
|
||||
"chainId": pingChainId,
|
||||
}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
bcChainId := generateChainId()
|
||||
pendingRegisterChainId = bcChainId
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"backwardsCompatible": true,
|
||||
"chainId": bcChainId,
|
||||
})
|
||||
if shouldFireRecovery(consecutiveFailures, failureThreshold, connectionLost) {
|
||||
connectionLost = true
|
||||
logger.Warn("Connection to server lost after %d failures. Continuous reconnection attempts will be made.", consecutiveFailures)
|
||||
if tunnelID != "" {
|
||||
telemetry.IncReconnect(context.Background(), tunnelID, "client", telemetry.ReasonTimeout)
|
||||
}
|
||||
pingChainId := generateChainId()
|
||||
pendingPingChainId = pingChainId
|
||||
stopFunc = client.SendMessageInterval("newt/ping/request", map[string]interface{}{
|
||||
"chainId": pingChainId,
|
||||
}, 3*time.Second)
|
||||
// Send registration message to the server for backward compatibility
|
||||
bcChainId := generateChainId()
|
||||
pendingRegisterChainId = bcChainId
|
||||
err := client.SendMessage("newt/wg/register", map[string]interface{}{
|
||||
"publicKey": publicKey.String(),
|
||||
"backwardsCompatible": true,
|
||||
"chainId": bcChainId,
|
||||
})
|
||||
if err != nil {
|
||||
logger.Error("Failed to send registration message: %v", err)
|
||||
}
|
||||
if healthFile != "" {
|
||||
err = os.Remove(healthFile)
|
||||
if err != nil {
|
||||
logger.Error("Failed to send registration message: %v", err)
|
||||
}
|
||||
if healthFile != "" {
|
||||
err = os.Remove(healthFile)
|
||||
if err != nil {
|
||||
logger.Error("Failed to remove health file: %v", err)
|
||||
}
|
||||
logger.Error("Failed to remove health file: %v", err)
|
||||
}
|
||||
}
|
||||
if currentInterval < maxInterval {
|
||||
currentInterval = time.Duration(float64(currentInterval) * 1.3) // Slower increase
|
||||
if currentInterval > maxInterval {
|
||||
currentInterval = maxInterval
|
||||
}
|
||||
ticker.Reset(currentInterval)
|
||||
logger.Debug("Increased ping check interval to %v due to consecutive failures", currentInterval)
|
||||
}
|
||||
// Backoff: ramp the periodic-ping interval up while we are
|
||||
// past the failure threshold, capped at maxInterval. Kept
|
||||
// independent of the recovery trigger above so the trigger
|
||||
// fires on every outage regardless of pingInterval.
|
||||
if consecutiveFailures >= failureThreshold && currentInterval < maxInterval {
|
||||
currentInterval = time.Duration(float64(currentInterval) * 1.3)
|
||||
if currentInterval > maxInterval {
|
||||
currentInterval = maxInterval
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
@@ -210,3 +210,42 @@ func TestParseTargetStringNetDialCompatibility(t *testing.T) {
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
// TestShouldFireRecovery is the regression guard for the broken trigger gate
|
||||
// that prevented data-plane recovery from ever firing under default settings
|
||||
// (fosrl/newt#284, #310, pangolin#1004). The pre-fix condition was
|
||||
//
|
||||
// consecutiveFailures >= failureThreshold && currentInterval < maxInterval
|
||||
//
|
||||
// which became permanently false once pingInterval's default was bumped from
|
||||
// 3s to 15s in commit 8161fa6 — currentInterval starts at pingInterval=15s,
|
||||
// maxInterval stayed at 6s, so 15<6 is false and the recovery branch never
|
||||
// executed.
|
||||
//
|
||||
// The fix is to drop currentInterval from the trigger condition entirely;
|
||||
// backoff is a separate concern computed in the caller. The cases below
|
||||
// exercise the documented contract.
|
||||
func TestShouldFireRecovery(t *testing.T) {
|
||||
const threshold = 4
|
||||
cases := []struct {
|
||||
name string
|
||||
failures int
|
||||
connectionLost bool
|
||||
want bool
|
||||
}{
|
||||
{"below threshold, fresh", 3, false, false},
|
||||
{"below threshold, already lost", 3, true, false},
|
||||
{"at threshold, fresh — recovery must fire", threshold, false, true},
|
||||
{"at threshold, already lost — gate prevents re-fire", threshold, true, false},
|
||||
{"far above threshold, fresh", 100, false, true},
|
||||
{"far above threshold, already lost", 100, true, false},
|
||||
}
|
||||
for _, c := range cases {
|
||||
t.Run(c.name, func(t *testing.T) {
|
||||
if got := shouldFireRecovery(c.failures, threshold, c.connectionLost); got != c.want {
|
||||
t.Errorf("shouldFireRecovery(failures=%d, threshold=%d, lost=%v) = %v, want %v",
|
||||
c.failures, threshold, c.connectionLost, got, c.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user