diff --git a/cmd/pilotctl/verify_flow.go b/cmd/pilotctl/verify_flow.go index 726dc671..736a2bd6 100644 --- a/cmd/pilotctl/verify_flow.go +++ b/cmd/pilotctl/verify_flow.go @@ -155,12 +155,22 @@ func cmdVerifyProvider(flags map[string]string, provider string) { deadline := time.Now().Add(10 * time.Minute) var badge, badgeSig string + // Tolerate transient poll failures (a dropped frame, a relay blip) rather + // than aborting a flow the user may have already half-completed. Only give + // up after several consecutive errors. + const maxPollErrors = 5 + pollErrors := 0 for time.Now().Before(deadline) { time.Sleep(5 * time.Second) poll, err := verifierRoundtrip(d, vaddr, verifierRequest{Op: "poll", FlowID: begin.FlowID}) if err != nil { - fatalCode("connection_failed", "verify poll: %v", err) + pollErrors++ + if pollErrors >= maxPollErrors { + fatalCode("connection_failed", "verify poll: %d consecutive failures: %v", pollErrors, err) + } + continue } + pollErrors = 0 switch poll.Status { case "ready": badge, badgeSig = poll.Badge, poll.BadgeSig diff --git a/cmd/pilotctl/zz_verify_cmds_test.go b/cmd/pilotctl/zz_verify_cmds_test.go index ae494023..cc0b275f 100644 --- a/cmd/pilotctl/zz_verify_cmds_test.go +++ b/cmd/pilotctl/zz_verify_cmds_test.go @@ -126,3 +126,52 @@ func TestNodeArgToID(t *testing.T) { t.Errorf("address: got %d, want 99", got) } } + +// TestCmdRecoveryRecoverInstallsNewIdentity covers the most destructive command: +// keyless force-rotate via the registry, then install the new identity locally. +func TestCmdRecoveryRecoverInstallsNewIdentity(t *testing.T) { + dir := t.TempDir() + newKey := filepath.Join(dir, "new.json") + id, err := crypto.GenerateIdentity() + if err != nil { + t.Fatalf("keygen: %v", err) + } + if err := crypto.SaveIdentity(newKey, id); err != nil { + t.Fatalf("save new key: %v", err) + } + newPub := crypto.EncodePublicKey(id.PublicKey) + idPath := filepath.Join(dir, "installed.json") + + r := newFakeRegistry(t) + var gotPub, gotRecovery string + r.on("recover_identity", func(req map[string]interface{}) map[string]interface{} { + gotPub, _ = req["new_public_key"].(string) + gotRecovery, _ = req["recovery"].(string) + return map[string]interface{}{"type": "recover_identity_ok", "ok": true, "node_id": float64(99)} + }) + useRegistry(t, r) + + prev := jsonOutput + defer func() { jsonOutput = prev }() + jsonOutput = true + _ = captureStdout(t, func() { + cmdRecovery([]string{"recover", "--node", "99", "--new-key", newKey, + "--recovery", "pilotrecover:v1:99:bmV3:Y29t:9999999999:nn:rec-v1", "--recovery-sig", "c2ln", + "--identity", idPath}) + }) + + if gotRecovery == "" { + t.Fatal("registry never received recover_identity") + } + if gotPub != newPub { + t.Errorf("registry got new_public_key=%q, want the new-key pubkey %q", gotPub, newPub) + } + // The recovered key must be installed at the daemon identity path. + installed, err := crypto.LoadIdentity(idPath) + if err != nil { + t.Fatalf("new identity not installed at %s: %v", idPath, err) + } + if crypto.EncodePublicKey(installed.PublicKey) != newPub { + t.Errorf("installed identity does not match the recovered key") + } +} diff --git a/pkg/daemon/tunnel.go b/pkg/daemon/tunnel.go index 337f41f4..0a59ff81 100644 --- a/pkg/daemon/tunnel.go +++ b/pkg/daemon/tunnel.go @@ -791,6 +791,19 @@ var TunnelKeepaliveInterval = 25 * time.Second // "ping" packet. The encrypted payload still authenticates as us // (peer's AEAD verifies our nodeID AAD), so an attacker can't forge // keepalives to keep stale entries warm. +// newKeepalivePacket builds the tiny ProtoControl/PortPing keepalive. It +// stamps our own node id as the inner source: the receiver enforces +// pkt.Src.Node == the AEAD-authenticated peer, so a zero-Src keepalive would +// be dropped as spoofed and the peer's NAT mapping would then expire. +func (tm *TunnelManager) newKeepalivePacket() *protocol.Packet { + return &protocol.Packet{ + Version: protocol.Version, + Protocol: protocol.ProtoControl, + DstPort: protocol.PortPing, + Src: protocol.Addr{Node: tm.loadNodeID()}, + } +} + func (tm *TunnelManager) keepaliveSweep(now time.Time) int { type peerInfo struct { id uint32 @@ -814,11 +827,7 @@ func (tm *TunnelManager) keepaliveSweep(now time.Time) int { sent := 0 for _, p := range stale { - ka := &protocol.Packet{ - Version: protocol.Version, - Protocol: protocol.ProtoControl, - DstPort: protocol.PortPing, - } + ka := tm.newKeepalivePacket() plaintext, err := ka.Marshal() if err != nil { continue diff --git a/pkg/daemon/zz_nat_keepalive_bug_test.go b/pkg/daemon/zz_nat_keepalive_bug_test.go index 951495f5..0e008a4a 100644 --- a/pkg/daemon/zz_nat_keepalive_bug_test.go +++ b/pkg/daemon/zz_nat_keepalive_bug_test.go @@ -233,6 +233,28 @@ func TestHandleEncryptedDropsKeepaliveBeforeRecvCh(t *testing.T) { } } +// TestKeepalivePacketStampsOwnSource pins the fix for the regression where the +// transport src-binding check dropped NAT keepalives: keepaliveSweep left +// Src.Node=0, but the receiver enforces pkt.Src.Node == the authenticated peer, +// so a zero-Src keepalive was dropped and the NAT mapping then expired. The +// keepalive must carry our own node id (= the peer id from the receiver's view). +func TestKeepalivePacketStampsOwnSource(t *testing.T) { + t.Parallel() + tm := NewTunnelManager() + t.Cleanup(func() { tm.Close() }) + + const selfID uint32 = 0xAB00CD01 + tm.SetNodeID(selfID) + + ka := tm.newKeepalivePacket() + if ka.Src.Node != selfID { + t.Fatalf("keepalive Src.Node = %#x, want own id %#x — a zero/wrong Src is dropped by the receiver's spoof check, expiring the NAT mapping", ka.Src.Node, selfID) + } + if ka.Protocol != protocol.ProtoControl || ka.DstPort != protocol.PortPing { + t.Fatalf("keepalive shape wrong: protocol=%d dstport=%d", ka.Protocol, ka.DstPort) + } +} + // TestRecordOutboundSendStampsTimestamp pins the half of the fix that // already lands in RED: every successful writeFrame call must update // tm.lastOutboundSend[nodeID] so the eventual keepaliveSweep can tell