diff --git a/CHANGELOG.md b/CHANGELOG.md index 17c5727c8..cccd66fe0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -267,6 +267,7 @@ Docs: https://docs.openclaw.ai - Agents/compaction counter accuracy: count successful overflow-triggered auto-compactions (`willRetry=true`) in the compaction counter while still excluding aborted/no-result events, so `/status` reflects actual safeguard compaction activity. (#39123) Thanks @MumuTW. - Gateway/chat delta ordering: flush buffered assistant deltas before emitting tool `start` events so pre-tool text is delivered to Control UI before tool cards, avoiding transient text/tool ordering artifacts in streaming. (#39128) Thanks @0xtangping. - Voice-call plugin schema parity: add missing manifest `configSchema` fields (`webhookSecurity`, `streaming.preStartTimeoutMs|maxPendingConnections|maxPendingConnectionsPerIp|maxConnections`, `staleCallReaperSeconds`) so gateway AJV validation accepts already-supported runtime config instead of failing with `additionalProperties` errors. (#38892) Thanks @giumex. +- Agents/OpenAI WS reconnect retry accounting: avoid double retry scheduling when reconnect failures emit both `error` and `close`, so retry budgets track actual reconnect attempts instead of exhausting early. (#39133) Thanks @scoootscooob. ## 2026.3.2 diff --git a/src/agents/openai-ws-connection.test.ts b/src/agents/openai-ws-connection.test.ts index 64afd9d0b..fb80f510a 100644 --- a/src/agents/openai-ws-connection.test.ts +++ b/src/agents/openai-ws-connection.test.ts @@ -506,6 +506,53 @@ describe("OpenAIWebSocketManager", () => { expect(maxRetryError).toBeDefined(); }); + it("does not double-count retries when error and close both fire on a reconnect attempt", async () => { + // In the real `ws` library, a failed connection fires "error" followed + // by "close". Previously, both the onClose handler AND the promise + // .catch() in _scheduleReconnect called _scheduleReconnect(), which + // double-incremented retryCount and exhausted the retry budget + // prematurely (e.g. 3 retries became ~1-2 actual attempts). + const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 5, 5] }); + const errors = attachErrorCollector(manager); + const p = manager.connect("sk-test"); + lastSocket().simulateOpen(); + await p; + + // Drop the established connection — triggers first reconnect schedule + lastSocket().simulateClose(1006, "Network error"); + + // Advance past first retry delay — a new socket is created + await vi.advanceTimersByTimeAsync(10); + const sock2 = lastSocket(); + + // Simulate a realistic failure: error fires first, then close follows. + sock2.simulateError(new Error("ECONNREFUSED")); + sock2.simulateClose(1006, "Connection failed"); + + // Advance past second retry delay — another socket should be created + // because we've only used 2 retries (not 3 from double-counting). + await vi.advanceTimersByTimeAsync(10); + const sock3 = lastSocket(); + expect(sock3).not.toBe(sock2); + + // Third attempt also fails with error+close + sock3.simulateError(new Error("ECONNREFUSED")); + sock3.simulateClose(1006, "Connection failed"); + + // Advance past third retry delay — one more attempt (retry 3 of 3) + await vi.advanceTimersByTimeAsync(10); + const sock4 = lastSocket(); + expect(sock4).not.toBe(sock3); + + // Fourth socket also fails — now retries should be exhausted (3/3) + sock4.simulateError(new Error("ECONNREFUSED")); + sock4.simulateClose(1006, "Connection failed"); + await vi.advanceTimersByTimeAsync(10); + + const maxRetryError = errors.find((e) => e.message.includes("max reconnect retries")); + expect(maxRetryError).toBeDefined(); + }); + it("resets retry count after a successful reconnect", async () => { const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 10, 20] }); const p = manager.connect("sk-test"); diff --git a/src/agents/openai-ws-connection.ts b/src/agents/openai-ws-connection.ts index b3214c3e2..a765c0f37 100644 --- a/src/agents/openai-ws-connection.ts +++ b/src/agents/openai-ws-connection.ts @@ -446,11 +446,11 @@ export class OpenAIWebSocketManager extends EventEmitter { if (this.closed) { return; } - this._openConnection().catch((err: unknown) => { - // onError handler already emitted error event; schedule next retry. - void err; - this._scheduleReconnect(); - }); + // The onClose handler already calls _scheduleReconnect() for the next + // attempt, so we intentionally swallow the rejection here to avoid + // double-scheduling (which would double-increment retryCount per + // failed reconnect and exhaust the retry budget prematurely). + this._openConnection().catch(() => {}); }, delayMs); }