fix(agents): avoid double websocket retry accounting on reconnect failures (#39133)

Co-authored-by: scoootscooob <zhentongfan@gmail.com>
This commit is contained in:
Peter Steinberger
2026-03-07 19:55:13 +00:00
parent ac86deccee
commit 3c1176110a
3 changed files with 53 additions and 5 deletions

View File

@@ -267,6 +267,7 @@ Docs: https://docs.openclaw.ai
- Agents/compaction counter accuracy: count successful overflow-triggered auto-compactions (`willRetry=true`) in the compaction counter while still excluding aborted/no-result events, so `/status` reflects actual safeguard compaction activity. (#39123) Thanks @MumuTW.
- Gateway/chat delta ordering: flush buffered assistant deltas before emitting tool `start` events so pre-tool text is delivered to Control UI before tool cards, avoiding transient text/tool ordering artifacts in streaming. (#39128) Thanks @0xtangping.
- Voice-call plugin schema parity: add missing manifest `configSchema` fields (`webhookSecurity`, `streaming.preStartTimeoutMs|maxPendingConnections|maxPendingConnectionsPerIp|maxConnections`, `staleCallReaperSeconds`) so gateway AJV validation accepts already-supported runtime config instead of failing with `additionalProperties` errors. (#38892) Thanks @giumex.
- Agents/OpenAI WS reconnect retry accounting: avoid double retry scheduling when reconnect failures emit both `error` and `close`, so retry budgets track actual reconnect attempts instead of exhausting early. (#39133) Thanks @scoootscooob.
## 2026.3.2

View File

@@ -506,6 +506,53 @@ describe("OpenAIWebSocketManager", () => {
expect(maxRetryError).toBeDefined();
});
it("does not double-count retries when error and close both fire on a reconnect attempt", async () => {
// In the real `ws` library, a failed connection fires "error" followed
// by "close". Previously, both the onClose handler AND the promise
// .catch() in _scheduleReconnect called _scheduleReconnect(), which
// double-incremented retryCount and exhausted the retry budget
// prematurely (e.g. 3 retries became ~1-2 actual attempts).
const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 5, 5] });
const errors = attachErrorCollector(manager);
const p = manager.connect("sk-test");
lastSocket().simulateOpen();
await p;
// Drop the established connection — triggers first reconnect schedule
lastSocket().simulateClose(1006, "Network error");
// Advance past first retry delay — a new socket is created
await vi.advanceTimersByTimeAsync(10);
const sock2 = lastSocket();
// Simulate a realistic failure: error fires first, then close follows.
sock2.simulateError(new Error("ECONNREFUSED"));
sock2.simulateClose(1006, "Connection failed");
// Advance past second retry delay — another socket should be created
// because we've only used 2 retries (not 3 from double-counting).
await vi.advanceTimersByTimeAsync(10);
const sock3 = lastSocket();
expect(sock3).not.toBe(sock2);
// Third attempt also fails with error+close
sock3.simulateError(new Error("ECONNREFUSED"));
sock3.simulateClose(1006, "Connection failed");
// Advance past third retry delay — one more attempt (retry 3 of 3)
await vi.advanceTimersByTimeAsync(10);
const sock4 = lastSocket();
expect(sock4).not.toBe(sock3);
// Fourth socket also fails — now retries should be exhausted (3/3)
sock4.simulateError(new Error("ECONNREFUSED"));
sock4.simulateClose(1006, "Connection failed");
await vi.advanceTimersByTimeAsync(10);
const maxRetryError = errors.find((e) => e.message.includes("max reconnect retries"));
expect(maxRetryError).toBeDefined();
});
it("resets retry count after a successful reconnect", async () => {
const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 10, 20] });
const p = manager.connect("sk-test");

View File

@@ -446,11 +446,11 @@ export class OpenAIWebSocketManager extends EventEmitter<InternalEvents> {
if (this.closed) {
return;
}
this._openConnection().catch((err: unknown) => {
// onError handler already emitted error event; schedule next retry.
void err;
this._scheduleReconnect();
});
// The onClose handler already calls _scheduleReconnect() for the next
// attempt, so we intentionally swallow the rejection here to avoid
// double-scheduling (which would double-increment retryCount per
// failed reconnect and exhaust the retry budget prematurely).
this._openConnection().catch(() => {});
}, delayMs);
}