fix(agents): avoid double websocket retry accounting on reconnect failures (#39133)
Co-authored-by: scoootscooob <zhentongfan@gmail.com>
This commit is contained in:
@@ -267,6 +267,7 @@ Docs: https://docs.openclaw.ai
|
||||
- Agents/compaction counter accuracy: count successful overflow-triggered auto-compactions (`willRetry=true`) in the compaction counter while still excluding aborted/no-result events, so `/status` reflects actual safeguard compaction activity. (#39123) Thanks @MumuTW.
|
||||
- Gateway/chat delta ordering: flush buffered assistant deltas before emitting tool `start` events so pre-tool text is delivered to Control UI before tool cards, avoiding transient text/tool ordering artifacts in streaming. (#39128) Thanks @0xtangping.
|
||||
- Voice-call plugin schema parity: add missing manifest `configSchema` fields (`webhookSecurity`, `streaming.preStartTimeoutMs|maxPendingConnections|maxPendingConnectionsPerIp|maxConnections`, `staleCallReaperSeconds`) so gateway AJV validation accepts already-supported runtime config instead of failing with `additionalProperties` errors. (#38892) Thanks @giumex.
|
||||
- Agents/OpenAI WS reconnect retry accounting: avoid double retry scheduling when reconnect failures emit both `error` and `close`, so retry budgets track actual reconnect attempts instead of exhausting early. (#39133) Thanks @scoootscooob.
|
||||
|
||||
## 2026.3.2
|
||||
|
||||
|
||||
@@ -506,6 +506,53 @@ describe("OpenAIWebSocketManager", () => {
|
||||
expect(maxRetryError).toBeDefined();
|
||||
});
|
||||
|
||||
it("does not double-count retries when error and close both fire on a reconnect attempt", async () => {
|
||||
// In the real `ws` library, a failed connection fires "error" followed
|
||||
// by "close". Previously, both the onClose handler AND the promise
|
||||
// .catch() in _scheduleReconnect called _scheduleReconnect(), which
|
||||
// double-incremented retryCount and exhausted the retry budget
|
||||
// prematurely (e.g. 3 retries became ~1-2 actual attempts).
|
||||
const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 5, 5] });
|
||||
const errors = attachErrorCollector(manager);
|
||||
const p = manager.connect("sk-test");
|
||||
lastSocket().simulateOpen();
|
||||
await p;
|
||||
|
||||
// Drop the established connection — triggers first reconnect schedule
|
||||
lastSocket().simulateClose(1006, "Network error");
|
||||
|
||||
// Advance past first retry delay — a new socket is created
|
||||
await vi.advanceTimersByTimeAsync(10);
|
||||
const sock2 = lastSocket();
|
||||
|
||||
// Simulate a realistic failure: error fires first, then close follows.
|
||||
sock2.simulateError(new Error("ECONNREFUSED"));
|
||||
sock2.simulateClose(1006, "Connection failed");
|
||||
|
||||
// Advance past second retry delay — another socket should be created
|
||||
// because we've only used 2 retries (not 3 from double-counting).
|
||||
await vi.advanceTimersByTimeAsync(10);
|
||||
const sock3 = lastSocket();
|
||||
expect(sock3).not.toBe(sock2);
|
||||
|
||||
// Third attempt also fails with error+close
|
||||
sock3.simulateError(new Error("ECONNREFUSED"));
|
||||
sock3.simulateClose(1006, "Connection failed");
|
||||
|
||||
// Advance past third retry delay — one more attempt (retry 3 of 3)
|
||||
await vi.advanceTimersByTimeAsync(10);
|
||||
const sock4 = lastSocket();
|
||||
expect(sock4).not.toBe(sock3);
|
||||
|
||||
// Fourth socket also fails — now retries should be exhausted (3/3)
|
||||
sock4.simulateError(new Error("ECONNREFUSED"));
|
||||
sock4.simulateClose(1006, "Connection failed");
|
||||
await vi.advanceTimersByTimeAsync(10);
|
||||
|
||||
const maxRetryError = errors.find((e) => e.message.includes("max reconnect retries"));
|
||||
expect(maxRetryError).toBeDefined();
|
||||
});
|
||||
|
||||
it("resets retry count after a successful reconnect", async () => {
|
||||
const manager = buildManager({ maxRetries: 3, backoffDelaysMs: [5, 10, 20] });
|
||||
const p = manager.connect("sk-test");
|
||||
|
||||
@@ -446,11 +446,11 @@ export class OpenAIWebSocketManager extends EventEmitter<InternalEvents> {
|
||||
if (this.closed) {
|
||||
return;
|
||||
}
|
||||
this._openConnection().catch((err: unknown) => {
|
||||
// onError handler already emitted error event; schedule next retry.
|
||||
void err;
|
||||
this._scheduleReconnect();
|
||||
});
|
||||
// The onClose handler already calls _scheduleReconnect() for the next
|
||||
// attempt, so we intentionally swallow the rejection here to avoid
|
||||
// double-scheduling (which would double-increment retryCount per
|
||||
// failed reconnect and exhaust the retry budget prematurely).
|
||||
this._openConnection().catch(() => {});
|
||||
}, delayMs);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user