From 156f13aa64cdbb11056250842c9a8d0bbb22b68a Mon Sep 17 00:00:00 2001 From: Sid Date: Wed, 25 Feb 2026 12:53:26 +0800 Subject: [PATCH] fix(agents): continue fallback loop for unrecognized provider errors (#26106) * fix(agents): continue fallback loop for unrecognized provider errors When a provider returns an error that coerceToFailoverError cannot classify (e.g., custom error messages without standard HTTP status codes), the fallback loop threw immediately instead of trying the next candidate. This caused fallback to stop after 2 models even when 17 were configured. Only rethrow unrecognized errors when they occur on the last candidate. For intermediate candidates, record the error as an attempt and continue to the next model. Closes #25926 Co-authored-by: Cursor * test: cover unknown-error fallback telemetry and land #26106 (thanks @Sid-Qin) --------- Co-authored-by: Cursor Co-authored-by: Peter Steinberger --- CHANGELOG.md | 1 + src/agents/model-fallback.test.ts | 46 +++++++++++++++++++++++++++++-- src/agents/model-fallback.ts | 13 ++++++--- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fb1208004..47dcf207a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Docs: https://docs.openclaw.ai - Agents/Model fallback: keep explicit text + image fallback chains reachable even when `agents.defaults.models` allowlists are present, prefer explicit run `agentId` over session-key parsing for followup fallback override resolution (with session-key fallback), treat agent-level fallback overrides as configured in embedded runner preflight, and classify `model_cooldown` / `cooling down` errors as `rate_limit` so failover continues. (#11972, #24137, #17231) - Followups/Routing: when explicit origin routing fails, allow same-channel fallback dispatch (while still blocking cross-channel fallback) so followup replies do not get dropped on transient origin-adapter failures. (#26109) Thanks @Sid-Qin. +- Agents/Model fallback: continue fallback traversal on unrecognized errors when candidates remain, while still throwing the original unknown error on the last candidate. (#26106) Thanks @Sid-Qin. ## 2026.2.24 diff --git a/src/agents/model-fallback.test.ts b/src/agents/model-fallback.test.ts index 903c292ec..16592cdb4 100644 --- a/src/agents/model-fallback.test.ts +++ b/src/agents/model-fallback.test.ts @@ -178,18 +178,60 @@ describe("runWithModelFallback", () => { expect(run).toHaveBeenCalledWith("openai-codex", "gpt-5.3-codex"); }); - it("does not fall back on non-auth errors", async () => { + it("falls back on unrecognized errors when candidates remain", async () => { const cfg = makeCfg(); const run = vi.fn().mockRejectedValueOnce(new Error("bad request")).mockResolvedValueOnce("ok"); + const result = await runWithModelFallback({ + cfg, + provider: "openai", + model: "gpt-4.1-mini", + run, + }); + expect(result.result).toBe("ok"); + expect(run).toHaveBeenCalledTimes(2); + expect(result.attempts).toHaveLength(1); + expect(result.attempts[0].error).toBe("bad request"); + expect(result.attempts[0].reason).toBe("unknown"); + }); + + it("passes original unknown errors to onError during fallback", async () => { + const cfg = makeCfg(); + const unknownError = new Error("provider misbehaved"); + const run = vi.fn().mockRejectedValueOnce(unknownError).mockResolvedValueOnce("ok"); + const onError = vi.fn(); + + await runWithModelFallback({ + cfg, + provider: "openai", + model: "gpt-4.1-mini", + run, + onError, + }); + + expect(onError).toHaveBeenCalledTimes(1); + expect(onError.mock.calls[0]?.[0]).toMatchObject({ + provider: "openai", + model: "gpt-4.1-mini", + attempt: 1, + total: 2, + }); + expect(onError.mock.calls[0]?.[0]?.error).toBe(unknownError); + }); + + it("throws unrecognized error on last candidate", async () => { + const cfg = makeCfg(); + const run = vi.fn().mockRejectedValueOnce(new Error("something weird")); + await expect( runWithModelFallback({ cfg, provider: "openai", model: "gpt-4.1-mini", run, + fallbacksOverride: [], }), - ).rejects.toThrow("bad request"); + ).rejects.toThrow("something weird"); expect(run).toHaveBeenCalledTimes(1); }); diff --git a/src/agents/model-fallback.ts b/src/agents/model-fallback.ts index b75eb8de4..e59d9e935 100644 --- a/src/agents/model-fallback.ts +++ b/src/agents/model-fallback.ts @@ -402,24 +402,29 @@ export async function runWithModelFallback(params: { provider: candidate.provider, model: candidate.model, }) ?? err; - if (!isFailoverError(normalized)) { + + // Even unrecognized errors should not abort the fallback loop when + // there are remaining candidates. Only abort/context-overflow errors + // (handled above) are truly non-retryable. + const isKnownFailover = isFailoverError(normalized); + if (!isKnownFailover && i === candidates.length - 1) { throw err; } - lastError = normalized; + lastError = isKnownFailover ? normalized : err; const described = describeFailoverError(normalized); attempts.push({ provider: candidate.provider, model: candidate.model, error: described.message, - reason: described.reason, + reason: described.reason ?? "unknown", status: described.status, code: described.code, }); await params.onError?.({ provider: candidate.provider, model: candidate.model, - error: normalized, + error: isKnownFailover ? normalized : err, attempt: i + 1, total: candidates.length, });