// openclaw/src/agents/failover-error.test.ts

import { describe, expect, it } from "vitest";
import {
  coerceToFailoverError,
  describeFailoverError,
  isTimeoutError,
  resolveFailoverReasonFromError,
  resolveFailoverStatus,
} from "./failover-error.js";

// Provider error fixtures.
// Each constant below is a message string paired with an HTTP status in the
// "classifies documented provider error shapes at the error boundary" test.
// The link above each fixture points at the provider documentation the example
// shape was taken from.

// OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors
const OPENAI_RATE_LIMIT_MESSAGE =
  "Rate limit reached for gpt-4.1-mini in organization org_test on requests per min. Limit: 3.000000 / min. Current: 3.000000 / min.";
// Anthropic overloaded_error example shape (serialized JSON body): https://docs.anthropic.com/en/api/errors
const ANTHROPIC_OVERLOADED_PAYLOAD =
  '{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"},"request_id":"req_test"}';
// Gemini RESOURCE_EXHAUSTED troubleshooting example: https://ai.google.dev/gemini-api/docs/troubleshooting
const GEMINI_RESOURCE_EXHAUSTED_MESSAGE =
  "RESOURCE_EXHAUSTED: Resource has been exhausted (e.g. check quota).";
// OpenRouter 402 billing example: https://openrouter.ai/docs/api-reference/errors
const OPENROUTER_CREDITS_MESSAGE = "Payment Required: insufficient credits";
// AWS Bedrock 429 ThrottlingException / 503 ServiceUnavailable examples:
// https://docs.aws.amazon.com/bedrock/latest/userguide/troubleshooting-api-error-codes.html
const BEDROCK_THROTTLING_EXCEPTION_MESSAGE =
  "ThrottlingException: Your request was denied due to exceeding the account quotas for Amazon Bedrock.";
const BEDROCK_SERVICE_UNAVAILABLE_MESSAGE =
  "ServiceUnavailable: The service is temporarily unable to handle the request.";
// Groq error code examples (429 and 503): https://console.groq.com/docs/errors
const GROQ_TOO_MANY_REQUESTS_MESSAGE =
  "429 Too Many Requests: Too many requests were sent in a given timeframe.";
const GROQ_SERVICE_UNAVAILABLE_MESSAGE =
  "503 Service Unavailable: The server is temporarily unable to handle the request due to overloading or maintenance.";

/**
 * Regression suite for the helpers exported by ./failover-error.js:
 * reason classification (`resolveFailoverReasonFromError`), timeout detection
 * (`isTimeoutError`), coercion (`coerceToFailoverError`), status lookup
 * (`resolveFailoverStatus`) and description (`describeFailoverError`).
 *
 * Most tests are table-driven: each row is asserted in order, and the first
 * mismatching row fails the test.
 */
describe("failover-error", () => {
  // Inputs carry only a status (numeric `status` or string `statusCode`), no message.
  it("infers failover reason from HTTP status", () => {
    const statusCases = [
      { input: { status: 402 }, reason: "billing" },
      { input: { statusCode: "429" }, reason: "rate_limit" },
      { input: { status: 403 }, reason: "auth" },
      { input: { status: 408 }, reason: "timeout" },
      { input: { status: 400 }, reason: "format" },
      // Keep the status-only path behavior-preserving and conservative.
      { input: { status: 500 }, reason: null },
      { input: { status: 502 }, reason: "timeout" },
      { input: { status: 503 }, reason: "timeout" },
      { input: { status: 504 }, reason: "timeout" },
      { input: { status: 521 }, reason: null },
      { input: { status: 522 }, reason: null },
      { input: { status: 523 }, reason: null },
      { input: { status: 524 }, reason: null },
      { input: { status: 529 }, reason: "rate_limit" },
    ];

    for (const { input, reason } of statusCases) {
      // A `null` reason row asserts that the status alone yields no classification.
      expect(resolveFailoverReasonFromError(input)).toBe(reason);
    }
  });

  // Pairs each documented provider message fixture with the status it is reported under.
  it("classifies documented provider error shapes at the error boundary", () => {
    const documentedErrors = [
      { status: 429, message: OPENAI_RATE_LIMIT_MESSAGE, reason: "rate_limit" },
      { status: 529, message: ANTHROPIC_OVERLOADED_PAYLOAD, reason: "rate_limit" },
      { status: 429, message: GEMINI_RESOURCE_EXHAUSTED_MESSAGE, reason: "rate_limit" },
      { status: 402, message: OPENROUTER_CREDITS_MESSAGE, reason: "billing" },
      { status: 429, message: BEDROCK_THROTTLING_EXCEPTION_MESSAGE, reason: "rate_limit" },
      { status: 503, message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE, reason: "timeout" },
      { status: 429, message: GROQ_TOO_MANY_REQUESTS_MESSAGE, reason: "rate_limit" },
      { status: 503, message: GROQ_SERVICE_UNAVAILABLE_MESSAGE, reason: "timeout" },
    ];

    for (const { status, message, reason } of documentedErrors) {
      expect(resolveFailoverReasonFromError({ status, message })).toBe(reason);
    }
  });

  // Message-only input, no status attached.
  it("infers format errors from error messages", () => {
    const message = "invalid request format: messages.1.content.1.tool_use.id";
    expect(resolveFailoverReasonFromError({ message })).toBe("format");
  });

  // Code-only inputs, as found on Node-style errors.
  it("infers timeout from common node error codes", () => {
    for (const code of ["ETIMEDOUT", "ECONNRESET"]) {
      expect(resolveFailoverReasonFromError({ code })).toBe("timeout");
    }
  });

  // Covers the "abort" and "error" stop reasons with and without their prefixes.
  it("infers timeout from abort/error stop-reason messages", () => {
    const stopReasonMessages = [
      "Unhandled stop reason: abort",
      "Unhandled stop reason: error",
      "stop reason: abort",
      "stop reason: error",
      "reason: abort",
      "reason: error",
    ];

    for (const message of stopReasonMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
    }
  });

  // Connection failures described only in the message text.
  it("infers timeout from connection/network error messages", () => {
    const networkMessages = [
      "Connection error.",
      "fetch failed",
      "Network error: ECONNREFUSED",
      "dial tcp: lookup api.example.com: no such host (ENOTFOUND)",
      "temporary dns failure EAI_AGAIN",
    ];

    for (const message of networkMessages) {
      expect(resolveFailoverReasonFromError({ message })).toBe("timeout");
    }
  });

  // An Error instance renamed to AbortError and carrying a `reason` string.
  it("treats AbortError reason=abort as timeout", () => {
    const overrides = { name: "AbortError", reason: "reason: abort" };
    const abortLike = Object.assign(new Error("aborted"), overrides);

    expect(isTimeoutError(abortLike)).toBe(true);
  });

  // A plain string input is coerced and the supplied provider/model are attached.
  it("coerces failover-worthy errors into FailoverError with metadata", () => {
    const context = { provider: "anthropic", model: "claude-opus-4-5" };
    const failover = coerceToFailoverError("credit balance too low", context);

    expect(failover?.name).toBe("FailoverError");
    expect(failover?.reason).toBe("billing");
    expect(failover?.status).toBe(402);
    expect(failover?.provider).toBe("anthropic");
    expect(failover?.model).toBe("claude-opus-4-5");
  });

  it("coerces format errors with a 400 status", () => {
    const context = { provider: "google", model: "cloud-code-assist" };
    const failover = coerceToFailoverError("invalid request format", context);

    expect(failover?.reason).toBe("format");
    expect(failover?.status).toBe(400);
  });

  // Generic 401/403 messages must not be upgraded to the permanent variant.
  it("401/403 with generic message still returns auth (backward compat)", () => {
    const genericAuthErrors = [
      { status: 401, message: "Unauthorized" },
      { status: 403, message: "Forbidden" },
    ];

    genericAuthErrors.forEach((authError) => {
      expect(resolveFailoverReasonFromError(authError)).toBe("auth");
    });
  });

  it("401 with permanent auth message returns auth_permanent", () => {
    const reason = resolveFailoverReasonFromError({ status: 401, message: "invalid_api_key" });
    expect(reason).toBe("auth_permanent");
  });

  it("403 with revoked key message returns auth_permanent", () => {
    const reason = resolveFailoverReasonFromError({ status: 403, message: "api key revoked" });
    expect(reason).toBe("auth_permanent");
  });

  it("resolveFailoverStatus maps auth_permanent to 403", () => {
    const status = resolveFailoverStatus("auth_permanent");
    expect(status).toBe(403);
  });

  // Object input (status + message) rather than a bare string.
  it("coerces permanent auth error with correct reason", () => {
    const providerError = { status: 401, message: "invalid_api_key" };
    const context = { provider: "anthropic", model: "claude-opus-4-6" };
    const failover = coerceToFailoverError(providerError, context);

    expect(failover?.reason).toBe("auth_permanent");
    expect(failover?.provider).toBe("anthropic");
  });

  it("403 permission_error returns auth_permanent", () => {
    const message =
      "permission_error: OAuth authentication is currently not allowed for this organization.";

    expect(resolveFailoverReasonFromError({ status: 403, message })).toBe("auth_permanent");
  });

  // Same permission_error text, but delivered as a plain string with an "HTTP 403" prefix.
  it("permission_error in error message string classifies as auth_permanent", () => {
    const rawMessage =
      "HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.";
    const context = { provider: "anthropic", model: "claude-opus-4-6" };
    const failover = coerceToFailoverError(rawMessage, context);

    expect(failover?.reason).toBe("auth_permanent");
  });

  // No status and no "permission_error" token: only the organization phrase remains.
  it("'not allowed for this organization' classifies as auth_permanent", () => {
    const rawMessage = "OAuth authentication is currently not allowed for this organization";
    const context = { provider: "anthropic", model: "claude-opus-4-6" };
    const failover = coerceToFailoverError(rawMessage, context);

    expect(failover?.reason).toBe("auth_permanent");
  });

  // A number input is stringified for the message and gets no reason.
  it("describes non-Error values consistently", () => {
    const { message, reason } = describeFailoverError(123);

    expect(message).toBe("123");
    expect(reason).toBeUndefined();
  });
});
refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`import { describe, expect, it } from "vitest";`
			`import {`
			`coerceToFailoverError,`
			`describeFailoverError,`
fix(failover): align abort timeout detection and regressions 2026-02-16 20:59:44 -05:00			`isTimeoutError,`
refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`resolveFailoverReasonFromError,`
fix(auth): distinguish revoked API keys from transient auth errors (#25754) Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: 8f9c07a200644284e11adae76368adab40c5fa4e Co-authored-by: rrenamed <87486610+rrenamed@users.noreply.github.com> Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com> Reviewed-by: @gumadeiras 2026-02-26 02:47:16 +02:00			`resolveFailoverStatus,`
refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`} from "./failover-error.js";`

test(agents): add provider-backed failover regressions (#36735) * test(agents): add provider-backed failover fixtures * test(agents): cover more provider error docs * test(agents): tighten provider doc fixtures 2026-03-06 00:42:59 +03:00			`// OpenAI 429 example shape: https://help.openai.com/en/articles/5955604-how-can-i-solve-429-too-many-requests-errors`
			`const OPENAI_RATE_LIMIT_MESSAGE =`
			`"Rate limit reached for gpt-4.1-mini in organization org_test on requests per min. Limit: 3.000000 / min. Current: 3.000000 / min.";`
			`// Anthropic overloaded_error example shape: https://docs.anthropic.com/en/api/errors`
			`const ANTHROPIC_OVERLOADED_PAYLOAD =`
			`'{"type":"error","error":{"type":"overloaded_error","message":"Overloaded"},"request_id":"req_test"}';`
			`// Gemini RESOURCE_EXHAUSTED troubleshooting example: https://ai.google.dev/gemini-api/docs/troubleshooting`
			`const GEMINI_RESOURCE_EXHAUSTED_MESSAGE =`
			`"RESOURCE_EXHAUSTED: Resource has been exhausted (e.g. check quota).";`
			`// OpenRouter 402 billing example: https://openrouter.ai/docs/api-reference/errors`
			`const OPENROUTER_CREDITS_MESSAGE = "Payment Required: insufficient credits";`
			`// AWS Bedrock 429 ThrottlingException / 503 ServiceUnavailable:`
			`// https://docs.aws.amazon.com/bedrock/latest/userguide/troubleshooting-api-error-codes.html`
			`const BEDROCK_THROTTLING_EXCEPTION_MESSAGE =`
			`"ThrottlingException: Your request was denied due to exceeding the account quotas for Amazon Bedrock.";`
			`const BEDROCK_SERVICE_UNAVAILABLE_MESSAGE =`
			`"ServiceUnavailable: The service is temporarily unable to handle the request.";`
			`// Groq error codes examples: https://console.groq.com/docs/errors`
			`const GROQ_TOO_MANY_REQUESTS_MESSAGE =`
			`"429 Too Many Requests: Too many requests were sent in a given timeframe.";`
			`const GROQ_SERVICE_UNAVAILABLE_MESSAGE =`
			`"503 Service Unavailable: The server is temporarily unable to handle the request due to overloading or maintenance.";`

refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`describe("failover-error", () => {`
			`it("infers failover reason from HTTP status", () => {`
			`expect(resolveFailoverReasonFromError({ status: 402 })).toBe("billing");`
chore: migrate to oxlint and oxfmt Co-authored-by: Christoph Nakazawa <christoph.pojer@gmail.com> 2026-01-14 14:31:43 +00:00			`expect(resolveFailoverReasonFromError({ statusCode: "429" })).toBe("rate_limit");`
refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`expect(resolveFailoverReasonFromError({ status: 403 })).toBe("auth");`
			`expect(resolveFailoverReasonFromError({ status: 408 })).toBe("timeout");`
fix: handle 400 status in failover to enable model fallback (#1879) 2026-02-09 09:12:06 +02:00			`expect(resolveFailoverReasonFromError({ status: 400 })).toBe("format");`
refactor(agents): share failover HTTP status classification (#36615) * fix(agents): classify transient failover statuses consistently * fix(agents): preserve legacy failover status mapping 2026-03-05 23:50:36 +03:00			`// Keep the status-only path behavior-preserving and conservative.`
			`expect(resolveFailoverReasonFromError({ status: 500 })).toBeNull();`
fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) (#21017) * fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) When a model API returns 502 Bad Gateway, 503 Service Unavailable, or 504 Gateway Timeout, the error object carries the status code directly. resolveFailoverReasonFromError() only checked 402/429/401/403/408/400, so 5xx server errors fell through to message-based classification which requires the status code to appear at the start of the error message. Many API SDKs (Google, Anthropic) set err.status = 503 without prefixing the message with '503', so the message classifier never matched and failover never triggered — the run retried the same broken model. Add 502/503/504 to the status-code branch, returning 'timeout' (matching the existing behavior of isTransientHttpError in the message classifier). Fixes #20999 * Changelog: add failover 502/503/504 note with credits * Failover: classify HTTP 504 as transient in message parser * Changelog: credit taw0002 and vincentkoc for failover fix --------- Co-authored-by: Vincent Koc <vincentkoc@ieee.org> 2026-02-23 01:01:57 -07:00			`expect(resolveFailoverReasonFromError({ status: 502 })).toBe("timeout");`
fix: treat HTTP 503 as failover-eligible for LLM provider errors (#21086) * fix: treat HTTP 503 as failover-eligible for LLM provider errors When LLM SDKs wrap 503 responses, the leading "503" prefix is lost (e.g. Google Gemini returns "high demand" / "UNAVAILABLE" without a numeric prefix). The existing isTransientHttpError only matches messages starting with "503 ...", so these wrapped errors silently skip failover — no profile rotation, no model fallback. This patch closes that gap: - resolveFailoverReasonFromError: map HTTP status 503 → rate_limit (covers structured error objects with a status field) - ERROR_PATTERNS.overloaded: add /\b503\b/, "service unavailable", "high demand" (covers message-only classification when the leading status prefix is absent) Existing isTransientHttpError behavior is unchanged; these additions are complementary and only fire for errors that previously fell through unclassified. * fix: address review feedback — drop /\b503\b/ pattern, add test coverage - Remove `/\b503\b/` from ERROR_PATTERNS.overloaded to resolve the semantic inconsistency noted by reviewers: `isTransientHttpError` already handles messages prefixed with "503" (→ "timeout"), so a redundant overloaded pattern would classify the same class of errors differently depending on message formatting. - Keep "service unavailable" and "high demand" patterns — these are the real gap-fillers for SDK-rewritten messages that lack a numeric prefix. - Add test case for JSON-wrapped 503 error body containing "overloaded" to strengthen coverage. * fix: unify 503 classification — status 503 → timeout (consistent with isTransientHttpError) resolveFailoverReasonFromError previously mapped status 503 → "rate_limit", while the string-based isTransientHttpError mapped "503 ..." → "timeout". Align both paths: structured {status: 503} now also returns "timeout", matching the existing transient-error convention. Both reasons are failover-eligible, so runtime behavior is unchanged. 
--------- Co-authored-by: Vincent Koc <vincentkoc@ieee.org> 2026-02-20 04:45:09 +08:00			`expect(resolveFailoverReasonFromError({ status: 503 })).toBe("timeout");`
fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) (#21017) * fix: treat HTTP 502/503/504 as failover-eligible (timeout reason) When a model API returns 502 Bad Gateway, 503 Service Unavailable, or 504 Gateway Timeout, the error object carries the status code directly. resolveFailoverReasonFromError() only checked 402/429/401/403/408/400, so 5xx server errors fell through to message-based classification which requires the status code to appear at the start of the error message. Many API SDKs (Google, Anthropic) set err.status = 503 without prefixing the message with '503', so the message classifier never matched and failover never triggered — the run retried the same broken model. Add 502/503/504 to the status-code branch, returning 'timeout' (matching the existing behavior of isTransientHttpError in the message classifier). Fixes #20999 * Changelog: add failover 502/503/504 note with credits * Failover: classify HTTP 504 as transient in message parser * Changelog: credit taw0002 and vincentkoc for failover fix --------- Co-authored-by: Vincent Koc <vincentkoc@ieee.org> 2026-02-23 01:01:57 -07:00			`expect(resolveFailoverReasonFromError({ status: 504 })).toBe("timeout");`
refactor(agents): share failover HTTP status classification (#36615) * fix(agents): classify transient failover statuses consistently * fix(agents): preserve legacy failover status mapping 2026-03-05 23:50:36 +03:00			`expect(resolveFailoverReasonFromError({ status: 521 })).toBeNull();`
			`expect(resolveFailoverReasonFromError({ status: 522 })).toBeNull();`
			`expect(resolveFailoverReasonFromError({ status: 523 })).toBeNull();`
			`expect(resolveFailoverReasonFromError({ status: 524 })).toBeNull();`
fix: handle HTTP 529 (Anthropic overloaded) in failover error classification Classify Anthropic's 529 status code as "rate_limit" so model fallback triggers reliably without depending on fragile message-based detection. Closes #28502 2026-03-02 21:03:37 +05:30			`expect(resolveFailoverReasonFromError({ status: 529 })).toBe("rate_limit");`
refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`});`

test(agents): add provider-backed failover regressions (#36735) * test(agents): add provider-backed failover fixtures * test(agents): cover more provider error docs * test(agents): tighten provider doc fixtures 2026-03-06 00:42:59 +03:00			`it("classifies documented provider error shapes at the error boundary", () => {`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 429,`
			`message: OPENAI_RATE_LIMIT_MESSAGE,`
			`}),`
			`).toBe("rate_limit");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 529,`
			`message: ANTHROPIC_OVERLOADED_PAYLOAD,`
			`}),`
			`).toBe("rate_limit");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 429,`
			`message: GEMINI_RESOURCE_EXHAUSTED_MESSAGE,`
			`}),`
			`).toBe("rate_limit");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 402,`
			`message: OPENROUTER_CREDITS_MESSAGE,`
			`}),`
			`).toBe("billing");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 429,`
			`message: BEDROCK_THROTTLING_EXCEPTION_MESSAGE,`
			`}),`
			`).toBe("rate_limit");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 503,`
			`message: BEDROCK_SERVICE_UNAVAILABLE_MESSAGE,`
			`}),`
			`).toBe("timeout");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 429,`
			`message: GROQ_TOO_MANY_REQUESTS_MESSAGE,`
			`}),`
			`).toBe("rate_limit");`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 503,`
			`message: GROQ_SERVICE_UNAVAILABLE_MESSAGE,`
			`}),`
			`).toBe("timeout");`
			`});`

refactor: centralize failover error parsing 2026-01-10 01:25:01 +01:00			`it("infers format errors from error messages", () => {`
			`expect(`
			`resolveFailoverReasonFromError({`
			`message: "invalid request format: messages.1.content.1.tool_use.id",`
			`}),`
			`).toBe("format");`
			`});`

refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`it("infers timeout from common node error codes", () => {`
chore: migrate to oxlint and oxfmt Co-authored-by: Christoph Nakazawa <christoph.pojer@gmail.com> 2026-01-14 14:31:43 +00:00			`expect(resolveFailoverReasonFromError({ code: "ETIMEDOUT" })).toBe("timeout");`
			`expect(resolveFailoverReasonFromError({ code: "ECONNRESET" })).toBe("timeout");`
refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`});`

fix(failover): treat stop reason error as timeout 2026-03-03 01:05:06 +00:00			`it("infers timeout from abort/error stop-reason messages", () => {`
fix(failover): align abort timeout detection and regressions 2026-02-16 20:59:44 -05:00			`expect(resolveFailoverReasonFromError({ message: "Unhandled stop reason: abort" })).toBe(`
			`"timeout",`
			`);`
fix(failover): handle unhandled stop reason error 2026-03-03 01:04:20 +00:00			`expect(resolveFailoverReasonFromError({ message: "Unhandled stop reason: error" })).toBe(`
			`"timeout",`
			`);`
fix(failover): align abort timeout detection and regressions 2026-02-16 20:59:44 -05:00			`expect(resolveFailoverReasonFromError({ message: "stop reason: abort" })).toBe("timeout");`
fix(failover): treat stop reason error as timeout 2026-03-03 01:05:06 +00:00			`expect(resolveFailoverReasonFromError({ message: "stop reason: error" })).toBe("timeout");`
fix(failover): align abort timeout detection and regressions 2026-02-16 20:59:44 -05:00			`expect(resolveFailoverReasonFromError({ message: "reason: abort" })).toBe("timeout");`
fix(failover): treat stop reason error as timeout 2026-03-03 01:05:06 +00:00			`expect(resolveFailoverReasonFromError({ message: "reason: error" })).toBe("timeout");`
fix(failover): align abort timeout detection and regressions 2026-02-16 20:59:44 -05:00			`});`

fix(agents): recognize connection errors as retryable timeout failures (#31697) * fix(agents): recognize connection errors as retryable timeout failures ## Problem When a model endpoint becomes unreachable (e.g., local proxy down, relay server offline), the failover system fails to switch to the next candidate model. Errors like "Connection error." are not classified as retryable, causing the session to hang on a broken endpoint instead of falling back to healthy alternatives. ## Root Cause Connection/network errors are not recognized by the current failover classifier: - Text patterns like "Connection error.", "fetch failed", "network error" - Error codes like ECONNREFUSED, ENOTFOUND, EAI_AGAIN (in message text) While `failover-error.ts` handles these as error codes (err.code), it misses them when they appear as plain text in error messages. ## Solution Extend timeout error patterns to include connection/network failures: In `errors.ts` (ERROR_PATTERNS.timeout): - Text: "connection error", "network error", "fetch failed", etc. - Regex: /\beconn(?:refused\|reset\|aborted)\b/i, /\benotfound\b/i, /\beai_again\b/i In `failover-error.ts` (TIMEOUT_HINT_RE): - Same patterns for non-assistant error paths ## Testing Added test cases covering: - "Connection error." - "fetch failed" - "network error: ECONNREFUSED" - "ENOTFOUND" / "EAI_AGAIN" in message text ## Impact - Compatibility: High - only expands retryable error detection - Behavior: Connection failures now trigger automatic fallback - Risk: Low - changes are additive and well-tested * style: fix code formatting for test file 2026-03-03 10:37:23 +08:00			`it("infers timeout from connection/network error messages", () => {`
			`expect(resolveFailoverReasonFromError({ message: "Connection error." })).toBe("timeout");`
			`expect(resolveFailoverReasonFromError({ message: "fetch failed" })).toBe("timeout");`
			`expect(resolveFailoverReasonFromError({ message: "Network error: ECONNREFUSED" })).toBe(`
			`"timeout",`
			`);`
			`expect(`
			`resolveFailoverReasonFromError({`
			`message: "dial tcp: lookup api.example.com: no such host (ENOTFOUND)",`
			`}),`
			`).toBe("timeout");`
			`expect(resolveFailoverReasonFromError({ message: "temporary dns failure EAI_AGAIN" })).toBe(`
			`"timeout",`
			`);`
			`});`

fix(failover): align abort timeout detection and regressions 2026-02-16 20:59:44 -05:00			`it("treats AbortError reason=abort as timeout", () => {`
			`const err = Object.assign(new Error("aborted"), {`
			`name: "AbortError",`
			`reason: "reason: abort",`
			`});`
			`expect(isTimeoutError(err)).toBe(true);`
			`});`

refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`it("coerces failover-worthy errors into FailoverError with metadata", () => {`
			`const err = coerceToFailoverError("credit balance too low", {`
			`provider: "anthropic",`
			`model: "claude-opus-4-5",`
			`});`
			`expect(err?.name).toBe("FailoverError");`
			`expect(err?.reason).toBe("billing");`
			`expect(err?.status).toBe(402);`
			`expect(err?.provider).toBe("anthropic");`
			`expect(err?.model).toBe("claude-opus-4-5");`
			`});`

refactor: centralize failover error parsing 2026-01-10 01:25:01 +01:00			`it("coerces format errors with a 400 status", () => {`
			`const err = coerceToFailoverError("invalid request format", {`
			`provider: "google",`
			`model: "cloud-code-assist",`
			`});`
			`expect(err?.reason).toBe("format");`
			`expect(err?.status).toBe(400);`
			`});`

fix(auth): distinguish revoked API keys from transient auth errors (#25754) Merged via /review-pr -> /prepare-pr -> /merge-pr. Prepared head SHA: 8f9c07a200644284e11adae76368adab40c5fa4e Co-authored-by: rrenamed <87486610+rrenamed@users.noreply.github.com> Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com> Reviewed-by: @gumadeiras 2026-02-26 02:47:16 +02:00			`it("401/403 with generic message still returns auth (backward compat)", () => {`
			`expect(resolveFailoverReasonFromError({ status: 401, message: "Unauthorized" })).toBe("auth");`
			`expect(resolveFailoverReasonFromError({ status: 403, message: "Forbidden" })).toBe("auth");`
			`});`

			`it("401 with permanent auth message returns auth_permanent", () => {`
			`expect(resolveFailoverReasonFromError({ status: 401, message: "invalid_api_key" })).toBe(`
			`"auth_permanent",`
			`);`
			`});`

			`it("403 with revoked key message returns auth_permanent", () => {`
			`expect(resolveFailoverReasonFromError({ status: 403, message: "api key revoked" })).toBe(`
			`"auth_permanent",`
			`);`
			`});`

			`it("resolveFailoverStatus maps auth_permanent to 403", () => {`
			`expect(resolveFailoverStatus("auth_permanent")).toBe(403);`
			`});`

			`it("coerces permanent auth error with correct reason", () => {`
			`const err = coerceToFailoverError(`
			`{ status: 401, message: "invalid_api_key" },`
			`{ provider: "anthropic", model: "claude-opus-4-6" },`
			`);`
			`expect(err?.reason).toBe("auth_permanent");`
			`expect(err?.provider).toBe("anthropic");`
			`});`

fix(auth): classify permission_error as auth_permanent for profile fallback (#31324) When an OAuth auth profile returns HTTP 403 with permission_error (e.g. expired plan), the error was not matched by the authPermanent patterns. This caused the profile to receive only a short cooldown instead of being disabled, so the gateway kept retrying the same broken profile indefinitely. Add "permission_error" and "not allowed for this organization" to the authPermanent error patterns so these errors trigger the longer billing/auth_permanent disable window and proper profile rotation. Closes #31306 Made-with: Cursor Co-authored-by: Vincent Koc <vincentkoc@ieee.org> 2026-03-02 14:26:05 +08:00			`it("403 permission_error returns auth_permanent", () => {`
			`expect(`
			`resolveFailoverReasonFromError({`
			`status: 403,`
			`message:`
			`"permission_error: OAuth authentication is currently not allowed for this organization.",`
			`}),`
			`).toBe("auth_permanent");`
			`});`

			`it("permission_error in error message string classifies as auth_permanent", () => {`
			`const err = coerceToFailoverError(`
			`"HTTP 403 permission_error: OAuth authentication is currently not allowed for this organization.",`
			`{ provider: "anthropic", model: "claude-opus-4-6" },`
			`);`
			`expect(err?.reason).toBe("auth_permanent");`
			`});`

			`it("'not allowed for this organization' classifies as auth_permanent", () => {`
			`const err = coerceToFailoverError(`
			`"OAuth authentication is currently not allowed for this organization",`
			`{ provider: "anthropic", model: "claude-opus-4-6" },`
			`);`
			`expect(err?.reason).toBe("auth_permanent");`
			`});`

refactor(agents): centralize failover normalization 2026-01-09 22:15:03 +01:00			`it("describes non-Error values consistently", () => {`
			`const described = describeFailoverError(123);`
			`expect(described.message).toBe("123");`
			`expect(described.reason).toBeUndefined();`
			`});`
			`});`