2026-03-02 19:47:30 +00:00
|
|
|
import { readErrorName } from "../infra/errors.js";
|
2026-02-26 02:47:16 +02:00
|
|
|
import {
|
|
|
|
|
classifyFailoverReason,
|
|
|
|
|
isAuthPermanentErrorMessage,
|
2026-03-03 02:51:00 +00:00
|
|
|
isTimeoutErrorMessage,
|
2026-02-26 02:47:16 +02:00
|
|
|
type FailoverReason,
|
|
|
|
|
} from "./pi-embedded-helpers.js";
|
2026-01-09 21:57:52 +01:00
|
|
|
|
2026-01-24 06:27:24 -05:00
|
|
|
// Case-insensitive match for "request was aborted" / "request aborted";
// isTimeoutError tests AbortError messages against it.
const ABORT_TIMEOUT_RE = /request was aborted|request aborted/i;
|
2026-01-18 07:52:19 +00:00
|
|
|
|
2026-01-09 21:57:52 +01:00
|
|
|
/**
 * Error that carries a classified failover reason together with optional
 * context about where the failure happened (provider, model, profile) and
 * how it was reported (status, code).
 */
export class FailoverError extends Error {
  /** Classified reason for the failure. */
  readonly reason: FailoverReason;
  /** Provider the failing request was sent to, when supplied. */
  readonly provider?: string;
  /** Model the failing request targeted, when supplied. */
  readonly model?: string;
  /** Profile identifier associated with the request, when supplied. */
  readonly profileId?: string;
  /** Numeric status associated with the failure, when supplied. */
  readonly status?: number;
  /** Error code string associated with the failure, when supplied. */
  readonly code?: string;

  /**
   * @param message - Human-readable error message.
   * @param params - Failover reason plus optional context; `cause` is
   *   forwarded to the base `Error` constructor options.
   */
  constructor(
    message: string,
    params: {
      reason: FailoverReason;
      provider?: string;
      model?: string;
      profileId?: string;
      status?: number;
      code?: string;
      cause?: unknown;
    },
  ) {
    const { reason, provider, model, profileId, status, code, cause } = params;
    super(message, { cause });
    this.name = "FailoverError";
    this.reason = reason;
    this.provider = provider;
    this.model = model;
    this.profileId = profileId;
    this.status = status;
    this.code = code;
  }
}
|
|
|
|
|
|
|
|
|
|
/**
 * Type guard: reports whether `err` is an instance of {@link FailoverError}.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` when `err instanceof FailoverError`.
 */
export function isFailoverError(err: unknown): err is FailoverError {
  const matches = err instanceof FailoverError;
  return matches;
}
|
|
|
|
|
|
2026-01-14 14:31:43 +00:00
|
|
|
// HTTP status reported for each failover reason that has a mapping.
const FAILOVER_STATUS_BY_REASON: ReadonlyMap<string, number> = new Map([
  ["billing", 402],
  ["rate_limit", 429],
  ["auth", 401],
  ["auth_permanent", 403],
  ["timeout", 408],
  ["format", 400],
  ["model_not_found", 404],
  ["session_expired", 410], // Gone - session no longer exists
]);

/**
 * Resolves the HTTP status code that corresponds to a failover reason.
 *
 * @param reason - Classified failover reason.
 * @returns The mapped status code, or `undefined` when the reason has no
 *   mapping.
 */
export function resolveFailoverStatus(reason: FailoverReason): number | undefined {
  return FAILOVER_STATUS_BY_REASON.get(reason);
}
|
2026-01-09 22:15:03 +01:00
|
|
|
|
|
|
|
|
/**
 * Extracts a numeric status from an error-like object.
 *
 * Looks at `status` first and then `statusCode`. Each field is accepted when
 * it is a finite number or a string made only of decimal digits. A field
 * that is present but unusable (for example a textual status such as
 * "UNAVAILABLE", or NaN) does not stop the search: the next field is still
 * consulted.
 *
 * @param err - Any thrown or rejected value.
 * @returns The first usable status, or `undefined` when `err` is not an
 *   object or neither field holds a usable value.
 */
function getStatusCode(err: unknown): number | undefined {
  if (!err || typeof err !== "object") {
    return undefined;
  }
  const { status, statusCode } = err as { status?: unknown; statusCode?: unknown };
  for (const candidate of [status, statusCode]) {
    // Digit-only strings are converted; everything else is judged as-is.
    const parsed =
      typeof candidate === "string" && /^\d+$/.test(candidate) ? Number(candidate) : candidate;
    // Reject NaN/Infinity so callers never compare against a bogus status.
    if (typeof parsed === "number" && Number.isFinite(parsed)) {
      return parsed;
    }
  }
  return undefined;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the `code` property of an error-like object as a trimmed string.
 *
 * @param err - Any thrown or rejected value.
 * @returns The trimmed code, or `undefined` when `err` is not an object,
 *   `code` is not a string, or the code is empty after trimming.
 */
function getErrorCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null) {
    return undefined;
  }
  const raw = (err as { code?: unknown }).code;
  const normalized = typeof raw === "string" ? raw.trim() : "";
  return normalized === "" ? undefined : normalized;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Produces a message string for an arbitrary thrown value.
 *
 * - `Error` instances yield their `message`.
 * - Strings are returned unchanged; numbers, booleans and bigints are
 *   stringified; symbols yield their description (or "").
 * - Other non-null objects yield their `message` property when it is a string.
 * - Everything else yields "".
 *
 * @param err - Any thrown or rejected value.
 * @returns The extracted message, or "" when none is available.
 */
function getErrorMessage(err: unknown): string {
  if (err instanceof Error) {
    return err.message;
  }
  switch (typeof err) {
    case "string":
      return err;
    case "number":
    case "boolean":
    case "bigint":
      return String(err);
    case "symbol":
      return err.description ?? "";
    case "object": {
      // typeof null is "object"; optional chaining turns that into undefined.
      const text = (err as { message?: unknown } | null)?.message;
      return typeof text === "string" ? text : "";
    }
    default:
      return "";
  }
}
|
|
|
|
|
|
2026-01-18 07:52:19 +00:00
|
|
|
/**
 * Checks a single value for direct signs of a timeout: either its error name
 * (as read by `readErrorName`) is "TimeoutError", or its message is accepted
 * by `isTimeoutErrorMessage`.
 *
 * @param err - Any thrown or rejected value; falsy values never match.
 * @returns `true` when the value itself looks like a timeout.
 */
function hasTimeoutHint(err: unknown): boolean {
  if (!err) {
    return false;
  }
  if (readErrorName(err) === "TimeoutError") {
    return true;
  }
  const text = getErrorMessage(err);
  if (!text) {
    return false;
  }
  return Boolean(isTimeoutErrorMessage(text));
}
|
|
|
|
|
|
|
|
|
|
/**
 * Decides whether an error represents a timeout.
 *
 * A value qualifies when it carries a direct timeout hint. Otherwise it must
 * be an object named "AbortError" whose message matches ABORT_TIMEOUT_RE, or
 * whose `cause` or `reason` carries a timeout hint.
 *
 * @param err - Any thrown or rejected value.
 * @returns `true` when the error is considered a timeout.
 */
export function isTimeoutError(err: unknown): boolean {
  if (hasTimeoutHint(err)) {
    return true;
  }

  // Only abort errors get the deeper inspection below.
  if (typeof err !== "object" || err === null || readErrorName(err) !== "AbortError") {
    return false;
  }

  const text = getErrorMessage(err);
  if (text && ABORT_TIMEOUT_RE.test(text)) {
    return true;
  }

  // An abort may wrap the underlying timeout in `cause` or `reason`.
  const nested: unknown[] = [
    "cause" in err ? (err as { cause?: unknown }).cause : undefined,
    "reason" in err ? (err as { reason?: unknown }).reason : undefined,
  ];
  return nested.some((inner) => hasTimeoutHint(inner));
}
|
|
|
|
|
|
2026-01-14 14:31:43 +00:00
|
|
|
// Upper-cased error codes that are classified as "timeout".
const TRANSIENT_NETWORK_ERROR_CODES: ReadonlySet<string> = new Set([
  "ETIMEDOUT",
  "ESOCKETTIMEDOUT",
  "ECONNRESET",
  "ECONNABORTED",
  "ECONNREFUSED",
  "ENETUNREACH",
  "EHOSTUNREACH",
  "ENETRESET",
  "EAI_AGAIN",
]);

/**
 * Classifies an arbitrary error into a failover reason.
 *
 * Checks are applied in this order:
 * 1. A `FailoverError` returns its own reason.
 * 2. The numeric status (from `status` / `statusCode`) is mapped directly.
 * 3. The upper-cased error `code` is compared with the known network codes.
 * 4. `isTimeoutError` is consulted.
 * 5. The error message is handed to `classifyFailoverReason`.
 *
 * @param err - Any thrown or rejected value.
 * @returns The classified reason, or `null` when nothing matched and there is
 *   no message to classify.
 */
export function resolveFailoverReasonFromError(err: unknown): FailoverReason | null {
  if (isFailoverError(err)) {
    return err.reason;
  }

  switch (getStatusCode(err)) {
    case 402:
      return "billing";
    case 429:
    case 529:
      return "rate_limit";
    case 401:
    case 403: {
      // The message decides whether the auth failure is permanent.
      const authMessage = getErrorMessage(err);
      return authMessage && isAuthPermanentErrorMessage(authMessage) ? "auth_permanent" : "auth";
    }
    case 408:
    case 502:
    case 503:
    case 504:
      // 502/503/504 are reported as "timeout", the same as 408.
      return "timeout";
    case 400:
      return "format";
    default:
      break;
  }

  const code = getErrorCode(err)?.toUpperCase();
  if (code !== undefined && TRANSIENT_NETWORK_ERROR_CODES.has(code)) {
    return "timeout";
  }
  if (isTimeoutError(err)) {
    return "timeout";
  }

  const text = getErrorMessage(err);
  return text ? classifyFailoverReason(text) : null;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Summarises an error as a plain object with message, reason, status and code.
 *
 * A `FailoverError` is reported from its own fields. Any other value is
 * inspected: the message falls back to `String(err)` when no message can be
 * extracted, and the reason is `undefined` when classification yields `null`.
 *
 * @param err - Any thrown or rejected value.
 * @returns The description of the error.
 */
export function describeFailoverError(err: unknown): {
  message: string;
  reason?: FailoverReason;
  status?: number;
  code?: string;
} {
  if (isFailoverError(err)) {
    const { message, reason, status, code } = err;
    return { message, reason, status, code };
  }

  const text = getErrorMessage(err) || String(err);
  const reason = resolveFailoverReasonFromError(err) ?? undefined;
  const status = getStatusCode(err);
  const code = getErrorCode(err);
  return { message: text, reason, status, code };
}
|
|
|
|
|
|
|
|
|
|
/**
 * Converts an arbitrary error into a `FailoverError` when it can be
 * classified.
 *
 * An existing `FailoverError` is returned unchanged. Otherwise the error is
 * classified; when no reason is found the result is `null`. The new error's
 * status is the one found on `err`, falling back to the status mapped from
 * the reason. The original value is attached as `cause` only when it is an
 * `Error` instance.
 *
 * @param err - Any thrown or rejected value.
 * @param context - Optional provider/model/profile details copied onto the
 *   resulting error.
 * @returns A `FailoverError`, or `null` when `err` is not failover-eligible.
 */
export function coerceToFailoverError(
  err: unknown,
  context?: {
    provider?: string;
    model?: string;
    profileId?: string;
  },
): FailoverError | null {
  if (isFailoverError(err)) {
    return err;
  }

  const reason = resolveFailoverReasonFromError(err);
  if (!reason) {
    return null;
  }

  const text = getErrorMessage(err) || String(err);
  const status = getStatusCode(err) ?? resolveFailoverStatus(reason);
  const code = getErrorCode(err);
  const details = {
    reason,
    provider: context?.provider,
    model: context?.model,
    profileId: context?.profileId,
    status,
    code,
    cause: err instanceof Error ? err : undefined,
  };

  return new FailoverError(text, details);
}
|