feat(gateway): add channel-backed readiness probes (#38285)

* Changelog: add channel-backed readiness probe entry

* Gateway: add channel-backed readiness probes

* Docs: describe readiness probe behavior

* Gateway: add readiness probe regression tests

* Changelog: dedupe gateway probe entries

* Docs: fix readiness startup grace description

* Changelog: remove stale readiness entry

* Gateway: cover readiness hardening

* Gateway: harden readiness probes
This commit is contained in:
Vincent Koc
2026-03-06 15:15:23 -05:00
committed by GitHub
parent b17baca871
commit ab5fcfcc01
13 changed files with 558 additions and 20 deletions

View File

@@ -207,7 +207,7 @@ Docs: https://docs.openclaw.ai
- WhatsApp media upload caps: make outbound media sends and auto-replies honor `channels.whatsapp.mediaMaxMb` with per-account overrides so inbound and outbound limits use the same channel config. Thanks @vincentkoc.
- Windows/Plugin install: when OpenClaw runs on Windows via Bun and `npm-cli.js` is not colocated with the runtime binary, fall back to `npm.cmd`/`npx.cmd` through the existing `cmd.exe` wrapper so `openclaw plugins install` no longer fails with `spawn EINVAL`. (#38056) Thanks @0xlin2023.
- Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023.
- Gateway/probe route precedence: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, so root-mounted SPA fallbacks no longer swallow machine probe routes while plugin-owned routes on those paths still keep precedence. (#18446) Thanks @vibecodooor and @vincentkoc.
- Gateway/probes: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, preserve plugin-owned route precedence on those paths, and make `/ready` and `/readyz` report channel-backed readiness with startup grace plus `503` on disconnected managed channels, while `/health` and `/healthz` stay shallow liveness probes. (#18446) Thanks @vibecodooor, @mahsumaktas, and @vincentkoc.
## 2026.3.2

View File

@@ -476,6 +476,10 @@ curl -fsS http://127.0.0.1:18789/readyz
Aliases: `/health` and `/ready`.
`/healthz` is a shallow liveness probe for "the gateway process is up".
`/readyz` stays ready during startup grace, then becomes `503` only if required
managed channels are still disconnected after grace or disconnect later.
The Docker image includes a built-in `HEALTHCHECK` that pings `/healthz` in the
background. In plain terms: Docker keeps checking if OpenClaw is still
responsive. If checks keep failing, Docker marks the container as `unhealthy`,

View File

@@ -102,6 +102,7 @@ export type ChannelAccountSnapshot = {
linked?: boolean;
running?: boolean;
connected?: boolean;
restartPending?: boolean;
reconnectAttempts?: number;
lastConnectedAt?: number | null;
lastDisconnect?:

View File

@@ -1,6 +1,8 @@
import type { ChannelId } from "../channels/plugins/types.js";
import { createSubsystemLogger } from "../logging/subsystem.js";
import {
DEFAULT_CHANNEL_CONNECT_GRACE_MS,
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
evaluateChannelHealth,
resolveChannelRestartReason,
type ChannelHealthPolicy,
@@ -21,9 +23,6 @@ const ONE_HOUR_MS = 60 * 60_000;
* This catches the half-dead WebSocket scenario where the connection appears
* alive (health checks pass) but Slack silently stops delivering events.
*/
const DEFAULT_STALE_EVENT_THRESHOLD_MS = 30 * 60_000;
const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000;
export type ChannelHealthTimingPolicy = {
monitorStartupGraceMs: number;
channelConnectGraceMs: number;
@@ -70,7 +69,7 @@ function resolveTimingPolicy(
staleEventThresholdMs:
deps.timing?.staleEventThresholdMs ??
deps.staleEventThresholdMs ??
DEFAULT_STALE_EVENT_THRESHOLD_MS,
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
};
}

View File

@@ -3,6 +3,7 @@ export type ChannelHealthSnapshot = {
connected?: boolean;
enabled?: boolean;
configured?: boolean;
restartPending?: boolean;
busy?: boolean;
activeRuns?: number;
lastRunActivityAt?: number | null;
@@ -39,6 +40,10 @@ function isManagedAccount(snapshot: ChannelHealthSnapshot): boolean {
}
const BUSY_ACTIVITY_STALE_THRESHOLD_MS = 25 * 60_000;
// Keep these shared between the background health monitor and on-demand readiness
// probes so both surfaces evaluate channel lifecycle windows consistently.
export const DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS = 30 * 60_000;
export const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000;
export function evaluateChannelHealth(
snapshot: ChannelHealthSnapshot,

View File

@@ -180,6 +180,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
enabled: false,
configured: true,
running: false,
restartPending: false,
lastError: plugin.config.disabledReason?.(account, cfg) ?? "disabled",
});
return;
@@ -195,6 +196,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
enabled: true,
configured: false,
running: false,
restartPending: false,
lastError: plugin.config.unconfiguredReason?.(account, cfg) ?? "not configured",
});
return;
@@ -215,6 +217,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
enabled: true,
configured: true,
running: true,
restartPending: false,
lastStartAt: Date.now(),
lastError: null,
reconnectAttempts: preserveRestartAttempts ? (restartAttempts.get(rKey) ?? 0) : 0,
@@ -252,6 +255,11 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
const attempt = (restartAttempts.get(rKey) ?? 0) + 1;
restartAttempts.set(rKey, attempt);
if (attempt > MAX_RESTART_ATTEMPTS) {
setRuntime(channelId, id, {
accountId: id,
restartPending: false,
reconnectAttempts: attempt,
});
log.error?.(`[${id}] giving up after ${MAX_RESTART_ATTEMPTS} restart attempts`);
return;
}
@@ -261,6 +269,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
);
setRuntime(channelId, id, {
accountId: id,
restartPending: true,
reconnectAttempts: attempt,
});
try {
@@ -349,6 +358,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
setRuntime(channelId, id, {
accountId: id,
running: false,
restartPending: false,
lastStopAt: Date.now(),
});
}),
@@ -377,6 +387,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
const next: ChannelAccountSnapshot = {
accountId: resolvedId,
running: false,
restartPending: false,
lastError: cleared ? "logged out" : current.lastError,
};
if (typeof current.connected === "boolean") {

View File

@@ -0,0 +1,155 @@
import { describe, expect, it } from "vitest";
import {
AUTH_TOKEN,
AUTH_NONE,
createRequest,
createResponse,
dispatchRequest,
withGatewayServer,
} from "./server-http.test-harness.js";
import type { ReadinessChecker } from "./server/readiness.js";
describe("gateway probe endpoints", () => {
it("returns detailed readiness payload for local /ready requests", async () => {
const getReadiness: ReadinessChecker = () => ({
ready: true,
failing: [],
uptimeMs: 45_000,
});
await withGatewayServer({
prefix: "probe-ready",
resolvedAuth: AUTH_NONE,
overrides: { getReadiness },
run: async (server) => {
const req = createRequest({ path: "/ready" });
const { res, getBody } = createResponse();
await dispatchRequest(server, req, res);
expect(res.statusCode).toBe(200);
expect(JSON.parse(getBody())).toEqual({ ready: true, failing: [], uptimeMs: 45_000 });
},
});
});
it("returns only readiness state for unauthenticated remote /ready requests", async () => {
const getReadiness: ReadinessChecker = () => ({
ready: false,
failing: ["discord", "telegram"],
uptimeMs: 8_000,
});
await withGatewayServer({
prefix: "probe-not-ready",
resolvedAuth: AUTH_NONE,
overrides: { getReadiness },
run: async (server) => {
const req = createRequest({
path: "/ready",
remoteAddress: "10.0.0.8",
host: "gateway.test",
});
const { res, getBody } = createResponse();
await dispatchRequest(server, req, res);
expect(res.statusCode).toBe(503);
expect(JSON.parse(getBody())).toEqual({ ready: false });
},
});
});
it("returns detailed readiness payload for authenticated remote /ready requests", async () => {
const getReadiness: ReadinessChecker = () => ({
ready: false,
failing: ["discord", "telegram"],
uptimeMs: 8_000,
});
await withGatewayServer({
prefix: "probe-remote-authenticated",
resolvedAuth: AUTH_TOKEN,
overrides: { getReadiness },
run: async (server) => {
const req = createRequest({
path: "/ready",
remoteAddress: "10.0.0.8",
host: "gateway.test",
authorization: "Bearer test-token",
});
const { res, getBody } = createResponse();
await dispatchRequest(server, req, res);
expect(res.statusCode).toBe(503);
expect(JSON.parse(getBody())).toEqual({
ready: false,
failing: ["discord", "telegram"],
uptimeMs: 8_000,
});
},
});
});
it("returns typed internal error payload when readiness evaluation throws", async () => {
const getReadiness: ReadinessChecker = () => {
throw new Error("boom");
};
await withGatewayServer({
prefix: "probe-throws",
resolvedAuth: AUTH_NONE,
overrides: { getReadiness },
run: async (server) => {
const req = createRequest({ path: "/ready" });
const { res, getBody } = createResponse();
await dispatchRequest(server, req, res);
expect(res.statusCode).toBe(503);
expect(JSON.parse(getBody())).toEqual({ ready: false, failing: ["internal"], uptimeMs: 0 });
},
});
});
it("keeps /healthz shallow even when readiness checker reports failing channels", async () => {
const getReadiness: ReadinessChecker = () => ({
ready: false,
failing: ["discord"],
uptimeMs: 999,
});
await withGatewayServer({
prefix: "probe-healthz-unaffected",
resolvedAuth: AUTH_NONE,
overrides: { getReadiness },
run: async (server) => {
const req = createRequest({ path: "/healthz" });
const { res, getBody } = createResponse();
await dispatchRequest(server, req, res);
expect(res.statusCode).toBe(200);
expect(getBody()).toBe(JSON.stringify({ ok: true, status: "live" }));
},
});
});
it("reflects readiness status on HEAD /readyz without a response body", async () => {
const getReadiness: ReadinessChecker = () => ({
ready: false,
failing: ["discord"],
uptimeMs: 5_000,
});
await withGatewayServer({
prefix: "probe-readyz-head",
resolvedAuth: AUTH_NONE,
overrides: { getReadiness },
run: async (server) => {
const req = createRequest({ path: "/readyz", method: "HEAD" });
const { res, getBody } = createResponse();
await dispatchRequest(server, req, res);
expect(res.statusCode).toBe(503);
expect(getBody()).toBe("");
},
});
});
});

View File

@@ -28,11 +28,15 @@ export function createRequest(params: {
path: string;
authorization?: string;
method?: string;
remoteAddress?: string;
host?: string;
}): IncomingMessage {
return createGatewayRequest({
path: params.path,
authorization: params.authorization,
method: params.method,
remoteAddress: params.remoteAddress,
host: params.host,
});
}
@@ -127,6 +131,8 @@ export async function sendRequest(
path: string;
authorization?: string;
method?: string;
remoteAddress?: string;
host?: string;
},
): Promise<ReturnType<typeof createResponse>> {
const response = createResponse();

View File

@@ -20,7 +20,12 @@ import {
normalizeRateLimitClientIp,
type AuthRateLimiter,
} from "./auth-rate-limit.js";
import { type GatewayAuthResult, type ResolvedGatewayAuth } from "./auth.js";
import {
authorizeHttpGatewayConnect,
isLocalDirectRequest,
type GatewayAuthResult,
type ResolvedGatewayAuth,
} from "./auth.js";
import { normalizeCanvasScopedUrl } from "./canvas-capability.js";
import {
handleControlUiAvatarRequest,
@@ -46,6 +51,7 @@ import {
resolveHookDeliver,
} from "./hooks.js";
import { sendGatewayAuthFailure, setDefaultSecurityHeaders } from "./http-common.js";
import { getBearerToken } from "./http-utils.js";
import { handleOpenAiHttpRequest } from "./openai-http.js";
import { handleOpenResponsesHttpRequest } from "./openresponses-http.js";
import {
@@ -59,6 +65,7 @@ import {
type PluginHttpRequestHandler,
type PluginRoutePathContext,
} from "./server/plugins-http.js";
import type { ReadinessChecker } from "./server/readiness.js";
import type { GatewayWsClient } from "./server/ws-types.js";
import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js";
@@ -150,11 +157,39 @@ function shouldEnforceDefaultPluginGatewayAuth(pathContext: PluginRoutePathConte
);
}
function handleGatewayProbeRequest(
async function canRevealReadinessDetails(params: {
req: IncomingMessage;
resolvedAuth: ResolvedGatewayAuth;
trustedProxies: string[];
allowRealIpFallback: boolean;
}): Promise<boolean> {
if (isLocalDirectRequest(params.req, params.trustedProxies, params.allowRealIpFallback)) {
return true;
}
if (params.resolvedAuth.mode === "none") {
return false;
}
const bearerToken = getBearerToken(params.req);
const authResult = await authorizeHttpGatewayConnect({
auth: params.resolvedAuth,
connectAuth: bearerToken ? { token: bearerToken, password: bearerToken } : null,
req: params.req,
trustedProxies: params.trustedProxies,
allowRealIpFallback: params.allowRealIpFallback,
});
return authResult.ok;
}
async function handleGatewayProbeRequest(
req: IncomingMessage,
res: ServerResponse,
requestPath: string,
): boolean {
resolvedAuth: ResolvedGatewayAuth,
trustedProxies: string[],
allowRealIpFallback: boolean,
getReadiness?: ReadinessChecker,
): Promise<boolean> {
const status = GATEWAY_PROBE_STATUS_BY_PATH.get(requestPath);
if (!status) {
return false;
@@ -169,14 +204,34 @@ function handleGatewayProbeRequest(
return true;
}
res.statusCode = 200;
res.setHeader("Content-Type", "application/json; charset=utf-8");
res.setHeader("Cache-Control", "no-store");
if (method === "HEAD") {
res.end();
return true;
let statusCode: number;
let body: string;
if (status === "ready" && getReadiness) {
const includeDetails = await canRevealReadinessDetails({
req,
resolvedAuth,
trustedProxies,
allowRealIpFallback,
});
try {
const result = getReadiness();
statusCode = result.ready ? 200 : 503;
body = JSON.stringify(includeDetails ? result : { ready: result.ready });
} catch {
statusCode = 503;
body = JSON.stringify(
includeDetails ? { ready: false, failing: ["internal"], uptimeMs: 0 } : { ready: false },
);
}
} else {
statusCode = 200;
body = JSON.stringify({ ok: true, status });
}
res.end(JSON.stringify({ ok: true, status }));
res.statusCode = statusCode;
res.end(method === "HEAD" ? undefined : body);
return true;
}
@@ -519,6 +574,7 @@ export function createGatewayHttpServer(opts: {
resolvedAuth: ResolvedGatewayAuth;
/** Optional rate limiter for auth brute-force protection. */
rateLimiter?: AuthRateLimiter;
getReadiness?: ReadinessChecker;
tlsOptions?: TlsOptions;
}): HttpServer {
const {
@@ -537,6 +593,7 @@ export function createGatewayHttpServer(opts: {
shouldEnforcePluginGatewayAuth,
resolvedAuth,
rateLimiter,
getReadiness,
} = opts;
const httpServer: HttpServer = opts.tlsOptions
? createHttpsServer(opts.tlsOptions, (req, res) => {
@@ -693,7 +750,16 @@ export function createGatewayHttpServer(opts: {
requestStages.push({
name: "gateway-probes",
run: () => handleGatewayProbeRequest(req, res, requestPath),
run: () =>
handleGatewayProbeRequest(
req,
res,
requestPath,
resolvedAuth,
trustedProxies,
allowRealIpFallback,
getReadiness,
),
});
if (await runGatewayHttpRequestStages(requestStages)) {

View File

@@ -32,6 +32,7 @@ import {
shouldEnforceGatewayAuthForPluginPath,
type PluginRoutePathContext,
} from "./server/plugins-http.js";
import type { ReadinessChecker } from "./server/readiness.js";
import type { GatewayTlsRuntime } from "./server/tls.js";
import type { GatewayWsClient } from "./server/ws-types.js";
@@ -61,6 +62,7 @@ export async function createGatewayRuntimeState(params: {
log: { info: (msg: string) => void; warn: (msg: string) => void };
logHooks: ReturnType<typeof createSubsystemLogger>;
logPlugins: ReturnType<typeof createSubsystemLogger>;
getReadiness?: ReadinessChecker;
}): Promise<{
canvasHost: CanvasHostHandler | null;
httpServer: HttpServer;
@@ -156,6 +158,7 @@ export async function createGatewayRuntimeState(params: {
shouldEnforcePluginGatewayAuth,
resolvedAuth: params.resolvedAuth,
rateLimiter: params.rateLimiter,
getReadiness: params.getReadiness,
tlsOptions: params.gatewayTls?.enabled ? params.gatewayTls.tlsOptions : undefined,
});
try {

View File

@@ -106,6 +106,7 @@ import {
incrementPresenceVersion,
refreshGatewayHealthSnapshot,
} from "./server/health-state.js";
import { createReadinessChecker } from "./server/readiness.js";
import { loadGatewayTlsRuntime } from "./server/tls.js";
import {
ensureGatewayStartupAuth,
@@ -546,6 +547,17 @@ export async function startGatewayServer(
if (cfgAtStart.gateway?.tls?.enabled && !gatewayTls.enabled) {
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
}
const serverStartedAt = Date.now();
const channelManager = createChannelManager({
loadConfig,
channelLogs,
channelRuntimeEnvs,
channelRuntime: createPluginRuntime().channel,
});
const getReadiness = createReadinessChecker({
channelManager,
startedAt: serverStartedAt,
});
const {
canvasHost,
httpServer,
@@ -589,6 +601,7 @@ export async function startGatewayServer(
log,
logHooks,
logPlugins,
getReadiness,
});
let bonjourStop: (() => Promise<void>) | null = null;
const nodeRegistry = new NodeRegistry();
@@ -618,12 +631,6 @@ export async function startGatewayServer(
});
let { cron, storePath: cronStorePath } = cronState;
const channelManager = createChannelManager({
loadConfig,
channelLogs,
channelRuntimeEnvs,
channelRuntime: createPluginRuntime().channel,
});
const { getRuntimeSnapshot, startChannels, startChannel, stopChannel, markChannelLoggedOut } =
channelManager;

View File

@@ -0,0 +1,202 @@
import { describe, expect, it, vi } from "vitest";
import type { ChannelId } from "../../channels/plugins/index.js";
import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js";
import type { ChannelManager, ChannelRuntimeSnapshot } from "../server-channels.js";
import { createReadinessChecker } from "./readiness.js";
function snapshotWith(
accounts: Record<string, Partial<ChannelAccountSnapshot>>,
): ChannelRuntimeSnapshot {
const channels: ChannelRuntimeSnapshot["channels"] = {};
const channelAccounts: ChannelRuntimeSnapshot["channelAccounts"] = {};
for (const [channelId, accountSnapshot] of Object.entries(accounts)) {
const resolved = { accountId: "default", ...accountSnapshot } as ChannelAccountSnapshot;
channels[channelId as ChannelId] = resolved;
channelAccounts[channelId as ChannelId] = { default: resolved };
}
return { channels, channelAccounts };
}
function createManager(snapshot: ChannelRuntimeSnapshot): ChannelManager {
return {
getRuntimeSnapshot: vi.fn(() => snapshot),
startChannels: vi.fn(),
startChannel: vi.fn(),
stopChannel: vi.fn(),
markChannelLoggedOut: vi.fn(),
isManuallyStopped: vi.fn(() => false),
resetRestartAttempts: vi.fn(),
};
}
describe("createReadinessChecker", () => {
it("reports ready when all managed channels are healthy", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 5 * 60_000;
const manager = createManager(
snapshotWith({
discord: {
running: true,
connected: true,
enabled: true,
configured: true,
lastStartAt: startedAt,
lastEventAt: Date.now() - 1_000,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
vi.useRealTimers();
});
it("ignores disabled and unconfigured channels", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 5 * 60_000;
const manager = createManager(
snapshotWith({
discord: {
running: false,
enabled: false,
configured: true,
lastStartAt: startedAt,
},
telegram: {
running: false,
enabled: true,
configured: false,
lastStartAt: startedAt,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
vi.useRealTimers();
});
it("uses startup grace before marking disconnected channels not ready", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 30_000;
const manager = createManager(
snapshotWith({
discord: {
running: true,
connected: false,
enabled: true,
configured: true,
lastStartAt: startedAt,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 30_000 });
vi.useRealTimers();
});
it("reports disconnected managed channels after startup grace", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 5 * 60_000;
const manager = createManager(
snapshotWith({
discord: {
running: true,
connected: false,
enabled: true,
configured: true,
lastStartAt: startedAt,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
expect(readiness()).toEqual({ ready: false, failing: ["discord"], uptimeMs: 300_000 });
vi.useRealTimers();
});
it("keeps restart-pending channels ready during reconnect backoff", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 5 * 60_000;
const manager = createManager(
snapshotWith({
discord: {
running: false,
restartPending: true,
reconnectAttempts: 3,
enabled: true,
configured: true,
lastStartAt: startedAt - 30_000,
lastStopAt: Date.now() - 5_000,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
vi.useRealTimers();
});
it("treats stale-socket channels as ready to avoid pulling healthy idle pods", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 31 * 60_000;
const manager = createManager(
snapshotWith({
discord: {
running: true,
connected: true,
enabled: true,
configured: true,
lastStartAt: startedAt,
lastEventAt: Date.now() - 31 * 60_000,
},
}),
);
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 1_860_000 });
vi.useRealTimers();
});
it("caches readiness snapshots briefly to keep repeated probes cheap", () => {
vi.useFakeTimers();
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
const startedAt = Date.now() - 5 * 60_000;
const manager = createManager(
snapshotWith({
discord: {
running: true,
connected: true,
enabled: true,
configured: true,
lastStartAt: startedAt,
lastEventAt: Date.now() - 1_000,
},
}),
);
const readiness = createReadinessChecker({
channelManager: manager,
startedAt,
cacheTtlMs: 1_000,
});
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
vi.advanceTimersByTime(500);
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_500 });
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(1);
vi.advanceTimersByTime(600);
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 301_100 });
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
vi.useRealTimers();
});
});

View File

@@ -0,0 +1,79 @@
import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js";
import {
DEFAULT_CHANNEL_CONNECT_GRACE_MS,
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
evaluateChannelHealth,
type ChannelHealthPolicy,
type ChannelHealthEvaluation,
} from "../channel-health-policy.js";
import type { ChannelManager } from "../server-channels.js";
export type ReadinessResult = {
ready: boolean;
failing: string[];
uptimeMs: number;
};
export type ReadinessChecker = () => ReadinessResult;
const DEFAULT_READINESS_CACHE_TTL_MS = 1_000;
function shouldIgnoreReadinessFailure(
accountSnapshot: ChannelAccountSnapshot,
health: ChannelHealthEvaluation,
): boolean {
if (health.reason === "unmanaged" || health.reason === "stale-socket") {
return true;
}
// Channel restarts spend time in backoff with running=false before the next
// lifecycle re-enters startup grace. Keep readiness green during that handoff
// window, but still surface hard failures once restart attempts are exhausted.
return health.reason === "not-running" && accountSnapshot.restartPending === true;
}
export function createReadinessChecker(deps: {
channelManager: ChannelManager;
startedAt: number;
cacheTtlMs?: number;
}): ReadinessChecker {
const { channelManager, startedAt } = deps;
const cacheTtlMs = Math.max(0, deps.cacheTtlMs ?? DEFAULT_READINESS_CACHE_TTL_MS);
let cachedAt = 0;
let cachedState: Omit<ReadinessResult, "uptimeMs"> | null = null;
return (): ReadinessResult => {
const now = Date.now();
const uptimeMs = now - startedAt;
if (cachedState && now - cachedAt < cacheTtlMs) {
return { ...cachedState, uptimeMs };
}
const snapshot = channelManager.getRuntimeSnapshot();
const failing: string[] = [];
const policy: ChannelHealthPolicy = {
now,
staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS,
};
for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) {
if (!accounts) {
continue;
}
for (const accountSnapshot of Object.values(accounts)) {
if (!accountSnapshot) {
continue;
}
const health = evaluateChannelHealth(accountSnapshot, policy);
if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) {
failing.push(channelId);
break;
}
}
}
cachedAt = now;
cachedState = { ready: failing.length === 0, failing };
return { ...cachedState, uptimeMs };
};
}