feat(gateway): add channel-backed readiness probes (#38285)
* Changelog: add channel-backed readiness probe entry * Gateway: add channel-backed readiness probes * Docs: describe readiness probe behavior * Gateway: add readiness probe regression tests * Changelog: dedupe gateway probe entries * Docs: fix readiness startup grace description * Changelog: remove stale readiness entry * Gateway: cover readiness hardening * Gateway: harden readiness probes
This commit is contained in:
@@ -207,7 +207,7 @@ Docs: https://docs.openclaw.ai
|
||||
- WhatsApp media upload caps: make outbound media sends and auto-replies honor `channels.whatsapp.mediaMaxMb` with per-account overrides so inbound and outbound limits use the same channel config. Thanks @vincentkoc.
|
||||
- Windows/Plugin install: when OpenClaw runs on Windows via Bun and `npm-cli.js` is not colocated with the runtime binary, fall back to `npm.cmd`/`npx.cmd` through the existing `cmd.exe` wrapper so `openclaw plugins install` no longer fails with `spawn EINVAL`. (#38056) Thanks @0xlin2023.
|
||||
- Telegram/send retry classification: retry grammY `Network request ... failed after N attempts` envelopes in send flows without reclassifying plain `Network request ... failed!` wrappers as transient, restoring the intended retry path while keeping broad send-context message matching tight. (#38056) Thanks @0xlin2023.
|
||||
- Gateway/probe route precedence: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, so root-mounted SPA fallbacks no longer swallow machine probe routes while plugin-owned routes on those paths still keep precedence. (#18446) Thanks @vibecodooor and @vincentkoc.
|
||||
- Gateway/probes: keep `/health`, `/healthz`, `/ready`, and `/readyz` reachable when the Control UI is mounted at `/`, preserve plugin-owned route precedence on those paths, and make `/ready` and `/readyz` report channel-backed readiness with startup grace plus `503` on disconnected managed channels, while `/health` and `/healthz` stay shallow liveness probes. (#18446) Thanks @vibecodooor, @mahsumaktas, and @vincentkoc.
|
||||
|
||||
## 2026.3.2
|
||||
|
||||
|
||||
@@ -476,6 +476,10 @@ curl -fsS http://127.0.0.1:18789/readyz
|
||||
|
||||
Aliases: `/health` and `/ready`.
|
||||
|
||||
`/healthz` is a shallow liveness probe for "the gateway process is up".
|
||||
`/readyz` stays ready during startup grace, then becomes `503` only if required
|
||||
managed channels are still disconnected after grace or disconnect later.
|
||||
|
||||
The Docker image includes a built-in `HEALTHCHECK` that pings `/healthz` in the
|
||||
background. In plain terms: Docker keeps checking if OpenClaw is still
|
||||
responsive. If checks keep failing, Docker marks the container as `unhealthy`,
|
||||
|
||||
@@ -102,6 +102,7 @@ export type ChannelAccountSnapshot = {
|
||||
linked?: boolean;
|
||||
running?: boolean;
|
||||
connected?: boolean;
|
||||
restartPending?: boolean;
|
||||
reconnectAttempts?: number;
|
||||
lastConnectedAt?: number | null;
|
||||
lastDisconnect?:
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import type { ChannelId } from "../channels/plugins/types.js";
|
||||
import { createSubsystemLogger } from "../logging/subsystem.js";
|
||||
import {
|
||||
DEFAULT_CHANNEL_CONNECT_GRACE_MS,
|
||||
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
|
||||
evaluateChannelHealth,
|
||||
resolveChannelRestartReason,
|
||||
type ChannelHealthPolicy,
|
||||
@@ -21,9 +23,6 @@ const ONE_HOUR_MS = 60 * 60_000;
|
||||
* This catches the half-dead WebSocket scenario where the connection appears
|
||||
* alive (health checks pass) but Slack silently stops delivering events.
|
||||
*/
|
||||
const DEFAULT_STALE_EVENT_THRESHOLD_MS = 30 * 60_000;
|
||||
const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000;
|
||||
|
||||
export type ChannelHealthTimingPolicy = {
|
||||
monitorStartupGraceMs: number;
|
||||
channelConnectGraceMs: number;
|
||||
@@ -70,7 +69,7 @@ function resolveTimingPolicy(
|
||||
staleEventThresholdMs:
|
||||
deps.timing?.staleEventThresholdMs ??
|
||||
deps.staleEventThresholdMs ??
|
||||
DEFAULT_STALE_EVENT_THRESHOLD_MS,
|
||||
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -3,6 +3,7 @@ export type ChannelHealthSnapshot = {
|
||||
connected?: boolean;
|
||||
enabled?: boolean;
|
||||
configured?: boolean;
|
||||
restartPending?: boolean;
|
||||
busy?: boolean;
|
||||
activeRuns?: number;
|
||||
lastRunActivityAt?: number | null;
|
||||
@@ -39,6 +40,10 @@ function isManagedAccount(snapshot: ChannelHealthSnapshot): boolean {
|
||||
}
|
||||
|
||||
const BUSY_ACTIVITY_STALE_THRESHOLD_MS = 25 * 60_000;
|
||||
// Keep these shared between the background health monitor and on-demand readiness
|
||||
// probes so both surfaces evaluate channel lifecycle windows consistently.
|
||||
export const DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS = 30 * 60_000;
|
||||
export const DEFAULT_CHANNEL_CONNECT_GRACE_MS = 120_000;
|
||||
|
||||
export function evaluateChannelHealth(
|
||||
snapshot: ChannelHealthSnapshot,
|
||||
|
||||
@@ -180,6 +180,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
enabled: false,
|
||||
configured: true,
|
||||
running: false,
|
||||
restartPending: false,
|
||||
lastError: plugin.config.disabledReason?.(account, cfg) ?? "disabled",
|
||||
});
|
||||
return;
|
||||
@@ -195,6 +196,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
enabled: true,
|
||||
configured: false,
|
||||
running: false,
|
||||
restartPending: false,
|
||||
lastError: plugin.config.unconfiguredReason?.(account, cfg) ?? "not configured",
|
||||
});
|
||||
return;
|
||||
@@ -215,6 +217,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
enabled: true,
|
||||
configured: true,
|
||||
running: true,
|
||||
restartPending: false,
|
||||
lastStartAt: Date.now(),
|
||||
lastError: null,
|
||||
reconnectAttempts: preserveRestartAttempts ? (restartAttempts.get(rKey) ?? 0) : 0,
|
||||
@@ -252,6 +255,11 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
const attempt = (restartAttempts.get(rKey) ?? 0) + 1;
|
||||
restartAttempts.set(rKey, attempt);
|
||||
if (attempt > MAX_RESTART_ATTEMPTS) {
|
||||
setRuntime(channelId, id, {
|
||||
accountId: id,
|
||||
restartPending: false,
|
||||
reconnectAttempts: attempt,
|
||||
});
|
||||
log.error?.(`[${id}] giving up after ${MAX_RESTART_ATTEMPTS} restart attempts`);
|
||||
return;
|
||||
}
|
||||
@@ -261,6 +269,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
);
|
||||
setRuntime(channelId, id, {
|
||||
accountId: id,
|
||||
restartPending: true,
|
||||
reconnectAttempts: attempt,
|
||||
});
|
||||
try {
|
||||
@@ -349,6 +358,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
setRuntime(channelId, id, {
|
||||
accountId: id,
|
||||
running: false,
|
||||
restartPending: false,
|
||||
lastStopAt: Date.now(),
|
||||
});
|
||||
}),
|
||||
@@ -377,6 +387,7 @@ export function createChannelManager(opts: ChannelManagerOptions): ChannelManage
|
||||
const next: ChannelAccountSnapshot = {
|
||||
accountId: resolvedId,
|
||||
running: false,
|
||||
restartPending: false,
|
||||
lastError: cleared ? "logged out" : current.lastError,
|
||||
};
|
||||
if (typeof current.connected === "boolean") {
|
||||
|
||||
155
src/gateway/server-http.probe.test.ts
Normal file
155
src/gateway/server-http.probe.test.ts
Normal file
@@ -0,0 +1,155 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import {
|
||||
AUTH_TOKEN,
|
||||
AUTH_NONE,
|
||||
createRequest,
|
||||
createResponse,
|
||||
dispatchRequest,
|
||||
withGatewayServer,
|
||||
} from "./server-http.test-harness.js";
|
||||
import type { ReadinessChecker } from "./server/readiness.js";
|
||||
|
||||
describe("gateway probe endpoints", () => {
|
||||
it("returns detailed readiness payload for local /ready requests", async () => {
|
||||
const getReadiness: ReadinessChecker = () => ({
|
||||
ready: true,
|
||||
failing: [],
|
||||
uptimeMs: 45_000,
|
||||
});
|
||||
|
||||
await withGatewayServer({
|
||||
prefix: "probe-ready",
|
||||
resolvedAuth: AUTH_NONE,
|
||||
overrides: { getReadiness },
|
||||
run: async (server) => {
|
||||
const req = createRequest({ path: "/ready" });
|
||||
const { res, getBody } = createResponse();
|
||||
await dispatchRequest(server, req, res);
|
||||
|
||||
expect(res.statusCode).toBe(200);
|
||||
expect(JSON.parse(getBody())).toEqual({ ready: true, failing: [], uptimeMs: 45_000 });
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("returns only readiness state for unauthenticated remote /ready requests", async () => {
|
||||
const getReadiness: ReadinessChecker = () => ({
|
||||
ready: false,
|
||||
failing: ["discord", "telegram"],
|
||||
uptimeMs: 8_000,
|
||||
});
|
||||
|
||||
await withGatewayServer({
|
||||
prefix: "probe-not-ready",
|
||||
resolvedAuth: AUTH_NONE,
|
||||
overrides: { getReadiness },
|
||||
run: async (server) => {
|
||||
const req = createRequest({
|
||||
path: "/ready",
|
||||
remoteAddress: "10.0.0.8",
|
||||
host: "gateway.test",
|
||||
});
|
||||
const { res, getBody } = createResponse();
|
||||
await dispatchRequest(server, req, res);
|
||||
|
||||
expect(res.statusCode).toBe(503);
|
||||
expect(JSON.parse(getBody())).toEqual({ ready: false });
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("returns detailed readiness payload for authenticated remote /ready requests", async () => {
|
||||
const getReadiness: ReadinessChecker = () => ({
|
||||
ready: false,
|
||||
failing: ["discord", "telegram"],
|
||||
uptimeMs: 8_000,
|
||||
});
|
||||
|
||||
await withGatewayServer({
|
||||
prefix: "probe-remote-authenticated",
|
||||
resolvedAuth: AUTH_TOKEN,
|
||||
overrides: { getReadiness },
|
||||
run: async (server) => {
|
||||
const req = createRequest({
|
||||
path: "/ready",
|
||||
remoteAddress: "10.0.0.8",
|
||||
host: "gateway.test",
|
||||
authorization: "Bearer test-token",
|
||||
});
|
||||
const { res, getBody } = createResponse();
|
||||
await dispatchRequest(server, req, res);
|
||||
|
||||
expect(res.statusCode).toBe(503);
|
||||
expect(JSON.parse(getBody())).toEqual({
|
||||
ready: false,
|
||||
failing: ["discord", "telegram"],
|
||||
uptimeMs: 8_000,
|
||||
});
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("returns typed internal error payload when readiness evaluation throws", async () => {
|
||||
const getReadiness: ReadinessChecker = () => {
|
||||
throw new Error("boom");
|
||||
};
|
||||
|
||||
await withGatewayServer({
|
||||
prefix: "probe-throws",
|
||||
resolvedAuth: AUTH_NONE,
|
||||
overrides: { getReadiness },
|
||||
run: async (server) => {
|
||||
const req = createRequest({ path: "/ready" });
|
||||
const { res, getBody } = createResponse();
|
||||
await dispatchRequest(server, req, res);
|
||||
|
||||
expect(res.statusCode).toBe(503);
|
||||
expect(JSON.parse(getBody())).toEqual({ ready: false, failing: ["internal"], uptimeMs: 0 });
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("keeps /healthz shallow even when readiness checker reports failing channels", async () => {
|
||||
const getReadiness: ReadinessChecker = () => ({
|
||||
ready: false,
|
||||
failing: ["discord"],
|
||||
uptimeMs: 999,
|
||||
});
|
||||
|
||||
await withGatewayServer({
|
||||
prefix: "probe-healthz-unaffected",
|
||||
resolvedAuth: AUTH_NONE,
|
||||
overrides: { getReadiness },
|
||||
run: async (server) => {
|
||||
const req = createRequest({ path: "/healthz" });
|
||||
const { res, getBody } = createResponse();
|
||||
await dispatchRequest(server, req, res);
|
||||
|
||||
expect(res.statusCode).toBe(200);
|
||||
expect(getBody()).toBe(JSON.stringify({ ok: true, status: "live" }));
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
it("reflects readiness status on HEAD /readyz without a response body", async () => {
|
||||
const getReadiness: ReadinessChecker = () => ({
|
||||
ready: false,
|
||||
failing: ["discord"],
|
||||
uptimeMs: 5_000,
|
||||
});
|
||||
|
||||
await withGatewayServer({
|
||||
prefix: "probe-readyz-head",
|
||||
resolvedAuth: AUTH_NONE,
|
||||
overrides: { getReadiness },
|
||||
run: async (server) => {
|
||||
const req = createRequest({ path: "/readyz", method: "HEAD" });
|
||||
const { res, getBody } = createResponse();
|
||||
await dispatchRequest(server, req, res);
|
||||
|
||||
expect(res.statusCode).toBe(503);
|
||||
expect(getBody()).toBe("");
|
||||
},
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -28,11 +28,15 @@ export function createRequest(params: {
|
||||
path: string;
|
||||
authorization?: string;
|
||||
method?: string;
|
||||
remoteAddress?: string;
|
||||
host?: string;
|
||||
}): IncomingMessage {
|
||||
return createGatewayRequest({
|
||||
path: params.path,
|
||||
authorization: params.authorization,
|
||||
method: params.method,
|
||||
remoteAddress: params.remoteAddress,
|
||||
host: params.host,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -127,6 +131,8 @@ export async function sendRequest(
|
||||
path: string;
|
||||
authorization?: string;
|
||||
method?: string;
|
||||
remoteAddress?: string;
|
||||
host?: string;
|
||||
},
|
||||
): Promise<ReturnType<typeof createResponse>> {
|
||||
const response = createResponse();
|
||||
|
||||
@@ -20,7 +20,12 @@ import {
|
||||
normalizeRateLimitClientIp,
|
||||
type AuthRateLimiter,
|
||||
} from "./auth-rate-limit.js";
|
||||
import { type GatewayAuthResult, type ResolvedGatewayAuth } from "./auth.js";
|
||||
import {
|
||||
authorizeHttpGatewayConnect,
|
||||
isLocalDirectRequest,
|
||||
type GatewayAuthResult,
|
||||
type ResolvedGatewayAuth,
|
||||
} from "./auth.js";
|
||||
import { normalizeCanvasScopedUrl } from "./canvas-capability.js";
|
||||
import {
|
||||
handleControlUiAvatarRequest,
|
||||
@@ -46,6 +51,7 @@ import {
|
||||
resolveHookDeliver,
|
||||
} from "./hooks.js";
|
||||
import { sendGatewayAuthFailure, setDefaultSecurityHeaders } from "./http-common.js";
|
||||
import { getBearerToken } from "./http-utils.js";
|
||||
import { handleOpenAiHttpRequest } from "./openai-http.js";
|
||||
import { handleOpenResponsesHttpRequest } from "./openresponses-http.js";
|
||||
import {
|
||||
@@ -59,6 +65,7 @@ import {
|
||||
type PluginHttpRequestHandler,
|
||||
type PluginRoutePathContext,
|
||||
} from "./server/plugins-http.js";
|
||||
import type { ReadinessChecker } from "./server/readiness.js";
|
||||
import type { GatewayWsClient } from "./server/ws-types.js";
|
||||
import { handleToolsInvokeHttpRequest } from "./tools-invoke-http.js";
|
||||
|
||||
@@ -150,11 +157,39 @@ function shouldEnforceDefaultPluginGatewayAuth(pathContext: PluginRoutePathConte
|
||||
);
|
||||
}
|
||||
|
||||
function handleGatewayProbeRequest(
|
||||
async function canRevealReadinessDetails(params: {
|
||||
req: IncomingMessage;
|
||||
resolvedAuth: ResolvedGatewayAuth;
|
||||
trustedProxies: string[];
|
||||
allowRealIpFallback: boolean;
|
||||
}): Promise<boolean> {
|
||||
if (isLocalDirectRequest(params.req, params.trustedProxies, params.allowRealIpFallback)) {
|
||||
return true;
|
||||
}
|
||||
if (params.resolvedAuth.mode === "none") {
|
||||
return false;
|
||||
}
|
||||
|
||||
const bearerToken = getBearerToken(params.req);
|
||||
const authResult = await authorizeHttpGatewayConnect({
|
||||
auth: params.resolvedAuth,
|
||||
connectAuth: bearerToken ? { token: bearerToken, password: bearerToken } : null,
|
||||
req: params.req,
|
||||
trustedProxies: params.trustedProxies,
|
||||
allowRealIpFallback: params.allowRealIpFallback,
|
||||
});
|
||||
return authResult.ok;
|
||||
}
|
||||
|
||||
async function handleGatewayProbeRequest(
|
||||
req: IncomingMessage,
|
||||
res: ServerResponse,
|
||||
requestPath: string,
|
||||
): boolean {
|
||||
resolvedAuth: ResolvedGatewayAuth,
|
||||
trustedProxies: string[],
|
||||
allowRealIpFallback: boolean,
|
||||
getReadiness?: ReadinessChecker,
|
||||
): Promise<boolean> {
|
||||
const status = GATEWAY_PROBE_STATUS_BY_PATH.get(requestPath);
|
||||
if (!status) {
|
||||
return false;
|
||||
@@ -169,14 +204,34 @@ function handleGatewayProbeRequest(
|
||||
return true;
|
||||
}
|
||||
|
||||
res.statusCode = 200;
|
||||
res.setHeader("Content-Type", "application/json; charset=utf-8");
|
||||
res.setHeader("Cache-Control", "no-store");
|
||||
if (method === "HEAD") {
|
||||
res.end();
|
||||
return true;
|
||||
|
||||
let statusCode: number;
|
||||
let body: string;
|
||||
if (status === "ready" && getReadiness) {
|
||||
const includeDetails = await canRevealReadinessDetails({
|
||||
req,
|
||||
resolvedAuth,
|
||||
trustedProxies,
|
||||
allowRealIpFallback,
|
||||
});
|
||||
try {
|
||||
const result = getReadiness();
|
||||
statusCode = result.ready ? 200 : 503;
|
||||
body = JSON.stringify(includeDetails ? result : { ready: result.ready });
|
||||
} catch {
|
||||
statusCode = 503;
|
||||
body = JSON.stringify(
|
||||
includeDetails ? { ready: false, failing: ["internal"], uptimeMs: 0 } : { ready: false },
|
||||
);
|
||||
}
|
||||
} else {
|
||||
statusCode = 200;
|
||||
body = JSON.stringify({ ok: true, status });
|
||||
}
|
||||
res.end(JSON.stringify({ ok: true, status }));
|
||||
res.statusCode = statusCode;
|
||||
res.end(method === "HEAD" ? undefined : body);
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -519,6 +574,7 @@ export function createGatewayHttpServer(opts: {
|
||||
resolvedAuth: ResolvedGatewayAuth;
|
||||
/** Optional rate limiter for auth brute-force protection. */
|
||||
rateLimiter?: AuthRateLimiter;
|
||||
getReadiness?: ReadinessChecker;
|
||||
tlsOptions?: TlsOptions;
|
||||
}): HttpServer {
|
||||
const {
|
||||
@@ -537,6 +593,7 @@ export function createGatewayHttpServer(opts: {
|
||||
shouldEnforcePluginGatewayAuth,
|
||||
resolvedAuth,
|
||||
rateLimiter,
|
||||
getReadiness,
|
||||
} = opts;
|
||||
const httpServer: HttpServer = opts.tlsOptions
|
||||
? createHttpsServer(opts.tlsOptions, (req, res) => {
|
||||
@@ -693,7 +750,16 @@ export function createGatewayHttpServer(opts: {
|
||||
|
||||
requestStages.push({
|
||||
name: "gateway-probes",
|
||||
run: () => handleGatewayProbeRequest(req, res, requestPath),
|
||||
run: () =>
|
||||
handleGatewayProbeRequest(
|
||||
req,
|
||||
res,
|
||||
requestPath,
|
||||
resolvedAuth,
|
||||
trustedProxies,
|
||||
allowRealIpFallback,
|
||||
getReadiness,
|
||||
),
|
||||
});
|
||||
|
||||
if (await runGatewayHttpRequestStages(requestStages)) {
|
||||
|
||||
@@ -32,6 +32,7 @@ import {
|
||||
shouldEnforceGatewayAuthForPluginPath,
|
||||
type PluginRoutePathContext,
|
||||
} from "./server/plugins-http.js";
|
||||
import type { ReadinessChecker } from "./server/readiness.js";
|
||||
import type { GatewayTlsRuntime } from "./server/tls.js";
|
||||
import type { GatewayWsClient } from "./server/ws-types.js";
|
||||
|
||||
@@ -61,6 +62,7 @@ export async function createGatewayRuntimeState(params: {
|
||||
log: { info: (msg: string) => void; warn: (msg: string) => void };
|
||||
logHooks: ReturnType<typeof createSubsystemLogger>;
|
||||
logPlugins: ReturnType<typeof createSubsystemLogger>;
|
||||
getReadiness?: ReadinessChecker;
|
||||
}): Promise<{
|
||||
canvasHost: CanvasHostHandler | null;
|
||||
httpServer: HttpServer;
|
||||
@@ -156,6 +158,7 @@ export async function createGatewayRuntimeState(params: {
|
||||
shouldEnforcePluginGatewayAuth,
|
||||
resolvedAuth: params.resolvedAuth,
|
||||
rateLimiter: params.rateLimiter,
|
||||
getReadiness: params.getReadiness,
|
||||
tlsOptions: params.gatewayTls?.enabled ? params.gatewayTls.tlsOptions : undefined,
|
||||
});
|
||||
try {
|
||||
|
||||
@@ -106,6 +106,7 @@ import {
|
||||
incrementPresenceVersion,
|
||||
refreshGatewayHealthSnapshot,
|
||||
} from "./server/health-state.js";
|
||||
import { createReadinessChecker } from "./server/readiness.js";
|
||||
import { loadGatewayTlsRuntime } from "./server/tls.js";
|
||||
import {
|
||||
ensureGatewayStartupAuth,
|
||||
@@ -546,6 +547,17 @@ export async function startGatewayServer(
|
||||
if (cfgAtStart.gateway?.tls?.enabled && !gatewayTls.enabled) {
|
||||
throw new Error(gatewayTls.error ?? "gateway tls: failed to enable");
|
||||
}
|
||||
const serverStartedAt = Date.now();
|
||||
const channelManager = createChannelManager({
|
||||
loadConfig,
|
||||
channelLogs,
|
||||
channelRuntimeEnvs,
|
||||
channelRuntime: createPluginRuntime().channel,
|
||||
});
|
||||
const getReadiness = createReadinessChecker({
|
||||
channelManager,
|
||||
startedAt: serverStartedAt,
|
||||
});
|
||||
const {
|
||||
canvasHost,
|
||||
httpServer,
|
||||
@@ -589,6 +601,7 @@ export async function startGatewayServer(
|
||||
log,
|
||||
logHooks,
|
||||
logPlugins,
|
||||
getReadiness,
|
||||
});
|
||||
let bonjourStop: (() => Promise<void>) | null = null;
|
||||
const nodeRegistry = new NodeRegistry();
|
||||
@@ -618,12 +631,6 @@ export async function startGatewayServer(
|
||||
});
|
||||
let { cron, storePath: cronStorePath } = cronState;
|
||||
|
||||
const channelManager = createChannelManager({
|
||||
loadConfig,
|
||||
channelLogs,
|
||||
channelRuntimeEnvs,
|
||||
channelRuntime: createPluginRuntime().channel,
|
||||
});
|
||||
const { getRuntimeSnapshot, startChannels, startChannel, stopChannel, markChannelLoggedOut } =
|
||||
channelManager;
|
||||
|
||||
|
||||
202
src/gateway/server/readiness.test.ts
Normal file
202
src/gateway/server/readiness.test.ts
Normal file
@@ -0,0 +1,202 @@
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import type { ChannelId } from "../../channels/plugins/index.js";
|
||||
import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js";
|
||||
import type { ChannelManager, ChannelRuntimeSnapshot } from "../server-channels.js";
|
||||
import { createReadinessChecker } from "./readiness.js";
|
||||
|
||||
function snapshotWith(
|
||||
accounts: Record<string, Partial<ChannelAccountSnapshot>>,
|
||||
): ChannelRuntimeSnapshot {
|
||||
const channels: ChannelRuntimeSnapshot["channels"] = {};
|
||||
const channelAccounts: ChannelRuntimeSnapshot["channelAccounts"] = {};
|
||||
|
||||
for (const [channelId, accountSnapshot] of Object.entries(accounts)) {
|
||||
const resolved = { accountId: "default", ...accountSnapshot } as ChannelAccountSnapshot;
|
||||
channels[channelId as ChannelId] = resolved;
|
||||
channelAccounts[channelId as ChannelId] = { default: resolved };
|
||||
}
|
||||
|
||||
return { channels, channelAccounts };
|
||||
}
|
||||
|
||||
function createManager(snapshot: ChannelRuntimeSnapshot): ChannelManager {
|
||||
return {
|
||||
getRuntimeSnapshot: vi.fn(() => snapshot),
|
||||
startChannels: vi.fn(),
|
||||
startChannel: vi.fn(),
|
||||
stopChannel: vi.fn(),
|
||||
markChannelLoggedOut: vi.fn(),
|
||||
isManuallyStopped: vi.fn(() => false),
|
||||
resetRestartAttempts: vi.fn(),
|
||||
};
|
||||
}
|
||||
|
||||
describe("createReadinessChecker", () => {
|
||||
it("reports ready when all managed channels are healthy", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 5 * 60_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: true,
|
||||
connected: true,
|
||||
enabled: true,
|
||||
configured: true,
|
||||
lastStartAt: startedAt,
|
||||
lastEventAt: Date.now() - 1_000,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("ignores disabled and unconfigured channels", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 5 * 60_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: false,
|
||||
enabled: false,
|
||||
configured: true,
|
||||
lastStartAt: startedAt,
|
||||
},
|
||||
telegram: {
|
||||
running: false,
|
||||
enabled: true,
|
||||
configured: false,
|
||||
lastStartAt: startedAt,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("uses startup grace before marking disconnected channels not ready", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 30_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: true,
|
||||
connected: false,
|
||||
enabled: true,
|
||||
configured: true,
|
||||
lastStartAt: startedAt,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 30_000 });
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("reports disconnected managed channels after startup grace", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 5 * 60_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: true,
|
||||
connected: false,
|
||||
enabled: true,
|
||||
configured: true,
|
||||
lastStartAt: startedAt,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
|
||||
expect(readiness()).toEqual({ ready: false, failing: ["discord"], uptimeMs: 300_000 });
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("keeps restart-pending channels ready during reconnect backoff", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 5 * 60_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: false,
|
||||
restartPending: true,
|
||||
reconnectAttempts: 3,
|
||||
enabled: true,
|
||||
configured: true,
|
||||
lastStartAt: startedAt - 30_000,
|
||||
lastStopAt: Date.now() - 5_000,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("treats stale-socket channels as ready to avoid pulling healthy idle pods", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 31 * 60_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: true,
|
||||
connected: true,
|
||||
enabled: true,
|
||||
configured: true,
|
||||
lastStartAt: startedAt,
|
||||
lastEventAt: Date.now() - 31 * 60_000,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({ channelManager: manager, startedAt });
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 1_860_000 });
|
||||
vi.useRealTimers();
|
||||
});
|
||||
|
||||
it("caches readiness snapshots briefly to keep repeated probes cheap", () => {
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date("2026-03-06T12:00:00Z"));
|
||||
const startedAt = Date.now() - 5 * 60_000;
|
||||
const manager = createManager(
|
||||
snapshotWith({
|
||||
discord: {
|
||||
running: true,
|
||||
connected: true,
|
||||
enabled: true,
|
||||
configured: true,
|
||||
lastStartAt: startedAt,
|
||||
lastEventAt: Date.now() - 1_000,
|
||||
},
|
||||
}),
|
||||
);
|
||||
|
||||
const readiness = createReadinessChecker({
|
||||
channelManager: manager,
|
||||
startedAt,
|
||||
cacheTtlMs: 1_000,
|
||||
});
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_000 });
|
||||
vi.advanceTimersByTime(500);
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 300_500 });
|
||||
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(1);
|
||||
|
||||
vi.advanceTimersByTime(600);
|
||||
expect(readiness()).toEqual({ ready: true, failing: [], uptimeMs: 301_100 });
|
||||
expect(manager.getRuntimeSnapshot).toHaveBeenCalledTimes(2);
|
||||
vi.useRealTimers();
|
||||
});
|
||||
});
|
||||
79
src/gateway/server/readiness.ts
Normal file
79
src/gateway/server/readiness.ts
Normal file
@@ -0,0 +1,79 @@
|
||||
import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js";
|
||||
import {
|
||||
DEFAULT_CHANNEL_CONNECT_GRACE_MS,
|
||||
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
|
||||
evaluateChannelHealth,
|
||||
type ChannelHealthPolicy,
|
||||
type ChannelHealthEvaluation,
|
||||
} from "../channel-health-policy.js";
|
||||
import type { ChannelManager } from "../server-channels.js";
|
||||
|
||||
export type ReadinessResult = {
|
||||
ready: boolean;
|
||||
failing: string[];
|
||||
uptimeMs: number;
|
||||
};
|
||||
|
||||
export type ReadinessChecker = () => ReadinessResult;
|
||||
|
||||
const DEFAULT_READINESS_CACHE_TTL_MS = 1_000;
|
||||
|
||||
function shouldIgnoreReadinessFailure(
|
||||
accountSnapshot: ChannelAccountSnapshot,
|
||||
health: ChannelHealthEvaluation,
|
||||
): boolean {
|
||||
if (health.reason === "unmanaged" || health.reason === "stale-socket") {
|
||||
return true;
|
||||
}
|
||||
// Channel restarts spend time in backoff with running=false before the next
|
||||
// lifecycle re-enters startup grace. Keep readiness green during that handoff
|
||||
// window, but still surface hard failures once restart attempts are exhausted.
|
||||
return health.reason === "not-running" && accountSnapshot.restartPending === true;
|
||||
}
|
||||
|
||||
export function createReadinessChecker(deps: {
|
||||
channelManager: ChannelManager;
|
||||
startedAt: number;
|
||||
cacheTtlMs?: number;
|
||||
}): ReadinessChecker {
|
||||
const { channelManager, startedAt } = deps;
|
||||
const cacheTtlMs = Math.max(0, deps.cacheTtlMs ?? DEFAULT_READINESS_CACHE_TTL_MS);
|
||||
let cachedAt = 0;
|
||||
let cachedState: Omit<ReadinessResult, "uptimeMs"> | null = null;
|
||||
|
||||
return (): ReadinessResult => {
|
||||
const now = Date.now();
|
||||
const uptimeMs = now - startedAt;
|
||||
if (cachedState && now - cachedAt < cacheTtlMs) {
|
||||
return { ...cachedState, uptimeMs };
|
||||
}
|
||||
|
||||
const snapshot = channelManager.getRuntimeSnapshot();
|
||||
const failing: string[] = [];
|
||||
const policy: ChannelHealthPolicy = {
|
||||
now,
|
||||
staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
|
||||
channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS,
|
||||
};
|
||||
|
||||
for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) {
|
||||
if (!accounts) {
|
||||
continue;
|
||||
}
|
||||
for (const accountSnapshot of Object.values(accounts)) {
|
||||
if (!accountSnapshot) {
|
||||
continue;
|
||||
}
|
||||
const health = evaluateChannelHealth(accountSnapshot, policy);
|
||||
if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) {
|
||||
failing.push(channelId);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cachedAt = now;
|
||||
cachedState = { ready: failing.length === 0, failing };
|
||||
return { ...cachedState, uptimeMs };
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user