* Changelog: add channel-backed readiness probe entry * Gateway: add channel-backed readiness probes * Docs: describe readiness probe behavior * Gateway: add readiness probe regression tests * Changelog: dedupe gateway probe entries * Docs: fix readiness startup grace description * Changelog: remove stale readiness entry * Gateway: cover readiness hardening * Gateway: harden readiness probes
80 lines
2.5 KiB
TypeScript
80 lines
2.5 KiB
TypeScript
import type { ChannelAccountSnapshot } from "../../channels/plugins/types.js";
|
|
import {
|
|
DEFAULT_CHANNEL_CONNECT_GRACE_MS,
|
|
DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
|
|
evaluateChannelHealth,
|
|
type ChannelHealthPolicy,
|
|
type ChannelHealthEvaluation,
|
|
} from "../channel-health-policy.js";
|
|
import type { ChannelManager } from "../server-channels.js";
|
|
|
|
export type ReadinessResult = {
|
|
ready: boolean;
|
|
failing: string[];
|
|
uptimeMs: number;
|
|
};
|
|
|
|
export type ReadinessChecker = () => ReadinessResult;
|
|
|
|
const DEFAULT_READINESS_CACHE_TTL_MS = 1_000;
|
|
|
|
function shouldIgnoreReadinessFailure(
|
|
accountSnapshot: ChannelAccountSnapshot,
|
|
health: ChannelHealthEvaluation,
|
|
): boolean {
|
|
if (health.reason === "unmanaged" || health.reason === "stale-socket") {
|
|
return true;
|
|
}
|
|
// Channel restarts spend time in backoff with running=false before the next
|
|
// lifecycle re-enters startup grace. Keep readiness green during that handoff
|
|
// window, but still surface hard failures once restart attempts are exhausted.
|
|
return health.reason === "not-running" && accountSnapshot.restartPending === true;
|
|
}
|
|
|
|
export function createReadinessChecker(deps: {
|
|
channelManager: ChannelManager;
|
|
startedAt: number;
|
|
cacheTtlMs?: number;
|
|
}): ReadinessChecker {
|
|
const { channelManager, startedAt } = deps;
|
|
const cacheTtlMs = Math.max(0, deps.cacheTtlMs ?? DEFAULT_READINESS_CACHE_TTL_MS);
|
|
let cachedAt = 0;
|
|
let cachedState: Omit<ReadinessResult, "uptimeMs"> | null = null;
|
|
|
|
return (): ReadinessResult => {
|
|
const now = Date.now();
|
|
const uptimeMs = now - startedAt;
|
|
if (cachedState && now - cachedAt < cacheTtlMs) {
|
|
return { ...cachedState, uptimeMs };
|
|
}
|
|
|
|
const snapshot = channelManager.getRuntimeSnapshot();
|
|
const failing: string[] = [];
|
|
const policy: ChannelHealthPolicy = {
|
|
now,
|
|
staleEventThresholdMs: DEFAULT_CHANNEL_STALE_EVENT_THRESHOLD_MS,
|
|
channelConnectGraceMs: DEFAULT_CHANNEL_CONNECT_GRACE_MS,
|
|
};
|
|
|
|
for (const [channelId, accounts] of Object.entries(snapshot.channelAccounts)) {
|
|
if (!accounts) {
|
|
continue;
|
|
}
|
|
for (const accountSnapshot of Object.values(accounts)) {
|
|
if (!accountSnapshot) {
|
|
continue;
|
|
}
|
|
const health = evaluateChannelHealth(accountSnapshot, policy);
|
|
if (!health.healthy && !shouldIgnoreReadinessFailure(accountSnapshot, health)) {
|
|
failing.push(channelId);
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
cachedAt = now;
|
|
cachedState = { ready: failing.length === 0, failing };
|
|
return { ...cachedState, uptimeMs };
|
|
};
|
|
}
|