Files
openclaw/src/gateway/config-reload.test.ts
Tak Hoffman 1be39d4250 fix(gateway): synthesize lifecycle robustness for restart and startup probes (#33831)
* fix(gateway): correct launchctl command sequence for gateway restart (closes #20030)

* fix(restart): expand HOME and escape label in launchctl plist path

* fix(restart): poll port free after SIGKILL to prevent EADDRINUSE restart loop

When cleanStaleGatewayProcessesSync() kills a stale gateway process,
the kernel may not immediately release the TCP port. Previously the
function returned after a fixed 500ms sleep (300ms SIGTERM + 200ms
SIGKILL), allowing triggerOpenClawRestart() to hand off to systemd
before the port was actually free. The new systemd process then raced
the dying socket for port 18789, hit EADDRINUSE, and exited with
status 1, causing systemd to retry indefinitely — the zombie restart
loop reported in #33103.

Fix: add waitForPortFreeSync() that polls lsof at 50ms intervals for
up to 2 seconds after SIGKILL. cleanStaleGatewayProcessesSync() now
blocks until the port is confirmed free (or the budget expires with a
warning) before returning. The increased SIGTERM/SIGKILL wait budgets
(600ms / 400ms) also give slow processes more time to exit cleanly.

Fixes #33103
Related: #28134

* fix: add EADDRINUSE retry and TIME_WAIT port-bind checks for gateway startup

* fix(ports): treat EADDRNOTAVAIL as non-retryable and fix flaky test

* fix(gateway): hot-reload agents.defaults.models allowlist changes

The reload plan had a rule for `agents.defaults.model` (singular) but
not `agents.defaults.models` (plural — the allowlist array).  Because
`agents.defaults.models` does not prefix-match `agents.defaults.model.`,
it fell through to the catch-all `agents` tail rule (kind=none), so
allowlist edits in openclaw.json were silently ignored at runtime.

Add a dedicated reload rule so changes to the models allowlist trigger
a heartbeat restart, which re-reads the config and serves the updated
list to clients.

Fixes #33600

Co-authored-by: HCL <chenglunhu@gmail.com>
Signed-off-by: HCL <chenglunhu@gmail.com>

* test(restart): 100% branch coverage — audit round 2

Audit findings fixed:
- remove dead guard: terminateStaleProcessesSync pids.length===0 check was
  unreachable (only caller cleanStaleGatewayProcessesSync already guards)
- expose __testing.callSleepSyncRaw so sleepSync's real Atomics.wait path
  can be unit-tested directly without going through the override
- fix broken sleepSync Atomics.wait test: previous test set override=null
  but cleanStaleGatewayProcessesSync returned before calling sleepSync —
  replaced with direct callSleepSyncRaw calls that actually exercise L36/L42-47
- fix pid collision: two tests used process.pid+304 (EPERM + dead-at-SIGTERM);
  EPERM test changed to process.pid+305
- fix misindented tests: 'deduplicates pids' and 'lsof status 1 container
  edge case' were outside their intended describe blocks; moved to correct
  scopes (findGatewayPidsOnPortSync and pollPortOnce respectively)
- add missing branch tests:
  - status 1 + non-empty stdout with zero openclaw pids → free:true (L145)
  - mid-loop non-openclaw cmd in &&-chain (L67)
  - consecutive p-lines without c-line between them (L67)
  - invalid PID in p-line (p0 / pNaN) — ternary false branch (L67)
  - unknown lsof output line (else-if false branch L69)

Coverage: 100% stmts / 100% branch / 100% funcs / 100% lines (36 tests)

* test(restart): fix stale-pid test typing for tsgo

* fix(gateway): address lifecycle review findings

* test(update): make restart-helper path assertions windows-safe

---------

Signed-off-by: HCL <chenglunhu@gmail.com>
Co-authored-by: Glucksberg <markuscontasul@gmail.com>
Co-authored-by: Efe Büken <efe@arven.digital>
Co-authored-by: Riccardo Marino <rmarino@apple.com>
Co-authored-by: HCL <chenglunhu@gmail.com>
2026-03-03 21:31:12 -06:00

419 lines
13 KiB
TypeScript

import chokidar from "chokidar";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import { listChannelPlugins } from "../channels/plugins/index.js";
import type { ChannelPlugin } from "../channels/plugins/types.js";
import type { ConfigFileSnapshot } from "../config/config.js";
import { setActivePluginRegistry } from "../plugins/runtime.js";
import { createTestRegistry } from "../test-utils/channel-plugins.js";
import {
buildGatewayReloadPlan,
diffConfigPaths,
resolveGatewayReloadSettings,
startGatewayConfigReloader,
} from "./config-reload.js";
describe("diffConfigPaths", () => {
it("captures nested config changes", () => {
const prev = { hooks: { gmail: { account: "a" } } };
const next = { hooks: { gmail: { account: "b" } } };
const paths = diffConfigPaths(prev, next);
expect(paths).toContain("hooks.gmail.account");
});
it("captures array changes", () => {
const prev = { messages: { groupChat: { mentionPatterns: ["a"] } } };
const next = { messages: { groupChat: { mentionPatterns: ["b"] } } };
const paths = diffConfigPaths(prev, next);
expect(paths).toContain("messages.groupChat.mentionPatterns");
});
it("does not report unchanged arrays of objects as changed", () => {
const prev = {
memory: {
qmd: {
paths: [{ path: "~/docs", pattern: "**/*.md", name: "docs" }],
scope: {
rules: [{ when: { channel: "slack" }, include: ["docs"] }],
},
},
},
};
const next = {
memory: {
qmd: {
paths: [{ path: "~/docs", pattern: "**/*.md", name: "docs" }],
scope: {
rules: [{ when: { channel: "slack" }, include: ["docs"] }],
},
},
},
};
expect(diffConfigPaths(prev, next)).toEqual([]);
});
it("reports changed arrays of objects", () => {
const prev = {
memory: {
qmd: {
paths: [{ path: "~/docs", pattern: "**/*.md", name: "docs" }],
},
},
};
const next = {
memory: {
qmd: {
paths: [{ path: "~/docs", pattern: "**/*.txt", name: "docs" }],
},
},
};
expect(diffConfigPaths(prev, next)).toContain("memory.qmd.paths");
});
});
describe("buildGatewayReloadPlan", () => {
const emptyRegistry = createTestRegistry([]);
const telegramPlugin: ChannelPlugin = {
id: "telegram",
meta: {
id: "telegram",
label: "Telegram",
selectionLabel: "Telegram",
docsPath: "/channels/telegram",
blurb: "test",
},
capabilities: { chatTypes: ["direct"] },
config: {
listAccountIds: () => [],
resolveAccount: () => ({}),
},
reload: { configPrefixes: ["channels.telegram"] },
};
const whatsappPlugin: ChannelPlugin = {
id: "whatsapp",
meta: {
id: "whatsapp",
label: "WhatsApp",
selectionLabel: "WhatsApp",
docsPath: "/channels/whatsapp",
blurb: "test",
},
capabilities: { chatTypes: ["direct"] },
config: {
listAccountIds: () => [],
resolveAccount: () => ({}),
},
reload: { configPrefixes: ["web"], noopPrefixes: ["channels.whatsapp"] },
};
const registry = createTestRegistry([
{ pluginId: "telegram", plugin: telegramPlugin, source: "test" },
{ pluginId: "whatsapp", plugin: whatsappPlugin, source: "test" },
]);
beforeEach(() => {
setActivePluginRegistry(registry);
});
afterEach(() => {
setActivePluginRegistry(emptyRegistry);
});
it("marks gateway changes as restart required", () => {
const plan = buildGatewayReloadPlan(["gateway.port"]);
expect(plan.restartGateway).toBe(true);
expect(plan.restartReasons).toContain("gateway.port");
});
it("restarts the Gmail watcher for hooks.gmail changes", () => {
const plan = buildGatewayReloadPlan(["hooks.gmail.account"]);
expect(plan.restartGateway).toBe(false);
expect(plan.restartGmailWatcher).toBe(true);
expect(plan.reloadHooks).toBe(true);
});
it("restarts providers when provider config prefixes change", () => {
const changedPaths = ["web.enabled", "channels.telegram.botToken"];
const plan = buildGatewayReloadPlan(changedPaths);
expect(plan.restartGateway).toBe(false);
const expected = new Set(
listChannelPlugins()
.filter((plugin) =>
(plugin.reload?.configPrefixes ?? []).some((prefix) =>
changedPaths.some((path) => path === prefix || path.startsWith(`${prefix}.`)),
),
)
.map((plugin) => plugin.id),
);
expect(expected.size).toBeGreaterThan(0);
expect(plan.restartChannels).toEqual(expected);
});
it("restarts heartbeat when model-related config changes", () => {
const plan = buildGatewayReloadPlan([
"models.providers.openai.models",
"agents.defaults.model",
]);
expect(plan.restartGateway).toBe(false);
expect(plan.restartHeartbeat).toBe(true);
expect(plan.hotReasons).toEqual(
expect.arrayContaining(["models.providers.openai.models", "agents.defaults.model"]),
);
});
it("restarts heartbeat when agents.defaults.models allowlist changes", () => {
const plan = buildGatewayReloadPlan(["agents.defaults.models"]);
expect(plan.restartGateway).toBe(false);
expect(plan.restartHeartbeat).toBe(true);
expect(plan.hotReasons).toContain("agents.defaults.models");
expect(plan.noopPaths).toEqual([]);
});
it("hot-reloads health monitor when channelHealthCheckMinutes changes", () => {
const plan = buildGatewayReloadPlan(["gateway.channelHealthCheckMinutes"]);
expect(plan.restartGateway).toBe(false);
expect(plan.restartHealthMonitor).toBe(true);
expect(plan.hotReasons).toContain("gateway.channelHealthCheckMinutes");
});
it("treats gateway.remote as no-op", () => {
const plan = buildGatewayReloadPlan(["gateway.remote.url"]);
expect(plan.restartGateway).toBe(false);
expect(plan.noopPaths).toContain("gateway.remote.url");
});
it("treats secrets config changes as no-op for gateway restart planning", () => {
const plan = buildGatewayReloadPlan(["secrets.providers.default.path"]);
expect(plan.restartGateway).toBe(false);
expect(plan.noopPaths).toContain("secrets.providers.default.path");
});
it("treats diagnostics.stuckSessionWarnMs as no-op for gateway restart planning", () => {
const plan = buildGatewayReloadPlan(["diagnostics.stuckSessionWarnMs"]);
expect(plan.restartGateway).toBe(false);
expect(plan.noopPaths).toContain("diagnostics.stuckSessionWarnMs");
});
it("defaults unknown paths to restart", () => {
const plan = buildGatewayReloadPlan(["unknownField"]);
expect(plan.restartGateway).toBe(true);
});
it.each([
{
path: "gateway.channelHealthCheckMinutes",
expectRestartGateway: false,
expectHotPath: "gateway.channelHealthCheckMinutes",
expectRestartHealthMonitor: true,
},
{
path: "hooks.gmail.account",
expectRestartGateway: false,
expectHotPath: "hooks.gmail.account",
expectRestartGmailWatcher: true,
expectReloadHooks: true,
},
{
path: "gateway.remote.url",
expectRestartGateway: false,
expectNoopPath: "gateway.remote.url",
},
{
path: "unknownField",
expectRestartGateway: true,
expectRestartReason: "unknownField",
},
])("classifies reload path: $path", (testCase) => {
const plan = buildGatewayReloadPlan([testCase.path]);
expect(plan.restartGateway).toBe(testCase.expectRestartGateway);
if (testCase.expectHotPath) {
expect(plan.hotReasons).toContain(testCase.expectHotPath);
}
if (testCase.expectNoopPath) {
expect(plan.noopPaths).toContain(testCase.expectNoopPath);
}
if (testCase.expectRestartReason) {
expect(plan.restartReasons).toContain(testCase.expectRestartReason);
}
if (testCase.expectRestartHealthMonitor) {
expect(plan.restartHealthMonitor).toBe(true);
}
if (testCase.expectRestartGmailWatcher) {
expect(plan.restartGmailWatcher).toBe(true);
}
if (testCase.expectReloadHooks) {
expect(plan.reloadHooks).toBe(true);
}
});
});
describe("resolveGatewayReloadSettings", () => {
it("uses defaults when unset", () => {
const settings = resolveGatewayReloadSettings({});
expect(settings.mode).toBe("hybrid");
expect(settings.debounceMs).toBe(300);
});
});
type WatcherHandler = () => void;
type WatcherEvent = "add" | "change" | "unlink" | "error";
function createWatcherMock() {
const handlers = new Map<WatcherEvent, WatcherHandler[]>();
return {
on(event: WatcherEvent, handler: WatcherHandler) {
const existing = handlers.get(event) ?? [];
existing.push(handler);
handlers.set(event, existing);
return this;
},
emit(event: WatcherEvent) {
for (const handler of handlers.get(event) ?? []) {
handler();
}
},
close: vi.fn(async () => {}),
};
}
function makeSnapshot(partial: Partial<ConfigFileSnapshot> = {}): ConfigFileSnapshot {
return {
path: "/tmp/openclaw.json",
exists: true,
raw: "{}",
parsed: {},
resolved: {},
valid: true,
config: {},
issues: [],
warnings: [],
legacyIssues: [],
...partial,
};
}
function createReloaderHarness(readSnapshot: () => Promise<ConfigFileSnapshot>) {
const watcher = createWatcherMock();
vi.spyOn(chokidar, "watch").mockReturnValue(watcher as unknown as never);
const onHotReload = vi.fn(async () => {});
const onRestart = vi.fn();
const log = {
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
};
const reloader = startGatewayConfigReloader({
initialConfig: { gateway: { reload: { debounceMs: 0 } } },
readSnapshot,
onHotReload,
onRestart,
log,
watchPath: "/tmp/openclaw.json",
});
return { watcher, onHotReload, onRestart, log, reloader };
}
describe("startGatewayConfigReloader", () => {
beforeEach(() => {
vi.useFakeTimers();
});
afterEach(() => {
vi.useRealTimers();
vi.restoreAllMocks();
});
it("retries missing snapshots and reloads once config file reappears", async () => {
const readSnapshot = vi
.fn<() => Promise<ConfigFileSnapshot>>()
.mockResolvedValueOnce(makeSnapshot({ exists: false, raw: null, hash: "missing-1" }))
.mockResolvedValueOnce(
makeSnapshot({
config: {
gateway: { reload: { debounceMs: 0 } },
hooks: { enabled: true },
},
hash: "next-1",
}),
);
const { watcher, onHotReload, onRestart, log, reloader } = createReloaderHarness(readSnapshot);
watcher.emit("unlink");
await vi.runOnlyPendingTimersAsync();
await vi.advanceTimersByTimeAsync(150);
expect(readSnapshot).toHaveBeenCalledTimes(2);
expect(onHotReload).toHaveBeenCalledTimes(1);
expect(onRestart).not.toHaveBeenCalled();
expect(log.info).toHaveBeenCalledWith("config reload retry (1/2): config file not found");
expect(log.warn).not.toHaveBeenCalledWith("config reload skipped (config file not found)");
await reloader.stop();
});
it("caps missing-file retries and skips reload after retry budget is exhausted", async () => {
const readSnapshot = vi
.fn<() => Promise<ConfigFileSnapshot>>()
.mockResolvedValue(makeSnapshot({ exists: false, raw: null, hash: "missing" }));
const { watcher, onHotReload, onRestart, log, reloader } = createReloaderHarness(readSnapshot);
watcher.emit("unlink");
await vi.runAllTimersAsync();
expect(readSnapshot).toHaveBeenCalledTimes(3);
expect(onHotReload).not.toHaveBeenCalled();
expect(onRestart).not.toHaveBeenCalled();
expect(log.warn).toHaveBeenCalledWith("config reload skipped (config file not found)");
await reloader.stop();
});
it("contains restart callback failures and retries on subsequent changes", async () => {
const readSnapshot = vi
.fn<() => Promise<ConfigFileSnapshot>>()
.mockResolvedValueOnce(
makeSnapshot({
config: {
gateway: { reload: { debounceMs: 0 }, port: 18790 },
},
hash: "restart-1",
}),
)
.mockResolvedValueOnce(
makeSnapshot({
config: {
gateway: { reload: { debounceMs: 0 }, port: 18791 },
},
hash: "restart-2",
}),
);
const { watcher, onHotReload, onRestart, log, reloader } = createReloaderHarness(readSnapshot);
onRestart.mockRejectedValueOnce(new Error("restart-check failed"));
onRestart.mockResolvedValueOnce(undefined);
const unhandled: unknown[] = [];
const onUnhandled = (reason: unknown) => {
unhandled.push(reason);
};
process.on("unhandledRejection", onUnhandled);
try {
watcher.emit("change");
await vi.runOnlyPendingTimersAsync();
await Promise.resolve();
expect(onHotReload).not.toHaveBeenCalled();
expect(onRestart).toHaveBeenCalledTimes(1);
expect(log.error).toHaveBeenCalledWith("config restart failed: Error: restart-check failed");
expect(unhandled).toEqual([]);
watcher.emit("change");
await vi.runOnlyPendingTimersAsync();
await Promise.resolve();
expect(onRestart).toHaveBeenCalledTimes(2);
expect(unhandled).toEqual([]);
} finally {
process.off("unhandledRejection", onUnhandled);
await reloader.stop();
}
});
});