fix(daemon): recover Windows restarts from unknown stale listeners (openclaw#24734) thanks @chilu18
Verified:
- pnpm vitest src/cli/daemon-cli/restart-health.test.ts src/cli/gateway-cli.coverage.test.ts
- pnpm oxfmt --check src/cli/daemon-cli/restart-health.ts src/cli/daemon-cli/restart-health.test.ts
- pnpm check (fails on unrelated repo baseline tsgo errors in extensions/* and src/process/exec.windows.test.ts)

Co-authored-by: chilu18 <7957943+chilu18@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
@@ -88,6 +88,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
|
||||
port: restartPort,
|
||||
attempts: POST_RESTART_HEALTH_ATTEMPTS,
|
||||
delayMs: POST_RESTART_HEALTH_DELAY_MS,
|
||||
includeUnknownListenersAsStale: process.platform === "win32",
|
||||
});
|
||||
|
||||
if (!health.healthy && health.staleGatewayPids.length > 0) {
|
||||
@@ -105,6 +106,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
|
||||
port: restartPort,
|
||||
attempts: POST_RESTART_HEALTH_ATTEMPTS,
|
||||
delayMs: POST_RESTART_HEALTH_DELAY_MS,
|
||||
includeUnknownListenersAsStale: process.platform === "win32",
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
import type { GatewayService } from "../../daemon/service.js";
|
||||
import type { PortListenerKind, PortUsage } from "../../infra/ports.js";
|
||||
|
||||
@@ -13,6 +13,8 @@ vi.mock("../../infra/ports.js", () => ({
|
||||
inspectPortUsage: (port: number) => inspectPortUsage(port),
|
||||
}));
|
||||
|
||||
const originalPlatform = process.platform;
|
||||
|
||||
describe("inspectGatewayRestart", () => {
|
||||
beforeEach(() => {
|
||||
inspectPortUsage.mockReset();
|
||||
@@ -26,6 +28,10 @@ describe("inspectGatewayRestart", () => {
|
||||
classifyPortListener.mockReturnValue("gateway");
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
Object.defineProperty(process, "platform", { value: originalPlatform, configurable: true });
|
||||
});
|
||||
|
||||
it("treats a gateway listener child pid as healthy ownership", async () => {
|
||||
const service = {
|
||||
readRuntime: vi.fn(async () => ({ status: "running", pid: 7000 })),
|
||||
@@ -63,4 +69,104 @@ describe("inspectGatewayRestart", () => {
|
||||
expect(snapshot.healthy).toBe(false);
|
||||
expect(snapshot.staleGatewayPids).toEqual([9000]);
|
||||
});
|
||||
|
||||
it("treats unknown listeners as stale on Windows when enabled", async () => {
|
||||
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
|
||||
classifyPortListener.mockReturnValue("unknown");
|
||||
|
||||
const service = {
|
||||
readRuntime: vi.fn(async () => ({ status: "stopped" })),
|
||||
} as unknown as GatewayService;
|
||||
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "busy",
|
||||
listeners: [{ pid: 10920, command: "unknown" }],
|
||||
hints: [],
|
||||
});
|
||||
|
||||
const { inspectGatewayRestart } = await import("./restart-health.js");
|
||||
const snapshot = await inspectGatewayRestart({
|
||||
service,
|
||||
port: 18789,
|
||||
includeUnknownListenersAsStale: true,
|
||||
});
|
||||
|
||||
expect(snapshot.staleGatewayPids).toEqual([10920]);
|
||||
});
|
||||
|
||||
it("does not treat unknown listeners as stale when fallback is disabled", async () => {
|
||||
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
|
||||
classifyPortListener.mockReturnValue("unknown");
|
||||
|
||||
const service = {
|
||||
readRuntime: vi.fn(async () => ({ status: "stopped" })),
|
||||
} as unknown as GatewayService;
|
||||
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "busy",
|
||||
listeners: [{ pid: 10920, command: "unknown" }],
|
||||
hints: [],
|
||||
});
|
||||
|
||||
const { inspectGatewayRestart } = await import("./restart-health.js");
|
||||
const snapshot = await inspectGatewayRestart({
|
||||
service,
|
||||
port: 18789,
|
||||
includeUnknownListenersAsStale: false,
|
||||
});
|
||||
|
||||
expect(snapshot.staleGatewayPids).toEqual([]);
|
||||
});
|
||||
|
||||
it("does not apply unknown-listener fallback while runtime is running", async () => {
|
||||
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
|
||||
classifyPortListener.mockReturnValue("unknown");
|
||||
|
||||
const service = {
|
||||
readRuntime: vi.fn(async () => ({ status: "running", pid: 10920 })),
|
||||
} as unknown as GatewayService;
|
||||
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "busy",
|
||||
listeners: [{ pid: 10920, command: "unknown" }],
|
||||
hints: [],
|
||||
});
|
||||
|
||||
const { inspectGatewayRestart } = await import("./restart-health.js");
|
||||
const snapshot = await inspectGatewayRestart({
|
||||
service,
|
||||
port: 18789,
|
||||
includeUnknownListenersAsStale: true,
|
||||
});
|
||||
|
||||
expect(snapshot.staleGatewayPids).toEqual([]);
|
||||
});
|
||||
|
||||
it("does not treat known non-gateway listeners as stale in fallback mode", async () => {
|
||||
Object.defineProperty(process, "platform", { value: "win32", configurable: true });
|
||||
classifyPortListener.mockReturnValue("ssh");
|
||||
|
||||
const service = {
|
||||
readRuntime: vi.fn(async () => ({ status: "stopped" })),
|
||||
} as unknown as GatewayService;
|
||||
|
||||
inspectPortUsage.mockResolvedValue({
|
||||
port: 18789,
|
||||
status: "busy",
|
||||
listeners: [{ pid: 22001, command: "nginx.exe" }],
|
||||
hints: [],
|
||||
});
|
||||
|
||||
const { inspectGatewayRestart } = await import("./restart-health.js");
|
||||
const snapshot = await inspectGatewayRestart({
|
||||
service,
|
||||
port: 18789,
|
||||
includeUnknownListenersAsStale: true,
|
||||
});
|
||||
|
||||
expect(snapshot.staleGatewayPids).toEqual([]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,6 +6,7 @@ import {
|
||||
inspectPortUsage,
|
||||
type PortUsage,
|
||||
} from "../../infra/ports.js";
|
||||
import { killProcessTree } from "../../process/kill-tree.js";
|
||||
import { sleep } from "../../utils.js";
|
||||
|
||||
export const DEFAULT_RESTART_HEALTH_TIMEOUT_MS = 60_000;
|
||||
@@ -32,6 +33,7 @@ export async function inspectGatewayRestart(params: {
|
||||
service: GatewayService;
|
||||
port: number;
|
||||
env?: NodeJS.ProcessEnv;
|
||||
includeUnknownListenersAsStale?: boolean;
|
||||
}): Promise<GatewayRestartSnapshot> {
|
||||
const env = params.env ?? process.env;
|
||||
let runtime: GatewayServiceRuntime = { status: "unknown" };
|
||||
@@ -60,6 +62,16 @@ export async function inspectGatewayRestart(params: {
|
||||
(listener) => classifyPortListener(listener, params.port) === "gateway",
|
||||
)
|
||||
: [];
|
||||
const fallbackListenerPids =
|
||||
params.includeUnknownListenersAsStale &&
|
||||
process.platform === "win32" &&
|
||||
runtime.status !== "running" &&
|
||||
portUsage.status === "busy"
|
||||
? portUsage.listeners
|
||||
.filter((listener) => classifyPortListener(listener, params.port) === "unknown")
|
||||
.map((listener) => listener.pid)
|
||||
.filter((pid): pid is number => Number.isFinite(pid))
|
||||
: [];
|
||||
const running = runtime.status === "running";
|
||||
const runtimePid = runtime.pid;
|
||||
const ownsPort =
|
||||
@@ -69,8 +81,8 @@ export async function inspectGatewayRestart(params: {
|
||||
(portUsage.status === "busy" && portUsage.listeners.length === 0);
|
||||
const healthy = running && ownsPort;
|
||||
const staleGatewayPids = Array.from(
|
||||
new Set(
|
||||
gatewayListeners
|
||||
new Set([
|
||||
...gatewayListeners
|
||||
.filter((listener) => Number.isFinite(listener.pid))
|
||||
.filter((listener) => {
|
||||
if (!running) {
|
||||
@@ -82,7 +94,10 @@ export async function inspectGatewayRestart(params: {
|
||||
return !listenerOwnedByRuntimePid({ listener, runtimePid });
|
||||
})
|
||||
.map((listener) => listener.pid as number),
|
||||
),
|
||||
...fallbackListenerPids.filter(
|
||||
(pid) => runtime.pid == null || pid !== runtime.pid || !running,
|
||||
),
|
||||
]),
|
||||
);
|
||||
|
||||
return {
|
||||
@@ -99,6 +114,7 @@ export async function waitForGatewayHealthyRestart(params: {
|
||||
attempts?: number;
|
||||
delayMs?: number;
|
||||
env?: NodeJS.ProcessEnv;
|
||||
includeUnknownListenersAsStale?: boolean;
|
||||
}): Promise<GatewayRestartSnapshot> {
|
||||
const attempts = params.attempts ?? DEFAULT_RESTART_HEALTH_ATTEMPTS;
|
||||
const delayMs = params.delayMs ?? DEFAULT_RESTART_HEALTH_DELAY_MS;
|
||||
@@ -107,6 +123,7 @@ export async function waitForGatewayHealthyRestart(params: {
|
||||
service: params.service,
|
||||
port: params.port,
|
||||
env: params.env,
|
||||
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
|
||||
});
|
||||
|
||||
for (let attempt = 0; attempt < attempts; attempt += 1) {
|
||||
@@ -121,6 +138,7 @@ export async function waitForGatewayHealthyRestart(params: {
|
||||
service: params.service,
|
||||
port: params.port,
|
||||
env: params.env,
|
||||
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -156,36 +174,14 @@ export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): stri
|
||||
}
|
||||
|
||||
export async function terminateStaleGatewayPids(pids: number[]): Promise<number[]> {
|
||||
const killed: number[] = [];
|
||||
for (const pid of pids) {
|
||||
try {
|
||||
process.kill(pid, "SIGTERM");
|
||||
killed.push(pid);
|
||||
} catch (err) {
|
||||
const code = (err as NodeJS.ErrnoException)?.code;
|
||||
if (code !== "ESRCH") {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
const targets = Array.from(
|
||||
new Set(pids.filter((pid): pid is number => Number.isFinite(pid) && pid > 0)),
|
||||
);
|
||||
for (const pid of targets) {
|
||||
killProcessTree(pid, { graceMs: 300 });
|
||||
}
|
||||
|
||||
if (killed.length === 0) {
|
||||
return killed;
|
||||
if (targets.length > 0) {
|
||||
await sleep(500);
|
||||
}
|
||||
|
||||
await sleep(400);
|
||||
|
||||
for (const pid of killed) {
|
||||
try {
|
||||
process.kill(pid, 0);
|
||||
process.kill(pid, "SIGKILL");
|
||||
} catch (err) {
|
||||
const code = (err as NodeJS.ErrnoException)?.code;
|
||||
if (code !== "ESRCH") {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return killed;
|
||||
return targets;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user