fix(daemon): recover Windows restarts from unknown stale listeners (openclaw#24734) thanks @chilu18

Verified:
- pnpm vitest src/cli/daemon-cli/restart-health.test.ts src/cli/gateway-cli.coverage.test.ts
- pnpm oxfmt --check src/cli/daemon-cli/restart-health.ts src/cli/daemon-cli/restart-health.test.ts
- pnpm check (fails on unrelated repo baseline tsgo errors in extensions/* and src/process/exec.windows.test.ts)

Co-authored-by: chilu18 <7957943+chilu18@users.noreply.github.com>
Co-authored-by: Tak Hoffman <781889+Takhoffman@users.noreply.github.com>
This commit is contained in:
Peter Machona
2026-03-02 14:24:25 +00:00
committed by GitHub
parent a05b8f47b1
commit c2d41dc473
3 changed files with 138 additions and 34 deletions

View File

@@ -88,6 +88,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
port: restartPort,
attempts: POST_RESTART_HEALTH_ATTEMPTS,
delayMs: POST_RESTART_HEALTH_DELAY_MS,
includeUnknownListenersAsStale: process.platform === "win32",
});
if (!health.healthy && health.staleGatewayPids.length > 0) {
@@ -105,6 +106,7 @@ export async function runDaemonRestart(opts: DaemonLifecycleOptions = {}): Promi
port: restartPort,
attempts: POST_RESTART_HEALTH_ATTEMPTS,
delayMs: POST_RESTART_HEALTH_DELAY_MS,
includeUnknownListenersAsStale: process.platform === "win32",
});
}

View File

@@ -1,4 +1,4 @@
import { beforeEach, describe, expect, it, vi } from "vitest";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { GatewayService } from "../../daemon/service.js";
import type { PortListenerKind, PortUsage } from "../../infra/ports.js";
@@ -13,6 +13,8 @@ vi.mock("../../infra/ports.js", () => ({
inspectPortUsage: (port: number) => inspectPortUsage(port),
}));
const originalPlatform = process.platform;
describe("inspectGatewayRestart", () => {
beforeEach(() => {
inspectPortUsage.mockReset();
@@ -26,6 +28,10 @@ describe("inspectGatewayRestart", () => {
classifyPortListener.mockReturnValue("gateway");
});
// Several tests override process.platform; put the value captured at module
// load back after every test so the override never leaks into the next one.
afterEach(() => {
  const restoredPlatform: PropertyDescriptor = {
    value: originalPlatform,
    configurable: true,
  };
  Object.defineProperty(process, "platform", restoredPlatform);
});
it("treats a gateway listener child pid as healthy ownership", async () => {
const service = {
readRuntime: vi.fn(async () => ({ status: "running", pid: 7000 })),
@@ -63,4 +69,104 @@ describe("inspectGatewayRestart", () => {
expect(snapshot.healthy).toBe(false);
expect(snapshot.staleGatewayPids).toEqual([9000]);
});
// Positive case for the fallback: platform reports win32, the service runtime
// is stopped, the port is busy, and the only listener classifies as "unknown".
// With the flag on, that listener's pid must be reported as stale.
it("treats unknown listeners as stale on Windows when enabled", async () => {
  Object.defineProperty(process, "platform", { value: "win32", configurable: true });

  inspectPortUsage.mockResolvedValue({
    port: 18789,
    status: "busy",
    listeners: [{ pid: 10920, command: "unknown" }],
    hints: [],
  });
  classifyPortListener.mockReturnValue("unknown");
  const stoppedService = {
    readRuntime: vi.fn().mockResolvedValue({ status: "stopped" }),
  } as unknown as GatewayService;

  const restartHealth = await import("./restart-health.js");
  const { staleGatewayPids } = await restartHealth.inspectGatewayRestart({
    service: stoppedService,
    port: 18789,
    includeUnknownListenersAsStale: true,
  });

  expect(staleGatewayPids).toEqual([10920]);
});
// Same fixture as the enabled case (win32, stopped runtime, busy port, one
// "unknown" listener) but with the flag explicitly off: nothing may be
// reported as stale.
it("does not treat unknown listeners as stale when fallback is disabled", async () => {
  Object.defineProperty(process, "platform", { value: "win32", configurable: true });

  inspectPortUsage.mockResolvedValue({
    port: 18789,
    status: "busy",
    listeners: [{ pid: 10920, command: "unknown" }],
    hints: [],
  });
  classifyPortListener.mockReturnValue("unknown");
  const stoppedService = {
    readRuntime: vi.fn().mockResolvedValue({ status: "stopped" }),
  } as unknown as GatewayService;

  const restartHealth = await import("./restart-health.js");
  const { staleGatewayPids } = await restartHealth.inspectGatewayRestart({
    service: stoppedService,
    port: 18789,
    includeUnknownListenersAsStale: false,
  });

  expect(staleGatewayPids).toEqual([]);
});
// The fallback is gated on the runtime NOT being "running". Here the runtime
// reports running with the same pid as the "unknown" listener, so even with
// the flag on the listener must not be flagged as stale.
it("does not apply unknown-listener fallback while runtime is running", async () => {
  Object.defineProperty(process, "platform", { value: "win32", configurable: true });

  inspectPortUsage.mockResolvedValue({
    port: 18789,
    status: "busy",
    listeners: [{ pid: 10920, command: "unknown" }],
    hints: [],
  });
  classifyPortListener.mockReturnValue("unknown");
  const runningService = {
    readRuntime: vi.fn().mockResolvedValue({ status: "running", pid: 10920 }),
  } as unknown as GatewayService;

  const restartHealth = await import("./restart-health.js");
  const { staleGatewayPids } = await restartHealth.inspectGatewayRestart({
    service: runningService,
    port: 18789,
    includeUnknownListenersAsStale: true,
  });

  expect(staleGatewayPids).toEqual([]);
});
// Only listeners classified as "unknown" qualify for the fallback. The mocked
// classifier returns a known non-gateway kind ("ssh") here, so the listener
// must be left alone even with the flag on and the runtime stopped.
it("does not treat known non-gateway listeners as stale in fallback mode", async () => {
  Object.defineProperty(process, "platform", { value: "win32", configurable: true });

  inspectPortUsage.mockResolvedValue({
    port: 18789,
    status: "busy",
    listeners: [{ pid: 22001, command: "nginx.exe" }],
    hints: [],
  });
  classifyPortListener.mockReturnValue("ssh");
  const stoppedService = {
    readRuntime: vi.fn().mockResolvedValue({ status: "stopped" }),
  } as unknown as GatewayService;

  const restartHealth = await import("./restart-health.js");
  const { staleGatewayPids } = await restartHealth.inspectGatewayRestart({
    service: stoppedService,
    port: 18789,
    includeUnknownListenersAsStale: true,
  });

  expect(staleGatewayPids).toEqual([]);
});
});

View File

@@ -6,6 +6,7 @@ import {
inspectPortUsage,
type PortUsage,
} from "../../infra/ports.js";
import { killProcessTree } from "../../process/kill-tree.js";
import { sleep } from "../../utils.js";
export const DEFAULT_RESTART_HEALTH_TIMEOUT_MS = 60_000;
@@ -32,6 +33,7 @@ export async function inspectGatewayRestart(params: {
service: GatewayService;
port: number;
env?: NodeJS.ProcessEnv;
includeUnknownListenersAsStale?: boolean;
}): Promise<GatewayRestartSnapshot> {
const env = params.env ?? process.env;
let runtime: GatewayServiceRuntime = { status: "unknown" };
@@ -60,6 +62,16 @@ export async function inspectGatewayRestart(params: {
(listener) => classifyPortListener(listener, params.port) === "gateway",
)
: [];
const fallbackListenerPids =
params.includeUnknownListenersAsStale &&
process.platform === "win32" &&
runtime.status !== "running" &&
portUsage.status === "busy"
? portUsage.listeners
.filter((listener) => classifyPortListener(listener, params.port) === "unknown")
.map((listener) => listener.pid)
.filter((pid): pid is number => Number.isFinite(pid))
: [];
const running = runtime.status === "running";
const runtimePid = runtime.pid;
const ownsPort =
@@ -69,8 +81,8 @@ export async function inspectGatewayRestart(params: {
(portUsage.status === "busy" && portUsage.listeners.length === 0);
const healthy = running && ownsPort;
const staleGatewayPids = Array.from(
new Set(
gatewayListeners
new Set([
...gatewayListeners
.filter((listener) => Number.isFinite(listener.pid))
.filter((listener) => {
if (!running) {
@@ -82,7 +94,10 @@ export async function inspectGatewayRestart(params: {
return !listenerOwnedByRuntimePid({ listener, runtimePid });
})
.map((listener) => listener.pid as number),
),
...fallbackListenerPids.filter(
(pid) => runtime.pid == null || pid !== runtime.pid || !running,
),
]),
);
return {
@@ -99,6 +114,7 @@ export async function waitForGatewayHealthyRestart(params: {
attempts?: number;
delayMs?: number;
env?: NodeJS.ProcessEnv;
includeUnknownListenersAsStale?: boolean;
}): Promise<GatewayRestartSnapshot> {
const attempts = params.attempts ?? DEFAULT_RESTART_HEALTH_ATTEMPTS;
const delayMs = params.delayMs ?? DEFAULT_RESTART_HEALTH_DELAY_MS;
@@ -107,6 +123,7 @@ export async function waitForGatewayHealthyRestart(params: {
service: params.service,
port: params.port,
env: params.env,
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
});
for (let attempt = 0; attempt < attempts; attempt += 1) {
@@ -121,6 +138,7 @@ export async function waitForGatewayHealthyRestart(params: {
service: params.service,
port: params.port,
env: params.env,
includeUnknownListenersAsStale: params.includeUnknownListenersAsStale,
});
}
@@ -156,36 +174,14 @@ export function renderRestartDiagnostics(snapshot: GatewayRestartSnapshot): stri
}
export async function terminateStaleGatewayPids(pids: number[]): Promise<number[]> {
const killed: number[] = [];
for (const pid of pids) {
try {
process.kill(pid, "SIGTERM");
killed.push(pid);
} catch (err) {
const code = (err as NodeJS.ErrnoException)?.code;
if (code !== "ESRCH") {
throw err;
}
}
const targets = Array.from(
new Set(pids.filter((pid): pid is number => Number.isFinite(pid) && pid > 0)),
);
for (const pid of targets) {
killProcessTree(pid, { graceMs: 300 });
}
if (killed.length === 0) {
return killed;
if (targets.length > 0) {
await sleep(500);
}
await sleep(400);
for (const pid of killed) {
try {
process.kill(pid, 0);
process.kill(pid, "SIGKILL");
} catch (err) {
const code = (err as NodeJS.ErrnoException)?.code;
if (code !== "ESRCH") {
throw err;
}
}
}
return killed;
return targets;
}