fix: tighten gateway restart loop handling (#23416) (thanks @jeffwnli)

This commit is contained in:
Peter Steinberger
2026-02-22 10:36:11 +01:00
parent 26acb77450
commit dd07c06d00
6 changed files with 67 additions and 14 deletions

View File

@@ -11,7 +11,9 @@ const markGatewaySigusr1RestartHandled = vi.fn();
// Vitest mock functions with fixed default results.
// Default implementation reports zero active tasks.
const getActiveTaskCount = vi.fn(() => 0);
// Default implementation resolves to { drained: true }; the timeout argument is accepted but unused.
const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
// No implementation supplied; a runGatewayLoop test asserts on how many times it is called.
const resetAllLanes = vi.fn();
const restartGatewayProcessWithFreshPid = vi.fn(() => ({ mode: "skipped" as const }));
const restartGatewayProcessWithFreshPid = vi.fn<
() => { mode: "spawned" | "supervised" | "disabled" | "failed"; pid?: number; detail?: string }
>(() => ({ mode: "disabled" }));
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
const gatewayLog = {
info: vi.fn(),
@@ -30,8 +32,7 @@ vi.mock("../../infra/restart.js", () => ({
}));
vi.mock("../../infra/process-respawn.js", () => ({
restartGatewayProcessWithFreshPid: (...args: unknown[]) =>
restartGatewayProcessWithFreshPid(...args),
restartGatewayProcessWithFreshPid: () => restartGatewayProcessWithFreshPid(),
}));
vi.mock("../../process/command-queue.js", () => ({
@@ -140,6 +141,7 @@ describe("runGatewayLoop", () => {
});
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(2);
expect(resetAllLanes).toHaveBeenCalledTimes(2);
expect(acquireGatewayLock).toHaveBeenCalledTimes(3);
} finally {
removeNewSignalListeners("SIGTERM", beforeSigterm);
removeNewSignalListeners("SIGINT", beforeSigint);
@@ -153,8 +155,6 @@ describe("runGatewayLoop", () => {
const lockRelease = vi.fn(async () => {});
acquireGatewayLock.mockResolvedValueOnce({
release: lockRelease,
lockPath: "/tmp/test.lock",
configPath: "/test/openclaw.json",
});
// Override process-respawn to return "spawned" mode

View File

@@ -23,7 +23,7 @@ export async function runGatewayLoop(params: {
start: () => Promise<Awaited<ReturnType<typeof startGatewayServer>>>;
runtime: typeof defaultRuntime;
}) {
const lock = await acquireGatewayLock();
let lock = await acquireGatewayLock();
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
let shuttingDown = false;
let restartResolver: (() => void) | null = null;
@@ -83,8 +83,12 @@ export async function runGatewayLoop(params: {
clearTimeout(forceExitTimer);
server = null;
if (isRestart) {
const hadLock = lock != null;
// Release the lock BEFORE spawning so the child can acquire it immediately.
await lock?.release();
if (lock) {
await lock.release();
lock = null;
}
const respawn = restartGatewayProcessWithFreshPid();
if (respawn.mode === "spawned" || respawn.mode === "supervised") {
const modeLabel =
@@ -102,11 +106,29 @@ export async function runGatewayLoop(params: {
} else {
gatewayLog.info("restart mode: in-process restart (OPENCLAW_NO_RESPAWN)");
}
shuttingDown = false;
restartResolver?.();
let canContinueInProcessRestart = true;
if (hadLock) {
try {
lock = await acquireGatewayLock();
} catch (err) {
gatewayLog.error(
`failed to reacquire gateway lock for in-process restart: ${String(err)}`,
);
cleanupSignals();
params.runtime.exit(1);
canContinueInProcessRestart = false;
}
}
if (canContinueInProcessRestart) {
shuttingDown = false;
restartResolver?.();
}
}
} else {
await lock?.release();
if (lock) {
await lock.release();
lock = null;
}
cleanupSignals();
params.runtime.exit(0);
}
@@ -161,7 +183,10 @@ export async function runGatewayLoop(params: {
});
}
} finally {
await lock?.release();
if (lock) {
await lock.release();
lock = null;
}
cleanupSignals();
}
}