fix: tighten gateway restart loop handling (#23416) (thanks @jeffwnli)
This commit is contained in:
@@ -11,7 +11,9 @@ const markGatewaySigusr1RestartHandled = vi.fn();
|
||||
const getActiveTaskCount = vi.fn(() => 0);
|
||||
const waitForActiveTasks = vi.fn(async (_timeoutMs: number) => ({ drained: true }));
|
||||
const resetAllLanes = vi.fn();
|
||||
const restartGatewayProcessWithFreshPid = vi.fn(() => ({ mode: "skipped" as const }));
|
||||
const restartGatewayProcessWithFreshPid = vi.fn<
|
||||
() => { mode: "spawned" | "supervised" | "disabled" | "failed"; pid?: number; detail?: string }
|
||||
>(() => ({ mode: "disabled" }));
|
||||
const DRAIN_TIMEOUT_LOG = "drain timeout reached; proceeding with restart";
|
||||
const gatewayLog = {
|
||||
info: vi.fn(),
|
||||
@@ -30,8 +32,7 @@ vi.mock("../../infra/restart.js", () => ({
|
||||
}));
|
||||
|
||||
vi.mock("../../infra/process-respawn.js", () => ({
|
||||
restartGatewayProcessWithFreshPid: (...args: unknown[]) =>
|
||||
restartGatewayProcessWithFreshPid(...args),
|
||||
restartGatewayProcessWithFreshPid: () => restartGatewayProcessWithFreshPid(),
|
||||
}));
|
||||
|
||||
vi.mock("../../process/command-queue.js", () => ({
|
||||
@@ -140,6 +141,7 @@ describe("runGatewayLoop", () => {
|
||||
});
|
||||
expect(markGatewaySigusr1RestartHandled).toHaveBeenCalledTimes(2);
|
||||
expect(resetAllLanes).toHaveBeenCalledTimes(2);
|
||||
expect(acquireGatewayLock).toHaveBeenCalledTimes(3);
|
||||
} finally {
|
||||
removeNewSignalListeners("SIGTERM", beforeSigterm);
|
||||
removeNewSignalListeners("SIGINT", beforeSigint);
|
||||
@@ -153,8 +155,6 @@ describe("runGatewayLoop", () => {
|
||||
const lockRelease = vi.fn(async () => {});
|
||||
acquireGatewayLock.mockResolvedValueOnce({
|
||||
release: lockRelease,
|
||||
lockPath: "/tmp/test.lock",
|
||||
configPath: "/test/openclaw.json",
|
||||
});
|
||||
|
||||
// Override process-respawn to return "spawned" mode
|
||||
|
||||
@@ -23,7 +23,7 @@ export async function runGatewayLoop(params: {
|
||||
start: () => Promise<Awaited<ReturnType<typeof startGatewayServer>>>;
|
||||
runtime: typeof defaultRuntime;
|
||||
}) {
|
||||
const lock = await acquireGatewayLock();
|
||||
let lock = await acquireGatewayLock();
|
||||
let server: Awaited<ReturnType<typeof startGatewayServer>> | null = null;
|
||||
let shuttingDown = false;
|
||||
let restartResolver: (() => void) | null = null;
|
||||
@@ -83,8 +83,12 @@ export async function runGatewayLoop(params: {
|
||||
clearTimeout(forceExitTimer);
|
||||
server = null;
|
||||
if (isRestart) {
|
||||
const hadLock = lock != null;
|
||||
// Release the lock BEFORE spawning so the child can acquire it immediately.
|
||||
await lock?.release();
|
||||
if (lock) {
|
||||
await lock.release();
|
||||
lock = null;
|
||||
}
|
||||
const respawn = restartGatewayProcessWithFreshPid();
|
||||
if (respawn.mode === "spawned" || respawn.mode === "supervised") {
|
||||
const modeLabel =
|
||||
@@ -102,11 +106,29 @@ export async function runGatewayLoop(params: {
|
||||
} else {
|
||||
gatewayLog.info("restart mode: in-process restart (OPENCLAW_NO_RESPAWN)");
|
||||
}
|
||||
shuttingDown = false;
|
||||
restartResolver?.();
|
||||
let canContinueInProcessRestart = true;
|
||||
if (hadLock) {
|
||||
try {
|
||||
lock = await acquireGatewayLock();
|
||||
} catch (err) {
|
||||
gatewayLog.error(
|
||||
`failed to reacquire gateway lock for in-process restart: ${String(err)}`,
|
||||
);
|
||||
cleanupSignals();
|
||||
params.runtime.exit(1);
|
||||
canContinueInProcessRestart = false;
|
||||
}
|
||||
}
|
||||
if (canContinueInProcessRestart) {
|
||||
shuttingDown = false;
|
||||
restartResolver?.();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
await lock?.release();
|
||||
if (lock) {
|
||||
await lock.release();
|
||||
lock = null;
|
||||
}
|
||||
cleanupSignals();
|
||||
params.runtime.exit(0);
|
||||
}
|
||||
@@ -161,7 +183,10 @@ export async function runGatewayLoop(params: {
|
||||
});
|
||||
}
|
||||
} finally {
|
||||
await lock?.release();
|
||||
if (lock) {
|
||||
await lock.release();
|
||||
lock = null;
|
||||
}
|
||||
cleanupSignals();
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user