fix: reset stale execution state after SIGUSR1 in-process restart (#15195)

Merged via /review-pr -> /prepare-pr -> /merge-pr.

Prepared head SHA: 676f9ec45135be0d3471bb0444bc2ac7ce7d5224
Co-authored-by: joeykrug <5925937+joeykrug@users.noreply.github.com>
Co-authored-by: gumadeiras <5599352+gumadeiras@users.noreply.github.com>
Reviewed-by: @gumadeiras
This commit is contained in:
Joseph Krug
2026-02-13 16:30:09 -04:00
committed by GitHub
parent 2086cdfb9b
commit 4e9f933e88
11 changed files with 572 additions and 20 deletions

View File

@@ -6,7 +6,12 @@ import {
isGatewaySigusr1RestartExternallyAllowed,
} from "../../infra/restart.js";
import { createSubsystemLogger } from "../../logging/subsystem.js";
import { getActiveTaskCount, waitForActiveTasks } from "../../process/command-queue.js";
import {
getActiveTaskCount,
resetAllLanes,
waitForActiveTasks,
} from "../../process/command-queue.js";
import { createRestartIterationHook } from "../../process/restart-recovery.js";
const gatewayLog = createSubsystemLogger("gateway");
@@ -111,10 +116,21 @@ export async function runGatewayLoop(params: {
process.on("SIGUSR1", onSigusr1);
try {
const onIteration = createRestartIterationHook(() => {
// After an in-process restart (SIGUSR1), reset command-queue lane state.
// Interrupted tasks from the previous lifecycle may have left `active`
// counts elevated (their finally blocks never ran), permanently blocking
// new work from draining. This must happen here — at the restart
// coordinator level — rather than inside individual subsystem init
// functions, to avoid surprising cross-cutting side effects.
resetAllLanes();
});
// Keep process alive; SIGUSR1 triggers an in-process restart (no supervisor required).
// SIGTERM/SIGINT still exit after a graceful shutdown.
// eslint-disable-next-line no-constant-condition
while (true) {
onIteration();
server = await params.start();
await new Promise<void>((resolve) => {
restartResolver = resolve;