Files
openclaw/src/gateway/server-methods/update.ts
Rami Abdelrazzaq 0b8b95f2c9 fix(update): prevent gateway crash loop after failed self-update
The gateway unconditionally scheduled a SIGUSR1 restart after every
update.run call, even when the update itself failed (broken deps,
build errors, etc.). This left the process restarting into a broken
state — corrupted node_modules, partial builds — causing a crash loop
that required manual intervention.

Three fixes:

1. Only restart on success: scheduleGatewaySigusr1Restart is now
   gated on result.status === "ok". Failed or skipped updates still
   write the restart sentinel (so the status can be reported back to
   the user) but the running gateway stays alive.

2. Early bail on step failure: deps install, build, and ui:build now
   check exit codes immediately (matching the preflight section) so a
   failed deps install no longer cascades into a broken build and
   ui:build.

3. Auto-repair config during update: the doctor step now runs with
   --fix alongside --non-interactive, so unknown config keys left over
   from schema changes between versions are stripped automatically
   instead of causing a startup validation crash.
2026-02-16 23:54:49 +01:00

119 lines
3.7 KiB
TypeScript

import type { GatewayRequestHandlers } from "./types.js";
import { loadConfig } from "../../config/config.js";
import { extractDeliveryInfo } from "../../config/sessions.js";
import { resolveOpenClawPackageRoot } from "../../infra/openclaw-root.js";
import {
formatDoctorNonInteractiveHint,
type RestartSentinelPayload,
writeRestartSentinel,
} from "../../infra/restart-sentinel.js";
import { scheduleGatewaySigusr1Restart } from "../../infra/restart.js";
import { normalizeUpdateChannel } from "../../infra/update-channels.js";
import { runGatewayUpdate } from "../../infra/update-runner.js";
import { validateUpdateRunParams } from "../protocol/index.js";
import { parseRestartRequestParams } from "./restart-request.js";
import { assertValidParams } from "./validation.js";
/**
 * Gateway request handlers for the self-update methods.
 */
export const updateHandlers: GatewayRequestHandlers = {
  /**
   * Handles `update.run`.
   *
   * Flow:
   *  1. Validate the request params (the validator is handed `respond`, and
   *     the handler simply returns when validation fails).
   *  2. Run the gateway update against the resolved package root, using the
   *     update channel from the loaded config.
   *  3. Write a restart sentinel describing the outcome, whatever it was.
   *  4. Schedule a gateway restart only when the update status is `"ok"`.
   *  5. Respond with the update result, the restart info (or `null`) and
   *     the sentinel path/payload.
   *
   * `respond` is always invoked with `true` as its first argument once the
   * params are valid; the outcome of the update itself is conveyed through
   * the `ok` field of the response body (`false` only for status `"error"`).
   */
  "update.run": async ({ params, respond }) => {
    if (!assertValidParams(params, validateUpdateRunParams, "update.run", respond)) {
      return;
    }

    type UpdateResult = Awaited<ReturnType<typeof runGatewayUpdate>>;

    const { sessionKey, note, restartDelayMs } = parseRestartRequestParams(params);
    const { deliveryContext, threadId } = extractDeliveryInfo(sessionKey);

    // Accept only finite numeric timeouts; floor them and clamp to >= 1000.
    // Anything else leaves the timeout undefined.
    const requestedTimeout = (params as { timeoutMs?: unknown }).timeoutMs;
    let timeoutMs: number | undefined;
    if (typeof requestedTimeout === "number" && Number.isFinite(requestedTimeout)) {
      timeoutMs = Math.max(1000, Math.floor(requestedTimeout));
    }

    // Any exception thrown while preparing or running the update is turned
    // into a synthetic "error" result so the rest of the handler can treat
    // thrown and reported failures uniformly.
    const attemptUpdate = async (): Promise<UpdateResult> => {
      try {
        const channel = normalizeUpdateChannel(loadConfig().update?.channel);
        const detectedRoot = await resolveOpenClawPackageRoot({
          moduleUrl: import.meta.url,
          argv1: process.argv[1],
          cwd: process.cwd(),
        });
        // Fall back to the current working directory when no root is found.
        const updateRoot = detectedRoot ?? process.cwd();
        return await runGatewayUpdate({
          timeoutMs,
          cwd: updateRoot,
          argv1: process.argv[1],
          channel: channel ?? undefined,
        });
      } catch (err) {
        return {
          status: "error",
          mode: "unknown",
          reason: String(err),
          steps: [],
          durationMs: 0,
        };
      }
    };
    const result = await attemptUpdate();

    // The sentinel records the outcome (including per-step log tails) along
    // with the session/delivery details of the request.
    const payload: RestartSentinelPayload = {
      kind: "update",
      status: result.status,
      ts: Date.now(),
      sessionKey,
      deliveryContext,
      threadId,
      message: note ?? null,
      doctorHint: formatDoctorNonInteractiveHint(),
      stats: {
        mode: result.mode,
        root: result.root ?? undefined,
        before: result.before ?? null,
        after: result.after ?? null,
        steps: result.steps.map(
          ({ name, command, cwd, durationMs, stdoutTail, stderrTail, exitCode }) => ({
            name,
            command,
            cwd,
            durationMs,
            log: {
              stdoutTail: stdoutTail ?? null,
              stderrTail: stderrTail ?? null,
              exitCode: exitCode ?? null,
            },
          }),
        ),
        reason: result.reason ?? null,
        durationMs: result.durationMs,
      },
    };

    // Writing the sentinel is best-effort: on failure the path is reported
    // as null and the handler carries on.
    let sentinelPath: string | null = null;
    try {
      sentinelPath = await writeRestartSentinel(payload);
    } catch {
      // Intentionally ignored; sentinelPath stays null.
    }

    // Only restart the gateway when the update actually succeeded.
    // Restarting after a failed update leaves the process in a broken state
    // (corrupted node_modules, partial builds) and causes a crash loop.
    let restart: ReturnType<typeof scheduleGatewaySigusr1Restart> | null = null;
    if (result.status === "ok") {
      restart = scheduleGatewaySigusr1Restart({
        delayMs: restartDelayMs,
        reason: "update.run",
      });
    }

    const responseBody = {
      ok: result.status !== "error",
      result,
      restart,
      sentinel: {
        path: sentinelPath,
        payload,
      },
    };
    respond(true, responseBody, undefined);
  },
};