fix: codex and similar processes keep dying on pty, solved by refactoring process spawning (#14257)
* exec: clean up PTY resources on timeout and exit * cli: harden resume cleanup and watchdog stalled runs * cli: productionize PTY and resume reliability paths * docs: add PTY process supervision architecture plan * docs: rewrite PTY supervision plan as pre-rewrite baseline * docs: switch PTY supervision plan to one-go execution * docs: add one-line root cause to PTY supervision plan * docs: add OS contracts and test matrix to PTY supervision plan * docs: define process-supervisor package placement and scope * docs: tie supervisor plan to existing CI lanes * docs: place PTY supervisor plan under src/process * refactor(process): route exec and cli runs through supervisor * docs(process): refresh PTY supervision plan * wip * fix(process): harden supervisor timeout and PTY termination * fix(process): harden supervisor adapters env and wait handling * ci: avoid failing formal conformance on comment permissions * test(ui): fix cron request mock argument typing * fix(ui): remove leftover conflict marker * fix: supervise PTY processes (#14257) (openclaw#14257) (thanks @onutc)
This commit is contained in:
@@ -1,17 +1,17 @@
|
||||
import type { AgentToolResult } from "@mariozechner/pi-agent-core";
|
||||
import type { ChildProcessWithoutNullStreams } from "node:child_process";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import path from "node:path";
|
||||
import type { ExecAsk, ExecHost, ExecSecurity } from "../infra/exec-approvals.js";
|
||||
import type { ProcessSession, SessionStdin } from "./bash-process-registry.js";
|
||||
import type { ProcessSession } from "./bash-process-registry.js";
|
||||
import type { ExecToolDetails } from "./bash-tools.exec.js";
|
||||
import type { BashSandboxConfig } from "./bash-tools.shared.js";
|
||||
import { requestHeartbeatNow } from "../infra/heartbeat-wake.js";
|
||||
import { mergePathPrepend } from "../infra/path-prepend.js";
|
||||
import { enqueueSystemEvent } from "../infra/system-events.js";
|
||||
export { applyPathPrepend, normalizePathPrepend } from "../infra/path-prepend.js";
|
||||
import type { ManagedRun } from "../process/supervisor/index.js";
|
||||
import { logWarn } from "../logger.js";
|
||||
import { formatSpawnError, spawnWithFallback } from "../process/spawn-utils.js";
|
||||
import { getProcessSupervisor } from "../process/supervisor/index.js";
|
||||
import {
|
||||
addSession,
|
||||
appendOutput,
|
||||
@@ -23,7 +23,6 @@ import {
|
||||
buildDockerExecArgs,
|
||||
chunkString,
|
||||
clampWithDefault,
|
||||
killSession,
|
||||
readEnvInt,
|
||||
} from "./bash-tools.shared.js";
|
||||
import { buildCursorPositionResponse, stripDsrRequests } from "./pty-dsr.js";
|
||||
@@ -147,26 +146,6 @@ export const execSchema = Type.Object({
|
||||
),
|
||||
});
|
||||
|
||||
type PtyExitEvent = { exitCode: number; signal?: number };
|
||||
type PtyListener<T> = (event: T) => void;
|
||||
type PtyHandle = {
|
||||
pid: number;
|
||||
write: (data: string | Buffer) => void;
|
||||
onData: (listener: PtyListener<string>) => void;
|
||||
onExit: (listener: PtyListener<PtyExitEvent>) => void;
|
||||
};
|
||||
type PtySpawn = (
|
||||
file: string,
|
||||
args: string[] | string,
|
||||
options: {
|
||||
name?: string;
|
||||
cols?: number;
|
||||
rows?: number;
|
||||
cwd?: string;
|
||||
env?: Record<string, string>;
|
||||
},
|
||||
) => PtyHandle;
|
||||
|
||||
export type ExecProcessOutcome = {
|
||||
status: "completed" | "failed";
|
||||
exitCode: number | null;
|
||||
@@ -319,138 +298,10 @@ export async function runExecProcess(opts: {
|
||||
}): Promise<ExecProcessHandle> {
|
||||
const startedAt = Date.now();
|
||||
const sessionId = createSessionSlug();
|
||||
let child: ChildProcessWithoutNullStreams | null = null;
|
||||
let pty: PtyHandle | null = null;
|
||||
let stdin: SessionStdin | undefined;
|
||||
const execCommand = opts.execCommand ?? opts.command;
|
||||
const supervisor = getProcessSupervisor();
|
||||
|
||||
const spawnFallbacks = [
|
||||
{
|
||||
label: "no-detach",
|
||||
options: { detached: false },
|
||||
},
|
||||
];
|
||||
|
||||
const handleSpawnFallback = (err: unknown, fallback: { label: string }) => {
|
||||
const errText = formatSpawnError(err);
|
||||
const warning = `Warning: spawn failed (${errText}); retrying with ${fallback.label}.`;
|
||||
logWarn(`exec: spawn failed (${errText}); retrying with ${fallback.label}.`);
|
||||
opts.warnings.push(warning);
|
||||
};
|
||||
|
||||
const spawnShellChild = async (
|
||||
shell: string,
|
||||
shellArgs: string[],
|
||||
): Promise<ChildProcessWithoutNullStreams> => {
|
||||
const { child: spawned } = await spawnWithFallback({
|
||||
argv: [shell, ...shellArgs, execCommand],
|
||||
options: {
|
||||
cwd: opts.workdir,
|
||||
env: opts.env,
|
||||
detached: process.platform !== "win32",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
windowsHide: true,
|
||||
},
|
||||
fallbacks: spawnFallbacks,
|
||||
onFallback: handleSpawnFallback,
|
||||
});
|
||||
return spawned as ChildProcessWithoutNullStreams;
|
||||
};
|
||||
|
||||
// `exec` does not currently accept tool-provided stdin content. For non-PTY runs,
|
||||
// keeping stdin open can cause commands like `wc -l` (or safeBins-hardened segments)
|
||||
// to block forever waiting for input, leading to accidental backgrounding.
|
||||
// For interactive flows, callers should use `pty: true` (stdin kept open).
|
||||
const maybeCloseNonPtyStdin = () => {
|
||||
if (opts.usePty) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
// Signal EOF immediately so stdin-only commands can terminate.
|
||||
child?.stdin?.end();
|
||||
} catch {
|
||||
// ignore stdin close errors
|
||||
}
|
||||
};
|
||||
|
||||
if (opts.sandbox) {
|
||||
const { child: spawned } = await spawnWithFallback({
|
||||
argv: [
|
||||
"docker",
|
||||
...buildDockerExecArgs({
|
||||
containerName: opts.sandbox.containerName,
|
||||
command: execCommand,
|
||||
workdir: opts.containerWorkdir ?? opts.sandbox.containerWorkdir,
|
||||
env: opts.env,
|
||||
tty: opts.usePty,
|
||||
}),
|
||||
],
|
||||
options: {
|
||||
cwd: opts.workdir,
|
||||
env: process.env,
|
||||
detached: process.platform !== "win32",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
windowsHide: true,
|
||||
},
|
||||
fallbacks: spawnFallbacks,
|
||||
onFallback: handleSpawnFallback,
|
||||
});
|
||||
child = spawned as ChildProcessWithoutNullStreams;
|
||||
stdin = child.stdin;
|
||||
maybeCloseNonPtyStdin();
|
||||
} else if (opts.usePty) {
|
||||
const { shell, args: shellArgs } = getShellConfig();
|
||||
try {
|
||||
const ptyModule = (await import("@lydell/node-pty")) as unknown as {
|
||||
spawn?: PtySpawn;
|
||||
default?: { spawn?: PtySpawn };
|
||||
};
|
||||
const spawnPty = ptyModule.spawn ?? ptyModule.default?.spawn;
|
||||
if (!spawnPty) {
|
||||
throw new Error("PTY support is unavailable (node-pty spawn not found).");
|
||||
}
|
||||
pty = spawnPty(shell, [...shellArgs, execCommand], {
|
||||
cwd: opts.workdir,
|
||||
env: opts.env,
|
||||
name: process.env.TERM ?? "xterm-256color",
|
||||
cols: 120,
|
||||
rows: 30,
|
||||
});
|
||||
stdin = {
|
||||
destroyed: false,
|
||||
write: (data, cb) => {
|
||||
try {
|
||||
pty?.write(data);
|
||||
cb?.(null);
|
||||
} catch (err) {
|
||||
cb?.(err as Error);
|
||||
}
|
||||
},
|
||||
end: () => {
|
||||
try {
|
||||
const eof = process.platform === "win32" ? "\x1a" : "\x04";
|
||||
pty?.write(eof);
|
||||
} catch {
|
||||
// ignore EOF errors
|
||||
}
|
||||
},
|
||||
};
|
||||
} catch (err) {
|
||||
const errText = String(err);
|
||||
const warning = `Warning: PTY spawn failed (${errText}); retrying without PTY for \`${opts.command}\`.`;
|
||||
logWarn(`exec: PTY spawn failed (${errText}); retrying without PTY for "${opts.command}".`);
|
||||
opts.warnings.push(warning);
|
||||
child = await spawnShellChild(shell, shellArgs);
|
||||
stdin = child.stdin;
|
||||
}
|
||||
} else {
|
||||
const { shell, args: shellArgs } = getShellConfig();
|
||||
child = await spawnShellChild(shell, shellArgs);
|
||||
stdin = child.stdin;
|
||||
maybeCloseNonPtyStdin();
|
||||
}
|
||||
|
||||
const session = {
|
||||
const session: ProcessSession = {
|
||||
id: sessionId,
|
||||
command: opts.command,
|
||||
scopeKey: opts.scopeKey,
|
||||
@@ -458,9 +309,9 @@ export async function runExecProcess(opts: {
|
||||
notifyOnExit: opts.notifyOnExit,
|
||||
notifyOnExitEmptySuccess: opts.notifyOnExitEmptySuccess === true,
|
||||
exitNotified: false,
|
||||
child: child ?? undefined,
|
||||
stdin,
|
||||
pid: child?.pid ?? pty?.pid,
|
||||
child: undefined,
|
||||
stdin: undefined,
|
||||
pid: undefined,
|
||||
startedAt,
|
||||
cwd: opts.workdir,
|
||||
maxOutputChars: opts.maxOutput,
|
||||
@@ -477,59 +328,9 @@ export async function runExecProcess(opts: {
|
||||
exitSignal: undefined as NodeJS.Signals | number | null | undefined,
|
||||
truncated: false,
|
||||
backgrounded: false,
|
||||
} satisfies ProcessSession;
|
||||
};
|
||||
addSession(session);
|
||||
|
||||
let settled = false;
|
||||
let timeoutTimer: NodeJS.Timeout | null = null;
|
||||
let timeoutFinalizeTimer: NodeJS.Timeout | null = null;
|
||||
let timedOut = false;
|
||||
const timeoutFinalizeMs = 1000;
|
||||
let resolveFn: ((outcome: ExecProcessOutcome) => void) | null = null;
|
||||
|
||||
const settle = (outcome: ExecProcessOutcome) => {
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
settled = true;
|
||||
resolveFn?.(outcome);
|
||||
};
|
||||
|
||||
const finalizeTimeout = () => {
|
||||
if (session.exited) {
|
||||
return;
|
||||
}
|
||||
markExited(session, null, "SIGKILL", "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
const aggregated = session.aggregated.trim();
|
||||
const reason = `Command timed out after ${opts.timeoutSec} seconds`;
|
||||
settle({
|
||||
status: "failed",
|
||||
exitCode: null,
|
||||
exitSignal: "SIGKILL",
|
||||
durationMs: Date.now() - startedAt,
|
||||
aggregated,
|
||||
timedOut: true,
|
||||
reason: aggregated ? `${aggregated}\n\n${reason}` : reason,
|
||||
});
|
||||
};
|
||||
|
||||
const onTimeout = () => {
|
||||
timedOut = true;
|
||||
killSession(session);
|
||||
if (!timeoutFinalizeTimer) {
|
||||
timeoutFinalizeTimer = setTimeout(() => {
|
||||
finalizeTimeout();
|
||||
}, timeoutFinalizeMs);
|
||||
}
|
||||
};
|
||||
|
||||
if (opts.timeoutSec > 0) {
|
||||
timeoutTimer = setTimeout(() => {
|
||||
onTimeout();
|
||||
}, opts.timeoutSec * 1000);
|
||||
}
|
||||
|
||||
const emitUpdate = () => {
|
||||
if (!opts.onUpdate) {
|
||||
return;
|
||||
@@ -565,116 +366,208 @@ export async function runExecProcess(opts: {
|
||||
}
|
||||
};
|
||||
|
||||
if (pty) {
|
||||
const cursorResponse = buildCursorPositionResponse();
|
||||
pty.onData((data) => {
|
||||
const raw = data.toString();
|
||||
const { cleaned, requests } = stripDsrRequests(raw);
|
||||
if (requests > 0) {
|
||||
const timeoutMs =
|
||||
typeof opts.timeoutSec === "number" && opts.timeoutSec > 0
|
||||
? Math.floor(opts.timeoutSec * 1000)
|
||||
: undefined;
|
||||
|
||||
const spawnSpec:
|
||||
| {
|
||||
mode: "child";
|
||||
argv: string[];
|
||||
env: NodeJS.ProcessEnv;
|
||||
stdinMode: "pipe-open" | "pipe-closed";
|
||||
}
|
||||
| {
|
||||
mode: "pty";
|
||||
ptyCommand: string;
|
||||
childFallbackArgv: string[];
|
||||
env: NodeJS.ProcessEnv;
|
||||
stdinMode: "pipe-open";
|
||||
} = (() => {
|
||||
if (opts.sandbox) {
|
||||
return {
|
||||
mode: "child" as const,
|
||||
argv: [
|
||||
"docker",
|
||||
...buildDockerExecArgs({
|
||||
containerName: opts.sandbox.containerName,
|
||||
command: execCommand,
|
||||
workdir: opts.containerWorkdir ?? opts.sandbox.containerWorkdir,
|
||||
env: opts.env,
|
||||
tty: opts.usePty,
|
||||
}),
|
||||
],
|
||||
env: process.env,
|
||||
stdinMode: opts.usePty ? ("pipe-open" as const) : ("pipe-closed" as const),
|
||||
};
|
||||
}
|
||||
const { shell, args: shellArgs } = getShellConfig();
|
||||
const childArgv = [shell, ...shellArgs, execCommand];
|
||||
if (opts.usePty) {
|
||||
return {
|
||||
mode: "pty" as const,
|
||||
ptyCommand: execCommand,
|
||||
childFallbackArgv: childArgv,
|
||||
env: opts.env,
|
||||
stdinMode: "pipe-open" as const,
|
||||
};
|
||||
}
|
||||
return {
|
||||
mode: "child" as const,
|
||||
argv: childArgv,
|
||||
env: opts.env,
|
||||
stdinMode: "pipe-closed" as const,
|
||||
};
|
||||
})();
|
||||
|
||||
let managedRun: ManagedRun | null = null;
|
||||
let usingPty = spawnSpec.mode === "pty";
|
||||
const cursorResponse = buildCursorPositionResponse();
|
||||
|
||||
const onSupervisorStdout = (chunk: string) => {
|
||||
if (usingPty) {
|
||||
const { cleaned, requests } = stripDsrRequests(chunk);
|
||||
if (requests > 0 && managedRun?.stdin) {
|
||||
for (let i = 0; i < requests; i += 1) {
|
||||
pty.write(cursorResponse);
|
||||
managedRun.stdin.write(cursorResponse);
|
||||
}
|
||||
}
|
||||
handleStdout(cleaned);
|
||||
});
|
||||
} else if (child) {
|
||||
child.stdout.on("data", handleStdout);
|
||||
child.stderr.on("data", handleStderr);
|
||||
}
|
||||
return;
|
||||
}
|
||||
handleStdout(chunk);
|
||||
};
|
||||
|
||||
const promise = new Promise<ExecProcessOutcome>((resolve) => {
|
||||
resolveFn = resolve;
|
||||
const handleExit = (code: number | null, exitSignal: NodeJS.Signals | number | null) => {
|
||||
if (timeoutTimer) {
|
||||
clearTimeout(timeoutTimer);
|
||||
}
|
||||
if (timeoutFinalizeTimer) {
|
||||
clearTimeout(timeoutFinalizeTimer);
|
||||
try {
|
||||
const spawnBase = {
|
||||
runId: sessionId,
|
||||
sessionId: opts.sessionKey?.trim() || sessionId,
|
||||
backendId: opts.sandbox ? "exec-sandbox" : "exec-host",
|
||||
scopeKey: opts.scopeKey,
|
||||
cwd: opts.workdir,
|
||||
env: spawnSpec.env,
|
||||
timeoutMs,
|
||||
captureOutput: false,
|
||||
onStdout: onSupervisorStdout,
|
||||
onStderr: handleStderr,
|
||||
};
|
||||
managedRun =
|
||||
spawnSpec.mode === "pty"
|
||||
? await supervisor.spawn({
|
||||
...spawnBase,
|
||||
mode: "pty",
|
||||
ptyCommand: spawnSpec.ptyCommand,
|
||||
})
|
||||
: await supervisor.spawn({
|
||||
...spawnBase,
|
||||
mode: "child",
|
||||
argv: spawnSpec.argv,
|
||||
stdinMode: spawnSpec.stdinMode,
|
||||
});
|
||||
} catch (err) {
|
||||
if (spawnSpec.mode === "pty") {
|
||||
const warning = `Warning: PTY spawn failed (${String(err)}); retrying without PTY for \`${opts.command}\`.`;
|
||||
logWarn(
|
||||
`exec: PTY spawn failed (${String(err)}); retrying without PTY for "${opts.command}".`,
|
||||
);
|
||||
opts.warnings.push(warning);
|
||||
usingPty = false;
|
||||
try {
|
||||
managedRun = await supervisor.spawn({
|
||||
runId: sessionId,
|
||||
sessionId: opts.sessionKey?.trim() || sessionId,
|
||||
backendId: "exec-host",
|
||||
scopeKey: opts.scopeKey,
|
||||
mode: "child",
|
||||
argv: spawnSpec.childFallbackArgv,
|
||||
cwd: opts.workdir,
|
||||
env: spawnSpec.env,
|
||||
stdinMode: "pipe-open",
|
||||
timeoutMs,
|
||||
captureOutput: false,
|
||||
onStdout: handleStdout,
|
||||
onStderr: handleStderr,
|
||||
});
|
||||
} catch (retryErr) {
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
throw retryErr;
|
||||
}
|
||||
} else {
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
session.stdin = managedRun.stdin;
|
||||
session.pid = managedRun.pid;
|
||||
|
||||
const promise = managedRun
|
||||
.wait()
|
||||
.then((exit): ExecProcessOutcome => {
|
||||
const durationMs = Date.now() - startedAt;
|
||||
const wasSignal = exitSignal != null;
|
||||
const isSuccess = code === 0 && !wasSignal && !timedOut;
|
||||
const status: "completed" | "failed" = isSuccess ? "completed" : "failed";
|
||||
markExited(session, code, exitSignal, status);
|
||||
const status: "completed" | "failed" =
|
||||
exit.exitCode === 0 && exit.reason === "exit" ? "completed" : "failed";
|
||||
markExited(session, exit.exitCode, exit.exitSignal, status);
|
||||
maybeNotifyOnExit(session, status);
|
||||
if (!session.child && session.stdin) {
|
||||
session.stdin.destroyed = true;
|
||||
}
|
||||
|
||||
if (settled) {
|
||||
return;
|
||||
}
|
||||
const aggregated = session.aggregated.trim();
|
||||
if (!isSuccess) {
|
||||
const reason = timedOut
|
||||
? `Command timed out after ${opts.timeoutSec} seconds`
|
||||
: wasSignal && exitSignal
|
||||
? `Command aborted by signal ${exitSignal}`
|
||||
: code === null
|
||||
? "Command aborted before exit code was captured"
|
||||
: `Command exited with code ${code}`;
|
||||
const message = aggregated ? `${aggregated}\n\n${reason}` : reason;
|
||||
settle({
|
||||
status: "failed",
|
||||
exitCode: code ?? null,
|
||||
exitSignal: exitSignal ?? null,
|
||||
if (status === "completed") {
|
||||
return {
|
||||
status: "completed",
|
||||
exitCode: exit.exitCode ?? 0,
|
||||
exitSignal: exit.exitSignal,
|
||||
durationMs,
|
||||
aggregated,
|
||||
timedOut,
|
||||
reason: message,
|
||||
});
|
||||
return;
|
||||
timedOut: false,
|
||||
};
|
||||
}
|
||||
settle({
|
||||
status: "completed",
|
||||
exitCode: code ?? 0,
|
||||
exitSignal: exitSignal ?? null,
|
||||
const reason =
|
||||
exit.reason === "overall-timeout"
|
||||
? `Command timed out after ${opts.timeoutSec} seconds`
|
||||
: exit.reason === "no-output-timeout"
|
||||
? "Command timed out waiting for output"
|
||||
: exit.exitSignal != null
|
||||
? `Command aborted by signal ${exit.exitSignal}`
|
||||
: exit.exitCode == null
|
||||
? "Command aborted before exit code was captured"
|
||||
: `Command exited with code ${exit.exitCode}`;
|
||||
return {
|
||||
status: "failed",
|
||||
exitCode: exit.exitCode,
|
||||
exitSignal: exit.exitSignal,
|
||||
durationMs,
|
||||
aggregated,
|
||||
timedOut: exit.timedOut,
|
||||
reason: aggregated ? `${aggregated}\n\n${reason}` : reason,
|
||||
};
|
||||
})
|
||||
.catch((err): ExecProcessOutcome => {
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
const aggregated = session.aggregated.trim();
|
||||
const message = aggregated ? `${aggregated}\n\n${String(err)}` : String(err);
|
||||
return {
|
||||
status: "failed",
|
||||
exitCode: null,
|
||||
exitSignal: null,
|
||||
durationMs: Date.now() - startedAt,
|
||||
aggregated,
|
||||
timedOut: false,
|
||||
});
|
||||
};
|
||||
|
||||
if (pty) {
|
||||
pty.onExit((event) => {
|
||||
const rawSignal = event.signal ?? null;
|
||||
const normalizedSignal = rawSignal === 0 ? null : rawSignal;
|
||||
handleExit(event.exitCode ?? null, normalizedSignal);
|
||||
});
|
||||
} else if (child) {
|
||||
child.once("close", (code, exitSignal) => {
|
||||
handleExit(code, exitSignal);
|
||||
});
|
||||
|
||||
child.once("error", (err) => {
|
||||
if (timeoutTimer) {
|
||||
clearTimeout(timeoutTimer);
|
||||
}
|
||||
if (timeoutFinalizeTimer) {
|
||||
clearTimeout(timeoutFinalizeTimer);
|
||||
}
|
||||
markExited(session, null, null, "failed");
|
||||
maybeNotifyOnExit(session, "failed");
|
||||
const aggregated = session.aggregated.trim();
|
||||
const message = aggregated ? `${aggregated}\n\n${String(err)}` : String(err);
|
||||
settle({
|
||||
status: "failed",
|
||||
exitCode: null,
|
||||
exitSignal: null,
|
||||
durationMs: Date.now() - startedAt,
|
||||
aggregated,
|
||||
timedOut,
|
||||
reason: message,
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
reason: message,
|
||||
};
|
||||
});
|
||||
|
||||
return {
|
||||
session,
|
||||
startedAt,
|
||||
pid: session.pid ?? undefined,
|
||||
promise,
|
||||
kill: () => killSession(session),
|
||||
kill: () => {
|
||||
managedRun?.cancel("manual-cancel");
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user