* fix(agents): wait for agent idle before flushing pending tool results

  When pi-agent-core's auto-retry mechanism handles overloaded/rate-limit errors, it resolves waitForRetry() on assistant message receipt — before tool execution completes in the retried agent loop. This causes the attempt's finally block to call flushPendingToolResults() while tools are still executing, inserting synthetic 'missing tool result' errors and causing silent agent failures.

  The fix adds a waitForIdle() call before the flush to ensure the agent's retry loop (including tool execution) has fully completed.

  Evidence from a real session: the tool call and the synthetic error were only 53ms apart — the tool never had a chance to execute before being flushed.

  Root cause is in pi-agent-core's _resolveRetry() firing on message_end instead of agent_end, but this workaround in OpenClaw prevents the symptom without requiring an upstream fix.

  Fixes #8643
  Fixes #13351
  Refs #6682, #12595

* test: add tests for tool result flush race condition

  Validates that:
  - Real tool results are not replaced by synthetic errors when they arrive in time
  - Flush correctly inserts synthetic errors for genuinely orphaned tool calls
  - Flush is a no-op after real tool results have already been received

  Refs #8643, #13748

* fix(agents): add waitForIdle to all flushPendingToolResults call sites

  The original fix only covered the main run finally block, but there are two additional call sites that can trigger flushPendingToolResults while tools are still executing:
  1. The catch block in attempt.ts (session setup error handler)
  2. The finally block in compact.ts (compaction teardown)

  Both now await agent.waitForIdle() with a 30s timeout before flushing, matching the pattern already applied to the main finally block.

  Production testing on VPS with debug logging confirmed these additional paths can fire during sub-agent runs, producing spurious synthetic 'missing tool result' errors.

* fix(agents): centralize idle-wait flush and clear timeout handle

---------

Co-authored-by: Renue Development <dev@renuebyscience.com>
Co-authored-by: Peter Steinberger <steipete@gmail.com>
46 lines
1.2 KiB
TypeScript
46 lines
1.2 KiB
TypeScript
/**
 * Minimal structural view of an agent that may be able to report when it has
 * gone idle. Only the single hook this module calls is described.
 */
type IdleAwareAgent = {
  /** Optional hook; when it is a function, it is awaited before pending tool results are flushed. */
  waitForIdle?: (() => Promise<void>) | undefined;
};
|
|
|
|
/**
 * Minimal structural view of a session manager that may be able to flush tool
 * results that are still pending. Only the single hook this module calls is
 * described.
 */
type ToolResultFlushManager = {
  /** Optional hook; it is invoked synchronously and its return value is ignored. */
  flushPendingToolResults?: (() => void) | undefined;
};
|
|
|
|
/** Default upper bound, in milliseconds, on the idle wait performed before flushing when the caller gives no `timeoutMs`. */
export const DEFAULT_WAIT_FOR_IDLE_TIMEOUT_MS = 30_000;
|
|
|
|
/**
 * Waits for `agent.waitForIdle()` to settle, giving up after `timeoutMs`.
 *
 * Best-effort by design: this function never rejects. A missing agent, a
 * missing `waitForIdle` hook, a wait that throws or rejects, and a timeout all
 * end in a normal return.
 *
 * @param agent - Agent to wait on. `null`/`undefined`, or an agent whose
 *   `waitForIdle` is not a function, makes this return immediately.
 * @param timeoutMs - Upper bound on the wait, in milliseconds. Values above
 *   the largest delay `setTimeout` supports (2^31 - 1 ms) are clamped to that
 *   maximum instead of collapsing to an almost immediate timeout.
 * @returns A promise that resolves once the wait settled or the timeout fired.
 */
async function waitForAgentIdleBestEffort(
  agent: IdleAwareAgent | null | undefined,
  timeoutMs: number,
): Promise<void> {
  const waitForIdle = agent?.waitForIdle;
  if (typeof waitForIdle !== "function") {
    return;
  }

  // Node coerces delays above 2^31 - 1 ms (including Infinity) to 1 ms, which
  // would turn a very long timeout into a near-immediate one. Clamp instead.
  const maxTimerDelayMs = 2_147_483_647;
  const delayMs = Math.min(timeoutMs, maxTimerDelayMs);

  let timeoutHandle: ReturnType<typeof setTimeout> | undefined;
  try {
    await Promise.race([
      // Call with the agent as receiver so a method-style hook keeps its `this`.
      waitForIdle.call(agent),
      new Promise<void>((resolve) => {
        timeoutHandle = setTimeout(resolve, delayMs);
        // Where the handle supports it, do not let this fallback timer keep
        // the process alive on its own.
        timeoutHandle.unref?.();
      }),
    ]);
  } catch {
    // Best-effort during cleanup.
  } finally {
    // Always release the timer, whichever side of the race finished first.
    if (timeoutHandle) {
      clearTimeout(timeoutHandle);
    }
  }
}
|
|
|
|
/**
 * Waits (best-effort) for the agent to become idle, then flushes the session
 * manager's pending tool results.
 *
 * The flush is attempted after the wait returns, whether the agent went idle,
 * its wait failed, or the timeout elapsed. An error thrown by the flush hook
 * itself is not caught here and propagates to the caller.
 *
 * @param opts.agent - Agent to wait on; may be `null`/`undefined` or lack a
 *   `waitForIdle` hook, in which case no waiting happens.
 * @param opts.sessionManager - Owner of the flush hook; when it is
 *   `null`/`undefined` or has no `flushPendingToolResults`, nothing is flushed.
 * @param opts.timeoutMs - Upper bound on the idle wait in milliseconds;
 *   falls back to `DEFAULT_WAIT_FOR_IDLE_TIMEOUT_MS` when omitted.
 * @returns A promise that resolves after the flush hook has been called or skipped.
 */
export async function flushPendingToolResultsAfterIdle(opts: {
  agent: IdleAwareAgent | null | undefined;
  sessionManager: ToolResultFlushManager | null | undefined;
  timeoutMs?: number;
}): Promise<void> {
  // `??` (not `||`) so an explicit timeout of 0 is honoured rather than replaced.
  await waitForAgentIdleBestEffort(opts.agent, opts.timeoutMs ?? DEFAULT_WAIT_FOR_IDLE_TIMEOUT_MS);
  // Called as a method so the hook keeps the session manager as its receiver.
  opts.sessionManager?.flushPendingToolResults?.();
}
|