Files
openclaw/src/agents/pi-embedded-utils.ts
George Zhang 309162f9a2 fix: strip leaked model control tokens from user-facing text (#42173)
Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens in their responses. This change strips them with a generic `<|...|>` pattern in the text extraction pipeline, following the same architecture as stripMinimaxToolCallXml.

Closes #40020
Supersedes #40573

Co-authored-by: imwyvern <100903837+imwyvern@users.noreply.github.com>
2026-03-10 06:27:59 -07:00

454 lines
13 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import type { AgentMessage } from "@mariozechner/pi-agent-core";
import type { AssistantMessage } from "@mariozechner/pi-ai";
import { extractTextFromChatContent } from "../shared/chat-content.js";
import { stripReasoningTagsFromText } from "../shared/text/reasoning-tags.js";
import { sanitizeUserFacingText } from "./pi-embedded-helpers.js";
import { formatToolDetail, resolveToolDisplay } from "./tool-display.js";
/**
 * Type guard: reports whether `msg` is present and carries the "assistant" role.
 */
export function isAssistantMessage(msg: AgentMessage | undefined): msg is AssistantMessage {
  if (msg == null) {
    return false;
  }
  return msg.role === "assistant";
}
/**
 * Remove Minimax tool-call XML that leaked into plain text.
 *
 * Text is only touched when it mentions `minimax:tool_call` (case-insensitive).
 * In that case every `<invoke ...>...</invoke>` block is dropped, followed by
 * any remaining `<minimax:tool_call>` / `</minimax:tool_call>` tags.
 *
 * @param text - Raw text block content.
 * @returns The cleaned text, or the input unchanged when it is empty or has no
 *   Minimax marker.
 */
export function stripMinimaxToolCallXml(text: string): string {
  const mentionsMinimax = Boolean(text) && /minimax:tool_call/i.test(text);
  if (!mentionsMinimax) {
    return text;
  }
  // Lazy quantifier so several invoke blocks are removed one by one.
  const withoutInvokes = text.replace(/<invoke\b[^>]*>[\s\S]*?<\/invoke>/gi, "");
  // Whatever wrapper tags are left over go next.
  return withoutInvokes.replace(/<\/?minimax:tool_call>/gi, "");
}
/**
 * Strip model control tokens leaked into assistant text output.
 *
 * Models like GLM-5 and DeepSeek sometimes emit internal delimiter tokens
 * (e.g. `<|assistant|>`, `<|tool_call_result_begin|>`, or the DeepSeek-style
 * begin-of-sentence token whose delimiters are U+FF5C FULLWIDTH VERTICAL LINE)
 * in their responses. These use the universal `<|...|>` convention (ASCII or
 * full-width pipe variants) and should never reach end users.
 *
 * Each token is replaced by a single space, runs of spaces are then collapsed
 * and the result is trimmed. Text without any token is returned untouched
 * (no collapsing, no trimming).
 *
 * This is a provider bug — no upstream fix tracked yet.
 * Remove this function when upstream providers stop leaking tokens.
 * @see https://github.com/openclaw/openclaw/issues/40020
 *
 * @param text - Assistant text that may contain leaked control tokens.
 * @returns The text with control tokens removed.
 */
// Match both the ASCII pipe `|` and the full-width pipe (U+FF5C) as delimiter.
// The full-width pipe is written as an escape so it cannot be lost or confused
// with the ASCII pipe by editors, diff viewers or copy/paste.
const MODEL_SPECIAL_TOKEN_RE = /<[|\uFF5C][^|\uFF5C]*[|\uFF5C]>/g;
export function stripModelSpecialTokens(text: string): string {
  if (!text) {
    return text;
  }
  // `String.prototype.replace` with a global regex always starts from index 0,
  // so no manual `lastIndex` bookkeeping is needed on the shared regex.
  const replaced = text.replace(MODEL_SPECIAL_TOKEN_RE, " ");
  // A token is at least four characters and is replaced by one space, so an
  // unchanged string means nothing matched: hand the input back as-is.
  if (replaced === text) {
    return text;
  }
  return replaced.replace(/ +/g, " ").trim();
}
/**
* Strip downgraded tool call text representations that leak into text content.
* When replaying history to Gemini, tool calls without `thought_signature` are
* downgraded to text blocks like `[Tool Call: name (ID: ...)]`. These should
* not be shown to users.
*
* Three kinds of markers are removed:
* - `[Tool Call: ...]`, together with an optional `Arguments` payload after it
* - `[Tool Result for ID ...]`, together with the content that follows it
* - `[Historical context: ...]`
*
* @param text - Text that may contain downgraded tool call markers.
* @returns The input unchanged when it is empty or contains none of the
*   markers; otherwise the cleaned text with surrounding whitespace trimmed.
*/
export function stripDowngradedToolCallText(text: string): string {
if (!text) {
return text;
}
// Cheap pre-check: leave the text alone unless one of the markers is present.
if (!/\[Tool (?:Call|Result)/i.test(text) && !/\[Historical context/i.test(text)) {
return text;
}
// Finds the end of a JSON-like value in `input`, scanning from `start`.
// Returns the index just past the value, or null when only whitespace is left
// or an opened object/array/string never closes.
const consumeJsonish = (
input: string,
start: number,
options?: { allowLeadingNewlines?: boolean },
): number | null => {
const { allowLeadingNewlines = false } = options ?? {};
let index = start;
// Skip leading spaces/tabs (and line breaks when the caller allows them).
while (index < input.length) {
const ch = input[index];
if (ch === " " || ch === "\t") {
index += 1;
continue;
}
if (allowLeadingNewlines && (ch === "\n" || ch === "\r")) {
index += 1;
continue;
}
break;
}
if (index >= input.length) {
return null;
}
const startChar = input[index];
// Object or array: track nesting depth, ignoring brackets inside strings.
// Note: `{`/`[` and `}`/`]` share one depth counter; bracket kinds are not
// matched against each other.
if (startChar === "{" || startChar === "[") {
let depth = 0;
let inString = false;
let escape = false;
for (let i = index; i < input.length; i += 1) {
const ch = input[i];
if (inString) {
// Inside a string only a backslash escape or the closing quote matters.
if (escape) {
escape = false;
} else if (ch === "\\") {
escape = true;
} else if (ch === '"') {
inString = false;
}
continue;
}
if (ch === '"') {
inString = true;
continue;
}
if (ch === "{" || ch === "[") {
depth += 1;
continue;
}
if (ch === "}" || ch === "]") {
depth -= 1;
// Back at depth 0: the value that started at `index` is complete.
if (depth === 0) {
return i + 1;
}
}
}
// Ran out of input before the brackets balanced.
return null;
}
// Bare string: ends at the first quote that is not backslash-escaped.
if (startChar === '"') {
let escape = false;
for (let i = index + 1; i < input.length; i += 1) {
const ch = input[i];
if (escape) {
escape = false;
continue;
}
if (ch === "\\") {
escape = true;
continue;
}
if (ch === '"') {
return i + 1;
}
}
return null;
}
// Anything else: consume up to, but not including, the end of the line.
let end = index;
while (end < input.length && input[end] !== "\n" && input[end] !== "\r") {
end += 1;
}
return end;
};
// Copies `input` to the result while skipping every `[Tool Call: ...]` marker
// and the `Arguments` payload that may follow it.
const stripToolCalls = (input: string): string => {
const markerRe = /\[Tool Call:[^\]]*\]/gi;
let result = "";
// `cursor` is the index of the first character not yet copied or skipped.
let cursor = 0;
for (const match of input.matchAll(markerRe)) {
const start = match.index ?? 0;
// Marker lies inside a span that was already skipped (e.g. within a payload).
if (start < cursor) {
continue;
}
result += input.slice(cursor, start);
let index = start + match[0].length;
// After the marker: optional spaces/tabs, at most one line break, then
// optional spaces/tabs again.
while (index < input.length && (input[index] === " " || input[index] === "\t")) {
index += 1;
}
if (input[index] === "\r") {
index += 1;
if (input[index] === "\n") {
index += 1;
}
} else if (input[index] === "\n") {
index += 1;
}
while (index < input.length && (input[index] === " " || input[index] === "\t")) {
index += 1;
}
// Optional "Arguments" label (any letter case), optional colon, optional
// single space, then the payload itself.
if (input.slice(index, index + 9).toLowerCase() === "arguments") {
index += 9;
if (input[index] === ":") {
index += 1;
}
if (input[index] === " ") {
index += 1;
}
const end = consumeJsonish(input, index, { allowLeadingNewlines: true });
// When no payload end is found, only the label itself is skipped.
if (end !== null) {
index = end;
}
}
// Swallow one trailing line break when the kept text already ends with a
// line break or is still empty, so the removal does not leave an empty line.
if (
(input[index] === "\n" || input[index] === "\r") &&
(result.endsWith("\n") || result.endsWith("\r") || result.length === 0)
) {
if (input[index] === "\r") {
index += 1;
}
if (input[index] === "\n") {
index += 1;
}
}
cursor = index;
}
// Copy whatever follows the last marker.
result += input.slice(cursor);
return result;
};
// Remove [Tool Call: name (ID: ...)] blocks and their Arguments.
let cleaned = stripToolCalls(text);
// Remove [Tool Result for ID ...] blocks and their content, up to the next
// "[Tool " marker or the end of the text.
cleaned = cleaned.replace(/\[Tool Result for ID[^\]]*\]\n?[\s\S]*?(?=\n*\[Tool |\n*$)/gi, "");
// Remove [Historical context: ...] markers (self-contained within brackets).
cleaned = cleaned.replace(/\[Historical context:[^\]]*\]\n?/gi, "");
return cleaned.trim();
}
/**
 * Remove thinking tags, and what they enclose, from `text`.
 *
 * Acts as a last line of defence for `<think>`-style tags that other filters
 * did not catch. Delegates to `stripReasoningTagsFromText` in strict mode with
 * trimming on both ends.
 */
export function stripThinkingTagsFromText(text: string): string {
  const strictBothEnds = { mode: "strict", trim: "both" } as const;
  return stripReasoningTagsFromText(text, strictBothEnds);
}
/**
 * Build the user-facing text of an assistant message.
 *
 * Every text chunk is passed through the leak filters in a fixed order
 * (Minimax tool XML, model control tokens, downgraded tool call text, thinking
 * tags) and trimmed; chunks are joined with a newline. The joined text then
 * goes through `sanitizeUserFacingText`.
 */
export function extractAssistantText(msg: AssistantMessage): string {
  const cleanChunk = (text: string): string => {
    const withoutMinimaxXml = stripMinimaxToolCallXml(text);
    const withoutControlTokens = stripModelSpecialTokens(withoutMinimaxXml);
    const withoutToolCallText = stripDowngradedToolCallText(withoutControlTokens);
    return stripThinkingTagsFromText(withoutToolCallText).trim();
  };
  const joined = extractTextFromChatContent(msg.content, {
    sanitizeText: cleanChunk,
    joinWith: "\n",
    normalizeText: (text) => text.trim(),
  });
  const extracted = joined ?? "";
  // Keyword-based error rewrites are reserved for messages that really are
  // errors; ordinary prose that merely talks about errors (e.g. "context
  // overflow") must not be rewritten.
  const stoppedOnError = msg.stopReason === "error";
  const errorContext = stoppedOnError || Boolean(msg.errorMessage?.trim());
  return sanitizeUserFacingText(extracted, { errorContext });
}
/**
 * Collect the text of all structured "thinking" blocks of an assistant message.
 *
 * Each block's text is trimmed, empty results are dropped, and the remainder is
 * joined with newlines. Returns "" when the content is not an array.
 */
export function extractAssistantThinking(msg: AssistantMessage): string {
  if (!Array.isArray(msg.content)) {
    return "";
  }
  const thoughts: string[] = [];
  for (const block of msg.content) {
    if (!block || typeof block !== "object") {
      continue;
    }
    const record = block as unknown as Record<string, unknown>;
    if (record.type === "thinking" && typeof record.thinking === "string") {
      const thought = record.thinking.trim();
      if (thought) {
        thoughts.push(thought);
      }
    }
  }
  return thoughts.join("\n").trim();
}
/**
 * Format reasoning text for display: a plain "Reasoning:" header followed by
 * the trimmed text with every non-empty line wrapped in underscores (italics).
 *
 * @param text - Raw reasoning text.
 * @returns The formatted message, or "" when the text is blank.
 */
export function formatReasoningMessage(text: string): string {
  const body = text.trim();
  if (body.length === 0) {
    return "";
  }
  // Italics suit markdown-friendly surfaces (Discord, etc.); the plain
  // "Reasoning:" prefix is kept so existing parsing/detection keeps working.
  // Underscore markdown cannot span several lines on Telegram, hence the
  // per-line wrapping of non-empty lines.
  const wrapped: string[] = [];
  for (const line of body.split("\n")) {
    wrapped.push(line.length > 0 ? `_${line}_` : line);
  }
  return `Reasoning:\n${wrapped.join("\n")}`;
}
type ThinkTaggedSplitBlock =
  | { type: "thinking"; thinking: string }
  | { type: "text"; text: string };
/**
 * Split text that uses think-style tags into ordered thinking/text blocks.
 *
 * Returns null (meaning "not structured thinking") when the text does not
 * start with `<` after leading whitespace, lacks an opening or closing tag,
 * ends inside an unclosed tag, or yields no non-empty thinking block.
 * Thinking content is trimmed; text segments are kept verbatim and empty
 * segments are skipped.
 */
export function splitThinkingTaggedText(text: string): ThinkTaggedSplitBlock[] | null {
  // Guard against false positives: the text must begin with a tag-like "<"
  // (typical for local/OpenAI-compat providers that emulate reasoning blocks
  // with tags) and must contain both an opening and a closing tag.
  const leading = text.trimStart();
  const looksTagged =
    leading.startsWith("<") &&
    /<\s*(?:think(?:ing)?|thought|antthinking)\s*>/i.test(leading) &&
    /<\s*\/\s*(?:think(?:ing)?|thought|antthinking)\s*>/i.test(text);
  if (!looksTagged) {
    return null;
  }

  const segments: ThinkTaggedSplitBlock[] = [];
  let sawThinking = false;
  // Index just past the currently open tag, or -1 when outside a thinking span.
  let openedAt = -1;
  // Start of the plain-text segment that has not been emitted yet.
  let textFrom = 0;

  const tagRe = /<\s*(\/?)\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi;
  let tag: RegExpExecArray | null;
  while ((tag = tagRe.exec(text)) !== null) {
    const at = tag.index;
    const after = at + tag[0].length;
    const closing = tag[1] === "/";
    if (openedAt < 0) {
      // A closing tag outside a thinking span is ignored.
      if (closing) {
        continue;
      }
      const before = text.slice(textFrom, at);
      if (before) {
        segments.push({ type: "text", text: before });
      }
      openedAt = after;
    } else if (closing) {
      const thought = text.slice(openedAt, at).trim();
      if (thought) {
        segments.push({ type: "thinking", thinking: thought });
        sawThinking = true;
      }
      textFrom = after;
      openedAt = -1;
    }
    // An opening tag inside a thinking span is left as part of that span.
  }

  if (openedAt >= 0) {
    return null;
  }
  const tail = text.slice(textFrom);
  if (tail) {
    segments.push({ type: "text", text: tail });
  }
  return sawThinking ? segments : null;
}
/**
 * Convert think-tagged text blocks of `message` into structured blocks, in place.
 *
 * Nothing happens when the content is not an array, when it already holds a
 * "thinking" block, or when no text block splits via `splitThinkingTaggedText`.
 * Otherwise `message.content` is replaced by a new array in which each
 * splittable text block is expanded into its thinking/text parts; text parts
 * lose leading whitespace and are dropped when that leaves them empty.
 */
export function promoteThinkingTagsToBlocks(message: AssistantMessage): void {
  if (!Array.isArray(message.content)) {
    return;
  }
  const alreadyStructured = message.content.some(
    (block) => block && typeof block === "object" && block.type === "thinking",
  );
  if (alreadyStructured) {
    return;
  }

  const rebuilt: AssistantMessage["content"] = [];
  let promoted = false;
  for (const block of message.content) {
    // Only real text blocks are candidates; anything else is carried over.
    const parts =
      block && typeof block === "object" && "type" in block && block.type === "text"
        ? splitThinkingTaggedText(block.text)
        : null;
    if (!parts) {
      rebuilt.push(block);
      continue;
    }
    promoted = true;
    for (const part of parts) {
      if (part.type === "thinking") {
        rebuilt.push({ type: "thinking", thinking: part.thinking });
        continue;
      }
      const remainder = part.text.trimStart();
      if (remainder) {
        rebuilt.push({ type: "text", text: remainder });
      }
    }
  }

  if (promoted) {
    message.content = rebuilt;
  }
}
/**
 * Return the concatenated content found between think-style tags.
 *
 * Text is collected from after each opening tag up to the next tag of either
 * kind; content after an opening tag that is never followed by another tag is
 * not included. The concatenation is trimmed.
 */
export function extractThinkingFromTaggedText(text: string): string {
  if (!text) {
    return "";
  }
  const tagRe = /<\s*(\/?)\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi;
  const pieces: string[] = [];
  let resumeAt = 0;
  let insideThinking = false;
  let tag: RegExpExecArray | null;
  while ((tag = tagRe.exec(text)) !== null) {
    if (insideThinking) {
      pieces.push(text.slice(resumeAt, tag.index));
    }
    // An opening tag enters (or stays in) a thinking span; a closing tag leaves it.
    insideThinking = tag[1] !== "/";
    resumeAt = tag.index + tag[0].length;
  }
  return pieces.join("").trim();
}
/**
 * Extract thinking content from text that may still be streaming.
 *
 * Content of closed tag spans (via `extractThinkingFromTaggedText`) wins when
 * it is non-empty. Otherwise, if the last opening tag has no closing tag after
 * it, everything following that opening tag is returned, trimmed. Returns ""
 * when there is nothing to extract.
 */
export function extractThinkingFromTaggedStream(text: string): string {
  if (!text) {
    return "";
  }
  const closed = extractThinkingFromTaggedText(text);
  if (closed) {
    return closed;
  }

  const openRe = /<\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi;
  const lastOpen = Array.from(text.matchAll(openRe)).pop();
  if (lastOpen === undefined) {
    return "";
  }

  const closeRe = /<\s*\/\s*(?:think(?:ing)?|thought|antthinking)\s*>/gi;
  const lastClose = Array.from(text.matchAll(closeRe)).pop();
  // A closing tag after the last opening tag means no span is still open.
  const closedAfterOpen =
    lastClose !== undefined && (lastClose.index ?? -1) > (lastOpen.index ?? -1);
  if (closedAfterOpen) {
    return closed;
  }

  const contentStart = (lastOpen.index ?? 0) + lastOpen[0].length;
  return text.slice(contentStart).trim();
}
/**
 * Derive the tool detail string for a tool call by resolving its display info
 * from the tool name and arguments and formatting the result.
 */
export function inferToolMetaFromArgs(toolName: string, args: unknown): string | undefined {
  return formatToolDetail(resolveToolDisplay({ name: toolName, args }));
}