* feat: web content security wrapping + gkeep/simple-backup skills * fix: harden web fetch + media text detection (#4058) (thanks @VACInc) --------- Co-authored-by: VAC <vac@vacs-mac-mini.localdomain> Co-authored-by: Peter Steinberger <steipete@gmail.com>
549 lines
16 KiB
TypeScript
549 lines
16 KiB
TypeScript
import path from "node:path";
|
|
import type { MsgContext } from "../auto-reply/templating.js";
|
|
import type { OpenClawConfig } from "../config/config.js";
|
|
import type {
|
|
MediaUnderstandingCapability,
|
|
MediaUnderstandingDecision,
|
|
MediaUnderstandingOutput,
|
|
MediaUnderstandingProvider,
|
|
} from "./types.js";
|
|
import { finalizeInboundContext } from "../auto-reply/reply/inbound-context.js";
|
|
import { logVerbose, shouldLogVerbose } from "../globals.js";
|
|
import {
|
|
DEFAULT_INPUT_FILE_MAX_BYTES,
|
|
DEFAULT_INPUT_FILE_MAX_CHARS,
|
|
DEFAULT_INPUT_FILE_MIMES,
|
|
DEFAULT_INPUT_MAX_REDIRECTS,
|
|
DEFAULT_INPUT_PDF_MAX_PAGES,
|
|
DEFAULT_INPUT_PDF_MAX_PIXELS,
|
|
DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
|
|
DEFAULT_INPUT_TIMEOUT_MS,
|
|
extractFileContentFromSource,
|
|
normalizeMimeList,
|
|
normalizeMimeType,
|
|
} from "../media/input-files.js";
|
|
import { resolveAttachmentKind } from "./attachments.js";
|
|
import { runWithConcurrency } from "./concurrency.js";
|
|
import {
|
|
extractMediaUserText,
|
|
formatAudioTranscripts,
|
|
formatMediaUnderstandingBody,
|
|
} from "./format.js";
|
|
import { resolveConcurrency } from "./resolve.js";
|
|
import {
|
|
type ActiveMediaModel,
|
|
buildProviderRegistry,
|
|
createMediaAttachmentCache,
|
|
normalizeMediaAttachments,
|
|
runCapability,
|
|
} from "./runner.js";
|
|
|
|
/**
 * Summary returned by `applyMediaUnderstanding`.
 */
export type ApplyMediaUnderstandingResult = {
  /** Every output collected from the capability runs. */
  outputs: MediaUnderstandingOutput[];
  /** The decision reported by each capability run that returned an entry. */
  decisions: MediaUnderstandingDecision[];
  /** True when at least one output has kind `image.description`. */
  appliedImage: boolean;
  /** True when at least one output has kind `audio.transcription`. */
  appliedAudio: boolean;
  /** True when at least one output has kind `video.description`. */
  appliedVideo: boolean;
  /** True when at least one `<file>` block was appended to the body. */
  appliedFile: boolean;
};
|
|
|
|
/** Capabilities for which a `runCapability` task is created, in this order. */
const CAPABILITY_ORDER: MediaUnderstandingCapability[] = ["image", "audio", "video"];

/**
 * Text-like MIME types that are added to the allow-list when the
 * configuration does not provide its own `allowedMimes`.
 */
const EXTRA_TEXT_MIMES = [
  // XML
  "application/xml",
  "text/xml",
  // YAML
  "application/x-yaml",
  "text/yaml",
  "application/yaml",
  // JavaScript
  "application/javascript",
  "text/javascript",
  // Tab-separated values
  "text/tab-separated-values",
];

/**
 * Lower-cased file extension (including the leading dot) mapped to the MIME
 * type that is forced for attachments carrying that extension.
 */
const TEXT_EXT_MIME = new Map<string, string>([
  // Delimited data
  [".csv", "text/csv"],
  [".tsv", "text/tab-separated-values"],
  // Plain text and markdown
  [".txt", "text/plain"],
  [".md", "text/markdown"],
  // Logs and configuration files
  [".log", "text/plain"],
  [".ini", "text/plain"],
  [".cfg", "text/plain"],
  [".conf", "text/plain"],
  [".env", "text/plain"],
  // Structured formats
  [".json", "application/json"],
  [".yaml", "text/yaml"],
  [".yml", "text/yaml"],
  [".xml", "application/xml"],
]);
|
|
|
|
/**
 * Entity that replaces each character that is unsafe inside an XML attribute
 * value. The values must be the entity forms; mapping a character to itself
 * would make `xmlEscapeAttr` a no-op.
 */
const XML_ESCAPE_MAP: Record<string, string> = {
  "<": "&lt;",
  ">": "&gt;",
  "&": "&amp;",
  '"': "&quot;",
  "'": "&apos;",
};

/**
 * Escapes special XML characters in attribute values to prevent injection.
 *
 * @param value - Raw attribute text (for example a file name or MIME type).
 * @returns `value` with `<`, `>`, `&`, `"` and `'` replaced by their entities.
 */
function xmlEscapeAttr(value: string): string {
  // A single pass over the input, so the "&" of an inserted entity is never
  // escaped a second time.
  return value.replace(/[<>&"']/g, (char) => XML_ESCAPE_MAP[char] ?? char);
}
|
|
|
|
/**
 * Neutralizes `<file>` tag look-alikes inside extracted file content so the
 * content cannot close the surrounding `<file>` block or open a new one.
 *
 * Closing tags (`</file>`, including variants with inner whitespace or other
 * casing) and opening-tag starts (`<file` followed by a word boundary) have
 * their angle brackets replaced with `&lt;` / `&gt;`. All other text is left
 * untouched.
 *
 * @param value - Extracted file text.
 * @returns The text with `<file>` delimiters entity-escaped.
 */
function escapeFileBlockContent(value: string): string {
  return value
    .replace(/<\s*\/\s*file\s*>/gi, "&lt;/file&gt;")
    .replace(/<\s*file\b/gi, "&lt;file");
}
|
|
|
|
/**
 * Reduces a MIME string to its lower-cased `type/subtype` prefix.
 *
 * Anything after the first character that is not allowed in the type or
 * subtype (parameters such as `; charset=...`, quotes, angle brackets) is cut
 * off.
 *
 * @param value - Raw MIME string, possibly with parameters or padding.
 * @returns The `type/subtype` prefix, or `undefined` when the input is
 *   missing, blank, or does not start with a `type/subtype` pair.
 */
function sanitizeMimeType(value?: string): string | undefined {
  const normalized = value?.trim().toLowerCase();
  if (!normalized) {
    return undefined;
  }
  const essence = /^([a-z0-9!#$&^_.+-]+\/[a-z0-9!#$&^_.+-]+)/.exec(normalized);
  return essence?.[1];
}
|
|
|
|
/**
 * Resolves the file-extraction limits from
 * `cfg.gateway.http.endpoints.responses.files`, substituting the
 * `DEFAULT_INPUT_*` constants for every value that is not configured.
 *
 * @param cfg - Application configuration.
 * @returns The effective limits, plus `allowedMimesConfigured`, which is true
 *   only when the configuration lists at least one allowed MIME type.
 */
function resolveFileLimits(cfg: OpenClawConfig) {
  const fileCfg = cfg.gateway?.http?.endpoints?.responses?.files;
  const pdfCfg = fileCfg?.pdf;
  const configuredMimes = fileCfg?.allowedMimes;
  const allowedMimesConfigured = Boolean(configuredMimes && configuredMimes.length > 0);

  return {
    allowUrl: fileCfg?.allowUrl ?? true,
    allowedMimes: normalizeMimeList(configuredMimes, DEFAULT_INPUT_FILE_MIMES),
    allowedMimesConfigured,
    maxBytes: fileCfg?.maxBytes ?? DEFAULT_INPUT_FILE_MAX_BYTES,
    maxChars: fileCfg?.maxChars ?? DEFAULT_INPUT_FILE_MAX_CHARS,
    maxRedirects: fileCfg?.maxRedirects ?? DEFAULT_INPUT_MAX_REDIRECTS,
    timeoutMs: fileCfg?.timeoutMs ?? DEFAULT_INPUT_TIMEOUT_MS,
    pdf: {
      maxPages: pdfCfg?.maxPages ?? DEFAULT_INPUT_PDF_MAX_PAGES,
      maxPixels: pdfCfg?.maxPixels ?? DEFAULT_INPUT_PDF_MAX_PIXELS,
      minTextChars: pdfCfg?.minTextChars ?? DEFAULT_INPUT_PDF_MIN_TEXT_CHARS,
    },
  };
}
|
|
|
|
/**
 * Appends file blocks to a message body, separated by blank lines.
 *
 * @param body - Existing body text, if any.
 * @param blocks - Blocks to append.
 * @returns The untouched body (or `""`) when there are no blocks; otherwise
 *   the trimmed body followed by the blocks, or just the blocks when the body
 *   is missing or blank.
 */
function appendFileBlocks(body: string | undefined, blocks: string[]): string {
  if (!blocks?.length) {
    return body ?? "";
  }
  const joinedBlocks = blocks.join("\n\n").trim();
  const existing = typeof body === "string" ? body.trim() : "";
  return existing ? `${existing}\n\n${joinedBlocks}`.trim() : joinedBlocks;
}
|
|
|
|
/**
 * Guesses whether a buffer holds UTF-16 text and in which byte order.
 *
 * A byte-order mark (FF FE or FE FF) decides immediately. Without one, the
 * first 2048 bytes are scanned for zero bytes: when more than 20% of the
 * sample is zero, zeros sitting mostly at odd offsets mean little-endian and
 * zeros sitting mostly at even offsets mean big-endian.
 *
 * @param buffer - Raw bytes to inspect.
 * @returns The detected charset, or `undefined` when the buffer is missing,
 *   shorter than two bytes, or does not look like UTF-16.
 */
function resolveUtf16Charset(buffer?: Buffer): "utf-16le" | "utf-16be" | undefined {
  if (!buffer || buffer.length < 2) {
    return undefined;
  }

  const first = buffer[0];
  const second = buffer[1];
  if (first === 0xff && second === 0xfe) {
    return "utf-16le";
  }
  if (first === 0xfe && second === 0xff) {
    return "utf-16be";
  }

  const sampleLen = Math.min(buffer.length, 2048);
  let evenZeros = 0;
  let oddZeros = 0;
  for (let offset = 0; offset < sampleLen; offset += 1) {
    if (buffer[offset] !== 0) {
      continue;
    }
    if (offset % 2 === 0) {
      evenZeros += 1;
    } else {
      oddZeros += 1;
    }
  }

  // sampleLen is at least 2 here, so the ratio is always a finite number.
  const zeroRatio = (evenZeros + oddZeros) / sampleLen;
  if (zeroRatio <= 0.2) {
    return undefined;
  }
  return oddZeros >= evenZeros ? "utf-16le" : "utf-16be";
}
|
|
|
|
// Matches a single Unicode letter (\p{L}) or number (\p{N}) character.
const WORDISH_CHAR = /[\p{L}\p{N}]/u;
|
|
/**
 * Replacement characters for bytes 0x80-0x9F, indexed by `byte - 0x80`.
 * Slots holding `undefined` have no replacement; `decodeLegacyText` emits the
 * byte's own code point for them.
 */
const CP1252_MAP: Array<string | undefined> = [
  // 0x80-0x87
  "\u20ac", undefined, "\u201a", "\u0192", "\u201e", "\u2026", "\u2020", "\u2021",
  // 0x88-0x8f
  "\u02c6", "\u2030", "\u0160", "\u2039", "\u0152", undefined, "\u017d", undefined,
  // 0x90-0x97
  undefined, "\u2018", "\u2019", "\u201c", "\u201d", "\u2022", "\u2013", "\u2014",
  // 0x98-0x9f
  "\u02dc", "\u2122", "\u0161", "\u203a", "\u0153", undefined, "\u017e", "\u0178",
];

/**
 * Decodes a buffer one byte per character as a single-byte legacy encoding.
 *
 * Bytes 0x80-0x9F are translated through `CP1252_MAP`; every other byte, and
 * any byte in that range without a table entry, becomes the character whose
 * code point equals the byte value.
 *
 * @param buffer - Raw bytes.
 * @returns The decoded string, one character per input byte.
 */
function decodeLegacyText(buffer: Buffer): string {
  const chars: string[] = [];
  for (const byte of buffer) {
    const inTableRange = byte >= 0x80 && byte <= 0x9f;
    const mapped = inTableRange ? CP1252_MAP[byte - 0x80] : undefined;
    chars.push(mapped ?? String.fromCharCode(byte));
  }
  return chars.join("");
}
|
|
|
|
/**
 * Measures how text-like a string is.
 *
 * Each code point is classified as follows:
 * - tab, line feed, carriage return and space count as printable and wordish;
 * - other code points below 32, and 0x7F-0x9F, count as control characters;
 * - everything else counts as printable, and also as wordish when it matches
 *   `WORDISH_CHAR`.
 *
 * @param text - Text to analyze.
 * @returns Printable and wordish counts as fractions of all classified code
 *   points; both are 0 for an empty string.
 */
function getTextStats(text: string): { printableRatio: number; wordishRatio: number } {
  const counts = { printable: 0, control: 0, wordish: 0 };

  for (const char of text) {
    const code = char.codePointAt(0) ?? 0;
    const isPlainWhitespace = code === 9 || code === 10 || code === 13 || code === 32;
    const isControl = !isPlainWhitespace && (code < 32 || (code >= 0x7f && code <= 0x9f));
    if (isControl) {
      counts.control += 1;
      continue;
    }
    counts.printable += 1;
    if (isPlainWhitespace || WORDISH_CHAR.test(char)) {
      counts.wordish += 1;
    }
  }

  const total = counts.printable + counts.control;
  if (total === 0) {
    return { printableRatio: 0, wordishRatio: 0 };
  }
  return {
    printableRatio: counts.printable / total,
    wordishRatio: counts.wordish / total,
  };
}
|
|
|
|
/**
 * Reports whether more than 85% of the text's characters are printable, as
 * measured by `getTextStats`.
 */
function isMostlyPrintable(text: string): boolean {
  const { printableRatio } = getTextStats(text);
  return printableRatio > 0.85;
}
|
|
|
|
/**
 * Heuristic for text in a single-byte legacy encoding: after decoding with
 * `decodeLegacyText`, more than 95% of the characters must be printable and
 * more than 30% must be wordish.
 *
 * @param buffer - Raw bytes to test.
 * @returns `false` for an empty buffer; otherwise the heuristic's verdict.
 */
function looksLikeLegacyTextBytes(buffer: Buffer): boolean {
  if (buffer.length === 0) {
    return false;
  }
  const stats = getTextStats(decodeLegacyText(buffer));
  return stats.printableRatio > 0.95 && stats.wordishRatio > 0.3;
}
|
|
|
|
/**
 * Reports whether a buffer looks like text, judged on its first 4096 bytes.
 *
 * The sample is decoded as strict UTF-8 and accepted when it is mostly
 * printable. If strict decoding fails, the sample is re-checked with the
 * single-byte legacy heuristic.
 *
 * @param buffer - Raw bytes to test.
 * @returns `false` for a missing or empty buffer; otherwise the verdict of
 *   the UTF-8 check or, when the bytes are not valid UTF-8, of the legacy
 *   check.
 */
function looksLikeUtf8Text(buffer?: Buffer): boolean {
  if (!buffer || buffer.length === 0) {
    return false;
  }
  const sample = buffer.subarray(0, Math.min(buffer.length, 4096));
  // When the sample is a prefix of a larger buffer, the cut may land inside a
  // multi-byte sequence. Decoding in streaming mode holds that incomplete tail
  // back instead of throwing, so valid UTF-8 is not misreported as invalid.
  // Malformed bytes anywhere else still throw.
  const truncated = sample.length < buffer.length;
  try {
    const text = new TextDecoder("utf-8", { fatal: true }).decode(sample, { stream: truncated });
    return isMostlyPrintable(text);
  } catch {
    return looksLikeLegacyTextBytes(sample);
  }
}
|
|
|
|
/**
 * Decodes the first 8192 bytes of a buffer to a string for content sniffing.
 *
 * The charset comes from `resolveUtf16Charset`: UTF-16LE is decoded directly,
 * UTF-16BE is byte-swapped into little-endian order first, and anything else
 * is decoded as (non-strict) UTF-8.
 *
 * @param buffer - Raw bytes.
 * @returns The decoded sample, or `""` for a missing or empty buffer.
 */
function decodeTextSample(buffer?: Buffer): string {
  if (!buffer || buffer.length === 0) {
    return "";
  }
  const sample = buffer.subarray(0, Math.min(buffer.length, 8192));

  switch (resolveUtf16Charset(sample)) {
    case "utf-16le":
      return new TextDecoder("utf-16le").decode(sample);
    case "utf-16be": {
      // Swap each byte pair so the little-endian decoder can be reused; a
      // trailing unpaired byte is left as zero.
      const littleEndian = Buffer.alloc(sample.length);
      for (let i = 0; i + 1 < sample.length; i += 2) {
        littleEndian[i] = sample[i + 1];
        littleEndian[i + 1] = sample[i];
      }
      return new TextDecoder("utf-16le").decode(littleEndian);
    }
    default:
      return new TextDecoder("utf-8").decode(sample);
  }
}
|
|
|
|
/**
 * Guesses a delimited-text MIME type from the first line of a text sample.
 *
 * Tabs and commas on the first line are counted and the more frequent
 * delimiter wins, so a tab-separated header that merely contains a comma
 * inside a field is not reported as CSV. Ties, and lines with commas only,
 * resolve to CSV.
 *
 * @param text - Decoded text sample.
 * @returns `"text/tab-separated-values"`, `"text/csv"`, or `undefined` when
 *   the text is empty or its first line contains neither delimiter.
 */
function guessDelimitedMime(text: string): string | undefined {
  if (!text) {
    return undefined;
  }
  const firstLine = text.split(/\r?\n/, 1)[0] ?? "";
  const tabs = firstLine.split("\t").length - 1;
  const commas = firstLine.split(",").length - 1;
  if (tabs > commas) {
    return "text/tab-separated-values";
  }
  if (commas > 0) {
    return "text/csv";
  }
  return undefined;
}
|
|
|
|
/**
 * Looks up the forced text MIME type for a file name, path or URL by its
 * extension.
 *
 * For URL-like names (`scheme://...`) the extension is taken from the URL's
 * path component, so a query string or fragment (`data.csv?sig=...`) does not
 * hide it. Other names are treated as file-system paths and used verbatim.
 *
 * @param name - File name, path or URL; may be missing or empty.
 * @returns The MIME type registered in `TEXT_EXT_MIME` for the lower-cased
 *   extension, or `undefined` when there is no name or no matching entry.
 */
function resolveTextMimeFromName(name?: string): string | undefined {
  if (!name) {
    return undefined;
  }
  let target = name;
  if (/^[a-z][a-z0-9+.-]*:\/\//i.test(name)) {
    try {
      target = new URL(name).pathname;
    } catch {
      // Not a parseable URL; fall back to treating the raw value as a path.
    }
  }
  const ext = path.extname(target).toLowerCase();
  return TEXT_EXT_MIME.get(ext);
}
|
|
|
|
/**
 * Builds one `<file name="..." mime="...">...</file>` block per attachment
 * whose content can be extracted as text.
 *
 * For each attachment, in order:
 * 1. Skip it when its index is in `skipAttachmentIndexes`, when it is an image
 *    or video without a text extension, or when it is URL-only and URLs are
 *    disabled.
 * 2. Load its bytes through the cache (bounded by `maxBytes` / `timeoutMs`).
 * 3. Pick a MIME type: extension-forced type first, then a delimited-text
 *    guess, then `text/plain` when the bytes look like text, else the reported
 *    MIME type.
 * 4. Skip it when the MIME type is unknown or not in the allow-list. When no
 *    allow-list is configured, `EXTRA_TEXT_MIMES` and any `text/*` type are
 *    also allowed.
 * 5. Extract the text and wrap it in an escaped `<file>` block.
 *
 * Failures while loading or extracting skip the attachment (logged when
 * verbose logging is on); they never reject the returned promise.
 *
 * @param params.attachments - Normalized attachments of the message.
 * @param params.cache - Attachment cache used to load the bytes.
 * @param params.limits - Limits from `resolveFileLimits`.
 * @param params.skipAttachmentIndexes - Attachment indexes to ignore.
 * @returns The file blocks, in attachment order.
 */
async function extractFileBlocks(params: {
  attachments: ReturnType<typeof normalizeMediaAttachments>;
  cache: ReturnType<typeof createMediaAttachmentCache>;
  limits: ReturnType<typeof resolveFileLimits>;
  skipAttachmentIndexes?: Set<number>;
}): Promise<string[]> {
  const { attachments, cache, limits, skipAttachmentIndexes } = params;
  if (!attachments || attachments.length === 0) {
    return [];
  }

  // Loop-invariant parts of the limits, computed once for all attachments.
  const { allowedMimesConfigured, ...baseLimits } = limits;
  const baseAllowedMimes = new Set(limits.allowedMimes);
  if (!allowedMimesConfigured) {
    for (const extra of EXTRA_TEXT_MIMES) {
      baseAllowedMimes.add(extra);
    }
  }

  const blocks: string[] = [];
  for (const attachment of attachments) {
    if (!attachment) {
      continue;
    }
    if (skipAttachmentIndexes?.has(attachment.index)) {
      continue;
    }
    // A known text extension overrides whatever kind the attachment reports.
    const forcedTextMime = resolveTextMimeFromName(attachment.path ?? attachment.url ?? "");
    const kind = forcedTextMime ? "document" : resolveAttachmentKind(attachment);
    if (!forcedTextMime && (kind === "image" || kind === "video")) {
      continue;
    }
    if (!limits.allowUrl && attachment.url && !attachment.path) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (url disabled) index=${attachment.index}`);
      }
      continue;
    }

    let bufferResult: Awaited<ReturnType<typeof cache.getBuffer>>;
    try {
      bufferResult = await cache.getBuffer({
        attachmentIndex: attachment.index,
        maxBytes: limits.maxBytes,
        timeoutMs: limits.timeoutMs,
      });
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (buffer): ${String(err)}`);
      }
      continue;
    }

    // The loaded result may carry a file name the attachment itself lacked.
    const nameHint = bufferResult?.fileName ?? attachment.path ?? attachment.url;
    const forcedTextMimeResolved = forcedTextMime ?? resolveTextMimeFromName(nameHint ?? "");
    const utf16Charset = resolveUtf16Charset(bufferResult?.buffer);
    const textLike = Boolean(utf16Charset) || looksLikeUtf8Text(bufferResult?.buffer);
    if (!forcedTextMimeResolved && kind === "audio" && !textLike) {
      continue;
    }

    // The text sample is only needed for the delimiter guess, so it is decoded
    // only when the bytes look like text.
    const guessedDelimited = textLike
      ? guessDelimitedMime(decodeTextSample(bufferResult?.buffer))
      : undefined;
    const textHint =
      forcedTextMimeResolved ?? guessedDelimited ?? (textLike ? "text/plain" : undefined);
    const rawMime = bufferResult?.mime ?? attachment.mime;
    const mimeType = sanitizeMimeType(textHint ?? normalizeMimeType(rawMime));

    // Log when MIME type is overridden from non-text to text for auditability
    if (textHint && rawMime && !rawMime.startsWith("text/") && shouldLogVerbose()) {
      logVerbose(
        `media: MIME override from "${rawMime}" to "${textHint}" for index=${attachment.index}`,
      );
    }
    if (!mimeType) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (unknown mime) index=${attachment.index}`);
      }
      continue;
    }

    // Per-attachment copy: without a configured allow-list, any text/* type
    // detected for this attachment is accepted as well.
    const allowedMimes = new Set(baseAllowedMimes);
    if (!allowedMimesConfigured && mimeType.startsWith("text/")) {
      allowedMimes.add(mimeType);
    }
    if (!allowedMimes.has(mimeType)) {
      if (shouldLogVerbose()) {
        logVerbose(
          `media: file attachment skipped (unsupported mime ${mimeType}) index=${attachment.index}`,
        );
      }
      continue;
    }

    let extracted: Awaited<ReturnType<typeof extractFileContentFromSource>>;
    try {
      const mediaType = utf16Charset ? `${mimeType}; charset=${utf16Charset}` : mimeType;
      extracted = await extractFileContentFromSource({
        source: {
          type: "base64",
          data: bufferResult.buffer.toString("base64"),
          mediaType,
          filename: bufferResult.fileName,
        },
        limits: {
          ...baseLimits,
          allowedMimes,
        },
      });
    } catch (err) {
      if (shouldLogVerbose()) {
        logVerbose(`media: file attachment skipped (extract): ${String(err)}`);
      }
      continue;
    }

    const text = extracted?.text?.trim() ?? "";
    let blockText = text;
    if (!blockText) {
      if (extracted?.images && extracted.images.length > 0) {
        blockText = "[PDF content rendered to images; images not forwarded to model]";
      } else {
        blockText = "[No extractable text]";
      }
    }

    // Collapse line breaks and tabs so the name stays on the tag's single line.
    const safeName = (bufferResult.fileName ?? `file-${attachment.index + 1}`)
      .replace(/[\r\n\t]+/g, " ")
      .trim();
    // Escape XML special characters in attributes to prevent injection
    blocks.push(
      `<file name="${xmlEscapeAttr(safeName)}" mime="${xmlEscapeAttr(mimeType)}">\n${escapeFileBlockContent(blockText)}\n</file>`,
    );
  }
  return blocks;
}
|
|
|
|
/**
 * Runs media understanding for an inbound message and folds the results into
 * its context.
 *
 * One `runCapability` task per entry of `CAPABILITY_ORDER` is executed with
 * the configured concurrency. Their outputs are formatted into `ctx.Body`,
 * audio transcripts are stored on `ctx.Transcript`, and `ctx.CommandBody` /
 * `ctx.RawBody` are set to the user's original text, or to the transcript when
 * the user sent no text alongside audio. Text extracted from the remaining
 * attachments (all except those already transcribed as audio) is then
 * appended to the body as `<file>` blocks. The attachment cache is cleaned up
 * on every exit path.
 *
 * @param params.ctx - Message context; mutated in place.
 * @param params.cfg - Application configuration.
 * @param params.agentDir - Passed through to `runCapability`.
 * @param params.providers - Providers passed to `buildProviderRegistry`.
 * @param params.activeModel - Passed through to `runCapability`.
 * @returns The collected outputs and decisions, plus flags saying which kinds
 *   of understanding were applied.
 */
export async function applyMediaUnderstanding(params: {
  ctx: MsgContext;
  cfg: OpenClawConfig;
  agentDir?: string;
  providers?: Record<string, MediaUnderstandingProvider>;
  activeModel?: ActiveMediaModel;
}): Promise<ApplyMediaUnderstandingResult> {
  const { ctx, cfg, agentDir, activeModel } = params;

  // Capture the user's own text before the body is rewritten below.
  const originalUserText =
    [ctx.CommandBody, ctx.RawBody, ctx.Body]
      .map((candidate) => extractMediaUserText(candidate))
      .find((candidate) => candidate && candidate.trim()) ?? undefined;

  const attachments = normalizeMediaAttachments(ctx);
  const providerRegistry = buildProviderRegistry(params.providers);
  const cache = createMediaAttachmentCache(attachments);

  try {
    const tasks = CAPABILITY_ORDER.map(
      (capability) => async () =>
        await runCapability({
          capability,
          cfg,
          ctx,
          attachments: cache,
          media: attachments,
          agentDir,
          providerRegistry,
          config: cfg.tools?.media?.[capability],
          activeModel,
        }),
    );
    const results = await runWithConcurrency(tasks, resolveConcurrency(cfg));

    const outputs: MediaUnderstandingOutput[] = [];
    const decisions: MediaUnderstandingDecision[] = [];
    for (const entry of results) {
      if (!entry) {
        continue;
      }
      outputs.push(...entry.outputs);
      decisions.push(entry.decision);
    }
    const hasOutputs = outputs.length > 0;
    const audioOutputs = outputs.filter((output) => output.kind === "audio.transcription");

    if (decisions.length > 0) {
      ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions];
    }

    if (hasOutputs) {
      ctx.Body = formatMediaUnderstandingBody({ body: ctx.Body, outputs });
      if (audioOutputs.length > 0) {
        const transcript = formatAudioTranscripts(audioOutputs);
        ctx.Transcript = transcript;
        // The user's own text wins; the transcript is the fallback command text.
        const commandText = originalUserText || transcript;
        ctx.CommandBody = commandText;
        ctx.RawBody = commandText;
      } else if (originalUserText) {
        ctx.CommandBody = originalUserText;
        ctx.RawBody = originalUserText;
      }
      ctx.MediaUnderstanding = [...(ctx.MediaUnderstanding ?? []), ...outputs];
    }

    // Attachments that were transcribed as audio are not extracted as files.
    const transcribedIndexes = new Set(audioOutputs.map((output) => output.attachmentIndex));
    const fileBlocks = await extractFileBlocks({
      attachments,
      cache,
      limits: resolveFileLimits(cfg),
      skipAttachmentIndexes: transcribedIndexes.size > 0 ? transcribedIndexes : undefined,
    });
    const hasFileBlocks = fileBlocks.length > 0;
    if (hasFileBlocks) {
      ctx.Body = appendFileBlocks(ctx.Body, fileBlocks);
    }

    if (hasOutputs || hasFileBlocks) {
      finalizeInboundContext(ctx, {
        forceBodyForAgent: true,
        forceBodyForCommands: true,
      });
    }

    const hasKind = (kind: MediaUnderstandingOutput["kind"]): boolean =>
      outputs.some((output) => output.kind === kind);
    return {
      outputs,
      decisions,
      appliedImage: hasKind("image.description"),
      appliedAudio: audioOutputs.length > 0,
      appliedVideo: hasKind("video.description"),
      appliedFile: hasFileBlocks,
    };
  } finally {
    await cache.cleanup();
  }
}
|