diff --git a/docs/gateway/configuration.md b/docs/gateway/configuration.md index e8bc990db..6baee7e6d 100644 --- a/docs/gateway/configuration.md +++ b/docs/gateway/configuration.md @@ -2039,6 +2039,7 @@ of `every`, keep `HEARTBEAT.md` tiny, and/or choose a cheaper `model`. - `tools.web.search.cacheTtlMinutes` (default 15) - `tools.web.fetch.enabled` (default true) - `tools.web.fetch.maxChars` (default 50000) +- `tools.web.fetch.maxCharsCap` (default 50000; clamps maxChars from config/tool calls) - `tools.web.fetch.timeoutSeconds` (default 30) - `tools.web.fetch.cacheTtlMinutes` (default 15) - `tools.web.fetch.userAgent` (optional override) diff --git a/docs/tools/index.md b/docs/tools/index.md index 6a3974e99..a2c741af2 100644 --- a/docs/tools/index.md +++ b/docs/tools/index.md @@ -252,6 +252,7 @@ Core parameters: Notes: - Enable via `tools.web.fetch.enabled`. +- `maxChars` is clamped by `tools.web.fetch.maxCharsCap` (default 50000). - Responses are cached (default 15 min). - For JS-heavy sites, prefer the browser tool. - See [Web tools](/tools/web) for setup. diff --git a/docs/tools/web.md b/docs/tools/web.md index ab374b49c..4c1ff47b6 100644 --- a/docs/tools/web.md +++ b/docs/tools/web.md @@ -221,6 +221,7 @@ Fetch a URL and extract readable content. fetch: { enabled: true, maxChars: 50000, + maxCharsCap: 50000, timeoutSeconds: 30, cacheTtlMinutes: 15, maxRedirects: 3, @@ -252,6 +253,7 @@ Notes: - Firecrawl requests use bot-circumvention mode and cache results by default. - `web_fetch` sends a Chrome-like User-Agent and `Accept-Language` by default; override `userAgent` if needed. - `web_fetch` blocks private/internal hostnames and re-checks redirects (limit with `maxRedirects`). +- `maxChars` is clamped to `tools.web.fetch.maxCharsCap`. - `web_fetch` is best-effort extraction; some sites will need the browser tool. - See [Firecrawl](/tools/firecrawl) for key setup and service details. - Responses are cached (default 15 minutes) to reduce repeated fetches. 
diff --git a/src/agents/tools/web-fetch.ts b/src/agents/tools/web-fetch.ts index 6cfb4e57e..31ffaab11 100644 --- a/src/agents/tools/web-fetch.ts +++ b/src/agents/tools/web-fetch.ts @@ -95,6 +95,17 @@ function resolveFetchReadabilityEnabled(fetch?: WebFetchConfig): boolean { return true; } +function resolveFetchMaxCharsCap(fetch?: WebFetchConfig): number { + const raw = + fetch && "maxCharsCap" in fetch && typeof fetch.maxCharsCap === "number" + ? fetch.maxCharsCap + : undefined; + if (typeof raw !== "number" || !Number.isFinite(raw)) { + return DEFAULT_FETCH_MAX_CHARS; + } + return Math.max(100, Math.floor(raw)); +} + function resolveFirecrawlConfig(fetch?: WebFetchConfig): FirecrawlFetchConfig { if (!fetch || typeof fetch !== "object") { return undefined; @@ -160,9 +171,10 @@ function resolveFirecrawlMaxAgeMsOrDefault(firecrawl?: FirecrawlFetchConfig): nu return DEFAULT_FIRECRAWL_MAX_AGE_MS; } -function resolveMaxChars(value: unknown, fallback: number): number { +function resolveMaxChars(value: unknown, fallback: number, cap: number): number { const parsed = typeof value === "number" && Number.isFinite(value) ? value : fallback; - return Math.max(100, Math.floor(parsed)); + const clamped = Math.max(100, Math.floor(parsed)); + return Math.min(clamped, cap); } function resolveMaxRedirects(value: unknown, fallback: number): number { @@ -647,10 +659,15 @@ export function createWebFetchTool(options?: { const url = readStringParam(params, "url", { required: true }); const extractMode = readStringParam(params, "extractMode") === "text" ? "text" : "markdown"; const maxChars = readNumberParam(params, "maxChars", { integer: true }); + const maxCharsCap = resolveFetchMaxCharsCap(fetch); const result = await runWebFetch({ url, extractMode, - maxChars: resolveMaxChars(maxChars ?? fetch?.maxChars, DEFAULT_FETCH_MAX_CHARS), + maxChars: resolveMaxChars( + maxChars ?? 
fetch?.maxChars, + DEFAULT_FETCH_MAX_CHARS, + maxCharsCap, + ), maxRedirects: resolveMaxRedirects(fetch?.maxRedirects, DEFAULT_FETCH_MAX_REDIRECTS), timeoutSeconds: resolveTimeoutSeconds(fetch?.timeoutSeconds, DEFAULT_TIMEOUT_SECONDS), cacheTtlMs: resolveCacheTtlMs(fetch?.cacheTtlMinutes, DEFAULT_CACHE_TTL_MINUTES), diff --git a/src/agents/tools/web-tools.fetch.test.ts b/src/agents/tools/web-tools.fetch.test.ts index 9ced0e23e..b916fc582 100644 --- a/src/agents/tools/web-tools.fetch.test.ts +++ b/src/agents/tools/web-tools.fetch.test.ts @@ -49,6 +49,20 @@ function firecrawlError(): MockResponse { }; } +function textResponse( + text: string, + url = "https://example.com/", + contentType = "text/plain; charset=utf-8", +): MockResponse { + return { + ok: true, + status: 200, + url, + headers: makeHeaders({ "content-type": contentType }), + text: async () => text, + }; +} + function errorHtmlResponse( html: string, status = 404, @@ -322,6 +336,37 @@ describe("web_fetch extraction fallbacks", () => { expect(details.extractor).toBe("firecrawl"); expect(details.text).toContain("firecrawl fallback"); }); + + it("wraps external content and clamps oversized maxChars", async () => { + const large = "a".repeat(80_000); + const mockFetch = vi.fn( + (input: RequestInfo) => + Promise.resolve(textResponse(large, requestUrl(input))) as Promise<Response>, + ); + // @ts-expect-error mock fetch + global.fetch = mockFetch; + + const tool = createWebFetchTool({ + config: { + tools: { + web: { + fetch: { cacheTtlMinutes: 0, firecrawl: { enabled: false }, maxCharsCap: 10_000 }, + }, + }, + }, + sandboxed: false, + }); + + const result = await tool?.execute?.("call", { + url: "https://example.com/large", + maxChars: 200_000, + }); + const details = result?.details as { text?: string; length?: number; truncated?: boolean }; + expect(details.text).toContain("<<<EXTERNAL_UNTRUSTED_CONTENT>>>"); + expect(details.text).toContain("Source: Web Fetch"); + expect(details.length).toBeLessThanOrEqual(10_000); + 
expect(details.truncated).toBe(true); + }); it("strips and truncates HTML from error responses", async () => { const long = "x".repeat(12_000); const html = diff --git a/src/config/schema.ts b/src/config/schema.ts index 09bcd038c..c918d38f0 100644 --- a/src/config/schema.ts +++ b/src/config/schema.ts @@ -483,6 +483,8 @@ const FIELD_HELP: Record<string, string> = { 'Perplexity model override (default: "perplexity/sonar-pro").', "tools.web.fetch.enabled": "Enable the web_fetch tool (lightweight HTTP fetch).", "tools.web.fetch.maxChars": "Max characters returned by web_fetch (truncated).", + "tools.web.fetch.maxCharsCap": + "Hard cap for web_fetch maxChars (applies to config and tool calls).", "tools.web.fetch.timeoutSeconds": "Timeout in seconds for web_fetch requests.", "tools.web.fetch.cacheTtlMinutes": "Cache TTL in minutes for web_fetch results.", "tools.web.fetch.maxRedirects": "Maximum redirects allowed for web_fetch (default: 3).", diff --git a/src/config/types.tools.ts b/src/config/types.tools.ts index db32cb59d..b08032427 100644 --- a/src/config/types.tools.ts +++ b/src/config/types.tools.ts @@ -361,6 +361,8 @@ export type ToolsConfig = { enabled?: boolean; /** Max characters to return from fetched content. */ maxChars?: number; + /** Hard cap for maxChars (tool or config), defaults to 50000. */ + maxCharsCap?: number; /** Timeout in seconds for fetch requests. */ timeoutSeconds?: number; /** Cache TTL in minutes for fetched content. 
*/ diff --git a/src/config/zod-schema.agent-runtime.ts b/src/config/zod-schema.agent-runtime.ts index 1314397e3..c63742218 100644 --- a/src/config/zod-schema.agent-runtime.ts +++ b/src/config/zod-schema.agent-runtime.ts @@ -191,6 +191,7 @@ export const ToolsWebFetchSchema = z .object({ enabled: z.boolean().optional(), maxChars: z.number().int().positive().optional(), + maxCharsCap: z.number().int().positive().optional(), timeoutSeconds: z.number().int().positive().optional(), cacheTtlMinutes: z.number().nonnegative().optional(), maxRedirects: z.number().int().nonnegative().optional(),