Files
openclaw/src/agents/pi-embedded-block-chunker.ts
2026-03-07 16:50:35 +00:00

423 lines
11 KiB
TypeScript

import type { FenceSpan } from "../markdown/fences.js";
import { findFenceSpanAt, isSafeFenceBreak, parseFenceSpans } from "../markdown/fences.js";
export type BlockReplyChunking = {
minChars: number;
maxChars: number;
breakPreference?: "paragraph" | "newline" | "sentence";
/** When true, flush eagerly on \n\n paragraph boundaries regardless of minChars. */
flushOnParagraph?: boolean;
};
type FenceSplit = {
closeFenceLine: string;
reopenFenceLine: string;
fence: FenceSpan;
};
type BreakResult = {
index: number;
fenceSplit?: FenceSplit;
};
type ParagraphBreak = {
index: number;
length: number;
};
function findSafeSentenceBreakIndex(
text: string,
fenceSpans: FenceSpan[],
minChars: number,
offset = 0,
): number {
const matches = text.matchAll(/[.!?](?=\s|$)/g);
let sentenceIdx = -1;
for (const match of matches) {
const at = match.index ?? -1;
if (at < minChars) {
continue;
}
const candidate = at + 1;
if (isSafeFenceBreak(fenceSpans, offset + candidate)) {
sentenceIdx = candidate;
}
}
return sentenceIdx >= minChars ? sentenceIdx : -1;
}
function findSafeParagraphBreakIndex(params: {
text: string;
fenceSpans: FenceSpan[];
minChars: number;
reverse: boolean;
offset?: number;
}): number {
const { text, fenceSpans, minChars, reverse, offset = 0 } = params;
let paragraphIdx = reverse ? text.lastIndexOf("\n\n") : text.indexOf("\n\n");
while (reverse ? paragraphIdx >= minChars : paragraphIdx !== -1) {
const candidates = [paragraphIdx, paragraphIdx + 1];
for (const candidate of candidates) {
if (candidate < minChars) {
continue;
}
if (candidate < 0 || candidate >= text.length) {
continue;
}
if (isSafeFenceBreak(fenceSpans, offset + candidate)) {
return candidate;
}
}
paragraphIdx = reverse
? text.lastIndexOf("\n\n", paragraphIdx - 1)
: text.indexOf("\n\n", paragraphIdx + 2);
}
return -1;
}
function findSafeNewlineBreakIndex(params: {
text: string;
fenceSpans: FenceSpan[];
minChars: number;
reverse: boolean;
offset?: number;
}): number {
const { text, fenceSpans, minChars, reverse, offset = 0 } = params;
let newlineIdx = reverse ? text.lastIndexOf("\n") : text.indexOf("\n");
while (reverse ? newlineIdx >= minChars : newlineIdx !== -1) {
if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, offset + newlineIdx)) {
return newlineIdx;
}
newlineIdx = reverse
? text.lastIndexOf("\n", newlineIdx - 1)
: text.indexOf("\n", newlineIdx + 1);
}
return -1;
}
export class EmbeddedBlockChunker {
#buffer = "";
readonly #chunking: BlockReplyChunking;
constructor(chunking: BlockReplyChunking) {
this.#chunking = chunking;
}
append(text: string) {
if (!text) {
return;
}
this.#buffer += text;
}
reset() {
this.#buffer = "";
}
get bufferedText() {
return this.#buffer;
}
hasBuffered(): boolean {
return this.#buffer.length > 0;
}
drain(params: { force: boolean; emit: (chunk: string) => void }) {
// KNOWN: We cannot split inside fenced code blocks (Markdown breaks + UI glitches).
// When forced (maxChars), we close + reopen the fence to keep Markdown valid.
const { force, emit } = params;
const minChars = Math.max(1, Math.floor(this.#chunking.minChars));
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
if (this.#buffer.length < minChars && !force && !this.#chunking.flushOnParagraph) {
return;
}
if (force && this.#buffer.length <= maxChars) {
if (this.#buffer.trim().length > 0) {
emit(this.#buffer);
}
this.#buffer = "";
return;
}
const source = this.#buffer;
const fenceSpans = parseFenceSpans(source);
let start = 0;
let reopenFence: FenceSpan | undefined;
while (start < source.length) {
const reopenPrefix = reopenFence ? `${reopenFence.openLine}\n` : "";
const remainingLength = reopenPrefix.length + (source.length - start);
if (!force && !this.#chunking.flushOnParagraph && remainingLength < minChars) {
break;
}
if (this.#chunking.flushOnParagraph && !force) {
const paragraphBreak = findNextParagraphBreak(source, fenceSpans, start);
const paragraphLimit = Math.max(1, maxChars - reopenPrefix.length);
if (paragraphBreak && paragraphBreak.index - start <= paragraphLimit) {
const chunk = `${reopenPrefix}${source.slice(start, paragraphBreak.index)}`;
if (chunk.trim().length > 0) {
emit(chunk);
}
start = skipLeadingNewlines(source, paragraphBreak.index + paragraphBreak.length);
reopenFence = undefined;
continue;
}
if (remainingLength < maxChars) {
break;
}
}
const view = source.slice(start);
const breakResult =
force && remainingLength <= maxChars
? this.#pickSoftBreakIndex(view, fenceSpans, 1, start)
: this.#pickBreakIndex(
view,
fenceSpans,
force || this.#chunking.flushOnParagraph ? 1 : undefined,
start,
);
if (breakResult.index <= 0) {
if (force) {
emit(`${reopenPrefix}${source.slice(start)}`);
start = source.length;
reopenFence = undefined;
}
break;
}
const consumed = this.#emitBreakResult({
breakResult,
emit,
reopenPrefix,
source,
start,
});
if (consumed === null) {
continue;
}
start = consumed.start;
reopenFence = consumed.reopenFence;
const nextLength =
(reopenFence ? `${reopenFence.openLine}\n`.length : 0) + (source.length - start);
if (nextLength < minChars && !force && !this.#chunking.flushOnParagraph) {
break;
}
if (nextLength < maxChars && !force && !this.#chunking.flushOnParagraph) {
break;
}
}
this.#buffer = reopenFence
? `${reopenFence.openLine}\n${source.slice(start)}`
: stripLeadingNewlines(source.slice(start));
}
#emitBreakResult(params: {
breakResult: BreakResult;
emit: (chunk: string) => void;
reopenPrefix: string;
source: string;
start: number;
}): { start: number; reopenFence?: FenceSpan } | null {
const { breakResult, emit, reopenPrefix, source, start } = params;
const breakIdx = breakResult.index;
if (breakIdx <= 0) {
return null;
}
const absoluteBreakIdx = start + breakIdx;
let rawChunk = `${reopenPrefix}${source.slice(start, absoluteBreakIdx)}`;
if (rawChunk.trim().length === 0) {
return { start: skipLeadingNewlines(source, absoluteBreakIdx), reopenFence: undefined };
}
const fenceSplit = breakResult.fenceSplit;
if (fenceSplit) {
const closeFence = rawChunk.endsWith("\n")
? `${fenceSplit.closeFenceLine}\n`
: `\n${fenceSplit.closeFenceLine}\n`;
rawChunk = `${rawChunk}${closeFence}`;
}
emit(rawChunk);
if (fenceSplit) {
return { start: absoluteBreakIdx, reopenFence: fenceSplit.fence };
}
const nextStart =
absoluteBreakIdx < source.length && /\s/.test(source[absoluteBreakIdx])
? absoluteBreakIdx + 1
: absoluteBreakIdx;
return { start: skipLeadingNewlines(source, nextStart), reopenFence: undefined };
}
#pickSoftBreakIndex(
buffer: string,
fenceSpans: FenceSpan[],
minCharsOverride?: number,
offset = 0,
): BreakResult {
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
if (buffer.length < minChars) {
return { index: -1 };
}
const preference = this.#chunking.breakPreference ?? "paragraph";
if (preference === "paragraph") {
const paragraphIdx = findSafeParagraphBreakIndex({
text: buffer,
fenceSpans,
minChars,
reverse: false,
offset,
});
if (paragraphIdx !== -1) {
return { index: paragraphIdx };
}
}
if (preference === "paragraph" || preference === "newline") {
const newlineIdx = findSafeNewlineBreakIndex({
text: buffer,
fenceSpans,
minChars,
reverse: false,
offset,
});
if (newlineIdx !== -1) {
return { index: newlineIdx };
}
}
if (preference !== "newline") {
const sentenceIdx = findSafeSentenceBreakIndex(buffer, fenceSpans, minChars, offset);
if (sentenceIdx !== -1) {
return { index: sentenceIdx };
}
}
return { index: -1 };
}
#pickBreakIndex(
buffer: string,
fenceSpans: FenceSpan[],
minCharsOverride?: number,
offset = 0,
): BreakResult {
const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
if (buffer.length < minChars) {
return { index: -1 };
}
const window = buffer.slice(0, Math.min(maxChars, buffer.length));
const preference = this.#chunking.breakPreference ?? "paragraph";
if (preference === "paragraph") {
const paragraphIdx = findSafeParagraphBreakIndex({
text: window,
fenceSpans,
minChars,
reverse: true,
offset,
});
if (paragraphIdx !== -1) {
return { index: paragraphIdx };
}
}
if (preference === "paragraph" || preference === "newline") {
const newlineIdx = findSafeNewlineBreakIndex({
text: window,
fenceSpans,
minChars,
reverse: true,
offset,
});
if (newlineIdx !== -1) {
return { index: newlineIdx };
}
}
if (preference !== "newline") {
const sentenceIdx = findSafeSentenceBreakIndex(window, fenceSpans, minChars, offset);
if (sentenceIdx !== -1) {
return { index: sentenceIdx };
}
}
if (preference === "newline" && buffer.length < maxChars) {
return { index: -1 };
}
for (let i = window.length - 1; i >= minChars; i--) {
if (/\s/.test(window[i]) && isSafeFenceBreak(fenceSpans, offset + i)) {
return { index: i };
}
}
if (buffer.length >= maxChars) {
if (isSafeFenceBreak(fenceSpans, offset + maxChars)) {
return { index: maxChars };
}
const fence = findFenceSpanAt(fenceSpans, offset + maxChars);
if (fence) {
return {
index: maxChars,
fenceSplit: {
closeFenceLine: `${fence.indent}${fence.marker}`,
reopenFenceLine: fence.openLine,
fence,
},
};
}
return { index: maxChars };
}
return { index: -1 };
}
}
function skipLeadingNewlines(value: string, start = 0): number {
let i = start;
while (i < value.length && value[i] === "\n") {
i++;
}
return i;
}
function stripLeadingNewlines(value: string): string {
const start = skipLeadingNewlines(value);
return start > 0 ? value.slice(start) : value;
}
function findNextParagraphBreak(
buffer: string,
fenceSpans: FenceSpan[],
startIndex = 0,
): ParagraphBreak | null {
if (startIndex < 0) {
return null;
}
const re = /\n[\t ]*\n+/g;
re.lastIndex = startIndex;
let match: RegExpExecArray | null;
while ((match = re.exec(buffer)) !== null) {
const index = match.index ?? -1;
if (index < 0) {
continue;
}
if (!isSafeFenceBreak(fenceSpans, index)) {
continue;
}
return { index, length: match[0].length };
}
return null;
}