351 lines
10 KiB
TypeScript
351 lines
10 KiB
TypeScript
import type { FenceSpan } from "../markdown/fences.js";
|
|
import { findFenceSpanAt, isSafeFenceBreak, parseFenceSpans } from "../markdown/fences.js";
|
|
|
|
/** Size limits and break preferences used when splitting buffered reply text into chunks. */
export type BlockReplyChunking = {
  /** Minimum buffered length before a non-forced drain emits anything; also the smallest allowed break index. */
  minChars: number;
  /** Size of the window searched for a break; when nothing better is found the text is cut at this index. */
  maxChars: number;
  /** Kind of boundary to try first when choosing a break. Treated as "paragraph" when omitted. */
  breakPreference?: "paragraph" | "newline" | "sentence";
  /** When true, flush eagerly on \n\n paragraph boundaries regardless of minChars. */
  flushOnParagraph?: boolean;
};
|
|
|
|
/** Lines inserted around a cut that lands inside a fenced code block. */
type FenceSplit = {
  /** Fence line appended to the emitted chunk so its code block is closed. */
  closeFenceLine: string;
  /** Fence opening line prepended to the remaining text so its code block is reopened. */
  reopenFenceLine: string;
};

/** Outcome of a break search. */
type BreakResult = {
  /** Index in the buffer at which to cut; a value <= 0 means no usable break was found. */
  index: number;
  /** Present only when the cut falls inside a fence and the fence must be closed and reopened. */
  fenceSplit?: FenceSplit;
};
|
|
|
|
/** Location of a paragraph separator inside a buffer. */
type ParagraphBreak = {
  /** Index of the first character of the separator. */
  index: number;
  /** Number of characters the separator spans. */
  length: number;
};
|
|
|
|
/**
 * Finds the last sentence end in `text` that is usable as a break.
 *
 * A sentence end is `.`, `!` or `?` followed by whitespace or the end of the
 * text. The break index is the position just after the punctuation mark.
 *
 * @param text Text to scan.
 * @param fenceSpans Fence spans handed to `isSafeFenceBreak` to vet each candidate.
 * @param minChars Punctuation located before this index is ignored.
 * @returns The break index of the last accepted sentence end, or -1 when there is none.
 */
function findSafeSentenceBreakIndex(
  text: string,
  fenceSpans: FenceSpan[],
  minChars: number,
): number {
  let sentenceIdx = -1;
  for (const match of text.matchAll(/[.!?](?=\s|$)/g)) {
    const at = match.index ?? -1;
    if (at < minChars) {
      continue;
    }
    // Break after the punctuation so it stays with the sentence it ends.
    const candidate = at + 1;
    if (isSafeFenceBreak(fenceSpans, candidate)) {
      // Keep overwriting: the last safe sentence end wins.
      sentenceIdx = candidate;
    }
  }
  // This comparison also rejects every candidate when minChars is NaN.
  return sentenceIdx >= minChars ? sentenceIdx : -1;
}
|
|
|
|
/**
 * Buffers streamed reply text and releases it in chunks sized by a
 * {@link BlockReplyChunking} configuration.
 *
 * Text is added with `append` and released with `drain`. Break positions are
 * only taken where `isSafeFenceBreak` accepts them; when a hard cut lands
 * inside a fence span, the emitted chunk gets a closing fence line and the
 * remainder gets the fence's opening line prepended.
 */
export class EmbeddedBlockChunker {
  /** Text appended so far that has not been emitted yet. */
  #buffer = "";
  readonly #chunking: BlockReplyChunking;

  constructor(chunking: BlockReplyChunking) {
    this.#chunking = chunking;
  }

  /** Adds `text` to the pending buffer; empty input is ignored. */
  append(text: string) {
    if (!text) {
      return;
    }
    this.#buffer += text;
  }

  /** Discards all pending text without emitting it. */
  reset() {
    this.#buffer = "";
  }

  /** The text currently waiting to be emitted. */
  get bufferedText() {
    return this.#buffer;
  }

  /** Whether any text is waiting to be emitted. */
  hasBuffered(): boolean {
    return this.#buffer.length > 0;
  }

  /**
   * Emits as many chunks as the buffered text currently allows.
   *
   * Without `force`, nothing is emitted until at least `minChars` are
   * buffered, and emission stops as soon as fewer than `maxChars` remain.
   * With `force`, the buffer is emptied: a buffer no longer than `maxChars`
   * goes out as a single chunk, a longer one is split repeatedly.
   * Whitespace-only text is dropped rather than emitted.
   *
   * @param params.force Flush everything instead of waiting for more text.
   * @param params.emit Called once per chunk, in order.
   */
  drain(params: { force: boolean; emit: (chunk: string) => void }) {
    // KNOWN: We cannot split inside fenced code blocks (Markdown breaks + UI glitches).
    // When forced (maxChars), we close + reopen the fence to keep Markdown valid.
    const { force, emit } = params;
    const minChars = Math.max(1, Math.floor(this.#chunking.minChars));
    const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));

    // When flushOnParagraph is set (chunkMode="newline"), eagerly split on \n\n
    // boundaries regardless of minChars so each paragraph is sent immediately.
    if (this.#chunking.flushOnParagraph && !force) {
      this.#drainParagraphs(emit, maxChars);
      return;
    }

    if (this.#buffer.length < minChars && !force) {
      return;
    }

    // Forced flush of a buffer that fits in one chunk: send it whole.
    if (force && this.#buffer.length <= maxChars) {
      if (this.#buffer.trim().length > 0) {
        emit(this.#buffer);
      }
      this.#buffer = "";
      return;
    }

    while (this.#buffer.length >= minChars || (force && this.#buffer.length > 0)) {
      // Once a forced flush is down to its last maxChars, only soft breaks are
      // considered; otherwise the full search (including hard cuts) is used.
      const breakResult =
        force && this.#buffer.length <= maxChars
          ? this.#pickSoftBreakIndex(this.#buffer, 1)
          : this.#pickBreakIndex(this.#buffer, force ? 1 : undefined);
      if (breakResult.index <= 0) {
        if (force) {
          // No break left: flush the remainder, but skip whitespace-only
          // residue just like the single-chunk path above does.
          if (this.#buffer.trim().length > 0) {
            emit(this.#buffer);
          }
          this.#buffer = "";
        }
        return;
      }

      // A false result means a whitespace-only prefix was discarded; retry.
      if (!this.#emitBreakResult(breakResult, emit)) {
        continue;
      }

      if (this.#buffer.length < minChars && !force) {
        return;
      }
      if (this.#buffer.length < maxChars && !force) {
        return;
      }
    }
  }

  /** Eagerly emit complete paragraphs (text before \n\n) regardless of minChars. */
  #drainParagraphs(emit: (chunk: string) => void, maxChars: number) {
    while (this.#buffer.length > 0) {
      const fenceSpans = parseFenceSpans(this.#buffer);
      const paragraphBreak = findNextParagraphBreak(this.#buffer, fenceSpans);
      if (!paragraphBreak || paragraphBreak.index > maxChars) {
        // No paragraph boundary yet (or the next boundary is too far). If the
        // buffer exceeds maxChars, fall back to normal break logic to avoid
        // oversized chunks or unbounded accumulation.
        if (this.#buffer.length >= maxChars) {
          const breakResult = this.#pickBreakIndex(this.#buffer, 1);
          if (breakResult.index > 0) {
            this.#emitBreakResult(breakResult, emit);
            continue;
          }
        }
        return;
      }

      const chunk = this.#buffer.slice(0, paragraphBreak.index);
      if (chunk.trim().length > 0) {
        emit(chunk);
      }
      // Drop the separator itself plus any newlines that follow it.
      this.#buffer = stripLeadingNewlines(
        this.#buffer.slice(paragraphBreak.index + paragraphBreak.length),
      );
    }
  }

  /**
   * Cuts the buffer at `breakResult.index`, emits the part before the cut and
   * keeps the rest buffered.
   *
   * @returns `true` when a chunk was emitted; `false` when the index was not
   * positive or the part before the cut was whitespace-only (in which case
   * that part is discarded and the remainder is left-trimmed).
   */
  #emitBreakResult(breakResult: BreakResult, emit: (chunk: string) => void): boolean {
    const breakIdx = breakResult.index;
    if (breakIdx <= 0) {
      return false;
    }

    let rawChunk = this.#buffer.slice(0, breakIdx);
    if (rawChunk.trim().length === 0) {
      this.#buffer = stripLeadingNewlines(this.#buffer.slice(breakIdx)).trimStart();
      return false;
    }

    let nextBuffer = this.#buffer.slice(breakIdx);
    const fenceSplit = breakResult.fenceSplit;
    if (fenceSplit) {
      // Close the fence on its own line at the end of the emitted chunk.
      const closeFence = rawChunk.endsWith("\n")
        ? `${fenceSplit.closeFenceLine}\n`
        : `\n${fenceSplit.closeFenceLine}\n`;
      rawChunk = `${rawChunk}${closeFence}`;

      // Reopen the fence at the start of the remainder.
      // NOTE(review): the remainder grows by the reopen line, so a fence split
      // only shrinks the buffer when the cut index exceeds that line's length —
      // confirm maxChars is always configured well above fence header lengths.
      const reopenFence = fenceSplit.reopenFenceLine.endsWith("\n")
        ? fenceSplit.reopenFenceLine
        : `${fenceSplit.reopenFenceLine}\n`;
      nextBuffer = `${reopenFence}${nextBuffer}`;
    }

    emit(rawChunk);

    if (fenceSplit) {
      this.#buffer = nextBuffer;
    } else {
      // Swallow the single whitespace character the cut landed on, then any
      // newlines that follow it.
      const nextStart =
        breakIdx < this.#buffer.length && /\s/.test(this.#buffer[breakIdx])
          ? breakIdx + 1
          : breakIdx;
      this.#buffer = stripLeadingNewlines(this.#buffer.slice(nextStart));
    }

    return true;
  }

  /**
   * Searches the whole buffer for a soft break at or after the minimum length.
   *
   * Depending on `breakPreference` (default "paragraph") it tries, in order:
   * the first safe `\n\n` position, the first safe `\n`, and the last safe
   * sentence end. "newline" skips the paragraph and sentence steps;
   * "sentence" skips the paragraph and newline steps.
   *
   * @param buffer Text to search.
   * @param minCharsOverride Replaces the configured `minChars` when given.
   * @returns The break found, or `{ index: -1 }` when there is none.
   */
  #pickSoftBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
    const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
    if (buffer.length < minChars) {
      return { index: -1 };
    }
    const fenceSpans = parseFenceSpans(buffer);
    const preference = this.#chunking.breakPreference ?? "paragraph";

    if (preference === "paragraph") {
      let paragraphIdx = buffer.indexOf("\n\n");
      while (paragraphIdx !== -1) {
        // Either newline of the pair may serve as the cut position.
        const candidates = [paragraphIdx, paragraphIdx + 1];
        for (const candidate of candidates) {
          if (candidate < minChars) {
            continue;
          }
          if (candidate < 0 || candidate >= buffer.length) {
            continue;
          }
          if (isSafeFenceBreak(fenceSpans, candidate)) {
            return { index: candidate };
          }
        }
        paragraphIdx = buffer.indexOf("\n\n", paragraphIdx + 2);
      }
    }

    if (preference === "paragraph" || preference === "newline") {
      let newlineIdx = buffer.indexOf("\n");
      while (newlineIdx !== -1) {
        if (newlineIdx >= minChars && isSafeFenceBreak(fenceSpans, newlineIdx)) {
          return { index: newlineIdx };
        }
        newlineIdx = buffer.indexOf("\n", newlineIdx + 1);
      }
    }

    if (preference !== "newline") {
      const sentenceIdx = findSafeSentenceBreakIndex(buffer, fenceSpans, minChars);
      if (sentenceIdx !== -1) {
        return { index: sentenceIdx };
      }
    }

    return { index: -1 };
  }

  /**
   * Searches the first `maxChars` characters of the buffer for the best break.
   *
   * Working backwards from the end of that window it tries, subject to
   * `breakPreference` (default "paragraph"): the last safe `\n\n`, the last
   * safe `\n`, the last safe sentence end, then the last safe whitespace
   * character. If none is found and the buffer has reached `maxChars`, it cuts
   * at `maxChars`, attaching a fence split when that position lies inside a
   * fence span. With preference "newline" the whitespace fallback is skipped
   * while the buffer is still shorter than `maxChars`.
   *
   * @param buffer Text to search.
   * @param minCharsOverride Replaces the configured `minChars` when given.
   * @returns The break found, or `{ index: -1 }` when there is none.
   */
  #pickBreakIndex(buffer: string, minCharsOverride?: number): BreakResult {
    const minChars = Math.max(1, Math.floor(minCharsOverride ?? this.#chunking.minChars));
    const maxChars = Math.max(minChars, Math.floor(this.#chunking.maxChars));
    if (buffer.length < minChars) {
      return { index: -1 };
    }
    const window = buffer.slice(0, Math.min(maxChars, buffer.length));
    const fenceSpans = parseFenceSpans(buffer);

    const preference = this.#chunking.breakPreference ?? "paragraph";
    if (preference === "paragraph") {
      let paragraphIdx = window.lastIndexOf("\n\n");
      while (paragraphIdx >= minChars) {
        // Either newline of the pair may serve as the cut position.
        const candidates = [paragraphIdx, paragraphIdx + 1];
        for (const candidate of candidates) {
          if (candidate < minChars) {
            continue;
          }
          if (candidate < 0 || candidate >= buffer.length) {
            continue;
          }
          if (isSafeFenceBreak(fenceSpans, candidate)) {
            return { index: candidate };
          }
        }
        paragraphIdx = window.lastIndexOf("\n\n", paragraphIdx - 1);
      }
    }

    if (preference === "paragraph" || preference === "newline") {
      let newlineIdx = window.lastIndexOf("\n");
      while (newlineIdx >= minChars) {
        if (isSafeFenceBreak(fenceSpans, newlineIdx)) {
          return { index: newlineIdx };
        }
        newlineIdx = window.lastIndexOf("\n", newlineIdx - 1);
      }
    }

    if (preference !== "newline") {
      const sentenceIdx = findSafeSentenceBreakIndex(window, fenceSpans, minChars);
      if (sentenceIdx !== -1) {
        return { index: sentenceIdx };
      }
    }

    // In newline mode, keep waiting for a newline until the buffer is full.
    if (preference === "newline" && buffer.length < maxChars) {
      return { index: -1 };
    }

    // Fall back to the last safe whitespace character in the window.
    const whitespace = /\s/;
    for (let i = window.length - 1; i >= minChars; i--) {
      if (whitespace.test(window[i]) && isSafeFenceBreak(fenceSpans, i)) {
        return { index: i };
      }
    }

    // Last resort: hard cut at maxChars.
    if (buffer.length >= maxChars) {
      if (isSafeFenceBreak(fenceSpans, maxChars)) {
        return { index: maxChars };
      }
      const fence = findFenceSpanAt(fenceSpans, maxChars);
      if (fence) {
        return {
          index: maxChars,
          fenceSplit: {
            closeFenceLine: `${fence.indent}${fence.marker}`,
            reopenFenceLine: fence.openLine,
          },
        };
      }
      return { index: maxChars };
    }

    return { index: -1 };
  }
}
|
|
|
|
/**
 * Removes every `\n` at the start of `value`.
 *
 * Only line-feed characters are stripped; other whitespace (spaces, tabs,
 * `\r`) stops the scan and is kept.
 *
 * @param value Text to strip.
 * @returns `value` without its leading line feeds, or `value` itself when it has none.
 */
function stripLeadingNewlines(value: string): string {
  let i = 0;
  while (i < value.length && value[i] === "\n") {
    i++;
  }
  return i > 0 ? value.slice(i) : value;
}
|
|
|
|
/**
 * Finds the first paragraph separator in `buffer` that is usable as a break.
 *
 * A separator is a newline, optional spaces or tabs, then one or more
 * newlines. Separators whose start position `isSafeFenceBreak` rejects are
 * skipped.
 *
 * @param buffer Text to search.
 * @param fenceSpans Fence spans handed to `isSafeFenceBreak` to vet each separator.
 * @param startIndex Position to start searching from; a negative value yields `null`.
 * @returns The separator's start index and length, or `null` when none is found.
 */
function findNextParagraphBreak(
  buffer: string,
  fenceSpans: FenceSpan[],
  startIndex = 0,
): ParagraphBreak | null {
  if (startIndex < 0) {
    return null;
  }
  const re = /\n[\t ]*\n+/g;
  re.lastIndex = startIndex;
  let match: RegExpExecArray | null;
  // Every match is at least two characters long, so exec() always advances.
  while ((match = re.exec(buffer)) !== null) {
    const index = match.index;
    if (!isSafeFenceBreak(fenceSpans, index)) {
      continue;
    }
    return { index, length: match[0].length };
  }
  return null;
}
|