diff --git a/src/memory/chunking/markdown-boundaries.ts b/src/memory/chunking/markdown-boundaries.ts index 52c6dd6b579..60986549092 100644 --- a/src/memory/chunking/markdown-boundaries.ts +++ b/src/memory/chunking/markdown-boundaries.ts @@ -18,7 +18,7 @@ export function isHeading(line: string): boolean { */ export function isCodeBlockFence(line: string): boolean { const trimmed = line.trim(); - return /^```/.test(trimmed); + return /^```|^~~~/.test(trimmed); } /** @@ -71,7 +71,7 @@ export function isStructuralBoundary(line: string): boolean { * * @param line - The line to split * @param maxLength - Maximum length for the split - * @returns Index to split at, or -1 if no good split found + * @returns Index to split at (always a valid index between 0 and maxLength inclusive) */ export function findSafeSplitPoint(line: string, maxLength: number): number { if (line.length <= maxLength) { @@ -89,15 +89,11 @@ export function findSafeSplitPoint(line: string, maxLength: number): number { } // Priority 2: Common punctuation that can end a segment - const punctuation = [".", ",", ";", ":", "!", "?", ")", "]", "}"]; - for (const punct of punctuation) { - for (let i = searchEnd; i > 0; i--) { - if (line[i - 1] === punct) { - // Make sure it's not part of something like "..." or "::" - if (i < line.length && line[i] !== punct) { - return i; - } - } + // Single pass to find the rightmost punctuation of any type + const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]); + for (let i = searchEnd; i > 0; i--) { + if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) { + return i; } } @@ -171,3 +167,118 @@ export function getLineRole(line: string): LineRole { if (isThematicBreak(line)) return "thematic_break"; return "content"; } + +/** + * Get the heading level (1-6) from a heading line. + * Returns 0 if the line is not a heading. 
/**
 * Get the heading level (1-6) from an ATX-style heading line.
 *
 * Leading whitespace is ignored. The run of `#` characters must be one to six
 * long and must be followed by a whitespace character.
 *
 * @param line - The line to inspect
 * @returns The number of leading `#` characters (1-6), or 0 if the line is not a heading
 */
export function getHeadingLevel(line: string): number {
  const match = /^(#{1,6})\s/.exec(line.trimStart());
  return match?.[1]?.length ?? 0;
}

/**
 * Calculate the UTF-8 byte size of a string.
 *
 * @param text - The string to measure
 * @returns The encoded size in bytes; 0 for an empty string
 */
function estimateUtf8Bytes(text: string): number {
  return text ? Buffer.byteLength(text, "utf8") : 0;
}

/**
 * Find the longest prefix of `text` that fits in `maxBytes` UTF-8 bytes.
 *
 * The returned index never falls between the two halves of a surrogate pair,
 * so slicing at it cannot produce a lone surrogate.
 *
 * @returns Prefix length in UTF-16 code units (0 if nothing fits)
 */
function longestPrefixWithinBytes(text: string, maxBytes: number): number {
  // Every UTF-16 code unit encodes to at least one UTF-8 byte, so a prefix
  // that fits can never be longer than maxBytes code units.
  let low = 0;
  let high = Math.min(text.length, maxBytes);
  let best = 0;

  while (low <= high) {
    const mid = Math.floor((low + high) / 2);
    if (estimateUtf8Bytes(text.slice(0, mid)) <= maxBytes) {
      best = mid;
      low = mid + 1;
    } else {
      high = mid - 1;
    }
  }

  // A prefix ending on a high surrogate can "fit" while the complete pair
  // does not. Back off so the pair stays together.
  if (best > 0 && best < text.length) {
    const before = text.charCodeAt(best - 1);
    const after = text.charCodeAt(best);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      best -= 1;
    }
  }

  return best;
}

/**
 * Look for a natural break at or before `limit`, scanning right to left.
 *
 * Priority 1 is a word boundary (a space followed by a non-space); the cut is
 * placed before the space. Priority 2 is a punctuation character that is not
 * followed by the same character (so "..." or "::" are not broken up); the
 * cut is placed after the punctuation.
 *
 * @returns A cut index in the range 1..limit, or 0 when no natural break exists
 */
function findNaturalBreak(text: string, limit: number, punctuation: ReadonlySet<string>): number {
  // Stop at i === 2 so the cut index (i - 1) is always > 0; a cut at 0 would
  // make no progress.
  for (let i = limit; i > 1; i--) {
    if (text.charAt(i - 1) === " " && text.charAt(i) !== " ") {
      return i - 1;
    }
  }

  for (let i = limit; i > 0; i--) {
    const ch = text.charAt(i - 1);
    if (punctuation.has(ch) && text.charAt(i) !== ch) {
      return i;
    }
  }

  return 0;
}

/**
 * Split a long line at safe points, respecting UTF-8 byte limits.
 *
 * The split is byte-aware, so multi-byte characters (CJK, emoji, etc.) are
 * measured by their encoded size rather than by character count. Cuts are
 * placed, in order of preference, at a word boundary, after punctuation, or
 * at the last code point that still fits. A surrogate pair is never split.
 *
 * Whitespace adjacent to a cut is trimmed from both sides, and segments that
 * would be empty after trimming are omitted. Every segment is at most
 * `maxBytes` bytes, except when a single code point is itself larger than
 * `maxBytes`; that code point is then emitted alone so the loop always makes
 * progress.
 *
 * @param line - The line to split
 * @param maxBytes - Maximum UTF-8 bytes per segment
 * @returns Array of line segments (always at least one element)
 */
export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
  if (estimateUtf8Bytes(line) <= maxBytes) {
    return [line];
  }

  const punctuation: ReadonlySet<string> = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
  const segments: string[] = [];
  let remaining = line;

  while (remaining.length > 0) {
    if (estimateUtf8Bytes(remaining) <= maxBytes) {
      segments.push(remaining);
      break;
    }

    const fitEnd = longestPrefixWithinBytes(remaining, maxBytes);
    let splitPoint = findNaturalBreak(remaining, fitEnd, punctuation);

    if (splitPoint <= 0) {
      if (fitEnd > 0) {
        // No natural boundary: hard cut at the longest prefix that fits.
        splitPoint = fitEnd;
      } else {
        // Not even one code point fits; emit it whole rather than tearing
        // a surrogate pair apart or looping forever.
        const first = remaining.codePointAt(0);
        splitPoint = first !== undefined && first > 0xffff ? 2 : 1;
      }
    }

    const segment = remaining.slice(0, splitPoint).trimEnd();
    if (segment.length > 0) {
      segments.push(segment);
    }
    remaining = remaining.slice(splitPoint).trimStart();
  }

  // A whitespace-only line longer than maxBytes trims away completely; keep
  // the "at least one segment" shape callers get in every other case.
  return segments.length > 0 ? segments : [""];
}
+ // The SimpleChunker handles basic UTF-8 byte-aware chunking; use + // SemanticChunker for structure-aware splitting. const lines = content.split("\n"); // Handle empty content - split("\n") on empty string returns [""] if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) { @@ -119,9 +122,9 @@ export class SimpleChunker implements ChunkStrategy { return [entry]; } - // Split the long line at natural boundaries + // Split the long line at natural boundaries, respecting UTF-8 byte limits const maxTextBytes = maxBytes - 1; // Account for newline - const textSegments = splitLongLine(entry.line, maxTextBytes); + const textSegments = splitLongLineByBytes(entry.line, maxTextBytes); return textSegments.map((segment) => ({ line: segment,