fix(chunking): address PR review comments for Phase 1

- Fix isCodeBlockFence to detect both ``` and ~~~ fences
- Fix getHeadingLevel to trim leading whitespace (add new function)
- Fix findSafeSplitPoint punctuation search (single pass)
- Update findSafeSplitPoint JSDoc to match actual behavior
- Add splitLongLineByBytes for UTF-8 byte-aware splitting
- Update simple-chunker to use byte-based splitting
- Document preserveStructure as reserved for future use

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
ly-wang19 2026-03-17 16:35:03 +08:00
parent 7f34b90713
commit 048209a576
2 changed files with 128 additions and 14 deletions

View File

@ -18,7 +18,7 @@ export function isHeading(line: string): boolean {
*/
export function isCodeBlockFence(line: string): boolean {
const trimmed = line.trim();
return /^```/.test(trimmed);
return /^```|^~~~/.test(trimmed);
}
/**
@ -71,7 +71,7 @@ export function isStructuralBoundary(line: string): boolean {
*
* @param line - The line to split
* @param maxLength - Maximum length for the split
* @returns Index to split at, or -1 if no good split found
* @returns Index to split at (always a valid index between 0 and maxLength inclusive)
*/
export function findSafeSplitPoint(line: string, maxLength: number): number {
if (line.length <= maxLength) {
@ -89,15 +89,11 @@ export function findSafeSplitPoint(line: string, maxLength: number): number {
}
// Priority 2: Common punctuation that can end a segment
const punctuation = [".", ",", ";", ":", "!", "?", ")", "]", "}"];
for (const punct of punctuation) {
for (let i = searchEnd; i > 0; i--) {
if (line[i - 1] === punct) {
// Make sure it's not part of something like "..." or "::"
if (i < line.length && line[i] !== punct) {
return i;
}
}
// Single pass to find the rightmost punctuation of any type
const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
for (let i = searchEnd; i > 0; i--) {
if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) {
return i;
}
}
@ -171,3 +167,118 @@ export function getLineRole(line: string): LineRole {
if (isThematicBreak(line)) return "thematic_break";
return "content";
}
/**
* Get the heading level (1-6) from a heading line.
* Returns 0 if the line is not a heading.
*/
export function getHeadingLevel(line: string): number {
const trimmed = line.trimStart();
const match = trimmed.match(/^#{1,6}\s/);
if (match) {
return match[0].trim().length;
}
return 0;
}
/**
* Calculate UTF-8 byte size of a string.
*/
function estimateUtf8Bytes(text: string): number {
if (!text) {
return 0;
}
return Buffer.byteLength(text, "utf8");
}
/**
* Split a long line at safe points, respecting UTF-8 byte limits.
*
* This function is byte-aware and correctly handles multi-byte characters
* (CJK, emoji, etc.) that would otherwise exceed byte limits when using
* character-based splitting.
*
* @param line - The line to split
* @param maxBytes - Maximum UTF-8 bytes per segment
* @returns Array of line segments
*/
export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
const totalBytes = estimateUtf8Bytes(line);
if (totalBytes <= maxBytes) {
return [line];
}
const segments: string[] = [];
let remaining = line;
while (remaining.length > 0) {
const remainingBytes = estimateUtf8Bytes(remaining);
if (remainingBytes <= maxBytes) {
segments.push(remaining);
break;
}
// Find the longest substring that fits within maxBytes
// UTF-16 code units are always <= UTF-8 bytes, so we can use character count
// as an upper bound for the binary search
let high = Math.min(remaining.length, maxBytes);
let low = 0;
let bestEnd = 0;
// Binary search for the split point
while (low <= high) {
const mid = Math.floor((low + high) / 2);
const candidate = remaining.slice(0, mid);
const bytes = estimateUtf8Bytes(candidate);
if (bytes <= maxBytes) {
bestEnd = mid;
low = mid + 1;
} else {
high = mid - 1;
}
}
// Now try to find a better split point within the safe range
// Look for natural boundaries before bestEnd
const searchEnd = Math.min(bestEnd, remaining.length);
let splitPoint = bestEnd;
// Priority 1: Space followed by non-space (word boundary)
for (let i = searchEnd; i > 0; i--) {
const testSegment = remaining.slice(0, i);
if (estimateUtf8Bytes(testSegment) > maxBytes) {
break;
}
if (remaining[i - 1] === " " && remaining[i] !== " ") {
splitPoint = i - 1;
break;
}
}
// Priority 2: Common punctuation (only if space search didn't find a boundary)
if (splitPoint === bestEnd) {
const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
for (let i = searchEnd; i > 0; i--) {
const testSegment = remaining.slice(0, i);
if (estimateUtf8Bytes(testSegment) > maxBytes) {
break;
}
if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) {
splitPoint = i;
break;
}
}
}
if (splitPoint <= 0) {
// Can't find a good split, force at bestEnd
splitPoint = Math.max(1, bestEnd);
}
segments.push(remaining.slice(0, splitPoint).trimEnd());
remaining = remaining.slice(splitPoint).trimStart();
}
return segments;
}

View File

@ -11,7 +11,7 @@
import { estimateUtf8Bytes } from "../embedding-input-limits.js";
import { hashText, type MemoryChunk } from "../internal.js";
import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js";
import { splitLongLine } from "./markdown-boundaries.js";
import { splitLongLineByBytes } from "./markdown-boundaries.js";
import { buildTextEmbeddingInput } from "../embedding-inputs.js";
/**
@ -30,6 +30,9 @@ export class SimpleChunker implements ChunkStrategy {
readonly name = "simple";
chunk(content: string, config: ChunkingConfig): MemoryChunk[] {
// Note: config.preserveStructure is reserved for future Phase 2+ features.
// The SimpleChunker handles basic UTF-8 byte-aware chunking; use
// SemanticChunker for structure-aware splitting.
const lines = content.split("\n");
// Handle empty content - split("\n") on empty string returns [""]
if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) {
@ -119,9 +122,9 @@ export class SimpleChunker implements ChunkStrategy {
return [entry];
}
// Split the long line at natural boundaries
// Split the long line at natural boundaries, respecting UTF-8 byte limits
const maxTextBytes = maxBytes - 1; // Account for newline
const textSegments = splitLongLine(entry.line, maxTextBytes);
const textSegments = splitLongLineByBytes(entry.line, maxTextBytes);
return textSegments.map((segment) => ({
line: segment,