fix(chunking): address PR review comments for Phase 1
- Fix isCodeBlockFence to detect both ``` and ~~~ fences - Fix getHeadingLevel to trim leading whitespace (add new function) - Fix findSafeSplitPoint punctuation search (single pass) - Update findSafeSplitPoint JSDoc to match actual behavior - Add splitLongLineByBytes for UTF-8 byte-aware splitting - Update simple-chunker to use byte-based splitting - Document preserveStructure as reserved for future use Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7f34b90713
commit
048209a576
@ -18,7 +18,7 @@ export function isHeading(line: string): boolean {
|
||||
*/
|
||||
export function isCodeBlockFence(line: string): boolean {
|
||||
const trimmed = line.trim();
|
||||
return /^```/.test(trimmed);
|
||||
return /^```|^~~~/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -71,7 +71,7 @@ export function isStructuralBoundary(line: string): boolean {
|
||||
*
|
||||
* @param line - The line to split
|
||||
* @param maxLength - Maximum length for the split
|
||||
* @returns Index to split at, or -1 if no good split found
|
||||
* @returns Index to split at (always a valid index between 0 and maxLength inclusive)
|
||||
*/
|
||||
export function findSafeSplitPoint(line: string, maxLength: number): number {
|
||||
if (line.length <= maxLength) {
|
||||
@ -89,15 +89,11 @@ export function findSafeSplitPoint(line: string, maxLength: number): number {
|
||||
}
|
||||
|
||||
// Priority 2: Common punctuation that can end a segment
|
||||
const punctuation = [".", ",", ";", ":", "!", "?", ")", "]", "}"];
|
||||
for (const punct of punctuation) {
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (line[i - 1] === punct) {
|
||||
// Make sure it's not part of something like "..." or "::"
|
||||
if (i < line.length && line[i] !== punct) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
// Single pass to find the rightmost punctuation of any type
|
||||
const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
@ -171,3 +167,118 @@ export function getLineRole(line: string): LineRole {
|
||||
if (isThematicBreak(line)) return "thematic_break";
|
||||
return "content";
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the heading level (1-6) from a heading line.
|
||||
* Returns 0 if the line is not a heading.
|
||||
*/
|
||||
export function getHeadingLevel(line: string): number {
|
||||
const trimmed = line.trimStart();
|
||||
const match = trimmed.match(/^#{1,6}\s/);
|
||||
if (match) {
|
||||
return match[0].trim().length;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate UTF-8 byte size of a string.
|
||||
*/
|
||||
function estimateUtf8Bytes(text: string): number {
|
||||
if (!text) {
|
||||
return 0;
|
||||
}
|
||||
return Buffer.byteLength(text, "utf8");
|
||||
}
|
||||
|
||||
/**
|
||||
* Split a long line at safe points, respecting UTF-8 byte limits.
|
||||
*
|
||||
* This function is byte-aware and correctly handles multi-byte characters
|
||||
* (CJK, emoji, etc.) that would otherwise exceed byte limits when using
|
||||
* character-based splitting.
|
||||
*
|
||||
* @param line - The line to split
|
||||
* @param maxBytes - Maximum UTF-8 bytes per segment
|
||||
* @returns Array of line segments
|
||||
*/
|
||||
export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
|
||||
const totalBytes = estimateUtf8Bytes(line);
|
||||
if (totalBytes <= maxBytes) {
|
||||
return [line];
|
||||
}
|
||||
|
||||
const segments: string[] = [];
|
||||
let remaining = line;
|
||||
|
||||
while (remaining.length > 0) {
|
||||
const remainingBytes = estimateUtf8Bytes(remaining);
|
||||
if (remainingBytes <= maxBytes) {
|
||||
segments.push(remaining);
|
||||
break;
|
||||
}
|
||||
|
||||
// Find the longest substring that fits within maxBytes
|
||||
// UTF-16 code units are always <= UTF-8 bytes, so we can use character count
|
||||
// as an upper bound for the binary search
|
||||
let high = Math.min(remaining.length, maxBytes);
|
||||
let low = 0;
|
||||
let bestEnd = 0;
|
||||
|
||||
// Binary search for the split point
|
||||
while (low <= high) {
|
||||
const mid = Math.floor((low + high) / 2);
|
||||
const candidate = remaining.slice(0, mid);
|
||||
const bytes = estimateUtf8Bytes(candidate);
|
||||
|
||||
if (bytes <= maxBytes) {
|
||||
bestEnd = mid;
|
||||
low = mid + 1;
|
||||
} else {
|
||||
high = mid - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Now try to find a better split point within the safe range
|
||||
// Look for natural boundaries before bestEnd
|
||||
const searchEnd = Math.min(bestEnd, remaining.length);
|
||||
let splitPoint = bestEnd;
|
||||
|
||||
// Priority 1: Space followed by non-space (word boundary)
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
const testSegment = remaining.slice(0, i);
|
||||
if (estimateUtf8Bytes(testSegment) > maxBytes) {
|
||||
break;
|
||||
}
|
||||
if (remaining[i - 1] === " " && remaining[i] !== " ") {
|
||||
splitPoint = i - 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 2: Common punctuation (only if space search didn't find a boundary)
|
||||
if (splitPoint === bestEnd) {
|
||||
const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
const testSegment = remaining.slice(0, i);
|
||||
if (estimateUtf8Bytes(testSegment) > maxBytes) {
|
||||
break;
|
||||
}
|
||||
if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) {
|
||||
splitPoint = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (splitPoint <= 0) {
|
||||
// Can't find a good split, force at bestEnd
|
||||
splitPoint = Math.max(1, bestEnd);
|
||||
}
|
||||
|
||||
segments.push(remaining.slice(0, splitPoint).trimEnd());
|
||||
remaining = remaining.slice(splitPoint).trimStart();
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
@ -11,7 +11,7 @@
|
||||
import { estimateUtf8Bytes } from "../embedding-input-limits.js";
|
||||
import { hashText, type MemoryChunk } from "../internal.js";
|
||||
import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js";
|
||||
import { splitLongLine } from "./markdown-boundaries.js";
|
||||
import { splitLongLineByBytes } from "./markdown-boundaries.js";
|
||||
import { buildTextEmbeddingInput } from "../embedding-inputs.js";
|
||||
|
||||
/**
|
||||
@ -30,6 +30,9 @@ export class SimpleChunker implements ChunkStrategy {
|
||||
readonly name = "simple";
|
||||
|
||||
chunk(content: string, config: ChunkingConfig): MemoryChunk[] {
|
||||
// Note: config.preserveStructure is reserved for future Phase 2+ features.
|
||||
// The SimpleChunker handles basic UTF-8 byte-aware chunking; use
|
||||
// SemanticChunker for structure-aware splitting.
|
||||
const lines = content.split("\n");
|
||||
// Handle empty content - split("\n") on empty string returns [""]
|
||||
if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) {
|
||||
@ -119,9 +122,9 @@ export class SimpleChunker implements ChunkStrategy {
|
||||
return [entry];
|
||||
}
|
||||
|
||||
// Split the long line at natural boundaries
|
||||
// Split the long line at natural boundaries, respecting UTF-8 byte limits
|
||||
const maxTextBytes = maxBytes - 1; // Account for newline
|
||||
const textSegments = splitLongLine(entry.line, maxTextBytes);
|
||||
const textSegments = splitLongLineByBytes(entry.line, maxTextBytes);
|
||||
|
||||
return textSegments.map((segment) => ({
|
||||
line: segment,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user