fix(chunking): address PR review comments for Phase 1

- Fix isCodeBlockFence to detect both ``` and ~~~ fences - Fix getHeadingLevel to trim leading whitespace (add new function) - Fix findSafeSplitPoint punctuation search (single pass) - Update findSafeSplitPoint JSDoc to match actual behavior - Add splitLongLineByBytes for UTF-8 byte-aware splitting - Update simple-chunker to use byte-based splitting - Document preserveStructure as reserved for future use Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-17 16:35:03 +08:00 · 2026-03-17 16:35:03 +08:00 · 048209a576
commit 048209a576
parent 7f34b90713
2 changed files with 128 additions and 14 deletions
--- a/src/memory/chunking/markdown-boundaries.ts
+++ b/src/memory/chunking/markdown-boundaries.ts
@ -18,7 +18,7 @@ export function isHeading(line: string): boolean {
 */
 export function isCodeBlockFence(line: string): boolean {
  const trimmed = line.trim();
-  return /^```/.test(trimmed);
+  return /^```|^~~~/.test(trimmed);
 }

 /**
@ -71,7 +71,7 @@ export function isStructuralBoundary(line: string): boolean {
 *
 * @param line - The line to split
 * @param maxLength - Maximum length for the split
- * @returns Index to split at, or -1 if no good split found
+ * @returns Index to split at (always a valid index between 0 and maxLength inclusive)
 */
 export function findSafeSplitPoint(line: string, maxLength: number): number {
  if (line.length <= maxLength) {
@ -89,15 +89,11 @@ export function findSafeSplitPoint(line: string, maxLength: number): number {
  }

  // Priority 2: Common punctuation that can end a segment
-  const punctuation = [".", ",", ";", ":", "!", "?", ")", "]", "}"];
-  for (const punct of punctuation) {
-    for (let i = searchEnd; i > 0; i--) {
-      if (line[i - 1] === punct) {
-        // Make sure it's not part of something like "..." or "::"
-        if (i < line.length && line[i] !== punct) {
-          return i;
-        }
-      }
+  // Single pass to find the rightmost punctuation of any type
+  const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
+  for (let i = searchEnd; i > 0; i--) {
+    if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) {
+      return i;
    }
  }

@ -171,3 +167,118 @@ export function getLineRole(line: string): LineRole {
  if (isThematicBreak(line)) return "thematic_break";
  return "content";
 }
+
+/**
+ * Get the heading level (1-6) from a heading line.
+ * Returns 0 if the line is not a heading.
+ */
+export function getHeadingLevel(line: string): number {
+  const trimmed = line.trimStart();
+  const match = trimmed.match(/^#{1,6}\s/);
+  if (match) {
+    return match[0].trim().length;
+  }
+  return 0;
+}
+
+/**
+ * Calculate UTF-8 byte size of a string.
+ */
+function estimateUtf8Bytes(text: string): number {
+  if (!text) {
+    return 0;
+  }
+  return Buffer.byteLength(text, "utf8");
+}
+
+/**
+ * Split a long line at safe points, respecting UTF-8 byte limits.
+ *
+ * This function is byte-aware and correctly handles multi-byte characters
+ * (CJK, emoji, etc.) that would otherwise exceed byte limits when using
+ * character-based splitting.
+ *
+ * @param line - The line to split
+ * @param maxBytes - Maximum UTF-8 bytes per segment
+ * @returns Array of line segments
+ */
+export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
+  const totalBytes = estimateUtf8Bytes(line);
+  if (totalBytes <= maxBytes) {
+    return [line];
+  }
+
+  const segments: string[] = [];
+  let remaining = line;
+
+  while (remaining.length > 0) {
+    const remainingBytes = estimateUtf8Bytes(remaining);
+    if (remainingBytes <= maxBytes) {
+      segments.push(remaining);
+      break;
+    }
+
+    // Find the longest substring that fits within maxBytes
+    // UTF-16 code units are always <= UTF-8 bytes, so we can use character count
+    // as an upper bound for the binary search
+    let high = Math.min(remaining.length, maxBytes);
+    let low = 0;
+    let bestEnd = 0;
+
+    // Binary search for the split point
+    while (low <= high) {
+      const mid = Math.floor((low + high) / 2);
+      const candidate = remaining.slice(0, mid);
+      const bytes = estimateUtf8Bytes(candidate);
+
+      if (bytes <= maxBytes) {
+        bestEnd = mid;
+        low = mid + 1;
+      } else {
+        high = mid - 1;
+      }
+    }
+
+    // Now try to find a better split point within the safe range
+    // Look for natural boundaries before bestEnd
+    const searchEnd = Math.min(bestEnd, remaining.length);
+    let splitPoint = bestEnd;
+
+    // Priority 1: Space followed by non-space (word boundary)
+    for (let i = searchEnd; i > 0; i--) {
+      const testSegment = remaining.slice(0, i);
+      if (estimateUtf8Bytes(testSegment) > maxBytes) {
+        break;
+      }
+      if (remaining[i - 1] === " " && remaining[i] !== " ") {
+        splitPoint = i - 1;
+        break;
+      }
+    }
+
+    // Priority 2: Common punctuation (only if space search didn't find a boundary)
+    if (splitPoint === bestEnd) {
+      const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
+      for (let i = searchEnd; i > 0; i--) {
+        const testSegment = remaining.slice(0, i);
+        if (estimateUtf8Bytes(testSegment) > maxBytes) {
+          break;
+        }
+        if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) {
+          splitPoint = i;
+          break;
+        }
+      }
+    }
+
+    if (splitPoint <= 0) {
+      // Can't find a good split, force at bestEnd
+      splitPoint = Math.max(1, bestEnd);
+    }
+
+    segments.push(remaining.slice(0, splitPoint).trimEnd());
+    remaining = remaining.slice(splitPoint).trimStart();
+  }
+
+  return segments;
+}
--- a/src/memory/chunking/simple-chunker.ts
+++ b/src/memory/chunking/simple-chunker.ts
@ -11,7 +11,7 @@
 import { estimateUtf8Bytes } from "../embedding-input-limits.js";
 import { hashText, type MemoryChunk } from "../internal.js";
 import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js";
-import { splitLongLine } from "./markdown-boundaries.js";
+import { splitLongLineByBytes } from "./markdown-boundaries.js";
 import { buildTextEmbeddingInput } from "../embedding-inputs.js";

 /**
@ -30,6 +30,9 @@ export class SimpleChunker implements ChunkStrategy {
  readonly name = "simple";

  chunk(content: string, config: ChunkingConfig): MemoryChunk[] {
+    // Note: config.preserveStructure is reserved for future Phase 2+ features.
+    // The SimpleChunker handles basic UTF-8 byte-aware chunking; use
+    // SemanticChunker for structure-aware splitting.
    const lines = content.split("\n");
    // Handle empty content - split("\n") on empty string returns [""]
    if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) {
@ -119,9 +122,9 @@ export class SimpleChunker implements ChunkStrategy {
      return [entry];
    }

-    // Split the long line at natural boundaries
+    // Split the long line at natural boundaries, respecting UTF-8 byte limits
    const maxTextBytes = maxBytes - 1; // Account for newline
-    const textSegments = splitLongLine(entry.line, maxTextBytes);
+    const textSegments = splitLongLineByBytes(entry.line, maxTextBytes);

    return textSegments.map((segment) => ({
      line: segment,