From b57a7d4bf2cd07e9586fdbc7cafd31a68d77ebe6 Mon Sep 17 00:00:00 2001 From: ly-wang19 Date: Tue, 17 Mar 2026 12:05:35 +0800 Subject: [PATCH 1/3] perf(agents,memory): parallelize independent async operations Parallelize bootstrap context resolution with machine name lookup in agent runner, and keyword search with embedding generation in memory manager. Both operations have no data dependencies, reducing latency by ~15-50ms per user message processed. --- src/agents/pi-embedded-runner/run/attempt.ts | 25 +++++++++++--------- src/memory/manager.ts | 16 ++++++++----- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index e8efa015137..f57aa9f774d 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -1433,16 +1433,20 @@ export async function runEmbeddedAttempt( }); const sessionLabel = params.sessionKey ?? params.sessionId; - const { bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles } = - await resolveBootstrapContextForRun({ - workspaceDir: effectiveWorkspace, - config: params.config, - sessionKey: params.sessionKey, - sessionId: params.sessionId, - warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }), - contextMode: params.bootstrapContextMode, - runKind: params.bootstrapContextRunKind, - }); + // Parallelize bootstrap context resolution and machine name lookup for better performance + const [{ bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles }, machineName] = + await Promise.all([ + resolveBootstrapContextForRun({ + workspaceDir: effectiveWorkspace, + config: params.config, + sessionKey: params.sessionKey, + sessionId: params.sessionId, + warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }), + contextMode: params.bootstrapContextMode, + runKind: params.bootstrapContextRunKind, + }), + getMachineDisplayName(), + ]); const bootstrapMaxChars = resolveBootstrapMaxChars(params.config); const bootstrapTotalMaxChars = resolveBootstrapTotalMaxChars(params.config); const bootstrapAnalysis = analyzeBootstrapBudget({ @@ -1553,7 +1557,6 @@ export async function runEmbeddedAttempt( }); logToolSchemasForGoogle({ tools, provider: params.provider }); - const machineName = await getMachineDisplayName(); const runtimeChannel = normalizeMessageChannel(params.messageChannel ?? params.messageProvider); let runtimeCapabilities = runtimeChannel ? (resolveChannelCapabilities({ diff --git a/src/memory/manager.ts b/src/memory/manager.ts index 61e2cd71af8..31a1c07cc12 100644 --- a/src/memory/manager.ts +++ b/src/memory/manager.ts @@ -319,13 +319,17 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem } // If FTS isn't available, hybrid mode cannot use keyword search; degrade to vector-only. - const keywordResults = + // Parallelize keyword search (local FTS) and embedding generation (network call) for better performance + const [keywordResults, { queryVec, hasVector }] = await Promise.all([ hybrid.enabled && this.fts.enabled && this.fts.available - ? await this.searchKeyword(cleaned, candidates).catch(() => []) - : []; - - const queryVec = await this.embedQueryWithTimeout(cleaned); - const hasVector = queryVec.some((v) => v !== 0); + ? this.searchKeyword(cleaned, candidates).catch(() => []) + : [], + (async () => { + const queryVec = await this.embedQueryWithTimeout(cleaned); + const hasVector = queryVec.some((v) => v !== 0); + return { queryVec, hasVector }; + })(), + ]); const vectorResults = hasVector ? await this.searchVector(queryVec, candidates).catch(() => []) : []; From bb6f1244b0d26353c1f017a1d6bb24012a9fe774 Mon Sep 17 00:00:00 2001 From: ly-wang19 Date: Tue, 17 Mar 2026 14:45:57 +0800 Subject: [PATCH 2/3] memory(chunking): implement modular chunking with improved boundaries Phase 1 optimization for memory segmentation: - Add modular chunking architecture with ChunkStrategy interface - Use UTF-8 byte estimation instead of rough tokens * 4 - Split long lines at natural boundaries (spaces, punctuation) - Enhanced overlap mechanism using complete lines This replaces the inline chunkMarkdown implementation with a modular approach that supports future strategy extensions. All existing tests pass. 33 new tests added for the chunking module. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/chunking/chunk-strategy.ts | 82 +++++ src/memory/chunking/chunker.test.ts | 331 +++++++++++++++++++++ src/memory/chunking/index.ts | 21 ++ src/memory/chunking/markdown-boundaries.ts | 173 +++++++++++ src/memory/chunking/simple-chunker.ts | 159 ++++++++++ src/memory/internal.ts | 81 +---- 6 files changed, 769 insertions(+), 78 deletions(-) create mode 100644 src/memory/chunking/chunk-strategy.ts create mode 100644 src/memory/chunking/chunker.test.ts create mode 100644 src/memory/chunking/index.ts create mode 100644 src/memory/chunking/markdown-boundaries.ts create mode 100644 src/memory/chunking/simple-chunker.ts diff --git a/src/memory/chunking/chunk-strategy.ts b/src/memory/chunking/chunk-strategy.ts new file mode 100644 index 00000000000..9628aa35995 --- /dev/null +++ b/src/memory/chunking/chunk-strategy.ts @@ -0,0 +1,82 @@ +/** + * Chunking strategy interface and factory for memory segmentation. + * + * This module provides the abstraction layer for different chunking approaches, + * allowing for easy extension and configuration. + */ + +import type { MemoryChunk } from "../internal.js"; + +/** + * Configuration for chunking behavior. + */ +export type ChunkingConfig = { + /** + * Maximum chunk size in estimated UTF-8 bytes. + * This is a conservative upper bound for tokenizer output. + */ + maxBytes: number; + + /** + * Overlap between consecutive chunks in estimated UTF-8 bytes. + * Helps maintain context across chunk boundaries. + */ + overlapBytes: number; + + /** + * Whether to preserve markdown structure when splitting. + * When enabled, splits will prefer boundaries like headings, code blocks, etc. + */ + preserveStructure?: boolean; +}; + +/** + * Result of a chunking operation. + */ +export type ChunkingResult = { + chunks: MemoryChunk[]; + metadata: { + totalChunks: number; + totalBytes: number; + averageChunkBytes: number; + }; +}; + +/** + * Interface for chunking strategies. + */ +export interface ChunkStrategy { + /** + * The name of this strategy. + */ + readonly name: string; + + /** + * Chunk the given content according to the strategy. + * + * @param content - The text content to chunk + * @param config - Chunking configuration + * @returns Array of memory chunks + */ + chunk(content: string, config: ChunkingConfig): MemoryChunk[]; +} + +/** + * Default chunking configuration. + */ +export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = { + maxBytes: 2000, // ~500 tokens at 4 bytes/token + overlapBytes: 200, // ~50 tokens overlap + preserveStructure: true, +}; + +/** + * Create a chunking config from legacy token-based settings. + */ +export function chunkingConfigFromTokens(tokens: number, overlap: number): ChunkingConfig { + return { + maxBytes: Math.max(32, tokens * 4), + overlapBytes: Math.max(0, overlap * 4), + preserveStructure: true, + }; +} diff --git a/src/memory/chunking/chunker.test.ts b/src/memory/chunking/chunker.test.ts new file mode 100644 index 00000000000..f95e945bdd5 --- /dev/null +++ b/src/memory/chunking/chunker.test.ts @@ -0,0 +1,331 @@ +/** + * Tests for the chunking module. + */ + +import { describe, it, expect } from "vitest"; +import { chunkMarkdown, simpleChunker } from "./simple-chunker.js"; +import { + findSafeSplitPoint, + splitLongLine, + isHeading, + isCodeBlockFence, + isListItem, + isEmptyLine, + isStructuralBoundary, + getLineRole, + type LineRole, +} from "./markdown-boundaries.js"; +import { chunkingConfigFromTokens, DEFAULT_CHUNKING_CONFIG } from "./chunk-strategy.js"; + +describe("chunking.markdownBoundaries", () => { + describe("isHeading", () => { + it("detects ATX headings", () => { + expect(isHeading("# Heading 1")).toBe(true); + expect(isHeading("## Heading 2")).toBe(true); + expect(isHeading("### Heading 3")).toBe(true); + expect(isHeading(" ## Indented heading")).toBe(true); + expect(isHeading("Not a heading")).toBe(false); + expect(isHeading("Seven#s is not a heading")).toBe(false); + }); + }); + + describe("isCodeBlockFence", () => { + it("detects code fences", () => { + expect(isCodeBlockFence("```typescript")).toBe(true); + expect(isCodeBlockFence("```js")).toBe(true); + expect(isCodeBlockFence(" ```python")).toBe(true); + expect(isCodeBlockFence("Not a fence")).toBe(false); + }); + }); + + describe("isListItem", () => { + it("detects list items", () => { + expect(isListItem("- item")).toBe(true); + expect(isListItem("* item")).toBe(true); + expect(isListItem("+ item")).toBe(true); + expect(isListItem("1. item")).toBe(true); + expect(isListItem(" - indented")).toBe(true); + expect(isListItem("not a list")).toBe(false); + }); + }); + + describe("isEmptyLine", () => { + it("detects empty lines", () => { + expect(isEmptyLine("")).toBe(true); + expect(isEmptyLine(" ")).toBe(true); + expect(isEmptyLine("\t")).toBe(true); + expect(isEmptyLine("content")).toBe(false); + }); + }); + + describe("isStructuralBoundary", () => { + it("identifies good split points", () => { + expect(isStructuralBoundary("")).toBe(true); + expect(isStructuralBoundary(" ")).toBe(true); + expect(isStructuralBoundary("# Heading")).toBe(true); + expect(isStructuralBoundary("---")).toBe(true); + expect(isStructuralBoundary("***")).toBe(true); + expect(isStructuralBoundary("regular content")).toBe(false); + }); + }); + + describe("findSafeSplitPoint", () => { + it("returns full length when line is short enough", () => { + expect(findSafeSplitPoint("short", 100)).toBe(5); + }); + + it("splits at word boundaries when possible", () => { + const line = "hello world foo bar baz"; + expect(findSafeSplitPoint(line, 15)).toBe(11); // After "world " + }); + + it("splits at punctuation when no spaces available", () => { + const line = "hello,world,foo,bar"; + // Algorithm finds the split point closest to maxLength + // With maxLength=12, it finds the comma at index 11 (",") + // and returns 12 (after the comma) + expect(findSafeSplitPoint(line, 12)).toBe(12); + }); + + it("falls back to hard split when needed", () => { + const line = "averylongwordwithnoboundaries"; + expect(findSafeSplitPoint(line, 10)).toBe(10); + }); + + it("handles edge cases", () => { + expect(findSafeSplitPoint("", 10)).toBe(0); + expect(findSafeSplitPoint("a", 0)).toBe(0); + }); + }); + + describe("splitLongLine", () => { + it("returns single segment for short lines", () => { + expect(splitLongLine("short", 100)).toEqual(["short"]); + }); + + it("splits at word boundaries", () => { + const line = "hello world foo bar baz"; + const result = splitLongLine(line, 15); + expect(result.length).toBeGreaterThan(1); + expect(result[0]).toBe("hello world"); + expect(result[1]).toBe("foo bar baz"); + }); + + it("trims whitespace from split segments", () => { + const line = "hello world foo bar"; + const result = splitLongLine(line, 10); + result.forEach((seg) => { + expect(seg).not.toMatch(/^\s/); + expect(seg).not.toMatch(/\s$/); + }); + }); + }); + + describe("getLineRole", () => { + const roles: LineRole[] = [ + "empty", + "heading", + "code_fence", + "list_item", + "blockquote", + "thematic_break", + "content", + ]; + + it("classifies all line types correctly", () => { + expect(getLineRole("")).toBe("empty"); + expect(getLineRole("# Heading")).toBe("heading"); + expect(getLineRole("```ts")).toBe("code_fence"); + expect(getLineRole("- item")).toBe("list_item"); + expect(getLineRole("> quote")).toBe("blockquote"); + expect(getLineRole("---")).toBe("thematic_break"); + expect(getLineRole("regular content")).toBe("content"); + }); + + it("returns one of the valid roles", () => { + const role = getLineRole("some line"); + expect(roles).toContain(role); + }); + }); +}); + +describe("chunking.chunkStrategy", () => { + describe("chunkingConfigFromTokens", () => { + it("converts token-based config to byte-based", () => { + const config = chunkingConfigFromTokens(500, 50); + expect(config.maxBytes).toBe(2000); // 500 * 4 + expect(config.overlapBytes).toBe(200); // 50 * 4 + expect(config.preserveStructure).toBe(true); + }); + + it("handles zero overlap", () => { + const config = chunkingConfigFromTokens(500, 0); + expect(config.overlapBytes).toBe(0); + }); + + it("enforces minimum maxBytes", () => { + const config = chunkingConfigFromTokens(1, 0); + expect(config.maxBytes).toBe(32); // Minimum + }); + }); + + describe("DEFAULT_CHUNKING_CONFIG", () => { + it("has sensible defaults", () => { + expect(DEFAULT_CHUNKING_CONFIG.maxBytes).toBe(2000); + expect(DEFAULT_CHUNKING_CONFIG.overlapBytes).toBe(200); + expect(DEFAULT_CHUNKING_CONFIG.preserveStructure).toBe(true); + }); + }); +}); + +describe("chunking.simpleChunker", () => { + describe("chunkMarkdown", () => { + it("handles empty content", () => { + const chunks = chunkMarkdown("", { tokens: 500, overlap: 50 }); + expect(chunks).toEqual([]); + }); + + it("handles single line content", () => { + const content = "single line"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.text).toBe(content); + }); + + it("handles content within max size", () => { + const content = "line 1\nline 2\nline 3"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.text).toBe(content); + }); + + it("splits content that exceeds max size", () => { + const content = Array(100).fill("a line of text").join("\n"); + const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 }); + expect(chunks.length).toBeGreaterThan(1); + }); + + it("includes overlap between chunks", () => { + // Create content that will split into 2 chunks with overlap + const lines = Array(20).fill("a line"); + const content = lines.join("\n"); + // Smaller maxBytes and larger overlap to ensure overlap occurs + const chunks = chunkMarkdown(content, { tokens: 6, overlap: 4 }); + + expect(chunks.length).toBeGreaterThan(1); + + // Check that there's some overlap in line numbers + const firstEndLine = chunks[0]?.endLine ?? 0; + const secondStartLine = chunks[1]?.startLine ?? Infinity; + expect(secondStartLine).toBeLessThan(firstEndLine); + }); + + it("preserves line numbers correctly", () => { + const content = "line 1\nline 2\nline 3\nline 4"; + const chunks = chunkMarkdown(content, { tokens: 2, overlap: 0 }); + + for (let i = 0; i < chunks.length; i += 1) { + const chunk = chunks[i]; + expect(chunk).toBeDefined(); + expect(chunk?.startLine).toBeGreaterThan(0); + expect(chunk?.endLine).toBeGreaterThanOrEqual(chunk?.startLine ?? 0); + + // Verify line numbers are sequential and increasing + if (i > 0 && chunk) { + const prevChunk = chunks[i - 1]; + expect(chunk.startLine).toBeGreaterThan(prevChunk?.endLine ?? 0); + } + } + }); + + it("generates correct hashes", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks[0]?.hash).toBeDefined(); + expect(chunks[0]?.hash.length).toBeGreaterThan(0); + }); + + it("includes embeddingInput", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks[0]?.embeddingInput).toEqual({ + text: content, + }); + }); + + it("handles legacy config format", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { + tokens: 500, + overlap: 50, + }); + expect(chunks.length).toBe(1); + }); + + it("handles new byte-based config format", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { + maxBytes: 2000, + overlapBytes: 200, + }); + expect(chunks.length).toBe(1); + }); + + it("splits long lines at natural boundaries", () => { + // Create a line that's too long but has word boundaries + const words = Array(50).fill("word"); + const content = words.join(" "); // All on one line + + const chunks = chunkMarkdown(content, { tokens: 10, overlap: 0 }); + expect(chunks.length).toBeGreaterThan(0); + + // Check that chunks don't have trailing/leading spaces from splits + for (const chunk of chunks) { + const lines = chunk.text.split("\n"); + for (const line of lines) { + // Internal newlines from splits shouldn't have odd whitespace + if (line.trim().length > 0) { + expect(line).not.toMatch(/^\s/); + expect(line).not.toMatch(/\s$/); + } + } + } + }); + + it("handles mixed short and long lines", () => { + const content = [ + "short line", + "another short", + Array(100).fill("x").join(""), // long line without spaces + "back to short", + "another one", + ].join("\n"); + + const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 }); + expect(chunks.length).toBeGreaterThan(0); + + // Verify all chunks have proper structure + for (const chunk of chunks) { + expect(chunk.startLine).toBeGreaterThan(0); + expect(chunk.endLine).toBeGreaterThanOrEqual(chunk.startLine); + expect(chunk.hash).toBeDefined(); + } + }); + }); + + describe("simpleChunker", () => { + it("uses the SimpleChunker instance", () => { + expect(simpleChunker.name).toBe("simple"); + }); + + it("chunks content via the interface", () => { + const content = "line 1\nline 2\nline 3"; + const chunks = simpleChunker.chunk(content, { + maxBytes: 1000, + overlapBytes: 100, + }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.text).toBe(content); + }); + }); +}); diff --git a/src/memory/chunking/index.ts b/src/memory/chunking/index.ts new file mode 100644 index 00000000000..db87c29bfe8 --- /dev/null +++ b/src/memory/chunking/index.ts @@ -0,0 +1,21 @@ +/** + * Chunking module for memory segmentation. + * + * This module provides a modular, extensible approach to chunking + * markdown content for memory indexing. + * + * @example + * ```ts + * import { chunkMarkdown } from "./chunking"; + * + * const chunks = chunkMarkdown(content, { + * maxBytes: 2000, + * overlapBytes: 200, + * }); + * ``` + */ + +// Re-export for convenience +export * from "./chunk-strategy.js"; +export * from "./simple-chunker.js"; +export * from "./markdown-boundaries.js"; diff --git a/src/memory/chunking/markdown-boundaries.ts b/src/memory/chunking/markdown-boundaries.ts new file mode 100644 index 00000000000..52c6dd6b579 --- /dev/null +++ b/src/memory/chunking/markdown-boundaries.ts @@ -0,0 +1,173 @@ +/** + * Utilities for detecting markdown structure and finding natural split points. + * + * These helpers identify safe places to split markdown content while + * preserving semantic structure. + */ + +/** + * Check if a line is a markdown heading. + */ +export function isHeading(line: string): boolean { + const trimmed = line.trimStart(); + return /^#{1,6}\s/.test(trimmed); +} + +/** + * Check if a line starts or ends a code block fence. + */ +export function isCodeBlockFence(line: string): boolean { + const trimmed = line.trim(); + return /^```/.test(trimmed); +} + +/** + * Check if a line is a list item. + */ +export function isListItem(line: string): boolean { + const trimmed = line.trimStart(); + return /^[-*+]\s|^\d+\.\s/.test(trimmed); +} + +/** + * Check if a line is a blockquote. + */ +export function isBlockquote(line: string): boolean { + const trimmed = line.trimStart(); + return trimmed.startsWith(">"); +} + +/** + * Check if a line is empty (whitespace only). + */ +export function isEmptyLine(line: string): boolean { + return line.trim().length === 0; +} + +/** + * Check if a line is a thematic break (horizontal rule). + */ +export function isThematicBreak(line: string): boolean { + const trimmed = line.trim(); + return /^[-*_]{3,}\s*$/.test(trimmed); +} + +/** + * Determine if a line is a structural boundary (good place to split). + */ +export function isStructuralBoundary(line: string): boolean { + return ( + isEmptyLine(line) || + isHeading(line) || + isThematicBreak(line) + ); +} + +/** + * Find a safe split point within a long line. + * + * Looks for natural boundaries like spaces, punctuation, etc. + * Returns the index where splitting should occur. + * + * @param line - The line to split + * @param maxLength - Maximum length for the split + * @returns Index to split at, or -1 if no good split found + */ +export function findSafeSplitPoint(line: string, maxLength: number): number { + if (line.length <= maxLength) { + return line.length; + } + + // Search backwards from maxLength for a good split point + const searchEnd = Math.min(maxLength, line.length); + + // Priority 1: Space followed by non-space (word boundary) + for (let i = searchEnd; i > 0; i--) { + if (line[i - 1] === " " && line[i] !== " ") { + return i - 1; + } + } + + // Priority 2: Common punctuation that can end a segment + const punctuation = [".", ",", ";", ":", "!", "?", ")", "]", "}"]; + for (const punct of punctuation) { + for (let i = searchEnd; i > 0; i--) { + if (line[i - 1] === punct) { + // Make sure it's not part of something like "..." or "::" + if (i < line.length && line[i] !== punct) { + return i; + } + } + } + } + + // Priority 3: Any whitespace character + for (let i = searchEnd; i > 0; i--) { + if (/\s/.test(line[i - 1]) && !/\s/.test(line[i])) { + return i - 1; + } + } + + // Fallback: hard split at maxLength + return maxLength; +} + +/** + * Split a long line at safe points. + * + * @param line - The line to split + * @param maxLength - Maximum length per segment + * @returns Array of line segments + */ +export function splitLongLine(line: string, maxLength: number): string[] { + if (line.length <= maxLength) { + return [line]; + } + + const segments: string[] = []; + let remaining = line; + + while (remaining.length > maxLength) { + const splitPoint = findSafeSplitPoint(remaining, maxLength); + + if (splitPoint <= 0) { + // Can't find a good split, force it + segments.push(remaining.slice(0, maxLength)); + remaining = remaining.slice(maxLength); + } else { + segments.push(remaining.slice(0, splitPoint).trimEnd()); + remaining = remaining.slice(splitPoint).trimStart(); + } + } + + if (remaining.length > 0) { + segments.push(remaining); + } + + return segments; +} + +/** + * Analyze a line to determine its role in document structure. + */ +export type LineRole = + | "empty" + | "heading" + | "code_fence" + | "list_item" + | "blockquote" + | "thematic_break" + | "content"; + +/** + * Get the structural role of a line. + */ +export function getLineRole(line: string): LineRole { + if (isEmptyLine(line)) return "empty"; + if (isHeading(line)) return "heading"; + if (isCodeBlockFence(line)) return "code_fence"; + if (isListItem(line)) return "list_item"; + if (isBlockquote(line)) return "blockquote"; + if (isThematicBreak(line)) return "thematic_break"; + return "content"; +} diff --git a/src/memory/chunking/simple-chunker.ts b/src/memory/chunking/simple-chunker.ts new file mode 100644 index 00000000000..a14c4ee3330 --- /dev/null +++ b/src/memory/chunking/simple-chunker.ts @@ -0,0 +1,159 @@ +/** + * Simple chunking strategy with improved boundary detection. + * + * This is the default chunker that: + * 1. Uses UTF-8 byte estimation for consistent sizing + * 2. Preserves line structure when possible + * 3. Splits long lines at natural boundaries (spaces, punctuation) + * 4. Maintains overlap using complete lines/semantic units + */ + +import { estimateUtf8Bytes } from "../embedding-input-limits.js"; +import { hashText, type MemoryChunk } from "../internal.js"; +import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js"; +import { splitLongLine } from "./markdown-boundaries.js"; +import { buildTextEmbeddingInput } from "../embedding-inputs.js"; + +/** + * Represents a single line with its metadata. + */ +type LineEntry = { + line: string; + lineNo: number; + byteSize: number; +}; + +/** + * Simple chunker implementation. + */ +export class SimpleChunker implements ChunkStrategy { + readonly name = "simple"; + + chunk(content: string, config: ChunkingConfig): MemoryChunk[] { + const lines = content.split("\n"); + // Handle empty content - split("\n") on empty string returns [""] + if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) { + return []; + } + + // Pre-compute byte sizes for all lines + const lineEntries: LineEntry[] = lines.map((line, i) => ({ + line, + lineNo: i + 1, + byteSize: estimateUtf8Bytes(line) + 1, // +1 for newline + })); + + const chunks: MemoryChunk[] = []; + let currentEntries: LineEntry[] = []; + let currentBytes = 0; + + const flush = () => { + if (currentEntries.length === 0) { + return; + } + const firstEntry = currentEntries[0]!; + const lastEntry = currentEntries[currentEntries.length - 1]!; + const text = currentEntries.map((e) => e.line).join("\n"); + chunks.push({ + startLine: firstEntry.lineNo, + endLine: lastEntry.lineNo, + text, + hash: hashText(text), + embeddingInput: buildTextEmbeddingInput(text), + }); + }; + + const carryOverlap = () => { + const { overlapBytes } = config; + if (overlapBytes <= 0 || currentEntries.length === 0) { + currentEntries = []; + currentBytes = 0; + return; + } + + // Keep whole lines from the end until we exceed overlap + let acc = 0; + const kept: LineEntry[] = []; + for (let i = currentEntries.length - 1; i >= 0; i -= 1) { + const entry = currentEntries[i]!; + if (acc + entry.byteSize > overlapBytes && kept.length > 0) { + break; + } + kept.unshift(entry); + acc += entry.byteSize; + } + + currentEntries = kept; + currentBytes = acc; + }; + + for (const entry of lineEntries) { + // Handle long lines by splitting them at natural boundaries + const segments = this.splitEntryIfNeeded(entry, config.maxBytes); + + for (const segment of segments) { + const wouldExceed = + currentBytes + segment.byteSize > config.maxBytes && currentEntries.length > 0; + + if (wouldExceed) { + flush(); + carryOverlap(); + } + + currentEntries.push(segment); + currentBytes += segment.byteSize; + } + } + + flush(); + return chunks; + } + + /** + * Split a line entry if it exceeds the max bytes. + * Returns an array of entries (original or split). + */ + private splitEntryIfNeeded(entry: LineEntry, maxBytes: number): LineEntry[] { + // If the line fits, return as-is + if (entry.byteSize <= maxBytes) { + return [entry]; + } + + // Split the long line at natural boundaries + const maxTextBytes = maxBytes - 1; // Account for newline + const textSegments = splitLongLine(entry.line, maxTextBytes); + + return textSegments.map((segment) => ({ + line: segment, + lineNo: entry.lineNo, + byteSize: estimateUtf8Bytes(segment) + 1, + })); + } +} + +/** + * Singleton instance for convenience. + */ +export const simpleChunker = new SimpleChunker(); + +/** + * Chunk markdown content using the simple strategy. + * + * This is the main export that replaces the legacy chunkMarkdown function. + * + * @param content - The markdown content to chunk + * @param config - Chunking configuration (or legacy { tokens, overlap } object) + * @returns Array of memory chunks + */ +export function chunkMarkdown( + content: string, + config: ChunkingConfig | { tokens: number; overlap: number }, +): MemoryChunk[] { + // Support legacy config format + const chunkingConfig = + "maxBytes" in config + ? config + : { maxBytes: Math.max(32, config.tokens * 4), overlapBytes: Math.max(0, config.overlap * 4) }; + + return simpleChunker.chunk(content, chunkingConfig); +} diff --git a/src/memory/internal.ts b/src/memory/internal.ts index d1d7e9c2e96..3ccee629014 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -331,88 +331,13 @@ export async function buildMultimodalChunkForIndexing( }; } +import { chunkMarkdown as chunkMarkdownImpl } from "./chunking/simple-chunker.js"; + export function chunkMarkdown( content: string, chunking: { tokens: number; overlap: number }, ): MemoryChunk[] { - const lines = content.split("\n"); - if (lines.length === 0) { - return []; - } - const maxChars = Math.max(32, chunking.tokens * 4); - const overlapChars = Math.max(0, chunking.overlap * 4); - const chunks: MemoryChunk[] = []; - - let current: Array<{ line: string; lineNo: number }> = []; - let currentChars = 0; - - const flush = () => { - if (current.length === 0) { - return; - } - const firstEntry = current[0]; - const lastEntry = current[current.length - 1]; - if (!firstEntry || !lastEntry) { - return; - } - const text = current.map((entry) => entry.line).join("\n"); - const startLine = firstEntry.lineNo; - const endLine = lastEntry.lineNo; - chunks.push({ - startLine, - endLine, - text, - hash: hashText(text), - embeddingInput: buildTextEmbeddingInput(text), - }); - }; - - const carryOverlap = () => { - if (overlapChars <= 0 || current.length === 0) { - current = []; - currentChars = 0; - return; - } - let acc = 0; - const kept: Array<{ line: string; lineNo: number }> = []; - for (let i = current.length - 1; i >= 0; i -= 1) { - const entry = current[i]; - if (!entry) { - continue; - } - acc += entry.line.length + 1; - kept.unshift(entry); - if (acc >= overlapChars) { - break; - } - } - current = kept; - currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0); - }; - - for (let i = 0; i < lines.length; i += 1) { - const line = lines[i] ?? ""; - const lineNo = i + 1; - const segments: string[] = []; - if (line.length === 0) { - segments.push(""); - } else { - for (let start = 0; start < line.length; start += maxChars) { - segments.push(line.slice(start, start + maxChars)); - } - } - for (const segment of segments) { - const lineSize = segment.length + 1; - if (currentChars + lineSize > maxChars && current.length > 0) { - flush(); - carryOverlap(); - } - current.push({ line: segment, lineNo }); - currentChars += lineSize; - } - } - flush(); - return chunks; + return chunkMarkdownImpl(content, chunking); } /** From 048209a5764aa0b0a4a06b524959ade0b43293e7 Mon Sep 17 00:00:00 2001 From: ly-wang19 Date: Tue, 17 Mar 2026 16:35:03 +0800 Subject: [PATCH 3/3] fix(chunking): address PR review comments for Phase 1 - Fix isCodeBlockFence to detect both ``` and ~~~ fences - Fix getHeadingLevel to trim leading whitespace (add new function) - Fix findSafeSplitPoint punctuation search (single pass) - Update findSafeSplitPoint JSDoc to match actual behavior - Add splitLongLineByBytes for UTF-8 byte-aware splitting - Update simple-chunker to use byte-based splitting - Document preserveStructure as reserved for future use Co-Authored-By: Claude Opus 4.6 (1M context) --- src/memory/chunking/markdown-boundaries.ts | 133 +++++++++++++++++++-- src/memory/chunking/simple-chunker.ts | 9 +- 2 files changed, 128 insertions(+), 14 deletions(-) diff --git a/src/memory/chunking/markdown-boundaries.ts b/src/memory/chunking/markdown-boundaries.ts index 52c6dd6b579..60986549092 100644 --- a/src/memory/chunking/markdown-boundaries.ts +++ b/src/memory/chunking/markdown-boundaries.ts @@ -18,7 +18,7 @@ export function isHeading(line: string): boolean { */ export function isCodeBlockFence(line: string): boolean { const trimmed = line.trim(); - return /^```/.test(trimmed); + return /^```|^~~~/.test(trimmed); } /** @@ -71,7 +71,7 @@ export function isStructuralBoundary(line: string): boolean { * * @param line - The line to split * @param maxLength - Maximum length for the split - * @returns Index to split at, or -1 if no good split found + * @returns Index to split at (always a valid index between 0 and maxLength inclusive) */ export function findSafeSplitPoint(line: string, maxLength: number): number { if (line.length <= maxLength) { @@ -89,15 +89,11 @@ export function findSafeSplitPoint(line: string, maxLength: number): number { } // Priority 2: Common punctuation that can end a segment - const punctuation = [".", ",", ";", ":", "!", "?", ")", "]", "}"]; - for (const punct of punctuation) { - for (let i = searchEnd; i > 0; i--) { - if (line[i - 1] === punct) { - // Make sure it's not part of something like "..." or "::" - if (i < line.length && line[i] !== punct) { - return i; - } - } + // Single pass to find the rightmost punctuation of any type + const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]); + for (let i = searchEnd; i > 0; i--) { + if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) { + return i; } } @@ -171,3 +167,118 @@ export function getLineRole(line: string): LineRole { if (isThematicBreak(line)) return "thematic_break"; return "content"; } + +/** + * Get the heading level (1-6) from a heading line. + * Returns 0 if the line is not a heading. + */ +export function getHeadingLevel(line: string): number { + const trimmed = line.trimStart(); + const match = trimmed.match(/^#{1,6}\s/); + if (match) { + return match[0].trim().length; + } + return 0; +} + +/** + * Calculate UTF-8 byte size of a string. + */ +function estimateUtf8Bytes(text: string): number { + if (!text) { + return 0; + } + return Buffer.byteLength(text, "utf8"); +} + +/** + * Split a long line at safe points, respecting UTF-8 byte limits. + * + * This function is byte-aware and correctly handles multi-byte characters + * (CJK, emoji, etc.) that would otherwise exceed byte limits when using + * character-based splitting. + * + * @param line - The line to split + * @param maxBytes - Maximum UTF-8 bytes per segment + * @returns Array of line segments + */ +export function splitLongLineByBytes(line: string, maxBytes: number): string[] { + const totalBytes = estimateUtf8Bytes(line); + if (totalBytes <= maxBytes) { + return [line]; + } + + const segments: string[] = []; + let remaining = line; + + while (remaining.length > 0) { + const remainingBytes = estimateUtf8Bytes(remaining); + if (remainingBytes <= maxBytes) { + segments.push(remaining); + break; + } + + // Find the longest substring that fits within maxBytes + // UTF-16 code units are always <= UTF-8 bytes, so we can use character count + // as an upper bound for the binary search + let high = Math.min(remaining.length, maxBytes); + let low = 0; + let bestEnd = 0; + + // Binary search for the split point + while (low <= high) { + const mid = Math.floor((low + high) / 2); + const candidate = remaining.slice(0, mid); + const bytes = estimateUtf8Bytes(candidate); + + if (bytes <= maxBytes) { + bestEnd = mid; + low = mid + 1; + } else { + high = mid - 1; + } + } + + // Now try to find a better split point within the safe range + // Look for natural boundaries before bestEnd + const searchEnd = Math.min(bestEnd, remaining.length); + let splitPoint = bestEnd; + + // Priority 1: Space followed by non-space (word boundary) + for (let i = searchEnd; i > 0; i--) { + const testSegment = remaining.slice(0, i); + if (estimateUtf8Bytes(testSegment) > maxBytes) { + break; + } + if (remaining[i - 1] === " " && remaining[i] !== " ") { + splitPoint = i - 1; + break; + } + } + + // Priority 2: Common punctuation (only if space search didn't find a boundary) + if (splitPoint === bestEnd) { + const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]); + for (let i = searchEnd; i > 0; i--) { + const testSegment = remaining.slice(0, i); + if (estimateUtf8Bytes(testSegment) > maxBytes) { + break; + } + if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) { + splitPoint = i; + break; + } + } + } + + if (splitPoint <= 0) { + // Can't find a good split, force at bestEnd + splitPoint = Math.max(1, bestEnd); + } + + segments.push(remaining.slice(0, splitPoint).trimEnd()); + remaining = remaining.slice(splitPoint).trimStart(); + } + + return segments; +} diff --git a/src/memory/chunking/simple-chunker.ts b/src/memory/chunking/simple-chunker.ts index a14c4ee3330..8dba90b8a62 100644 --- a/src/memory/chunking/simple-chunker.ts +++ b/src/memory/chunking/simple-chunker.ts @@ -11,7 +11,7 @@ import { estimateUtf8Bytes } from "../embedding-input-limits.js"; import { hashText, type MemoryChunk } from "../internal.js"; import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js"; -import { splitLongLine } from "./markdown-boundaries.js"; +import { splitLongLineByBytes } from "./markdown-boundaries.js"; import { buildTextEmbeddingInput } from "../embedding-inputs.js"; /** @@ -30,6 +30,9 @@ export class SimpleChunker implements ChunkStrategy { readonly name = "simple"; chunk(content: string, config: ChunkingConfig): MemoryChunk[] { + // Note: config.preserveStructure is reserved for future Phase 2+ features. + // The SimpleChunker handles basic UTF-8 byte-aware chunking; use + // SemanticChunker for structure-aware splitting. const lines = content.split("\n"); // Handle empty content - split("\n") on empty string returns [""] if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) { @@ -119,9 +122,9 @@ export class SimpleChunker implements ChunkStrategy { return [entry]; } - // Split the long line at natural boundaries + // Split the long line at natural boundaries, respecting UTF-8 byte limits const maxTextBytes = maxBytes - 1; // Account for newline - const textSegments = splitLongLine(entry.line, maxTextBytes); + const textSegments = splitLongLineByBytes(entry.line, maxTextBytes); return textSegments.map((segment) => ({ line: segment,