diff --git a/src/agents/pi-embedded-runner/run/attempt.ts b/src/agents/pi-embedded-runner/run/attempt.ts index d785218f819..d2d3ddb0c2f 100644 --- a/src/agents/pi-embedded-runner/run/attempt.ts +++ b/src/agents/pi-embedded-runner/run/attempt.ts @@ -1680,16 +1680,20 @@ export async function runEmbeddedAttempt( }); const sessionLabel = params.sessionKey ?? params.sessionId; - const { bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles } = - await resolveBootstrapContextForRun({ - workspaceDir: effectiveWorkspace, - config: params.config, - sessionKey: params.sessionKey, - sessionId: params.sessionId, - warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }), - contextMode: params.bootstrapContextMode, - runKind: params.bootstrapContextRunKind, - }); + // Parallelize bootstrap context resolution and machine name lookup for better performance + const [{ bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles }, machineName] = + await Promise.all([ + resolveBootstrapContextForRun({ + workspaceDir: effectiveWorkspace, + config: params.config, + sessionKey: params.sessionKey, + sessionId: params.sessionId, + warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }), + contextMode: params.bootstrapContextMode, + runKind: params.bootstrapContextRunKind, + }), + getMachineDisplayName(), + ]); const bootstrapMaxChars = resolveBootstrapMaxChars(params.config); const bootstrapTotalMaxChars = resolveBootstrapTotalMaxChars(params.config); const bootstrapAnalysis = analyzeBootstrapBudget({ @@ -1827,7 +1831,6 @@ export async function runEmbeddedAttempt( }); logToolSchemasForGoogle({ tools: effectiveTools, provider: params.provider }); - const machineName = await getMachineDisplayName(); const runtimeChannel = normalizeMessageChannel(params.messageChannel ?? params.messageProvider); let runtimeCapabilities = runtimeChannel ? 
(resolveChannelCapabilities({ diff --git a/src/memory/chunking/chunk-strategy.ts b/src/memory/chunking/chunk-strategy.ts new file mode 100644 index 00000000000..9628aa35995 --- /dev/null +++ b/src/memory/chunking/chunk-strategy.ts @@ -0,0 +1,82 @@ +/** + * Chunking strategy interface and factory for memory segmentation. + * + * This module provides the abstraction layer for different chunking approaches, + * allowing for easy extension and configuration. + */ + +import type { MemoryChunk } from "../internal.js"; + +/** + * Configuration for chunking behavior. + */ +export type ChunkingConfig = { + /** + * Maximum chunk size in estimated UTF-8 bytes. + * This is a conservative upper bound for tokenizer output. + */ + maxBytes: number; + + /** + * Overlap between consecutive chunks in estimated UTF-8 bytes. + * Helps maintain context across chunk boundaries. + */ + overlapBytes: number; + + /** + * Whether to preserve markdown structure when splitting. + * When enabled, splits will prefer boundaries like headings, code blocks, etc. + */ + preserveStructure?: boolean; +}; + +/** + * Result of a chunking operation. + */ +export type ChunkingResult = { + chunks: MemoryChunk[]; + metadata: { + totalChunks: number; + totalBytes: number; + averageChunkBytes: number; + }; +}; + +/** + * Interface for chunking strategies. + */ +export interface ChunkStrategy { + /** + * The name of this strategy. + */ + readonly name: string; + + /** + * Chunk the given content according to the strategy. + * + * @param content - The text content to chunk + * @param config - Chunking configuration + * @returns Array of memory chunks + */ + chunk(content: string, config: ChunkingConfig): MemoryChunk[]; +} + +/** + * Default chunking configuration. 
+ */ +export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = { + maxBytes: 2000, // ~500 tokens at 4 bytes/token + overlapBytes: 200, // ~50 tokens overlap + preserveStructure: true, +}; + +/** + * Create a chunking config from legacy token-based settings. + */ +export function chunkingConfigFromTokens(tokens: number, overlap: number): ChunkingConfig { + return { + maxBytes: Math.max(32, tokens * 4), + overlapBytes: Math.max(0, overlap * 4), + preserveStructure: true, + }; +} diff --git a/src/memory/chunking/chunker.test.ts b/src/memory/chunking/chunker.test.ts new file mode 100644 index 00000000000..f95e945bdd5 --- /dev/null +++ b/src/memory/chunking/chunker.test.ts @@ -0,0 +1,331 @@ +/** + * Tests for the chunking module. + */ + +import { describe, it, expect } from "vitest"; +import { chunkMarkdown, simpleChunker } from "./simple-chunker.js"; +import { + findSafeSplitPoint, + splitLongLine, + isHeading, + isCodeBlockFence, + isListItem, + isEmptyLine, + isStructuralBoundary, + getLineRole, + type LineRole, +} from "./markdown-boundaries.js"; +import { chunkingConfigFromTokens, DEFAULT_CHUNKING_CONFIG } from "./chunk-strategy.js"; + +describe("chunking.markdownBoundaries", () => { + describe("isHeading", () => { + it("detects ATX headings", () => { + expect(isHeading("# Heading 1")).toBe(true); + expect(isHeading("## Heading 2")).toBe(true); + expect(isHeading("### Heading 3")).toBe(true); + expect(isHeading(" ## Indented heading")).toBe(true); + expect(isHeading("Not a heading")).toBe(false); + expect(isHeading("Seven#s is not a heading")).toBe(false); + }); + }); + + describe("isCodeBlockFence", () => { + it("detects code fences", () => { + expect(isCodeBlockFence("```typescript")).toBe(true); + expect(isCodeBlockFence("```js")).toBe(true); + expect(isCodeBlockFence(" ```python")).toBe(true); + expect(isCodeBlockFence("Not a fence")).toBe(false); + }); + }); + + describe("isListItem", () => { + it("detects list items", () => { + expect(isListItem("- 
item")).toBe(true); + expect(isListItem("* item")).toBe(true); + expect(isListItem("+ item")).toBe(true); + expect(isListItem("1. item")).toBe(true); + expect(isListItem(" - indented")).toBe(true); + expect(isListItem("not a list")).toBe(false); + }); + }); + + describe("isEmptyLine", () => { + it("detects empty lines", () => { + expect(isEmptyLine("")).toBe(true); + expect(isEmptyLine(" ")).toBe(true); + expect(isEmptyLine("\t")).toBe(true); + expect(isEmptyLine("content")).toBe(false); + }); + }); + + describe("isStructuralBoundary", () => { + it("identifies good split points", () => { + expect(isStructuralBoundary("")).toBe(true); + expect(isStructuralBoundary(" ")).toBe(true); + expect(isStructuralBoundary("# Heading")).toBe(true); + expect(isStructuralBoundary("---")).toBe(true); + expect(isStructuralBoundary("***")).toBe(true); + expect(isStructuralBoundary("regular content")).toBe(false); + }); + }); + + describe("findSafeSplitPoint", () => { + it("returns full length when line is short enough", () => { + expect(findSafeSplitPoint("short", 100)).toBe(5); + }); + + it("splits at word boundaries when possible", () => { + const line = "hello world foo bar baz"; + expect(findSafeSplitPoint(line, 15)).toBe(11); // After "world " + }); + + it("splits at punctuation when no spaces available", () => { + const line = "hello,world,foo,bar"; + // Algorithm finds the split point closest to maxLength + // With maxLength=12, it finds the comma at index 11 (",") + // and returns 12 (after the comma) + expect(findSafeSplitPoint(line, 12)).toBe(12); + }); + + it("falls back to hard split when needed", () => { + const line = "averylongwordwithnoboundaries"; + expect(findSafeSplitPoint(line, 10)).toBe(10); + }); + + it("handles edge cases", () => { + expect(findSafeSplitPoint("", 10)).toBe(0); + expect(findSafeSplitPoint("a", 0)).toBe(0); + }); + }); + + describe("splitLongLine", () => { + it("returns single segment for short lines", () => { + expect(splitLongLine("short", 
100)).toEqual(["short"]); + }); + + it("splits at word boundaries", () => { + const line = "hello world foo bar baz"; + const result = splitLongLine(line, 15); + expect(result.length).toBeGreaterThan(1); + expect(result[0]).toBe("hello world"); + expect(result[1]).toBe("foo bar baz"); + }); + + it("trims whitespace from split segments", () => { + const line = "hello world foo bar"; + const result = splitLongLine(line, 10); + result.forEach((seg) => { + expect(seg).not.toMatch(/^\s/); + expect(seg).not.toMatch(/\s$/); + }); + }); + }); + + describe("getLineRole", () => { + const roles: LineRole[] = [ + "empty", + "heading", + "code_fence", + "list_item", + "blockquote", + "thematic_break", + "content", + ]; + + it("classifies all line types correctly", () => { + expect(getLineRole("")).toBe("empty"); + expect(getLineRole("# Heading")).toBe("heading"); + expect(getLineRole("```ts")).toBe("code_fence"); + expect(getLineRole("- item")).toBe("list_item"); + expect(getLineRole("> quote")).toBe("blockquote"); + expect(getLineRole("---")).toBe("thematic_break"); + expect(getLineRole("regular content")).toBe("content"); + }); + + it("returns one of the valid roles", () => { + const role = getLineRole("some line"); + expect(roles).toContain(role); + }); + }); +}); + +describe("chunking.chunkStrategy", () => { + describe("chunkingConfigFromTokens", () => { + it("converts token-based config to byte-based", () => { + const config = chunkingConfigFromTokens(500, 50); + expect(config.maxBytes).toBe(2000); // 500 * 4 + expect(config.overlapBytes).toBe(200); // 50 * 4 + expect(config.preserveStructure).toBe(true); + }); + + it("handles zero overlap", () => { + const config = chunkingConfigFromTokens(500, 0); + expect(config.overlapBytes).toBe(0); + }); + + it("enforces minimum maxBytes", () => { + const config = chunkingConfigFromTokens(1, 0); + expect(config.maxBytes).toBe(32); // Minimum + }); + }); + + describe("DEFAULT_CHUNKING_CONFIG", () => { + it("has sensible defaults", () 
=> { + expect(DEFAULT_CHUNKING_CONFIG.maxBytes).toBe(2000); + expect(DEFAULT_CHUNKING_CONFIG.overlapBytes).toBe(200); + expect(DEFAULT_CHUNKING_CONFIG.preserveStructure).toBe(true); + }); + }); +}); + +describe("chunking.simpleChunker", () => { + describe("chunkMarkdown", () => { + it("handles empty content", () => { + const chunks = chunkMarkdown("", { tokens: 500, overlap: 50 }); + expect(chunks).toEqual([]); + }); + + it("handles single line content", () => { + const content = "single line"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.text).toBe(content); + }); + + it("handles content within max size", () => { + const content = "line 1\nline 2\nline 3"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.text).toBe(content); + }); + + it("splits content that exceeds max size", () => { + const content = Array(100).fill("a line of text").join("\n"); + const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 }); + expect(chunks.length).toBeGreaterThan(1); + }); + + it("includes overlap between chunks", () => { + // Create content that will split into 2 chunks with overlap + const lines = Array(20).fill("a line"); + const content = lines.join("\n"); + // Smaller maxBytes and larger overlap to ensure overlap occurs + const chunks = chunkMarkdown(content, { tokens: 6, overlap: 4 }); + + expect(chunks.length).toBeGreaterThan(1); + + // Check that there's some overlap in line numbers + const firstEndLine = chunks[0]?.endLine ?? 0; + const secondStartLine = chunks[1]?.startLine ?? 
Infinity; + expect(secondStartLine).toBeLessThan(firstEndLine); + }); + + it("preserves line numbers correctly", () => { + const content = "line 1\nline 2\nline 3\nline 4"; + const chunks = chunkMarkdown(content, { tokens: 2, overlap: 0 }); + + for (let i = 0; i < chunks.length; i += 1) { + const chunk = chunks[i]; + expect(chunk).toBeDefined(); + expect(chunk?.startLine).toBeGreaterThan(0); + expect(chunk?.endLine).toBeGreaterThanOrEqual(chunk?.startLine ?? 0); + + // Verify line numbers are sequential and increasing + if (i > 0 && chunk) { + const prevChunk = chunks[i - 1]; + expect(chunk.startLine).toBeGreaterThan(prevChunk?.endLine ?? 0); + } + } + }); + + it("generates correct hashes", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks[0]?.hash).toBeDefined(); + expect(chunks[0]?.hash.length).toBeGreaterThan(0); + }); + + it("includes embeddingInput", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 }); + expect(chunks[0]?.embeddingInput).toEqual({ + text: content, + }); + }); + + it("handles legacy config format", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { + tokens: 500, + overlap: 50, + }); + expect(chunks.length).toBe(1); + }); + + it("handles new byte-based config format", () => { + const content = "test content"; + const chunks = chunkMarkdown(content, { + maxBytes: 2000, + overlapBytes: 200, + }); + expect(chunks.length).toBe(1); + }); + + it("splits long lines at natural boundaries", () => { + // Create a line that's too long but has word boundaries + const words = Array(50).fill("word"); + const content = words.join(" "); // All on one line + + const chunks = chunkMarkdown(content, { tokens: 10, overlap: 0 }); + expect(chunks.length).toBeGreaterThan(0); + + // Check that chunks don't have trailing/leading spaces from splits + for (const chunk of chunks) { + const lines = 
chunk.text.split("\n"); + for (const line of lines) { + // Internal newlines from splits shouldn't have odd whitespace + if (line.trim().length > 0) { + expect(line).not.toMatch(/^\s/); + expect(line).not.toMatch(/\s$/); + } + } + } + }); + + it("handles mixed short and long lines", () => { + const content = [ + "short line", + "another short", + Array(100).fill("x").join(""), // long line without spaces + "back to short", + "another one", + ].join("\n"); + + const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 }); + expect(chunks.length).toBeGreaterThan(0); + + // Verify all chunks have proper structure + for (const chunk of chunks) { + expect(chunk.startLine).toBeGreaterThan(0); + expect(chunk.endLine).toBeGreaterThanOrEqual(chunk.startLine); + expect(chunk.hash).toBeDefined(); + } + }); + }); + + describe("simpleChunker", () => { + it("uses the SimpleChunker instance", () => { + expect(simpleChunker.name).toBe("simple"); + }); + + it("chunks content via the interface", () => { + const content = "line 1\nline 2\nline 3"; + const chunks = simpleChunker.chunk(content, { + maxBytes: 1000, + overlapBytes: 100, + }); + expect(chunks.length).toBe(1); + expect(chunks[0]?.text).toBe(content); + }); + }); +}); diff --git a/src/memory/chunking/index.ts b/src/memory/chunking/index.ts new file mode 100644 index 00000000000..db87c29bfe8 --- /dev/null +++ b/src/memory/chunking/index.ts @@ -0,0 +1,21 @@ +/** + * Chunking module for memory segmentation. + * + * This module provides a modular, extensible approach to chunking + * markdown content for memory indexing. 
+ * + * @example + * ```ts + * import { chunkMarkdown } from "./chunking"; + * + * const chunks = chunkMarkdown(content, { + * maxBytes: 2000, + * overlapBytes: 200, + * }); + * ``` + */ + +// Re-export for convenience +export * from "./chunk-strategy.js"; +export * from "./simple-chunker.js"; +export * from "./markdown-boundaries.js"; diff --git a/src/memory/chunking/markdown-boundaries.ts b/src/memory/chunking/markdown-boundaries.ts new file mode 100644 index 00000000000..60986549092 --- /dev/null +++ b/src/memory/chunking/markdown-boundaries.ts @@ -0,0 +1,284 @@ +/** + * Utilities for detecting markdown structure and finding natural split points. + * + * These helpers identify safe places to split markdown content while + * preserving semantic structure. + */ + +/** + * Check if a line is a markdown heading. + */ +export function isHeading(line: string): boolean { + const trimmed = line.trimStart(); + return /^#{1,6}\s/.test(trimmed); +} + +/** + * Check if a line starts or ends a code block fence. + */ +export function isCodeBlockFence(line: string): boolean { + const trimmed = line.trim(); + return /^```|^~~~/.test(trimmed); +} + +/** + * Check if a line is a list item. + */ +export function isListItem(line: string): boolean { + const trimmed = line.trimStart(); + return /^[-*+]\s|^\d+\.\s/.test(trimmed); +} + +/** + * Check if a line is a blockquote. + */ +export function isBlockquote(line: string): boolean { + const trimmed = line.trimStart(); + return trimmed.startsWith(">"); +} + +/** + * Check if a line is empty (whitespace only). + */ +export function isEmptyLine(line: string): boolean { + return line.trim().length === 0; +} + +/** + * Check if a line is a thematic break (horizontal rule). + */ +export function isThematicBreak(line: string): boolean { + const trimmed = line.trim(); + return /^[-*_]{3,}\s*$/.test(trimmed); +} + +/** + * Determine if a line is a structural boundary (good place to split). 
/**
 * Determine if a line is a structural boundary (good place to split).
 *
 * A line qualifies when it is blank, a heading, or a thematic break.
 *
 * @param line - A single line of markdown (no trailing newline)
 * @returns `true` when splitting at this line keeps the structure intact
 */
export function isStructuralBoundary(line: string): boolean {
  return isEmptyLine(line) || isHeading(line) || isThematicBreak(line);
}

/** Punctuation after which a segment may end when no word boundary is available. */
const SEGMENT_END_PUNCTUATION: ReadonlySet<string> = new Set([
  ".",
  ",",
  ";",
  ":",
  "!",
  "?",
  ")",
  "]",
  "}",
]);

/** Single-whitespace matcher; non-global, so `test()` keeps no state between calls. */
const WHITESPACE_CHAR = /\s/;

/**
 * Report whether cutting `text` at `index` would separate a UTF-16 surrogate pair
 * (high surrogate immediately before the cut, low surrogate immediately after).
 */
function splitsSurrogatePair(text: string, index: number): boolean {
  if (index <= 0 || index >= text.length) {
    return false;
  }
  const before = text.charCodeAt(index - 1);
  const after = text.charCodeAt(index);
  return before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
}

/**
 * Find a safe split point within a long line.
 *
 * Candidates are searched backwards from `maxLength`, in this order:
 * 1. a space followed by a non-space (the split lands before the space),
 * 2. one of `. , ; : ! ? ) ] }` not followed by the same character
 *    (the split lands after the punctuation),
 * 3. any other whitespace character followed by non-whitespace,
 * 4. a hard split at `maxLength`, moved back by one code unit when it would
 *    otherwise cut a surrogate pair in half.
 *
 * The result can be `0` when the only boundary found sits at the very start of
 * the line; callers should treat that as "no useful split found".
 *
 * @param line - The line to split
 * @param maxLength - Maximum length (UTF-16 code units) for the leading part
 * @returns Index to split at, always an integer between 0 and `maxLength` inclusive
 */
export function findSafeSplitPoint(line: string, maxLength: number): number {
  if (line.length <= maxLength) {
    return line.length;
  }

  // Non-positive, NaN, or sub-1 budgets leave no room for any prefix. Without this
  // guard the function used to hand back the raw (negative / NaN) budget.
  const limit = Math.floor(maxLength);
  if (!(limit > 0)) {
    return 0;
  }

  // Priority 1: space followed by non-space (word boundary).
  for (let i = limit; i > 0; i--) {
    if (line.charAt(i - 1) === " " && line.charAt(i) !== " ") {
      return i - 1;
    }
  }

  // Priority 2: rightmost punctuation that can end a segment. `limit < line.length`
  // here, so `charAt(i)` always addresses a real character.
  for (let i = limit; i > 0; i--) {
    const previous = line.charAt(i - 1);
    if (SEGMENT_END_PUNCTUATION.has(previous) && line.charAt(i) !== previous) {
      return i;
    }
  }

  // Priority 3: any whitespace character (tabs etc.) followed by non-whitespace.
  for (let i = limit; i > 0; i--) {
    if (WHITESPACE_CHAR.test(line.charAt(i - 1)) && !WHITESPACE_CHAR.test(line.charAt(i))) {
      return i - 1;
    }
  }

  // Fallback: hard split, kept off the middle of a surrogate pair when there is room.
  return limit > 1 && splitsSurrogatePair(line, limit) ? limit - 1 : limit;
}

/**
 * Split a long line at safe points.
 *
 * Every segment except possibly the first has no leading whitespace, and no
 * segment produced by a split has trailing whitespace. Segments that would be
 * empty after trimming are dropped.
 *
 * A `maxLength` below 1 is treated as 1 so the loop always makes progress
 * (previously such a value never terminated). A segment can exceed `maxLength`
 * by one code unit only when `maxLength` is 1 and the next character is a
 * surrogate pair, because the pair is kept intact.
 *
 * @param line - The line to split
 * @param maxLength - Maximum length (UTF-16 code units) per segment
 * @returns Array of line segments; never empty
 */
export function splitLongLine(line: string, maxLength: number): string[] {
  // Written as a negated `>` so that a NaN budget returns the line untouched.
  if (!(line.length > maxLength)) {
    return [line];
  }

  const limit = Math.max(1, Math.floor(maxLength));
  const segments: string[] = [];
  let remaining = line;

  while (remaining.length > limit) {
    let cut = findSafeSplitPoint(remaining, limit);
    if (cut <= 0) {
      // The only boundary was at index 0; force a split so we keep moving.
      cut = limit;
    }
    if (splitsSurrogatePair(remaining, cut)) {
      // Prefer a shorter segment; overshoot by one unit only when nothing else fits.
      cut = cut > 1 ? cut - 1 : cut + 1;
    }

    const head = remaining.slice(0, cut).trimEnd();
    if (head.length > 0) {
      segments.push(head);
    }
    // `cut >= 1`, so `remaining` strictly shrinks on every iteration.
    remaining = remaining.slice(cut).trimStart();
  }

  if (remaining.length > 0) {
    segments.push(remaining);
  }

  // A whitespace-only line still yields one (empty) segment for the caller.
  return segments.length > 0 ? segments : [""];
}

/**
 * Structural role a single line can play in a markdown document.
 */
export type LineRole =
  | "empty"
  | "heading"
  | "code_fence"
  | "list_item"
  | "blockquote"
  | "thematic_break"
  | "content";

/**
 * Get the structural role of a line.
 *
 * Checks run in a fixed order and the first match wins: empty, heading,
 * code fence, list item, blockquote, thematic break; anything else is
 * `"content"`.
 *
 * @param line - A single line of markdown
 * @returns The first matching role
 */
export function getLineRole(line: string): LineRole {
  if (isEmptyLine(line)) {
    return "empty";
  }
  if (isHeading(line)) {
    return "heading";
  }
  if (isCodeBlockFence(line)) {
    return "code_fence";
  }
  if (isListItem(line)) {
    return "list_item";
  }
  if (isBlockquote(line)) {
    return "blockquote";
  }
  if (isThematicBreak(line)) {
    return "thematic_break";
  }
  return "content";
}

/**
 * Get the heading level (1-6) from a heading line.
 *
 * Leading whitespace is ignored; the `#` run must be followed by whitespace.
 *
 * @param line - A single line of markdown
 * @returns The number of leading `#` characters, or 0 if the line is not a heading
 */
export function getHeadingLevel(line: string): number {
  const marker = /^(#{1,6})\s/.exec(line.trimStart());
  return marker?.[1]?.length ?? 0;
}

/**
 * Calculate UTF-8 byte size of a string.
 *
 * @param text - Text to measure
 * @returns Number of bytes `text` occupies when encoded as UTF-8 (0 for an empty string)
 */
function estimateUtf8Bytes(text: string): number {
  if (!text) {
    return 0;
  }
  return Buffer.byteLength(text, "utf8");
}
+ * + * This function is byte-aware and correctly handles multi-byte characters + * (CJK, emoji, etc.) that would otherwise exceed byte limits when using + * character-based splitting. + * + * @param line - The line to split + * @param maxBytes - Maximum UTF-8 bytes per segment + * @returns Array of line segments + */ +export function splitLongLineByBytes(line: string, maxBytes: number): string[] { + const totalBytes = estimateUtf8Bytes(line); + if (totalBytes <= maxBytes) { + return [line]; + } + + const segments: string[] = []; + let remaining = line; + + while (remaining.length > 0) { + const remainingBytes = estimateUtf8Bytes(remaining); + if (remainingBytes <= maxBytes) { + segments.push(remaining); + break; + } + + // Find the longest substring that fits within maxBytes + // UTF-16 code units are always <= UTF-8 bytes, so we can use character count + // as an upper bound for the binary search + let high = Math.min(remaining.length, maxBytes); + let low = 0; + let bestEnd = 0; + + // Binary search for the split point + while (low <= high) { + const mid = Math.floor((low + high) / 2); + const candidate = remaining.slice(0, mid); + const bytes = estimateUtf8Bytes(candidate); + + if (bytes <= maxBytes) { + bestEnd = mid; + low = mid + 1; + } else { + high = mid - 1; + } + } + + // Now try to find a better split point within the safe range + // Look for natural boundaries before bestEnd + const searchEnd = Math.min(bestEnd, remaining.length); + let splitPoint = bestEnd; + + // Priority 1: Space followed by non-space (word boundary) + for (let i = searchEnd; i > 0; i--) { + const testSegment = remaining.slice(0, i); + if (estimateUtf8Bytes(testSegment) > maxBytes) { + break; + } + if (remaining[i - 1] === " " && remaining[i] !== " ") { + splitPoint = i - 1; + break; + } + } + + // Priority 2: Common punctuation (only if space search didn't find a boundary) + if (splitPoint === bestEnd) { + const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]); + 
for (let i = searchEnd; i > 0; i--) { + const testSegment = remaining.slice(0, i); + if (estimateUtf8Bytes(testSegment) > maxBytes) { + break; + } + if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) { + splitPoint = i; + break; + } + } + } + + if (splitPoint <= 0) { + // Can't find a good split, force at bestEnd + splitPoint = Math.max(1, bestEnd); + } + + segments.push(remaining.slice(0, splitPoint).trimEnd()); + remaining = remaining.slice(splitPoint).trimStart(); + } + + return segments; +} diff --git a/src/memory/chunking/simple-chunker.ts b/src/memory/chunking/simple-chunker.ts new file mode 100644 index 00000000000..8dba90b8a62 --- /dev/null +++ b/src/memory/chunking/simple-chunker.ts @@ -0,0 +1,162 @@ +/** + * Simple chunking strategy with improved boundary detection. + * + * This is the default chunker that: + * 1. Uses UTF-8 byte estimation for consistent sizing + * 2. Preserves line structure when possible + * 3. Splits long lines at natural boundaries (spaces, punctuation) + * 4. Maintains overlap using complete lines/semantic units + */ + +import { estimateUtf8Bytes } from "../embedding-input-limits.js"; +import { hashText, type MemoryChunk } from "../internal.js"; +import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js"; +import { splitLongLineByBytes } from "./markdown-boundaries.js"; +import { buildTextEmbeddingInput } from "../embedding-inputs.js"; + +/** + * Represents a single line with its metadata. + */ +type LineEntry = { + line: string; + lineNo: number; + byteSize: number; +}; + +/** + * Simple chunker implementation. + */ +export class SimpleChunker implements ChunkStrategy { + readonly name = "simple"; + + chunk(content: string, config: ChunkingConfig): MemoryChunk[] { + // Note: config.preserveStructure is reserved for future Phase 2+ features. + // The SimpleChunker handles basic UTF-8 byte-aware chunking; use + // SemanticChunker for structure-aware splitting. 
+ const lines = content.split("\n"); + // Handle empty content - split("\n") on empty string returns [""] + if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) { + return []; + } + + // Pre-compute byte sizes for all lines + const lineEntries: LineEntry[] = lines.map((line, i) => ({ + line, + lineNo: i + 1, + byteSize: estimateUtf8Bytes(line) + 1, // +1 for newline + })); + + const chunks: MemoryChunk[] = []; + let currentEntries: LineEntry[] = []; + let currentBytes = 0; + + const flush = () => { + if (currentEntries.length === 0) { + return; + } + const firstEntry = currentEntries[0]!; + const lastEntry = currentEntries[currentEntries.length - 1]!; + const text = currentEntries.map((e) => e.line).join("\n"); + chunks.push({ + startLine: firstEntry.lineNo, + endLine: lastEntry.lineNo, + text, + hash: hashText(text), + embeddingInput: buildTextEmbeddingInput(text), + }); + }; + + const carryOverlap = () => { + const { overlapBytes } = config; + if (overlapBytes <= 0 || currentEntries.length === 0) { + currentEntries = []; + currentBytes = 0; + return; + } + + // Keep whole lines from the end until we exceed overlap + let acc = 0; + const kept: LineEntry[] = []; + for (let i = currentEntries.length - 1; i >= 0; i -= 1) { + const entry = currentEntries[i]!; + if (acc + entry.byteSize > overlapBytes && kept.length > 0) { + break; + } + kept.unshift(entry); + acc += entry.byteSize; + } + + currentEntries = kept; + currentBytes = acc; + }; + + for (const entry of lineEntries) { + // Handle long lines by splitting them at natural boundaries + const segments = this.splitEntryIfNeeded(entry, config.maxBytes); + + for (const segment of segments) { + const wouldExceed = + currentBytes + segment.byteSize > config.maxBytes && currentEntries.length > 0; + + if (wouldExceed) { + flush(); + carryOverlap(); + } + + currentEntries.push(segment); + currentBytes += segment.byteSize; + } + } + + flush(); + return chunks; + } + + /** + * Split a line entry if it exceeds 
the max bytes. + * Returns an array of entries (original or split). + */ + private splitEntryIfNeeded(entry: LineEntry, maxBytes: number): LineEntry[] { + // If the line fits, return as-is + if (entry.byteSize <= maxBytes) { + return [entry]; + } + + // Split the long line at natural boundaries, respecting UTF-8 byte limits + const maxTextBytes = maxBytes - 1; // Account for newline + const textSegments = splitLongLineByBytes(entry.line, maxTextBytes); + + return textSegments.map((segment) => ({ + line: segment, + lineNo: entry.lineNo, + byteSize: estimateUtf8Bytes(segment) + 1, + })); + } +} + +/** + * Singleton instance for convenience. + */ +export const simpleChunker = new SimpleChunker(); + +/** + * Chunk markdown content using the simple strategy. + * + * This is the main export that replaces the legacy chunkMarkdown function. + * + * @param content - The markdown content to chunk + * @param config - Chunking configuration (or legacy { tokens, overlap } object) + * @returns Array of memory chunks + */ +export function chunkMarkdown( + content: string, + config: ChunkingConfig | { tokens: number; overlap: number }, +): MemoryChunk[] { + // Support legacy config format + const chunkingConfig = + "maxBytes" in config + ? 
config + : { maxBytes: Math.max(32, config.tokens * 4), overlapBytes: Math.max(0, config.overlap * 4) }; + + return simpleChunker.chunk(content, chunkingConfig); +} diff --git a/src/memory/internal.ts b/src/memory/internal.ts index d1d7e9c2e96..3ccee629014 100644 --- a/src/memory/internal.ts +++ b/src/memory/internal.ts @@ -331,88 +331,13 @@ export async function buildMultimodalChunkForIndexing( }; } +import { chunkMarkdown as chunkMarkdownImpl } from "./chunking/simple-chunker.js"; + export function chunkMarkdown( content: string, chunking: { tokens: number; overlap: number }, ): MemoryChunk[] { - const lines = content.split("\n"); - if (lines.length === 0) { - return []; - } - const maxChars = Math.max(32, chunking.tokens * 4); - const overlapChars = Math.max(0, chunking.overlap * 4); - const chunks: MemoryChunk[] = []; - - let current: Array<{ line: string; lineNo: number }> = []; - let currentChars = 0; - - const flush = () => { - if (current.length === 0) { - return; - } - const firstEntry = current[0]; - const lastEntry = current[current.length - 1]; - if (!firstEntry || !lastEntry) { - return; - } - const text = current.map((entry) => entry.line).join("\n"); - const startLine = firstEntry.lineNo; - const endLine = lastEntry.lineNo; - chunks.push({ - startLine, - endLine, - text, - hash: hashText(text), - embeddingInput: buildTextEmbeddingInput(text), - }); - }; - - const carryOverlap = () => { - if (overlapChars <= 0 || current.length === 0) { - current = []; - currentChars = 0; - return; - } - let acc = 0; - const kept: Array<{ line: string; lineNo: number }> = []; - for (let i = current.length - 1; i >= 0; i -= 1) { - const entry = current[i]; - if (!entry) { - continue; - } - acc += entry.line.length + 1; - kept.unshift(entry); - if (acc >= overlapChars) { - break; - } - } - current = kept; - currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0); - }; - - for (let i = 0; i < lines.length; i += 1) { - const line = lines[i] ?? 
""; - const lineNo = i + 1; - const segments: string[] = []; - if (line.length === 0) { - segments.push(""); - } else { - for (let start = 0; start < line.length; start += maxChars) { - segments.push(line.slice(start, start + maxChars)); - } - } - for (const segment of segments) { - const lineSize = segment.length + 1; - if (currentChars + lineSize > maxChars && current.length > 0) { - flush(); - carryOverlap(); - } - current.push({ line: segment, lineNo }); - currentChars += lineSize; - } - } - flush(); - return chunks; + return chunkMarkdownImpl(content, chunking); } /** diff --git a/src/memory/manager.ts b/src/memory/manager.ts index 93a2332c9a9..132b316ec61 100644 --- a/src/memory/manager.ts +++ b/src/memory/manager.ts @@ -337,13 +337,17 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem } // If FTS isn't available, hybrid mode cannot use keyword search; degrade to vector-only. - const keywordResults = + // Parallelize keyword search (local FTS) and embedding generation (network call) for better performance + const [keywordResults, { queryVec, hasVector }] = await Promise.all([ hybrid.enabled && this.fts.enabled && this.fts.available - ? await this.searchKeyword(cleaned, candidates).catch(() => []) - : []; - - const queryVec = await this.embedQueryWithTimeout(cleaned); - const hasVector = queryVec.some((v) => v !== 0); + ? this.searchKeyword(cleaned, candidates).catch(() => []) + : [], + (async () => { + const queryVec = await this.embedQueryWithTimeout(cleaned); + const hasVector = queryVec.some((v) => v !== 0); + return { queryVec, hasVector }; + })(), + ]); const vectorResults = hasVector ? await this.searchVector(queryVec, candidates).catch(() => []) : [];