Merge 016eca1f0fb311cad335e5a8da33bfe6a5504156 into 598f1826d8b2bc969aace2c6459824737667218c

This commit is contained in:
ly-wang19 2026-03-20 20:20:22 -07:00 committed by GitHub
commit 599e72253f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 907 additions and 95 deletions

View File

@ -1680,16 +1680,20 @@ export async function runEmbeddedAttempt(
});
const sessionLabel = params.sessionKey ?? params.sessionId;
const { bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles } =
await resolveBootstrapContextForRun({
workspaceDir: effectiveWorkspace,
config: params.config,
sessionKey: params.sessionKey,
sessionId: params.sessionId,
warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }),
contextMode: params.bootstrapContextMode,
runKind: params.bootstrapContextRunKind,
});
// Parallelize bootstrap context resolution and machine name lookup for better performance
const [{ bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles }, machineName] =
await Promise.all([
resolveBootstrapContextForRun({
workspaceDir: effectiveWorkspace,
config: params.config,
sessionKey: params.sessionKey,
sessionId: params.sessionId,
warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }),
contextMode: params.bootstrapContextMode,
runKind: params.bootstrapContextRunKind,
}),
getMachineDisplayName(),
]);
const bootstrapMaxChars = resolveBootstrapMaxChars(params.config);
const bootstrapTotalMaxChars = resolveBootstrapTotalMaxChars(params.config);
const bootstrapAnalysis = analyzeBootstrapBudget({
@ -1827,7 +1831,6 @@ export async function runEmbeddedAttempt(
});
logToolSchemasForGoogle({ tools: effectiveTools, provider: params.provider });
const machineName = await getMachineDisplayName();
const runtimeChannel = normalizeMessageChannel(params.messageChannel ?? params.messageProvider);
let runtimeCapabilities = runtimeChannel
? (resolveChannelCapabilities({

View File

@ -0,0 +1,82 @@
/**
* Chunking strategy interface and factory for memory segmentation.
*
* This module provides the abstraction layer for different chunking approaches,
* allowing for easy extension and configuration.
*/
import type { MemoryChunk } from "../internal.js";
/**
 * Configuration for chunking behavior.
 *
 * Sizes are expressed in estimated UTF-8 bytes rather than tokens so the
 * chunker can bound output size without a tokenizer dependency.
 */
export type ChunkingConfig = {
  /**
   * Maximum chunk size in estimated UTF-8 bytes.
   * This is a conservative upper bound for tokenizer output.
   */
  maxBytes: number;
  /**
   * Overlap between consecutive chunks in estimated UTF-8 bytes.
   * Helps maintain context across chunk boundaries.
   */
  overlapBytes: number;
  /**
   * Whether to preserve markdown structure when splitting.
   * When enabled, splits will prefer boundaries like headings, code blocks, etc.
   * NOTE(review): currently advisory — SimpleChunker documents it as reserved
   * for a future structure-aware strategy; confirm before relying on it.
   */
  preserveStructure?: boolean;
};
/**
 * Result of a chunking operation.
 *
 * NOTE(review): not consumed by the simple chunker in this change set;
 * presumably intended for richer strategies — confirm intended use.
 */
export type ChunkingResult = {
  chunks: MemoryChunk[];
  metadata: {
    totalChunks: number;
    totalBytes: number;
    averageChunkBytes: number;
  };
};
/**
 * Interface for chunking strategies.
 */
export interface ChunkStrategy {
  /**
   * The name of this strategy (used for identification/diagnostics).
   */
  readonly name: string;
  /**
   * Chunk the given content according to the strategy.
   *
   * @param content - The text content to chunk
   * @param config - Chunking configuration
   * @returns Array of memory chunks
   */
  chunk(content: string, config: ChunkingConfig): MemoryChunk[];
}
/**
 * Default chunking configuration.
 * Byte figures assume the conservative 4-bytes-per-token estimate used by
 * chunkingConfigFromTokens.
 */
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
  maxBytes: 2000, // ~500 tokens at 4 bytes/token
  overlapBytes: 200, // ~50 tokens overlap
  preserveStructure: true,
};
/**
 * Create a chunking config from legacy token-based settings.
 *
 * Tokens are converted with a conservative 4-bytes-per-token estimate;
 * `maxBytes` is floored at 32 bytes and `overlapBytes` at 0.
 *
 * @param tokens - Legacy maximum chunk size in tokens
 * @param overlap - Legacy overlap between chunks in tokens
 * @returns Byte-based chunking configuration with structure preservation on
 */
export function chunkingConfigFromTokens(tokens: number, overlap: number): ChunkingConfig {
  const BYTES_PER_TOKEN = 4;
  const MIN_CHUNK_BYTES = 32;
  const maxBytes = Math.max(MIN_CHUNK_BYTES, tokens * BYTES_PER_TOKEN);
  const overlapBytes = Math.max(0, overlap * BYTES_PER_TOKEN);
  return { maxBytes, overlapBytes, preserveStructure: true };
}

View File

@ -0,0 +1,331 @@
/**
* Tests for the chunking module.
*/
import { describe, it, expect } from "vitest";
import { chunkMarkdown, simpleChunker } from "./simple-chunker.js";
import {
findSafeSplitPoint,
splitLongLine,
isHeading,
isCodeBlockFence,
isListItem,
isEmptyLine,
isStructuralBoundary,
getLineRole,
type LineRole,
} from "./markdown-boundaries.js";
import { chunkingConfigFromTokens, DEFAULT_CHUNKING_CONFIG } from "./chunk-strategy.js";
// Pure-function classification of markdown lines (no I/O, no state).
describe("chunking.markdownBoundaries", () => {
  describe("isHeading", () => {
    it("detects ATX headings", () => {
      expect(isHeading("# Heading 1")).toBe(true);
      expect(isHeading("## Heading 2")).toBe(true);
      expect(isHeading("### Heading 3")).toBe(true);
      expect(isHeading(" ## Indented heading")).toBe(true);
      expect(isHeading("Not a heading")).toBe(false);
      expect(isHeading("Seven#s is not a heading")).toBe(false);
    });
  });
  describe("isCodeBlockFence", () => {
    it("detects code fences", () => {
      expect(isCodeBlockFence("```typescript")).toBe(true);
      expect(isCodeBlockFence("```js")).toBe(true);
      expect(isCodeBlockFence(" ```python")).toBe(true);
      expect(isCodeBlockFence("Not a fence")).toBe(false);
    });
  });
  describe("isListItem", () => {
    it("detects list items", () => {
      expect(isListItem("- item")).toBe(true);
      expect(isListItem("* item")).toBe(true);
      expect(isListItem("+ item")).toBe(true);
      expect(isListItem("1. item")).toBe(true);
      expect(isListItem(" - indented")).toBe(true);
      expect(isListItem("not a list")).toBe(false);
    });
  });
  describe("isEmptyLine", () => {
    it("detects empty lines", () => {
      expect(isEmptyLine("")).toBe(true);
      expect(isEmptyLine(" ")).toBe(true);
      expect(isEmptyLine("\t")).toBe(true);
      expect(isEmptyLine("content")).toBe(false);
    });
  });
  describe("isStructuralBoundary", () => {
    it("identifies good split points", () => {
      expect(isStructuralBoundary("")).toBe(true);
      expect(isStructuralBoundary(" ")).toBe(true);
      expect(isStructuralBoundary("# Heading")).toBe(true);
      expect(isStructuralBoundary("---")).toBe(true);
      expect(isStructuralBoundary("***")).toBe(true);
      expect(isStructuralBoundary("regular content")).toBe(false);
    });
  });
  describe("findSafeSplitPoint", () => {
    it("returns full length when line is short enough", () => {
      expect(findSafeSplitPoint("short", 100)).toBe(5);
    });
    it("splits at word boundaries when possible", () => {
      const line = "hello world foo bar baz";
      // 11 is the index of the space after "world"; the segment before the
      // split is "hello world" (the space itself is excluded).
      expect(findSafeSplitPoint(line, 15)).toBe(11);
    });
    it("splits at punctuation when no spaces available", () => {
      const line = "hello,world,foo,bar";
      // Algorithm finds the split point closest to maxLength
      // With maxLength=12, it finds the comma at index 11 (",")
      // and returns 12 (after the comma)
      expect(findSafeSplitPoint(line, 12)).toBe(12);
    });
    it("falls back to hard split when needed", () => {
      const line = "averylongwordwithnoboundaries";
      expect(findSafeSplitPoint(line, 10)).toBe(10);
    });
    it("handles edge cases", () => {
      expect(findSafeSplitPoint("", 10)).toBe(0);
      expect(findSafeSplitPoint("a", 0)).toBe(0);
    });
  });
  describe("splitLongLine", () => {
    it("returns single segment for short lines", () => {
      expect(splitLongLine("short", 100)).toEqual(["short"]);
    });
    it("splits at word boundaries", () => {
      const line = "hello world foo bar baz";
      const result = splitLongLine(line, 15);
      expect(result.length).toBeGreaterThan(1);
      expect(result[0]).toBe("hello world");
      expect(result[1]).toBe("foo bar baz");
    });
    it("trims whitespace from split segments", () => {
      const line = "hello world foo bar";
      const result = splitLongLine(line, 10);
      // No segment should carry leading/trailing whitespace from the split.
      result.forEach((seg) => {
        expect(seg).not.toMatch(/^\s/);
        expect(seg).not.toMatch(/\s$/);
      });
    });
  });
  describe("getLineRole", () => {
    // All roles the classifier can produce, in its precedence order.
    const roles: LineRole[] = [
      "empty",
      "heading",
      "code_fence",
      "list_item",
      "blockquote",
      "thematic_break",
      "content",
    ];
    it("classifies all line types correctly", () => {
      expect(getLineRole("")).toBe("empty");
      expect(getLineRole("# Heading")).toBe("heading");
      expect(getLineRole("```ts")).toBe("code_fence");
      expect(getLineRole("- item")).toBe("list_item");
      expect(getLineRole("> quote")).toBe("blockquote");
      expect(getLineRole("---")).toBe("thematic_break");
      expect(getLineRole("regular content")).toBe("content");
    });
    it("returns one of the valid roles", () => {
      const role = getLineRole("some line");
      expect(roles).toContain(role);
    });
  });
});
// Config conversion and defaults.
describe("chunking.chunkStrategy", () => {
  describe("chunkingConfigFromTokens", () => {
    it("converts token-based config to byte-based", () => {
      const config = chunkingConfigFromTokens(500, 50);
      expect(config.maxBytes).toBe(2000); // 500 * 4
      expect(config.overlapBytes).toBe(200); // 50 * 4
      expect(config.preserveStructure).toBe(true);
    });
    it("handles zero overlap", () => {
      const config = chunkingConfigFromTokens(500, 0);
      expect(config.overlapBytes).toBe(0);
    });
    it("enforces minimum maxBytes", () => {
      const config = chunkingConfigFromTokens(1, 0);
      expect(config.maxBytes).toBe(32); // Minimum
    });
  });
  describe("DEFAULT_CHUNKING_CONFIG", () => {
    it("has sensible defaults", () => {
      expect(DEFAULT_CHUNKING_CONFIG.maxBytes).toBe(2000);
      expect(DEFAULT_CHUNKING_CONFIG.overlapBytes).toBe(200);
      expect(DEFAULT_CHUNKING_CONFIG.preserveStructure).toBe(true);
    });
  });
});
// End-to-end behavior of the simple chunker (line accounting, overlap,
// long-line splitting, hashing).
describe("chunking.simpleChunker", () => {
  describe("chunkMarkdown", () => {
    it("handles empty content", () => {
      const chunks = chunkMarkdown("", { tokens: 500, overlap: 50 });
      expect(chunks).toEqual([]);
    });
    it("handles single line content", () => {
      const content = "single line";
      const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
      expect(chunks.length).toBe(1);
      expect(chunks[0]?.text).toBe(content);
    });
    it("handles content within max size", () => {
      const content = "line 1\nline 2\nline 3";
      const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
      expect(chunks.length).toBe(1);
      expect(chunks[0]?.text).toBe(content);
    });
    it("splits content that exceeds max size", () => {
      const content = Array(100).fill("a line of text").join("\n");
      const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 });
      expect(chunks.length).toBeGreaterThan(1);
    });
    it("includes overlap between chunks", () => {
      // Create content that will split into 2 chunks with overlap
      const lines = Array(20).fill("a line");
      const content = lines.join("\n");
      // Smaller maxBytes and larger overlap to ensure overlap occurs
      const chunks = chunkMarkdown(content, { tokens: 6, overlap: 4 });
      expect(chunks.length).toBeGreaterThan(1);
      // Check that there's some overlap in line numbers
      const firstEndLine = chunks[0]?.endLine ?? 0;
      const secondStartLine = chunks[1]?.startLine ?? Infinity;
      expect(secondStartLine).toBeLessThan(firstEndLine);
    });
    it("preserves line numbers correctly", () => {
      const content = "line 1\nline 2\nline 3\nline 4";
      // overlap: 0 so chunks must cover strictly increasing line ranges.
      const chunks = chunkMarkdown(content, { tokens: 2, overlap: 0 });
      for (let i = 0; i < chunks.length; i += 1) {
        const chunk = chunks[i];
        expect(chunk).toBeDefined();
        expect(chunk?.startLine).toBeGreaterThan(0);
        expect(chunk?.endLine).toBeGreaterThanOrEqual(chunk?.startLine ?? 0);
        // Verify line numbers are sequential and increasing
        if (i > 0 && chunk) {
          const prevChunk = chunks[i - 1];
          expect(chunk.startLine).toBeGreaterThan(prevChunk?.endLine ?? 0);
        }
      }
    });
    it("generates correct hashes", () => {
      const content = "test content";
      const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
      expect(chunks[0]?.hash).toBeDefined();
      expect(chunks[0]?.hash.length).toBeGreaterThan(0);
    });
    it("includes embeddingInput", () => {
      const content = "test content";
      const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
      expect(chunks[0]?.embeddingInput).toEqual({
        text: content,
      });
    });
    it("handles legacy config format", () => {
      const content = "test content";
      const chunks = chunkMarkdown(content, {
        tokens: 500,
        overlap: 50,
      });
      expect(chunks.length).toBe(1);
    });
    it("handles new byte-based config format", () => {
      const content = "test content";
      const chunks = chunkMarkdown(content, {
        maxBytes: 2000,
        overlapBytes: 200,
      });
      expect(chunks.length).toBe(1);
    });
    it("splits long lines at natural boundaries", () => {
      // Create a line that's too long but has word boundaries
      const words = Array(50).fill("word");
      const content = words.join(" "); // All on one line
      const chunks = chunkMarkdown(content, { tokens: 10, overlap: 0 });
      expect(chunks.length).toBeGreaterThan(0);
      // Check that chunks don't have trailing/leading spaces from splits
      for (const chunk of chunks) {
        const lines = chunk.text.split("\n");
        for (const line of lines) {
          // Internal newlines from splits shouldn't have odd whitespace
          if (line.trim().length > 0) {
            expect(line).not.toMatch(/^\s/);
            expect(line).not.toMatch(/\s$/);
          }
        }
      }
    });
    it("handles mixed short and long lines", () => {
      const content = [
        "short line",
        "another short",
        Array(100).fill("x").join(""), // long line without spaces
        "back to short",
        "another one",
      ].join("\n");
      const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 });
      expect(chunks.length).toBeGreaterThan(0);
      // Verify all chunks have proper structure
      for (const chunk of chunks) {
        expect(chunk.startLine).toBeGreaterThan(0);
        expect(chunk.endLine).toBeGreaterThanOrEqual(chunk.startLine);
        expect(chunk.hash).toBeDefined();
      }
    });
  });
  describe("simpleChunker", () => {
    it("uses the SimpleChunker instance", () => {
      expect(simpleChunker.name).toBe("simple");
    });
    it("chunks content via the interface", () => {
      const content = "line 1\nline 2\nline 3";
      const chunks = simpleChunker.chunk(content, {
        maxBytes: 1000,
        overlapBytes: 100,
      });
      expect(chunks.length).toBe(1);
      expect(chunks[0]?.text).toBe(content);
    });
  });
});

View File

@ -0,0 +1,21 @@
/**
* Chunking module for memory segmentation.
*
* This module provides a modular, extensible approach to chunking
* markdown content for memory indexing.
*
* @example
* ```ts
* import { chunkMarkdown } from "./chunking";
*
* const chunks = chunkMarkdown(content, {
* maxBytes: 2000,
* overlapBytes: 200,
* });
* ```
*/
// Re-export for convenience
export * from "./chunk-strategy.js";
export * from "./simple-chunker.js";
export * from "./markdown-boundaries.js";

View File

@ -0,0 +1,284 @@
/**
* Utilities for detecting markdown structure and finding natural split points.
*
* These helpers identify safe places to split markdown content while
* preserving semantic structure.
*/
/** Matches an ATX heading: 1-6 '#' characters followed by whitespace. */
const ATX_HEADING_RE = /^#{1,6}\s/;
/**
 * Check if a line is a markdown ATX heading (leading indentation ignored).
 */
export function isHeading(line: string): boolean {
  return ATX_HEADING_RE.test(line.trimStart());
}
/**
 * Check if a line starts or ends a code block fence (``` or ~~~),
 * ignoring surrounding whitespace.
 */
export function isCodeBlockFence(line: string): boolean {
  const stripped = line.trim();
  return stripped.startsWith("```") || stripped.startsWith("~~~");
}
/**
 * Check if a line is a list item: a bullet marker (-, *, +) or an
 * ordered-list number followed by whitespace. Leading indentation is ignored.
 */
export function isListItem(line: string): boolean {
  const stripped = line.trimStart();
  return /^[-*+]\s/.test(stripped) || /^\d+\.\s/.test(stripped);
}
/**
 * Check if a line is a blockquote (first non-space character is '>').
 */
export function isBlockquote(line: string): boolean {
  return line.trimStart().charAt(0) === ">";
}
/**
 * Check if a line is empty or contains only whitespace.
 */
export function isEmptyLine(line: string): boolean {
  return line.trim() === "";
}
/**
 * Matches a run of three or more -, *, or _ characters (mixing allowed)
 * with optional trailing whitespace.
 */
const THEMATIC_BREAK_RE = /^[-*_]{3,}\s*$/;
/**
 * Check if a line is a thematic break (horizontal rule).
 */
export function isThematicBreak(line: string): boolean {
  return THEMATIC_BREAK_RE.test(line.trim());
}
/**
 * Determine if a line is a structural boundary (good place to split).
 * A boundary is any blank line, ATX heading, or thematic break.
 */
export function isStructuralBoundary(line: string): boolean {
  if (isEmptyLine(line)) {
    return true;
  }
  return isHeading(line) || isThematicBreak(line);
}
/** Punctuation characters that may reasonably end a split segment. */
const SEGMENT_END_PUNCT = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
/**
 * Find a safe split point within a long line.
 *
 * Searches backwards from `maxLength` for, in priority order:
 *   1. a word boundary (space followed by non-space),
 *   2. segment-ending punctuation followed by a different character,
 *   3. any whitespace/non-whitespace boundary,
 * falling back to a hard split at `maxLength`.
 *
 * @param line - The line to split
 * @param maxLength - Maximum length for the split
 * @returns Index to split at (always a valid index between 0 and maxLength inclusive)
 */
export function findSafeSplitPoint(line: string, maxLength: number): number {
  if (line.length <= maxLength) {
    return line.length;
  }
  // line.length > maxLength here, so searchEnd < line.length and every
  // line[i] access below is in range.
  const searchEnd = Math.min(maxLength, line.length);
  // Priority 1: Space followed by non-space (word boundary).
  // Returns the index of the space so the segment excludes it.
  for (let i = searchEnd; i > 0; i--) {
    if (line[i - 1] === " " && line[i] !== " ") {
      return i - 1;
    }
  }
  // Priority 2: Common punctuation that can end a segment.
  // The inequality check avoids splitting inside a run of the same character
  // (e.g. "...." or "))").
  for (let i = searchEnd; i > 0; i--) {
    if (SEGMENT_END_PUNCT.has(line[i - 1]!) && i < line.length && line[i] !== line[i - 1]) {
      return i;
    }
  }
  // Priority 3: Any whitespace character (tabs etc.).
  for (let i = searchEnd; i > 0; i--) {
    if (/\s/.test(line[i - 1]!) && !/\s/.test(line[i]!)) {
      return i - 1;
    }
  }
  // Fallback: hard split at maxLength.
  return maxLength;
}
/**
 * Split a long line at safe points.
 *
 * Segments produced by a mid-line split are trimmed of the boundary
 * whitespace; a hard (forced) split is not trimmed.
 *
 * @param line - The line to split
 * @param maxLength - Maximum length per segment; values below 1 are clamped
 *   to 1 (a non-positive limit previously caused an infinite loop: the
 *   split point degenerated to 0, an empty segment was pushed, and
 *   `remaining` never shrank)
 * @returns Array of line segments
 */
export function splitLongLine(line: string, maxLength: number): string[] {
  const limit = Math.max(1, maxLength);
  if (line.length <= limit) {
    return [line];
  }
  const segments: string[] = [];
  let remaining = line;
  while (remaining.length > limit) {
    const splitPoint = findSafeSplitPoint(remaining, limit);
    if (splitPoint <= 0) {
      // Defensive: force progress even if no split point is found.
      segments.push(remaining.slice(0, limit));
      remaining = remaining.slice(limit);
    } else {
      segments.push(remaining.slice(0, splitPoint).trimEnd());
      remaining = remaining.slice(splitPoint).trimStart();
    }
  }
  if (remaining.length > 0) {
    segments.push(remaining);
  }
  return segments;
}
/**
 * The structural role of a single markdown line, as classified by
 * getLineRole. Roles are mutually exclusive; "content" is the catch-all.
 */
export type LineRole =
  | "empty"
  | "heading"
  | "code_fence"
  | "list_item"
  | "blockquote"
  | "thematic_break"
  | "content";
/**
 * Get the structural role of a line.
 *
 * Predicates are evaluated in precedence order (empty first, then heading,
 * code fence, list item, blockquote, thematic break); the first match wins,
 * and anything unmatched is plain content.
 */
export function getLineRole(line: string): LineRole {
  const classifiers: ReadonlyArray<readonly [(l: string) => boolean, LineRole]> = [
    [isEmptyLine, "empty"],
    [isHeading, "heading"],
    [isCodeBlockFence, "code_fence"],
    [isListItem, "list_item"],
    [isBlockquote, "blockquote"],
    [isThematicBreak, "thematic_break"],
  ];
  for (const [matches, role] of classifiers) {
    if (matches(line)) {
      return role;
    }
  }
  return "content";
}
/**
 * Get the heading level (1-6) from an ATX heading line.
 * Returns 0 if the line is not a heading.
 */
export function getHeadingLevel(line: string): number {
  const match = /^(#{1,6})\s/.exec(line.trimStart());
  return match?.[1]?.length ?? 0;
}
/**
 * Calculate the UTF-8 byte size of a string (0 for the empty string).
 */
function estimateUtf8Bytes(text: string): number {
  return text.length === 0 ? 0 : Buffer.byteLength(text, "utf8");
}
/**
 * Split a long line at safe points, respecting UTF-8 byte limits.
 *
 * This function is byte-aware and correctly handles multi-byte characters
 * (CJK, emoji, etc.) that would otherwise exceed byte limits when using
 * character-based splitting.
 *
 * NOTE(review): when maxBytes is smaller than a single character's encoding,
 * the forced 1-code-unit split could land inside a surrogate pair — confirm
 * callers never pass such tiny limits.
 *
 * @param line - The line to split
 * @param maxBytes - Maximum UTF-8 bytes per segment
 * @returns Array of line segments
 */
export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
  const totalBytes = estimateUtf8Bytes(line);
  if (totalBytes <= maxBytes) {
    return [line];
  }
  const segments: string[] = [];
  let remaining = line;
  while (remaining.length > 0) {
    const remainingBytes = estimateUtf8Bytes(remaining);
    if (remainingBytes <= maxBytes) {
      // Tail fits entirely: emit it and stop.
      segments.push(remaining);
      break;
    }
    // Find the longest substring that fits within maxBytes
    // UTF-16 code units are always <= UTF-8 bytes, so we can use character count
    // as an upper bound for the binary search
    let high = Math.min(remaining.length, maxBytes);
    let low = 0;
    let bestEnd = 0;
    // Binary search for the split point (largest prefix within the byte budget)
    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      const candidate = remaining.slice(0, mid);
      const bytes = estimateUtf8Bytes(candidate);
      if (bytes <= maxBytes) {
        bestEnd = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }
    // Now try to find a better split point within the safe range
    // Look for natural boundaries before bestEnd
    const searchEnd = Math.min(bestEnd, remaining.length);
    let splitPoint = bestEnd;
    // Priority 1: Space followed by non-space (word boundary).
    // The byte re-check guards against scanning past the budget.
    for (let i = searchEnd; i > 0; i--) {
      const testSegment = remaining.slice(0, i);
      if (estimateUtf8Bytes(testSegment) > maxBytes) {
        break;
      }
      if (remaining[i - 1] === " " && remaining[i] !== " ") {
        splitPoint = i - 1;
        break;
      }
    }
    // Priority 2: Common punctuation (only if space search didn't find a boundary)
    if (splitPoint === bestEnd) {
      const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
      for (let i = searchEnd; i > 0; i--) {
        const testSegment = remaining.slice(0, i);
        if (estimateUtf8Bytes(testSegment) > maxBytes) {
          break;
        }
        // Inequality check avoids splitting inside a run of the same character.
        if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) {
          splitPoint = i;
          break;
        }
      }
    }
    if (splitPoint <= 0) {
      // Can't find a good split, force it — at least one code unit so the
      // loop always makes progress.
      splitPoint = Math.max(1, bestEnd);
    }
    segments.push(remaining.slice(0, splitPoint).trimEnd());
    remaining = remaining.slice(splitPoint).trimStart();
  }
  return segments;
}

View File

@ -0,0 +1,162 @@
/**
* Simple chunking strategy with improved boundary detection.
*
* This is the default chunker that:
* 1. Uses UTF-8 byte estimation for consistent sizing
* 2. Preserves line structure when possible
* 3. Splits long lines at natural boundaries (spaces, punctuation)
* 4. Maintains overlap using complete lines/semantic units
*/
import { estimateUtf8Bytes } from "../embedding-input-limits.js";
import { hashText, type MemoryChunk } from "../internal.js";
import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js";
import { splitLongLineByBytes } from "./markdown-boundaries.js";
import { buildTextEmbeddingInput } from "../embedding-inputs.js";
/**
 * Represents a single line with its metadata.
 */
type LineEntry = {
  // Raw text of the line (no trailing newline).
  line: string;
  // 1-based line number in the original content; split segments of a long
  // line all keep the original line's number.
  lineNo: number;
  // Estimated UTF-8 size including the newline separator (+1).
  byteSize: number;
};
/**
 * Simple chunker implementation.
 *
 * Accumulates whole lines until adding the next segment would exceed
 * `maxBytes`, then emits a chunk and carries trailing lines forward as
 * overlap. Lines longer than `maxBytes` are pre-split at natural
 * boundaries (see splitLongLineByBytes).
 */
export class SimpleChunker implements ChunkStrategy {
  readonly name = "simple";
  /**
   * Chunk `content` into MemoryChunk records.
   *
   * @param content - Markdown/plain text to chunk
   * @param config - Byte-based size/overlap configuration
   * @returns Chunks with 1-based start/end line numbers, text, hash, and
   *   embedding input; empty array for empty content
   */
  chunk(content: string, config: ChunkingConfig): MemoryChunk[] {
    // Note: config.preserveStructure is reserved for future Phase 2+ features.
    // The SimpleChunker handles basic UTF-8 byte-aware chunking; use
    // SemanticChunker for structure-aware splitting.
    const lines = content.split("\n");
    // Handle empty content - split("\n") on empty string returns [""]
    if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) {
      return [];
    }
    // Pre-compute byte sizes for all lines
    const lineEntries: LineEntry[] = lines.map((line, i) => ({
      line,
      lineNo: i + 1,
      byteSize: estimateUtf8Bytes(line) + 1, // +1 for newline
    }));
    const chunks: MemoryChunk[] = [];
    let currentEntries: LineEntry[] = [];
    let currentBytes = 0;
    // Emit the accumulated lines as one chunk (no-op when empty).
    // Does NOT reset the accumulator — carryOverlap does that.
    const flush = () => {
      if (currentEntries.length === 0) {
        return;
      }
      const firstEntry = currentEntries[0]!;
      const lastEntry = currentEntries[currentEntries.length - 1]!;
      const text = currentEntries.map((e) => e.line).join("\n");
      chunks.push({
        startLine: firstEntry.lineNo,
        endLine: lastEntry.lineNo,
        text,
        hash: hashText(text),
        embeddingInput: buildTextEmbeddingInput(text),
      });
    };
    // Reset the accumulator, retaining trailing whole lines whose combined
    // size fits in overlapBytes (at least one line is kept when overlap > 0).
    const carryOverlap = () => {
      const { overlapBytes } = config;
      if (overlapBytes <= 0 || currentEntries.length === 0) {
        currentEntries = [];
        currentBytes = 0;
        return;
      }
      // Keep whole lines from the end until we exceed overlap
      let acc = 0;
      const kept: LineEntry[] = [];
      for (let i = currentEntries.length - 1; i >= 0; i -= 1) {
        const entry = currentEntries[i]!;
        if (acc + entry.byteSize > overlapBytes && kept.length > 0) {
          break;
        }
        kept.unshift(entry);
        acc += entry.byteSize;
      }
      currentEntries = kept;
      currentBytes = acc;
    };
    for (const entry of lineEntries) {
      // Handle long lines by splitting them at natural boundaries
      const segments = this.splitEntryIfNeeded(entry, config.maxBytes);
      for (const segment of segments) {
        // Never flush an empty accumulator: an oversized first segment still
        // starts a chunk rather than producing an empty one.
        const wouldExceed =
          currentBytes + segment.byteSize > config.maxBytes && currentEntries.length > 0;
        if (wouldExceed) {
          flush();
          carryOverlap();
        }
        currentEntries.push(segment);
        currentBytes += segment.byteSize;
      }
    }
    flush();
    return chunks;
  }
  /**
   * Split a line entry if it exceeds the max bytes.
   * Returns an array of entries (original or split); all split entries
   * share the original line number.
   */
  private splitEntryIfNeeded(entry: LineEntry, maxBytes: number): LineEntry[] {
    // If the line fits, return as-is
    if (entry.byteSize <= maxBytes) {
      return [entry];
    }
    // Split the long line at natural boundaries, respecting UTF-8 byte limits
    const maxTextBytes = maxBytes - 1; // Account for newline
    const textSegments = splitLongLineByBytes(entry.line, maxTextBytes);
    return textSegments.map((segment) => ({
      line: segment,
      lineNo: entry.lineNo,
      byteSize: estimateUtf8Bytes(segment) + 1,
    }));
  }
}
/**
 * Singleton instance for convenience (the chunker is stateless, so sharing
 * one instance is safe).
 */
export const simpleChunker = new SimpleChunker();
/**
 * Chunk markdown content using the simple strategy.
 *
 * This is the main export that replaces the legacy chunkMarkdown function.
 * Byte-based configs are forwarded unchanged; legacy token-based configs are
 * converted at ~4 bytes per token with a 32-byte floor on the chunk size.
 *
 * @param content - The markdown content to chunk
 * @param config - Chunking configuration (or legacy { tokens, overlap } object)
 * @returns Array of memory chunks
 */
export function chunkMarkdown(
  content: string,
  config: ChunkingConfig | { tokens: number; overlap: number },
): MemoryChunk[] {
  if ("maxBytes" in config) {
    return simpleChunker.chunk(content, config);
  }
  const converted: ChunkingConfig = {
    maxBytes: Math.max(32, config.tokens * 4),
    overlapBytes: Math.max(0, config.overlap * 4),
  };
  return simpleChunker.chunk(content, converted);
}

View File

@ -331,88 +331,13 @@ export async function buildMultimodalChunkForIndexing(
};
}
import { chunkMarkdown as chunkMarkdownImpl } from "./chunking/simple-chunker.js";
export function chunkMarkdown(
content: string,
chunking: { tokens: number; overlap: number },
): MemoryChunk[] {
const lines = content.split("\n");
if (lines.length === 0) {
return [];
}
const maxChars = Math.max(32, chunking.tokens * 4);
const overlapChars = Math.max(0, chunking.overlap * 4);
const chunks: MemoryChunk[] = [];
let current: Array<{ line: string; lineNo: number }> = [];
let currentChars = 0;
const flush = () => {
if (current.length === 0) {
return;
}
const firstEntry = current[0];
const lastEntry = current[current.length - 1];
if (!firstEntry || !lastEntry) {
return;
}
const text = current.map((entry) => entry.line).join("\n");
const startLine = firstEntry.lineNo;
const endLine = lastEntry.lineNo;
chunks.push({
startLine,
endLine,
text,
hash: hashText(text),
embeddingInput: buildTextEmbeddingInput(text),
});
};
const carryOverlap = () => {
if (overlapChars <= 0 || current.length === 0) {
current = [];
currentChars = 0;
return;
}
let acc = 0;
const kept: Array<{ line: string; lineNo: number }> = [];
for (let i = current.length - 1; i >= 0; i -= 1) {
const entry = current[i];
if (!entry) {
continue;
}
acc += entry.line.length + 1;
kept.unshift(entry);
if (acc >= overlapChars) {
break;
}
}
current = kept;
currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0);
};
for (let i = 0; i < lines.length; i += 1) {
const line = lines[i] ?? "";
const lineNo = i + 1;
const segments: string[] = [];
if (line.length === 0) {
segments.push("");
} else {
for (let start = 0; start < line.length; start += maxChars) {
segments.push(line.slice(start, start + maxChars));
}
}
for (const segment of segments) {
const lineSize = segment.length + 1;
if (currentChars + lineSize > maxChars && current.length > 0) {
flush();
carryOverlap();
}
current.push({ line: segment, lineNo });
currentChars += lineSize;
}
}
flush();
return chunks;
return chunkMarkdownImpl(content, chunking);
}
/**

View File

@ -337,13 +337,17 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
}
// If FTS isn't available, hybrid mode cannot use keyword search; degrade to vector-only.
const keywordResults =
// Parallelize keyword search (local FTS) and embedding generation (network call) for better performance
const [keywordResults, { queryVec, hasVector }] = await Promise.all([
hybrid.enabled && this.fts.enabled && this.fts.available
? await this.searchKeyword(cleaned, candidates).catch(() => [])
: [];
const queryVec = await this.embedQueryWithTimeout(cleaned);
const hasVector = queryVec.some((v) => v !== 0);
? this.searchKeyword(cleaned, candidates).catch(() => [])
: [],
(async () => {
const queryVec = await this.embedQueryWithTimeout(cleaned);
const hasVector = queryVec.some((v) => v !== 0);
return { queryVec, hasVector };
})(),
]);
const vectorResults = hasVector
? await this.searchVector(queryVec, candidates).catch(() => [])
: [];