Merge 016eca1f0fb311cad335e5a8da33bfe6a5504156 into 598f1826d8b2bc969aace2c6459824737667218c
This commit is contained in:
commit
599e72253f
@ -1680,16 +1680,20 @@ export async function runEmbeddedAttempt(
|
||||
});
|
||||
|
||||
const sessionLabel = params.sessionKey ?? params.sessionId;
|
||||
const { bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles } =
|
||||
await resolveBootstrapContextForRun({
|
||||
workspaceDir: effectiveWorkspace,
|
||||
config: params.config,
|
||||
sessionKey: params.sessionKey,
|
||||
sessionId: params.sessionId,
|
||||
warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }),
|
||||
contextMode: params.bootstrapContextMode,
|
||||
runKind: params.bootstrapContextRunKind,
|
||||
});
|
||||
// Parallelize bootstrap context resolution and machine name lookup for better performance
|
||||
const [{ bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles }, machineName] =
|
||||
await Promise.all([
|
||||
resolveBootstrapContextForRun({
|
||||
workspaceDir: effectiveWorkspace,
|
||||
config: params.config,
|
||||
sessionKey: params.sessionKey,
|
||||
sessionId: params.sessionId,
|
||||
warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }),
|
||||
contextMode: params.bootstrapContextMode,
|
||||
runKind: params.bootstrapContextRunKind,
|
||||
}),
|
||||
getMachineDisplayName(),
|
||||
]);
|
||||
const bootstrapMaxChars = resolveBootstrapMaxChars(params.config);
|
||||
const bootstrapTotalMaxChars = resolveBootstrapTotalMaxChars(params.config);
|
||||
const bootstrapAnalysis = analyzeBootstrapBudget({
|
||||
@ -1827,7 +1831,6 @@ export async function runEmbeddedAttempt(
|
||||
});
|
||||
logToolSchemasForGoogle({ tools: effectiveTools, provider: params.provider });
|
||||
|
||||
const machineName = await getMachineDisplayName();
|
||||
const runtimeChannel = normalizeMessageChannel(params.messageChannel ?? params.messageProvider);
|
||||
let runtimeCapabilities = runtimeChannel
|
||||
? (resolveChannelCapabilities({
|
||||
|
||||
82
src/memory/chunking/chunk-strategy.ts
Normal file
82
src/memory/chunking/chunk-strategy.ts
Normal file
@ -0,0 +1,82 @@
|
||||
/**
|
||||
* Chunking strategy interface and factory for memory segmentation.
|
||||
*
|
||||
* This module provides the abstraction layer for different chunking approaches,
|
||||
* allowing for easy extension and configuration.
|
||||
*/
|
||||
|
||||
import type { MemoryChunk } from "../internal.js";
|
||||
|
||||
/**
 * Configuration for chunking behavior.
 *
 * Sizes are expressed in estimated UTF-8 bytes rather than tokens; see
 * `chunkingConfigFromTokens` for converting legacy token-based settings
 * (it uses a 4-bytes-per-token heuristic).
 */
export type ChunkingConfig = {
  /**
   * Maximum chunk size in estimated UTF-8 bytes.
   * This is a conservative upper bound for tokenizer output.
   */
  maxBytes: number;

  /**
   * Overlap between consecutive chunks in estimated UTF-8 bytes.
   * Helps maintain context across chunk boundaries.
   */
  overlapBytes: number;

  /**
   * Whether to preserve markdown structure when splitting.
   * When enabled, splits will prefer boundaries like headings, code blocks, etc.
   * NOTE(review): the SimpleChunker currently ignores this flag (it is
   * documented there as reserved for future structure-aware chunkers).
   */
  preserveStructure?: boolean;
};
|
||||
|
||||
/**
 * Result of a chunking operation.
 *
 * Bundles the produced chunks with aggregate statistics about the run.
 */
export type ChunkingResult = {
  /** The chunks produced by the strategy, in document order. */
  chunks: MemoryChunk[];
  /** Aggregate statistics over `chunks`. */
  metadata: {
    /** Number of chunks produced. */
    totalChunks: number;
    /** Sum of estimated UTF-8 bytes across all chunks. */
    totalBytes: number;
    /** `totalBytes / totalChunks` (mean chunk size in bytes). */
    averageChunkBytes: number;
  };
};
|
||||
|
||||
/**
 * Interface for chunking strategies.
 *
 * Implementations take raw text plus a `ChunkingConfig` and return
 * `MemoryChunk`s (from ../internal.js) ready for indexing.
 */
export interface ChunkStrategy {
  /**
   * The name of this strategy (a short stable identifier, e.g. "simple").
   */
  readonly name: string;

  /**
   * Chunk the given content according to the strategy.
   *
   * @param content - The text content to chunk
   * @param config - Chunking configuration
   * @returns Array of memory chunks
   */
  chunk(content: string, config: ChunkingConfig): MemoryChunk[];
}
|
||||
|
||||
/**
 * Default chunking configuration.
 *
 * Byte values assume the ~4 bytes/token heuristic also used by
 * `chunkingConfigFromTokens`.
 */
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
  maxBytes: 2000, // ~500 tokens at 4 bytes/token
  overlapBytes: 200, // ~50 tokens overlap
  preserveStructure: true,
};
|
||||
|
||||
/**
|
||||
* Create a chunking config from legacy token-based settings.
|
||||
*/
|
||||
export function chunkingConfigFromTokens(tokens: number, overlap: number): ChunkingConfig {
|
||||
return {
|
||||
maxBytes: Math.max(32, tokens * 4),
|
||||
overlapBytes: Math.max(0, overlap * 4),
|
||||
preserveStructure: true,
|
||||
};
|
||||
}
|
||||
331
src/memory/chunking/chunker.test.ts
Normal file
331
src/memory/chunking/chunker.test.ts
Normal file
@ -0,0 +1,331 @@
|
||||
/**
|
||||
* Tests for the chunking module.
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { chunkMarkdown, simpleChunker } from "./simple-chunker.js";
|
||||
import {
|
||||
findSafeSplitPoint,
|
||||
splitLongLine,
|
||||
isHeading,
|
||||
isCodeBlockFence,
|
||||
isListItem,
|
||||
isEmptyLine,
|
||||
isStructuralBoundary,
|
||||
getLineRole,
|
||||
type LineRole,
|
||||
} from "./markdown-boundaries.js";
|
||||
import { chunkingConfigFromTokens, DEFAULT_CHUNKING_CONFIG } from "./chunk-strategy.js";
|
||||
|
||||
describe("chunking.markdownBoundaries", () => {
|
||||
describe("isHeading", () => {
|
||||
it("detects ATX headings", () => {
|
||||
expect(isHeading("# Heading 1")).toBe(true);
|
||||
expect(isHeading("## Heading 2")).toBe(true);
|
||||
expect(isHeading("### Heading 3")).toBe(true);
|
||||
expect(isHeading(" ## Indented heading")).toBe(true);
|
||||
expect(isHeading("Not a heading")).toBe(false);
|
||||
expect(isHeading("Seven#s is not a heading")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isCodeBlockFence", () => {
|
||||
it("detects code fences", () => {
|
||||
expect(isCodeBlockFence("```typescript")).toBe(true);
|
||||
expect(isCodeBlockFence("```js")).toBe(true);
|
||||
expect(isCodeBlockFence(" ```python")).toBe(true);
|
||||
expect(isCodeBlockFence("Not a fence")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isListItem", () => {
|
||||
it("detects list items", () => {
|
||||
expect(isListItem("- item")).toBe(true);
|
||||
expect(isListItem("* item")).toBe(true);
|
||||
expect(isListItem("+ item")).toBe(true);
|
||||
expect(isListItem("1. item")).toBe(true);
|
||||
expect(isListItem(" - indented")).toBe(true);
|
||||
expect(isListItem("not a list")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isEmptyLine", () => {
|
||||
it("detects empty lines", () => {
|
||||
expect(isEmptyLine("")).toBe(true);
|
||||
expect(isEmptyLine(" ")).toBe(true);
|
||||
expect(isEmptyLine("\t")).toBe(true);
|
||||
expect(isEmptyLine("content")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isStructuralBoundary", () => {
|
||||
it("identifies good split points", () => {
|
||||
expect(isStructuralBoundary("")).toBe(true);
|
||||
expect(isStructuralBoundary(" ")).toBe(true);
|
||||
expect(isStructuralBoundary("# Heading")).toBe(true);
|
||||
expect(isStructuralBoundary("---")).toBe(true);
|
||||
expect(isStructuralBoundary("***")).toBe(true);
|
||||
expect(isStructuralBoundary("regular content")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("findSafeSplitPoint", () => {
|
||||
it("returns full length when line is short enough", () => {
|
||||
expect(findSafeSplitPoint("short", 100)).toBe(5);
|
||||
});
|
||||
|
||||
it("splits at word boundaries when possible", () => {
|
||||
const line = "hello world foo bar baz";
|
||||
expect(findSafeSplitPoint(line, 15)).toBe(11); // After "world "
|
||||
});
|
||||
|
||||
it("splits at punctuation when no spaces available", () => {
|
||||
const line = "hello,world,foo,bar";
|
||||
// Algorithm finds the split point closest to maxLength
|
||||
// With maxLength=12, it finds the comma at index 11 (",")
|
||||
// and returns 12 (after the comma)
|
||||
expect(findSafeSplitPoint(line, 12)).toBe(12);
|
||||
});
|
||||
|
||||
it("falls back to hard split when needed", () => {
|
||||
const line = "averylongwordwithnoboundaries";
|
||||
expect(findSafeSplitPoint(line, 10)).toBe(10);
|
||||
});
|
||||
|
||||
it("handles edge cases", () => {
|
||||
expect(findSafeSplitPoint("", 10)).toBe(0);
|
||||
expect(findSafeSplitPoint("a", 0)).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("splitLongLine", () => {
|
||||
it("returns single segment for short lines", () => {
|
||||
expect(splitLongLine("short", 100)).toEqual(["short"]);
|
||||
});
|
||||
|
||||
it("splits at word boundaries", () => {
|
||||
const line = "hello world foo bar baz";
|
||||
const result = splitLongLine(line, 15);
|
||||
expect(result.length).toBeGreaterThan(1);
|
||||
expect(result[0]).toBe("hello world");
|
||||
expect(result[1]).toBe("foo bar baz");
|
||||
});
|
||||
|
||||
it("trims whitespace from split segments", () => {
|
||||
const line = "hello world foo bar";
|
||||
const result = splitLongLine(line, 10);
|
||||
result.forEach((seg) => {
|
||||
expect(seg).not.toMatch(/^\s/);
|
||||
expect(seg).not.toMatch(/\s$/);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("getLineRole", () => {
|
||||
const roles: LineRole[] = [
|
||||
"empty",
|
||||
"heading",
|
||||
"code_fence",
|
||||
"list_item",
|
||||
"blockquote",
|
||||
"thematic_break",
|
||||
"content",
|
||||
];
|
||||
|
||||
it("classifies all line types correctly", () => {
|
||||
expect(getLineRole("")).toBe("empty");
|
||||
expect(getLineRole("# Heading")).toBe("heading");
|
||||
expect(getLineRole("```ts")).toBe("code_fence");
|
||||
expect(getLineRole("- item")).toBe("list_item");
|
||||
expect(getLineRole("> quote")).toBe("blockquote");
|
||||
expect(getLineRole("---")).toBe("thematic_break");
|
||||
expect(getLineRole("regular content")).toBe("content");
|
||||
});
|
||||
|
||||
it("returns one of the valid roles", () => {
|
||||
const role = getLineRole("some line");
|
||||
expect(roles).toContain(role);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunking.chunkStrategy", () => {
|
||||
describe("chunkingConfigFromTokens", () => {
|
||||
it("converts token-based config to byte-based", () => {
|
||||
const config = chunkingConfigFromTokens(500, 50);
|
||||
expect(config.maxBytes).toBe(2000); // 500 * 4
|
||||
expect(config.overlapBytes).toBe(200); // 50 * 4
|
||||
expect(config.preserveStructure).toBe(true);
|
||||
});
|
||||
|
||||
it("handles zero overlap", () => {
|
||||
const config = chunkingConfigFromTokens(500, 0);
|
||||
expect(config.overlapBytes).toBe(0);
|
||||
});
|
||||
|
||||
it("enforces minimum maxBytes", () => {
|
||||
const config = chunkingConfigFromTokens(1, 0);
|
||||
expect(config.maxBytes).toBe(32); // Minimum
|
||||
});
|
||||
});
|
||||
|
||||
describe("DEFAULT_CHUNKING_CONFIG", () => {
|
||||
it("has sensible defaults", () => {
|
||||
expect(DEFAULT_CHUNKING_CONFIG.maxBytes).toBe(2000);
|
||||
expect(DEFAULT_CHUNKING_CONFIG.overlapBytes).toBe(200);
|
||||
expect(DEFAULT_CHUNKING_CONFIG.preserveStructure).toBe(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunking.simpleChunker", () => {
|
||||
describe("chunkMarkdown", () => {
|
||||
it("handles empty content", () => {
|
||||
const chunks = chunkMarkdown("", { tokens: 500, overlap: 50 });
|
||||
expect(chunks).toEqual([]);
|
||||
});
|
||||
|
||||
it("handles single line content", () => {
|
||||
const content = "single line";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks.length).toBe(1);
|
||||
expect(chunks[0]?.text).toBe(content);
|
||||
});
|
||||
|
||||
it("handles content within max size", () => {
|
||||
const content = "line 1\nline 2\nline 3";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks.length).toBe(1);
|
||||
expect(chunks[0]?.text).toBe(content);
|
||||
});
|
||||
|
||||
it("splits content that exceeds max size", () => {
|
||||
const content = Array(100).fill("a line of text").join("\n");
|
||||
const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 });
|
||||
expect(chunks.length).toBeGreaterThan(1);
|
||||
});
|
||||
|
||||
it("includes overlap between chunks", () => {
|
||||
// Create content that will split into 2 chunks with overlap
|
||||
const lines = Array(20).fill("a line");
|
||||
const content = lines.join("\n");
|
||||
// Smaller maxBytes and larger overlap to ensure overlap occurs
|
||||
const chunks = chunkMarkdown(content, { tokens: 6, overlap: 4 });
|
||||
|
||||
expect(chunks.length).toBeGreaterThan(1);
|
||||
|
||||
// Check that there's some overlap in line numbers
|
||||
const firstEndLine = chunks[0]?.endLine ?? 0;
|
||||
const secondStartLine = chunks[1]?.startLine ?? Infinity;
|
||||
expect(secondStartLine).toBeLessThan(firstEndLine);
|
||||
});
|
||||
|
||||
it("preserves line numbers correctly", () => {
|
||||
const content = "line 1\nline 2\nline 3\nline 4";
|
||||
const chunks = chunkMarkdown(content, { tokens: 2, overlap: 0 });
|
||||
|
||||
for (let i = 0; i < chunks.length; i += 1) {
|
||||
const chunk = chunks[i];
|
||||
expect(chunk).toBeDefined();
|
||||
expect(chunk?.startLine).toBeGreaterThan(0);
|
||||
expect(chunk?.endLine).toBeGreaterThanOrEqual(chunk?.startLine ?? 0);
|
||||
|
||||
// Verify line numbers are sequential and increasing
|
||||
if (i > 0 && chunk) {
|
||||
const prevChunk = chunks[i - 1];
|
||||
expect(chunk.startLine).toBeGreaterThan(prevChunk?.endLine ?? 0);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("generates correct hashes", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks[0]?.hash).toBeDefined();
|
||||
expect(chunks[0]?.hash.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it("includes embeddingInput", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks[0]?.embeddingInput).toEqual({
|
||||
text: content,
|
||||
});
|
||||
});
|
||||
|
||||
it("handles legacy config format", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, {
|
||||
tokens: 500,
|
||||
overlap: 50,
|
||||
});
|
||||
expect(chunks.length).toBe(1);
|
||||
});
|
||||
|
||||
it("handles new byte-based config format", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, {
|
||||
maxBytes: 2000,
|
||||
overlapBytes: 200,
|
||||
});
|
||||
expect(chunks.length).toBe(1);
|
||||
});
|
||||
|
||||
it("splits long lines at natural boundaries", () => {
|
||||
// Create a line that's too long but has word boundaries
|
||||
const words = Array(50).fill("word");
|
||||
const content = words.join(" "); // All on one line
|
||||
|
||||
const chunks = chunkMarkdown(content, { tokens: 10, overlap: 0 });
|
||||
expect(chunks.length).toBeGreaterThan(0);
|
||||
|
||||
// Check that chunks don't have trailing/leading spaces from splits
|
||||
for (const chunk of chunks) {
|
||||
const lines = chunk.text.split("\n");
|
||||
for (const line of lines) {
|
||||
// Internal newlines from splits shouldn't have odd whitespace
|
||||
if (line.trim().length > 0) {
|
||||
expect(line).not.toMatch(/^\s/);
|
||||
expect(line).not.toMatch(/\s$/);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("handles mixed short and long lines", () => {
|
||||
const content = [
|
||||
"short line",
|
||||
"another short",
|
||||
Array(100).fill("x").join(""), // long line without spaces
|
||||
"back to short",
|
||||
"another one",
|
||||
].join("\n");
|
||||
|
||||
const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 });
|
||||
expect(chunks.length).toBeGreaterThan(0);
|
||||
|
||||
// Verify all chunks have proper structure
|
||||
for (const chunk of chunks) {
|
||||
expect(chunk.startLine).toBeGreaterThan(0);
|
||||
expect(chunk.endLine).toBeGreaterThanOrEqual(chunk.startLine);
|
||||
expect(chunk.hash).toBeDefined();
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("simpleChunker", () => {
|
||||
it("uses the SimpleChunker instance", () => {
|
||||
expect(simpleChunker.name).toBe("simple");
|
||||
});
|
||||
|
||||
it("chunks content via the interface", () => {
|
||||
const content = "line 1\nline 2\nline 3";
|
||||
const chunks = simpleChunker.chunk(content, {
|
||||
maxBytes: 1000,
|
||||
overlapBytes: 100,
|
||||
});
|
||||
expect(chunks.length).toBe(1);
|
||||
expect(chunks[0]?.text).toBe(content);
|
||||
});
|
||||
});
|
||||
});
|
||||
21
src/memory/chunking/index.ts
Normal file
21
src/memory/chunking/index.ts
Normal file
@ -0,0 +1,21 @@
|
||||
/**
 * Chunking module for memory segmentation.
 *
 * This module provides a modular, extensible approach to chunking
 * markdown content for memory indexing.
 *
 * @example
 * ```ts
 * import { chunkMarkdown } from "./chunking";
 *
 * const chunks = chunkMarkdown(content, {
 *   maxBytes: 2000,
 *   overlapBytes: 200,
 * });
 * ```
 */

// Re-export the public surface of each submodule for convenience:
// strategy types/defaults, the simple chunker, and the markdown helpers.
export * from "./chunk-strategy.js";
export * from "./simple-chunker.js";
export * from "./markdown-boundaries.js";
|
||||
284
src/memory/chunking/markdown-boundaries.ts
Normal file
284
src/memory/chunking/markdown-boundaries.ts
Normal file
@ -0,0 +1,284 @@
|
||||
/**
|
||||
* Utilities for detecting markdown structure and finding natural split points.
|
||||
*
|
||||
* These helpers identify safe places to split markdown content while
|
||||
* preserving semantic structure.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Check if a line is a markdown heading.
|
||||
*/
|
||||
export function isHeading(line: string): boolean {
|
||||
const trimmed = line.trimStart();
|
||||
return /^#{1,6}\s/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line starts or ends a code block fence.
|
||||
*/
|
||||
export function isCodeBlockFence(line: string): boolean {
|
||||
const trimmed = line.trim();
|
||||
return /^```|^~~~/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is a list item.
|
||||
*/
|
||||
export function isListItem(line: string): boolean {
|
||||
const trimmed = line.trimStart();
|
||||
return /^[-*+]\s|^\d+\.\s/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is a blockquote.
|
||||
*/
|
||||
export function isBlockquote(line: string): boolean {
|
||||
const trimmed = line.trimStart();
|
||||
return trimmed.startsWith(">");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is empty (whitespace only).
|
||||
*/
|
||||
export function isEmptyLine(line: string): boolean {
|
||||
return line.trim().length === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is a thematic break (horizontal rule).
|
||||
*/
|
||||
export function isThematicBreak(line: string): boolean {
|
||||
const trimmed = line.trim();
|
||||
return /^[-*_]{3,}\s*$/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a line is a structural boundary (good place to split).
|
||||
*/
|
||||
export function isStructuralBoundary(line: string): boolean {
|
||||
return (
|
||||
isEmptyLine(line) ||
|
||||
isHeading(line) ||
|
||||
isThematicBreak(line)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find a safe split point within a long line.
|
||||
*
|
||||
* Looks for natural boundaries like spaces, punctuation, etc.
|
||||
* Returns the index where splitting should occur.
|
||||
*
|
||||
* @param line - The line to split
|
||||
* @param maxLength - Maximum length for the split
|
||||
* @returns Index to split at (always a valid index between 0 and maxLength inclusive)
|
||||
*/
|
||||
export function findSafeSplitPoint(line: string, maxLength: number): number {
|
||||
if (line.length <= maxLength) {
|
||||
return line.length;
|
||||
}
|
||||
|
||||
// Search backwards from maxLength for a good split point
|
||||
const searchEnd = Math.min(maxLength, line.length);
|
||||
|
||||
// Priority 1: Space followed by non-space (word boundary)
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (line[i - 1] === " " && line[i] !== " ") {
|
||||
return i - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 2: Common punctuation that can end a segment
|
||||
// Single pass to find the rightmost punctuation of any type
|
||||
const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 3: Any whitespace character
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (/\s/.test(line[i - 1]) && !/\s/.test(line[i])) {
|
||||
return i - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: hard split at maxLength
|
||||
return maxLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split a long line at safe points.
|
||||
*
|
||||
* @param line - The line to split
|
||||
* @param maxLength - Maximum length per segment
|
||||
* @returns Array of line segments
|
||||
*/
|
||||
export function splitLongLine(line: string, maxLength: number): string[] {
|
||||
if (line.length <= maxLength) {
|
||||
return [line];
|
||||
}
|
||||
|
||||
const segments: string[] = [];
|
||||
let remaining = line;
|
||||
|
||||
while (remaining.length > maxLength) {
|
||||
const splitPoint = findSafeSplitPoint(remaining, maxLength);
|
||||
|
||||
if (splitPoint <= 0) {
|
||||
// Can't find a good split, force it
|
||||
segments.push(remaining.slice(0, maxLength));
|
||||
remaining = remaining.slice(maxLength);
|
||||
} else {
|
||||
segments.push(remaining.slice(0, splitPoint).trimEnd());
|
||||
remaining = remaining.slice(splitPoint).trimStart();
|
||||
}
|
||||
}
|
||||
|
||||
if (remaining.length > 0) {
|
||||
segments.push(remaining);
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
/**
 * Analyze a line to determine its role in document structure.
 *
 * Roles mirror the predicate helpers in this module; `getLineRole` checks
 * them in this order and falls back to "content".
 */
export type LineRole =
  | "empty"
  | "heading"
  | "code_fence"
  | "list_item"
  | "blockquote"
  | "thematic_break"
  | "content";
|
||||
|
||||
/**
|
||||
* Get the structural role of a line.
|
||||
*/
|
||||
export function getLineRole(line: string): LineRole {
|
||||
if (isEmptyLine(line)) return "empty";
|
||||
if (isHeading(line)) return "heading";
|
||||
if (isCodeBlockFence(line)) return "code_fence";
|
||||
if (isListItem(line)) return "list_item";
|
||||
if (isBlockquote(line)) return "blockquote";
|
||||
if (isThematicBreak(line)) return "thematic_break";
|
||||
return "content";
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the heading level (1-6) from a heading line.
|
||||
* Returns 0 if the line is not a heading.
|
||||
*/
|
||||
export function getHeadingLevel(line: string): number {
|
||||
const trimmed = line.trimStart();
|
||||
const match = trimmed.match(/^#{1,6}\s/);
|
||||
if (match) {
|
||||
return match[0].trim().length;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate UTF-8 byte size of a string.
|
||||
*/
|
||||
function estimateUtf8Bytes(text: string): number {
|
||||
if (!text) {
|
||||
return 0;
|
||||
}
|
||||
return Buffer.byteLength(text, "utf8");
|
||||
}
|
||||
|
||||
/**
 * Split a long line at safe points, respecting UTF-8 byte limits.
 *
 * This function is byte-aware and correctly handles multi-byte characters
 * (CJK, emoji, etc.) that would otherwise exceed byte limits when using
 * character-based splitting.
 *
 * Loop invariant: each iteration removes at least one character from
 * `remaining` (splitPoint is forced to >= 1), so the loop terminates.
 * NOTE(review): slicing by UTF-16 index can cut a surrogate pair;
 * Buffer.byteLength encodes a lone surrogate as the 3-byte replacement
 * character, so byte estimates stay bounded — confirm this is acceptable
 * for emoji-heavy input.
 *
 * @param line - The line to split
 * @param maxBytes - Maximum UTF-8 bytes per segment
 * @returns Array of line segments
 */
export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
  const totalBytes = estimateUtf8Bytes(line);
  if (totalBytes <= maxBytes) {
    return [line];
  }

  const segments: string[] = [];
  let remaining = line;

  while (remaining.length > 0) {
    const remainingBytes = estimateUtf8Bytes(remaining);
    if (remainingBytes <= maxBytes) {
      segments.push(remaining);
      break;
    }

    // Find the longest substring that fits within maxBytes
    // UTF-16 code units are always <= UTF-8 bytes, so we can use character count
    // as an upper bound for the binary search
    let high = Math.min(remaining.length, maxBytes);
    let low = 0;
    let bestEnd = 0;

    // Binary search for the split point (largest prefix <= maxBytes)
    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      const candidate = remaining.slice(0, mid);
      const bytes = estimateUtf8Bytes(candidate);

      if (bytes <= maxBytes) {
        bestEnd = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }

    // Now try to find a better split point within the safe range
    // Look for natural boundaries before bestEnd
    const searchEnd = Math.min(bestEnd, remaining.length);
    let splitPoint = bestEnd;

    // Priority 1: Space followed by non-space (word boundary)
    for (let i = searchEnd; i > 0; i--) {
      const testSegment = remaining.slice(0, i);
      if (estimateUtf8Bytes(testSegment) > maxBytes) {
        break;
      }
      if (remaining[i - 1] === " " && remaining[i] !== " ") {
        splitPoint = i - 1;
        break;
      }
    }

    // Priority 2: Common punctuation (only if space search didn't find a boundary)
    if (splitPoint === bestEnd) {
      const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
      for (let i = searchEnd; i > 0; i--) {
        const testSegment = remaining.slice(0, i);
        if (estimateUtf8Bytes(testSegment) > maxBytes) {
          break;
        }
        if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) {
          splitPoint = i;
          break;
        }
      }
    }

    // A boundary search can land on index 0 (e.g. leading space); force
    // progress with at least one character so the loop terminates.
    if (splitPoint <= 0) {
      // Can't find a good split, force at bestEnd
      splitPoint = Math.max(1, bestEnd);
    }

    segments.push(remaining.slice(0, splitPoint).trimEnd());
    remaining = remaining.slice(splitPoint).trimStart();
  }

  return segments;
}
|
||||
162
src/memory/chunking/simple-chunker.ts
Normal file
162
src/memory/chunking/simple-chunker.ts
Normal file
@ -0,0 +1,162 @@
|
||||
/**
|
||||
* Simple chunking strategy with improved boundary detection.
|
||||
*
|
||||
* This is the default chunker that:
|
||||
* 1. Uses UTF-8 byte estimation for consistent sizing
|
||||
* 2. Preserves line structure when possible
|
||||
* 3. Splits long lines at natural boundaries (spaces, punctuation)
|
||||
* 4. Maintains overlap using complete lines/semantic units
|
||||
*/
|
||||
|
||||
import { estimateUtf8Bytes } from "../embedding-input-limits.js";
|
||||
import { hashText, type MemoryChunk } from "../internal.js";
|
||||
import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js";
|
||||
import { splitLongLineByBytes } from "./markdown-boundaries.js";
|
||||
import { buildTextEmbeddingInput } from "../embedding-inputs.js";
|
||||
|
||||
/**
 * Represents a single line with its metadata.
 * Used internally by SimpleChunker to accumulate lines into chunks.
 */
type LineEntry = {
  // Raw line text (no trailing newline).
  line: string;
  // 1-based line number in the original content.
  lineNo: number;
  // Estimated UTF-8 byte size of `line` plus 1 for the newline separator.
  byteSize: number;
};
|
||||
|
||||
/**
 * Simple chunker implementation.
 *
 * Greedily accumulates whole lines into chunks bounded by
 * `config.maxBytes` (estimated UTF-8 bytes, +1 per line for the newline),
 * splitting over-long lines via `splitLongLineByBytes` and carrying up to
 * `config.overlapBytes` of trailing lines into the next chunk.
 */
export class SimpleChunker implements ChunkStrategy {
  readonly name = "simple";

  /**
   * Chunk `content` into MemoryChunks.
   *
   * @param content - Text to chunk; split on "\n", line numbers are 1-based
   * @param config - Byte budgets; `preserveStructure` is ignored here
   * @returns Chunks in document order; empty array for empty content
   */
  chunk(content: string, config: ChunkingConfig): MemoryChunk[] {
    // Note: config.preserveStructure is reserved for future Phase 2+ features.
    // The SimpleChunker handles basic UTF-8 byte-aware chunking; use
    // SemanticChunker for structure-aware splitting.
    const lines = content.split("\n");
    // Handle empty content - split("\n") on empty string returns [""]
    if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) {
      return [];
    }

    // Pre-compute byte sizes for all lines
    const lineEntries: LineEntry[] = lines.map((line, i) => ({
      line,
      lineNo: i + 1,
      byteSize: estimateUtf8Bytes(line) + 1, // +1 for newline
    }));

    const chunks: MemoryChunk[] = [];
    let currentEntries: LineEntry[] = [];
    let currentBytes = 0;

    // Emit the accumulated lines as one MemoryChunk (no-op when empty).
    const flush = () => {
      if (currentEntries.length === 0) {
        return;
      }
      const firstEntry = currentEntries[0]!;
      const lastEntry = currentEntries[currentEntries.length - 1]!;
      const text = currentEntries.map((e) => e.line).join("\n");
      chunks.push({
        startLine: firstEntry.lineNo,
        endLine: lastEntry.lineNo,
        text,
        hash: hashText(text),
        embeddingInput: buildTextEmbeddingInput(text),
      });
    };

    // After a flush, retain trailing lines (up to overlapBytes) so the next
    // chunk starts with context from the previous one. Always keeps at least
    // one line once any line fits (see `kept.length > 0` guard).
    const carryOverlap = () => {
      const { overlapBytes } = config;
      if (overlapBytes <= 0 || currentEntries.length === 0) {
        currentEntries = [];
        currentBytes = 0;
        return;
      }

      // Keep whole lines from the end until we exceed overlap
      let acc = 0;
      const kept: LineEntry[] = [];
      for (let i = currentEntries.length - 1; i >= 0; i -= 1) {
        const entry = currentEntries[i]!;
        if (acc + entry.byteSize > overlapBytes && kept.length > 0) {
          break;
        }
        kept.unshift(entry);
        acc += entry.byteSize;
      }

      currentEntries = kept;
      currentBytes = acc;
    };

    for (const entry of lineEntries) {
      // Handle long lines by splitting them at natural boundaries
      const segments = this.splitEntryIfNeeded(entry, config.maxBytes);

      for (const segment of segments) {
        // Only flush when the chunk is non-empty; a single over-budget
        // segment still goes into its own chunk rather than being dropped.
        const wouldExceed =
          currentBytes + segment.byteSize > config.maxBytes && currentEntries.length > 0;

        if (wouldExceed) {
          flush();
          carryOverlap();
        }

        currentEntries.push(segment);
        currentBytes += segment.byteSize;
      }
    }

    flush();
    return chunks;
  }

  /**
   * Split a line entry if it exceeds the max bytes.
   * Returns an array of entries (original or split).
   * All split segments keep the original entry's line number.
   */
  private splitEntryIfNeeded(entry: LineEntry, maxBytes: number): LineEntry[] {
    // If the line fits, return as-is
    if (entry.byteSize <= maxBytes) {
      return [entry];
    }

    // Split the long line at natural boundaries, respecting UTF-8 byte limits
    const maxTextBytes = maxBytes - 1; // Account for newline
    const textSegments = splitLongLineByBytes(entry.line, maxTextBytes);

    return textSegments.map((segment) => ({
      line: segment,
      lineNo: entry.lineNo,
      byteSize: estimateUtf8Bytes(segment) + 1,
    }));
  }
}
|
||||
|
||||
/**
 * Singleton instance for convenience.
 * SimpleChunker is stateless, so sharing one instance is safe.
 */
export const simpleChunker = new SimpleChunker();
|
||||
|
||||
/**
|
||||
* Chunk markdown content using the simple strategy.
|
||||
*
|
||||
* This is the main export that replaces the legacy chunkMarkdown function.
|
||||
*
|
||||
* @param content - The markdown content to chunk
|
||||
* @param config - Chunking configuration (or legacy { tokens, overlap } object)
|
||||
* @returns Array of memory chunks
|
||||
*/
|
||||
export function chunkMarkdown(
|
||||
content: string,
|
||||
config: ChunkingConfig | { tokens: number; overlap: number },
|
||||
): MemoryChunk[] {
|
||||
// Support legacy config format
|
||||
const chunkingConfig =
|
||||
"maxBytes" in config
|
||||
? config
|
||||
: { maxBytes: Math.max(32, config.tokens * 4), overlapBytes: Math.max(0, config.overlap * 4) };
|
||||
|
||||
return simpleChunker.chunk(content, chunkingConfig);
|
||||
}
|
||||
@ -331,88 +331,13 @@ export async function buildMultimodalChunkForIndexing(
|
||||
};
|
||||
}
|
||||
|
||||
import { chunkMarkdown as chunkMarkdownImpl } from "./chunking/simple-chunker.js";
|
||||
|
||||
export function chunkMarkdown(
|
||||
content: string,
|
||||
chunking: { tokens: number; overlap: number },
|
||||
): MemoryChunk[] {
|
||||
const lines = content.split("\n");
|
||||
if (lines.length === 0) {
|
||||
return [];
|
||||
}
|
||||
const maxChars = Math.max(32, chunking.tokens * 4);
|
||||
const overlapChars = Math.max(0, chunking.overlap * 4);
|
||||
const chunks: MemoryChunk[] = [];
|
||||
|
||||
let current: Array<{ line: string; lineNo: number }> = [];
|
||||
let currentChars = 0;
|
||||
|
||||
const flush = () => {
|
||||
if (current.length === 0) {
|
||||
return;
|
||||
}
|
||||
const firstEntry = current[0];
|
||||
const lastEntry = current[current.length - 1];
|
||||
if (!firstEntry || !lastEntry) {
|
||||
return;
|
||||
}
|
||||
const text = current.map((entry) => entry.line).join("\n");
|
||||
const startLine = firstEntry.lineNo;
|
||||
const endLine = lastEntry.lineNo;
|
||||
chunks.push({
|
||||
startLine,
|
||||
endLine,
|
||||
text,
|
||||
hash: hashText(text),
|
||||
embeddingInput: buildTextEmbeddingInput(text),
|
||||
});
|
||||
};
|
||||
|
||||
const carryOverlap = () => {
|
||||
if (overlapChars <= 0 || current.length === 0) {
|
||||
current = [];
|
||||
currentChars = 0;
|
||||
return;
|
||||
}
|
||||
let acc = 0;
|
||||
const kept: Array<{ line: string; lineNo: number }> = [];
|
||||
for (let i = current.length - 1; i >= 0; i -= 1) {
|
||||
const entry = current[i];
|
||||
if (!entry) {
|
||||
continue;
|
||||
}
|
||||
acc += entry.line.length + 1;
|
||||
kept.unshift(entry);
|
||||
if (acc >= overlapChars) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
current = kept;
|
||||
currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0);
|
||||
};
|
||||
|
||||
for (let i = 0; i < lines.length; i += 1) {
|
||||
const line = lines[i] ?? "";
|
||||
const lineNo = i + 1;
|
||||
const segments: string[] = [];
|
||||
if (line.length === 0) {
|
||||
segments.push("");
|
||||
} else {
|
||||
for (let start = 0; start < line.length; start += maxChars) {
|
||||
segments.push(line.slice(start, start + maxChars));
|
||||
}
|
||||
}
|
||||
for (const segment of segments) {
|
||||
const lineSize = segment.length + 1;
|
||||
if (currentChars + lineSize > maxChars && current.length > 0) {
|
||||
flush();
|
||||
carryOverlap();
|
||||
}
|
||||
current.push({ line: segment, lineNo });
|
||||
currentChars += lineSize;
|
||||
}
|
||||
}
|
||||
flush();
|
||||
return chunks;
|
||||
return chunkMarkdownImpl(content, chunking);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -337,13 +337,17 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
||||
}
|
||||
|
||||
// If FTS isn't available, hybrid mode cannot use keyword search; degrade to vector-only.
|
||||
const keywordResults =
|
||||
// Parallelize keyword search (local FTS) and embedding generation (network call) for better performance
|
||||
const [keywordResults, { queryVec, hasVector }] = await Promise.all([
|
||||
hybrid.enabled && this.fts.enabled && this.fts.available
|
||||
? await this.searchKeyword(cleaned, candidates).catch(() => [])
|
||||
: [];
|
||||
|
||||
const queryVec = await this.embedQueryWithTimeout(cleaned);
|
||||
const hasVector = queryVec.some((v) => v !== 0);
|
||||
? this.searchKeyword(cleaned, candidates).catch(() => [])
|
||||
: [],
|
||||
(async () => {
|
||||
const queryVec = await this.embedQueryWithTimeout(cleaned);
|
||||
const hasVector = queryVec.some((v) => v !== 0);
|
||||
return { queryVec, hasVector };
|
||||
})(),
|
||||
]);
|
||||
const vectorResults = hasVector
|
||||
? await this.searchVector(queryVec, candidates).catch(() => [])
|
||||
: [];
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user