Merge 016eca1f0fb311cad335e5a8da33bfe6a5504156 into 598f1826d8b2bc969aace2c6459824737667218c
This commit is contained in:
commit
599e72253f
@ -1680,16 +1680,20 @@ export async function runEmbeddedAttempt(
|
||||
});
|
||||
|
||||
const sessionLabel = params.sessionKey ?? params.sessionId;
|
||||
const { bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles } =
|
||||
await resolveBootstrapContextForRun({
|
||||
workspaceDir: effectiveWorkspace,
|
||||
config: params.config,
|
||||
sessionKey: params.sessionKey,
|
||||
sessionId: params.sessionId,
|
||||
warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }),
|
||||
contextMode: params.bootstrapContextMode,
|
||||
runKind: params.bootstrapContextRunKind,
|
||||
});
|
||||
// Parallelize bootstrap context resolution and machine name lookup for better performance
|
||||
const [{ bootstrapFiles: hookAdjustedBootstrapFiles, contextFiles }, machineName] =
|
||||
await Promise.all([
|
||||
resolveBootstrapContextForRun({
|
||||
workspaceDir: effectiveWorkspace,
|
||||
config: params.config,
|
||||
sessionKey: params.sessionKey,
|
||||
sessionId: params.sessionId,
|
||||
warn: makeBootstrapWarn({ sessionLabel, warn: (message) => log.warn(message) }),
|
||||
contextMode: params.bootstrapContextMode,
|
||||
runKind: params.bootstrapContextRunKind,
|
||||
}),
|
||||
getMachineDisplayName(),
|
||||
]);
|
||||
const bootstrapMaxChars = resolveBootstrapMaxChars(params.config);
|
||||
const bootstrapTotalMaxChars = resolveBootstrapTotalMaxChars(params.config);
|
||||
const bootstrapAnalysis = analyzeBootstrapBudget({
|
||||
@ -1827,7 +1831,6 @@ export async function runEmbeddedAttempt(
|
||||
});
|
||||
logToolSchemasForGoogle({ tools: effectiveTools, provider: params.provider });
|
||||
|
||||
const machineName = await getMachineDisplayName();
|
||||
const runtimeChannel = normalizeMessageChannel(params.messageChannel ?? params.messageProvider);
|
||||
let runtimeCapabilities = runtimeChannel
|
||||
? (resolveChannelCapabilities({
|
||||
|
||||
82
src/memory/chunking/chunk-strategy.ts
Normal file
82
src/memory/chunking/chunk-strategy.ts
Normal file
@ -0,0 +1,82 @@
|
||||
/**
|
||||
* Chunking strategy interface and factory for memory segmentation.
|
||||
*
|
||||
* This module provides the abstraction layer for different chunking approaches,
|
||||
* allowing for easy extension and configuration.
|
||||
*/
|
||||
|
||||
import type { MemoryChunk } from "../internal.js";
|
||||
|
||||
/**
 * Configuration for chunking behavior.
 *
 * Sizes are expressed in estimated UTF-8 bytes rather than tokens; see
 * `chunkingConfigFromTokens` for converting legacy token-based settings
 * (it uses a 4-bytes-per-token heuristic).
 */
export type ChunkingConfig = {
  /**
   * Maximum chunk size in estimated UTF-8 bytes.
   * This is a conservative upper bound for tokenizer output.
   */
  maxBytes: number;

  /**
   * Overlap between consecutive chunks in estimated UTF-8 bytes.
   * Helps maintain context across chunk boundaries.
   */
  overlapBytes: number;

  /**
   * Whether to preserve markdown structure when splitting.
   * When enabled, splits will prefer boundaries like headings, code blocks, etc.
   * NOTE(review): the SimpleChunker currently ignores this flag (it is
   * documented there as reserved for future structure-aware chunkers).
   */
  preserveStructure?: boolean;
};
|
||||
|
||||
/**
 * Result of a chunking operation.
 *
 * Bundles the produced chunks with aggregate statistics about the run.
 */
export type ChunkingResult = {
  /** The chunks produced by the strategy, in document order. */
  chunks: MemoryChunk[];
  /** Aggregate statistics over `chunks`. */
  metadata: {
    /** Number of chunks produced. */
    totalChunks: number;
    /** Sum of estimated UTF-8 bytes across all chunks. */
    totalBytes: number;
    /** `totalBytes / totalChunks` (mean chunk size in bytes). */
    averageChunkBytes: number;
  };
};
|
||||
|
||||
/**
 * Interface for chunking strategies.
 *
 * Implementations take raw text plus a `ChunkingConfig` and return
 * `MemoryChunk`s (from ../internal.js) ready for indexing.
 */
export interface ChunkStrategy {
  /**
   * The name of this strategy (a short stable identifier, e.g. "simple").
   */
  readonly name: string;

  /**
   * Chunk the given content according to the strategy.
   *
   * @param content - The text content to chunk
   * @param config - Chunking configuration
   * @returns Array of memory chunks
   */
  chunk(content: string, config: ChunkingConfig): MemoryChunk[];
}
|
||||
|
||||
/**
 * Default chunking configuration.
 *
 * Byte values assume the ~4 bytes/token heuristic also used by
 * `chunkingConfigFromTokens`.
 */
export const DEFAULT_CHUNKING_CONFIG: ChunkingConfig = {
  maxBytes: 2000, // ~500 tokens at 4 bytes/token
  overlapBytes: 200, // ~50 tokens overlap
  preserveStructure: true,
};
|
||||
|
||||
/**
|
||||
* Create a chunking config from legacy token-based settings.
|
||||
*/
|
||||
export function chunkingConfigFromTokens(tokens: number, overlap: number): ChunkingConfig {
|
||||
return {
|
||||
maxBytes: Math.max(32, tokens * 4),
|
||||
overlapBytes: Math.max(0, overlap * 4),
|
||||
preserveStructure: true,
|
||||
};
|
||||
}
|
||||
331
src/memory/chunking/chunker.test.ts
Normal file
331
src/memory/chunking/chunker.test.ts
Normal file
@ -0,0 +1,331 @@
|
||||
/**
|
||||
* Tests for the chunking module.
|
||||
*/
|
||||
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { chunkMarkdown, simpleChunker } from "./simple-chunker.js";
|
||||
import {
|
||||
findSafeSplitPoint,
|
||||
splitLongLine,
|
||||
isHeading,
|
||||
isCodeBlockFence,
|
||||
isListItem,
|
||||
isEmptyLine,
|
||||
isStructuralBoundary,
|
||||
getLineRole,
|
||||
type LineRole,
|
||||
} from "./markdown-boundaries.js";
|
||||
import { chunkingConfigFromTokens, DEFAULT_CHUNKING_CONFIG } from "./chunk-strategy.js";
|
||||
|
||||
describe("chunking.markdownBoundaries", () => {
|
||||
describe("isHeading", () => {
|
||||
it("detects ATX headings", () => {
|
||||
expect(isHeading("# Heading 1")).toBe(true);
|
||||
expect(isHeading("## Heading 2")).toBe(true);
|
||||
expect(isHeading("### Heading 3")).toBe(true);
|
||||
expect(isHeading(" ## Indented heading")).toBe(true);
|
||||
expect(isHeading("Not a heading")).toBe(false);
|
||||
expect(isHeading("Seven#s is not a heading")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isCodeBlockFence", () => {
|
||||
it("detects code fences", () => {
|
||||
expect(isCodeBlockFence("```typescript")).toBe(true);
|
||||
expect(isCodeBlockFence("```js")).toBe(true);
|
||||
expect(isCodeBlockFence(" ```python")).toBe(true);
|
||||
expect(isCodeBlockFence("Not a fence")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isListItem", () => {
|
||||
it("detects list items", () => {
|
||||
expect(isListItem("- item")).toBe(true);
|
||||
expect(isListItem("* item")).toBe(true);
|
||||
expect(isListItem("+ item")).toBe(true);
|
||||
expect(isListItem("1. item")).toBe(true);
|
||||
expect(isListItem(" - indented")).toBe(true);
|
||||
expect(isListItem("not a list")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isEmptyLine", () => {
|
||||
it("detects empty lines", () => {
|
||||
expect(isEmptyLine("")).toBe(true);
|
||||
expect(isEmptyLine(" ")).toBe(true);
|
||||
expect(isEmptyLine("\t")).toBe(true);
|
||||
expect(isEmptyLine("content")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("isStructuralBoundary", () => {
|
||||
it("identifies good split points", () => {
|
||||
expect(isStructuralBoundary("")).toBe(true);
|
||||
expect(isStructuralBoundary(" ")).toBe(true);
|
||||
expect(isStructuralBoundary("# Heading")).toBe(true);
|
||||
expect(isStructuralBoundary("---")).toBe(true);
|
||||
expect(isStructuralBoundary("***")).toBe(true);
|
||||
expect(isStructuralBoundary("regular content")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("findSafeSplitPoint", () => {
|
||||
it("returns full length when line is short enough", () => {
|
||||
expect(findSafeSplitPoint("short", 100)).toBe(5);
|
||||
});
|
||||
|
||||
it("splits at word boundaries when possible", () => {
|
||||
const line = "hello world foo bar baz";
|
||||
expect(findSafeSplitPoint(line, 15)).toBe(11); // After "world "
|
||||
});
|
||||
|
||||
it("splits at punctuation when no spaces available", () => {
|
||||
const line = "hello,world,foo,bar";
|
||||
// Algorithm finds the split point closest to maxLength
|
||||
// With maxLength=12, it finds the comma at index 11 (",")
|
||||
// and returns 12 (after the comma)
|
||||
expect(findSafeSplitPoint(line, 12)).toBe(12);
|
||||
});
|
||||
|
||||
it("falls back to hard split when needed", () => {
|
||||
const line = "averylongwordwithnoboundaries";
|
||||
expect(findSafeSplitPoint(line, 10)).toBe(10);
|
||||
});
|
||||
|
||||
it("handles edge cases", () => {
|
||||
expect(findSafeSplitPoint("", 10)).toBe(0);
|
||||
expect(findSafeSplitPoint("a", 0)).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("splitLongLine", () => {
|
||||
it("returns single segment for short lines", () => {
|
||||
expect(splitLongLine("short", 100)).toEqual(["short"]);
|
||||
});
|
||||
|
||||
it("splits at word boundaries", () => {
|
||||
const line = "hello world foo bar baz";
|
||||
const result = splitLongLine(line, 15);
|
||||
expect(result.length).toBeGreaterThan(1);
|
||||
expect(result[0]).toBe("hello world");
|
||||
expect(result[1]).toBe("foo bar baz");
|
||||
});
|
||||
|
||||
it("trims whitespace from split segments", () => {
|
||||
const line = "hello world foo bar";
|
||||
const result = splitLongLine(line, 10);
|
||||
result.forEach((seg) => {
|
||||
expect(seg).not.toMatch(/^\s/);
|
||||
expect(seg).not.toMatch(/\s$/);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("getLineRole", () => {
|
||||
const roles: LineRole[] = [
|
||||
"empty",
|
||||
"heading",
|
||||
"code_fence",
|
||||
"list_item",
|
||||
"blockquote",
|
||||
"thematic_break",
|
||||
"content",
|
||||
];
|
||||
|
||||
it("classifies all line types correctly", () => {
|
||||
expect(getLineRole("")).toBe("empty");
|
||||
expect(getLineRole("# Heading")).toBe("heading");
|
||||
expect(getLineRole("```ts")).toBe("code_fence");
|
||||
expect(getLineRole("- item")).toBe("list_item");
|
||||
expect(getLineRole("> quote")).toBe("blockquote");
|
||||
expect(getLineRole("---")).toBe("thematic_break");
|
||||
expect(getLineRole("regular content")).toBe("content");
|
||||
});
|
||||
|
||||
it("returns one of the valid roles", () => {
|
||||
const role = getLineRole("some line");
|
||||
expect(roles).toContain(role);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunking.chunkStrategy", () => {
|
||||
describe("chunkingConfigFromTokens", () => {
|
||||
it("converts token-based config to byte-based", () => {
|
||||
const config = chunkingConfigFromTokens(500, 50);
|
||||
expect(config.maxBytes).toBe(2000); // 500 * 4
|
||||
expect(config.overlapBytes).toBe(200); // 50 * 4
|
||||
expect(config.preserveStructure).toBe(true);
|
||||
});
|
||||
|
||||
it("handles zero overlap", () => {
|
||||
const config = chunkingConfigFromTokens(500, 0);
|
||||
expect(config.overlapBytes).toBe(0);
|
||||
});
|
||||
|
||||
it("enforces minimum maxBytes", () => {
|
||||
const config = chunkingConfigFromTokens(1, 0);
|
||||
expect(config.maxBytes).toBe(32); // Minimum
|
||||
});
|
||||
});
|
||||
|
||||
describe("DEFAULT_CHUNKING_CONFIG", () => {
|
||||
it("has sensible defaults", () => {
|
||||
expect(DEFAULT_CHUNKING_CONFIG.maxBytes).toBe(2000);
|
||||
expect(DEFAULT_CHUNKING_CONFIG.overlapBytes).toBe(200);
|
||||
expect(DEFAULT_CHUNKING_CONFIG.preserveStructure).toBe(true);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe("chunking.simpleChunker", () => {
|
||||
describe("chunkMarkdown", () => {
|
||||
it("handles empty content", () => {
|
||||
const chunks = chunkMarkdown("", { tokens: 500, overlap: 50 });
|
||||
expect(chunks).toEqual([]);
|
||||
});
|
||||
|
||||
it("handles single line content", () => {
|
||||
const content = "single line";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks.length).toBe(1);
|
||||
expect(chunks[0]?.text).toBe(content);
|
||||
});
|
||||
|
||||
it("handles content within max size", () => {
|
||||
const content = "line 1\nline 2\nline 3";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks.length).toBe(1);
|
||||
expect(chunks[0]?.text).toBe(content);
|
||||
});
|
||||
|
||||
it("splits content that exceeds max size", () => {
|
||||
const content = Array(100).fill("a line of text").join("\n");
|
||||
const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 });
|
||||
expect(chunks.length).toBeGreaterThan(1);
|
||||
});
|
||||
|
||||
it("includes overlap between chunks", () => {
|
||||
// Create content that will split into 2 chunks with overlap
|
||||
const lines = Array(20).fill("a line");
|
||||
const content = lines.join("\n");
|
||||
// Smaller maxBytes and larger overlap to ensure overlap occurs
|
||||
const chunks = chunkMarkdown(content, { tokens: 6, overlap: 4 });
|
||||
|
||||
expect(chunks.length).toBeGreaterThan(1);
|
||||
|
||||
// Check that there's some overlap in line numbers
|
||||
const firstEndLine = chunks[0]?.endLine ?? 0;
|
||||
const secondStartLine = chunks[1]?.startLine ?? Infinity;
|
||||
expect(secondStartLine).toBeLessThan(firstEndLine);
|
||||
});
|
||||
|
||||
it("preserves line numbers correctly", () => {
|
||||
const content = "line 1\nline 2\nline 3\nline 4";
|
||||
const chunks = chunkMarkdown(content, { tokens: 2, overlap: 0 });
|
||||
|
||||
for (let i = 0; i < chunks.length; i += 1) {
|
||||
const chunk = chunks[i];
|
||||
expect(chunk).toBeDefined();
|
||||
expect(chunk?.startLine).toBeGreaterThan(0);
|
||||
expect(chunk?.endLine).toBeGreaterThanOrEqual(chunk?.startLine ?? 0);
|
||||
|
||||
// Verify line numbers are sequential and increasing
|
||||
if (i > 0 && chunk) {
|
||||
const prevChunk = chunks[i - 1];
|
||||
expect(chunk.startLine).toBeGreaterThan(prevChunk?.endLine ?? 0);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("generates correct hashes", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks[0]?.hash).toBeDefined();
|
||||
expect(chunks[0]?.hash.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it("includes embeddingInput", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, { tokens: 500, overlap: 50 });
|
||||
expect(chunks[0]?.embeddingInput).toEqual({
|
||||
text: content,
|
||||
});
|
||||
});
|
||||
|
||||
it("handles legacy config format", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, {
|
||||
tokens: 500,
|
||||
overlap: 50,
|
||||
});
|
||||
expect(chunks.length).toBe(1);
|
||||
});
|
||||
|
||||
it("handles new byte-based config format", () => {
|
||||
const content = "test content";
|
||||
const chunks = chunkMarkdown(content, {
|
||||
maxBytes: 2000,
|
||||
overlapBytes: 200,
|
||||
});
|
||||
expect(chunks.length).toBe(1);
|
||||
});
|
||||
|
||||
it("splits long lines at natural boundaries", () => {
|
||||
// Create a line that's too long but has word boundaries
|
||||
const words = Array(50).fill("word");
|
||||
const content = words.join(" "); // All on one line
|
||||
|
||||
const chunks = chunkMarkdown(content, { tokens: 10, overlap: 0 });
|
||||
expect(chunks.length).toBeGreaterThan(0);
|
||||
|
||||
// Check that chunks don't have trailing/leading spaces from splits
|
||||
for (const chunk of chunks) {
|
||||
const lines = chunk.text.split("\n");
|
||||
for (const line of lines) {
|
||||
// Internal newlines from splits shouldn't have odd whitespace
|
||||
if (line.trim().length > 0) {
|
||||
expect(line).not.toMatch(/^\s/);
|
||||
expect(line).not.toMatch(/\s$/);
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
it("handles mixed short and long lines", () => {
|
||||
const content = [
|
||||
"short line",
|
||||
"another short",
|
||||
Array(100).fill("x").join(""), // long line without spaces
|
||||
"back to short",
|
||||
"another one",
|
||||
].join("\n");
|
||||
|
||||
const chunks = chunkMarkdown(content, { tokens: 10, overlap: 2 });
|
||||
expect(chunks.length).toBeGreaterThan(0);
|
||||
|
||||
// Verify all chunks have proper structure
|
||||
for (const chunk of chunks) {
|
||||
expect(chunk.startLine).toBeGreaterThan(0);
|
||||
expect(chunk.endLine).toBeGreaterThanOrEqual(chunk.startLine);
|
||||
expect(chunk.hash).toBeDefined();
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
describe("simpleChunker", () => {
|
||||
it("uses the SimpleChunker instance", () => {
|
||||
expect(simpleChunker.name).toBe("simple");
|
||||
});
|
||||
|
||||
it("chunks content via the interface", () => {
|
||||
const content = "line 1\nline 2\nline 3";
|
||||
const chunks = simpleChunker.chunk(content, {
|
||||
maxBytes: 1000,
|
||||
overlapBytes: 100,
|
||||
});
|
||||
expect(chunks.length).toBe(1);
|
||||
expect(chunks[0]?.text).toBe(content);
|
||||
});
|
||||
});
|
||||
});
|
||||
21
src/memory/chunking/index.ts
Normal file
21
src/memory/chunking/index.ts
Normal file
@ -0,0 +1,21 @@
|
||||
/**
 * Chunking module for memory segmentation.
 *
 * This module provides a modular, extensible approach to chunking
 * markdown content for memory indexing.
 *
 * @example
 * ```ts
 * import { chunkMarkdown } from "./chunking";
 *
 * const chunks = chunkMarkdown(content, {
 *   maxBytes: 2000,
 *   overlapBytes: 200,
 * });
 * ```
 */

// Re-export the public surface of each submodule for convenience:
// strategy types/defaults, the simple chunker, and the markdown helpers.
export * from "./chunk-strategy.js";
export * from "./simple-chunker.js";
export * from "./markdown-boundaries.js";
|
||||
284
src/memory/chunking/markdown-boundaries.ts
Normal file
284
src/memory/chunking/markdown-boundaries.ts
Normal file
@ -0,0 +1,284 @@
|
||||
/**
|
||||
* Utilities for detecting markdown structure and finding natural split points.
|
||||
*
|
||||
* These helpers identify safe places to split markdown content while
|
||||
* preserving semantic structure.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Check if a line is a markdown heading.
|
||||
*/
|
||||
export function isHeading(line: string): boolean {
|
||||
const trimmed = line.trimStart();
|
||||
return /^#{1,6}\s/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line starts or ends a code block fence.
|
||||
*/
|
||||
export function isCodeBlockFence(line: string): boolean {
|
||||
const trimmed = line.trim();
|
||||
return /^```|^~~~/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is a list item.
|
||||
*/
|
||||
export function isListItem(line: string): boolean {
|
||||
const trimmed = line.trimStart();
|
||||
return /^[-*+]\s|^\d+\.\s/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is a blockquote.
|
||||
*/
|
||||
export function isBlockquote(line: string): boolean {
|
||||
const trimmed = line.trimStart();
|
||||
return trimmed.startsWith(">");
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is empty (whitespace only).
|
||||
*/
|
||||
export function isEmptyLine(line: string): boolean {
|
||||
return line.trim().length === 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a line is a thematic break (horizontal rule).
|
||||
*/
|
||||
export function isThematicBreak(line: string): boolean {
|
||||
const trimmed = line.trim();
|
||||
return /^[-*_]{3,}\s*$/.test(trimmed);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determine if a line is a structural boundary (good place to split).
|
||||
*/
|
||||
export function isStructuralBoundary(line: string): boolean {
|
||||
return (
|
||||
isEmptyLine(line) ||
|
||||
isHeading(line) ||
|
||||
isThematicBreak(line)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Find a safe split point within a long line.
|
||||
*
|
||||
* Looks for natural boundaries like spaces, punctuation, etc.
|
||||
* Returns the index where splitting should occur.
|
||||
*
|
||||
* @param line - The line to split
|
||||
* @param maxLength - Maximum length for the split
|
||||
* @returns Index to split at (always a valid index between 0 and maxLength inclusive)
|
||||
*/
|
||||
export function findSafeSplitPoint(line: string, maxLength: number): number {
|
||||
if (line.length <= maxLength) {
|
||||
return line.length;
|
||||
}
|
||||
|
||||
// Search backwards from maxLength for a good split point
|
||||
const searchEnd = Math.min(maxLength, line.length);
|
||||
|
||||
// Priority 1: Space followed by non-space (word boundary)
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (line[i - 1] === " " && line[i] !== " ") {
|
||||
return i - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 2: Common punctuation that can end a segment
|
||||
// Single pass to find the rightmost punctuation of any type
|
||||
const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (punctSet.has(line[i - 1]) && i < line.length && line[i] !== line[i - 1]) {
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
// Priority 3: Any whitespace character
|
||||
for (let i = searchEnd; i > 0; i--) {
|
||||
if (/\s/.test(line[i - 1]) && !/\s/.test(line[i])) {
|
||||
return i - 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Fallback: hard split at maxLength
|
||||
return maxLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Split a long line at safe points.
|
||||
*
|
||||
* @param line - The line to split
|
||||
* @param maxLength - Maximum length per segment
|
||||
* @returns Array of line segments
|
||||
*/
|
||||
export function splitLongLine(line: string, maxLength: number): string[] {
|
||||
if (line.length <= maxLength) {
|
||||
return [line];
|
||||
}
|
||||
|
||||
const segments: string[] = [];
|
||||
let remaining = line;
|
||||
|
||||
while (remaining.length > maxLength) {
|
||||
const splitPoint = findSafeSplitPoint(remaining, maxLength);
|
||||
|
||||
if (splitPoint <= 0) {
|
||||
// Can't find a good split, force it
|
||||
segments.push(remaining.slice(0, maxLength));
|
||||
remaining = remaining.slice(maxLength);
|
||||
} else {
|
||||
segments.push(remaining.slice(0, splitPoint).trimEnd());
|
||||
remaining = remaining.slice(splitPoint).trimStart();
|
||||
}
|
||||
}
|
||||
|
||||
if (remaining.length > 0) {
|
||||
segments.push(remaining);
|
||||
}
|
||||
|
||||
return segments;
|
||||
}
|
||||
|
||||
/**
 * Analyze a line to determine its role in document structure.
 *
 * Roles mirror the predicate helpers in this module; `getLineRole` checks
 * them in this order and falls back to "content".
 */
export type LineRole =
  | "empty"
  | "heading"
  | "code_fence"
  | "list_item"
  | "blockquote"
  | "thematic_break"
  | "content";
|
||||
|
||||
/**
|
||||
* Get the structural role of a line.
|
||||
*/
|
||||
export function getLineRole(line: string): LineRole {
|
||||
if (isEmptyLine(line)) return "empty";
|
||||
if (isHeading(line)) return "heading";
|
||||
if (isCodeBlockFence(line)) return "code_fence";
|
||||
if (isListItem(line)) return "list_item";
|
||||
if (isBlockquote(line)) return "blockquote";
|
||||
if (isThematicBreak(line)) return "thematic_break";
|
||||
return "content";
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the heading level (1-6) from a heading line.
|
||||
* Returns 0 if the line is not a heading.
|
||||
*/
|
||||
export function getHeadingLevel(line: string): number {
|
||||
const trimmed = line.trimStart();
|
||||
const match = trimmed.match(/^#{1,6}\s/);
|
||||
if (match) {
|
||||
return match[0].trim().length;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate UTF-8 byte size of a string.
|
||||
*/
|
||||
function estimateUtf8Bytes(text: string): number {
|
||||
if (!text) {
|
||||
return 0;
|
||||
}
|
||||
return Buffer.byteLength(text, "utf8");
|
||||
}
|
||||
|
||||
/**
 * Split a long line at safe points, respecting UTF-8 byte limits.
 *
 * This function is byte-aware and correctly handles multi-byte characters
 * (CJK, emoji, etc.) that would otherwise exceed byte limits when using
 * character-based splitting.
 *
 * Loop invariant: each iteration removes at least one character from
 * `remaining` (splitPoint is forced to >= 1), so the loop terminates.
 * NOTE(review): slicing by UTF-16 index can cut a surrogate pair;
 * Buffer.byteLength encodes a lone surrogate as the 3-byte replacement
 * character, so byte estimates stay bounded — confirm this is acceptable
 * for emoji-heavy input.
 *
 * @param line - The line to split
 * @param maxBytes - Maximum UTF-8 bytes per segment
 * @returns Array of line segments
 */
export function splitLongLineByBytes(line: string, maxBytes: number): string[] {
  const totalBytes = estimateUtf8Bytes(line);
  if (totalBytes <= maxBytes) {
    return [line];
  }

  const segments: string[] = [];
  let remaining = line;

  while (remaining.length > 0) {
    const remainingBytes = estimateUtf8Bytes(remaining);
    if (remainingBytes <= maxBytes) {
      segments.push(remaining);
      break;
    }

    // Find the longest substring that fits within maxBytes
    // UTF-16 code units are always <= UTF-8 bytes, so we can use character count
    // as an upper bound for the binary search
    let high = Math.min(remaining.length, maxBytes);
    let low = 0;
    let bestEnd = 0;

    // Binary search for the split point (largest prefix <= maxBytes)
    while (low <= high) {
      const mid = Math.floor((low + high) / 2);
      const candidate = remaining.slice(0, mid);
      const bytes = estimateUtf8Bytes(candidate);

      if (bytes <= maxBytes) {
        bestEnd = mid;
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }

    // Now try to find a better split point within the safe range
    // Look for natural boundaries before bestEnd
    const searchEnd = Math.min(bestEnd, remaining.length);
    let splitPoint = bestEnd;

    // Priority 1: Space followed by non-space (word boundary)
    for (let i = searchEnd; i > 0; i--) {
      const testSegment = remaining.slice(0, i);
      if (estimateUtf8Bytes(testSegment) > maxBytes) {
        break;
      }
      if (remaining[i - 1] === " " && remaining[i] !== " ") {
        splitPoint = i - 1;
        break;
      }
    }

    // Priority 2: Common punctuation (only if space search didn't find a boundary)
    if (splitPoint === bestEnd) {
      const punctSet = new Set([".", ",", ";", ":", "!", "?", ")", "]", "}"]);
      for (let i = searchEnd; i > 0; i--) {
        const testSegment = remaining.slice(0, i);
        if (estimateUtf8Bytes(testSegment) > maxBytes) {
          break;
        }
        if (punctSet.has(remaining[i - 1]) && i < remaining.length && remaining[i] !== remaining[i - 1]) {
          splitPoint = i;
          break;
        }
      }
    }

    // A boundary search can land on index 0 (e.g. leading space); force
    // progress with at least one character so the loop terminates.
    if (splitPoint <= 0) {
      // Can't find a good split, force at bestEnd
      splitPoint = Math.max(1, bestEnd);
    }

    segments.push(remaining.slice(0, splitPoint).trimEnd());
    remaining = remaining.slice(splitPoint).trimStart();
  }

  return segments;
}
|
||||
162
src/memory/chunking/simple-chunker.ts
Normal file
162
src/memory/chunking/simple-chunker.ts
Normal file
@ -0,0 +1,162 @@
|
||||
/**
|
||||
* Simple chunking strategy with improved boundary detection.
|
||||
*
|
||||
* This is the default chunker that:
|
||||
* 1. Uses UTF-8 byte estimation for consistent sizing
|
||||
* 2. Preserves line structure when possible
|
||||
* 3. Splits long lines at natural boundaries (spaces, punctuation)
|
||||
* 4. Maintains overlap using complete lines/semantic units
|
||||
*/
|
||||
|
||||
import { estimateUtf8Bytes } from "../embedding-input-limits.js";
|
||||
import { hashText, type MemoryChunk } from "../internal.js";
|
||||
import type { ChunkStrategy, ChunkingConfig } from "./chunk-strategy.js";
|
||||
import { splitLongLineByBytes } from "./markdown-boundaries.js";
|
||||
import { buildTextEmbeddingInput } from "../embedding-inputs.js";
|
||||
|
||||
/**
 * Represents a single line with its metadata.
 * Used internally by SimpleChunker to accumulate lines into chunks.
 */
type LineEntry = {
  // Raw line text (no trailing newline).
  line: string;
  // 1-based line number in the original content.
  lineNo: number;
  // Estimated UTF-8 byte size of `line` plus 1 for the newline separator.
  byteSize: number;
};
|
||||
|
||||
/**
 * Simple chunker implementation.
 *
 * Greedily accumulates whole lines into chunks bounded by
 * `config.maxBytes` (estimated UTF-8 bytes, +1 per line for the newline),
 * splitting over-long lines via `splitLongLineByBytes` and carrying up to
 * `config.overlapBytes` of trailing lines into the next chunk.
 */
export class SimpleChunker implements ChunkStrategy {
  readonly name = "simple";

  /**
   * Chunk `content` into MemoryChunks.
   *
   * @param content - Text to chunk; split on "\n", line numbers are 1-based
   * @param config - Byte budgets; `preserveStructure` is ignored here
   * @returns Chunks in document order; empty array for empty content
   */
  chunk(content: string, config: ChunkingConfig): MemoryChunk[] {
    // Note: config.preserveStructure is reserved for future Phase 2+ features.
    // The SimpleChunker handles basic UTF-8 byte-aware chunking; use
    // SemanticChunker for structure-aware splitting.
    const lines = content.split("\n");
    // Handle empty content - split("\n") on empty string returns [""]
    if (lines.length === 0 || (lines.length === 1 && lines[0] === "")) {
      return [];
    }

    // Pre-compute byte sizes for all lines
    const lineEntries: LineEntry[] = lines.map((line, i) => ({
      line,
      lineNo: i + 1,
      byteSize: estimateUtf8Bytes(line) + 1, // +1 for newline
    }));

    const chunks: MemoryChunk[] = [];
    let currentEntries: LineEntry[] = [];
    let currentBytes = 0;

    // Emit the accumulated lines as one MemoryChunk (no-op when empty).
    const flush = () => {
      if (currentEntries.length === 0) {
        return;
      }
      const firstEntry = currentEntries[0]!;
      const lastEntry = currentEntries[currentEntries.length - 1]!;
      const text = currentEntries.map((e) => e.line).join("\n");
      chunks.push({
        startLine: firstEntry.lineNo,
        endLine: lastEntry.lineNo,
        text,
        hash: hashText(text),
        embeddingInput: buildTextEmbeddingInput(text),
      });
    };

    // After a flush, retain trailing lines (up to overlapBytes) so the next
    // chunk starts with context from the previous one. Always keeps at least
    // one line once any line fits (see `kept.length > 0` guard).
    const carryOverlap = () => {
      const { overlapBytes } = config;
      if (overlapBytes <= 0 || currentEntries.length === 0) {
        currentEntries = [];
        currentBytes = 0;
        return;
      }

      // Keep whole lines from the end until we exceed overlap
      let acc = 0;
      const kept: LineEntry[] = [];
      for (let i = currentEntries.length - 1; i >= 0; i -= 1) {
        const entry = currentEntries[i]!;
        if (acc + entry.byteSize > overlapBytes && kept.length > 0) {
          break;
        }
        kept.unshift(entry);
        acc += entry.byteSize;
      }

      currentEntries = kept;
      currentBytes = acc;
    };

    for (const entry of lineEntries) {
      // Handle long lines by splitting them at natural boundaries
      const segments = this.splitEntryIfNeeded(entry, config.maxBytes);

      for (const segment of segments) {
        // Only flush when the chunk is non-empty; a single over-budget
        // segment still goes into its own chunk rather than being dropped.
        const wouldExceed =
          currentBytes + segment.byteSize > config.maxBytes && currentEntries.length > 0;

        if (wouldExceed) {
          flush();
          carryOverlap();
        }

        currentEntries.push(segment);
        currentBytes += segment.byteSize;
      }
    }

    flush();
    return chunks;
  }

  /**
   * Split a line entry if it exceeds the max bytes.
   * Returns an array of entries (original or split).
   * All split segments keep the original entry's line number.
   */
  private splitEntryIfNeeded(entry: LineEntry, maxBytes: number): LineEntry[] {
    // If the line fits, return as-is
    if (entry.byteSize <= maxBytes) {
      return [entry];
    }

    // Split the long line at natural boundaries, respecting UTF-8 byte limits
    const maxTextBytes = maxBytes - 1; // Account for newline
    const textSegments = splitLongLineByBytes(entry.line, maxTextBytes);

    return textSegments.map((segment) => ({
      line: segment,
      lineNo: entry.lineNo,
      byteSize: estimateUtf8Bytes(segment) + 1,
    }));
  }
}
|
||||
|
||||
/**
 * Singleton instance for convenience.
 * SimpleChunker is stateless, so sharing one instance is safe.
 */
export const simpleChunker = new SimpleChunker();
|
||||
|
||||
/**
|
||||
* Chunk markdown content using the simple strategy.
|
||||
*
|
||||
* This is the main export that replaces the legacy chunkMarkdown function.
|
||||
*
|
||||
* @param content - The markdown content to chunk
|
||||
* @param config - Chunking configuration (or legacy { tokens, overlap } object)
|
||||
* @returns Array of memory chunks
|
||||
*/
|
||||
export function chunkMarkdown(
|
||||
content: string,
|
||||
config: ChunkingConfig | { tokens: number; overlap: number },
|
||||
): MemoryChunk[] {
|
||||
// Support legacy config format
|
||||
const chunkingConfig =
|
||||
"maxBytes" in config
|
||||
? config
|
||||
: { maxBytes: Math.max(32, config.tokens * 4), overlapBytes: Math.max(0, config.overlap * 4) };
|
||||
|
||||
return simpleChunker.chunk(content, chunkingConfig);
|
||||
}
|
||||
@ -331,88 +331,13 @@ export async function buildMultimodalChunkForIndexing(
|
||||
};
|
||||
}
|
||||
|
||||
import { chunkMarkdown as chunkMarkdownImpl } from "./chunking/simple-chunker.js";
|
||||
|
||||
export function chunkMarkdown(
|
||||
content: string,
|
||||
chunking: { tokens: number; overlap: number },
|
||||
): MemoryChunk[] {
|
||||
const lines = content.split("\n");
|
||||
if (lines.length === 0) {
|
||||
return [];
|
||||
}
|
||||
const maxChars = Math.max(32, chunking.tokens * 4);
|
||||
const overlapChars = Math.max(0, chunking.overlap * 4);
|
||||
const chunks: MemoryChunk[] = [];
|
||||
|
||||
let current: Array<{ line: string; lineNo: number }> = [];
|
||||
let currentChars = 0;
|
||||
|
||||
const flush = () => {
|
||||
if (current.length === 0) {
|
||||
return;
|
||||
}
|
||||
const firstEntry = current[0];
|
||||
const lastEntry = current[current.length - 1];
|
||||
if (!firstEntry || !lastEntry) {
|
||||
return;
|
||||
}
|
||||
const text = current.map((entry) => entry.line).join("\n");
|
||||
const startLine = firstEntry.lineNo;
|
||||
const endLine = lastEntry.lineNo;
|
||||
chunks.push({
|
||||
startLine,
|
||||
endLine,
|
||||
text,
|
||||
hash: hashText(text),
|
||||
embeddingInput: buildTextEmbeddingInput(text),
|
||||
});
|
||||
};
|
||||
|
||||
const carryOverlap = () => {
|
||||
if (overlapChars <= 0 || current.length === 0) {
|
||||
current = [];
|
||||
currentChars = 0;
|
||||
return;
|
||||
}
|
||||
let acc = 0;
|
||||
const kept: Array<{ line: string; lineNo: number }> = [];
|
||||
for (let i = current.length - 1; i >= 0; i -= 1) {
|
||||
const entry = current[i];
|
||||
if (!entry) {
|
||||
continue;
|
||||
}
|
||||
acc += entry.line.length + 1;
|
||||
kept.unshift(entry);
|
||||
if (acc >= overlapChars) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
current = kept;
|
||||
currentChars = kept.reduce((sum, entry) => sum + entry.line.length + 1, 0);
|
||||
};
|
||||
|
||||
for (let i = 0; i < lines.length; i += 1) {
|
||||
const line = lines[i] ?? "";
|
||||
const lineNo = i + 1;
|
||||
const segments: string[] = [];
|
||||
if (line.length === 0) {
|
||||
segments.push("");
|
||||
} else {
|
||||
for (let start = 0; start < line.length; start += maxChars) {
|
||||
segments.push(line.slice(start, start + maxChars));
|
||||
}
|
||||
}
|
||||
for (const segment of segments) {
|
||||
const lineSize = segment.length + 1;
|
||||
if (currentChars + lineSize > maxChars && current.length > 0) {
|
||||
flush();
|
||||
carryOverlap();
|
||||
}
|
||||
current.push({ line: segment, lineNo });
|
||||
currentChars += lineSize;
|
||||
}
|
||||
}
|
||||
flush();
|
||||
return chunks;
|
||||
return chunkMarkdownImpl(content, chunking);
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
@ -337,13 +337,17 @@ export class MemoryIndexManager extends MemoryManagerEmbeddingOps implements Mem
|
||||
}
|
||||
|
||||
// If FTS isn't available, hybrid mode cannot use keyword search; degrade to vector-only.
|
||||
const keywordResults =
|
||||
// Parallelize keyword search (local FTS) and embedding generation (network call) for better performance
|
||||
const [keywordResults, { queryVec, hasVector }] = await Promise.all([
|
||||
hybrid.enabled && this.fts.enabled && this.fts.available
|
||||
? await this.searchKeyword(cleaned, candidates).catch(() => [])
|
||||
: [];
|
||||
|
||||
const queryVec = await this.embedQueryWithTimeout(cleaned);
|
||||
const hasVector = queryVec.some((v) => v !== 0);
|
||||
? this.searchKeyword(cleaned, candidates).catch(() => [])
|
||||
: [],
|
||||
(async () => {
|
||||
const queryVec = await this.embedQueryWithTimeout(cleaned);
|
||||
const hasVector = queryVec.some((v) => v !== 0);
|
||||
return { queryVec, hasVector };
|
||||
})(),
|
||||
]);
|
||||
const vectorResults = hasVector
|
||||
? await this.searchVector(queryVec, candidates).catch(() => [])
|
||||
: [];
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user