From a4b958efcda976654b84d278a62d1362a49099e0 Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Sun, 15 Feb 2026 13:44:41 +0000
Subject: [PATCH] perf(test): cover embedding chunk limits without indexing

---
Reviewer notes (free-form text after the three-dash line; it is neither part
of the commit message nor of the diff below):

* Adds src/memory/embedding-chunk-limits.test.ts, which calls
  enforceEmbeddingMaxInputTokens() directly with a mock EmbeddingProvider
  (maxInputTokens = 8192). Unlike the deleted test, it does not write files
  or run a manager sync.
* Both new tests assert that the input is split into more than one chunk,
  that the chunk texts concatenate back to the exact input text, and that
  estimateUtf8Bytes() of every chunk is <= 8192.
* The first new test additionally asserts that every chunk keeps
  startLine === 1 and endLine === 1 and carries a non-empty string hash.
  The second asserts that the re-joined text contains no U+FFFD
  replacement character.
* Deletes src/memory/manager.embedding-token-limit.test.ts. Its second test
  asserted exactly three embedBatch calls with one input each; the new file
  contains no equivalent assertion about batch sizes.
  NOTE(review): confirm batching by UTF-8 byte size is still covered
  elsewhere.
* NOTE(review): the first new test's title says "<= maxInputTokens bytes"
  while the assertion compares a UTF-8 byte estimate against the token
  limit. Presumably bytes serve as an upper bound for tokens; confirm
  against embedding-input-limits.

 src/memory/embedding-chunk-limits.test.ts     | 52 ++++++++++++++++
 .../manager.embedding-token-limit.test.ts     | 60 -------------------
 2 files changed, 52 insertions(+), 60 deletions(-)
 create mode 100644 src/memory/embedding-chunk-limits.test.ts
 delete mode 100644 src/memory/manager.embedding-token-limit.test.ts

diff --git a/src/memory/embedding-chunk-limits.test.ts b/src/memory/embedding-chunk-limits.test.ts
new file mode 100644
index 00000000000..9785693ee75
--- /dev/null
+++ b/src/memory/embedding-chunk-limits.test.ts
@@ -0,0 +1,52 @@
+import { describe, expect, it } from "vitest";
+import type { EmbeddingProvider } from "./embeddings.js";
+import { enforceEmbeddingMaxInputTokens } from "./embedding-chunk-limits.js";
+import { estimateUtf8Bytes } from "./embedding-input-limits.js";
+
+function createProvider(maxInputTokens: number): EmbeddingProvider {
+  return {
+    id: "mock",
+    model: "mock-embed",
+    maxInputTokens,
+    embedQuery: async () => [0],
+    embedBatch: async () => [[0]],
+  };
+}
+
+describe("embedding chunk limits", () => {
+  it("splits oversized chunks so each embedding input stays <= maxInputTokens bytes", () => {
+    const provider = createProvider(8192);
+    const input = {
+      startLine: 1,
+      endLine: 1,
+      text: "x".repeat(9000),
+      hash: "ignored",
+    };
+
+    const out = enforceEmbeddingMaxInputTokens(provider, [input]);
+    expect(out.length).toBeGreaterThan(1);
+    expect(out.map((chunk) => chunk.text).join("")).toBe(input.text);
+    expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 8192)).toBe(true);
+    expect(out.every((chunk) => chunk.startLine === 1 && chunk.endLine === 1)).toBe(true);
+    expect(out.every((chunk) => typeof chunk.hash === "string" && chunk.hash.length > 0)).toBe(
+      true,
+    );
+  });
+
+  it("does not split inside surrogate pairs (emoji)", () => {
+    const provider = createProvider(8192);
+    const emoji = "😀";
+    const inputText = `${emoji.repeat(2100)}\n${emoji.repeat(2100)}`;
+
+    const out = enforceEmbeddingMaxInputTokens(provider, [
+      { startLine: 1, endLine: 2, text: inputText, hash: "ignored" },
+    ]);
+
+    expect(out.length).toBeGreaterThan(1);
+    expect(out.map((chunk) => chunk.text).join("")).toBe(inputText);
+    expect(out.every((chunk) => estimateUtf8Bytes(chunk.text) <= 8192)).toBe(true);
+
+    // If we split inside surrogate pairs we'd likely end up with replacement chars.
+    expect(out.map((chunk) => chunk.text).join("")).not.toContain("\uFFFD");
+  });
+});
diff --git a/src/memory/manager.embedding-token-limit.test.ts b/src/memory/manager.embedding-token-limit.test.ts
deleted file mode 100644
index 51b0de421b6..00000000000
--- a/src/memory/manager.embedding-token-limit.test.ts
+++ /dev/null
@@ -1,60 +0,0 @@
-import fs from "node:fs/promises";
-import path from "node:path";
-import { describe, expect, it } from "vitest";
-import { installEmbeddingManagerFixture } from "./embedding-manager.test-harness.js";
-
-const fx = installEmbeddingManagerFixture({
-  fixturePrefix: "openclaw-mem-token-",
-  largeTokens: 10_000,
-  smallTokens: 1000,
-  createCfg: ({ workspaceDir, indexPath, tokens }) => ({
-    agents: {
-      defaults: {
-        workspace: workspaceDir,
-        memorySearch: {
-          provider: "openai",
-          model: "mock-embed",
-          store: { path: indexPath, vector: { enabled: false } },
-          chunking: { tokens, overlap: 0 },
-          sync: { watch: false, onSessionStart: false, onSearch: false },
-          query: { minScore: 0 },
-        },
-      },
-      list: [{ id: "main", default: true }],
-    },
-  }),
-});
-const { embedBatch } = fx;
-
-describe("memory embedding token limits", () => {
-  it("splits oversized chunks so each embedding input stays <= 8192 UTF-8 bytes", async () => {
-    const memoryDir = fx.getMemoryDir();
-    const managerLarge = fx.getManagerLarge();
-    const content = "x".repeat(9500);
-    await fs.writeFile(path.join(memoryDir, "2026-01-09.md"), content);
-    await managerLarge.sync({ reason: "test" });
-
-    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
-    expect(inputs.length).toBeGreaterThan(1);
-    expect(
-      Math.max(...inputs.map((input) => Buffer.byteLength(input, "utf8"))),
-    ).toBeLessThanOrEqual(8192);
-  });
-
-  it("uses UTF-8 byte estimates when batching multibyte chunks", async () => {
-    const memoryDir = fx.getMemoryDir();
-    const managerSmall = fx.getManagerSmall();
-    const line = "😀".repeat(1800);
-    const content = `${line}\n${line}\n${line}`;
-    await fs.writeFile(path.join(memoryDir, "2026-01-10.md"), content);
-    await managerSmall.sync({ reason: "test" });
-
-    const batchSizes = embedBatch.mock.calls.map(
-      (call) => (call[0] as string[] | undefined)?.length ?? 0,
-    );
-    expect(batchSizes.length).toBe(3);
-    expect(batchSizes.every((size) => size === 1)).toBe(true);
-    const inputs = embedBatch.mock.calls.flatMap((call) => call[0] ?? []);
-    expect(inputs.every((input) => Buffer.byteLength(input, "utf8") <= 8192)).toBe(true);
-  });
-});