From 208d5d6b81f6ad6b68c0550398db360d3f517a93 Mon Sep 17 00:00:00 2001 From: Saurabh Date: Fri, 20 Mar 2026 12:47:40 +0530 Subject: [PATCH 1/3] fix: handle UTF-8 multi-byte boundaries in streaming assembly --- src/agents/subagent-registry.ts | 9 +++++++- src/infra/jsonl-socket.test.ts | 41 +++++++++++++++++++++++++++++++++ src/infra/jsonl-socket.ts | 7 +++++- src/memory/qmd-process.ts | 9 ++++++-- src/tui/tui-local-shell.ts | 9 ++++++-- 5 files changed, 69 insertions(+), 6 deletions(-) diff --git a/src/agents/subagent-registry.ts b/src/agents/subagent-registry.ts index 3c11f1850a2..e05242ee42a 100644 --- a/src/agents/subagent-registry.ts +++ b/src/agents/subagent-registry.ts @@ -112,7 +112,14 @@ function capFrozenResultText(resultText: string): string { 0, FROZEN_RESULT_TEXT_MAX_BYTES - Buffer.byteLength(notice, "utf8"), ); - const payload = Buffer.from(trimmed, "utf8").subarray(0, maxPayloadBytes).toString("utf8"); + const buf = Buffer.from(trimmed, "utf8"); + let end = maxPayloadBytes; + // Walk back if we landed in the middle of a multi-byte UTF-8 sequence + // (continuation bytes have the bit pattern 10xxxxxx, i.e. 0x80..0xBF). + while (end > 0 && buf[end] !== undefined && (buf[end] & 0xc0) === 0x80) { + end--; + } + const payload = buf.subarray(0, end).toString("utf8"); return `${payload}${notice}`; } diff --git a/src/infra/jsonl-socket.test.ts b/src/infra/jsonl-socket.test.ts index af8bf0fdaed..441ab3dddc7 100644 --- a/src/infra/jsonl-socket.test.ts +++ b/src/infra/jsonl-socket.test.ts @@ -35,6 +35,47 @@ describe.runIf(process.platform !== "win32")("requestJsonlSocket", () => { }); }); + it("handles multi-byte UTF-8 characters split across TCP chunks", async () => { + await withTempDir({ prefix: "openclaw-jsonl-socket-" }, async (dir) => { + const socketPath = path.join(dir, "socket.sock"); + // "你好" in UTF-8 is 6 bytes: e4 bd a0 e5 a5 bd + // We split the JSON line so the first chunk ends mid-character. 
+ const fullLine = '{"text":"你好"}\n'; + const fullBuf = Buffer.from(fullLine, "utf8"); + // Split inside the second character (after first byte of 好) + const splitPoint = fullBuf.indexOf(0xa5); // second byte of 好 (e5 a5 bd) + const chunk1 = fullBuf.subarray(0, splitPoint); + const chunk2 = fullBuf.subarray(splitPoint); + + const server = net.createServer((socket) => { + socket.on("data", () => { + // Send the two halves separately to simulate a stream chunk boundary + // landing in the middle of a multi-byte UTF-8 sequence. + socket.write(chunk1); + setTimeout(() => socket.write(chunk2), 10); + }); + }); + await new Promise((resolve) => server.listen(socketPath, resolve)); + + try { + const result = await requestJsonlSocket({ + socketPath, + payload: "{}", + timeoutMs: 2000, + accept: (msg) => { + const value = msg as { text?: string }; + return value.text ?? null; + }, + }); + expect(result).toBe("你好"); + // Verify no U+FFFD replacement character + expect(result).not.toContain("\uFFFD"); + } finally { + server.close(); + } + }); + }); + it("returns null on timeout and on socket errors", async () => { await withTempDir({ prefix: "openclaw-jsonl-socket-" }, async (dir) => { const socketPath = path.join(dir, "socket.sock"); diff --git a/src/infra/jsonl-socket.ts b/src/infra/jsonl-socket.ts index bd485e2c139..b1d0fc6515d 100644 --- a/src/infra/jsonl-socket.ts +++ b/src/infra/jsonl-socket.ts @@ -1,4 +1,5 @@ import net from "node:net"; +import { StringDecoder } from "node:string_decoder"; export async function requestJsonlSocket(params: { socketPath: string; @@ -11,6 +12,10 @@ export async function requestJsonlSocket(params: { const client = new net.Socket(); let settled = false; let buffer = ""; + // StringDecoder buffers incomplete multi-byte UTF-8 sequences across + // socket chunks, preventing U+FFFD replacement when a character boundary + // falls on a chunk boundary. 
+ const decoder = new StringDecoder("utf8"); const finish = (value: T | null) => { if (settled) { @@ -32,7 +37,7 @@ export async function requestJsonlSocket(params: { client.write(`${payload}\n`); }); client.on("data", (data) => { - buffer += data.toString("utf8"); + buffer += decoder.write(data); let idx = buffer.indexOf("\n"); while (idx !== -1) { const line = buffer.slice(0, idx).trim(); diff --git a/src/memory/qmd-process.ts b/src/memory/qmd-process.ts index 5a70cd3c361..eb146d0c575 100644 --- a/src/memory/qmd-process.ts +++ b/src/memory/qmd-process.ts @@ -1,4 +1,5 @@ import { spawn } from "node:child_process"; +import { StringDecoder } from "node:string_decoder"; import { materializeWindowsSpawnProgram, resolveWindowsSpawnProgram, @@ -55,16 +56,20 @@ export async function runCliCommand(params: { reject(new Error(`${params.commandSummary} timed out after ${params.timeoutMs}ms`)); }, params.timeoutMs) : null; + // StringDecoder buffers incomplete multi-byte UTF-8 sequences across + // TCP/pipe chunks, preventing U+FFFD replacement at chunk boundaries. 
+ const stdoutDecoder = new StringDecoder("utf8"); + const stderrDecoder = new StringDecoder("utf8"); child.stdout.on("data", (data) => { if (discardStdout) { return; } - const next = appendOutputWithCap(stdout, data.toString("utf8"), params.maxOutputChars); + const next = appendOutputWithCap(stdout, stdoutDecoder.write(data), params.maxOutputChars); stdout = next.text; stdoutTruncated = stdoutTruncated || next.truncated; }); child.stderr.on("data", (data) => { - const next = appendOutputWithCap(stderr, data.toString("utf8"), params.maxOutputChars); + const next = appendOutputWithCap(stderr, stderrDecoder.write(data), params.maxOutputChars); stderr = next.text; stderrTruncated = stderrTruncated || next.truncated; }); diff --git a/src/tui/tui-local-shell.ts b/src/tui/tui-local-shell.ts index defea7397f8..6aa77c6658f 100644 --- a/src/tui/tui-local-shell.ts +++ b/src/tui/tui-local-shell.ts @@ -1,4 +1,5 @@ import { spawn } from "node:child_process"; +import { StringDecoder } from "node:string_decoder"; import type { Component, SelectItem } from "@mariozechner/pi-tui"; import { createSearchableSelectList } from "./components/selectors.js"; @@ -116,11 +117,15 @@ export function createLocalShellRunner(deps: LocalShellDeps) { let stdout = ""; let stderr = ""; + // StringDecoder buffers incomplete multi-byte UTF-8 sequences across + // pipe chunks, preventing U+FFFD replacement at chunk boundaries. 
+ const stdoutDecoder = new StringDecoder("utf8"); + const stderrDecoder = new StringDecoder("utf8"); child.stdout.on("data", (buf) => { - stdout = appendWithCap(stdout, buf.toString("utf8")); + stdout = appendWithCap(stdout, stdoutDecoder.write(buf)); }); child.stderr.on("data", (buf) => { - stderr = appendWithCap(stderr, buf.toString("utf8")); + stderr = appendWithCap(stderr, stderrDecoder.write(buf)); }); child.on("close", (code, signal) => { From c3f775aae48b342e2443e08b05a3936841fd8d07 Mon Sep 17 00:00:00 2001 From: Saurabh Date: Fri, 20 Mar 2026 13:20:47 +0530 Subject: [PATCH 2/3] fix: correct comment to reference pipes instead of TCP in qmd-process --- src/memory/qmd-process.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory/qmd-process.ts b/src/memory/qmd-process.ts index eb146d0c575..e55acd7b20c 100644 --- a/src/memory/qmd-process.ts +++ b/src/memory/qmd-process.ts @@ -57,7 +57,7 @@ export async function runCliCommand(params: { }, params.timeoutMs) : null; // StringDecoder buffers incomplete multi-byte UTF-8 sequences across - // TCP/pipe chunks, preventing U+FFFD replacement at chunk boundaries. + // pipe chunks, preventing U+FFFD replacement at chunk boundaries. const stdoutDecoder = new StringDecoder("utf8"); const stderrDecoder = new StringDecoder("utf8"); child.stdout.on("data", (data) => { From 110bfd8bb7d4063f994441df2a2e454f45cbac73 Mon Sep 17 00:00:00 2001 From: Saurabh Mishra Date: Fri, 20 Mar 2026 21:27:36 +0530 Subject: [PATCH 3/3] Update src/memory/qmd-process.ts Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>