From d68c53ae36367092c671a127f1951adc2bed84f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A8=81=E5=93=A5Wego?= Date: Thu, 19 Mar 2026 10:15:44 +0800 Subject: [PATCH] gateway: improve oversized transcript placeholder semantics --- src/gateway/session-utils.fs.ts | 46 ++++++++++++++++++++++++++++++--- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/src/gateway/session-utils.fs.ts b/src/gateway/session-utils.fs.ts index e92e3c3a8c9..ceaf8a1413b 100644 --- a/src/gateway/session-utils.fs.ts +++ b/src/gateway/session-utils.fs.ts @@ -146,6 +146,11 @@ export function readSessionMessages( lineCount >= MIN_TAIL_LINES_TARGET || tailBytes >= MAX_TAIL_BYTES_CAP ) { + if (tailBytes >= MAX_TAIL_BYTES_CAP && start > 0 && lineCount < MIN_TAIL_LINES_TARGET) { + console.warn( + `[session-utils] transcript tail cap hit for session ${sessionId}: read ${tailBytes} bytes (cap ${MAX_TAIL_BYTES_CAP}) but only recovered ~${lineCount} lines; history may be truncated`, + ); + } break; } @@ -173,7 +178,10 @@ export function readSessionMessages( `[session-utils] oversized transcript line in session ${sessionId}: ${trimmed.length} chars (max ${MAX_LINE_CHARS}); emitting placeholder`, ); // Best-effort: preserve original role/timestamp without JSON.parse (which would stall). - const roleMatch = trimmed.match(/"role"\s*:\s*"([^"]+)"/); + // Regex scans are limited to a prefix so we don't do O(n) work over tens/hundreds of MB. + const scan = trimmed.slice(0, 1_000_000); + + const roleMatch = scan.match(/"role"\s*:\s*"([^"]+)"/); const roleCandidate = roleMatch?.[1]; const role = roleCandidate === "user" || @@ -184,28 +192,58 @@ export function readSessionMessages( : "assistant"; let timestamp = Date.now(); - const tsNum = trimmed.match(/"timestamp"\s*:\s*(\d{10,13})/); + const tsNum = scan.match(/"timestamp"\s*:\s*(\d{10,13})/); if (tsNum?.[1]) { const n = Number(tsNum[1]); if (Number.isFinite(n)) { timestamp = n; } } else { - const tsIso = trimmed.match(/"timestamp"\s*:\s*"([^"]+)"/); + const tsIso = scan.match(/"timestamp"\s*:\s*"([^"]+)"/); const d = tsIso?.[1] ? Date.parse(tsIso[1]) : Number.NaN; if (Number.isFinite(d)) { timestamp = d; } } + // If the record is oversized mainly due to inline media (e.g. image blocks with `data:`), + // downstream sanitization would normally strip that payload. Preserve some semantics here + // without parsing the full JSON. + const looksLikeInlineMedia = /"type"\s*:\s*"image"/.test(scan) || /"data"\s*:\s*"data:/.test(scan); + + let placeholderText = "[chat.history omitted: message too large]"; + const textSnips: string[] = []; + if (looksLikeInlineMedia) { + placeholderText = "[chat.history omitted: inline media too large]"; + + // Best-effort: pull out a few text snippets so the user still sees *something* meaningful + // even if the record is oversized due to inline media payload. + const re = /"text"\s*:\s*"((?:\\.|[^"\\]){0,3000})"/g; + let match: RegExpExecArray | null; + while (textSnips.length < 4 && (match = re.exec(scan))) { + const raw = match[1]; + const unescaped = raw.replace(/\\n/g, "\n").replace(/\\t/g, "\t"); + const flat = unescaped.replace(/\s+/g, " ").trim(); + if (flat) { + textSnips.push(flat); + } + } + } + + const blocks: Array<{ type: string; text: string }> = [{ type: "text", text: placeholderText }]; + if (textSnips.length) { + blocks.push({ type: "text", text: `Context (best-effort): ${textSnips.join(" | ")}` }); + } + messages.push({ role, - content: [{ type: "text", text: "[chat.history omitted: message too large]" }], + content: blocks, timestamp, __openclaw: { kind: "oversized_transcript_line", sizeChars: trimmed.length, guessed: true, + looksLikeInlineMedia, }, }); continue;