From d6648e77bc6b4ddd731c811500d3dfa227a1bb67 Mon Sep 17 00:00:00 2001 From: Eulices Lopez Date: Wed, 18 Mar 2026 01:52:58 -0400 Subject: [PATCH] fix: preserve synthetic transcript attachment order --- src/media-understanding/apply.ts | 1 + src/media-understanding/format.test.ts | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/media-understanding/apply.ts b/src/media-understanding/apply.ts index 1f97eaba5d7..d18bdbb282b 100644 --- a/src/media-understanding/apply.ts +++ b/src/media-understanding/apply.ts @@ -532,6 +532,7 @@ export async function applyMediaUnderstanding(params: { (output) => !audioOutputAttachmentIndexes.has(output.attachmentIndex), ); outputs.push(...syntheticSkippedAudioOutputs); + outputs.sort((left, right) => left.attachmentIndex - right.attachmentIndex); if (decisions.length > 0) { ctx.MediaUnderstandingDecisions = [...(ctx.MediaUnderstandingDecisions ?? []), ...decisions]; diff --git a/src/media-understanding/format.test.ts b/src/media-understanding/format.test.ts index 172ecadf985..57ea5ebf83b 100644 --- a/src/media-understanding/format.test.ts +++ b/src/media-understanding/format.test.ts @@ -88,4 +88,29 @@ describe("formatMediaUnderstandingBody", () => { }); expect(body).toBe("[Image]\nDescription:\na cat"); }); + + it("labels audio transcripts by their attachment order", () => { + const body = formatMediaUnderstandingBody({ + outputs: [ + { + kind: "audio.transcription", + attachmentIndex: 0, + text: "first clip was silent", + provider: "openclaw", + }, + { + kind: "audio.transcription", + attachmentIndex: 1, + text: "second clip has speech", + provider: "groq", + }, + ], + }); + expect(body).toBe( + [ + "[Audio 1/2]\nTranscript:\nfirst clip was silent", + "[Audio 2/2]\nTranscript:\nsecond clip has speech", + ].join("\n\n"), + ); + }); });