feat(telegram): support outbound media groups via sendMediaGroup

Send 2-10 photos/videos as Telegram albums using sendMediaGroup when
all items are groupable (photo/video only). Falls back to existing
per-item delivery for single items, GIFs, audio, documents, or mixed
non-groupable media.

Caption is placed on the first album item (subject to 1024-char limit)
with overflow sent as a follow-up text message. Buttons are sent as a
follow-up since sendMediaGroup does not support reply_markup. Threading,
reply-to, silent, and pin behaviors are preserved.

Closes #13620
Supersedes #21309

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
shinya 2026-03-21 14:08:24 +08:00
parent 8a05c05596
commit 204b19991f
3 changed files with 576 additions and 30 deletions

View File

@ -1,4 +1,5 @@
import { type Bot, GrammyError, InputFile } from "grammy";
import { type Bot, GrammyError, InputFile, InputMediaBuilder } from "grammy";
import type { InputMediaPhoto, InputMediaVideo } from "grammy/types";
import type { ReplyToMode } from "openclaw/plugin-sdk/config-runtime";
import type { MarkdownTableMode } from "openclaw/plugin-sdk/config-runtime";
import { fireAndForgetHook } from "openclaw/plugin-sdk/hook-runtime";
@ -30,6 +31,7 @@ import { buildInlineKeyboard } from "../send.js";
import { resolveTelegramVoiceSend } from "../voice.js";
import {
buildTelegramSendParams,
sendTelegramMediaGroup,
sendTelegramText,
sendTelegramWithThreadFallback,
} from "./delivery.send.js";
@ -90,6 +92,144 @@ function markDelivered(progress: DeliveryProgress): void {
progress.deliveredCount += 1;
}
// Telegram's sendMediaGroup accepts between 2 and 10 items per album.
const MEDIA_GROUP_MIN = 2;
const MEDIA_GROUP_MAX = 10;
// The only InputMedia kinds this module will put in an album.
type GroupableMediaKind = "image" | "video";
/**
 * Cheap pre-check for album delivery: true when the media count is within
 * Telegram's sendMediaGroup bounds (2-10) and the reply is not a voice-note
 * reply (audioAsVoice goes through the dedicated voice path).
 *
 * NOTE: this does NOT verify that every item is actually a photo/video —
 * content types are only known after the media is loaded, so
 * deliverMediaGroupReply performs the final per-item check and returns null
 * to trigger the per-item fallback when an item is not groupable.
 */
function isGroupableMediaList(mediaList: string[], reply: ReplyPayload): boolean {
  if (reply.audioAsVoice) return false;
  return mediaList.length >= MEDIA_GROUP_MIN && mediaList.length <= MEDIA_GROUP_MAX;
}
/**
* Sends 2-10 photos/videos as a single Telegram album via sendMediaGroup.
* Caption is placed on the first item (subject to 1024-char limit).
* Buttons and overflow text are sent as follow-up messages because
* sendMediaGroup does not support reply_markup.
*
* Returns the first delivered message_id, or undefined on empty delivery.
* Returns `null` when the loaded media turns out to be non-groupable
* (e.g. GIF, audio, document) so the caller can fall back to per-item delivery.
*/
/**
 * Sends 2-10 photos/videos as a single Telegram album via sendMediaGroup.
 *
 * The caption (subject to Telegram's 1024-char album-caption limit) is placed
 * on the first item only; overflow text and inline buttons are delivered as
 * follow-up messages because sendMediaGroup does not support reply_markup.
 * Threading, reply-to, and silent behaviors are forwarded unchanged.
 *
 * @returns the first delivered message_id, or `null` when a loaded item turns
 *   out to be non-groupable (GIF, audio, document, unknown MIME) so the
 *   caller can fall back to per-item delivery.
 */
async function deliverMediaGroupReply(params: {
  reply: ReplyPayload;
  mediaList: string[];
  bot: Bot;
  chatId: string;
  runtime: RuntimeEnv;
  thread?: TelegramThreadSpec | null;
  tableMode?: MarkdownTableMode;
  mediaLocalRoots?: readonly string[];
  chunkText: ChunkTextFn;
  mediaLoader: typeof loadWebMedia;
  linkPreview?: boolean;
  silent?: boolean;
  // Accepted for signature parity with deliverMediaReply; not used on the
  // album path.
  replyQuoteText?: string;
  replyMarkup?: ReturnType<typeof buildInlineKeyboard>;
  replyToId?: number;
  replyToMode: ReplyToMode;
  progress: DeliveryProgress;
}): Promise<number | undefined | null> {
  // Split the reply text once up front: `caption` rides on the first album
  // item, `followUpText` (overflow beyond the caption limit) goes out as a
  // separate text message after the album.
  const { caption, followUpText } = splitTelegramCaption(params.reply.text ?? undefined);
  // Load all media and build InputMedia entries, bailing out if any item is
  // not a groupable type (photo/video).
  const inputMedia: Array<InputMediaPhoto | InputMediaVideo> = [];
  for (let i = 0; i < params.mediaList.length; i++) {
    const mediaUrl = params.mediaList[i]!;
    const media = await params.mediaLoader(
      mediaUrl,
      buildOutboundMediaLoadOptions({ mediaLocalRoots: params.mediaLocalRoots }),
    );
    const kind = kindFromMime(media.contentType ?? undefined);
    const isGif = isGifMedia({
      contentType: media.contentType,
      fileName: media.fileName,
    });
    // GIFs must go through per-item delivery (sendAnimation); audio,
    // documents, and unknown types are not album-compatible either.
    const groupKind: GroupableMediaKind | undefined =
      kind === "image" || kind === "video" ? kind : undefined;
    if (isGif || !groupKind) {
      return null;
    }
    const file = new InputFile(media.buffer, media.fileName ?? "file");
    // Caption is only attached to the first item in the group.
    let captionOpts: { caption?: string; parse_mode?: "HTML" } = {};
    if (i === 0 && caption) {
      const htmlCaption = renderTelegramHtmlText(caption, { tableMode: params.tableMode });
      captionOpts = { caption: htmlCaption, parse_mode: "HTML" };
    }
    inputMedia.push(
      groupKind === "video"
        ? InputMediaBuilder.video(file, captionOpts)
        : InputMediaBuilder.photo(file, captionOpts),
    );
  }
  const replyToMessageId = resolveReplyToForSend({
    replyToId: params.replyToId,
    replyToMode: params.replyToMode,
    progress: params.progress,
  });
  const firstMessageId = await sendTelegramMediaGroup({
    bot: params.bot,
    chatId: params.chatId,
    media: inputMedia,
    runtime: params.runtime,
    thread: params.thread,
    replyToMessageId,
    silent: params.silent,
  });
  // Every album item counts toward delivery progress.
  for (let i = 0; i < inputMedia.length; i++) {
    markDelivered(params.progress);
  }
  markReplyApplied(params.progress, replyToMessageId);
  // Caption overflow and/or buttons require follow-up messages, since
  // sendMediaGroup does not support reply_markup.
  if (followUpText) {
    await sendPendingFollowUpText({
      bot: params.bot,
      chatId: params.chatId,
      runtime: params.runtime,
      thread: params.thread,
      chunkText: params.chunkText,
      text: followUpText,
      replyMarkup: params.replyMarkup,
      linkPreview: params.linkPreview,
      silent: params.silent,
      replyToId: params.replyToId,
      replyToMode: params.replyToMode,
      progress: params.progress,
    });
  } else if (params.replyMarkup) {
    // No overflow text, but buttons still need a carrier message; a
    // zero-width space keeps the carrier visually empty.
    await sendTelegramText(params.bot, params.chatId, "\u200B", params.runtime, {
      thread: params.thread,
      textMode: "html",
      plainText: "\u200B",
      silent: params.silent,
      replyMarkup: params.replyMarkup,
    });
    markDelivered(params.progress);
  }
  return firstMessageId;
}
async function deliverTextReply(params: {
bot: Bot;
chatId: string;
@ -671,26 +811,56 @@ export async function deliverReplies(params: {
progress,
});
} else {
firstDeliveredMessageId = await deliverMediaReply({
reply,
mediaList,
bot: params.bot,
chatId: params.chatId,
runtime: params.runtime,
thread: params.thread,
tableMode: params.tableMode,
mediaLocalRoots: params.mediaLocalRoots,
chunkText,
mediaLoader,
onVoiceRecording: params.onVoiceRecording,
linkPreview: params.linkPreview,
silent: params.silent,
replyQuoteText: params.replyQuoteText,
replyMarkup,
replyToId,
replyToMode: params.replyToMode,
progress,
});
// Try sending as a media group (album) when criteria are met.
let usedMediaGroup = false;
if (isGroupableMediaList(mediaList, reply)) {
const groupResult = await deliverMediaGroupReply({
reply,
mediaList,
bot: params.bot,
chatId: params.chatId,
runtime: params.runtime,
thread: params.thread,
tableMode: params.tableMode,
mediaLocalRoots: params.mediaLocalRoots,
chunkText,
mediaLoader,
linkPreview: params.linkPreview,
silent: params.silent,
replyQuoteText: params.replyQuoteText,
replyMarkup,
replyToId,
replyToMode: params.replyToMode,
progress,
});
// null means the loaded media was not groupable; fall back to per-item.
if (groupResult !== null) {
firstDeliveredMessageId = groupResult;
usedMediaGroup = true;
}
}
if (!usedMediaGroup) {
firstDeliveredMessageId = await deliverMediaReply({
reply,
mediaList,
bot: params.bot,
chatId: params.chatId,
runtime: params.runtime,
thread: params.thread,
tableMode: params.tableMode,
mediaLocalRoots: params.mediaLocalRoots,
chunkText,
mediaLoader,
onVoiceRecording: params.onVoiceRecording,
linkPreview: params.linkPreview,
silent: params.silent,
replyQuoteText: params.replyQuoteText,
replyMarkup,
replyToId,
replyToMode: params.replyToMode,
progress,
});
}
}
await maybePinFirstDeliveredMessage({
shouldPin: shouldPinFirstMessage,

View File

@ -1,4 +1,5 @@
import { type Bot, GrammyError } from "grammy";
import type { InputMediaPhoto, InputMediaVideo } from "grammy/types";
import { formatErrorMessage } from "openclaw/plugin-sdk/infra-runtime";
import type { RuntimeEnv } from "openclaw/plugin-sdk/runtime-env";
import { withTelegramApiErrorLogging } from "../api-logging.js";
@ -92,6 +93,32 @@ export function buildTelegramSendParams(opts?: {
return params;
}
/**
 * Delivers a prepared photo/video album through Bot API sendMediaGroup,
 * routed through the shared thread-fallback wrapper so topic/thread errors
 * are handled the same way as single-message sends.
 *
 * @returns the message_id of the first message in the returned batch, which
 *   downstream code uses for pinning and hooks.
 *   NOTE(review): an empty API result collapses to 0 here — confirm callers
 *   never treat 0 as a real message_id (e.g. when pinning).
 */
export async function sendTelegramMediaGroup(params: {
  bot: Bot;
  chatId: string;
  media: ReadonlyArray<InputMediaPhoto | InputMediaVideo>;
  runtime: RuntimeEnv;
  thread?: TelegramThreadSpec | null;
  replyToMessageId?: number;
  silent?: boolean;
}): Promise<number> {
  const { bot, chatId, media, runtime, thread, replyToMessageId, silent } = params;
  const baseParams = buildTelegramSendParams({ replyToMessageId, thread, silent });
  const sent = await sendTelegramWithThreadFallback({
    operation: "sendMediaGroup",
    runtime,
    thread,
    requestParams: baseParams,
    send: (effectiveParams) => bot.api.sendMediaGroup(chatId, media, { ...effectiveParams }),
  });
  // sendMediaGroup resolves to an array of messages; only the first id is
  // needed by callers.
  const [firstMessage] = sent;
  return firstMessage?.message_id ?? 0;
}
export async function sendTelegramText(
bot: Bot,
chatId: string,

View File

@ -59,6 +59,18 @@ vi.mock("grammy", () => ({
GrammyError: class GrammyError extends Error {
description = "";
},
InputMediaBuilder: {
photo: (media: unknown, opts?: Record<string, unknown>) => ({
type: "photo",
media,
...opts,
}),
video: (media: unknown, opts?: Record<string, unknown>) => ({
type: "video",
media,
...opts,
}),
},
}));
function createRuntime(withLog = true): RuntimeStub {
@ -830,19 +842,26 @@ describe("deliverReplies", () => {
}
});
it("replyToMode 'first' only applies reply-to to first media item", async () => {
it("replyToMode 'first' only applies reply-to to first media item (per-item path)", async () => {
// Use a GIF + photo to force per-item delivery (non-groupable mix).
const runtime = createRuntime();
const sendPhoto = vi.fn().mockResolvedValue({
const sendAnimation = vi.fn().mockResolvedValue({
message_id: 30,
chat: { id: "123" },
});
const bot = createBot({ sendPhoto });
const sendPhoto = vi.fn().mockResolvedValue({
message_id: 31,
chat: { id: "123" },
});
const bot = createBot({ sendAnimation, sendPhoto });
mockMediaLoad("a.jpg", "image/jpeg", "img1");
mockMediaLoad("a.gif", "image/gif", "gif1");
// Media group fallback re-loads (first item detected as GIF).
mockMediaLoad("a.gif", "image/gif", "gif1");
mockMediaLoad("b.jpg", "image/jpeg", "img2");
await deliverReplies({
replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"], replyToId: "900" }],
replies: [{ mediaUrls: ["https://a.gif", "https://b.jpg"], replyToId: "900" }],
chatId: "123",
token: "tok",
runtime,
@ -851,13 +870,14 @@ describe("deliverReplies", () => {
textLimit: 4000,
});
expect(sendPhoto).toHaveBeenCalledTimes(2);
// First media should have reply_to_message_id
expect(sendPhoto.mock.calls[0][2]).toEqual(
expect(sendAnimation).toHaveBeenCalledTimes(1);
expect(sendPhoto).toHaveBeenCalledTimes(1);
// First media (GIF) should have reply_to_message_id
expect(sendAnimation.mock.calls[0][2]).toEqual(
expect.objectContaining({ reply_to_message_id: 900 }),
);
// Second media should NOT have reply_to_message_id
expect(sendPhoto.mock.calls[1][2]).not.toHaveProperty("reply_to_message_id");
expect(sendPhoto.mock.calls[0][2]).not.toHaveProperty("reply_to_message_id");
});
it("pins the first delivered text message when telegram pin is requested", async () => {
@ -917,4 +937,333 @@ describe("deliverReplies", () => {
expect(sendVoice).toHaveBeenCalledTimes(1);
expect(sendMessage).not.toHaveBeenCalled();
});
describe("media groups (albums)", () => {
  // Shared fixture: sendMediaGroup resolves with two messages (messageId and
  // messageId + 1); sendMessage/sendPhoto stubs cover follow-up and fallback
  // sends. Tests destructure only the stubs they assert on.
  function createMediaGroupHarness(messageId = 50) {
    const runtime = createRuntime();
    const sendMediaGroup = vi.fn().mockResolvedValue([
      { message_id: messageId, chat: { id: "123" } },
      { message_id: messageId + 1, chat: { id: "123" } },
    ]);
    const sendMessage = vi.fn().mockResolvedValue({
      message_id: messageId + 10,
      chat: { id: "123" },
    });
    const sendPhoto = vi.fn().mockResolvedValue({
      message_id: messageId + 20,
      chat: { id: "123" },
    });
    const bot = createBot({ sendMediaGroup, sendMessage, sendPhoto });
    return { runtime, sendMediaGroup, sendMessage, sendPhoto, bot };
  }
  it("sends 2 photos as a media group via sendMediaGroup", async () => {
    const { runtime, sendMediaGroup, sendPhoto, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"] }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).toHaveBeenCalledTimes(1);
    expect(sendPhoto).not.toHaveBeenCalled();
    // Second positional arg of bot.api.sendMediaGroup is the InputMedia[].
    const media = sendMediaGroup.mock.calls[0][1];
    expect(media).toHaveLength(2);
    expect(media[0].type).toBe("photo");
    expect(media[1].type).toBe("photo");
  });
  it("sends mixed photos and videos as a media group", async () => {
    const { runtime, sendMediaGroup, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.mp4", "video/mp4", "vid1");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.mp4"] }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).toHaveBeenCalledTimes(1);
    const media = sendMediaGroup.mock.calls[0][1];
    expect(media[0].type).toBe("photo");
    expect(media[1].type).toBe("video");
  });
  it("places caption only on the first item in the group", async () => {
    const { runtime, sendMediaGroup, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"], text: "album caption" }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).toHaveBeenCalledTimes(1);
    const media = sendMediaGroup.mock.calls[0][1];
    expect(media[0].caption).toBe("album caption");
    expect(media[0].parse_mode).toBe("HTML");
    expect(media[1].caption).toBeUndefined();
  });
  it("sends caption overflow as a follow-up text message", async () => {
    const { runtime, sendMediaGroup, sendMessage, bot } = createMediaGroupHarness();
    // 1025 chars: one over Telegram's 1024-char caption limit.
    const longCaption = "x".repeat(1025);
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"], text: longCaption }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).toHaveBeenCalledTimes(1);
    // Caption should not be on any media item when it overflows.
    const media = sendMediaGroup.mock.calls[0][1];
    expect(media[0].caption).toBeUndefined();
    // Follow-up text message should be sent.
    expect(sendMessage).toHaveBeenCalledTimes(1);
    expect(sendMessage.mock.calls[0][1]).toContain("x".repeat(100));
  });
  it("falls back to per-item when a GIF is in the list", async () => {
    const runtime = createRuntime();
    const sendMediaGroup = vi.fn();
    const sendAnimation = vi.fn().mockResolvedValue({
      message_id: 60,
      chat: { id: "123" },
    });
    const sendPhoto = vi.fn().mockResolvedValue({
      message_id: 61,
      chat: { id: "123" },
    });
    const bot = createBot({ sendMediaGroup, sendAnimation, sendPhoto });
    // Mock-load queue order matters: the group attempt consumes one load,
    // then the per-item fallback consumes the rest.
    // First media is a GIF — detected during group loading, triggers fallback.
    mockMediaLoad("anim.gif", "image/gif", "gif1");
    // Per-item fallback will re-load the media.
    mockMediaLoad("anim.gif", "image/gif", "gif1");
    mockMediaLoad("b.jpg", "image/jpeg", "img1");
    await deliverWith({
      replies: [{ mediaUrls: ["https://anim.gif", "https://b.jpg"] }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).not.toHaveBeenCalled();
    expect(sendAnimation).toHaveBeenCalledTimes(1);
    expect(sendPhoto).toHaveBeenCalledTimes(1);
  });
  it("falls back to per-item when audio is in the list", async () => {
    const runtime = createRuntime();
    const sendMediaGroup = vi.fn();
    const sendPhoto = vi.fn().mockResolvedValue({
      message_id: 70,
      chat: { id: "123" },
    });
    const sendAudio = vi.fn().mockResolvedValue({
      message_id: 71,
      chat: { id: "123" },
    });
    const bot = createBot({ sendMediaGroup, sendPhoto, sendAudio });
    // First media is audio — not groupable.
    mockMediaLoad("song.mp3", "audio/mpeg", "audio1");
    // Per-item fallback re-loads.
    mockMediaLoad("song.mp3", "audio/mpeg", "audio1");
    mockMediaLoad("b.jpg", "image/jpeg", "img1");
    await deliverWith({
      replies: [{ mediaUrls: ["https://song.mp3", "https://b.jpg"] }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).not.toHaveBeenCalled();
    expect(sendAudio).toHaveBeenCalledTimes(1);
    expect(sendPhoto).toHaveBeenCalledTimes(1);
  });
  it("does not use media group for a single item", async () => {
    const { runtime, sendMediaGroup, bot } = createMediaGroupHarness();
    const sendPhoto = vi.fn().mockResolvedValue({
      message_id: 80,
      chat: { id: "123" },
    });
    const singleBot = createBot({ sendMediaGroup, sendPhoto });
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    await deliverWith({
      replies: [{ mediaUrl: "https://a.jpg" }],
      runtime,
      bot: singleBot,
    });
    expect(sendMediaGroup).not.toHaveBeenCalled();
    expect(sendPhoto).toHaveBeenCalledTimes(1);
  });
  it("sends buttons as a follow-up message since sendMediaGroup has no reply_markup", async () => {
    const { runtime, sendMediaGroup, sendMessage, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [
        {
          mediaUrls: ["https://a.jpg", "https://b.jpg"],
          text: "short caption",
          channelData: {
            telegram: {
              buttons: [[{ text: "Click", callback_data: "click" }]],
            },
          },
        },
      ],
      runtime,
      bot,
    });
    expect(sendMediaGroup).toHaveBeenCalledTimes(1);
    // Buttons should be in a follow-up message.
    expect(sendMessage).toHaveBeenCalledTimes(1);
    expect(sendMessage.mock.calls[0][2]).toEqual(
      expect.objectContaining({
        reply_markup: {
          inline_keyboard: [[{ text: "Click", callback_data: "click" }]],
        },
      }),
    );
  });
  it("includes message_thread_id for DM topics in media groups", async () => {
    const { runtime, sendMediaGroup, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"] }],
      runtime,
      bot,
      thread: { id: 42, scope: "dm" },
    });
    expect(sendMediaGroup).toHaveBeenCalledWith(
      "123",
      expect.any(Array),
      expect.objectContaining({
        message_thread_id: 42,
      }),
    );
  });
  it("sets disable_notification when silent is true for media groups", async () => {
    const { runtime, sendMediaGroup, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"] }],
      runtime,
      bot,
      silent: true,
    });
    expect(sendMediaGroup).toHaveBeenCalledWith(
      "123",
      expect.any(Array),
      expect.objectContaining({
        disable_notification: true,
      }),
    );
  });
  it("applies reply_to_message_id on media group", async () => {
    const { runtime, sendMediaGroup, bot } = createMediaGroupHarness();
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    // Uses deliverReplies directly so replyToMode can be set explicitly.
    await deliverReplies({
      ...baseDeliveryParams,
      replies: [{ mediaUrls: ["https://a.jpg", "https://b.jpg"], replyToId: "500" }],
      runtime,
      bot,
      replyToMode: "first",
      mediaLoader: loadWebMedia,
    });
    expect(sendMediaGroup).toHaveBeenCalledWith(
      "123",
      expect.any(Array),
      expect.objectContaining({
        reply_to_message_id: 500,
      }),
    );
  });
  it("does not use media group for audioAsVoice replies", async () => {
    const runtime = createRuntime();
    const sendMediaGroup = vi.fn();
    const sendVoice = vi.fn().mockResolvedValue({
      message_id: 90,
      chat: { id: "123" },
    });
    const bot = createBot({ sendMediaGroup, sendVoice });
    mockMediaLoad("a.ogg", "audio/ogg", "voice1");
    mockMediaLoad("b.ogg", "audio/ogg", "voice2");
    await deliverWith({
      replies: [{ mediaUrls: ["https://a.ogg", "https://b.ogg"], audioAsVoice: true }],
      runtime,
      bot,
    });
    expect(sendMediaGroup).not.toHaveBeenCalled();
  });
  it("pins the first message from a media group when pin is requested", async () => {
    const runtime = createRuntime();
    const sendMediaGroup = vi.fn().mockResolvedValue([
      { message_id: 100, chat: { id: "123" } },
      { message_id: 101, chat: { id: "123" } },
    ]);
    const pinChatMessage = vi.fn().mockResolvedValue(true);
    const bot = createBot({ sendMediaGroup, pinChatMessage });
    mockMediaLoad("a.jpg", "image/jpeg", "img1");
    mockMediaLoad("b.jpg", "image/jpeg", "img2");
    await deliverWith({
      replies: [
        {
          mediaUrls: ["https://a.jpg", "https://b.jpg"],
          channelData: { telegram: { pin: true } },
        },
      ],
      runtime,
      bot,
    });
    // Only the FIRST album message (id 100) should be pinned.
    expect(pinChatMessage).toHaveBeenCalledTimes(1);
    expect(pinChatMessage).toHaveBeenCalledWith("123", 100, { disable_notification: true });
  });
});
});