diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1b16e3f6efa..e99959251ee 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -151,9 +151,11 @@ Docs: https://docs.openclaw.ai
 
 ### Breaking
 
+- Skills/image generation: remove the bundled `nano-banana-pro` skill wrapper. Use `agents.defaults.imageGenerationModel.primary: "google/gemini-3-pro-image-preview"` for the native Nano Banana-style path instead.
+
 - Browser/Chrome MCP: remove the legacy Chrome extension relay path, bundled extension assets, `driver: "extension"`, and `browser.relayBindHost`. Run `openclaw doctor --fix` to migrate host-local browser config to `existing-session` / `user`; Docker, headless, sandbox, and remote browser flows still use raw CDP. (#47893) Thanks @vincentkoc.
 - Plugins/runtime: remove the public `openclaw/extension-api` surface with no compatibility shim. Bundled plugins must use injected runtime for host-side operations (for example `api.runtime.agent.runEmbeddedPiAgent`) and any remaining direct imports must come from narrow `openclaw/plugin-sdk/*` subpaths instead of the monolithic SDK root.
 - Tools/image generation: standardize the stock image create/edit path on the core `image_generate` tool. The old `nano-banana-pro` docs/examples are gone; if you previously copied that sample-skill config, switch to `agents.defaults.imageGenerationModel` for built-in image generation or install a separate third-party skill explicitly.
 - Plugins/message discovery: require `ChannelMessageActionAdapter.describeMessageTool(...)` for shared `message` tool discovery. The legacy `listActions`, `getCapabilities`, and `getToolSchema` adapter methods are removed.
 Plugin authors should migrate message discovery to `describeMessageTool(...)` and keep channel-specific action runtime code inside the owning plugin package. Thanks @gumadeiras.
 
 ## 2026.3.13
diff --git a/docs/gateway/configuration-reference.md b/docs/gateway/configuration-reference.md
index 6cf6272483e..49c743db623 100644
--- a/docs/gateway/configuration-reference.md
+++ b/docs/gateway/configuration-reference.md
@@ -905,7 +905,8 @@ Time format in system prompt. Default: `auto` (OS preference).
   - Also used as fallback routing when the selected/default model cannot accept image input.
 - `imageGenerationModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
   - Used by the shared image-generation capability and any future tool/plugin surface that generates images.
+  - Typical values: `google/gemini-3-pro-image-preview` for the native Nano Banana-style flow, `fal/fal-ai/flux/dev` for fal, or `openai/gpt-image-1` for OpenAI Images.
   - If omitted, `image_generate` can still infer a best-effort provider default from compatible auth-backed image-generation providers.
 - `pdfModel`: accepts either a string (`"provider/model"`) or an object (`{ primary, fallbacks }`).
   - Used by the `pdf` tool for model routing.
   - If omitted, the PDF tool falls back to `imageModel`, then to best-effort provider defaults.
diff --git a/docs/tools/index.md b/docs/tools/index.md
index f5eb956f13e..55e52bf46da 100644
--- a/docs/tools/index.md
+++ b/docs/tools/index.md
@@ -421,9 +421,24 @@ Notes:
 - Use `action: "list"` to inspect registered providers, default models, supported model ids, sizes, resolutions, and edit support.
 - Returns local `MEDIA:` lines so channels can deliver the generated files directly.
 - Uses the image-generation model directly (independent of the main chat model).
-- Google-backed flows support reference-image edits plus explicit `1K|2K|4K` resolution hints. +- Google-backed flows, including `google/gemini-3-pro-image-preview` for the native Nano Banana-style path, support reference-image edits plus explicit `1K|2K|4K` resolution hints. - When editing and `resolution` is omitted, OpenClaw infers a draft/final resolution from the input image size. -- This is the built-in replacement for the old sample `nano-banana-pro` skill workflow. Use `agents.defaults.imageGenerationModel`, not `skills.entries`, for stock image generation. +- This is the built-in replacement for the old `nano-banana-pro` skill workflow. Use `agents.defaults.imageGenerationModel`, not `skills.entries`, for stock image generation. + +Native example: + +```json5 +{ + agents: { + defaults: { + imageGenerationModel: { + primary: "google/gemini-3-pro-image-preview", // native Nano Banana path + fallbacks: ["fal/fal-ai/flux/dev"], + }, + }, + }, +} +``` ### `pdf` diff --git a/docs/tools/skills-config.md b/docs/tools/skills-config.md index 697cb46dad6..83242afaf5d 100644 --- a/docs/tools/skills-config.md +++ b/docs/tools/skills-config.md @@ -42,6 +42,11 @@ For built-in image generation/editing, prefer `agents.defaults.imageGenerationMo plus the core `image_generate` tool. `skills.entries.*` is only for custom or third-party skill workflows. +Examples: + +- Native Nano Banana-style setup: `agents.defaults.imageGenerationModel.primary: "google/gemini-3-pro-image-preview"` +- Native fal setup: `agents.defaults.imageGenerationModel.primary: "fal/fal-ai/flux/dev"` + ## Fields - `allowBundled`: optional allowlist for **bundled** skills only. When set, only diff --git a/skills/nano-banana-pro/SKILL.md b/skills/nano-banana-pro/SKILL.md deleted file mode 100644 index 8a46f1a99ba..00000000000 --- a/skills/nano-banana-pro/SKILL.md +++ /dev/null @@ -1,65 +0,0 @@ ---- -name: nano-banana-pro -description: Generate or edit images via Gemini 3 Pro Image (Nano Banana Pro). 
-homepage: https://ai.google.dev/ -metadata: - { - "openclaw": - { - "emoji": "🍌", - "requires": { "bins": ["uv"], "env": ["GEMINI_API_KEY"] }, - "primaryEnv": "GEMINI_API_KEY", - "install": - [ - { - "id": "uv-brew", - "kind": "brew", - "formula": "uv", - "bins": ["uv"], - "label": "Install uv (brew)", - }, - ], - }, - } ---- - -# Nano Banana Pro (Gemini 3 Pro Image) - -Use the bundled script to generate or edit images. - -Generate - -```bash -uv run {baseDir}/scripts/generate_image.py --prompt "your image description" --filename "output.png" --resolution 1K -``` - -Edit (single image) - -```bash -uv run {baseDir}/scripts/generate_image.py --prompt "edit instructions" --filename "output.png" -i "/path/in.png" --resolution 2K -``` - -Multi-image composition (up to 14 images) - -```bash -uv run {baseDir}/scripts/generate_image.py --prompt "combine these into one scene" --filename "output.png" -i img1.png -i img2.png -i img3.png -``` - -API key - -- `GEMINI_API_KEY` env var -- Or set `skills."nano-banana-pro".apiKey` / `skills."nano-banana-pro".env.GEMINI_API_KEY` in `~/.openclaw/openclaw.json` - -Specific aspect ratio (optional) - -```bash -uv run {baseDir}/scripts/generate_image.py --prompt "portrait photo" --filename "output.png" --aspect-ratio 9:16 -``` - -Notes - -- Resolutions: `1K` (default), `2K`, `4K`. -- Aspect ratios: `1:1`, `2:3`, `3:2`, `3:4`, `4:3`, `4:5`, `5:4`, `9:16`, `16:9`, `21:9`. Without `--aspect-ratio` / `-a`, the model picks freely - use this flag for avatars, profile pics, or consistent batch generation. -- Use timestamps in filenames: `yyyy-mm-dd-hh-mm-ss-name.png`. -- The script prints a `MEDIA:` line for OpenClaw to auto-attach on supported chat providers. -- Do not read the image back; report the saved path only. 
diff --git a/skills/nano-banana-pro/scripts/generate_image.py b/skills/nano-banana-pro/scripts/generate_image.py deleted file mode 100755 index 796022adfba..00000000000 --- a/skills/nano-banana-pro/scripts/generate_image.py +++ /dev/null @@ -1,235 +0,0 @@ -#!/usr/bin/env python3 -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "google-genai>=1.0.0", -# "pillow>=10.0.0", -# ] -# /// -""" -Generate images using Google's Nano Banana Pro (Gemini 3 Pro Image) API. - -Usage: - uv run generate_image.py --prompt "your image description" --filename "output.png" [--resolution 1K|2K|4K] [--api-key KEY] - -Multi-image editing (up to 14 images): - uv run generate_image.py --prompt "combine these images" --filename "output.png" -i img1.png -i img2.png -i img3.png -""" - -import argparse -import os -import sys -from pathlib import Path - -SUPPORTED_ASPECT_RATIOS = [ - "1:1", - "2:3", - "3:2", - "3:4", - "4:3", - "4:5", - "5:4", - "9:16", - "16:9", - "21:9", -] - - -def get_api_key(provided_key: str | None) -> str | None: - """Get API key from argument first, then environment.""" - if provided_key: - return provided_key - return os.environ.get("GEMINI_API_KEY") - - -def auto_detect_resolution(max_input_dim: int) -> str: - """Infer output resolution from the largest input image dimension.""" - if max_input_dim >= 3000: - return "4K" - if max_input_dim >= 1500: - return "2K" - return "1K" - - -def choose_output_resolution( - requested_resolution: str | None, - max_input_dim: int, - has_input_images: bool, -) -> tuple[str, bool]: - """Choose final resolution and whether it was auto-detected. - - Auto-detection is only applied when the user did not pass --resolution. 
- """ - if requested_resolution is not None: - return requested_resolution, False - - if has_input_images and max_input_dim > 0: - return auto_detect_resolution(max_input_dim), True - - return "1K", False - - -def main(): - parser = argparse.ArgumentParser( - description="Generate images using Nano Banana Pro (Gemini 3 Pro Image)" - ) - parser.add_argument( - "--prompt", "-p", - required=True, - help="Image description/prompt" - ) - parser.add_argument( - "--filename", "-f", - required=True, - help="Output filename (e.g., sunset-mountains.png)" - ) - parser.add_argument( - "--input-image", "-i", - action="append", - dest="input_images", - metavar="IMAGE", - help="Input image path(s) for editing/composition. Can be specified multiple times (up to 14 images)." - ) - parser.add_argument( - "--resolution", "-r", - choices=["1K", "2K", "4K"], - default=None, - help="Output resolution: 1K, 2K, or 4K. If omitted with input images, auto-detect from largest image dimension." - ) - parser.add_argument( - "--aspect-ratio", "-a", - choices=SUPPORTED_ASPECT_RATIOS, - default=None, - help=f"Output aspect ratio (default: model decides). Options: {', '.join(SUPPORTED_ASPECT_RATIOS)}" - ) - parser.add_argument( - "--api-key", "-k", - help="Gemini API key (overrides GEMINI_API_KEY env var)" - ) - - args = parser.parse_args() - - # Get API key - api_key = get_api_key(args.api_key) - if not api_key: - print("Error: No API key provided.", file=sys.stderr) - print("Please either:", file=sys.stderr) - print(" 1. Provide --api-key argument", file=sys.stderr) - print(" 2. 
Set GEMINI_API_KEY environment variable", file=sys.stderr) - sys.exit(1) - - # Import here after checking API key to avoid slow import on error - from google import genai - from google.genai import types - from PIL import Image as PILImage - - # Initialise client - client = genai.Client(api_key=api_key) - - # Set up output path - output_path = Path(args.filename) - output_path.parent.mkdir(parents=True, exist_ok=True) - - # Load input images if provided (up to 14 supported by Nano Banana Pro) - input_images = [] - max_input_dim = 0 - if args.input_images: - if len(args.input_images) > 14: - print(f"Error: Too many input images ({len(args.input_images)}). Maximum is 14.", file=sys.stderr) - sys.exit(1) - - for img_path in args.input_images: - try: - with PILImage.open(img_path) as img: - copied = img.copy() - width, height = copied.size - input_images.append(copied) - print(f"Loaded input image: {img_path}") - - # Track largest dimension for auto-resolution - max_input_dim = max(max_input_dim, width, height) - except Exception as e: - print(f"Error loading input image '{img_path}': {e}", file=sys.stderr) - sys.exit(1) - - output_resolution, auto_detected = choose_output_resolution( - requested_resolution=args.resolution, - max_input_dim=max_input_dim, - has_input_images=bool(input_images), - ) - if auto_detected: - print( - f"Auto-detected resolution: {output_resolution} " - f"(from max input dimension {max_input_dim})" - ) - - # Build contents (images first if editing, prompt only if generating) - if input_images: - contents = [*input_images, args.prompt] - img_count = len(input_images) - print(f"Processing {img_count} image{'s' if img_count > 1 else ''} with resolution {output_resolution}...") - else: - contents = args.prompt - print(f"Generating image with resolution {output_resolution}...") - - try: - # Build image config with optional aspect ratio - image_cfg_kwargs = {"image_size": output_resolution} - if args.aspect_ratio: - image_cfg_kwargs["aspect_ratio"] = 
args.aspect_ratio - - response = client.models.generate_content( - model="gemini-3-pro-image-preview", - contents=contents, - config=types.GenerateContentConfig( - response_modalities=["TEXT", "IMAGE"], - image_config=types.ImageConfig(**image_cfg_kwargs) - ) - ) - - # Process response and convert to PNG - image_saved = False - for part in response.parts: - if part.text is not None: - print(f"Model response: {part.text}") - elif part.inline_data is not None: - # Convert inline data to PIL Image and save as PNG - from io import BytesIO - - # inline_data.data is already bytes, not base64 - image_data = part.inline_data.data - if isinstance(image_data, str): - # If it's a string, it might be base64 - import base64 - image_data = base64.b64decode(image_data) - - image = PILImage.open(BytesIO(image_data)) - - # Ensure RGB mode for PNG (convert RGBA to RGB with white background if needed) - if image.mode == 'RGBA': - rgb_image = PILImage.new('RGB', image.size, (255, 255, 255)) - rgb_image.paste(image, mask=image.split()[3]) - rgb_image.save(str(output_path), 'PNG') - elif image.mode == 'RGB': - image.save(str(output_path), 'PNG') - else: - image.convert('RGB').save(str(output_path), 'PNG') - image_saved = True - - if image_saved: - full_path = output_path.resolve() - print(f"\nImage saved: {full_path}") - # OpenClaw parses MEDIA: tokens and will attach the file on - # supported chat providers. Emit the canonical MEDIA: form. 
- print(f"MEDIA:{full_path}") - else: - print("Error: No image was generated in the response.", file=sys.stderr) - sys.exit(1) - - except Exception as e: - print(f"Error generating image: {e}", file=sys.stderr) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/skills/nano-banana-pro/scripts/test_generate_image.py b/skills/nano-banana-pro/scripts/test_generate_image.py deleted file mode 100644 index 1dbae257428..00000000000 --- a/skills/nano-banana-pro/scripts/test_generate_image.py +++ /dev/null @@ -1,36 +0,0 @@ -import importlib.util -from pathlib import Path - -import pytest - -MODULE_PATH = Path(__file__).with_name("generate_image.py") -SPEC = importlib.util.spec_from_file_location("generate_image", MODULE_PATH) -assert SPEC and SPEC.loader -MODULE = importlib.util.module_from_spec(SPEC) -SPEC.loader.exec_module(MODULE) - - -@pytest.mark.parametrize( - ("max_input_dim", "expected"), - [ - (0, "1K"), - (1499, "1K"), - (1500, "2K"), - (2999, "2K"), - (3000, "4K"), - ], -) -def test_auto_detect_resolution_thresholds(max_input_dim, expected): - assert MODULE.auto_detect_resolution(max_input_dim) == expected - - -def test_choose_output_resolution_auto_detects_when_resolution_omitted(): - assert MODULE.choose_output_resolution(None, 2200, True) == ("2K", True) - - -def test_choose_output_resolution_defaults_to_1k_without_inputs(): - assert MODULE.choose_output_resolution(None, 0, False) == ("1K", False) - - -def test_choose_output_resolution_respects_explicit_1k_with_large_input(): - assert MODULE.choose_output_resolution("1K", 3500, True) == ("1K", False) diff --git a/src/agents/tools/image-generate-tool.test.ts b/src/agents/tools/image-generate-tool.test.ts index 86f5aaf07d9..50df1718daf 100644 --- a/src/agents/tools/image-generate-tool.test.ts +++ b/src/agents/tools/image-generate-tool.test.ts @@ -14,8 +14,23 @@ function stubImageGenerationProviders() { id: "google", defaultModel: "gemini-3.1-flash-image-preview", models: 
["gemini-3.1-flash-image-preview", "gemini-3-pro-image-preview"], - supportedResolutions: ["1K", "2K", "4K"], - supportsImageEditing: true, + capabilities: { + generate: { + maxCount: 4, + supportsAspectRatio: true, + supportsResolution: true, + }, + edit: { + enabled: true, + maxInputImages: 5, + supportsAspectRatio: true, + supportsResolution: true, + }, + geometry: { + resolutions: ["1K", "2K", "4K"], + aspectRatios: ["1:1", "16:9"], + }, + }, generateImage: vi.fn(async () => { throw new Error("not used"); }), @@ -24,8 +39,19 @@ function stubImageGenerationProviders() { id: "openai", defaultModel: "gpt-image-1", models: ["gpt-image-1"], - supportedSizes: ["1024x1024", "1024x1536", "1536x1024"], - supportsImageEditing: false, + capabilities: { + generate: { + maxCount: 4, + supportsSize: true, + }, + edit: { + enabled: false, + maxInputImages: 0, + }, + geometry: { + sizes: ["1024x1024", "1024x1536", "1536x1024"], + }, + }, generateImage: vi.fn(async () => { throw new Error("not used"); }), @@ -138,6 +164,7 @@ describe("createImageGenerateTool", () => { const result = await tool.execute("call-1", { prompt: "A cat wearing sunglasses", model: "openai/gpt-image-1", + filename: "cats/output.png", count: 2, size: "1024x1024", }); @@ -167,7 +194,7 @@ describe("createImageGenerateTool", () => { "image/png", "tool-image-generation", undefined, - "cat-one.png", + "cats/output.png", ); expect(saveMediaBuffer).toHaveBeenNthCalledWith( 2, @@ -175,7 +202,7 @@ describe("createImageGenerateTool", () => { "image/png", "tool-image-generation", undefined, - "cat-two.png", + "cats/output.png", ); expect(result).toMatchObject({ content: [ @@ -189,6 +216,7 @@ describe("createImageGenerateTool", () => { model: "gpt-image-1", count: 2, paths: ["/tmp/generated-1.png", "/tmp/generated-2.png"], + filename: "cats/output.png", revisedPrompts: ["A more cinematic cat"], }, }); @@ -273,6 +301,7 @@ describe("createImageGenerateTool", () => { expect(generateImage).toHaveBeenCalledWith( 
expect.objectContaining({ + aspectRatio: undefined, resolution: "4K", inputImages: [ expect.objectContaining({ @@ -284,6 +313,91 @@ describe("createImageGenerateTool", () => { ); }); + it("forwards explicit aspect ratio and supports up to 5 reference images", async () => { + const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage").mockResolvedValue({ + provider: "google", + model: "gemini-3-pro-image-preview", + attempts: [], + images: [ + { + buffer: Buffer.from("png-out"), + mimeType: "image/png", + fileName: "edited.png", + }, + ], + }); + vi.spyOn(webMedia, "loadWebMedia").mockResolvedValue({ + kind: "image", + buffer: Buffer.from("input-image"), + contentType: "image/png", + }); + vi.spyOn(mediaStore, "saveMediaBuffer").mockResolvedValue({ + path: "/tmp/edited.png", + id: "edited.png", + size: 7, + contentType: "image/png", + }); + + const tool = createImageGenerateTool({ + config: { + agents: { + defaults: { + imageGenerationModel: { + primary: "google/gemini-3-pro-image-preview", + }, + }, + }, + }, + workspaceDir: process.cwd(), + }); + + expect(tool).not.toBeNull(); + if (!tool) { + throw new Error("expected image_generate tool"); + } + + const images = Array.from({ length: 5 }, (_, index) => `./fixtures/ref-${index + 1}.png`); + await tool.execute("call-compose", { + prompt: "Combine these into one scene", + images, + aspectRatio: "16:9", + }); + + expect(generateImage).toHaveBeenCalledWith( + expect.objectContaining({ + aspectRatio: "16:9", + inputImages: expect.arrayContaining([ + expect.objectContaining({ buffer: Buffer.from("input-image"), mimeType: "image/png" }), + ]), + }), + ); + expect(generateImage.mock.calls[0]?.[0].inputImages).toHaveLength(5); + }); + + it("rejects unsupported aspect ratios", async () => { + const tool = createImageGenerateTool({ + config: { + agents: { + defaults: { + imageGenerationModel: { + primary: "google/gemini-3-pro-image-preview", + }, + }, + }, + }, + }); + + expect(tool).not.toBeNull(); + if (!tool) 
{ + throw new Error("expected image_generate tool"); + } + + await expect(tool.execute("call-bad-aspect", { prompt: "portrait", aspectRatio: "7:5" })) + .rejects.toThrow( + "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9", + ); + }); + it("lists registered provider and model options", async () => { stubImageGenerationProviders(); @@ -310,7 +424,8 @@ describe("createImageGenerateTool", () => { expect(text).toContain("google (default gemini-3.1-flash-image-preview)"); expect(text).toContain("gemini-3.1-flash-image-preview"); expect(text).toContain("gemini-3-pro-image-preview"); - expect(text).toContain("editing"); + expect(text).toContain("editing up to 5 refs"); + expect(text).toContain("aspect ratios 1:1, 16:9"); expect(result).toMatchObject({ details: { providers: expect.arrayContaining([ @@ -321,9 +436,139 @@ describe("createImageGenerateTool", () => { "gemini-3.1-flash-image-preview", "gemini-3-pro-image-preview", ]), + capabilities: expect.objectContaining({ + edit: expect.objectContaining({ + enabled: true, + maxInputImages: 5, + }), + }), }), ]), }, }); }); + + it("rejects provider-specific edit limits before runtime", async () => { + vi.spyOn(imageGenerationRuntime, "listRuntimeImageGenerationProviders").mockReturnValue([ + { + id: "fal", + defaultModel: "fal-ai/flux/dev", + models: ["fal-ai/flux/dev", "fal-ai/flux/dev/image-to-image"], + capabilities: { + generate: { + maxCount: 4, + supportsSize: true, + supportsAspectRatio: true, + supportsResolution: true, + }, + edit: { + enabled: true, + maxInputImages: 1, + supportsSize: true, + supportsAspectRatio: false, + supportsResolution: true, + }, + }, + generateImage: vi.fn(async () => { + throw new Error("not used"); + }), + }, + ]); + const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage"); + vi.spyOn(webMedia, "loadWebMedia").mockResolvedValue({ + kind: "image", + buffer: Buffer.from("input-image"), + contentType: "image/png", + }); + + const tool = 
createImageGenerateTool({ + config: { + agents: { + defaults: { + imageGenerationModel: { + primary: "fal/fal-ai/flux/dev", + }, + }, + }, + }, + workspaceDir: process.cwd(), + }); + + expect(tool).not.toBeNull(); + if (!tool) { + throw new Error("expected image_generate tool"); + } + + await expect( + tool.execute("call-fal-edit", { + prompt: "combine", + images: ["./fixtures/a.png", "./fixtures/b.png"], + }), + ).rejects.toThrow("fal edit supports at most 1 reference image"); + expect(generateImage).not.toHaveBeenCalled(); + }); + + it("rejects unsupported provider-specific edit aspect ratio overrides before runtime", async () => { + vi.spyOn(imageGenerationRuntime, "listRuntimeImageGenerationProviders").mockReturnValue([ + { + id: "fal", + defaultModel: "fal-ai/flux/dev", + models: ["fal-ai/flux/dev", "fal-ai/flux/dev/image-to-image"], + capabilities: { + generate: { + maxCount: 4, + supportsSize: true, + supportsAspectRatio: true, + supportsResolution: true, + }, + edit: { + enabled: true, + maxInputImages: 1, + supportsSize: true, + supportsAspectRatio: false, + supportsResolution: true, + }, + geometry: { + aspectRatios: ["1:1", "16:9"], + }, + }, + generateImage: vi.fn(async () => { + throw new Error("not used"); + }), + }, + ]); + const generateImage = vi.spyOn(imageGenerationRuntime, "generateImage"); + vi.spyOn(webMedia, "loadWebMedia").mockResolvedValue({ + kind: "image", + buffer: Buffer.from("input-image"), + contentType: "image/png", + }); + + const tool = createImageGenerateTool({ + config: { + agents: { + defaults: { + imageGenerationModel: { + primary: "fal/fal-ai/flux/dev", + }, + }, + }, + }, + workspaceDir: process.cwd(), + }); + + expect(tool).not.toBeNull(); + if (!tool) { + throw new Error("expected image_generate tool"); + } + + await expect( + tool.execute("call-fal-aspect", { + prompt: "edit", + image: "./fixtures/a.png", + aspectRatio: "16:9", + }), + ).rejects.toThrow("fal edit does not support aspectRatio overrides"); + 
expect(generateImage).not.toHaveBeenCalled(); + }); }); diff --git a/src/agents/tools/image-generate-tool.ts b/src/agents/tools/image-generate-tool.ts index 057b9013100..3ae12fda187 100644 --- a/src/agents/tools/image-generate-tool.ts +++ b/src/agents/tools/image-generate-tool.ts @@ -6,6 +6,7 @@ import { listRuntimeImageGenerationProviders, } from "../../image-generation/runtime.js"; import type { + ImageGenerationProvider, ImageGenerationResolution, ImageGenerationSourceImage, } from "../../image-generation/types.js"; @@ -36,8 +37,20 @@ import { const DEFAULT_COUNT = 1; const MAX_COUNT = 4; -const MAX_INPUT_IMAGES = 4; +const MAX_INPUT_IMAGES = 5; const DEFAULT_RESOLUTION: ImageGenerationResolution = "1K"; +const SUPPORTED_ASPECT_RATIOS = new Set([ + "1:1", + "2:3", + "3:2", + "3:4", + "4:3", + "4:5", + "5:4", + "9:16", + "16:9", + "21:9", +]); const ImageGenerateToolSchema = Type.Object({ action: Type.Optional( @@ -60,12 +73,24 @@ const ImageGenerateToolSchema = Type.Object({ model: Type.Optional( Type.String({ description: "Optional provider/model override, e.g. openai/gpt-image-1." }), ), + filename: Type.Optional( + Type.String({ + description: + "Optional output filename hint. 
OpenClaw preserves the basename and saves under its managed media directory.", + }), + ), size: Type.Optional( Type.String({ description: "Optional size hint like 1024x1024, 1536x1024, 1024x1536, 1024x1792, or 1792x1024.", }), ), + aspectRatio: Type.Optional( + Type.String({ + description: + "Optional aspect ratio hint: 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9.", + }), + ), resolution: Type.Optional( Type.String({ description: @@ -162,6 +187,19 @@ function normalizeResolution(raw: string | undefined): ImageGenerationResolution throw new ToolInputError("resolution must be one of 1K, 2K, or 4K"); } +function normalizeAspectRatio(raw: string | undefined): string | undefined { + const normalized = raw?.trim(); + if (!normalized) { + return undefined; + } + if (SUPPORTED_ASPECT_RATIOS.has(normalized)) { + return normalized; + } + throw new ToolInputError( + "aspectRatio must be one of 1:1, 2:3, 3:2, 3:4, 4:3, 4:5, 5:4, 9:16, 16:9, or 21:9", + ); +} + function normalizeReferenceImages(args: Record): string[] { const imageCandidates: string[] = []; if (typeof args.image === "string") { @@ -192,6 +230,112 @@ function normalizeReferenceImages(args: Record): string[] { return normalized; } +function parseImageGenerationModelRef(raw: string | undefined): { provider: string; model: string } | null { + const trimmed = raw?.trim(); + if (!trimmed) { + return null; + } + const slashIndex = trimmed.indexOf("/"); + if (slashIndex <= 0 || slashIndex === trimmed.length - 1) { + return null; + } + return { + provider: trimmed.slice(0, slashIndex).trim(), + model: trimmed.slice(slashIndex + 1).trim(), + }; +} + +function resolveSelectedImageGenerationProvider(params: { + config?: OpenClawConfig; + imageGenerationModelConfig: ToolModelConfig; + modelOverride?: string; +}): ImageGenerationProvider | undefined { + const selectedRef = + parseImageGenerationModelRef(params.modelOverride) ?? 
+ parseImageGenerationModelRef(params.imageGenerationModelConfig.primary); + if (!selectedRef) { + return undefined; + } + return listRuntimeImageGenerationProviders({ config: params.config }).find( + (provider) => + provider.id === selectedRef.provider || (provider.aliases ?? []).includes(selectedRef.provider), + ); +} + +function validateImageGenerationCapabilities(params: { + provider: ImageGenerationProvider | undefined; + count: number; + inputImageCount: number; + size?: string; + aspectRatio?: string; + resolution?: ImageGenerationResolution; +}) { + const provider = params.provider; + if (!provider) { + return; + } + const isEdit = params.inputImageCount > 0; + const modeCaps = isEdit ? provider.capabilities.edit : provider.capabilities.generate; + const geometry = provider.capabilities.geometry; + const maxCount = modeCaps.maxCount ?? MAX_COUNT; + if (params.count > maxCount) { + throw new ToolInputError( + `${provider.id} ${isEdit ? "edit" : "generate"} supports at most ${maxCount} output image${maxCount === 1 ? "" : "s"}.`, + ); + } + + if (isEdit) { + if (!provider.capabilities.edit.enabled) { + throw new ToolInputError(`${provider.id} does not support reference-image edits.`); + } + const maxInputImages = provider.capabilities.edit.maxInputImages ?? MAX_INPUT_IMAGES; + if (params.inputImageCount > maxInputImages) { + throw new ToolInputError( + `${provider.id} edit supports at most ${maxInputImages} reference image${maxInputImages === 1 ? "" : "s"}.`, + ); + } + } + + if (params.size) { + if (!modeCaps.supportsSize) { + throw new ToolInputError(`${provider.id} ${isEdit ? "edit" : "generate"} does not support size overrides.`); + } + if ((geometry?.sizes?.length ?? 0) > 0 && !geometry?.sizes?.includes(params.size)) { + throw new ToolInputError( + `${provider.id} ${isEdit ? 
"edit" : "generate"} size must be one of ${geometry?.sizes?.join(", ")}.`, + ); + } + } + + if (params.aspectRatio) { + if (!modeCaps.supportsAspectRatio) { + throw new ToolInputError(`${provider.id} ${isEdit ? "edit" : "generate"} does not support aspectRatio overrides.`); + } + if ( + (geometry?.aspectRatios?.length ?? 0) > 0 && + !geometry?.aspectRatios?.includes(params.aspectRatio) + ) { + throw new ToolInputError( + `${provider.id} ${isEdit ? "edit" : "generate"} aspectRatio must be one of ${geometry?.aspectRatios?.join(", ")}.`, + ); + } + } + + if (params.resolution) { + if (!modeCaps.supportsResolution) { + throw new ToolInputError(`${provider.id} ${isEdit ? "edit" : "generate"} does not support resolution overrides.`); + } + if ( + (geometry?.resolutions?.length ?? 0) > 0 && + !geometry?.resolutions?.includes(params.resolution) + ) { + throw new ToolInputError( + `${provider.id} ${isEdit ? "edit" : "generate"} resolution must be one of ${geometry?.resolutions?.join("/")}.`, + ); + } + } +} + type ImageGenerateSandboxConfig = { root: string; bridge: SandboxFsBridge; @@ -357,25 +501,25 @@ export function createImageGenerateTool(options?: { ...(provider.label ? { label: provider.label } : {}), ...(provider.defaultModel ? { defaultModel: provider.defaultModel } : {}), models: provider.models ?? (provider.defaultModel ? [provider.defaultModel] : []), - ...(provider.supportedSizes ? { supportedSizes: [...provider.supportedSizes] } : {}), - ...(provider.supportedResolutions - ? { supportedResolutions: [...provider.supportedResolutions] } - : {}), - ...(typeof provider.supportsImageEditing === "boolean" - ? 
{ supportsImageEditing: provider.supportsImageEditing } - : {}), + capabilities: provider.capabilities, }), ); const lines = providers.flatMap((provider) => { const caps: string[] = []; - if (provider.supportsImageEditing) { - caps.push("editing"); + if (provider.capabilities.edit.enabled) { + const maxRefs = provider.capabilities.edit.maxInputImages; + caps.push( + `editing${typeof maxRefs === "number" ? ` up to ${maxRefs} ref${maxRefs === 1 ? "" : "s"}` : ""}`, + ); } - if ((provider.supportedResolutions?.length ?? 0) > 0) { - caps.push(`resolutions ${provider.supportedResolutions?.join("/")}`); + if ((provider.capabilities.geometry?.resolutions?.length ?? 0) > 0) { + caps.push(`resolutions ${provider.capabilities.geometry?.resolutions?.join("/")}`); } - if ((provider.supportedSizes?.length ?? 0) > 0) { - caps.push(`sizes ${provider.supportedSizes?.join(", ")}`); + if ((provider.capabilities.geometry?.sizes?.length ?? 0) > 0) { + caps.push(`sizes ${provider.capabilities.geometry?.sizes?.join(", ")}`); + } + if ((provider.capabilities.geometry?.aspectRatios?.length ?? 0) > 0) { + caps.push(`aspect ratios ${provider.capabilities.geometry?.aspectRatios?.join(", ")}`); } const modelLine = provider.models.length > 0 @@ -396,7 +540,9 @@ export function createImageGenerateTool(options?: { const prompt = readStringParam(params, "prompt", { required: true }); const imageInputs = normalizeReferenceImages(params); const model = readStringParam(params, "model"); + const filename = readStringParam(params, "filename"); const size = readStringParam(params, "size"); + const aspectRatio = normalizeAspectRatio(readStringParam(params, "aspectRatio")); const explicitResolution = normalizeResolution(readStringParam(params, "resolution")); const count = resolveRequestedCount(params); const loadedReferenceImages = await loadReferenceImages({ @@ -412,6 +558,19 @@ export function createImageGenerateTool(options?: { : inputImages.length > 0 ? 
await inferResolutionFromInputImages(inputImages) : undefined); + const selectedProvider = resolveSelectedImageGenerationProvider({ + config: effectiveCfg, + imageGenerationModelConfig, + modelOverride: model, + }); + validateImageGenerationCapabilities({ + provider: selectedProvider, + count, + inputImageCount: inputImages.length, + size, + aspectRatio, + resolution, + }); const result = await generateImage({ cfg: effectiveCfg, @@ -419,6 +578,7 @@ export function createImageGenerateTool(options?: { agentDir: options?.agentDir, modelOverride: model, size, + aspectRatio, resolution, count, inputImages, @@ -431,7 +591,7 @@ export function createImageGenerateTool(options?: { image.mimeType, "tool-image-generation", undefined, - image.fileName, + filename || image.fileName, ), ), ); @@ -468,6 +628,8 @@ export function createImageGenerateTool(options?: { : {}), ...(resolution ? { resolution } : {}), ...(size ? { size } : {}), + ...(aspectRatio ? { aspectRatio } : {}), + ...(filename ? { filename } : {}), attempts: result.attempts, metadata: result.metadata, ...(revisedPrompts.length > 0 ? 
{ revisedPrompts } : {}), diff --git a/src/commands/doctor-legacy-config.migrations.test.ts b/src/commands/doctor-legacy-config.migrations.test.ts index e364d1b7168..738827c31c6 100644 --- a/src/commands/doctor-legacy-config.migrations.test.ts +++ b/src/commands/doctor-legacy-config.migrations.test.ts @@ -297,4 +297,99 @@ describe("normalizeCompatibilityConfigValues", () => { "Moved browser.ssrfPolicy.allowPrivateNetwork → browser.ssrfPolicy.dangerouslyAllowPrivateNetwork (true).", ); }); + + it("migrates nano-banana skill config to native image generation config", () => { + const res = normalizeCompatibilityConfigValues({ + skills: { + entries: { + "nano-banana-pro": { + enabled: true, + apiKey: { source: "env", provider: "default", id: "GEMINI_API_KEY" }, + }, + }, + }, + }); + + expect(res.config.agents?.defaults?.imageGenerationModel).toEqual({ + primary: "google/gemini-3-pro-image-preview", + }); + expect(res.config.models?.providers?.google?.apiKey).toEqual({ + source: "env", + provider: "default", + id: "GEMINI_API_KEY", + }); + expect(res.config.skills?.entries).toBeUndefined(); + expect(res.changes).toEqual([ + "Moved skills.entries.nano-banana-pro → agents.defaults.imageGenerationModel.primary (google/gemini-3-pro-image-preview).", + "Moved skills.entries.nano-banana-pro.apiKey → models.providers.google.apiKey.", + "Removed legacy skills.entries.nano-banana-pro.", + ]); + }); + + it("prefers legacy nano-banana env.GEMINI_API_KEY over skill apiKey during migration", () => { + const res = normalizeCompatibilityConfigValues({ + skills: { + entries: { + "nano-banana-pro": { + apiKey: "ignored-skill-api-key", + env: { + GEMINI_API_KEY: "env-gemini-key", + }, + }, + }, + }, + }); + + expect(res.config.models?.providers?.google?.apiKey).toBe("env-gemini-key"); + expect(res.changes).toContain( + "Moved skills.entries.nano-banana-pro.env.GEMINI_API_KEY → models.providers.google.apiKey.", + ); + }); + + it("preserves explicit native config while removing legacy 
nano-banana skill config", () => { + const res = normalizeCompatibilityConfigValues({ + agents: { + defaults: { + imageGenerationModel: { + primary: "fal/fal-ai/flux/dev", + }, + }, + }, + models: { + providers: { + google: { + apiKey: "existing-google-key", + }, + }, + }, + skills: { + entries: { + "nano-banana-pro": { + apiKey: "legacy-gemini-key", + }, + peekaboo: { enabled: true }, + }, + }, + }); + + expect(res.config.agents?.defaults?.imageGenerationModel).toEqual({ + primary: "fal/fal-ai/flux/dev", + }); + expect(res.config.models?.providers?.google?.apiKey).toBe("existing-google-key"); + expect(res.config.skills?.entries).toEqual({ + peekaboo: { enabled: true }, + }); + expect(res.changes).toEqual(["Removed legacy skills.entries.nano-banana-pro."]); + }); + + it("removes nano-banana from skills.allowBundled during migration", () => { + const res = normalizeCompatibilityConfigValues({ + skills: { + allowBundled: ["peekaboo", "nano-banana-pro"], + }, + }); + + expect(res.config.skills?.allowBundled).toEqual(["peekaboo"]); + expect(res.changes).toEqual(["Removed nano-banana-pro from skills.allowBundled."]); + }); }); diff --git a/src/commands/doctor-legacy-config.ts b/src/commands/doctor-legacy-config.ts index 2d6bfa83a11..8072b89854b 100644 --- a/src/commands/doctor-legacy-config.ts +++ b/src/commands/doctor-legacy-config.ts @@ -15,6 +15,8 @@ export function normalizeCompatibilityConfigValues(cfg: OpenClawConfig): { changes: string[]; } { const changes: string[] = []; + const NANO_BANANA_SKILL_KEY = "nano-banana-pro"; + const NANO_BANANA_MODEL = "google/gemini-3-pro-image-preview"; let next: OpenClawConfig = cfg; const isRecord = (value: unknown): value is Record => @@ -471,7 +473,121 @@ export function normalizeCompatibilityConfigValues(cfg: OpenClawConfig): { ); }; + const normalizeLegacyNanoBananaSkill = () => { + const rawSkills = next.skills; + if (!isRecord(rawSkills)) { + return; + } + + let skillsChanged = false; + let skills = 
structuredClone(rawSkills); + + if (Array.isArray(skills.allowBundled)) { + const allowBundled = skills.allowBundled.filter( + (value) => typeof value !== "string" || value.trim() !== NANO_BANANA_SKILL_KEY, + ); + if (allowBundled.length !== skills.allowBundled.length) { + if (allowBundled.length === 0) { + delete skills.allowBundled; + changes.push(`Removed skills.allowBundled entry for ${NANO_BANANA_SKILL_KEY}.`); + } else { + skills.allowBundled = allowBundled; + changes.push(`Removed ${NANO_BANANA_SKILL_KEY} from skills.allowBundled.`); + } + skillsChanged = true; + } + } + + const rawEntries = skills.entries; + if (!isRecord(rawEntries)) { + if (skillsChanged) { + next = { ...next, skills }; + } + return; + } + + const rawLegacyEntry = rawEntries[NANO_BANANA_SKILL_KEY]; + if (!isRecord(rawLegacyEntry)) { + if (skillsChanged) { + next = { ...next, skills }; + } + return; + } + + const existingImageGenerationModel = next.agents?.defaults?.imageGenerationModel; + if (existingImageGenerationModel === undefined) { + next = { + ...next, + agents: { + ...next.agents, + defaults: { + ...next.agents?.defaults, + imageGenerationModel: { + primary: NANO_BANANA_MODEL, + }, + }, + }, + }; + changes.push( + `Moved skills.entries.${NANO_BANANA_SKILL_KEY} → agents.defaults.imageGenerationModel.primary (${NANO_BANANA_MODEL}).`, + ); + } + + const legacyEnv = isRecord(rawLegacyEntry.env) ? rawLegacyEntry.env : undefined; + const legacyEnvApiKey = + typeof legacyEnv?.GEMINI_API_KEY === "string" ? legacyEnv.GEMINI_API_KEY.trim() : ""; + const legacyApiKey = + legacyEnvApiKey || + (typeof rawLegacyEntry.apiKey === "string" + ? rawLegacyEntry.apiKey.trim() + : rawLegacyEntry.apiKey && isRecord(rawLegacyEntry.apiKey) + ? structuredClone(rawLegacyEntry.apiKey) + : undefined); + + const rawModels = isRecord(next.models) ? structuredClone(next.models) : {}; + const rawProviders = isRecord(rawModels.providers) ? 
{ ...rawModels.providers } : {}; + const rawGoogle = isRecord(rawProviders.google) ? { ...rawProviders.google } : {}; + const hasGoogleApiKey = rawGoogle.apiKey !== undefined; + if (!hasGoogleApiKey && legacyApiKey) { + rawGoogle.apiKey = legacyApiKey; + rawProviders.google = rawGoogle; + rawModels.providers = rawProviders; + next = { + ...next, + models: rawModels as OpenClawConfig["models"], + }; + changes.push( + `Moved skills.entries.${NANO_BANANA_SKILL_KEY}.${legacyEnvApiKey ? "env.GEMINI_API_KEY" : "apiKey"} → models.providers.google.apiKey.`, + ); + } + + const entries = { ...rawEntries }; + delete entries[NANO_BANANA_SKILL_KEY]; + if (Object.keys(entries).length === 0) { + delete skills.entries; + changes.push(`Removed legacy skills.entries.${NANO_BANANA_SKILL_KEY}.`); + } else { + skills.entries = entries; + changes.push(`Removed legacy skills.entries.${NANO_BANANA_SKILL_KEY}.`); + } + skillsChanged = true; + + if (Object.keys(skills).length === 0) { + const { skills: _ignored, ...rest } = next; + next = rest; + return; + } + + if (skillsChanged) { + next = { + ...next, + skills, + }; + } + }; + normalizeBrowserSsrFPolicyAlias(); + normalizeLegacyNanoBananaSkill(); const legacyAckReaction = cfg.messages?.ackReaction?.trim(); const hasWhatsAppConfig = cfg.channels?.whatsapp !== undefined; diff --git a/src/image-generation/providers/fal.test.ts b/src/image-generation/providers/fal.test.ts index c610c1b9c0c..ea583dbe431 100644 --- a/src/image-generation/providers/fal.test.ts +++ b/src/image-generation/providers/fal.test.ts @@ -127,6 +127,97 @@ describe("fal image-generation provider", () => { ); }); + it("maps aspect ratio for text generation without forcing a square default", async () => { + vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({ + apiKey: "fal-test-key", + source: "env", + mode: "api-key", + }); + const fetchMock = vi + .fn() + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ + images: [{ url: 
"https://v3.fal.media/files/example/wide.png" }], + }), + }) + .mockResolvedValueOnce({ + ok: true, + headers: new Headers({ "content-type": "image/png" }), + arrayBuffer: async () => Buffer.from("wide-data"), + }); + vi.stubGlobal("fetch", fetchMock); + + const provider = buildFalImageGenerationProvider(); + await provider.generateImage({ + provider: "fal", + model: "fal-ai/flux/dev", + prompt: "wide cinematic shot", + cfg: {}, + aspectRatio: "16:9", + }); + + expect(fetchMock).toHaveBeenNthCalledWith( + 1, + "https://fal.run/fal-ai/flux/dev", + expect.objectContaining({ + method: "POST", + body: JSON.stringify({ + prompt: "wide cinematic shot", + image_size: "landscape_16_9", + num_images: 1, + output_format: "png", + }), + }), + ); + }); + + it("combines resolution and aspect ratio for text generation", async () => { + vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({ + apiKey: "fal-test-key", + source: "env", + mode: "api-key", + }); + const fetchMock = vi + .fn() + .mockResolvedValueOnce({ + ok: true, + json: async () => ({ + images: [{ url: "https://v3.fal.media/files/example/portrait.png" }], + }), + }) + .mockResolvedValueOnce({ + ok: true, + headers: new Headers({ "content-type": "image/png" }), + arrayBuffer: async () => Buffer.from("portrait-data"), + }); + vi.stubGlobal("fetch", fetchMock); + + const provider = buildFalImageGenerationProvider(); + await provider.generateImage({ + provider: "fal", + model: "fal-ai/flux/dev", + prompt: "portrait poster", + cfg: {}, + resolution: "2K", + aspectRatio: "9:16", + }); + + expect(fetchMock).toHaveBeenNthCalledWith( + 1, + "https://fal.run/fal-ai/flux/dev", + expect.objectContaining({ + method: "POST", + body: JSON.stringify({ + prompt: "portrait poster", + image_size: { width: 1152, height: 2048 }, + num_images: 1, + output_format: "png", + }), + }), + ); + }); + it("rejects multi-image edit requests for now", async () => { vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({ 
apiKey: "fal-test-key", @@ -148,4 +239,24 @@ describe("fal image-generation provider", () => { }), ).rejects.toThrow("at most one reference image"); }); + + it("rejects aspect ratio overrides for the current edit endpoint", async () => { + vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({ + apiKey: "fal-test-key", + source: "env", + mode: "api-key", + }); + + const provider = buildFalImageGenerationProvider(); + await expect( + provider.generateImage({ + provider: "fal", + model: "fal-ai/flux/dev", + prompt: "make it widescreen", + cfg: {}, + aspectRatio: "16:9", + inputImages: [{ buffer: Buffer.from("one"), mimeType: "image/png" }], + }), + ).rejects.toThrow("does not support aspectRatio overrides"); + }); }); diff --git a/src/image-generation/providers/fal.ts b/src/image-generation/providers/fal.ts index b9bd5517651..4059859e534 100644 --- a/src/image-generation/providers/fal.ts +++ b/src/image-generation/providers/fal.ts @@ -5,8 +5,15 @@ import type { GeneratedImageAsset } from "../types.js"; const DEFAULT_FAL_BASE_URL = "https://fal.run"; const DEFAULT_FAL_IMAGE_MODEL = "fal-ai/flux/dev"; const DEFAULT_FAL_EDIT_SUBPATH = "image-to-image"; -const DEFAULT_OUTPUT_SIZE = "square_hd"; const DEFAULT_OUTPUT_FORMAT = "png"; +const FAL_SUPPORTED_SIZES = [ + "1024x1024", + "1024x1536", + "1536x1024", + "1024x1792", + "1792x1024", +] as const; +const FAL_SUPPORTED_ASPECT_RATIOS = ["1:1", "4:3", "3:4", "16:9", "9:16"] as const; type FalGeneratedImage = { url?: string; @@ -57,23 +64,85 @@ function parseSize(raw: string | undefined): { width: number; height: number } | return { width, height }; } -function mapResolutionToSize(resolution: "1K" | "2K" | "4K" | undefined): FalImageSize | undefined { +function mapResolutionToEdge(resolution: "1K" | "2K" | "4K" | undefined): number | undefined { if (!resolution) { return undefined; } - const edge = resolution === "4K" ? 4096 : resolution === "2K" ? 
2048 : 1024; - return { width: edge, height: edge }; + return resolution === "4K" ? 4096 : resolution === "2K" ? 2048 : 1024; +} + +function aspectRatioToEnum(aspectRatio: string | undefined): string | undefined { + const normalized = aspectRatio?.trim(); + if (!normalized) { + return undefined; + } + if (normalized === "1:1") { + return "square_hd"; + } + if (normalized === "4:3") { + return "landscape_4_3"; + } + if (normalized === "3:4") { + return "portrait_4_3"; + } + if (normalized === "16:9") { + return "landscape_16_9"; + } + if (normalized === "9:16") { + return "portrait_16_9"; + } + return undefined; +} + +function aspectRatioToDimensions(aspectRatio: string, edge: number): { width: number; height: number } { + const match = /^(\d+):(\d+)$/u.exec(aspectRatio.trim()); + if (!match) { + throw new Error(`Invalid fal aspect ratio: ${aspectRatio}`); + } + const widthRatio = Number.parseInt(match[1] ?? "", 10); + const heightRatio = Number.parseInt(match[2] ?? "", 10); + if (!Number.isFinite(widthRatio) || !Number.isFinite(heightRatio) || widthRatio <= 0 || heightRatio <= 0) { + throw new Error(`Invalid fal aspect ratio: ${aspectRatio}`); + } + if (widthRatio >= heightRatio) { + return { + width: edge, + height: Math.max(1, Math.round((edge * heightRatio) / widthRatio)), + }; + } + return { + width: Math.max(1, Math.round((edge * widthRatio) / heightRatio)), + height: edge, + }; } function resolveFalImageSize(params: { size?: string; resolution?: "1K" | "2K" | "4K"; -}): FalImageSize { + aspectRatio?: string; + hasInputImages: boolean; +}): FalImageSize | undefined { const parsed = parseSize(params.size); if (parsed) { return parsed; } - return mapResolutionToSize(params.resolution) ?? 
DEFAULT_OUTPUT_SIZE; + + const normalizedAspectRatio = params.aspectRatio?.trim(); + if (normalizedAspectRatio && params.hasInputImages) { + throw new Error("fal image edit endpoint does not support aspectRatio overrides"); + } + + const edge = mapResolutionToEdge(params.resolution); + if (normalizedAspectRatio && edge) { + return aspectRatioToDimensions(normalizedAspectRatio, edge); + } + if (edge) { + return { width: edge, height: edge }; + } + if (normalizedAspectRatio) { + return aspectRatioToEnum(normalizedAspectRatio) ?? aspectRatioToDimensions(normalizedAspectRatio, 1024); + } + return undefined; } function toDataUri(buffer: Buffer, mimeType: string): string { @@ -111,9 +180,27 @@ export function buildFalImageGenerationProvider(): ImageGenerationProviderPlugin label: "fal", defaultModel: DEFAULT_FAL_IMAGE_MODEL, models: [DEFAULT_FAL_IMAGE_MODEL, `${DEFAULT_FAL_IMAGE_MODEL}/${DEFAULT_FAL_EDIT_SUBPATH}`], - supportedSizes: ["1024x1024", "1024x1536", "1536x1024", "1024x1792", "1792x1024"], - supportedResolutions: ["1K", "2K", "4K"], - supportsImageEditing: true, + capabilities: { + generate: { + maxCount: 4, + supportsSize: true, + supportsAspectRatio: true, + supportsResolution: true, + }, + edit: { + enabled: true, + maxCount: 4, + maxInputImages: 1, + supportsSize: true, + supportsAspectRatio: false, + supportsResolution: true, + }, + geometry: { + sizes: [...FAL_SUPPORTED_SIZES], + aspectRatios: [...FAL_SUPPORTED_ASPECT_RATIOS], + resolutions: ["1K", "2K", "4K"], + }, + }, async generateImage(req) { const auth = await resolveApiKeyForProvider({ provider: "fal", @@ -128,18 +215,22 @@ export function buildFalImageGenerationProvider(): ImageGenerationProviderPlugin throw new Error("fal image generation currently supports at most one reference image"); } + const hasInputImages = (req.inputImages?.length ?? 
0) > 0;
         const imageSize = resolveFalImageSize({
           size: req.size,
           resolution: req.resolution,
+          aspectRatio: req.aspectRatio,
+          hasInputImages,
         });
-        const hasInputImages = (req.inputImages?.length ?? 0) > 0;
         const model = ensureFalModelPath(req.model, hasInputImages);
 
         const requestBody: Record = {
           prompt: req.prompt,
-          image_size: imageSize,
+          // Keep image_size in its original second position: request bodies are
+          // compared as JSON.stringify strings in tests, so key order is observable.
+          ...(imageSize !== undefined ? { image_size: imageSize } : {}),
           num_images: req.count ?? 1,
           output_format: DEFAULT_OUTPUT_FORMAT,
         };
 
         if (hasInputImages) {
           const [input] = req.inputImages ?? [];
diff --git a/src/image-generation/providers/google.test.ts b/src/image-generation/providers/google.test.ts
index 224779f3429..5c64481edae 100644
--- a/src/image-generation/providers/google.test.ts
+++ b/src/image-generation/providers/google.test.ts
@@ -197,7 +197,6 @@ describe("Google image-generation provider", () => {
         generationConfig: {
           responseModalities: ["TEXT", "IMAGE"],
           imageConfig: {
-            aspectRatio: "1:1",
             imageSize: "4K",
           },
         },
@@ -205,4 +204,62 @@
       }),
     );
   });
+
+  it("forwards explicit aspect ratio without forcing a default when size is omitted", async () => {
+    vi.spyOn(modelAuth, "resolveApiKeyForProvider").mockResolvedValue({
+      apiKey: "google-test-key",
+      source: "env",
+      mode: "api-key",
+    });
+    const fetchMock = vi.fn().mockResolvedValue({
+      ok: true,
+      json: async () => ({
+        candidates: [
+          {
+            content: {
+              parts: [
+                {
+                  inlineData: {
+                    mimeType: "image/png",
+                    data: Buffer.from("png-data").toString("base64"),
+                  },
+                },
+              ],
+            },
+          },
+        ],
+      }),
+    });
+    vi.stubGlobal("fetch", fetchMock);
+
+    const provider = buildGoogleImageGenerationProvider();
+    await provider.generateImage({
+      provider: "google",
+      model: "gemini-3-pro-image-preview",
+      prompt: "portrait photo",
+      cfg: {},
+      aspectRatio: "9:16",
+    });
+
+    expect(fetchMock).toHaveBeenCalledWith(
+      "https://generativelanguage.googleapis.com/v1beta/models/gemini-3-pro-image-preview:generateContent",
expect.objectContaining({ + method: "POST", + body: JSON.stringify({ + contents: [ + { + role: "user", + parts: [{ text: "portrait photo" }], + }, + ], + generationConfig: { + responseModalities: ["TEXT", "IMAGE"], + imageConfig: { + aspectRatio: "9:16", + }, + }, + }), + }), + ); + }); }); diff --git a/src/image-generation/providers/google.ts b/src/image-generation/providers/google.ts index f7469b147fa..24c725fa666 100644 --- a/src/image-generation/providers/google.ts +++ b/src/image-generation/providers/google.ts @@ -11,7 +11,25 @@ import type { ImageGenerationProviderPlugin } from "../../plugins/types.js"; const DEFAULT_GOOGLE_IMAGE_BASE_URL = "https://generativelanguage.googleapis.com/v1beta"; const DEFAULT_GOOGLE_IMAGE_MODEL = "gemini-3.1-flash-image-preview"; const DEFAULT_OUTPUT_MIME = "image/png"; -const DEFAULT_ASPECT_RATIO = "1:1"; +const GOOGLE_SUPPORTED_SIZES = [ + "1024x1024", + "1024x1536", + "1536x1024", + "1024x1792", + "1792x1024", +] as const; +const GOOGLE_SUPPORTED_ASPECT_RATIOS = [ + "1:1", + "2:3", + "3:2", + "3:4", + "4:3", + "4:5", + "5:4", + "9:16", + "16:9", + "21:9", +] as const; type GoogleInlineDataPart = { mimeType?: string; @@ -46,7 +64,7 @@ function mapSizeToImageConfig( ): { aspectRatio?: string; imageSize?: "2K" | "4K" } | undefined { const trimmed = size?.trim(); if (!trimmed) { - return { aspectRatio: DEFAULT_ASPECT_RATIO }; + return undefined; } const normalized = trimmed.toLowerCase(); @@ -81,8 +99,27 @@ export function buildGoogleImageGenerationProvider(): ImageGenerationProviderPlu label: "Google", defaultModel: DEFAULT_GOOGLE_IMAGE_MODEL, models: [DEFAULT_GOOGLE_IMAGE_MODEL, "gemini-3-pro-image-preview"], - supportedResolutions: ["1K", "2K", "4K"], - supportsImageEditing: true, + capabilities: { + generate: { + maxCount: 4, + supportsSize: true, + supportsAspectRatio: true, + supportsResolution: true, + }, + edit: { + enabled: true, + maxCount: 4, + maxInputImages: 5, + supportsSize: true, + supportsAspectRatio: true, + 
supportsResolution: true, + }, + geometry: { + sizes: [...GOOGLE_SUPPORTED_SIZES], + aspectRatios: [...GOOGLE_SUPPORTED_ASPECT_RATIOS], + resolutions: ["1K", "2K", "4K"], + }, + }, async generateImage(req) { const auth = await resolveApiKeyForProvider({ provider: "google", @@ -111,6 +148,7 @@ export function buildGoogleImageGenerationProvider(): ImageGenerationProviderPlu })); const resolvedImageConfig = { ...imageConfig, + ...(req.aspectRatio?.trim() ? { aspectRatio: req.aspectRatio.trim() } : {}), ...(req.resolution ? { imageSize: req.resolution } : {}), }; diff --git a/src/image-generation/providers/openai.ts b/src/image-generation/providers/openai.ts index 1a0afe1f67d..7bce3854ab3 100644 --- a/src/image-generation/providers/openai.ts +++ b/src/image-generation/providers/openai.ts @@ -5,6 +5,7 @@ const DEFAULT_OPENAI_IMAGE_BASE_URL = "https://api.openai.com/v1"; const DEFAULT_OPENAI_IMAGE_MODEL = "gpt-image-1"; const DEFAULT_OUTPUT_MIME = "image/png"; const DEFAULT_SIZE = "1024x1024"; +const OPENAI_SUPPORTED_SIZES = ["1024x1024", "1024x1536", "1536x1024"] as const; type OpenAIImageApiResponse = { data?: Array<{ @@ -24,7 +25,25 @@ export function buildOpenAIImageGenerationProvider(): ImageGenerationProviderPlu label: "OpenAI", defaultModel: DEFAULT_OPENAI_IMAGE_MODEL, models: [DEFAULT_OPENAI_IMAGE_MODEL], - supportedSizes: ["1024x1024", "1024x1536", "1536x1024"], + capabilities: { + generate: { + maxCount: 4, + supportsSize: true, + supportsAspectRatio: false, + supportsResolution: false, + }, + edit: { + enabled: false, + maxCount: 0, + maxInputImages: 0, + supportsSize: false, + supportsAspectRatio: false, + supportsResolution: false, + }, + geometry: { + sizes: [...OPENAI_SUPPORTED_SIZES], + }, + }, async generateImage(req) { if ((req.inputImages?.length ?? 
0) > 0) { throw new Error("OpenAI image generation provider does not support reference-image edits"); diff --git a/src/image-generation/runtime.test.ts b/src/image-generation/runtime.test.ts index b044c899c60..39dd03d0b9c 100644 --- a/src/image-generation/runtime.test.ts +++ b/src/image-generation/runtime.test.ts @@ -19,6 +19,10 @@ describe("image-generation runtime helpers", () => { source: "test", provider: { id: "image-plugin", + capabilities: { + generate: {}, + edit: { enabled: false }, + }, async generateImage(req) { seenAuthStore = req.authStore; return { @@ -76,7 +80,18 @@ describe("image-generation runtime helpers", () => { id: "image-plugin", defaultModel: "img-v1", models: ["img-v1", "img-v2"], - supportedResolutions: ["1K", "2K"], + capabilities: { + generate: { + supportsResolution: true, + }, + edit: { + enabled: true, + maxInputImages: 3, + }, + geometry: { + resolutions: ["1K", "2K"], + }, + }, generateImage: async () => ({ images: [{ buffer: Buffer.from("x"), mimeType: "image/png" }], }), @@ -89,7 +104,18 @@ describe("image-generation runtime helpers", () => { id: "image-plugin", defaultModel: "img-v1", models: ["img-v1", "img-v2"], - supportedResolutions: ["1K", "2K"], + capabilities: { + generate: { + supportsResolution: true, + }, + edit: { + enabled: true, + maxInputImages: 3, + }, + geometry: { + resolutions: ["1K", "2K"], + }, + }, }, ]); }); diff --git a/src/image-generation/runtime.ts b/src/image-generation/runtime.ts index f25048cd0b1..4416fba785c 100644 --- a/src/image-generation/runtime.ts +++ b/src/image-generation/runtime.ts @@ -25,6 +25,7 @@ export type GenerateImageParams = { modelOverride?: string; count?: number; size?: string; + aspectRatio?: string; resolution?: ImageGenerationResolution; inputImages?: ImageGenerationSourceImage[]; }; @@ -142,6 +143,7 @@ export async function generateImage( authStore: params.authStore, count: params.count, size: params.size, + aspectRatio: params.aspectRatio, resolution: params.resolution, 
inputImages: params.inputImages, }); diff --git a/src/image-generation/types.ts b/src/image-generation/types.ts index 7ea530ac2b9..123d5d98e6c 100644 --- a/src/image-generation/types.ts +++ b/src/image-generation/types.ts @@ -27,6 +27,7 @@ export type ImageGenerationRequest = { authStore?: AuthProfileStore; count?: number; size?: string; + aspectRatio?: string; resolution?: ImageGenerationResolution; inputImages?: ImageGenerationSourceImage[]; }; @@ -37,14 +38,36 @@ export type ImageGenerationResult = { metadata?: Record; }; +export type ImageGenerationModeCapabilities = { + maxCount?: number; + supportsSize?: boolean; + supportsAspectRatio?: boolean; + supportsResolution?: boolean; +}; + +export type ImageGenerationEditCapabilities = ImageGenerationModeCapabilities & { + enabled: boolean; + maxInputImages?: number; +}; + +export type ImageGenerationGeometryCapabilities = { + sizes?: string[]; + aspectRatios?: string[]; + resolutions?: ImageGenerationResolution[]; +}; + +export type ImageGenerationProviderCapabilities = { + generate: ImageGenerationModeCapabilities; + edit: ImageGenerationEditCapabilities; + geometry?: ImageGenerationGeometryCapabilities; +}; + export type ImageGenerationProvider = { id: string; aliases?: string[]; label?: string; defaultModel?: string; models?: string[]; - supportedSizes?: string[]; - supportedResolutions?: ImageGenerationResolution[]; - supportsImageEditing?: boolean; + capabilities: ImageGenerationProviderCapabilities; generateImage: (req: ImageGenerationRequest) => Promise; };