diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts
index 03cc011fc66..fad18e67faa 100644
--- a/extensions/voice-call/src/config.test.ts
+++ b/extensions/voice-call/src/config.test.ts
@@ -1,5 +1,10 @@
 import { afterEach, beforeEach, describe, expect, it } from "vitest";
-import { validateProviderConfig, resolveVoiceCallConfig, type VoiceCallConfig } from "./config.js";
+import {
+  validateProviderConfig,
+  normalizeVoiceCallConfig,
+  resolveVoiceCallConfig,
+  type VoiceCallConfig,
+} from "./config.js";
 import { createVoiceCallBaseConfig } from "./test-fixtures.js";
 
 function createBaseConfig(provider: "telnyx" | "twilio" | "plivo" | "mock"): VoiceCallConfig {
@@ -166,3 +171,22 @@ describe("validateProviderConfig", () => {
     });
   });
 });
+
+describe("normalizeVoiceCallConfig", () => {
+  it("fills nested runtime defaults from a partial config boundary", () => {
+    const normalized = normalizeVoiceCallConfig({
+      enabled: true,
+      provider: "mock",
+      streaming: {
+        enabled: true,
+        streamPath: "/custom-stream",
+      },
+    });
+
+    expect(normalized.serve.path).toBe("/voice/webhook");
+    expect(normalized.streaming.streamPath).toBe("/custom-stream");
+    expect(normalized.streaming.sttModel).toBe("gpt-4o-transcribe");
+    expect(normalized.tunnel.provider).toBe("none");
+    expect(normalized.webhookSecurity.allowedHosts).toEqual([]);
+  });
+});
diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts
index 75012723680..405eda179d9 100644
--- a/extensions/voice-call/src/config.ts
+++ b/extensions/voice-call/src/config.ts
@@ -350,17 +350,68 @@ export const VoiceCallConfigSchema = z
   .strict();
 
 export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
+/** Recursively marks every property (and array element type) of `T` as optional. */
+type DeepPartial<T> =
+  T extends Array<infer U>
+    ? DeepPartial<U>[]
+    : T extends object
+      ? { [K in keyof T]?: DeepPartial<T[K]> }
+      : T;
+/** Partially-specified config accepted at boundaries before defaults are applied. */
+export type VoiceCallConfigInput = DeepPartial<VoiceCallConfig>;
 
 // -----------------------------------------------------------------------------
 // Configuration Helpers
 // -----------------------------------------------------------------------------
 
+const DEFAULT_VOICE_CALL_CONFIG = VoiceCallConfigSchema.parse({});
+
+/** Deep-copies the schema defaults so callers never mutate the shared constant. */
+function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
+  return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
+}
+
+/**
+ * Merges a partial config over the schema defaults (top level plus each nested
+ * section spread below).
+ *
+ * The input is deep-copied before merging, so the result never aliases
+ * caller-owned objects (resolveVoiceCallConfig mutates what this returns) and
+ * explicitly-`undefined` fields cannot shadow a default when spread.
+ *
+ * @param config - Partial (possibly empty) voice-call configuration.
+ * @returns A new config object with defaults applied.
+ */
+export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
+  const defaults = cloneDefaultVoiceCallConfig();
+  // JSON round-trip: deep copy that also drops `undefined` properties.
+  const input = JSON.parse(JSON.stringify(config)) as VoiceCallConfigInput;
+  return {
+    ...defaults,
+    ...input,
+    allowFrom: input.allowFrom ?? defaults.allowFrom,
+    outbound: { ...defaults.outbound, ...input.outbound },
+    serve: { ...defaults.serve, ...input.serve },
+    tailscale: { ...defaults.tailscale, ...input.tailscale },
+    tunnel: { ...defaults.tunnel, ...input.tunnel },
+    webhookSecurity: {
+      ...defaults.webhookSecurity,
+      ...input.webhookSecurity,
+      allowedHosts: input.webhookSecurity?.allowedHosts ?? defaults.webhookSecurity.allowedHosts,
+      trustedProxyIPs:
+        input.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
+    },
+    streaming: { ...defaults.streaming, ...input.streaming },
+    stt: { ...defaults.stt, ...input.stt },
+    tts: input.tts ?? defaults.tts,
+  };
+}
+
 /**
  * Resolves the configuration by merging environment variables into missing fields.
  * Returns a new configuration object with environment variables applied.
  */
-export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig {
-  const resolved = JSON.parse(JSON.stringify(config)) as VoiceCallConfig;
+export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
+  const resolved = normalizeVoiceCallConfig(config);
 
   // Telnyx
   if (resolved.provider === "telnyx") {
@@ -405,7 +456,7 @@ export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig
     resolved.webhookSecurity.trustForwardingHeaders ?? false;
   resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
 
-  return resolved;
+  return normalizeVoiceCallConfig(resolved);
 }
 
 /**
diff --git a/extensions/voice-call/src/providers/tts-openai.test.ts b/extensions/voice-call/src/providers/tts-openai.test.ts
index 9bc7a026a73..79d4644b59f 100644
--- a/extensions/voice-call/src/providers/tts-openai.test.ts
+++ b/extensions/voice-call/src/providers/tts-openai.test.ts
@@ -3,6 +3,8 @@ import type { OpenAITTSConfig } from "./tts-openai.js";
 import { OpenAITTSProvider } from "./tts-openai.js";
 
 type ProviderInternals = {
+  model: string;
+  voice: string;
   speed: number;
 };
 
@@ -27,4 +29,15 @@ describe("OpenAITTSProvider constructor defaults", () => {
 
     expect(provider.speed).toBe(1.0);
   });
+
+  it("treats blank model and voice overrides as unset", () => {
+    const provider = readProviderInternals({
+      apiKey: "sk-test", // pragma: allowlist secret
+      model: " ",
+      voice: "",
+    });
+
+    expect(provider.model).toBe("gpt-4o-mini-tts");
+    expect(provider.voice).toBe("coral");
+  });
 });
diff --git a/extensions/voice-call/src/providers/tts-openai.ts b/extensions/voice-call/src/providers/tts-openai.ts
index a1ff70082cd..a27030b4578 100644
--- a/extensions/voice-call/src/providers/tts-openai.ts
+++ b/extensions/voice-call/src/providers/tts-openai.ts
@@ -66,6 +66,12 @@ export const OPENAI_TTS_VOICES = [
 
 export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
 
+/** Trims `value`; returns `undefined` when it is missing or blank so `??` defaults apply. */
+function trimToUndefined(value: string | undefined): string | undefined {
+  const trimmed = value?.trim();
+  return trimmed ? trimmed : undefined;
+}
+
 /**
  * OpenAI TTS Provider for generating speech audio.
  */
@@ -77,13 +83,14 @@ export class OpenAITTSProvider {
   private instructions?: string;
 
   constructor(config: OpenAITTSConfig = {}) {
-    this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
+    this.apiKey =
+      trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? "";
     // Default to gpt-4o-mini-tts for intelligent realtime applications
-    this.model = config.model || "gpt-4o-mini-tts";
+    this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts";
     // Default to coral - good balance of quality and natural tone
-    this.voice = (config.voice as OpenAITTSVoice) || "coral";
+    this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
     this.speed = config.speed ?? 1.0;
-    this.instructions = config.instructions;
+    this.instructions = trimToUndefined(config.instructions);
 
     if (!this.apiKey) {
       throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
@@ -105,7 +112,7 @@ export class OpenAITTSProvider {
     };
 
     // Add instructions if using gpt-4o-mini-tts model
-    const effectiveInstructions = instructions || this.instructions;
+    const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
     if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
       body.instructions = effectiveInstructions;
     }
diff --git a/extensions/voice-call/src/webhook.test.ts b/extensions/voice-call/src/webhook.test.ts
index 6e3ecc6aafa..f5a827a3ef3 100644
--- a/extensions/voice-call/src/webhook.test.ts
+++ b/extensions/voice-call/src/webhook.test.ts
@@ -274,6 +274,32 @@ describe("VoiceCallWebhookServer replay handling", () => {
   });
 });
 
+describe("VoiceCallWebhookServer response normalization", () => {
+  it("preserves explicit empty provider response bodies", async () => {
+    const responseProvider: VoiceCallProvider = {
+      ...provider,
+      parseWebhookEvent: () => ({
+        events: [],
+        statusCode: 204,
+        providerResponseBody: "",
+      }),
+    };
+    const { manager } = createManager([]);
+    const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
+    const server = new VoiceCallWebhookServer(config, manager, responseProvider);
+
+    try {
+      const baseUrl = await server.start();
+      const response = await postWebhookForm(server, baseUrl, "CallSid=CA123&SpeechResult=hello");
+
+      expect(response.status).toBe(204);
+      expect(await response.text()).toBe("");
+    } finally {
+      await server.stop();
+    }
+  });
+});
+
 describe("VoiceCallWebhookServer start idempotency", () => {
   it("returns existing URL when start() is called twice without stop()", async () => {
     const { manager } = createManager([]);
diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts
index cb0955b830b..1258229735e 100644
--- a/extensions/voice-call/src/webhook.ts
+++ b/extensions/voice-call/src/webhook.ts
@@ -5,7 +5,7 @@ import {
   readRequestBodyWithLimit,
   requestBodyErrorToText,
 } from "openclaw/plugin-sdk/voice-call";
-import type { VoiceCallConfig } from "./config.js";
+import { normalizeVoiceCallConfig, type VoiceCallConfig } from "./config.js";
 import type { CoreConfig } from "./core-bridge.js";
 import type { CallManager } from "./manager.js";
 import type { MediaStreamConfig } from "./media-stream.js";
@@ -24,6 +24,37 @@ type WebhookResponsePayload = {
   headers?: Record<string, string>;
 };
 
+/**
+ * Builds an absolute URL for an incoming request from its raw path and Host header.
+ * Missing or empty values fall back to "/" and `fallbackHost`; `new URL` may still
+ * throw on a malformed Host, so callers that cannot tolerate that must catch.
+ */
+function buildRequestUrl(
+  requestUrl: string | undefined,
+  requestHost: string | undefined,
+  fallbackHost = "localhost",
+): URL {
+  // `||` (not `??`): an empty Host header would otherwise yield the invalid base "http://".
+  return new URL(requestUrl || "/", `http://${requestHost || fallbackHost}`);
+}
+
+/**
+ * Converts a provider parse result into the HTTP response payload.
+ * Uses `??` so an explicit empty body ("") is sent as-is; only a missing body becomes "OK"
+ * and only a missing status code becomes 200.
+ */
+function normalizeWebhookResponse(parsed: {
+  statusCode?: number;
+  providerResponseHeaders?: Record<string, string>;
+  providerResponseBody?: string;
+}): WebhookResponsePayload {
+  return {
+    statusCode: parsed.statusCode ?? 200,
+    headers: parsed.providerResponseHeaders,
+    body: parsed.providerResponseBody ?? "OK",
+  };
+}
+
 /**
  * HTTP server for receiving voice call webhooks from providers.
  * Supports WebSocket upgrades for media streams when streaming is enabled.
@@ -46,13 +77,13 @@ export class VoiceCallWebhookServer {
     provider: VoiceCallProvider,
     coreConfig?: CoreConfig,
   ) {
-    this.config = config;
+    this.config = normalizeVoiceCallConfig(config);
     this.manager = manager;
     this.provider = provider;
     this.coreConfig = coreConfig ?? null;
 
     // Initialize media stream handler if streaming is enabled
-    if (config.streaming?.enabled) {
+    if (this.config.streaming.enabled) {
       this.initializeMediaStreaming();
     }
   }
@@ -68,7 +99,9 @@ export class VoiceCallWebhookServer {
    * Initialize media streaming with OpenAI Realtime STT.
    */
   private initializeMediaStreaming(): void {
-    const apiKey = this.config.streaming?.openaiApiKey || process.env.OPENAI_API_KEY;
+    const streaming = this.config.streaming;
+    // `||` (not `??`): an empty configured key must still fall back to the environment.
+    const apiKey = streaming.openaiApiKey || process.env.OPENAI_API_KEY;
 
     if (!apiKey) {
       console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
@@ -77,17 +110,17 @@ export class VoiceCallWebhookServer {
 
     const sttProvider = new OpenAIRealtimeSTTProvider({
       apiKey,
-      model: this.config.streaming?.sttModel,
-      silenceDurationMs: this.config.streaming?.silenceDurationMs,
-      vadThreshold: this.config.streaming?.vadThreshold,
+      model: streaming.sttModel,
+      silenceDurationMs: streaming.silenceDurationMs,
+      vadThreshold: streaming.vadThreshold,
     });
 
     const streamConfig: MediaStreamConfig = {
       sttProvider,
-      preStartTimeoutMs: this.config.streaming?.preStartTimeoutMs,
-      maxPendingConnections: this.config.streaming?.maxPendingConnections,
-      maxPendingConnectionsPerIp: this.config.streaming?.maxPendingConnectionsPerIp,
-      maxConnections: this.config.streaming?.maxConnections,
+      preStartTimeoutMs: streaming.preStartTimeoutMs,
+      maxPendingConnections: streaming.maxPendingConnections,
+      maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
+      maxConnections: streaming.maxConnections,
       shouldAcceptStream: ({ callId, token }) => {
         const call = this.manager.getCallByProviderCallId(callId);
         if (!call) {
@@ -190,7 +223,7 @@ export class VoiceCallWebhookServer {
    */
   async start(): Promise<string> {
     const { port, bind, path: webhookPath } = this.config.serve;
-    const streamPath = this.config.streaming?.streamPath || "/voice/stream";
+    const streamPath = this.config.streaming.streamPath;
 
     // Guard: if a server is already listening, return the existing URL.
     // This prevents EADDRINUSE when start() is called more than once on the
@@ -280,8 +313,7 @@ export class VoiceCallWebhookServer {
 
   private getUpgradePathname(request: http.IncomingMessage): string | null {
     try {
-      const host = request.headers.host || "localhost";
-      return new URL(request.url || "/", `http://${host}`).pathname;
+      return buildRequestUrl(request.url, request.headers.host).pathname;
     } catch {
       return null;
     }
@@ -322,7 +354,7 @@ export class VoiceCallWebhookServer {
     req: http.IncomingMessage,
     webhookPath: string,
   ): Promise<WebhookResponsePayload> {
-    const url = new URL(req.url || "/", `http://${req.headers.host}`);
+    const url = buildRequestUrl(req.url, req.headers.host);
 
     if (url.pathname === "/voice/hold-music") {
       return {
@@ -360,7 +392,7 @@ export class VoiceCallWebhookServer {
     const ctx: WebhookContext = {
       headers: req.headers as Record<string, string | string[] | undefined>,
       rawBody: body,
-      url: `http://${req.headers.host}${req.url}`,
+      url: url.toString(),
       method: "POST",
       query: Object.fromEntries(url.searchParams),
       remoteAddress: req.socket.remoteAddress ?? undefined,
@@ -386,11 +418,7 @@ export class VoiceCallWebhookServer {
       this.processParsedEvents(parsed.events);
     }
 
-    return {
-      statusCode: parsed.statusCode || 200,
-      headers: parsed.providerResponseHeaders,
-      body: parsed.providerResponseBody || "OK",
-    };
+    return normalizeWebhookResponse(parsed);
   }
 
   private processParsedEvents(events: NormalizedEvent[]): void {