refactor: normalize voice-call runtime defaults

This commit is contained in:
Peter Steinberger 2026-03-08 02:49:40 +00:00
parent 5759b93dda
commit 3087893ef9
6 changed files with 151 additions and 30 deletions

View File

@ -1,5 +1,10 @@
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { validateProviderConfig, resolveVoiceCallConfig, type VoiceCallConfig } from "./config.js";
import {
validateProviderConfig,
normalizeVoiceCallConfig,
resolveVoiceCallConfig,
type VoiceCallConfig,
} from "./config.js";
import { createVoiceCallBaseConfig } from "./test-fixtures.js";
function createBaseConfig(provider: "telnyx" | "twilio" | "plivo" | "mock"): VoiceCallConfig {
@ -166,3 +171,22 @@ describe("validateProviderConfig", () => {
});
});
});
describe("normalizeVoiceCallConfig", () => {
  it("fills nested runtime defaults from a partial config boundary", () => {
    // Only `streaming` is partially provided; every other section is omitted.
    const result = normalizeVoiceCallConfig({
      enabled: true,
      provider: "mock",
      streaming: {
        enabled: true,
        streamPath: "/custom-stream",
      },
    });

    // Omitted sections are filled from schema defaults.
    expect(result.serve.path).toBe("/voice/webhook");
    // Explicit override survives while sibling keys fall back to defaults.
    expect(result.streaming.streamPath).toBe("/custom-stream");
    expect(result.streaming.sttModel).toBe("gpt-4o-transcribe");
    expect(result.tunnel.provider).toBe("none");
    expect(result.webhookSecurity.allowedHosts).toEqual([]);
  });
});

View File

@ -350,17 +350,53 @@ export const VoiceCallConfigSchema = z
.strict();
export type VoiceCallConfig = z.infer<typeof VoiceCallConfigSchema>;
/**
 * Recursively marks every property of T as optional so callers can supply
 * partially-specified nested config sections. Array elements are made
 * deep-partial as well, but the array itself keeps its array shape.
 */
type DeepPartial<T> =
T extends Array<infer U>
? DeepPartial<U>[]
: T extends object
? { [K in keyof T]?: DeepPartial<T[K]> }
: T;
/** Deeply-optional form of VoiceCallConfig accepted at config entry points. */
export type VoiceCallConfigInput = DeepPartial<VoiceCallConfig>;
// -----------------------------------------------------------------------------
// Configuration Helpers
// -----------------------------------------------------------------------------
// Baseline config: parsing an empty object through the schema applies every
// schema-level default exactly once, at module load.
const DEFAULT_VOICE_CALL_CONFIG = VoiceCallConfigSchema.parse({});
// Returns a fresh deep copy of the defaults so callers can mutate the result
// without corrupting the shared DEFAULT_VOICE_CALL_CONFIG instance.
function cloneDefaultVoiceCallConfig(): VoiceCallConfig {
return structuredClone(DEFAULT_VOICE_CALL_CONFIG);
}
/**
 * Expands a deep-partial config into a complete VoiceCallConfig by layering
 * the caller's values over a fresh copy of the schema defaults.
 *
 * Merge semantics (order matters):
 * - Top-level spread of `config` first, then explicit per-section merges, so
 *   nested sections are shallow-merged rather than wholesale-replaced.
 * - The `?? defaults.*` re-fills guard against own-properties that are
 *   explicitly `undefined` (a plain spread would let those clobber defaults).
 * - Nested merges are shallow (one level deep per section).
 */
export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
const defaults = cloneDefaultVoiceCallConfig();
return {
...defaults,
...config,
// Array: take the caller's array wholesale, or fall back to the default.
allowFrom: config.allowFrom ?? defaults.allowFrom,
outbound: { ...defaults.outbound, ...config.outbound },
serve: { ...defaults.serve, ...config.serve },
tailscale: { ...defaults.tailscale, ...config.tailscale },
tunnel: { ...defaults.tunnel, ...config.tunnel },
webhookSecurity: {
...defaults.webhookSecurity,
...config.webhookSecurity,
// Arrays inside the section need explicit ?? fallbacks: the spread above
// would leave them undefined when the caller passed them as undefined.
allowedHosts: config.webhookSecurity?.allowedHosts ?? defaults.webhookSecurity.allowedHosts,
trustedProxyIPs:
config.webhookSecurity?.trustedProxyIPs ?? defaults.webhookSecurity.trustedProxyIPs,
},
streaming: { ...defaults.streaming, ...config.streaming },
stt: { ...defaults.stt, ...config.stt },
// tts is taken as-is (no shallow merge) — caller value wins wholesale.
tts: config.tts ?? defaults.tts,
};
}
/**
* Resolves the configuration by merging environment variables into missing fields.
* Returns a new configuration object with environment variables applied.
*/
export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig {
const resolved = JSON.parse(JSON.stringify(config)) as VoiceCallConfig;
export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallConfig {
const resolved = normalizeVoiceCallConfig(config);
// Telnyx
if (resolved.provider === "telnyx") {
@ -405,7 +441,7 @@ export function resolveVoiceCallConfig(config: VoiceCallConfig): VoiceCallConfig
resolved.webhookSecurity.trustForwardingHeaders ?? false;
resolved.webhookSecurity.trustedProxyIPs = resolved.webhookSecurity.trustedProxyIPs ?? [];
return resolved;
return normalizeVoiceCallConfig(resolved);
}
/**

View File

@ -3,6 +3,8 @@ import type { OpenAITTSConfig } from "./tts-openai.js";
import { OpenAITTSProvider } from "./tts-openai.js";
// Shape of the private fields these tests read off an OpenAITTSProvider
// instance (accessed via the readProviderInternals test helper).
type ProviderInternals = {
model: string;
voice: string;
speed: number;
};
@ -27,4 +29,15 @@ describe("OpenAITTSProvider constructor defaults", () => {
expect(provider.speed).toBe(1.0);
});
it("treats blank model and voice overrides as unset", () => {
  // Whitespace-only / empty-string overrides should be ignored in favor of
  // the constructor defaults.
  const internals = readProviderInternals({
    apiKey: "sk-test", // pragma: allowlist secret
    model: " ",
    voice: "",
  });
  expect(internals.model).toBe("gpt-4o-mini-tts");
  expect(internals.voice).toBe("coral");
});
});

View File

@ -66,6 +66,11 @@ export const OPENAI_TTS_VOICES = [
export type OpenAITTSVoice = (typeof OPENAI_TTS_VOICES)[number];
/**
 * Collapses `undefined` and whitespace-only strings to `undefined`; otherwise
 * returns the trimmed string. Lets blank config values behave as "unset".
 */
function trimToUndefined(value: string | undefined): string | undefined {
  if (value === undefined) {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
/**
* OpenAI TTS Provider for generating speech audio.
*/
@ -77,13 +82,14 @@ export class OpenAITTSProvider {
private instructions?: string;
constructor(config: OpenAITTSConfig = {}) {
this.apiKey = config.apiKey || process.env.OPENAI_API_KEY || "";
this.apiKey =
trimToUndefined(config.apiKey) ?? trimToUndefined(process.env.OPENAI_API_KEY) ?? "";
// Default to gpt-4o-mini-tts for intelligent realtime applications
this.model = config.model || "gpt-4o-mini-tts";
this.model = trimToUndefined(config.model) ?? "gpt-4o-mini-tts";
// Default to coral - good balance of quality and natural tone
this.voice = (config.voice as OpenAITTSVoice) || "coral";
this.voice = (trimToUndefined(config.voice) as OpenAITTSVoice | undefined) ?? "coral";
this.speed = config.speed ?? 1.0;
this.instructions = config.instructions;
this.instructions = trimToUndefined(config.instructions);
if (!this.apiKey) {
throw new Error("OpenAI API key required (set OPENAI_API_KEY or pass apiKey)");
@ -105,7 +111,7 @@ export class OpenAITTSProvider {
};
// Add instructions if using gpt-4o-mini-tts model
const effectiveInstructions = instructions || this.instructions;
const effectiveInstructions = trimToUndefined(instructions) ?? this.instructions;
if (effectiveInstructions && this.model.includes("gpt-4o-mini-tts")) {
body.instructions = effectiveInstructions;
}

View File

@ -274,6 +274,32 @@ describe("VoiceCallWebhookServer replay handling", () => {
});
});
describe("VoiceCallWebhookServer response normalization", () => {
  it("preserves explicit empty provider response bodies", async () => {
    // Provider stub that answers with 204 and a deliberately empty body.
    const emptyBodyProvider: VoiceCallProvider = {
      ...provider,
      parseWebhookEvent: () => ({
        events: [],
        statusCode: 204,
        providerResponseBody: "",
      }),
    };
    const { manager } = createManager([]);
    const config = createConfig({ serve: { port: 0, bind: "127.0.0.1", path: "/voice/webhook" } });
    const server = new VoiceCallWebhookServer(config, manager, emptyBodyProvider);
    try {
      const baseUrl = await server.start();
      const response = await postWebhookForm(server, baseUrl, "CallSid=CA123&SpeechResult=hello");
      // The empty string must survive normalization rather than become "OK".
      expect(response.status).toBe(204);
      expect(await response.text()).toBe("");
    } finally {
      await server.stop();
    }
  });
});
describe("VoiceCallWebhookServer start idempotency", () => {
it("returns existing URL when start() is called twice without stop()", async () => {
const { manager } = createManager([]);

View File

@ -5,7 +5,7 @@ import {
readRequestBodyWithLimit,
requestBodyErrorToText,
} from "openclaw/plugin-sdk/voice-call";
import type { VoiceCallConfig } from "./config.js";
import { normalizeVoiceCallConfig, type VoiceCallConfig } from "./config.js";
import type { CoreConfig } from "./core-bridge.js";
import type { CallManager } from "./manager.js";
import type { MediaStreamConfig } from "./media-stream.js";
@ -24,6 +24,26 @@ type WebhookResponsePayload = {
headers?: Record<string, string>;
};
/**
 * Builds an absolute URL for an incoming HTTP request, defaulting a missing
 * request path to "/" and a missing Host header to `fallbackHost`.
 * May throw (like the URL constructor) on a malformed host — callers catch.
 */
function buildRequestUrl(
  requestUrl: string | undefined,
  requestHost: string | undefined,
  fallbackHost = "localhost",
): URL {
  const path = requestUrl ?? "/";
  const host = requestHost ?? fallbackHost;
  return new URL(path, `http://${host}`);
}
/**
 * Maps a provider's parsed webhook result onto the HTTP response payload,
 * defaulting the status to 200 and the body to "OK". An explicitly empty
 * body ("") is preserved rather than replaced with the default.
 */
function normalizeWebhookResponse(parsed: {
  statusCode?: number;
  providerResponseHeaders?: Record<string, string>;
  providerResponseBody?: string;
}): WebhookResponsePayload {
  const { statusCode, providerResponseHeaders, providerResponseBody } = parsed;
  return {
    statusCode: statusCode ?? 200,
    headers: providerResponseHeaders,
    body: providerResponseBody ?? "OK",
  };
}
/**
* HTTP server for receiving voice call webhooks from providers.
* Supports WebSocket upgrades for media streams when streaming is enabled.
@ -46,13 +66,13 @@ export class VoiceCallWebhookServer {
provider: VoiceCallProvider,
coreConfig?: CoreConfig,
) {
this.config = config;
this.config = normalizeVoiceCallConfig(config);
this.manager = manager;
this.provider = provider;
this.coreConfig = coreConfig ?? null;
// Initialize media stream handler if streaming is enabled
if (config.streaming?.enabled) {
if (this.config.streaming.enabled) {
this.initializeMediaStreaming();
}
}
@ -68,7 +88,8 @@ export class VoiceCallWebhookServer {
* Initialize media streaming with OpenAI Realtime STT.
*/
private initializeMediaStreaming(): void {
const apiKey = this.config.streaming?.openaiApiKey || process.env.OPENAI_API_KEY;
const streaming = this.config.streaming;
const apiKey = streaming.openaiApiKey ?? process.env.OPENAI_API_KEY;
if (!apiKey) {
console.warn("[voice-call] Streaming enabled but no OpenAI API key found");
@ -77,17 +98,17 @@ export class VoiceCallWebhookServer {
const sttProvider = new OpenAIRealtimeSTTProvider({
apiKey,
model: this.config.streaming?.sttModel,
silenceDurationMs: this.config.streaming?.silenceDurationMs,
vadThreshold: this.config.streaming?.vadThreshold,
model: streaming.sttModel,
silenceDurationMs: streaming.silenceDurationMs,
vadThreshold: streaming.vadThreshold,
});
const streamConfig: MediaStreamConfig = {
sttProvider,
preStartTimeoutMs: this.config.streaming?.preStartTimeoutMs,
maxPendingConnections: this.config.streaming?.maxPendingConnections,
maxPendingConnectionsPerIp: this.config.streaming?.maxPendingConnectionsPerIp,
maxConnections: this.config.streaming?.maxConnections,
preStartTimeoutMs: streaming.preStartTimeoutMs,
maxPendingConnections: streaming.maxPendingConnections,
maxPendingConnectionsPerIp: streaming.maxPendingConnectionsPerIp,
maxConnections: streaming.maxConnections,
shouldAcceptStream: ({ callId, token }) => {
const call = this.manager.getCallByProviderCallId(callId);
if (!call) {
@ -190,7 +211,7 @@ export class VoiceCallWebhookServer {
*/
async start(): Promise<string> {
const { port, bind, path: webhookPath } = this.config.serve;
const streamPath = this.config.streaming?.streamPath || "/voice/stream";
const streamPath = this.config.streaming.streamPath;
// Guard: if a server is already listening, return the existing URL.
// This prevents EADDRINUSE when start() is called more than once on the
@ -280,8 +301,7 @@ export class VoiceCallWebhookServer {
private getUpgradePathname(request: http.IncomingMessage): string | null {
try {
const host = request.headers.host || "localhost";
return new URL(request.url || "/", `http://${host}`).pathname;
return buildRequestUrl(request.url, request.headers.host).pathname;
} catch {
return null;
}
@ -322,7 +342,7 @@ export class VoiceCallWebhookServer {
req: http.IncomingMessage,
webhookPath: string,
): Promise<WebhookResponsePayload> {
const url = new URL(req.url || "/", `http://${req.headers.host}`);
const url = buildRequestUrl(req.url, req.headers.host);
if (url.pathname === "/voice/hold-music") {
return {
@ -360,7 +380,7 @@ export class VoiceCallWebhookServer {
const ctx: WebhookContext = {
headers: req.headers as Record<string, string | string[] | undefined>,
rawBody: body,
url: `http://${req.headers.host}${req.url}`,
url: url.toString(),
method: "POST",
query: Object.fromEntries(url.searchParams),
remoteAddress: req.socket.remoteAddress ?? undefined,
@ -386,11 +406,7 @@ export class VoiceCallWebhookServer {
this.processParsedEvents(parsed.events);
}
return {
statusCode: parsed.statusCode || 200,
headers: parsed.providerResponseHeaders,
body: parsed.providerResponseBody || "OK",
};
return normalizeWebhookResponse(parsed);
}
private processParsedEvents(events: NormalizedEvent[]): void {