From 70bd1ffe927450af586044ffdbe3f96315ab2e61 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Thu, 12 Mar 2026 17:43:10 +0000 Subject: [PATCH] voice-call: fix tsgo type errors and formatting - Import WebSocketServer by named export (not Server alias) from ws - Add input_audio_transcription to RealtimeSessionUpdate interface - Fix z.record() call to pass explicit key schema (zod strict compat) - Cast DeepPartial tools to RealtimeToolConfig[] in normalizeVoiceCallConfig - Update consumeStreamToken test casts to reflect new nullable return type - Apply oxfmt formatting across voice-call extension files Co-Authored-By: Claude Sonnet 4.6 --- extensions/voice-call/README.md | 30 ++++----- extensions/voice-call/openclaw.plugin.json | 13 +++- extensions/voice-call/src/config.test.ts | 13 +++- extensions/voice-call/src/config.ts | 27 ++++++-- .../src/providers/openai-realtime-voice.ts | 8 +-- extensions/voice-call/src/runtime.ts | 2 +- extensions/voice-call/src/webhook.ts | 2 +- .../src/webhook/realtime-handler.test.ts | 62 +++++++++---------- .../src/webhook/realtime-handler.ts | 18 ++++-- 9 files changed, 106 insertions(+), 69 deletions(-) diff --git a/extensions/voice-call/README.md b/extensions/voice-call/README.md index 29a66d28540..6019d87661f 100644 --- a/extensions/voice-call/README.md +++ b/extensions/voice-call/README.md @@ -186,16 +186,16 @@ Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https:/ ```json5 { - inboundPolicy: "open", // required: realtime needs inbound calls enabled + inboundPolicy: "open", // required: realtime needs inbound calls enabled realtime: { enabled: true, - voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral, - // echo, marin, sage, shimmer, verse + voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral, + // echo, marin, sage, shimmer, verse instructions: "You are a helpful assistant.", - model: "gpt-4o-mini-realtime-preview", // optional, this is the default - temperature: 0.8, // 0–2, optional - vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional + model: "gpt-4o-mini-realtime-preview", // optional, this is the default + temperature: 0.8, // 0–2, optional + vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional silenceDurationMs: 500, // ms of silence before end-of-turn, optional }, } @@ -205,15 +205,15 @@ Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https:/ All `realtime.*` fields can be set via environment variables (config takes precedence): -| Env var | Config field | -|---|---| -| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` | -| `REALTIME_VOICE_MODEL` | `realtime.model` | -| `REALTIME_VOICE_VOICE` | `realtime.voice` | -| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` | -| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` | -| `VAD_THRESHOLD` | `realtime.vadThreshold` | -| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` | +| Env var | Config field | +| ----------------------------- | ---------------------------- | +| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` | +| `REALTIME_VOICE_MODEL` | `realtime.model` | +| `REALTIME_VOICE_VOICE` | `realtime.voice` | +| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` | +| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` | +| `VAD_THRESHOLD` | `realtime.vadThreshold` | +| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` | ### How it works diff --git a/extensions/voice-call/openclaw.plugin.json b/extensions/voice-call/openclaw.plugin.json index 83f2613530d..4a2a5f94433 100644 --- a/extensions/voice-call/openclaw.plugin.json +++ b/extensions/voice-call/openclaw.plugin.json @@ -427,7 +427,18 @@ }, "voice": { "type": "string", - "enum": ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"] + "enum": [ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "marin", + "sage", + "shimmer", + "verse" + ] }, "instructions": { "type": "string" diff --git a/extensions/voice-call/src/config.test.ts b/extensions/voice-call/src/config.test.ts index 899a12af5a7..ceef8d57d9a 100644 --- a/extensions/voice-call/src/config.test.ts +++ b/extensions/voice-call/src/config.test.ts @@ -226,7 +226,18 @@ describe("VoiceCallRealtimeConfigSchema", () => { }); it("accepts all valid Realtime API voice names", () => { - const voices = ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]; + const voices = [ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "marin", + "sage", + "shimmer", + "verse", + ]; for (const voice of voices) { expect(() => VoiceCallRealtimeConfigSchema.parse({ voice })).not.toThrow(); } diff --git a/extensions/voice-call/src/config.ts b/extensions/voice-call/src/config.ts index 7d75c23ac29..2ace2283608 100644 --- a/extensions/voice-call/src/config.ts +++ b/extensions/voice-call/src/config.ts @@ -215,7 +215,7 @@ export const RealtimeToolSchema = z description: z.string(), parameters: z.object({ type: z.literal("object"), - properties: z.record(z.unknown()), + properties: z.record(z.string(), z.unknown()), required: z.array(z.string()).optional(), }), }) @@ -230,7 +230,18 @@ export const VoiceCallRealtimeConfigSchema = z model: z.string().optional(), /** Voice for AI speech output (env: REALTIME_VOICE_VOICE) */ voice: z - .enum(["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]) + .enum([ + "alloy", + "ash", + "ballad", + "cedar", + "coral", + "echo", + "marin", + "sage", + "shimmer", + "verse", + ]) .optional(), /** System instructions / persona (env: REALTIME_VOICE_INSTRUCTIONS) */ instructions: z.string().optional(), @@ -453,7 +464,10 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal realtime: { ...defaults.realtime, ...config.realtime, - tools: config.realtime?.tools ?? defaults.realtime.tools, + // Cast: DeepPartial makes tool fields appear optional in the input type, + // but Zod validates the full shape before it reaches here. + tools: + (config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools, }, stt: { ...defaults.stt, ...config.stt }, tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts), @@ -519,7 +533,8 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC resolved.realtime.model = resolved.realtime.model ?? process.env.REALTIME_VOICE_MODEL; resolved.realtime.voice = (resolved.realtime.voice ?? - (process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) || undefined; + (process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) || + undefined; resolved.realtime.instructions = resolved.realtime.instructions ?? process.env.REALTIME_VOICE_INSTRUCTIONS; if (resolved.realtime.temperature == null && process.env.REALTIME_VOICE_TEMPERATURE) { @@ -605,8 +620,8 @@ export function validateProviderConfig(config: VoiceCallConfig): { // "open" or "allowlist" are the correct choices when realtime.enabled = true. if (config.realtime?.enabled && config.inboundPolicy === "disabled") { errors.push( - "plugins.entries.voice-call.config.inboundPolicy must not be \"disabled\" when realtime.enabled is true " + - "(use \"open\" or \"allowlist\" — realtime calls are answered before policy can reject them)", + 'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true ' + + '(use "open" or "allowlist" — realtime calls are answered before policy can reject them)', ); } diff --git a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts index 60ac63375c1..42f7b7949f1 100755 --- a/extensions/voice-call/src/providers/openai-realtime-voice.ts +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -450,9 +450,7 @@ export class OpenAIRealtimeVoiceBridge { private async attemptReconnect(): Promise { if (this.intentionallyClosed) return; - if ( - this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS - ) { + if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) { const err = new Error( `[RealtimeVoice] Max reconnect attempts (${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) exceeded`, ); @@ -464,8 +462,7 @@ export class OpenAIRealtimeVoiceBridge { this.reconnectAttempts++; const delay = - OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * - 2 ** (this.reconnectAttempts - 1); + OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1); console.log( `[RealtimeVoice] Reconnecting (${this.reconnectAttempts}/${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) in ${delay}ms...`, @@ -870,6 +867,7 @@ interface RealtimeSessionUpdate { create_response: boolean; }; temperature: number; + input_audio_transcription?: { model: string }; tools?: RealtimeTool[]; tool_choice?: string; }; diff --git a/extensions/voice-call/src/runtime.ts b/extensions/voice-call/src/runtime.ts index 7bad25ee984..7da90634774 100644 --- a/extensions/voice-call/src/runtime.ts +++ b/extensions/voice-call/src/runtime.ts @@ -11,8 +11,8 @@ import type { TelephonyTtsRuntime } from "./telephony-tts.js"; import { createTelephonyTtsProvider } from "./telephony-tts.js"; import { startTunnel, type TunnelResult } from "./tunnel.js"; import { VoiceCallWebhookServer } from "./webhook.js"; -import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js"; import { RealtimeCallHandler } from "./webhook/realtime-handler.js"; +import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js"; export type VoiceCallRuntime = { config: VoiceCallConfig; diff --git a/extensions/voice-call/src/webhook.ts b/extensions/voice-call/src/webhook.ts index fea71e83090..be3cce5f399 100644 --- a/extensions/voice-call/src/webhook.ts +++ b/extensions/voice-call/src/webhook.ts @@ -14,8 +14,8 @@ import type { VoiceCallProvider } from "./providers/base.js"; import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js"; import type { TwilioProvider } from "./providers/twilio.js"; import type { NormalizedEvent, WebhookContext } from "./types.js"; -import { startStaleCallReaper } from "./webhook/stale-call-reaper.js"; import type { RealtimeCallHandler } from "./webhook/realtime-handler.js"; +import { startStaleCallReaper } from "./webhook/stale-call-reaper.js"; const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024; diff --git a/extensions/voice-call/src/webhook/realtime-handler.test.ts b/extensions/voice-call/src/webhook/realtime-handler.test.ts index 755b21cc7cb..07107f08601 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.test.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.test.ts @@ -1,8 +1,8 @@ import http from "node:http"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; import type { CallManager } from "../manager.js"; -import type { CallRecord } from "../types.js"; import type { VoiceCallProvider } from "../providers/base.js"; +import type { CallRecord } from "../types.js"; import { RealtimeCallHandler } from "./realtime-handler.js"; /** Extract the stream token from a TwiML body string. */ @@ -144,10 +144,14 @@ describe("RealtimeCallHandler", () => { null, ); const issue = (handler as unknown as { issueStreamToken: () => string }).issueStreamToken; - const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken; + const consume = ( + handler as unknown as { + consumeStreamToken: (t: string) => { from?: string; to?: string } | null; + } + ).consumeStreamToken; const token = issue.call(handler); - expect(consume.call(handler, token)).toBe(true); - expect(consume.call(handler, token)).toBe(false); + expect(consume.call(handler, token)).not.toBeNull(); + expect(consume.call(handler, token)).toBeNull(); }); it("rejects unknown tokens", () => { @@ -157,8 +161,12 @@ describe("RealtimeCallHandler", () => { makeProvider(), null, ); - const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken; - expect(consume.call(handler, "not-a-real-token")).toBe(false); + const consume = ( + handler as unknown as { + consumeStreamToken: (t: string) => { from?: string; to?: string } | null; + } + ).consumeStreamToken; + expect(consume.call(handler, "not-a-real-token")).toBeNull(); }); }); @@ -173,22 +181,18 @@ describe("RealtimeCallHandler", () => { }); const manager = makeManager(callRecord); - const handler = new RealtimeCallHandler( - baseRealtimeConfig, - manager, - makeProvider(), - null, - ); + const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null); // Access private method via type assertion for unit testing - (handler as unknown as { registerCallInManager: (sid: string) => string }) - .registerCallInManager("CA_test"); + ( + handler as unknown as { registerCallInManager: (sid: string) => string } + ).registerCallInManager("CA_test"); // call.initiated + call.answered should both have been emitted expect(vi.mocked(manager.processEvent)).toHaveBeenCalledTimes(2); - const eventTypes = vi.mocked(manager.processEvent).mock.calls.map( - ([e]) => (e as { type: string }).type, - ); + const eventTypes = vi + .mocked(manager.processEvent) + .mock.calls.map(([e]) => (e as { type: string }).type); expect(eventTypes).toEqual(["call.initiated", "call.answered"]); // initialMessage must be cleared before call.answered fires @@ -199,15 +203,11 @@ describe("RealtimeCallHandler", () => { const callRecord = makeCallRecord({ callId: "manager-gen-id" }); const manager = makeManager(callRecord); - const handler = new RealtimeCallHandler( - baseRealtimeConfig, - manager, - makeProvider(), - null, - ); + const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null); - const result = (handler as unknown as { registerCallInManager: (sid: string) => string }) - .registerCallInManager("CA_test"); + const result = ( + handler as unknown as { registerCallInManager: (sid: string) => string } + ).registerCallInManager("CA_test"); expect(result).toBe("manager-gen-id"); }); @@ -218,15 +218,11 @@ describe("RealtimeCallHandler", () => { getCallByProviderCallId: vi.fn(() => undefined), } as unknown as CallManager; - const handler = new RealtimeCallHandler( - baseRealtimeConfig, - manager, - makeProvider(), - null, - ); + const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null); - const result = (handler as unknown as { registerCallInManager: (sid: string) => string }) - .registerCallInManager("CA_fallback"); + const result = ( + handler as unknown as { registerCallInManager: (sid: string) => string } + ).registerCallInManager("CA_fallback"); expect(result).toBe("CA_fallback"); }); diff --git a/extensions/voice-call/src/webhook/realtime-handler.ts b/extensions/voice-call/src/webhook/realtime-handler.ts index 304ee547384..56796828760 100644 --- a/extensions/voice-call/src/webhook/realtime-handler.ts +++ b/extensions/voice-call/src/webhook/realtime-handler.ts @@ -1,15 +1,15 @@ -import http from "node:http"; import { randomUUID } from "node:crypto"; +import http from "node:http"; import type { Duplex } from "node:stream"; -import { type WebSocket, Server as WebSocketServer } from "ws"; +import WebSocket, { WebSocketServer } from "ws"; import type { VoiceCallRealtimeConfig } from "../config.js"; import type { CoreConfig } from "../core-bridge.js"; import type { CallManager } from "../manager.js"; +import type { VoiceCallProvider } from "../providers/base.js"; import { OpenAIRealtimeVoiceBridge, type RealtimeTool, } from "../providers/openai-realtime-voice.js"; -import type { VoiceCallProvider } from "../providers/base.js"; import type { NormalizedEvent } from "../types.js"; import type { WebhookResponsePayload } from "../webhook.js"; @@ -124,7 +124,9 @@ export class RealtimeCallHandler { to: params?.get("To") ?? undefined, }); const wsUrl = `wss://${host}/voice/stream/realtime?token=${token}`; - console.log(`[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`); + console.log( + `[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`, + ); const twiml = ` @@ -186,13 +188,17 @@ export class RealtimeCallHandler { ): OpenAIRealtimeVoiceBridge | null { const apiKey = this.openaiApiKey ?? process.env.OPENAI_API_KEY; if (!apiKey) { - console.error("[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)"); + console.error( + "[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)", + ); ws.close(1011, "No API key"); return null; } const callId = this.registerCallInManager(callSid, callerMeta); - console.log(`[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`); + console.log( + `[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`, + ); // Declare as null first so closures can capture the reference before bridge is created. // By the time any callback fires, bridge will be fully assigned.