voice-call: fix tsgo type errors and formatting

- Import WebSocketServer by named export (not Server alias) from ws
- Add input_audio_transcription to RealtimeSessionUpdate interface
- Fix z.record() call to pass explicit key schema (zod strict compat)
- Cast DeepPartial tools to RealtimeToolConfig[] in normalizeVoiceCallConfig
- Update consumeStreamToken test casts to reflect new nullable return type
- Apply oxfmt formatting across voice-call extension files

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Forrest Blount 2026-03-12 17:43:10 +00:00
parent 1ea1695edc
commit 70bd1ffe92
9 changed files with 106 additions and 69 deletions

View File

@ -186,16 +186,16 @@ Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https:/
```json5
{
inboundPolicy: "open", // required: realtime needs inbound calls enabled
inboundPolicy: "open", // required: realtime needs inbound calls enabled
realtime: {
enabled: true,
voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral,
// echo, marin, sage, shimmer, verse
voice: "alloy", // Realtime API voices: alloy, ash, ballad, cedar, coral,
// echo, marin, sage, shimmer, verse
instructions: "You are a helpful assistant.",
model: "gpt-4o-mini-realtime-preview", // optional, this is the default
temperature: 0.8, // 0–2, optional
vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional
model: "gpt-4o-mini-realtime-preview", // optional, this is the default
temperature: 0.8, // 0–2, optional
vadThreshold: 0.5, // voice activity detection sensitivity, 0–1, optional
silenceDurationMs: 500, // ms of silence before end-of-turn, optional
},
}
@ -205,15 +205,15 @@ Realtime mode routes inbound calls directly to the [OpenAI Realtime API](https:/
All `realtime.*` fields can be set via environment variables (config takes precedence):
| Env var | Config field |
|---|---|
| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` |
| `REALTIME_VOICE_MODEL` | `realtime.model` |
| `REALTIME_VOICE_VOICE` | `realtime.voice` |
| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` |
| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` |
| `VAD_THRESHOLD` | `realtime.vadThreshold` |
| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` |
| Env var | Config field |
| ----------------------------- | ---------------------------- |
| `REALTIME_VOICE_ENABLED=true` | `realtime.enabled` |
| `REALTIME_VOICE_MODEL` | `realtime.model` |
| `REALTIME_VOICE_VOICE` | `realtime.voice` |
| `REALTIME_VOICE_INSTRUCTIONS` | `realtime.instructions` |
| `REALTIME_VOICE_TEMPERATURE` | `realtime.temperature` |
| `VAD_THRESHOLD` | `realtime.vadThreshold` |
| `SILENCE_DURATION_MS` | `realtime.silenceDurationMs` |
### How it works

View File

@ -427,7 +427,18 @@
},
"voice": {
"type": "string",
"enum": ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"]
"enum": [
"alloy",
"ash",
"ballad",
"cedar",
"coral",
"echo",
"marin",
"sage",
"shimmer",
"verse"
]
},
"instructions": {
"type": "string"

View File

@ -226,7 +226,18 @@ describe("VoiceCallRealtimeConfigSchema", () => {
});
it("accepts all valid Realtime API voice names", () => {
const voices = ["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"];
const voices = [
"alloy",
"ash",
"ballad",
"cedar",
"coral",
"echo",
"marin",
"sage",
"shimmer",
"verse",
];
for (const voice of voices) {
expect(() => VoiceCallRealtimeConfigSchema.parse({ voice })).not.toThrow();
}

View File

@ -215,7 +215,7 @@ export const RealtimeToolSchema = z
description: z.string(),
parameters: z.object({
type: z.literal("object"),
properties: z.record(z.unknown()),
properties: z.record(z.string(), z.unknown()),
required: z.array(z.string()).optional(),
}),
})
@ -230,7 +230,18 @@ export const VoiceCallRealtimeConfigSchema = z
model: z.string().optional(),
/** Voice for AI speech output (env: REALTIME_VOICE_VOICE) */
voice: z
.enum(["alloy", "ash", "ballad", "cedar", "coral", "echo", "marin", "sage", "shimmer", "verse"])
.enum([
"alloy",
"ash",
"ballad",
"cedar",
"coral",
"echo",
"marin",
"sage",
"shimmer",
"verse",
])
.optional(),
/** System instructions / persona (env: REALTIME_VOICE_INSTRUCTIONS) */
instructions: z.string().optional(),
@ -453,7 +464,10 @@ export function normalizeVoiceCallConfig(config: VoiceCallConfigInput): VoiceCal
realtime: {
...defaults.realtime,
...config.realtime,
tools: config.realtime?.tools ?? defaults.realtime.tools,
// Cast: DeepPartial makes tool fields appear optional in the input type,
// but Zod validates the full shape before it reaches here.
tools:
(config.realtime?.tools as RealtimeToolConfig[] | undefined) ?? defaults.realtime.tools,
},
stt: { ...defaults.stt, ...config.stt },
tts: normalizeVoiceCallTtsConfig(defaults.tts, config.tts),
@ -519,7 +533,8 @@ export function resolveVoiceCallConfig(config: VoiceCallConfigInput): VoiceCallC
resolved.realtime.model = resolved.realtime.model ?? process.env.REALTIME_VOICE_MODEL;
resolved.realtime.voice =
(resolved.realtime.voice ??
(process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) || undefined;
(process.env.REALTIME_VOICE_VOICE as VoiceCallRealtimeConfig["voice"])) ||
undefined;
resolved.realtime.instructions =
resolved.realtime.instructions ?? process.env.REALTIME_VOICE_INSTRUCTIONS;
if (resolved.realtime.temperature == null && process.env.REALTIME_VOICE_TEMPERATURE) {
@ -605,8 +620,8 @@ export function validateProviderConfig(config: VoiceCallConfig): {
// "open" or "allowlist" are the correct choices when realtime.enabled = true.
if (config.realtime?.enabled && config.inboundPolicy === "disabled") {
errors.push(
"plugins.entries.voice-call.config.inboundPolicy must not be \"disabled\" when realtime.enabled is true " +
"(use \"open\" or \"allowlist\" — realtime calls are answered before policy can reject them)",
'plugins.entries.voice-call.config.inboundPolicy must not be "disabled" when realtime.enabled is true ' +
'(use "open" or "allowlist" — realtime calls are answered before policy can reject them)',
);
}

View File

@ -450,9 +450,7 @@ export class OpenAIRealtimeVoiceBridge {
private async attemptReconnect(): Promise<void> {
if (this.intentionallyClosed) return;
if (
this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS
) {
if (this.reconnectAttempts >= OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS) {
const err = new Error(
`[RealtimeVoice] Max reconnect attempts (${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) exceeded`,
);
@ -464,8 +462,7 @@ export class OpenAIRealtimeVoiceBridge {
this.reconnectAttempts++;
const delay =
OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS *
2 ** (this.reconnectAttempts - 1);
OpenAIRealtimeVoiceBridge.BASE_RECONNECT_DELAY_MS * 2 ** (this.reconnectAttempts - 1);
console.log(
`[RealtimeVoice] Reconnecting (${this.reconnectAttempts}/${OpenAIRealtimeVoiceBridge.MAX_RECONNECT_ATTEMPTS}) in ${delay}ms...`,
@ -870,6 +867,7 @@ interface RealtimeSessionUpdate {
create_response: boolean;
};
temperature: number;
input_audio_transcription?: { model: string };
tools?: RealtimeTool[];
tool_choice?: string;
};

View File

@ -11,8 +11,8 @@ import type { TelephonyTtsRuntime } from "./telephony-tts.js";
import { createTelephonyTtsProvider } from "./telephony-tts.js";
import { startTunnel, type TunnelResult } from "./tunnel.js";
import { VoiceCallWebhookServer } from "./webhook.js";
import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js";
import { RealtimeCallHandler } from "./webhook/realtime-handler.js";
import { cleanupTailscaleExposure, setupTailscaleExposure } from "./webhook/tailscale.js";
export type VoiceCallRuntime = {
config: VoiceCallConfig;

View File

@ -14,8 +14,8 @@ import type { VoiceCallProvider } from "./providers/base.js";
import { OpenAIRealtimeSTTProvider } from "./providers/stt-openai-realtime.js";
import type { TwilioProvider } from "./providers/twilio.js";
import type { NormalizedEvent, WebhookContext } from "./types.js";
import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
import type { RealtimeCallHandler } from "./webhook/realtime-handler.js";
import { startStaleCallReaper } from "./webhook/stale-call-reaper.js";
const MAX_WEBHOOK_BODY_BYTES = 1024 * 1024;

View File

@ -1,8 +1,8 @@
import http from "node:http";
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
import type { CallManager } from "../manager.js";
import type { CallRecord } from "../types.js";
import type { VoiceCallProvider } from "../providers/base.js";
import type { CallRecord } from "../types.js";
import { RealtimeCallHandler } from "./realtime-handler.js";
/** Extract the stream token from a TwiML body string. */
@ -144,10 +144,14 @@ describe("RealtimeCallHandler", () => {
null,
);
const issue = (handler as unknown as { issueStreamToken: () => string }).issueStreamToken;
const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken;
const consume = (
handler as unknown as {
consumeStreamToken: (t: string) => { from?: string; to?: string } | null;
}
).consumeStreamToken;
const token = issue.call(handler);
expect(consume.call(handler, token)).toBe(true);
expect(consume.call(handler, token)).toBe(false);
expect(consume.call(handler, token)).not.toBeNull();
expect(consume.call(handler, token)).toBeNull();
});
it("rejects unknown tokens", () => {
@ -157,8 +161,12 @@ describe("RealtimeCallHandler", () => {
makeProvider(),
null,
);
const consume = (handler as unknown as { consumeStreamToken: (t: string) => boolean }).consumeStreamToken;
expect(consume.call(handler, "not-a-real-token")).toBe(false);
const consume = (
handler as unknown as {
consumeStreamToken: (t: string) => { from?: string; to?: string } | null;
}
).consumeStreamToken;
expect(consume.call(handler, "not-a-real-token")).toBeNull();
});
});
@ -173,22 +181,18 @@ describe("RealtimeCallHandler", () => {
});
const manager = makeManager(callRecord);
const handler = new RealtimeCallHandler(
baseRealtimeConfig,
manager,
makeProvider(),
null,
);
const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null);
// Access private method via type assertion for unit testing
(handler as unknown as { registerCallInManager: (sid: string) => string })
.registerCallInManager("CA_test");
(
handler as unknown as { registerCallInManager: (sid: string) => string }
).registerCallInManager("CA_test");
// call.initiated + call.answered should both have been emitted
expect(vi.mocked(manager.processEvent)).toHaveBeenCalledTimes(2);
const eventTypes = vi.mocked(manager.processEvent).mock.calls.map(
([e]) => (e as { type: string }).type,
);
const eventTypes = vi
.mocked(manager.processEvent)
.mock.calls.map(([e]) => (e as { type: string }).type);
expect(eventTypes).toEqual(["call.initiated", "call.answered"]);
// initialMessage must be cleared before call.answered fires
@ -199,15 +203,11 @@ describe("RealtimeCallHandler", () => {
const callRecord = makeCallRecord({ callId: "manager-gen-id" });
const manager = makeManager(callRecord);
const handler = new RealtimeCallHandler(
baseRealtimeConfig,
manager,
makeProvider(),
null,
);
const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null);
const result = (handler as unknown as { registerCallInManager: (sid: string) => string })
.registerCallInManager("CA_test");
const result = (
handler as unknown as { registerCallInManager: (sid: string) => string }
).registerCallInManager("CA_test");
expect(result).toBe("manager-gen-id");
});
@ -218,15 +218,11 @@ describe("RealtimeCallHandler", () => {
getCallByProviderCallId: vi.fn(() => undefined),
} as unknown as CallManager;
const handler = new RealtimeCallHandler(
baseRealtimeConfig,
manager,
makeProvider(),
null,
);
const handler = new RealtimeCallHandler(baseRealtimeConfig, manager, makeProvider(), null);
const result = (handler as unknown as { registerCallInManager: (sid: string) => string })
.registerCallInManager("CA_fallback");
const result = (
handler as unknown as { registerCallInManager: (sid: string) => string }
).registerCallInManager("CA_fallback");
expect(result).toBe("CA_fallback");
});

View File

@ -1,15 +1,15 @@
import http from "node:http";
import { randomUUID } from "node:crypto";
import http from "node:http";
import type { Duplex } from "node:stream";
import { type WebSocket, Server as WebSocketServer } from "ws";
import WebSocket, { WebSocketServer } from "ws";
import type { VoiceCallRealtimeConfig } from "../config.js";
import type { CoreConfig } from "../core-bridge.js";
import type { CallManager } from "../manager.js";
import type { VoiceCallProvider } from "../providers/base.js";
import {
OpenAIRealtimeVoiceBridge,
type RealtimeTool,
} from "../providers/openai-realtime-voice.js";
import type { VoiceCallProvider } from "../providers/base.js";
import type { NormalizedEvent } from "../types.js";
import type { WebhookResponsePayload } from "../webhook.js";
@ -124,7 +124,9 @@ export class RealtimeCallHandler {
to: params?.get("To") ?? undefined,
});
const wsUrl = `wss://${host}/voice/stream/realtime?token=${token}`;
console.log(`[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`);
console.log(
`[voice-call] Returning realtime TwiML with WebSocket: wss://${host}/voice/stream/realtime`,
);
const twiml = `<?xml version="1.0" encoding="UTF-8"?>
<Response>
<Connect>
@ -186,13 +188,17 @@ export class RealtimeCallHandler {
): OpenAIRealtimeVoiceBridge | null {
const apiKey = this.openaiApiKey ?? process.env.OPENAI_API_KEY;
if (!apiKey) {
console.error("[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)");
console.error(
"[voice-call] No OpenAI API key for realtime call (set streaming.openaiApiKey or OPENAI_API_KEY)",
);
ws.close(1011, "No API key");
return null;
}
const callId = this.registerCallInManager(callSid, callerMeta);
console.log(`[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`);
console.log(
`[voice-call] Realtime call: streamSid=${streamSid}, callSid=${callSid}, callId=${callId}`,
);
// Declare as null first so closures can capture the reference before bridge is created.
// By the time any callback fires, bridge will be fully assigned.