From e554a0ea76ee50099ae26e8885b41ade3ff55e69 Mon Sep 17 00:00:00 2001 From: Forrest Blount Date: Wed, 11 Mar 2026 21:02:19 +0000 Subject: [PATCH] voice-call: bridge reliability and clarity improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fire drainPendingAudio + onReady in session.created handler instead of a fixed 100ms setTimeout after WS open; onReady is guarded by a sessionReadyFired flag so it fires exactly once (not on reconnects) - Add comment explaining why input_audio_transcription uses whisper-1 (Realtime API only; distinct from streaming.sttModel) - Mark RealtimeVoiceProviderConfig, createBridgeForStream, and MediaStreamHandlerLike as @internal — not used by the built-in handler but kept for external consumers Co-Authored-By: Claude Sonnet 4.6 --- .../src/providers/openai-realtime-voice.ts | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/extensions/voice-call/src/providers/openai-realtime-voice.ts b/extensions/voice-call/src/providers/openai-realtime-voice.ts index ee788904819..97118dc778a 100755 --- a/extensions/voice-call/src/providers/openai-realtime-voice.ts +++ b/extensions/voice-call/src/providers/openai-realtime-voice.ts @@ -199,6 +199,9 @@ export class OpenAIRealtimeVoiceBridge { /** Accumulate tool call arguments (streamed as deltas) */ private toolCallBuffers = new Map(); + /** Guards onReady/greeting so it fires only on the first session, not reconnects */ + private sessionReadyFired = false; + constructor(config: RealtimeVoiceConfig) { if (!config.apiKey) { throw new Error("[RealtimeVoice] OpenAI API key is required"); @@ -333,15 +336,10 @@ export class OpenAIRealtimeVoiceBridge { console.log("[RealtimeVoice] WebSocket connected"); this.connected = true; this.reconnectAttempts = 0; - - // Small delay to ensure the server is ready before sending session.update - // (mirrors the reference implementation's setTimeout(initializeSession, 100)) - 
setTimeout(() => { - this.sendSessionUpdate(); - this.drainPendingAudio(); - this.config.onReady?.(); - resolve(); - }, 100); + // Send session config immediately — no need to wait. The server emits + // session.created once the connection is up; that handler triggers drain + onReady. + this.sendSessionUpdate(); + resolve(); }); this.ws.on("message", (data: Buffer) => { @@ -409,6 +407,9 @@ export class OpenAIRealtimeVoiceBridge { voice: cfg.voice ?? "alloy", input_audio_format: "g711_ulaw", output_audio_format: "g711_ulaw", + // whisper-1 is the only model currently supported by the Realtime API for + // inline user-speech transcription. This is distinct from the streaming + // STT path (streaming.sttModel) which uses gpt-4o-transcribe. input_audio_transcription: { model: "whisper-1", }, @@ -491,6 +492,12 @@ export class OpenAIRealtimeVoiceBridge { // ---- Session lifecycle ---- case "session.created": console.log("[RealtimeVoice] Session created"); + this.drainPendingAudio(); + // Fire onReady exactly once — not on reconnects (greeting already played) + if (!this.sessionReadyFired) { + this.sessionReadyFired = true; + this.config.onReady?.(); + } break; case "session.updated": @@ -716,6 +723,7 @@ export class OpenAIRealtimeVoiceBridge { /** * Configuration for the provider factory. * Holds shared/default settings; per-call config is passed to createSession(). + * @internal Not used by the plugin's built-in realtime handler; exposed for external consumers. */ export interface RealtimeVoiceProviderConfig { /** OpenAI API key */ @@ -784,6 +792,7 @@ export class OpenAIRealtimeVoiceProvider { /** * Minimal interface that the bridge integration needs from MediaStreamHandler. * This matches the actual MediaStreamHandler's method signatures. + * @internal Not used by the plugin's built-in realtime handler; exposed for external consumers. 
*/ export interface MediaStreamHandlerLike { sendAudio(streamSid: string, muLaw: Buffer): void; @@ -793,6 +802,7 @@ export interface MediaStreamHandlerLike { /** * Create a RealtimeVoiceBridge wired to an existing MediaStreamHandler session. + * @internal Not used by the plugin's built-in realtime handler; exposed for external consumers. * * Drop-in helper for use inside media-stream.ts handleStart(): *