diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt deleted file mode 100644 index ff13cf73911..00000000000 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/ElevenLabsStreamingTts.kt +++ /dev/null @@ -1,338 +0,0 @@ -package ai.openclaw.app.voice - -import android.media.AudioAttributes -import android.media.AudioFormat -import android.media.AudioManager -import android.media.AudioTrack -import android.util.Base64 -import android.util.Log -import kotlinx.coroutines.* -import kotlinx.coroutines.flow.MutableStateFlow -import kotlinx.coroutines.flow.StateFlow -import okhttp3.* -import org.json.JSONObject -import kotlin.math.max - -/** - * Streams text chunks to ElevenLabs WebSocket API and plays audio in real-time. - * - * Usage: - * 1. Create instance with voice/API config - * 2. Call [start] to open WebSocket + AudioTrack - * 3. Call [sendText] with incremental text chunks as they arrive - * 4. Call [finish] when the full response is ready (sends EOS to ElevenLabs) - * 5. Call [stop] to cancel/cleanup at any time - * - * Audio playback begins as soon as the first audio chunk arrives from ElevenLabs, - * typically within ~100ms of the first text chunk for eleven_flash_v2_5. - * - * Note: eleven_v3 does NOT support WebSocket streaming. Use eleven_flash_v2_5 - * or eleven_flash_v2 for lowest latency. - */ -class ElevenLabsStreamingTts( - private val scope: CoroutineScope, - private val voiceId: String, - private val apiKey: String, - private val modelId: String = "eleven_flash_v2_5", - private val outputFormat: String = "pcm_24000", - private val sampleRate: Int = 24000, -) { - companion object { - private const val TAG = "ElevenLabsStreamTTS" - private const val BASE_URL = "wss://api.elevenlabs.io/v1/text-to-speech" - - /** Models that support WebSocket input streaming */ - val STREAMING_MODELS = setOf( - "eleven_flash_v2_5", - "eleven_flash_v2", - "eleven_multilingual_v2", - "eleven_turbo_v2_5", - "eleven_turbo_v2", - "eleven_monolingual_v1", - ) - - fun supportsStreaming(modelId: String): Boolean = modelId in STREAMING_MODELS - } - - private val _isPlaying = MutableStateFlow(false) - val isPlaying: StateFlow = _isPlaying - - private var webSocket: WebSocket? = null - private var audioTrack: AudioTrack? = null - private var trackStarted = false - private var client: OkHttpClient? = null - @Volatile private var stopped = false - @Volatile private var finished = false - @Volatile var hasReceivedAudio = false - private set - private var drainJob: Job? = null - - // Track text already sent so we only send incremental chunks - private var sentTextLength = 0 - @Volatile private var wsReady = false - private val pendingText = mutableListOf() - - /** - * Open the WebSocket connection and prepare AudioTrack. - * Must be called before [sendText]. - */ - fun start() { - stopped = false - finished = false - hasReceivedAudio = false - sentTextLength = 0 - trackStarted = false - wsReady = false - sentFullText = "" - synchronized(pendingText) { pendingText.clear() } - - // Prepare AudioTrack - val minBuffer = AudioTrack.getMinBufferSize( - sampleRate, - AudioFormat.CHANNEL_OUT_MONO, - AudioFormat.ENCODING_PCM_16BIT, - ) - val bufferSize = max(minBuffer * 2, 8 * 1024) - val track = AudioTrack( - AudioAttributes.Builder() - .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) - .setUsage(AudioAttributes.USAGE_MEDIA) - .build(), - AudioFormat.Builder() - .setSampleRate(sampleRate) - .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) - .setEncoding(AudioFormat.ENCODING_PCM_16BIT) - .build(), - bufferSize, - AudioTrack.MODE_STREAM, - AudioManager.AUDIO_SESSION_ID_GENERATE, - ) - if (track.state != AudioTrack.STATE_INITIALIZED) { - track.release() - Log.e(TAG, "AudioTrack init failed") - return - } - audioTrack = track - _isPlaying.value = true - - // Open WebSocket - val url = "$BASE_URL/$voiceId/stream-input?model_id=$modelId&output_format=$outputFormat" - val okClient = OkHttpClient.Builder() - .readTimeout(30, java.util.concurrent.TimeUnit.SECONDS) - .writeTimeout(10, java.util.concurrent.TimeUnit.SECONDS) - .build() - client = okClient - - val request = Request.Builder() - .url(url) - .header("xi-api-key", apiKey) - .build() - - webSocket = okClient.newWebSocket(request, object : WebSocketListener() { - override fun onOpen(webSocket: WebSocket, response: Response) { - Log.d(TAG, "WebSocket connected") - // Send initial config with voice settings - val config = JSONObject().apply { - put("text", " ") - put("voice_settings", JSONObject().apply { - put("stability", 0.5) - put("similarity_boost", 0.8) - put("use_speaker_boost", false) - }) - put("generation_config", JSONObject().apply { - put("chunk_length_schedule", org.json.JSONArray(listOf(120, 160, 250, 290))) - }) - } - webSocket.send(config.toString()) - wsReady = true - // Flush any text that was queued before WebSocket was ready - synchronized(pendingText) { - for (queued in pendingText) { - val msg = JSONObject().apply { put("text", queued) } - webSocket.send(msg.toString()) - Log.d(TAG, "flushed queued chunk: ${queued.length} chars") - } - pendingText.clear() - } - // Send deferred EOS if finish() was called before WebSocket was ready - if (finished) { - val eos = JSONObject().apply { put("text", "") } - webSocket.send(eos.toString()) - Log.d(TAG, "sent deferred EOS") - } - } - - override fun onMessage(webSocket: WebSocket, text: String) { - if (stopped) return - try { - val json = JSONObject(text) - val audio = json.optString("audio", "") - if (audio.isNotEmpty()) { - val pcmBytes = Base64.decode(audio, Base64.DEFAULT) - writeToTrack(pcmBytes) - } - } catch (e: Exception) { - Log.e(TAG, "Error parsing WebSocket message: ${e.message}") - } - } - - override fun onFailure(webSocket: WebSocket, t: Throwable, response: Response?) { - Log.e(TAG, "WebSocket error: ${t.message}") - stopped = true - cleanup() - } - - override fun onClosed(webSocket: WebSocket, code: Int, reason: String) { - Log.d(TAG, "WebSocket closed: $code $reason") - // Wait for AudioTrack to finish playing buffered audio, then cleanup - drainJob = scope.launch(Dispatchers.IO) { - drainAudioTrack() - cleanup() - } - } - }) - } - - /** - * Send incremental text. Call with the full accumulated text so far — - * only the new portion (since last send) will be transmitted. - */ - // Track the full text we've sent so we can detect replacement vs append - private var sentFullText = "" - - /** - // If we already sent a superset of this text, it's just a stale/out-of-order - // event from a different thread — not a real divergence. Ignore it. - if (sentFullText.startsWith(fullText)) return true - * Returns true if text was accepted, false if text diverged (caller should restart). - */ - @Synchronized - fun sendText(fullText: String): Boolean { - if (stopped) return false - if (finished) return true // Already finishing — not a diverge, don't restart - - // Detect text replacement: if the new text doesn't start with what we already sent, - // the stream has diverged (e.g., tool call interrupted and text was replaced). - if (sentFullText.isNotEmpty() && !fullText.startsWith(sentFullText)) { - // If we already sent a superset of this text, it's just a stale/out-of-order - // event from a different thread — not a real divergence. Ignore it. - if (sentFullText.startsWith(fullText)) return true - Log.d(TAG, "text diverged — sent='${sentFullText.take(60)}' new='${fullText.take(60)}'") - return false - } - - if (fullText.length > sentTextLength) { - val newText = fullText.substring(sentTextLength) - sentTextLength = fullText.length - sentFullText = fullText - - val ws = webSocket - if (ws != null && wsReady) { - val msg = JSONObject().apply { put("text", newText) } - ws.send(msg.toString()) - Log.d(TAG, "sent chunk: ${newText.length} chars") - } else { - // Queue if WebSocket not connected yet (ws null = still connecting, wsReady false = handshake pending) - synchronized(pendingText) { pendingText.add(newText) } - Log.d(TAG, "queued chunk: ${newText.length} chars (ws not ready)") - } - } - return true - } - - /** - * Signal that no more text is coming. Sends EOS to ElevenLabs. - * The WebSocket will close after generating remaining audio. - */ - @Synchronized - fun finish() { - if (stopped || finished) return - finished = true - val ws = webSocket - if (ws != null && wsReady) { - // Send empty text to signal end of stream - val eos = JSONObject().apply { put("text", "") } - ws.send(eos.toString()) - Log.d(TAG, "sent EOS") - } - // else: WebSocket not ready yet; onOpen will send EOS after flushing queued text - } - - /** - * Immediately stop playback and close everything. - */ - fun stop() { - stopped = true - finished = true - drainJob?.cancel() - drainJob = null - webSocket?.cancel() - webSocket = null - val track = audioTrack - audioTrack = null - if (track != null) { - try { - track.pause() - track.flush() - track.release() - } catch (_: Throwable) {} - } - _isPlaying.value = false - client?.dispatcher?.executorService?.shutdown() - client = null - } - - private fun writeToTrack(pcmBytes: ByteArray) { - val track = audioTrack ?: return - if (stopped) return - - // Start playback on first audio chunk — avoids underrun - if (!trackStarted) { - track.play() - trackStarted = true - hasReceivedAudio = true - Log.d(TAG, "AudioTrack started on first chunk") - } - - var offset = 0 - while (offset < pcmBytes.size && !stopped) { - val wrote = track.write(pcmBytes, offset, pcmBytes.size - offset) - if (wrote <= 0) { - if (stopped) return - Log.w(TAG, "AudioTrack write returned $wrote") - break - } - offset += wrote - } - } - - private fun drainAudioTrack() { - if (stopped) return - // Wait up to 10s for audio to finish playing - val deadline = System.currentTimeMillis() + 10_000 - while (!stopped && System.currentTimeMillis() < deadline) { - // Check if track is still playing - val track = audioTrack ?: return - if (track.playState != AudioTrack.PLAYSTATE_PLAYING) return - try { - Thread.sleep(100) - } catch (_: InterruptedException) { - return - } - } - } - - private fun cleanup() { - val track = audioTrack - audioTrack = null - if (track != null) { - try { - track.stop() - track.release() - } catch (_: Throwable) {} - } - _isPlaying.value = false - client?.dispatcher?.executorService?.shutdown() - client = null - } -} diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt deleted file mode 100644 index 90bbd81b8bd..00000000000 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/StreamingMediaDataSource.kt +++ /dev/null @@ -1,98 +0,0 @@ -package ai.openclaw.app.voice - -import android.media.MediaDataSource -import kotlin.math.min - -internal class StreamingMediaDataSource : MediaDataSource() { - private data class Chunk(val start: Long, val data: ByteArray) - - private val lock = Object() - private val chunks = ArrayList() - private var totalSize: Long = 0 - private var closed = false - private var finished = false - private var lastReadIndex = 0 - - fun append(data: ByteArray) { - if (data.isEmpty()) return - synchronized(lock) { - if (closed || finished) return - val chunk = Chunk(totalSize, data) - chunks.add(chunk) - totalSize += data.size.toLong() - lock.notifyAll() - } - } - - fun finish() { - synchronized(lock) { - if (closed) return - finished = true - lock.notifyAll() - } - } - - fun fail() { - synchronized(lock) { - closed = true - lock.notifyAll() - } - } - - override fun readAt(position: Long, buffer: ByteArray, offset: Int, size: Int): Int { - if (position < 0) return -1 - synchronized(lock) { - while (!closed && !finished && position >= totalSize) { - lock.wait() - } - if (closed) return -1 - if (position >= totalSize && finished) return -1 - - val available = (totalSize - position).toInt() - val toRead = min(size, available) - var remaining = toRead - var destOffset = offset - var pos = position - - var index = findChunkIndex(pos) - while (remaining > 0 && index < chunks.size) { - val chunk = chunks[index] - val inChunkOffset = (pos - chunk.start).toInt() - if (inChunkOffset >= chunk.data.size) { - index++ - continue - } - val copyLen = min(remaining, chunk.data.size - inChunkOffset) - System.arraycopy(chunk.data, inChunkOffset, buffer, destOffset, copyLen) - remaining -= copyLen - destOffset += copyLen - pos += copyLen - if (inChunkOffset + copyLen >= chunk.data.size) { - index++ - } - } - - return toRead - remaining - } - } - - override fun getSize(): Long = -1 - - override fun close() { - synchronized(lock) { - closed = true - lock.notifyAll() - } - } - - private fun findChunkIndex(position: Long): Int { - var index = lastReadIndex - while (index < chunks.size) { - val chunk = chunks[index] - if (position < chunk.start + chunk.data.size) break - index++ - } - lastReadIndex = index - return index - } -} diff --git a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt b/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt deleted file mode 100644 index 7ada19e166b..00000000000 --- a/apps/android/app/src/main/java/ai/openclaw/app/voice/TalkModeVoiceResolver.kt +++ /dev/null @@ -1,122 +0,0 @@ -package ai.openclaw.app.voice - -import java.net.HttpURLConnection -import java.net.URL -import kotlinx.coroutines.Dispatchers -import kotlinx.coroutines.withContext -import kotlinx.serialization.json.Json -import kotlinx.serialization.json.JsonArray -import kotlinx.serialization.json.JsonElement -import kotlinx.serialization.json.JsonObject -import kotlinx.serialization.json.JsonPrimitive - -internal data class ElevenLabsVoice(val voiceId: String, val name: String?) - -internal data class TalkModeResolvedVoice( - val voiceId: String?, - val fallbackVoiceId: String?, - val defaultVoiceId: String?, - val currentVoiceId: String?, - val selectedVoiceName: String? = null, -) - -internal object TalkModeVoiceResolver { - fun resolveVoiceAlias(value: String?, voiceAliases: Map): String? { - val trimmed = value?.trim().orEmpty() - if (trimmed.isEmpty()) return null - val normalized = normalizeAliasKey(trimmed) - voiceAliases[normalized]?.let { return it } - if (voiceAliases.values.any { it.equals(trimmed, ignoreCase = true) }) return trimmed - return if (isLikelyVoiceId(trimmed)) trimmed else null - } - - suspend fun resolveVoiceId( - preferred: String?, - fallbackVoiceId: String?, - defaultVoiceId: String?, - currentVoiceId: String?, - voiceOverrideActive: Boolean, - listVoices: suspend () -> List, - ): TalkModeResolvedVoice { - val trimmed = preferred?.trim().orEmpty() - if (trimmed.isNotEmpty()) { - return TalkModeResolvedVoice( - voiceId = trimmed, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - ) - } - if (!fallbackVoiceId.isNullOrBlank()) { - return TalkModeResolvedVoice( - voiceId = fallbackVoiceId, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - ) - } - - val first = listVoices().firstOrNull() - if (first == null) { - return TalkModeResolvedVoice( - voiceId = null, - fallbackVoiceId = fallbackVoiceId, - defaultVoiceId = defaultVoiceId, - currentVoiceId = currentVoiceId, - ) - } - - return TalkModeResolvedVoice( - voiceId = first.voiceId, - fallbackVoiceId = first.voiceId, - defaultVoiceId = if (defaultVoiceId.isNullOrBlank()) first.voiceId else defaultVoiceId, - currentVoiceId = if (voiceOverrideActive) currentVoiceId else first.voiceId, - selectedVoiceName = first.name, - ) - } - - suspend fun listVoices(apiKey: String, json: Json): List { - return withContext(Dispatchers.IO) { - val url = URL("https://api.elevenlabs.io/v1/voices") - val conn = url.openConnection() as HttpURLConnection - try { - conn.requestMethod = "GET" - conn.connectTimeout = 15_000 - conn.readTimeout = 15_000 - conn.setRequestProperty("xi-api-key", apiKey) - - val code = conn.responseCode - val stream = if (code >= 400) conn.errorStream else conn.inputStream - val data = stream?.use { it.readBytes() } ?: byteArrayOf() - if (code >= 400) { - val message = data.toString(Charsets.UTF_8) - throw IllegalStateException("ElevenLabs voices failed: $code $message") - } - - val root = json.parseToJsonElement(data.toString(Charsets.UTF_8)).asObjectOrNull() - val voices = (root?.get("voices") as? JsonArray) ?: JsonArray(emptyList()) - voices.mapNotNull { entry -> - val obj = entry.asObjectOrNull() ?: return@mapNotNull null - val voiceId = obj["voice_id"].asStringOrNull() ?: return@mapNotNull null - val name = obj["name"].asStringOrNull() - ElevenLabsVoice(voiceId, name) - } - } finally { - conn.disconnect() - } - } - } - - private fun isLikelyVoiceId(value: String): Boolean { - if (value.length < 10) return false - return value.all { it.isLetterOrDigit() || it == '-' || it == '_' } - } - - private fun normalizeAliasKey(value: String): String = - value.trim().lowercase() -} - -private fun JsonElement?.asObjectOrNull(): JsonObject? = this as? JsonObject - -private fun JsonElement?.asStringOrNull(): String? = - (this as? JsonPrimitive)?.takeIf { it.isString }?.content diff --git a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt b/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt deleted file mode 100644 index 5cd46895d42..00000000000 --- a/apps/android/app/src/test/java/ai/openclaw/app/voice/TalkModeVoiceResolverTest.kt +++ /dev/null @@ -1,92 +0,0 @@ -package ai.openclaw.app.voice - -import kotlinx.coroutines.runBlocking -import org.junit.Assert.assertEquals -import org.junit.Assert.assertNull -import org.junit.Test - -class TalkModeVoiceResolverTest { - @Test - fun resolvesVoiceAliasCaseInsensitively() { - val resolved = - TalkModeVoiceResolver.resolveVoiceAlias( - " Clawd ", - mapOf("clawd" to "voice-123"), - ) - - assertEquals("voice-123", resolved) - } - - @Test - fun acceptsDirectVoiceIds() { - val resolved = TalkModeVoiceResolver.resolveVoiceAlias("21m00Tcm4TlvDq8ikWAM", emptyMap()) - - assertEquals("21m00Tcm4TlvDq8ikWAM", resolved) - } - - @Test - fun rejectsUnknownAliases() { - val resolved = TalkModeVoiceResolver.resolveVoiceAlias("nickname", emptyMap()) - - assertNull(resolved) - } - - @Test - fun reusesCachedFallbackVoiceBeforeFetchingCatalog() = - runBlocking { - var fetchCount = 0 - - val resolved = - TalkModeVoiceResolver.resolveVoiceId( - preferred = null, - fallbackVoiceId = "cached-voice", - defaultVoiceId = null, - currentVoiceId = null, - voiceOverrideActive = false, - listVoices = { - fetchCount += 1 - emptyList() - }, - ) - - assertEquals("cached-voice", resolved.voiceId) - assertEquals(0, fetchCount) - } - - @Test - fun seedsDefaultVoiceFromCatalogWhenNeeded() = - runBlocking { - val resolved = - TalkModeVoiceResolver.resolveVoiceId( - preferred = null, - fallbackVoiceId = null, - defaultVoiceId = null, - currentVoiceId = null, - voiceOverrideActive = false, - listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) }, - ) - - assertEquals("voice-1", resolved.voiceId) - assertEquals("voice-1", resolved.fallbackVoiceId) - assertEquals("voice-1", resolved.defaultVoiceId) - assertEquals("voice-1", resolved.currentVoiceId) - assertEquals("First", resolved.selectedVoiceName) - } - - @Test - fun preservesCurrentVoiceWhenOverrideIsActive() = - runBlocking { - val resolved = - TalkModeVoiceResolver.resolveVoiceId( - preferred = null, - fallbackVoiceId = null, - defaultVoiceId = null, - currentVoiceId = null, - voiceOverrideActive = true, - listVoices = { listOf(ElevenLabsVoice("voice-1", "First")) }, - ) - - assertEquals("voice-1", resolved.voiceId) - assertNull(resolved.currentVoiceId) - } -}