From f7f0b0c45b84520b3ffd5a586a5b176eaa3352c0 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Fri, 20 Feb 2026 08:05:39 +0100 Subject: [PATCH] Fix voice input: resampler stereo bug, remove invalid turn mode, cleanup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three bugs prevented voice input from working: 1. ResampleTo16000() treated NumFrames as total samples, dividing by channel count again — losing half the audio data with stereo input. The corrupted audio was unrecognizable to ElevenLabs VAD/STT. 2. Sent nonexistent "client_vad" turn mode in session init. The API has no turn.mode field; replaced with turn_timeout parameter. 3. Sent user_activity with every audio chunk, which resets the turn timeout timer and prevents the server from taking its turn. Also: send audio chunks as compact JSON, add message type debug logging, send conversation_initiation_client_data on connect. Co-Authored-By: Claude Opus 4.6 --- .../ElevenLabsMicrophoneCaptureComponent.cpp | 8 +- .../Private/ElevenLabsWebSocketProxy.cpp | 123 +++++++++++++++--- .../Public/ElevenLabsWebSocketProxy.h | 6 + 3 files changed, 116 insertions(+), 21 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp index ebce037..a0685cd 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp @@ -119,20 +119,22 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate( // Resampling // ───────────────────────────────────────────────────────────────────────────── TArray 
UElevenLabsMicrophoneCaptureComponent::ResampleTo16000( - const float* InAudio, int32 NumSamples, + const float* InAudio, int32 NumFrames, int32 InChannels, int32 InSampleRate) { const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000 // --- Step 1: Downmix to mono --- + // NOTE: NumFrames is the number of audio frames (not total samples). + // Each frame contains InChannels samples (e.g. 2 for stereo). + // The raw buffer has NumFrames * InChannels total float values. TArray Mono; if (InChannels == 1) { - Mono = TArray(InAudio, NumSamples); + Mono = TArray(InAudio, NumFrames); } else { - const int32 NumFrames = NumSamples / InChannels; Mono.Reserve(NumFrames); for (int32 i = 0; i < NumFrames; i++) { diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index 6fa399c..1762e4b 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -72,7 +72,11 @@ void UElevenLabsWebSocketProxy::Connect(const FString& AgentIDOverride, const FS WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected); WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError); WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed); - WebSocket->OnMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsMessage); + // NOTE: We bind ONLY OnRawMessage (binary frames), NOT OnMessage (text frames). + // UE's WebSocket implementation fires BOTH callbacks for the same frame when using + // the libwebsockets backend — binding both causes every audio packet to be decoded + // and played twice. 
OnRawMessage handles all frame types: raw binary audio AND + // text-framed JSON (detected by peeking first byte for '{'). WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage); WebSocket->Connect(); @@ -94,36 +98,52 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray& PCMData) { if (!IsConnected()) { - UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected.")); + UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected (state=%d). Audio dropped."), + (int32)ConnectionState); return; } if (PCMData.Num() == 0) return; + UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num()); + // ElevenLabs expects: { "user_audio_chunk": "" } + // The server's VAD detects silence to determine end-of-turn. + // Do NOT send user_activity here — it resets the turn timeout timer + // and would prevent the server from taking the turn after the user stops speaking. const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num()); - TSharedPtr Msg = MakeShareable(new FJsonObject()); - Msg->SetStringField(ElevenLabsMessageType::AudioChunk, Base64Audio); - SendJsonMessage(Msg); + // Send as compact JSON (no pretty-printing) directly, bypassing SendJsonMessage + // to avoid the pretty-printed writer and to keep the payload minimal. + const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio); + + // Log first chunk fully for debugging + static int32 AudioChunksSent = 0; + AudioChunksSent++; + if (AudioChunksSent <= 2) + { + UE_LOG(LogElevenLabsWS, Log, TEXT(" Audio JSON (first 200 chars): %.200s"), *AudioJson); + } + + if (WebSocket.IsValid() && WebSocket->IsConnected()) + { + WebSocket->Send(AudioJson); + } } void UElevenLabsWebSocketProxy::SendUserTurnStart() { - // In client turn mode, signal that the user is active/speaking. 
- // API message: { "type": "user_activity" } - if (!IsConnected()) return; - TSharedPtr Msg = MakeShareable(new FJsonObject()); - Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity); - SendJsonMessage(Msg); + // No-op: the ElevenLabs API does not require a "start speaking" signal. + // The server's VAD detects speech from the audio chunks we send. + // user_activity is a keep-alive/timeout-reset message and should NOT be + // sent here — it would delay the agent's turn after the user stops. + UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow).")); } void UElevenLabsWebSocketProxy::SendUserTurnEnd() { - // In client turn mode, stopping user_activity signals end of user turn. - // The API uses user_activity for ongoing speech; simply stop sending it. - // No explicit end message is required — silence is detected server-side. - // We still log for debug visibility. - UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity.")); + // No explicit "end turn" message exists in the ElevenLabs API. + // The server detects end-of-speech via VAD when we stop sending audio chunks. + UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence.")); } void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text) @@ -155,8 +175,72 @@ void UElevenLabsWebSocketProxy::SendInterrupt() // ───────────────────────────────────────────────────────────────────────────── void UElevenLabsWebSocketProxy::OnWsConnected() { - UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Waiting for conversation_initiation_metadata...")); - // State stays Connecting until we receive the initiation metadata from the server. + UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Sending conversation_initiation_client_data...")); + // State stays Connecting until we receive conversation_initiation_metadata from the server. 
+ + // ElevenLabs requires this message immediately after the WebSocket handshake to + // negotiate the session configuration. Without it, the server won't accept audio + // from the client (microphone stays silent from server perspective) and default + // settings are used (higher latency, no intermediate responses). + // + // Structure: + // { + // "type": "conversation_initiation_client_data", + // "conversation_config_override": { + // "agent": { + // "turn": { "turn_timeout": 3 } + // }, + // "tts": { + // "optimize_streaming_latency": 3 + // } + // }, + // "custom_llm_extra_body": { + // "enable_intermediate_response": true + // } + // } + + // Configure turn-taking behaviour. + // The ElevenLabs API does NOT have a turn.mode field. + // Turn-taking is controlled by the server's VAD and the turn_* parameters. + // In push-to-talk (Client mode) the user controls the mic; the server still + // uses its VAD to detect the end of speech from the audio chunks it receives. + TSharedPtr TurnObj = MakeShareable(new FJsonObject()); + // Lower turn_timeout so the agent responds faster after the user stops speaking. + // Default is 7s which feels very slow for push-to-talk. + if (TurnMode == EElevenLabsTurnMode::Client) + { + TurnObj->SetNumberField(TEXT("turn_timeout"), 3); + } + + TSharedPtr AgentObj = MakeShareable(new FJsonObject()); + AgentObj->SetObjectField(TEXT("turn"), TurnObj); + + TSharedPtr TtsObj = MakeShareable(new FJsonObject()); + TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3); + + TSharedPtr ConversationConfigOverride = MakeShareable(new FJsonObject()); + ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj); + ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj); + + // enable_intermediate_response reduces time-to-first-audio by allowing the agent + // to start speaking before it has finished generating the full response. 
+ TSharedPtr CustomLlmExtraBody = MakeShareable(new FJsonObject()); + CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true); + + TSharedPtr InitMsg = MakeShareable(new FJsonObject()); + InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData); + InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride); + InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody); + + // NOTE: We bypass SendJsonMessage() here intentionally. + // SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires + // during the handshake before IsConnected() returns true in some UE WS backends. + // We know the socket is open at this point — send directly. + FString InitJson; + TSharedRef> InitWriter = TJsonWriterFactory<>::Create(&InitJson); + FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter); + UE_LOG(LogElevenLabsWS, Log, TEXT("Sending initiation: %s"), *InitJson); + WebSocket->Send(InitJson); } void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error) @@ -200,6 +284,9 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) return; } + // Log every message type received from the server for debugging. 
+ UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType); + if (MsgType == ElevenLabsMessageType::ConversationInitiation) { HandleConversationInitiation(Root); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h index 50dec68..ce1f97f 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h @@ -183,4 +183,10 @@ private: // Accumulation buffer for multi-fragment binary WebSocket frames. // ElevenLabs sends JSON as binary frames; large messages arrive in fragments. TArray BinaryFrameBuffer; + +public: + // Set by UElevenLabsConversationalAgentComponent before calling Connect(). + // In Client (push-to-talk) mode, OnWsConnected() sends a turn_timeout override of 3 in + // conversation_initiation_client_data; no turn mode string or user_activity is sent. + EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server; };