From f7f0b0c45b84520b3ffd5a586a5b176eaa3352c0 Mon Sep 17 00:00:00 2001
From: "j.foucher" <j.foucher@polymorph.fr>
Date: Fri, 20 Feb 2026 08:05:39 +0100
Subject: [PATCH] Fix voice input: resampler stereo bug, remove invalid turn
 mode, cleanup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three bugs prevented voice input from working:

1. ResampleTo16000() treated NumFrames as total samples, dividing by
   channel count again — losing half the audio data with stereo input.
   The corrupted audio was unrecognizable to ElevenLabs VAD/STT.

2. Sent nonexistent "client_vad" turn mode in session init. The API has
   no turn.mode field; replaced with turn_timeout parameter.

3. Sent user_activity with every audio chunk, which resets the turn
   timeout timer and prevents the server from taking its turn.

Also: send audio chunks as compact JSON, add message type debug logging,
send conversation_initiation_client_data on connect, and bind only
OnRawMessage (not OnMessage) so each frame is not handled twice.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../ElevenLabsMicrophoneCaptureComponent.cpp  |   8 +-
 .../Private/ElevenLabsWebSocketProxy.cpp      | 123 +++++++++++++++---
 .../Public/ElevenLabsWebSocketProxy.h         |   6 +
 3 files changed, 116 insertions(+), 21 deletions(-)
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
index ebce037..a0685cd 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
@@ -119,20 +119,22 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
 // Resampling
 // ─────────────────────────────────────────────────────────────────────────────
 TArray<float> UElevenLabsMicrophoneCaptureComponent::ResampleTo16000(
-	const float* InAudio, int32 NumSamples,
+	const float* InAudio, int32 NumFrames,
 	int32 InChannels, int32 InSampleRate)
 {
 	const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000
 
 	// --- Step 1: Downmix to mono ---
+	// NOTE: NumFrames is the number of audio frames (not total samples).
+	// Each frame contains InChannels samples (e.g. 2 for stereo).
+	// The raw buffer has NumFrames * InChannels total float values.
 	TArray<float> Mono;
 	if (InChannels == 1)
 	{
-		Mono = TArray<float>(InAudio, NumSamples);
+		Mono = TArray<float>(InAudio, NumFrames);
 	}
 	else
 	{
-		const int32 NumFrames = NumSamples / InChannels;
 		Mono.Reserve(NumFrames);
 		for (int32 i = 0; i < NumFrames; i++)
 		{
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
index 6fa399c..1762e4b 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -72,7 +72,11 @@ void UElevenLabsWebSocketProxy::Connect(const FString& AgentIDOverride, const FS
 	WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected);
 	WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError);
 	WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed);
-	WebSocket->OnMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsMessage);
+	// NOTE: We bind ONLY OnRawMessage (binary frames), NOT OnMessage (text frames).
+	// UE's WebSocket implementation fires BOTH callbacks for the same frame when using
+	// the libwebsockets backend — binding both causes every audio packet to be decoded
+	// and played twice. OnRawMessage handles all frame types: raw binary audio AND
+	// text-framed JSON (detected by peeking first byte for '{').
 	WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage);
 
 	WebSocket->Connect();
@@ -94,36 +98,52 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 {
 	if (!IsConnected())
 	{
-		UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected."));
+		UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected (state=%d). Audio dropped."),
+			(int32)ConnectionState);
 		return;
 	}
 	if (PCMData.Num() == 0) return;
 
+	UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
+
 	// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
+	// The server's VAD detects silence to determine end-of-turn.
+	// Do NOT send user_activity here — it resets the turn timeout timer
+	// and would prevent the server from taking the turn after the user stops speaking.
 	const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num());
 
-	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
-	Msg->SetStringField(ElevenLabsMessageType::AudioChunk, Base64Audio);
-	SendJsonMessage(Msg);
+	// Send as compact JSON (no pretty-printing) directly, bypassing SendJsonMessage
+	// to avoid the pretty-printed writer and to keep the payload minimal.
+	const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio);
+
+	// Log the first two chunks (truncated to 200 characters) for debugging
+	static int32 AudioChunksSent = 0;
+	AudioChunksSent++;
+	if (AudioChunksSent <= 2)
+	{
+		UE_LOG(LogElevenLabsWS, Log, TEXT("  Audio JSON (first 200 chars): %.200s"), *AudioJson);
+	}
+
+	if (WebSocket.IsValid() && WebSocket->IsConnected())
+	{
+		WebSocket->Send(AudioJson);
+	}
 }
 
 void UElevenLabsWebSocketProxy::SendUserTurnStart()
 {
-	// In client turn mode, signal that the user is active/speaking.
-	// API message: { "type": "user_activity" }
-	if (!IsConnected()) return;
-	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
-	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
-	SendJsonMessage(Msg);
+	// Nothing is sent here (log only): the ElevenLabs API does not require a "start speaking" signal.
+	// The server's VAD detects speech from the audio chunks we send.
+	// user_activity is a keep-alive/timeout-reset message and should NOT be
+	// sent here — it would delay the agent's turn after the user stops.
+	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow)."));
 }
 
 void UElevenLabsWebSocketProxy::SendUserTurnEnd()
 {
-	// In client turn mode, stopping user_activity signals end of user turn.
-	// The API uses user_activity for ongoing speech; simply stop sending it.
-	// No explicit end message is required — silence is detected server-side.
-	// We still log for debug visibility.
-	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
+	// No explicit "end turn" message exists in the ElevenLabs API.
+	// The server detects end-of-speech via VAD when we stop sending audio chunks.
+	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
 }
 
 void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
@@ -155,8 +175,72 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
 // ─────────────────────────────────────────────────────────────────────────────
 void UElevenLabsWebSocketProxy::OnWsConnected()
 {
-	UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Waiting for conversation_initiation_metadata..."));
-	// State stays Connecting until we receive the initiation metadata from the server.
+	UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Sending conversation_initiation_client_data..."));
+	// State stays Connecting until we receive conversation_initiation_metadata from the server.
+
+	// ElevenLabs requires this message immediately after the WebSocket handshake to
+	// negotiate the session configuration. Without it, the server won't accept audio
+	// from the client (microphone stays silent from server perspective) and default
+	// settings are used (higher latency, no intermediate responses).
+	//
+	// Structure:
+	// {
+	//   "type": "conversation_initiation_client_data",
+	//   "conversation_config_override": {
+	//     "agent": {
+	//       "turn": { "turn_timeout": 3 }   <- only in Client turn mode; {} otherwise
+	//     },
+	//     "tts": {
+	//       "optimize_streaming_latency": 3
+	//     }
+	//   },
+	//   "custom_llm_extra_body": {
+	//     "enable_intermediate_response": true
+	//   }
+	// }
+
+	// Configure turn-taking behaviour.
+	// The ElevenLabs API does NOT have a turn.mode field.
+	// Turn-taking is controlled by the server's VAD and the turn_* parameters.
+	// In push-to-talk (Client mode) the user controls the mic; the server still
+	// uses its VAD to detect the end of speech from the audio chunks it receives.
+	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
+	// Lower turn_timeout so the agent responds faster after the user stops speaking.
+	// Default is 7s which feels very slow for push-to-talk.
+	if (TurnMode == EElevenLabsTurnMode::Client)
+	{
+		TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
+	}
+
+	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
+	AgentObj->SetObjectField(TEXT("turn"), TurnObj);
+
+	TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
+	TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
+
+	TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
+	ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
+	ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
+
+	// enable_intermediate_response reduces time-to-first-audio by allowing the agent
+	// to start speaking before it has finished generating the full response.
+	TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
+	CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
+
+	TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
+	InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
+	InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
+	InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
+
+	// NOTE: We bypass SendJsonMessage() here intentionally.
+	// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
+	// during the handshake before IsConnected() returns true in some UE WS backends.
+	// We know the socket is open at this point — send directly.
+	FString InitJson;
+	TSharedRef<TJsonWriter<>> InitWriter = TJsonWriterFactory<>::Create(&InitJson);
+	FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
+	UE_LOG(LogElevenLabsWS, Log, TEXT("Sending initiation: %s"), *InitJson);
+	WebSocket->Send(InitJson);
 }
 
 void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error)
@@ -200,6 +284,9 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 		return;
 	}
 
+	// Log every message type received from the server for debugging.
+	UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType);
+
 	if (MsgType == ElevenLabsMessageType::ConversationInitiation)
 	{
 		HandleConversationInitiation(Root);
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
index 50dec68..ce1f97f 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -183,4 +183,10 @@ private:
 	// Accumulation buffer for multi-fragment binary WebSocket frames.
 	// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
 	TArray<uint8> BinaryFrameBuffer;
+
+public:
+	// Set by UElevenLabsConversationalAgentComponent before calling Connect().
+	// Selects the turn settings sent in conversation_initiation_client_data:
+	// Client mode sets turn_timeout to 3. No user_activity is sent with audio chunks.
+	EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
 };