Fix voice input: resampler stereo bug, remove invalid turn mode, cleanup

Three bugs prevented voice input from working:

1. ResampleTo16000() treated NumFrames as the total sample count and divided it by the channel count a second time, dropping half of the audio data for stereo input. The corrupted audio was unrecognizable to ElevenLabs VAD/STT.
2. The session init sent a nonexistent "client_vad" turn mode. The API has no turn.mode field; it is replaced with the turn_timeout parameter.
3. user_activity was sent with every audio chunk, which resets the turn timeout timer and prevents the server from taking its turn.

Also: send audio chunks as compact JSON, add message type debug logging, and send conversation_initiation_client_data on connect.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-20 08:05:39 +01:00
parent b888f7fcb6
commit f7f0b0c45b
3 changed files with 116 additions and 21 deletions
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
@@ -119,20 +119,22 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
 // Resampling
 // ─────────────────────────────────────────────────────────────────────────────
 TArray<float> UElevenLabsMicrophoneCaptureComponent::ResampleTo16000(
-	const float* InAudio, int32 NumSamples,
+	const float* InAudio, int32 NumFrames,
 	int32 InChannels, int32 InSampleRate)
 {
 	const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000
 	// --- Step 1: Downmix to mono ---
 	// NOTE: NumFrames is the number of audio frames (not total samples).
 	// Each frame contains InChannels samples (e.g. 2 for stereo).
 	// The raw buffer has NumFrames * InChannels total float values.
 	TArray<float> Mono;
 	if (InChannels == 1)
 	{
-		Mono = TArray<float>(InAudio, NumSamples);
+		Mono = TArray<float>(InAudio, NumFrames);
 	}
 	else
 	{
 		const int32 NumFrames = NumSamples / InChannels;
 		Mono.Reserve(NumFrames);
 		for (int32 i = 0; i < NumFrames; i++)
 		{
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -72,7 +72,11 @@ void UElevenLabsWebSocketProxy::Connect(const FString& AgentIDOverride, const FS
 	WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected);
 	WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError);
 	WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed);
-	WebSocket->OnMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsMessage);
+	// NOTE: We bind ONLY OnRawMessage (binary frames), NOT OnMessage (text frames).
 	// UE's WebSocket implementation fires BOTH callbacks for the same frame when using
 	// the libwebsockets backend — binding both causes every audio packet to be decoded
 	// and played twice. OnRawMessage handles all frame types: raw binary audio AND
 	// text-framed JSON (detected by peeking first byte for '{').
 	WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage);
 	WebSocket->Connect();
@@ -94,36 +98,52 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 {
 	if (!IsConnected())
 	{
-		UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected."));
+		UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected (state=%d). Audio dropped."),
 			(int32)ConnectionState);
 		return;
 	}
 	if (PCMData.Num() == 0) return;
 	UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
 	// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
 	// The server's VAD detects silence to determine end-of-turn.
 	// Do NOT send user_activity here — it resets the turn timeout timer
 	// and would prevent the server from taking the turn after the user stops speaking.
 	const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num());
-	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
+	// Send as compact JSON (no pretty-printing) directly, bypassing SendJsonMessage
-	Msg->SetStringField(ElevenLabsMessageType::AudioChunk, Base64Audio);
+	// to avoid the pretty-printed writer and to keep the payload minimal.
-	SendJsonMessage(Msg);
+	const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio);
 	// Log the first two chunks (truncated to 200 characters) for debugging
 	static int32 AudioChunksSent = 0;
 	AudioChunksSent++;
 	if (AudioChunksSent <= 2)
 	{
 		UE_LOG(LogElevenLabsWS, Log, TEXT("  Audio JSON (first 200 chars): %.200s"), *AudioJson);
 	}
 	if (WebSocket.IsValid() && WebSocket->IsConnected())
 	{
 		WebSocket->Send(AudioJson);
 	}
 }
 void UElevenLabsWebSocketProxy::SendUserTurnStart()
 {
-	// In client turn mode, signal that the user is active/speaking.
+	// No-op: the ElevenLabs API does not require a "start speaking" signal.
-	// API message: { "type": "user_activity" }
+	// The server's VAD detects speech from the audio chunks we send.
-	if (!IsConnected()) return;
+	// user_activity is a keep-alive/timeout-reset message and should NOT be
-	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
+	// sent here — it would delay the agent's turn after the user stops.
-	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
+	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow)."));
 	SendJsonMessage(Msg);
 }
 void UElevenLabsWebSocketProxy::SendUserTurnEnd()
 {
-	// In client turn mode, stopping user_activity signals end of user turn.
+	// No explicit "end turn" message exists in the ElevenLabs API.
-	// The API uses user_activity for ongoing speech; simply stop sending it.
+	// The server detects end-of-speech via VAD when we stop sending audio chunks.
-	// No explicit end message is required — silence is detected server-side.
+	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
 	// We still log for debug visibility.
 	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
 }
 void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
@@ -155,8 +175,72 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
 // ─────────────────────────────────────────────────────────────────────────────
 void UElevenLabsWebSocketProxy::OnWsConnected()
 {
-	UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Waiting for conversation_initiation_metadata..."));
+	UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Sending conversation_initiation_client_data..."));
-	// State stays Connecting until we receive the initiation metadata from the server.
+	// State stays Connecting until we receive conversation_initiation_metadata from the server.
 	// ElevenLabs requires this message immediately after the WebSocket handshake to
 	// negotiate the session configuration. Without it, the server won't accept audio
 	// from the client (microphone stays silent from server perspective) and default
 	// settings are used (higher latency, no intermediate responses).
 	//
 	// Structure:
 	// {
 	//   "type": "conversation_initiation_client_data",
 	//   "conversation_config_override": {
 	//     "agent": {
 	//       "turn": { "turn_timeout": 3 }
 	//     },
 	//     "tts": {
 	//       "optimize_streaming_latency": 3
 	//     }
 	//   },
 	//   "custom_llm_extra_body": {
 	//     "enable_intermediate_response": true
 	//   }
 	// }
 	// Configure turn-taking behaviour.
 	// The ElevenLabs API does NOT have a turn.mode field.
 	// Turn-taking is controlled by the server's VAD and the turn_* parameters.
 	// In push-to-talk (Client mode) the user controls the mic; the server still
 	// uses its VAD to detect the end of speech from the audio chunks it receives.
 	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
 	// Lower turn_timeout so the agent responds faster after the user stops speaking.
 	// Default is 7s which feels very slow for push-to-talk.
 	if (TurnMode == EElevenLabsTurnMode::Client)
 	{
 		TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
 	}
 	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
 	AgentObj->SetObjectField(TEXT("turn"), TurnObj);
 	TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
 	TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
 	TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
 	ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
 	ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
 	// enable_intermediate_response reduces time-to-first-audio by allowing the agent
 	// to start speaking before it has finished generating the full response.
 	TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
 	CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
 	TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
 	InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
 	InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
 	InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
 	// NOTE: We bypass SendJsonMessage() here intentionally.
 	// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
 	// during the handshake before IsConnected() returns true in some UE WS backends.
 	// We know the socket is open at this point — send directly.
 	FString InitJson;
 	TSharedRef<TJsonWriter<>> InitWriter = TJsonWriterFactory<>::Create(&InitJson);
 	FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
 	UE_LOG(LogElevenLabsWS, Log, TEXT("Sending initiation: %s"), *InitJson);
 	WebSocket->Send(InitJson);
 }
 void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error)
@@ -200,6 +284,9 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 		return;
 	}
 	// Log every message type received from the server for debugging.
 	UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType);
 	if (MsgType == ElevenLabsMessageType::ConversationInitiation)
 	{
 		HandleConversationInitiation(Root);
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -183,4 +183,10 @@ private:
 	// Accumulation buffer for multi-fragment binary WebSocket frames.
 	// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
 	TArray<uint8> BinaryFrameBuffer;
 public:
 	// Set by UElevenLabsConversationalAgentComponent before calling Connect().
 	// In Client mode a shorter turn_timeout is sent in conversation_initiation_client_data.
 	// No turn-mode string is sent, and user_activity is NOT sent with audio chunks.
 	EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
 };