Fix voice input: resampler stereo bug, remove invalid turn mode, cleanup

Three bugs prevented voice input from working:

1. ResampleTo16000() treated NumFrames as total samples, dividing by
   channel count again — losing half the audio data with stereo input.
   The corrupted audio was unrecognizable to ElevenLabs VAD/STT.

2. The session init sent a nonexistent "client_vad" turn mode. The API has
   no turn.mode field; it is replaced with the turn_timeout parameter.

3. user_activity was sent with every audio chunk, which resets the turn
   timeout timer and prevents the server from taking its turn.

Also: send audio chunks as compact JSON, add message type debug logging,
send conversation_initiation_client_data on connect.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-20 08:05:39 +01:00
parent b888f7fcb6
commit f7f0b0c45b
3 changed files with 116 additions and 21 deletions

View File

@ -119,20 +119,22 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
// Resampling // Resampling
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
TArray<float> UElevenLabsMicrophoneCaptureComponent::ResampleTo16000( TArray<float> UElevenLabsMicrophoneCaptureComponent::ResampleTo16000(
const float* InAudio, int32 NumSamples, const float* InAudio, int32 NumFrames,
int32 InChannels, int32 InSampleRate) int32 InChannels, int32 InSampleRate)
{ {
const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000 const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000
// --- Step 1: Downmix to mono --- // --- Step 1: Downmix to mono ---
// NOTE: NumFrames is the number of audio frames (not total samples).
// Each frame contains InChannels samples (e.g. 2 for stereo).
// The raw buffer has NumFrames * InChannels total float values.
TArray<float> Mono; TArray<float> Mono;
if (InChannels == 1) if (InChannels == 1)
{ {
Mono = TArray<float>(InAudio, NumSamples); Mono = TArray<float>(InAudio, NumFrames);
} }
else else
{ {
const int32 NumFrames = NumSamples / InChannels;
Mono.Reserve(NumFrames); Mono.Reserve(NumFrames);
for (int32 i = 0; i < NumFrames; i++) for (int32 i = 0; i < NumFrames; i++)
{ {

View File

@ -72,7 +72,11 @@ void UElevenLabsWebSocketProxy::Connect(const FString& AgentIDOverride, const FS
WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected); WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected);
WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError); WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError);
WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed); WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed);
WebSocket->OnMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsMessage); // NOTE: We bind ONLY OnRawMessage (binary frames), NOT OnMessage (text frames).
// UE's WebSocket implementation fires BOTH callbacks for the same frame when using
// the libwebsockets backend — binding both causes every audio packet to be decoded
// and played twice. OnRawMessage handles all frame types: raw binary audio AND
// text-framed JSON (detected by peeking first byte for '{').
WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage); WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage);
WebSocket->Connect(); WebSocket->Connect();
@ -94,36 +98,52 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
{ {
if (!IsConnected()) if (!IsConnected())
{ {
UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected.")); UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected (state=%d). Audio dropped."),
(int32)ConnectionState);
return; return;
} }
if (PCMData.Num() == 0) return; if (PCMData.Num() == 0) return;
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" } // ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
// The server's VAD detects silence to determine end-of-turn.
// Do NOT send user_activity here — it resets the turn timeout timer
// and would prevent the server from taking the turn after the user stops speaking.
const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num()); const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num());
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject()); // Send as compact JSON (no pretty-printing) directly, bypassing SendJsonMessage
Msg->SetStringField(ElevenLabsMessageType::AudioChunk, Base64Audio); // to avoid the pretty-printed writer and to keep the payload minimal.
SendJsonMessage(Msg); const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio);
// Log the first two chunks (truncated to 200 characters) for debugging
static int32 AudioChunksSent = 0;
AudioChunksSent++;
if (AudioChunksSent <= 2)
{
UE_LOG(LogElevenLabsWS, Log, TEXT(" Audio JSON (first 200 chars): %.200s"), *AudioJson);
}
if (WebSocket.IsValid() && WebSocket->IsConnected())
{
WebSocket->Send(AudioJson);
}
} }
void UElevenLabsWebSocketProxy::SendUserTurnStart() void UElevenLabsWebSocketProxy::SendUserTurnStart()
{ {
// In client turn mode, signal that the user is active/speaking. // No-op: the ElevenLabs API does not require a "start speaking" signal.
// API message: { "type": "user_activity" } // The server's VAD detects speech from the audio chunks we send.
if (!IsConnected()) return; // user_activity is a keep-alive/timeout-reset message and should NOT be
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject()); // sent here — it would delay the agent's turn after the user stops.
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity); UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow)."));
SendJsonMessage(Msg);
} }
void UElevenLabsWebSocketProxy::SendUserTurnEnd() void UElevenLabsWebSocketProxy::SendUserTurnEnd()
{ {
// In client turn mode, stopping user_activity signals end of user turn. // No explicit "end turn" message exists in the ElevenLabs API.
// The API uses user_activity for ongoing speech; simply stop sending it. // The server detects end-of-speech via VAD when we stop sending audio chunks.
// No explicit end message is required — silence is detected server-side. UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
// We still log for debug visibility.
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
} }
void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text) void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
@ -155,8 +175,72 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsWebSocketProxy::OnWsConnected() void UElevenLabsWebSocketProxy::OnWsConnected()
{ {
UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Waiting for conversation_initiation_metadata...")); UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Sending conversation_initiation_client_data..."));
// State stays Connecting until we receive the initiation metadata from the server. // State stays Connecting until we receive conversation_initiation_metadata from the server.
// ElevenLabs requires this message immediately after the WebSocket handshake to
// negotiate the session configuration. Without it, the server won't accept audio
// from the client (microphone stays silent from server perspective) and default
// settings are used (higher latency, no intermediate responses).
//
// Structure:
// {
// "type": "conversation_initiation_client_data",
// "conversation_config_override": {
// "agent": {
// "turn": { "turn_timeout": 3 }
// },
// "tts": {
// "optimize_streaming_latency": 3
// }
// },
// "custom_llm_extra_body": {
// "enable_intermediate_response": true
// }
// }
// Configure turn-taking behaviour.
// The ElevenLabs API does NOT have a turn.mode field.
// Turn-taking is controlled by the server's VAD and the turn_* parameters.
// In push-to-talk (Client mode) the user controls the mic; the server still
// uses its VAD to detect the end of speech from the audio chunks it receives.
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
// Lower turn_timeout so the agent responds faster after the user stops speaking.
// Default is 7s which feels very slow for push-to-talk.
if (TurnMode == EElevenLabsTurnMode::Client)
{
TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
}
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
// enable_intermediate_response reduces time-to-first-audio by allowing the agent
// to start speaking before it has finished generating the full response.
TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
// NOTE: We bypass SendJsonMessage() here intentionally.
// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
// during the handshake before IsConnected() returns true in some UE WS backends.
// We know the socket is open at this point — send directly.
FString InitJson;
TSharedRef<TJsonWriter<>> InitWriter = TJsonWriterFactory<>::Create(&InitJson);
FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending initiation: %s"), *InitJson);
WebSocket->Send(InitJson);
} }
void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error) void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error)
@ -200,6 +284,9 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
return; return;
} }
// Log every message type received from the server for debugging.
UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType);
if (MsgType == ElevenLabsMessageType::ConversationInitiation) if (MsgType == ElevenLabsMessageType::ConversationInitiation)
{ {
HandleConversationInitiation(Root); HandleConversationInitiation(Root);

View File

@ -183,4 +183,10 @@ private:
// Accumulation buffer for multi-fragment binary WebSocket frames. // Accumulation buffer for multi-fragment binary WebSocket frames.
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments. // ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
TArray<uint8> BinaryFrameBuffer; TArray<uint8> BinaryFrameBuffer;
public:
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
// In Client (push-to-talk) mode a shorter turn_timeout is sent in
// conversation_initiation_client_data. No turn-mode string is sent (the API
// has no turn.mode field), and user_activity is NOT sent with audio chunks.
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
}; };