Fix voice input: resampler stereo bug, remove invalid turn mode, cleanup
Three bugs prevented voice input from working:

1. ResampleTo16000() treated NumFrames as the total sample count and divided by the channel count a second time, losing half the audio data with stereo input. The corrupted audio was unrecognizable to ElevenLabs VAD/STT.
2. The session init sent a nonexistent "client_vad" turn mode. The API has no turn.mode field; it is replaced with the turn_timeout parameter.
3. user_activity was sent with every audio chunk, which resets the turn timeout timer and prevents the server from taking its turn.

Also: send audio chunks as compact JSON, add message type debug logging, send conversation_initiation_client_data on connect.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b888f7fcb6
commit
f7f0b0c45b
@ -119,20 +119,22 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
|
|||||||
// Resampling
|
// Resampling
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
TArray<float> UElevenLabsMicrophoneCaptureComponent::ResampleTo16000(
|
TArray<float> UElevenLabsMicrophoneCaptureComponent::ResampleTo16000(
|
||||||
const float* InAudio, int32 NumSamples,
|
const float* InAudio, int32 NumFrames,
|
||||||
int32 InChannels, int32 InSampleRate)
|
int32 InChannels, int32 InSampleRate)
|
||||||
{
|
{
|
||||||
const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000
|
const int32 TargetRate = ElevenLabsAudio::SampleRate; // 16000
|
||||||
|
|
||||||
// --- Step 1: Downmix to mono ---
|
// --- Step 1: Downmix to mono ---
|
||||||
|
// NOTE: NumFrames is the number of audio frames (not total samples).
|
||||||
|
// Each frame contains InChannels samples (e.g. 2 for stereo).
|
||||||
|
// The raw buffer has NumFrames * InChannels total float values.
|
||||||
TArray<float> Mono;
|
TArray<float> Mono;
|
||||||
if (InChannels == 1)
|
if (InChannels == 1)
|
||||||
{
|
{
|
||||||
Mono = TArray<float>(InAudio, NumSamples);
|
Mono = TArray<float>(InAudio, NumFrames);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
const int32 NumFrames = NumSamples / InChannels;
|
|
||||||
Mono.Reserve(NumFrames);
|
Mono.Reserve(NumFrames);
|
||||||
for (int32 i = 0; i < NumFrames; i++)
|
for (int32 i = 0; i < NumFrames; i++)
|
||||||
{
|
{
|
||||||
|
|||||||
@ -72,7 +72,11 @@ void UElevenLabsWebSocketProxy::Connect(const FString& AgentIDOverride, const FS
|
|||||||
WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected);
|
WebSocket->OnConnected().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnected);
|
||||||
WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError);
|
WebSocket->OnConnectionError().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsConnectionError);
|
||||||
WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed);
|
WebSocket->OnClosed().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsClosed);
|
||||||
WebSocket->OnMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsMessage);
|
// NOTE: We bind ONLY OnRawMessage (binary frames), NOT OnMessage (text frames).
|
||||||
|
// UE's WebSocket implementation fires BOTH callbacks for the same frame when using
|
||||||
|
// the libwebsockets backend — binding both causes every audio packet to be decoded
|
||||||
|
// and played twice. OnRawMessage handles all frame types: raw binary audio AND
|
||||||
|
// text-framed JSON (detected by peeking first byte for '{').
|
||||||
WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage);
|
WebSocket->OnRawMessage().AddUObject(this, &UElevenLabsWebSocketProxy::OnWsBinaryMessage);
|
||||||
|
|
||||||
WebSocket->Connect();
|
WebSocket->Connect();
|
||||||
@ -94,36 +98,52 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
|||||||
{
|
{
|
||||||
if (!IsConnected())
|
if (!IsConnected())
|
||||||
{
|
{
|
||||||
UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected."));
|
UE_LOG(LogElevenLabsWS, Warning, TEXT("SendAudioChunk: not connected (state=%d). Audio dropped."),
|
||||||
|
(int32)ConnectionState);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (PCMData.Num() == 0) return;
|
if (PCMData.Num() == 0) return;
|
||||||
|
|
||||||
|
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
|
||||||
|
|
||||||
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
|
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
|
||||||
|
// The server's VAD detects silence to determine end-of-turn.
|
||||||
|
// Do NOT send user_activity here — it resets the turn timeout timer
|
||||||
|
// and would prevent the server from taking the turn after the user stops speaking.
|
||||||
const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num());
|
const FString Base64Audio = FBase64::Encode(PCMData.GetData(), PCMData.Num());
|
||||||
|
|
||||||
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
// Send as compact JSON (no pretty-printing) directly, bypassing SendJsonMessage
|
||||||
Msg->SetStringField(ElevenLabsMessageType::AudioChunk, Base64Audio);
|
// to avoid the pretty-printed writer and to keep the payload minimal.
|
||||||
SendJsonMessage(Msg);
|
const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio);
|
||||||
|
|
||||||
|
// Log the first two chunks (truncated to 200 characters) for debugging
|
||||||
|
static int32 AudioChunksSent = 0;
|
||||||
|
AudioChunksSent++;
|
||||||
|
if (AudioChunksSent <= 2)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Log, TEXT(" Audio JSON (first 200 chars): %.200s"), *AudioJson);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (WebSocket.IsValid() && WebSocket->IsConnected())
|
||||||
|
{
|
||||||
|
WebSocket->Send(AudioJson);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::SendUserTurnStart()
|
void UElevenLabsWebSocketProxy::SendUserTurnStart()
|
||||||
{
|
{
|
||||||
// In client turn mode, signal that the user is active/speaking.
|
// No-op: the ElevenLabs API does not require a "start speaking" signal.
|
||||||
// API message: { "type": "user_activity" }
|
// The server's VAD detects speech from the audio chunks we send.
|
||||||
if (!IsConnected()) return;
|
// user_activity is a keep-alive/timeout-reset message and should NOT be
|
||||||
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
// sent here — it would delay the agent's turn after the user stops.
|
||||||
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::UserActivity);
|
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow)."));
|
||||||
SendJsonMessage(Msg);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
||||||
{
|
{
|
||||||
// In client turn mode, stopping user_activity signals end of user turn.
|
// No explicit "end turn" message exists in the ElevenLabs API.
|
||||||
// The API uses user_activity for ongoing speech; simply stop sending it.
|
// The server detects end-of-speech via VAD when we stop sending audio chunks.
|
||||||
// No explicit end message is required — silence is detected server-side.
|
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
|
||||||
// We still log for debug visibility.
|
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended (client mode) — stopped sending user_activity."));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
|
void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
|
||||||
@ -155,8 +175,72 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
|
|||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
void UElevenLabsWebSocketProxy::OnWsConnected()
|
void UElevenLabsWebSocketProxy::OnWsConnected()
|
||||||
{
|
{
|
||||||
UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Waiting for conversation_initiation_metadata..."));
|
UE_LOG(LogElevenLabsWS, Log, TEXT("WebSocket connected. Sending conversation_initiation_client_data..."));
|
||||||
// State stays Connecting until we receive the initiation metadata from the server.
|
// State stays Connecting until we receive conversation_initiation_metadata from the server.
|
||||||
|
|
||||||
|
// ElevenLabs requires this message immediately after the WebSocket handshake to
|
||||||
|
// negotiate the session configuration. Without it, the server won't accept audio
|
||||||
|
// from the client (microphone stays silent from server perspective) and default
|
||||||
|
// settings are used (higher latency, no intermediate responses).
|
||||||
|
//
|
||||||
|
// Structure:
|
||||||
|
// {
|
||||||
|
// "type": "conversation_initiation_client_data",
|
||||||
|
// "conversation_config_override": {
|
||||||
|
// "agent": {
|
||||||
|
// "turn": { "turn_timeout": 3 }
|
||||||
|
// },
|
||||||
|
// "tts": {
|
||||||
|
// "optimize_streaming_latency": 3
|
||||||
|
// }
|
||||||
|
// },
|
||||||
|
// "custom_llm_extra_body": {
|
||||||
|
// "enable_intermediate_response": true
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
|
// Configure turn-taking behaviour.
|
||||||
|
// The ElevenLabs API does NOT have a turn.mode field.
|
||||||
|
// Turn-taking is controlled by the server's VAD and the turn_* parameters.
|
||||||
|
// In push-to-talk (Client mode) the user controls the mic; the server still
|
||||||
|
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
||||||
|
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||||
|
// Lower turn_timeout so the agent responds faster after the user stops speaking.
|
||||||
|
// Default is 7s which feels very slow for push-to-talk.
|
||||||
|
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||||
|
{
|
||||||
|
TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
|
||||||
|
}
|
||||||
|
|
||||||
|
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||||
|
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
||||||
|
|
||||||
|
TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
|
||||||
|
TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
|
||||||
|
|
||||||
|
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
|
||||||
|
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
|
||||||
|
ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
|
||||||
|
|
||||||
|
// enable_intermediate_response reduces time-to-first-audio by allowing the agent
|
||||||
|
// to start speaking before it has finished generating the full response.
|
||||||
|
TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
|
||||||
|
CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
|
||||||
|
|
||||||
|
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
|
||||||
|
InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
|
||||||
|
InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
|
||||||
|
InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
|
||||||
|
|
||||||
|
// NOTE: We bypass SendJsonMessage() here intentionally.
|
||||||
|
// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
|
||||||
|
// during the handshake before IsConnected() returns true in some UE WS backends.
|
||||||
|
// We know the socket is open at this point — send directly.
|
||||||
|
FString InitJson;
|
||||||
|
TSharedRef<TJsonWriter<>> InitWriter = TJsonWriterFactory<>::Create(&InitJson);
|
||||||
|
FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
|
||||||
|
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending initiation: %s"), *InitJson);
|
||||||
|
WebSocket->Send(InitJson);
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error)
|
void UElevenLabsWebSocketProxy::OnWsConnectionError(const FString& Error)
|
||||||
@ -200,6 +284,9 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Log every message type received from the server for debugging.
|
||||||
|
UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType);
|
||||||
|
|
||||||
if (MsgType == ElevenLabsMessageType::ConversationInitiation)
|
if (MsgType == ElevenLabsMessageType::ConversationInitiation)
|
||||||
{
|
{
|
||||||
HandleConversationInitiation(Root);
|
HandleConversationInitiation(Root);
|
||||||
|
|||||||
@ -183,4 +183,10 @@ private:
|
|||||||
// Accumulation buffer for multi-fragment binary WebSocket frames.
|
// Accumulation buffer for multi-fragment binary WebSocket frames.
|
||||||
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
|
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
|
||||||
TArray<uint8> BinaryFrameBuffer;
|
TArray<uint8> BinaryFrameBuffer;
|
||||||
|
|
||||||
|
public:
|
||||||
|
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||||
|
// Selects the turn-taking settings sent in conversation_initiation_client_data:
|
||||||
|
// Client mode lowers turn_timeout to 3; no turn mode string or user_activity is sent.
|
||||||
|
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user