Fix audio chunk size: accumulate mic audio to 100ms before sending

WASAPI fires mic callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
ElevenLabs VAD/STT requires a minimum of ~100ms (3200 bytes) per chunk.
Tiny fragments arrived at the server but were never processed, so the
agent never transcribed or responded to user speech.

Fix: OnMicrophoneDataCaptured now appends to MicAccumulationBuffer and
only calls SendAudioChunk once >= 3200 bytes are accumulated. StopListening
flushes any remaining bytes before sending UserTurnEnd so the final words
of an utterance are never discarded. HandleDisconnected also clears the
buffer to prevent stale data on reconnect.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-19 18:41:58 +01:00
parent 993a827c7b
commit 91cf5b1bb4
2 changed files with 41 additions and 2 deletions

View File

@@ -86,6 +86,10 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
}
// Pass our TurnMode to the proxy so it sends the correct mode in
// conversation_initiation_client_data and sends user_activity with each audio chunk.
WebSocketProxy->TurnMode = TurnMode;
WebSocketProxy->Connect(AgentID);
}
@@ -128,6 +132,9 @@ void UElevenLabsConversationalAgentComponent::StartListening()
Mic->RegisterComponent();
}
// Always remove existing binding first to prevent duplicate delegates stacking
// up if StartListening is called more than once without a matching StopListening.
Mic->OnAudioCaptured.RemoveAll(this);
Mic->OnAudioCaptured.AddUObject(this,
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
Mic->StartCapture();
@@ -147,6 +154,15 @@ void UElevenLabsConversationalAgentComponent::StopListening()
Mic->OnAudioCaptured.RemoveAll(this);
}
// Flush any partially-accumulated mic audio before signalling end-of-turn.
// This ensures the final words aren't discarded just because the last callback
// didn't push the buffer over the MicChunkMinBytes threshold.
if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
}
MicAccumulationBuffer.Reset();
if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client)
{
WebSocketProxy->SendUserTurnEnd();
@@ -193,7 +209,12 @@ void UElevenLabsConversationalAgentComponent::HandleConnected(const FElevenLabsC
UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent connected. ConversationID=%s"), *Info.ConversationID);
OnAgentConnected.Broadcast(Info);
if (bAutoStartListening)
// In Client turn mode (push-to-talk), the user controls listening manually via
// StartListening()/StopListening(). Auto-starting would leave the mic open
// permanently and interfere with push-to-talk — the T-release StopListening()
// would close the mic that auto-start opened, leaving the user unable to speak.
// Only auto-start in Server VAD mode where the mic stays open the whole session.
if (bAutoStartListening && TurnMode == EElevenLabsTurnMode::Server)
{
StartListening();
}
@@ -204,6 +225,7 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason);
bIsListening = false;
bAgentSpeaking = false;
MicAccumulationBuffer.Reset();
OnAgentDisconnected.Broadcast(StatusCode, Reason);
}
@@ -321,8 +343,18 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
{
if (!IsConnected() || !bIsListening) return;
// Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
// until we have enough, then send the whole batch in one WebSocket frame.
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
WebSocketProxy->SendAudioChunk(PCMBytes);
MicAccumulationBuffer.Append(PCMBytes);
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
MicAccumulationBuffer.Reset();
}
}
TArray<uint8> UElevenLabsConversationalAgentComponent::FloatPCMToInt16Bytes(const TArray<float>& FloatPCM)

View File

@@ -230,4 +230,11 @@ private:
// consider the agent done speaking.
int32 SilentTickCount = 0;
static constexpr int32 SilenceThresholdTicks = 30; // ~0.5s at 60fps
// ── Microphone accumulation ───────────────────────────────────────────────
// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
// We accumulate here and only call SendAudioChunk once enough bytes are ready.
TArray<uint8> MicAccumulationBuffer;
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono
};