From 91cf5b1bb4159d93a590a9027654b4d6e0ebaaf3 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 19 Feb 2026 18:41:58 +0100 Subject: [PATCH] Fix audio chunk size: accumulate mic audio to 100ms before sending WASAPI fires mic callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). ElevenLabs VAD/STT requires a minimum of ~100ms (3200 bytes) per chunk. Tiny fragments arrived at the server but were never processed, so the agent never transcribed or responded to user speech. Fix: OnMicrophoneDataCaptured now appends to MicAccumulationBuffer and only calls SendAudioChunk once >= 3200 bytes are accumulated. StopListening flushes any remaining bytes before sending UserTurnEnd so the final words of an utterance are never discarded. HandleDisconnected also clears the buffer to prevent stale data on reconnect. Co-Authored-By: Claude Opus 4.6 --- ...ElevenLabsConversationalAgentComponent.cpp | 36 +++++++++++++++++-- .../ElevenLabsConversationalAgentComponent.h | 7 ++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index 5276d6a..7ca14c9 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -86,6 +86,10 @@ void UElevenLabsConversationalAgentComponent::StartConversation() &UElevenLabsConversationalAgentComponent::HandleInterrupted); } + // Pass our TurnMode to the proxy so it sends the correct mode in + // conversation_initiation_client_data and sends user_activity with each audio chunk. 
+ WebSocketProxy->TurnMode = TurnMode; + WebSocketProxy->Connect(AgentID); } @@ -128,6 +132,9 @@ void UElevenLabsConversationalAgentComponent::StartListening() Mic->RegisterComponent(); } + // Always remove existing binding first to prevent duplicate delegates stacking + // up if StartListening is called more than once without a matching StopListening. + Mic->OnAudioCaptured.RemoveAll(this); Mic->OnAudioCaptured.AddUObject(this, &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); Mic->StartCapture(); @@ -147,6 +154,15 @@ void UElevenLabsConversationalAgentComponent::StopListening() Mic->OnAudioCaptured.RemoveAll(this); } + // Flush any partially-accumulated mic audio before signalling end-of-turn. + // This ensures the final words aren't discarded just because the last callback + // didn't push the buffer over the MicChunkMinBytes threshold. + if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected()) + { + WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); + } + MicAccumulationBuffer.Reset(); + if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client) { WebSocketProxy->SendUserTurnEnd(); @@ -193,7 +209,12 @@ void UElevenLabsConversationalAgentComponent::HandleConnected(const FElevenLabsC UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent connected. ConversationID=%s"), *Info.ConversationID); OnAgentConnected.Broadcast(Info); - if (bAutoStartListening) + // In Client turn mode (push-to-talk), the user controls listening manually via + // StartListening()/StopListening(). Auto-starting would leave the mic open + // permanently and interfere with push-to-talk — the T-release StopListening() + // would close the mic that auto-start opened, leaving the user unable to speak. + // Only auto-start in Server VAD mode where the mic stays open the whole session. 
+ if (bAutoStartListening && TurnMode == EElevenLabsTurnMode::Server) { StartListening(); } @@ -204,6 +225,7 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason); bIsListening = false; bAgentSpeaking = false; + MicAccumulationBuffer.Reset(); OnAgentDisconnected.Broadcast(StatusCode, Reason); } @@ -321,8 +343,18 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr { if (!IsConnected() || !bIsListening) return; + // Convert this callback's samples to int16 bytes and accumulate. + // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms + // (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here + // until we have enough, then send the whole batch in one WebSocket frame. TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM); - WebSocketProxy->SendAudioChunk(PCMBytes); + MicAccumulationBuffer.Append(PCMBytes); + + if (MicAccumulationBuffer.Num() >= MicChunkMinBytes) + { + WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); + MicAccumulationBuffer.Reset(); + } } TArray<uint8> UElevenLabsConversationalAgentComponent::FloatPCMToInt16Bytes(const TArray<float>& FloatPCM) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index 759e1d9..de581e5 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -230,4 +230,11 @@ private: // consider the agent done speaking.
int32 SilentTickCount = 0; static constexpr int32 SilenceThresholdTicks = 30; // ~0.5s at 60fps + + // ── Microphone accumulation ─────────────────────────────────────────────── + // WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). + // ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT. + // We accumulate here and only call SendAudioChunk once enough bytes are ready. + TArray<uint8> MicAccumulationBuffer; + static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono };