diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index 5276d6a..7ca14c9 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -86,6 +86,10 @@ void UElevenLabsConversationalAgentComponent::StartConversation() &UElevenLabsConversationalAgentComponent::HandleInterrupted); } + // Pass our TurnMode to the proxy so it sends the correct mode in + // conversation_initiation_client_data and sends user_activity with each audio chunk. + WebSocketProxy->TurnMode = TurnMode; + WebSocketProxy->Connect(AgentID); } @@ -128,6 +132,9 @@ void UElevenLabsConversationalAgentComponent::StartListening() Mic->RegisterComponent(); } + // Always remove existing binding first to prevent duplicate delegates stacking + // up if StartListening is called more than once without a matching StopListening. + Mic->OnAudioCaptured.RemoveAll(this); Mic->OnAudioCaptured.AddUObject(this, &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); Mic->StartCapture(); @@ -147,6 +154,15 @@ void UElevenLabsConversationalAgentComponent::StopListening() Mic->OnAudioCaptured.RemoveAll(this); } + // Flush any partially-accumulated mic audio before signalling end-of-turn. + // This ensures the final words aren't discarded just because the last callback + // didn't push the buffer over the MicChunkMinBytes threshold. 
+ if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected()) + { + WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); + } + MicAccumulationBuffer.Reset(); + if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client) { WebSocketProxy->SendUserTurnEnd(); @@ -193,7 +209,12 @@ void UElevenLabsConversationalAgentComponent::HandleConnected(const FElevenLabsC UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent connected. ConversationID=%s"), *Info.ConversationID); OnAgentConnected.Broadcast(Info); - if (bAutoStartListening) + // In Client turn mode (push-to-talk), the user controls listening manually via + // StartListening()/StopListening(). Auto-starting would leave the mic open + // permanently and interfere with push-to-talk — the T-release StopListening() + // would close the mic that auto-start opened, leaving the user unable to speak. + // Only auto-start in Server VAD mode where the mic stays open the whole session. + if (bAutoStartListening && TurnMode == EElevenLabsTurnMode::Server) { StartListening(); } @@ -204,6 +225,7 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason); bIsListening = false; bAgentSpeaking = false; + MicAccumulationBuffer.Reset(); OnAgentDisconnected.Broadcast(StatusCode, Reason); } @@ -321,8 +343,18 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr { if (!IsConnected() || !bIsListening) return; + // Convert this callback's samples to int16 bytes and accumulate. + // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms + // (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here + // until we have enough, then send the whole batch in one WebSocket frame. 
TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM); - WebSocketProxy->SendAudioChunk(PCMBytes); + MicAccumulationBuffer.Append(PCMBytes); + + if (MicAccumulationBuffer.Num() >= MicChunkMinBytes) + { + WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); + MicAccumulationBuffer.Reset(); + } } TArray<uint8> UElevenLabsConversationalAgentComponent::FloatPCMToInt16Bytes(const TArray<float>& FloatPCM) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index 759e1d9..de581e5 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -230,4 +230,11 @@ private: // consider the agent done speaking. int32 SilentTickCount = 0; static constexpr int32 SilenceThresholdTicks = 30; // ~0.5s at 60fps + + // ── Microphone accumulation ─────────────────────────────────────────────── + // WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). + // ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT. + // We accumulate here and only call SendAudioChunk once enough bytes are ready. + TArray<uint8> MicAccumulationBuffer; + static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono };