diff --git a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset index a4affba..e58641e 100644 Binary files a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset and b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index 4e4e0f4..d066e46 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -247,6 +247,9 @@ void UElevenLabsConversationalAgentComponent::StartListening() Mic->OnAudioCaptured.RemoveAll(this); Mic->OnAudioCaptured.AddUObject(this, &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); + // Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips + // capture entirely (before resampling) while the agent is speaking. + Mic->EchoSuppressFlag = &bAgentSpeaking; Mic->StartCapture(); const double T = TurnStartTime - SessionStartTime; @@ -275,20 +278,23 @@ void UElevenLabsConversationalAgentComponent::StopListening() // "user speaking" state and stall waiting for more audio that never arrives, // leaving both sides stuck — no audio for the collision response and no response // for subsequent turns. - if (bAgentGenerating) { - if (MicAccumulationBuffer.Num() > 0) + FScopeLock Lock(&MicSendLock); + if (bAgentGenerating) { - UE_LOG(LogElevenLabsAgent, Log, - TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."), - MicAccumulationBuffer.Num()); + if (MicAccumulationBuffer.Num() > 0) + { + UE_LOG(LogElevenLabsAgent, Log, + TEXT("StopListening: discarding %d bytes of accumulated mic audio (collision — server is mid-generation)."), + MicAccumulationBuffer.Num()); + } } + else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected()) + { + WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); + } + MicAccumulationBuffer.Reset(); } - else if (MicAccumulationBuffer.Num() > 0 && WebSocketProxy && IsConnected()) - { - WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); - } - MicAccumulationBuffer.Reset(); if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client) { @@ -394,7 +400,10 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod GeneratingTickCount = 0; TurnIndex = 0; LastClosedTurnIndex = 0; - MicAccumulationBuffer.Reset(); + { + FScopeLock Lock(&MicSendLock); + MicAccumulationBuffer.Reset(); + } OnAgentDisconnected.Broadcast(StatusCode, Reason); } @@ -611,10 +620,14 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr if (bAgentSpeaking) return; // Convert this callback's samples to int16 bytes and accumulate. - // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms - // (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here + // WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms + // (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here // until we have enough, then send the whole batch in one WebSocket frame. TArray PCMBytes = FloatPCMToInt16Bytes(FloatPCM); + + // Lock: MicAccumulationBuffer is accessed from WASAPI thread (here) and + // game thread (StopListening flush). WebSocket send is also serialized. + FScopeLock Lock(&MicSendLock); MicAccumulationBuffer.Append(PCMBytes); if (MicAccumulationBuffer.Num() >= MicChunkMinBytes) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp index a0685cd..e9bea15 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp @@ -89,6 +89,12 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate( UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow.")); } + // Echo suppression: skip resampling + broadcasting entirely when agent is speaking. + if (EchoSuppressFlag && EchoSuppressFlag->load(std::memory_order_relaxed)) + { + return; + } + // Device sends float32 interleaved samples; cast from the void* API. const float* FloatAudio = static_cast(InAudio); @@ -104,15 +110,20 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate( } } - // Fire the delegate on the game thread so subscribers don't need to be - // thread-safe (WebSocket Send is not thread-safe in UE's implementation). - AsyncTask(ENamedThreads::GameThread, [this, Data = MoveTemp(Resampled)]() + // Dispatch to game thread for delegate broadcast. + // UE's FMulticastDelegate with AddUObject uses weak object pointer checks that + // are not thread-safe — broadcasting from the WASAPI thread causes the invocation + // to be silently skipped. The game thread dispatch adds ~8ms latency but is required. + if (bCapturing) { - if (bCapturing) + AsyncTask(ENamedThreads::GameThread, [this, Captured = MoveTemp(Resampled)]() { - OnAudioCaptured.Broadcast(Data); - } - }); + if (bCapturing) + { + OnAudioCaptured.Broadcast(Captured); + } + }); + } } // ───────────────────────────────────────────────────────────────────────────── diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index 83f273d..0dd8219 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -120,9 +120,12 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray& PCMData) // Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second). UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num()); - if (WebSocket.IsValid() && WebSocket->IsConnected()) { - WebSocket->Send(AudioJson); + FScopeLock Lock(&WebSocketSendLock); + if (WebSocket.IsValid() && WebSocket->IsConnected()) + { + WebSocket->Send(AudioJson); + } } } @@ -658,7 +661,10 @@ void UElevenLabsWebSocketProxy::SendJsonMessage(const TSharedPtr& J UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out); } - WebSocket->Send(Out); + { + FScopeLock Lock(&WebSocketSendLock); + WebSocket->Send(Out); + } } FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index 27f93b7..bd1334c 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -7,6 +7,7 @@ #include "ElevenLabsDefinitions.h" #include "ElevenLabsWebSocketProxy.h" #include "Sound/SoundWaveProcedural.h" +#include #include "ElevenLabsConversationalAgentComponent.generated.h" class UAudioComponent; @@ -337,8 +338,9 @@ private: USoundWaveProcedural* ProceduralSoundWave = nullptr; // ── State ───────────────────────────────────────────────────────────────── - bool bIsListening = false; - bool bAgentSpeaking = false; + // Atomic: read from WASAPI background thread (OnMicrophoneDataCaptured), written from game thread. + std::atomic bIsListening{false}; + std::atomic bAgentSpeaking{false}; // True from the first agent_chat_response_part until the first audio chunk arrives. // Used to block StartListening() while the server is processing the previous turn. bool bAgentGenerating = false; @@ -399,6 +401,9 @@ private: // WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). // ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT. // We accumulate here and only call SendAudioChunk once enough bytes are ready. + // MicSendLock protects MicAccumulationBuffer + WebSocket send (accessed from WASAPI thread + // in OnMicrophoneDataCaptured and from game thread in StopListening flush). TArray MicAccumulationBuffer; - static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation) + FCriticalSection MicSendLock; + static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples) }; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsMicrophoneCaptureComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsMicrophoneCaptureComponent.h index 175c57d..de58348 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsMicrophoneCaptureComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsMicrophoneCaptureComponent.h @@ -5,6 +5,7 @@ #include "CoreMinimal.h" #include "Components/ActorComponent.h" #include "AudioCapture.h" +#include #include "ElevenLabsMicrophoneCaptureComponent.generated.h" // Delivers captured float PCM samples (16000 Hz mono, resampled from device rate). @@ -33,11 +34,17 @@ public: float VolumeMultiplier = 1.0f; /** - * Delegate fired on the game thread each time a new chunk of PCM audio - * is captured. Samples are float32, resampled to 16000 Hz mono. + * Delegate fired on the game thread each time a new chunk of PCM audio is + * captured. Samples are float32, resampled to 16000 Hz mono. + * Audio is captured on a WASAPI background thread, resampled there (with + * echo suppression), then dispatched to the game thread for this broadcast. */ FOnElevenLabsAudioCaptured OnAudioCaptured; + /** Optional pointer to an atomic bool that suppresses capture when true. + * Set by the agent component for echo suppression (skip mic while agent speaks). */ + std::atomic* EchoSuppressFlag = nullptr; + /** Open the default capture device and begin streaming audio. */ UFUNCTION(BlueprintCallable, Category = "ElevenLabs") void StartCapture(); @@ -65,7 +72,7 @@ private: Audio::FAudioCapture AudioCapture; Audio::FAudioCaptureDeviceParams DeviceParams; - bool bCapturing = false; + std::atomic bCapturing{false}; // Device sample rate discovered on StartCapture int32 DeviceSampleRate = 44100; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h index fa1ee67..c1a3cb7 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h @@ -205,6 +205,10 @@ private: EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected; FElevenLabsConversationInfo ConversationInfo; + // Serializes WebSocket->Send() calls — needed because SendAudioChunk can now be + // called from the WASAPI background thread while SendJsonMessage runs on game thread. + FCriticalSection WebSocketSendLock; + // Accumulation buffer for multi-fragment binary WebSocket frames. // ElevenLabs sends JSON as binary frames; large messages arrive in fragments. TArray BinaryFrameBuffer;