v1.7.1: Fix mic not heard + latency optimizations + thread safety

Fix regression from v1.7.0 where agent couldn't hear user speech: - Restore AsyncTask game-thread dispatch for delegate broadcast (AddUObject weak pointer checks are not thread-safe from WASAPI thread) - Keep early echo suppression in WASAPI callback (before resampling) - Keep MicChunkMinBytes at 3200 (100ms) for lower latency - Add thread safety: std::atomic<bool> for bIsListening/bAgentSpeaking/bCapturing, FCriticalSection for MicSendLock and WebSocketSendLock - Add EchoSuppressFlag pointer from agent to mic component Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 08:46:15 +01:00
parent f23acc8c1c
commit 152fc6196d
7 changed files with 75 additions and 29 deletions
--- a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
+++ b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -247,6 +247,9 @@ void UElevenLabsConversationalAgentComponent::StartListening()
 	Mic->OnAudioCaptured.RemoveAll(this);
 	Mic->OnAudioCaptured.AddUObject(this,
 		&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
+	// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
+	// capture entirely (before resampling) while the agent is speaking.
+	Mic->EchoSuppressFlag = &bAgentSpeaking;
 	Mic->StartCapture();

 	const double T = TurnStartTime - SessionStartTime;
@@ -275,6 +278,8 @@ void UElevenLabsConversationalAgentComponent::StopListening()
 	// "user speaking" state and stall waiting for more audio that never arrives,
 	// leaving both sides stuck — no audio for the collision response and no response
 	// for subsequent turns.
+	{
+		FScopeLock Lock(&MicSendLock);
 		if (bAgentGenerating)
 		{
 			if (MicAccumulationBuffer.Num() > 0)
@@ -289,6 +294,7 @@ void UElevenLabsConversationalAgentComponent::StopListening()
 			WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
 		}
 		MicAccumulationBuffer.Reset();
+	}

 	if (WebSocketProxy && TurnMode == EElevenLabsTurnMode::Client)
 	{
@@ -394,7 +400,10 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
 	GeneratingTickCount = 0;
 	TurnIndex = 0;
 	LastClosedTurnIndex = 0;
+	{
+		FScopeLock Lock(&MicSendLock);
 		MicAccumulationBuffer.Reset();
+	}
 	OnAgentDisconnected.Broadcast(StatusCode, Reason);
 }

@@ -611,10 +620,14 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
 	if (bAgentSpeaking) return;

 	// Convert this callback's samples to int16 bytes and accumulate.
-	// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥250ms
-	// (8000 bytes) per chunk for reliable VAD and STT. We hold bytes here
+	// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
+	// (3200 bytes) per chunk for reliable VAD and STT. We hold bytes here
 	// until we have enough, then send the whole batch in one WebSocket frame.
 	TArray<uint8> PCMBytes = FloatPCMToInt16Bytes(FloatPCM);
+
+	// Lock: MicAccumulationBuffer is accessed from WASAPI thread (here) and
+	// game thread (StopListening flush). WebSocket send is also serialized.
+	FScopeLock Lock(&MicSendLock);
 	MicAccumulationBuffer.Append(PCMBytes);

 	if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsMicrophoneCaptureComponent.cpp
@@ -89,6 +89,12 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
 		UE_LOG(LogElevenLabsMic, Verbose, TEXT("Audio capture buffer overflow."));
 	}

+	// Echo suppression: skip resampling + broadcasting entirely when agent is speaking.
+	if (EchoSuppressFlag && EchoSuppressFlag->load(std::memory_order_relaxed))
+	{
+		return;
+	}
+
 	// Device sends float32 interleaved samples; cast from the void* API.
 	const float* FloatAudio = static_cast<const float*>(InAudio);

@@ -104,15 +110,20 @@ void UElevenLabsMicrophoneCaptureComponent::OnAudioGenerate(
 		}
 	}

-	// Fire the delegate on the game thread so subscribers don't need to be
-	// thread-safe (WebSocket Send is not thread-safe in UE's implementation).
-	AsyncTask(ENamedThreads::GameThread, [this, Data = MoveTemp(Resampled)]()
+	// Dispatch to game thread for delegate broadcast.
+	// UE's FMulticastDelegate with AddUObject uses weak object pointer checks that
+	// are not thread-safe — broadcasting from the WASAPI thread causes the invocation
+	// to be silently skipped. The game thread dispatch adds ~8ms latency but is required.
+	if (bCapturing)
+	{
+		AsyncTask(ENamedThreads::GameThread, [this, Captured = MoveTemp(Resampled)]()
 		{
 			if (bCapturing)
 			{
-			OnAudioCaptured.Broadcast(Data);
+				OnAudioCaptured.Broadcast(Captured);
 			}
 		});
+	}
 }

 // ─────────────────────────────────────────────────────────────────────────────
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -120,10 +120,13 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 	// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
 	UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());

+	{
+		FScopeLock Lock(&WebSocketSendLock);
 		if (WebSocket.IsValid() && WebSocket->IsConnected())
 		{
 			WebSocket->Send(AudioJson);
 		}
+	}
 }

 void UElevenLabsWebSocketProxy::SendUserTurnStart()
@@ -658,7 +661,10 @@ void UElevenLabsWebSocketProxy::SendJsonMessage(const TSharedPtr<FJsonObject>& J
 		UE_LOG(LogElevenLabsWS, Verbose, TEXT("<< %s"), *Out);
 	}

+	{
+		FScopeLock Lock(&WebSocketSendLock);
 		WebSocket->Send(Out);
+	}
 }

 FString UElevenLabsWebSocketProxy::BuildWebSocketURL(const FString& AgentIDOverride, const FString& APIKeyOverride) const
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -7,6 +7,7 @@
 #include "ElevenLabsDefinitions.h"
 #include "ElevenLabsWebSocketProxy.h"
 #include "Sound/SoundWaveProcedural.h"
+#include <atomic>
 #include "ElevenLabsConversationalAgentComponent.generated.h"

 class UAudioComponent;
@@ -337,8 +338,9 @@ private:
 	USoundWaveProcedural* ProceduralSoundWave = nullptr;

 	// ── State ─────────────────────────────────────────────────────────────────
-	bool bIsListening = false;
-	bool bAgentSpeaking = false;
+	// Atomic: read from WASAPI background thread (OnMicrophoneDataCaptured), written from game thread.
+	std::atomic<bool> bIsListening{false};
+	std::atomic<bool> bAgentSpeaking{false};
 	// True from the first agent_chat_response_part until the first audio chunk arrives.
 	// Used to block StartListening() while the server is processing the previous turn.
 	bool bAgentGenerating = false;
@@ -399,6 +401,9 @@ private:
 	// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
 	// ElevenLabs needs at least ~100ms (3200 bytes) per chunk for reliable VAD/STT.
 	// We accumulate here and only call SendAudioChunk once enough bytes are ready.
+	// MicSendLock protects MicAccumulationBuffer + WebSocket send (accessed from WASAPI thread
+	// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
 	TArray<uint8> MicAccumulationBuffer;
-	static constexpr int32 MicChunkMinBytes = 8000; // 250ms @ 16kHz 16-bit mono (4000 samples, matches ElevenLabs SDK recommendation)
+	FCriticalSection MicSendLock;
+	static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples)
 };
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsMicrophoneCaptureComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsMicrophoneCaptureComponent.h
@@ -5,6 +5,7 @@
 #include "CoreMinimal.h"
 #include "Components/ActorComponent.h"
 #include "AudioCapture.h"
+#include <atomic>
 #include "ElevenLabsMicrophoneCaptureComponent.generated.h"

 // Delivers captured float PCM samples (16000 Hz mono, resampled from device rate).
@@ -33,11 +34,17 @@ public:
 	float VolumeMultiplier = 1.0f;

 	/**
-	 * Delegate fired on the game thread each time a new chunk of PCM audio
-	 * is captured. Samples are float32, resampled to 16000 Hz mono.
+	 * Delegate fired on the game thread each time a new chunk of PCM audio is
+	 * captured. Samples are float32, resampled to 16000 Hz mono.
+	 * Audio is captured on a WASAPI background thread, resampled there (with
+	 * echo suppression), then dispatched to the game thread for this broadcast.
 	 */
 	FOnElevenLabsAudioCaptured OnAudioCaptured;

+	/** Optional pointer to an atomic bool that suppresses capture when true.
+	 *  Set by the agent component for echo suppression (skip mic while agent speaks). */
+	std::atomic<bool>* EchoSuppressFlag = nullptr;
+
 	/** Open the default capture device and begin streaming audio. */
 	UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
 	void StartCapture();
@@ -65,7 +72,7 @@ private:

 	Audio::FAudioCapture AudioCapture;
 	Audio::FAudioCaptureDeviceParams DeviceParams;
-	bool bCapturing = false;
+	std::atomic<bool> bCapturing{false};

 	// Device sample rate discovered on StartCapture
 	int32 DeviceSampleRate = 44100;
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -205,6 +205,10 @@ private:
 	EElevenLabsConnectionState ConnectionState = EElevenLabsConnectionState::Disconnected;
 	FElevenLabsConversationInfo ConversationInfo;

+	// Serializes WebSocket->Send() calls — needed because SendAudioChunk can now be
+	// called from the WASAPI background thread while SendJsonMessage runs on game thread.
+	FCriticalSection WebSocketSendLock;
+
 	// Accumulation buffer for multi-fragment binary WebSocket frames.
 	// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
 	TArray<uint8> BinaryFrameBuffer;