diff --git a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset index e58641e..0bd20df 100644 Binary files a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset and b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index d066e46..812bb1b 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -249,7 +249,17 @@ void UElevenLabsConversationalAgentComponent::StartListening() &UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured); // Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips // capture entirely (before resampling) while the agent is speaking. - Mic->EchoSuppressFlag = &bAgentSpeaking; + // In Server VAD + interruption mode, disable echo suppression so the server + // receives the user's voice even during agent playback — the server's own VAD + // handles echo filtering and interruption detection. + if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption) + { + Mic->EchoSuppressFlag = nullptr; + } + else + { + Mic->EchoSuppressFlag = &bAgentSpeaking; + } Mic->StartCapture(); const double T = TurnStartTime - SessionStartTime; @@ -460,22 +470,26 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted() const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0; if (bIsListening) { - // Collision: server started generating Turn N's response while Turn M (M>N) mic was open. 
- // The server's VAD detected a pause in the user's speech and started generating - // prematurely — the user hasn't finished speaking yet. - // - // Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's - // bAgentGenerating guard). Flushing would send audio to a server that is mid-generation, - // causing it to re-enter "user speaking" state and stall — both sides stuck. - // - // Do NOT send an interrupt here — just let the server's response play out: - // - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally. - // - If audio never arrives → generating timeout (10s) clears bAgentGenerating. - // Either way the state machine recovers and Blueprint can reopen the mic. - UE_LOG(LogElevenLabsAgent, Log, - TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"), - T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd); - StopListening(); + // In Server VAD + interruption mode, keep the mic open so the server can + // detect if the user speaks over the agent and send an interruption event. + // The server handles echo filtering and VAD — we just keep streaming audio. + if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption) + { + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"), + T, LastClosedTurnIndex, LatencyFromTurnEnd); + } + else + { + // Collision: server started generating Turn N's response while Turn M (M>N) mic was open. + // Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's + // bAgentGenerating guard). Flushing would send audio to a server that is mid-generation, + // causing it to re-enter "user speaking" state and stall — both sides stuck. 
+ UE_LOG(LogElevenLabsAgent, Log, + TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"), + T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd); + StopListening(); + } } UE_LOG(LogElevenLabsAgent, Log, @@ -615,9 +629,13 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr // Echo suppression: skip sending mic audio while the agent is speaking. // This prevents the agent from hearing its own voice through the speakers, - // which would confuse the server's VAD and STT. Matches the approach used - // in the official ElevenLabs C++ SDK (outputPlaying_ flag). - if (bAgentSpeaking) return; + // which would confuse the server's VAD and STT. + // In Server VAD + interruption mode, keep sending audio so the server can + // detect the user speaking over the agent and trigger an interruption. + if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)) + { + return; + } // Convert this callback's samples to int16 bytes and accumulate. // WASAPI fires every ~5ms (158 bytes at 16kHz). 
ElevenLabs needs ≥100ms @@ -630,7 +648,7 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr FScopeLock Lock(&MicSendLock); MicAccumulationBuffer.Append(PCMBytes); - if (MicAccumulationBuffer.Num() >= MicChunkMinBytes) + if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes()) { WebSocketProxy->SendAudioChunk(MicAccumulationBuffer); MicAccumulationBuffer.Reset(); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index 0dd8219..65b86f9 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -588,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr* PartEvent = nullptr; - if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent) + FString PartText; + bool bFound = false; + + // Try current format: text_response_part.text + const TSharedPtr* TextPart = nullptr; + if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart) { - FString PartText; - if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty()) + (*TextPart)->TryGetStringField(TEXT("text"), PartText); + bFound = true; + } + + // Fallback: legacy format agent_chat_response_part_event.agent_response_part + if (!bFound) + { + const TSharedPtr* PartEvent = nullptr; + if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent) { - OnAgentResponsePart.Broadcast(PartText); + (*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText); + bFound = true; } } + + if (bFound && !PartText.IsEmpty()) + { + 
OnAgentResponsePart.Broadcast(PartText); + } } void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr& Root) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index bd1334c..ecd5923 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -118,6 +118,18 @@ public: UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency") bool bSpeculativeTurn = false; + /** + * Duration in milliseconds of each microphone audio chunk sent to ElevenLabs. + * WASAPI captures audio every ~5ms, but sending tiny chunks degrades VAD/STT + * accuracy. We accumulate audio and send once this duration is reached. + * - Lower values (50-80ms): less latency, but VAD may be less reliable. + * - Higher values (150-250ms): more reliable VAD, but adds latency. + * Default: 100ms (3200 bytes at 16kHz 16-bit mono). + */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency", + meta = (ClampMin = "20", ClampMax = "500", Units = "ms")) + int32 MicChunkDurationMs = 100; + /** * Allow the user to interrupt the agent while it is playing audio (speaking). * When true, calling StartListening() while the agent is audibly speaking automatically @@ -405,5 +417,8 @@ private: // in OnMicrophoneDataCaptured and from game thread in StopListening flush). TArray MicAccumulationBuffer; FCriticalSection MicSendLock; - static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples) + + /** Compute the minimum bytes from the user-facing MicChunkDurationMs. 
+ * Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms. The duration is clamped to [20, 500] ms before conversion: the ClampMin/ClampMax meta on MicChunkDurationMs only limits editor input, not Blueprint or C++ writes at runtime, and a value <= 0 would make every capture callback send its own tiny chunk. */ + int32 GetMicChunkMinBytes() const { return FMath::Clamp(MicChunkDurationMs, 20, 500) * 32; } };