diff --git a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset index 83bff19..748ebd5 100644 Binary files a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset and b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index a505247..c13d380 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -459,9 +459,7 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted() // bAgentGenerating guard). Flushing would send audio to a server that is mid-generation, // causing it to re-enter "user speaking" state and stall — both sides stuck. // - // Do NOT send an interrupt here: the ElevenLabs server does not always send the - // interruption ack, which would leave bIgnoreIncomingContent=true and silently - // discard all subsequent content. Instead, let the server's response play out: + // Do NOT send an interrupt here — just let the server's response play out: // - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally. // - If audio never arrives → generating timeout (10s) clears bAgentGenerating. // Either way the state machine recovers and Blueprint can reopen the mic. diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index de6ffee..3f3215b 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -158,20 +158,6 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd() // in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop. // The flag is only reset in SendUserTurnStart() at the beginning of a new user turn. - // Clear the interrupt-ignore flag if it was never cleared by an "interruption" server ack. - // The ElevenLabs server does not always send the "interruption" acknowledgement reliably. - // By the time the user has spoken a full new turn (seconds of audio), any in-flight content - // from the previously interrupted generation has long since arrived — it is safe to resume - // normal content processing so the server's response to this new turn is not silently discarded. - if (bIgnoreIncomingContent) - { - bIgnoreIncomingContent = false; - const double T = UserTurnEndTime - SessionStartTime; - UE_LOG(LogElevenLabsWS, Log, - TEXT("[T+%.2fs] Cleared interrupt-ignore flag at turn end (server 'interruption' ack was not received — resuming content processing)."), - T); - } - const double T = UserTurnEndTime - SessionStartTime; UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T); } @@ -196,12 +182,7 @@ void UElevenLabsWebSocketProxy::SendInterrupt() { if (!IsConnected()) return; - // Immediately start discarding in-flight audio and chat response parts from - // the generation we are about to interrupt. The server may still send several - // frames before it processes our interrupt. We stop ignoring once the server - // sends its "interruption" acknowledgement (HandleInterruption). - bIgnoreIncomingContent = true; - UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks.")); + UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt.")); TSharedPtr Msg = MakeShareable(new FJsonObject()); Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt); @@ -467,17 +448,9 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size, } // Broadcast raw PCM bytes directly to the audio queue. - // Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse). TArray PCMData = MoveTemp(BinaryFrameBuffer); BinaryFrameBuffer.Reset(); - if (!bIgnoreIncomingContent) - { - OnAudioReceived.Broadcast(PCMData); - } - else - { - UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack).")); - } + OnAudioReceived.Broadcast(PCMData); } } @@ -507,15 +480,6 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtr& Root) { - // Discard audio that belongs to an interrupted generation. - // The server may send several more audio frames after we sent "interrupt" — - // they must not restart the speaking state on the client side. - if (bIgnoreIncomingContent) - { - UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack).")); - return; - } - // Expected structure: // { "type": "audio", // "audio_event": { "audio_base_64": "", "event_id": 1 } @@ -569,16 +533,6 @@ void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr& void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr& Root) { - // ISSUE-19: discard agent_response that belongs to an interrupted generation. - // A stale agent_response from the cancelled turn would set bAgentResponseReceived=true - // on the component, allowing the silence-detection Tick to fire OnAgentStoppedSpeaking - // at the wrong time (no audio is currently playing for the new turn yet). - if (bIgnoreIncomingContent) - { - UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_response (interrupt pending server ack).")); - return; - } - // ISSUE-22: reset bAgentResponseStartedFired so OnAgentResponseStarted fires again on // the next turn. In Server VAD mode SendUserTurnStart() is never called — it is the only // other place that resets this flag — so without this reset, OnAgentResponseStarted fires @@ -604,16 +558,6 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr& Root) { - // Ignore response parts that belong to a generation we have already interrupted. - // Without this guard, old parts arriving after SendInterrupt() would re-trigger - // OnAgentResponseStarted (bAgentResponseStartedFired was reset in SendUserTurnStart), - // causing the component to stop the newly-opened microphone — creating an infinite loop. - if (bIgnoreIncomingContent) - { - UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_chat_response_part (interrupt pending server ack).")); - return; - } - // agent_chat_response_part = the server is actively generating a response (LLM token stream). // Fire OnAgentResponseStarted once per turn so the component can auto-stop the microphone // if the Blueprint restarted listening before the server finished processing the previous turn. @@ -647,10 +591,7 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr& Root) { - // Server has acknowledged the interruption — the old generation is fully stopped. - // Resume accepting incoming audio and chat response parts (for the next turn). - bIgnoreIncomingContent = false; - UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing).")); + UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received).")); OnInterrupted.Broadcast(); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h index b75d6e7..1cc0efc 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h @@ -226,12 +226,6 @@ private: // Used to compute [T+Xs] session-relative timestamps in all log messages. double SessionStartTime = 0.0; - // Set to true in SendInterrupt() so that in-flight audio frames and - // agent_chat_response_part messages from the interrupted generation are silently - // discarded instead of re-triggering the speaking/generating state. - // Cleared when the server sends its "interruption" acknowledgement. - bool bIgnoreIncomingContent = false; - public: // Set by UElevenLabsConversationalAgentComponent before calling Connect(). // Controls turn_timeout in conversation_initiation_client_data.