From e5a32f599702f9f565b352eb974f7dd4414690fc Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 12 Mar 2026 09:50:43 +0100 Subject: [PATCH] Fix audio cutoff and lip sync activation bugs during agent switching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix A→B→A audio cutoff: when switching back to a pending-leave agent, cancel the deferred leave instead of force-completing it (was calling StopAgentAudio on the agent we're returning to) - Fix deferred leave firing during TTS gaps: use IsAgentSpeakingOrPending() instead of IsAgentSpeaking() — checks bAgentGenerating and bAgentResponseReceived to avoid premature leave during inter-batch silence - Convert silence detection from tick-based to time-based: SilentTickCount → SilentTime (float seconds), GeneratingTickCount → GeneratingTime. Consistent behavior regardless of frame rate (was 5s@120fps vs 20s@30fps) - Fix lazy binding: add OnAgentConnected/OnAgentDisconnected in LipSync and FacialExpression TickComponent lazy-bind path (bActive stayed false forever in packaged builds when component init order differed) - Fix reconnection: reset bWaitingForAgentResponse and GeneratingTime before entering reconnect mode to avoid stale state on new session - Fix event_ID audio filtering: reset LastInterruptEventId in HandleAgentResponse and SendUserTurnStart so first audio chunks of a new turn are not silently discarded by stale interrupt filter - Preserve retained gaze when switching back to same agent (don't CleanupRetainedGaze if PrevRetained == NewAgent) Co-Authored-By: Claude Opus 4.6 --- .../PS_AI_ConvAgent_ElevenLabsComponent.cpp | 49 ++++++++++--------- ...AI_ConvAgent_FacialExpressionComponent.cpp | 9 ++++ .../PS_AI_ConvAgent_InteractionComponent.cpp | 44 ++++++++++++----- .../PS_AI_ConvAgent_LipSyncComponent.cpp | 10 ++++ ...AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp | 8 +++ .../PS_AI_ConvAgent_ElevenLabsComponent.h | 31 +++++++----- 6 files changed, 106 
insertions(+), 45 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp index db8bfab..e5f3f52 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp @@ -177,27 +177,27 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel // Generating timeout (ISSUE-1): if the server sent agent_chat_response_part // (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false), - // force-clear bAgentGenerating after 10s so StartListening() is no longer blocked. - // Normal path: first audio chunk → EnqueueAgentAudio → bAgentGenerating=false. - // This fallback covers the rare case where TTS produces nothing (e.g. empty response). + // force-clear bAgentGenerating after GeneratingTimeoutSeconds so StartListening() + // is no longer blocked. Time-based to behave consistently across frame rates. if (bAgentGenerating && !bAgentSpeaking) { - if (++GeneratingTickCount >= HardSilenceTimeoutTicks) + GeneratingTime += DeltaTime; + if (GeneratingTime >= GeneratingTimeoutSeconds) { bAgentGenerating = false; - GeneratingTickCount = 0; + GeneratingTime = 0.0f; if (bDebug) { const double T = FPlatformTime::Seconds() - SessionStartTime; UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Warning, - TEXT("[T+%.2fs] [Turn %d] Generating timeout (10s) — no audio arrived. Clearing bAgentGenerating."), - T, LastClosedTurnIndex); + TEXT("[T+%.2fs] [Turn %d] Generating timeout (%.0fs) — no audio arrived. 
Clearing bAgentGenerating."), + T, LastClosedTurnIndex, GeneratingTimeoutSeconds); } } } else { - GeneratingTickCount = 0; + GeneratingTime = 0.0f; } // Pre-buffer timer: start playback after the pre-buffer period expires. @@ -321,19 +321,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel FScopeLock Lock(&AudioQueueLock); if (AudioQueue.Num() - AudioQueueReadOffset == 0) { - SilentTickCount++; + SilentTime += DeltaTime; // Wait for agent_response (confirms the full response is done) before // declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking // events when ElevenLabs TTS streams audio in multiple batches with gaps // (e.g. for long responses) — without this guard, the Blueprint's // OnAgentStoppedSpeaking handler reopens the mic mid-response. - const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks; + const bool bResponseConfirmed = bAgentResponseReceived && SilentTime >= SilenceThresholdSeconds; // Hard-timeout fallback: if agent_response never arrives (or is very late), // stop after 10s of silence to avoid leaving the state machine stuck. - // 10s was chosen to bridge observed inter-batch TTS gaps of up to ~5s. - const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks; + // Time-based to behave consistently regardless of frame rate. + const bool bHardTimeout = SilentTime >= HardSilenceTimeoutSeconds; if (bResponseConfirmed || bHardTimeout) { @@ -341,13 +341,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel bAgentSpeaking = false; bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted. bAgentResponseReceived = false; - SilentTickCount = 0; + SilentTime = 0.0f; bShouldBroadcastStopped = true; } } else { - SilentTickCount = 0; + SilentTime = 0.0f; } } // Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time. 
@@ -1096,7 +1096,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode, bIsListening = false; // bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio. bWaitingForAgentResponse = false; - GeneratingTickCount = 0; + GeneratingTime = 0.0f; TurnIndex = 0; LastClosedTurnIndex = 0; CurrentEmotion = EPS_AI_ConvAgent_Emotion::Neutral; @@ -1112,6 +1112,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode, if (!bIntentionalDisconnect && StatusCode != 1000 && MaxReconnectAttempts > 0 && GetOwnerRole() == ROLE_Authority) { + // Clean up stale turn state so the reconnected session starts fresh. + // StopAgentAudio() already ran above, but these fields are not reset by it: + bWaitingForAgentResponse = false; + GeneratingTime = 0.0f; + bWantsReconnect = true; ReconnectAttemptCount = 0; const double Delay = 1.0; // First attempt after 1 second. @@ -1612,7 +1617,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray bAgentGenerating = false; // Agent is now speaking — generation phase is over. bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking. bQueueWasDry = false; - SilentTickCount = 0; + SilentTime = 0.0f; // Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement. 
TurnFirstChunkTime = FPlatformTime::Seconds(); TurnFirstChunkBytes = PCMData.Num(); @@ -1717,7 +1722,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray } OnAudioPlaybackStarted.Broadcast(); } - SilentTickCount = 0; + SilentTime = 0.0f; } else { @@ -1745,7 +1750,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray } } // Reset silence counter — new audio arrived, we're not in a gap anymore - SilentTickCount = 0; + SilentTime = 0.0f; } } @@ -1784,7 +1789,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio() if (bAgentSpeaking) { bAgentSpeaking = false; - SilentTickCount = 0; + SilentTime = 0.0f; bWasSpeaking = true; Now = FPlatformTime::Seconds(); } @@ -2433,7 +2438,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::MulticastAgentStoppedSpeaking_Impleme { if (GetOwnerRole() == ROLE_Authority) return; bAgentSpeaking = false; - SilentTickCount = 0; + SilentTime = 0.0f; OnAgentStoppedSpeaking.Broadcast(); } @@ -2711,8 +2716,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const // Audio queue (read without lock for debug display — minor race is acceptable) const int32 QueueBytes = FMath::Max(0, AudioQueue.Num() - AudioQueueReadOffset); GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, MainColor, - FString::Printf(TEXT(" AudioQueue: %d bytes SilentTicks: %d"), - QueueBytes, SilentTickCount)); + FString::Printf(TEXT(" AudioQueue: %d bytes SilentTime: %.2fs"), + QueueBytes, SilentTime)); // Timing const double Now = FPlatformTime::Seconds(); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_FacialExpressionComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_FacialExpressionComponent.cpp index 46b2fc8..af0708e 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_FacialExpressionComponent.cpp +++ 
b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_FacialExpressionComponent.cpp @@ -286,6 +286,15 @@ void UPS_AI_ConvAgent_FacialExpressionComponent::TickComponent( AgentComponent = Agent; Agent->OnAgentEmotionChanged.AddDynamic( this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnEmotionChanged); + + // Bind conversation lifecycle — same as BeginPlay path. + // Without these, bActive stays false forever when lazy-bound. + Agent->OnAgentConnected.AddDynamic( + this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationConnected); + Agent->OnAgentDisconnected.AddDynamic( + this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationDisconnected); + bActive = false; + CurrentActiveAlpha = 0.0f; } } } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_InteractionComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_InteractionComponent.cpp index 235026f..f04b8c0 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_InteractionComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_InteractionComponent.cpp @@ -182,9 +182,9 @@ void UPS_AI_ConvAgent_InteractionComponent::TickComponent(float DeltaTime, ELeve CleanupRetainedGaze(Pending); PendingLeaveAgent.Reset(); } - else if (!Pending->IsAgentSpeaking()) + else if (!Pending->IsAgentSpeakingOrPending()) { - // Agent finished speaking — leave conversation, retain gaze. + // Agent truly finished speaking (not just a TTS inter-batch gap) — leave conversation, retain gaze. ExecuteLeave(Pending); GazeRetainedAgent = Pending; PendingLeaveAgent.Reset(); @@ -539,23 +539,45 @@ void UPS_AI_ConvAgent_InteractionComponent::SetSelectedAgent(UPS_AI_ConvAgent_El // player until they walk out of interaction range. 
if (bAutoStartConversation && (OldAgent->IsConnected() || OldAgent->bNetIsConversing)) { - // If a previous pending leave exists, force-complete it now. + // If a previous pending leave exists, handle it. if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevPending = PendingLeaveAgent.Get()) { - ExecuteLeave(PrevPending); - CleanupRetainedGaze(PrevPending); - PendingLeaveAgent.Reset(); + if (PrevPending == NewAgent) + { + // Player is switching back to the pending agent (A→B→A). + // Cancel the deferred leave — don't execute it, the player + // is coming back to this agent and the conversation is still alive. + PendingLeaveAgent.Reset(); + + if (bDebug) + { + UE_LOG(LogPS_AI_ConvAgent_Select, Log, + TEXT(" Cancelled pending leave (switching back to same agent): %s"), + PrevPending->GetOwner() ? *PrevPending->GetOwner()->GetName() : TEXT("(null)")); + } + } + else + { + // Different agent — force-complete the old pending leave. + ExecuteLeave(PrevPending); + CleanupRetainedGaze(PrevPending); + PendingLeaveAgent.Reset(); + } } - // Similarly, clean up any existing retained gaze. + // Similarly, clean up any existing retained gaze (unless it's the agent + // we're about to re-select — keep gaze alive during the transition). if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevRetained = GazeRetainedAgent.Get()) { - CleanupRetainedGaze(PrevRetained); - GazeRetainedAgent.Reset(); + if (PrevRetained != NewAgent) + { + CleanupRetainedGaze(PrevRetained); + GazeRetainedAgent.Reset(); + } } - if (OldAgent->IsAgentSpeaking()) + if (OldAgent->IsAgentSpeakingOrPending()) { - // Agent is still speaking — defer the Leave. + // Agent is still speaking (or generating, waiting for more audio) — defer the Leave. // Gaze and body tracking stay active so the agent keeps // looking at the player while finishing its sentence. 
PendingLeaveAgent = OldAgent; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_LipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_LipSyncComponent.cpp index aa0aa48..ddc754c 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_LipSyncComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_LipSyncComponent.cpp @@ -758,6 +758,16 @@ void UPS_AI_ConvAgent_LipSyncComponent::TickComponent(float DeltaTime, ELevelTic this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentInterrupted); Agent->OnAgentStoppedSpeaking.AddDynamic( this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentStopped); + + // Bind conversation lifecycle — same as BeginPlay path. + // Without these, bActive stays false forever when lazy-bound. + Agent->OnAgentConnected.AddDynamic( + this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationConnected); + Agent->OnAgentDisconnected.AddDynamic( + this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationDisconnected); + bActive = false; + CurrentActiveAlpha = 0.0f; + Agent->bEnableAgentPartialResponse = true; } } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp index 68077b0..7cc5615 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp @@ -127,6 +127,7 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::SendUserTurnStart() bWaitingForResponse = false; bFirstAudioResponseLogged = false; bAgentResponseStartedFired = false; + LastInterruptEventId = 0; // 
New user turn — stale interrupt filter no longer valid. // No log here — turn start is implicit from audio chunks following. } @@ -563,6 +564,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleAgentResponse(const TShar // subsequent agent_chat_response_part is guaranteed to belong to a new turn. bAgentResponseStartedFired = false; + // Also reset the interrupt audio filter here. agent_response is the last message + // of the current turn — any audio arriving after this belongs to a new generation + // and must not be filtered by a stale interrupt event_id from this turn. + // This covers the edge case where audio for the next turn arrives before + // agent_chat_response_part (which also resets the filter). + LastInterruptEventId = 0; + // { "type": "agent_response", // "agent_response_event": { "agent_response": "..." } // } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h index 850c940..6651248 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h @@ -601,6 +601,11 @@ public: UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") bool IsAgentSpeaking() const { return bAgentSpeaking; } + /** True when the agent is speaking, OR is still generating a response (bAgentGenerating) that the server has not yet confirmed complete. + * Use this instead of IsAgentSpeaking() when you need to know if the agent MIGHT still + * produce more audio (e.g. during TTS inter-batch gaps). 
*/ + bool IsAgentSpeakingOrPending() const { return bAgentSpeaking || (bAgentGenerating && !bAgentResponseReceived); } + UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const; @@ -795,25 +800,27 @@ private: // Debug: track when the AudioQueue runs dry during speech (one-shot log). bool bQueueWasDry = false; - // Silence detection: how many consecutive ticks with an empty audio queue. - int32 SilentTickCount = 0; + // Silence detection: accumulated seconds of empty audio queue. + // Time-based (not tick-based) to behave consistently across frame rates. + float SilentTime = 0.0f; - // Generating timeout: how many consecutive ticks bAgentGenerating has been true - // without any audio arriving. If this reaches HardSilenceTimeoutTicks, bAgentGenerating - // is force-cleared so StartListening() is no longer blocked. This covers the edge case - // where the server sends agent_chat_response_part but the TTS pipeline produces no audio. - int32 GeneratingTickCount = 0; + // Generating timeout: accumulated seconds with bAgentGenerating=true but no audio. + // If this exceeds GeneratingTimeoutSeconds, bAgentGenerating is force-cleared + // so StartListening() is no longer blocked. + float GeneratingTime = 0.0f; - // Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks + // Primary threshold: fire OnAgentStoppedSpeaking after this many seconds of silence // once the server has confirmed the full response (bAgentResponseReceived=true). - // 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream. - static constexpr int32 SilenceThresholdTicks = 30; + // 0.5s is enough to bridge brief inter-chunk gaps in the TTS stream. + static constexpr float SilenceThresholdSeconds = 0.5f; // Hard-timeout fallback: fire even without agent_response confirmation after 10s // of silence. 
This covers edge cases where agent_response is very late or missing, // while being long enough to bridge inter-batch TTS gaps (observed up to ~5s). - // Previously 2s — raised after logs showed premature firing during multi-batch responses. - static constexpr int32 HardSilenceTimeoutTicks = 600; // 10s at 60fps + static constexpr float HardSilenceTimeoutSeconds = 10.0f; + + // Generating timeout: seconds bAgentGenerating may stay set with no audio before it is force-cleared; kept equal to HardSilenceTimeoutSeconds. + static constexpr float GeneratingTimeoutSeconds = 10.0f; // True once the server sends agent_response for the current turn. // The server sends the full text when generation is complete — this is the