Fix audio cutoff and lip sync activation bugs during agent switching

- Fix A→B→A audio cutoff: when switching back to a pending-leave agent,
  cancel the deferred leave instead of force-completing it (was calling
  StopAgentAudio on the agent we're returning to)
- Fix deferred leave firing during TTS gaps: use IsAgentSpeakingOrPending()
  instead of IsAgentSpeaking() — checks bAgentGenerating and
  bAgentResponseReceived to avoid premature leave during inter-batch silence
- Convert silence detection from tick-based to time-based: SilentTickCount
  → SilentTime (float seconds), GeneratingTickCount → GeneratingTime.
  Consistent behavior regardless of frame rate (was 5s@120fps vs 20s@30fps)
- Fix lazy binding: add OnAgentConnected/OnAgentDisconnected in LipSync
  and FacialExpression TickComponent lazy-bind path (bActive stayed false
  forever in packaged builds when component init order differed)
- Fix reconnection: reset bWaitingForAgentResponse and GeneratingTime
  before entering reconnect mode to avoid stale state on new session
- Fix event_id audio filtering: reset LastInterruptEventId in
  HandleAgentResponse and SendUserTurnStart so the first audio chunks of a
  new turn are not silently discarded by a stale interrupt filter
- Preserve retained gaze when switching back to same agent (don't
  CleanupRetainedGaze if PrevRetained == NewAgent)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-12 09:50:43 +01:00
parent aea02abe89
commit e5a32f5997
6 changed files with 106 additions and 45 deletions

View File

@ -177,27 +177,27 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
// Generating timeout (ISSUE-1): if the server sent agent_chat_response_part // Generating timeout (ISSUE-1): if the server sent agent_chat_response_part
// (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false), // (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false),
// force-clear bAgentGenerating after 10s so StartListening() is no longer blocked. // force-clear bAgentGenerating after GeneratingTimeoutSeconds so StartListening()
// Normal path: first audio chunk → EnqueueAgentAudio → bAgentGenerating=false. // is no longer blocked. Time-based to behave consistently across frame rates.
// This fallback covers the rare case where TTS produces nothing (e.g. empty response).
if (bAgentGenerating && !bAgentSpeaking) if (bAgentGenerating && !bAgentSpeaking)
{ {
if (++GeneratingTickCount >= HardSilenceTimeoutTicks) GeneratingTime += DeltaTime;
if (GeneratingTime >= GeneratingTimeoutSeconds)
{ {
bAgentGenerating = false; bAgentGenerating = false;
GeneratingTickCount = 0; GeneratingTime = 0.0f;
if (bDebug) if (bDebug)
{ {
const double T = FPlatformTime::Seconds() - SessionStartTime; const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Warning, UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Warning,
TEXT("[T+%.2fs] [Turn %d] Generating timeout (10s) — no audio arrived. Clearing bAgentGenerating."), TEXT("[T+%.2fs] [Turn %d] Generating timeout (%.0fs) — no audio arrived. Clearing bAgentGenerating."),
T, LastClosedTurnIndex); T, LastClosedTurnIndex, GeneratingTimeoutSeconds);
} }
} }
} }
else else
{ {
GeneratingTickCount = 0; GeneratingTime = 0.0f;
} }
// Pre-buffer timer: start playback after the pre-buffer period expires. // Pre-buffer timer: start playback after the pre-buffer period expires.
@ -321,19 +321,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
FScopeLock Lock(&AudioQueueLock); FScopeLock Lock(&AudioQueueLock);
if (AudioQueue.Num() - AudioQueueReadOffset == 0) if (AudioQueue.Num() - AudioQueueReadOffset == 0)
{ {
SilentTickCount++; SilentTime += DeltaTime;
// Wait for agent_response (confirms the full response is done) before // Wait for agent_response (confirms the full response is done) before
// declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking // declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking
// events when ElevenLabs TTS streams audio in multiple batches with gaps // events when ElevenLabs TTS streams audio in multiple batches with gaps
// (e.g. for long responses) — without this guard, the Blueprint's // (e.g. for long responses) — without this guard, the Blueprint's
// OnAgentStoppedSpeaking handler reopens the mic mid-response. // OnAgentStoppedSpeaking handler reopens the mic mid-response.
const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks; const bool bResponseConfirmed = bAgentResponseReceived && SilentTime >= SilenceThresholdSeconds;
// Hard-timeout fallback: if agent_response never arrives (or is very late), // Hard-timeout fallback: if agent_response never arrives (or is very late),
// stop after 10s of silence to avoid leaving the state machine stuck. // stop after 10s of silence to avoid leaving the state machine stuck.
// 10s was chosen to bridge observed inter-batch TTS gaps of up to ~5s. // Time-based to behave consistently regardless of frame rate.
const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks; const bool bHardTimeout = SilentTime >= HardSilenceTimeoutSeconds;
if (bResponseConfirmed || bHardTimeout) if (bResponseConfirmed || bHardTimeout)
{ {
@ -341,13 +341,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
bAgentSpeaking = false; bAgentSpeaking = false;
bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted. bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted.
bAgentResponseReceived = false; bAgentResponseReceived = false;
SilentTickCount = 0; SilentTime = 0.0f;
bShouldBroadcastStopped = true; bShouldBroadcastStopped = true;
} }
} }
else else
{ {
SilentTickCount = 0; SilentTime = 0.0f;
} }
} }
// Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time. // Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time.
@ -1096,7 +1096,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode,
bIsListening = false; bIsListening = false;
// bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio. // bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio.
bWaitingForAgentResponse = false; bWaitingForAgentResponse = false;
GeneratingTickCount = 0; GeneratingTime = 0.0f;
TurnIndex = 0; TurnIndex = 0;
LastClosedTurnIndex = 0; LastClosedTurnIndex = 0;
CurrentEmotion = EPS_AI_ConvAgent_Emotion::Neutral; CurrentEmotion = EPS_AI_ConvAgent_Emotion::Neutral;
@ -1112,6 +1112,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode,
if (!bIntentionalDisconnect && StatusCode != 1000 if (!bIntentionalDisconnect && StatusCode != 1000
&& MaxReconnectAttempts > 0 && GetOwnerRole() == ROLE_Authority) && MaxReconnectAttempts > 0 && GetOwnerRole() == ROLE_Authority)
{ {
// Clean up stale turn state so the reconnected session starts fresh.
// StopAgentAudio() already ran above, but these fields are not reset by it:
bWaitingForAgentResponse = false;
GeneratingTime = 0.0f;
bWantsReconnect = true; bWantsReconnect = true;
ReconnectAttemptCount = 0; ReconnectAttemptCount = 0;
const double Delay = 1.0; // First attempt after 1 second. const double Delay = 1.0; // First attempt after 1 second.
@ -1612,7 +1617,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
bAgentGenerating = false; // Agent is now speaking — generation phase is over. bAgentGenerating = false; // Agent is now speaking — generation phase is over.
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking. bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
bQueueWasDry = false; bQueueWasDry = false;
SilentTickCount = 0; SilentTime = 0.0f;
// Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement. // Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement.
TurnFirstChunkTime = FPlatformTime::Seconds(); TurnFirstChunkTime = FPlatformTime::Seconds();
TurnFirstChunkBytes = PCMData.Num(); TurnFirstChunkBytes = PCMData.Num();
@ -1717,7 +1722,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
} }
OnAudioPlaybackStarted.Broadcast(); OnAudioPlaybackStarted.Broadcast();
} }
SilentTickCount = 0; SilentTime = 0.0f;
} }
else else
{ {
@ -1745,7 +1750,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
} }
} }
// Reset silence counter — new audio arrived, we're not in a gap anymore // Reset silence counter — new audio arrived, we're not in a gap anymore
SilentTickCount = 0; SilentTime = 0.0f;
} }
} }
@ -1784,7 +1789,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
if (bAgentSpeaking) if (bAgentSpeaking)
{ {
bAgentSpeaking = false; bAgentSpeaking = false;
SilentTickCount = 0; SilentTime = 0.0f;
bWasSpeaking = true; bWasSpeaking = true;
Now = FPlatformTime::Seconds(); Now = FPlatformTime::Seconds();
} }
@ -2433,7 +2438,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::MulticastAgentStoppedSpeaking_Impleme
{ {
if (GetOwnerRole() == ROLE_Authority) return; if (GetOwnerRole() == ROLE_Authority) return;
bAgentSpeaking = false; bAgentSpeaking = false;
SilentTickCount = 0; SilentTime = 0.0f;
OnAgentStoppedSpeaking.Broadcast(); OnAgentStoppedSpeaking.Broadcast();
} }
@ -2711,8 +2716,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
// Audio queue (read without lock for debug display — minor race is acceptable) // Audio queue (read without lock for debug display — minor race is acceptable)
const int32 QueueBytes = FMath::Max(0, AudioQueue.Num() - AudioQueueReadOffset); const int32 QueueBytes = FMath::Max(0, AudioQueue.Num() - AudioQueueReadOffset);
GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, MainColor, GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, MainColor,
FString::Printf(TEXT(" AudioQueue: %d bytes SilentTicks: %d"), FString::Printf(TEXT(" AudioQueue: %d bytes SilentTime: %.2fs"),
QueueBytes, SilentTickCount)); QueueBytes, SilentTime));
// Timing // Timing
const double Now = FPlatformTime::Seconds(); const double Now = FPlatformTime::Seconds();

View File

@ -286,6 +286,15 @@ void UPS_AI_ConvAgent_FacialExpressionComponent::TickComponent(
AgentComponent = Agent; AgentComponent = Agent;
Agent->OnAgentEmotionChanged.AddDynamic( Agent->OnAgentEmotionChanged.AddDynamic(
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnEmotionChanged); this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnEmotionChanged);
// Bind conversation lifecycle — same as BeginPlay path.
// Without these, bActive stays false forever when lazy-bound.
Agent->OnAgentConnected.AddDynamic(
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationConnected);
Agent->OnAgentDisconnected.AddDynamic(
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationDisconnected);
bActive = false;
CurrentActiveAlpha = 0.0f;
} }
} }
} }

View File

@ -182,9 +182,9 @@ void UPS_AI_ConvAgent_InteractionComponent::TickComponent(float DeltaTime, ELeve
CleanupRetainedGaze(Pending); CleanupRetainedGaze(Pending);
PendingLeaveAgent.Reset(); PendingLeaveAgent.Reset();
} }
else if (!Pending->IsAgentSpeaking()) else if (!Pending->IsAgentSpeakingOrPending())
{ {
// Agent finished speaking — leave conversation, retain gaze. // Agent truly finished speaking (not just a TTS inter-batch gap) — leave conversation, retain gaze.
ExecuteLeave(Pending); ExecuteLeave(Pending);
GazeRetainedAgent = Pending; GazeRetainedAgent = Pending;
PendingLeaveAgent.Reset(); PendingLeaveAgent.Reset();
@ -539,23 +539,45 @@ void UPS_AI_ConvAgent_InteractionComponent::SetSelectedAgent(UPS_AI_ConvAgent_El
// player until they walk out of interaction range. // player until they walk out of interaction range.
if (bAutoStartConversation && (OldAgent->IsConnected() || OldAgent->bNetIsConversing)) if (bAutoStartConversation && (OldAgent->IsConnected() || OldAgent->bNetIsConversing))
{ {
// If a previous pending leave exists, force-complete it now. // If a previous pending leave exists, handle it.
if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevPending = PendingLeaveAgent.Get()) if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevPending = PendingLeaveAgent.Get())
{ {
if (PrevPending == NewAgent)
{
// Player is switching back to the pending agent (A→B→A).
// Cancel the deferred leave — don't execute it, the player
// is coming back to this agent and the conversation is still alive.
PendingLeaveAgent.Reset();
if (bDebug)
{
UE_LOG(LogPS_AI_ConvAgent_Select, Log,
TEXT(" Cancelled pending leave (switching back to same agent): %s"),
PrevPending->GetOwner() ? *PrevPending->GetOwner()->GetName() : TEXT("(null)"));
}
}
else
{
// Different agent — force-complete the old pending leave.
ExecuteLeave(PrevPending); ExecuteLeave(PrevPending);
CleanupRetainedGaze(PrevPending); CleanupRetainedGaze(PrevPending);
PendingLeaveAgent.Reset(); PendingLeaveAgent.Reset();
} }
// Similarly, clean up any existing retained gaze. }
// Similarly, clean up any existing retained gaze (unless it's the agent
// we're about to re-select — keep gaze alive during the transition).
if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevRetained = GazeRetainedAgent.Get()) if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevRetained = GazeRetainedAgent.Get())
{
if (PrevRetained != NewAgent)
{ {
CleanupRetainedGaze(PrevRetained); CleanupRetainedGaze(PrevRetained);
GazeRetainedAgent.Reset(); GazeRetainedAgent.Reset();
} }
}
if (OldAgent->IsAgentSpeaking()) if (OldAgent->IsAgentSpeakingOrPending())
{ {
// Agent is still speaking — defer the Leave. // Agent is still speaking (or generating, waiting for more audio) — defer the Leave.
// Gaze and body tracking stay active so the agent keeps // Gaze and body tracking stay active so the agent keeps
// looking at the player while finishing its sentence. // looking at the player while finishing its sentence.
PendingLeaveAgent = OldAgent; PendingLeaveAgent = OldAgent;

View File

@ -758,6 +758,16 @@ void UPS_AI_ConvAgent_LipSyncComponent::TickComponent(float DeltaTime, ELevelTic
this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentInterrupted); this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentInterrupted);
Agent->OnAgentStoppedSpeaking.AddDynamic( Agent->OnAgentStoppedSpeaking.AddDynamic(
this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentStopped); this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentStopped);
// Bind conversation lifecycle — same as BeginPlay path.
// Without these, bActive stays false forever when lazy-bound.
Agent->OnAgentConnected.AddDynamic(
this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationConnected);
Agent->OnAgentDisconnected.AddDynamic(
this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationDisconnected);
bActive = false;
CurrentActiveAlpha = 0.0f;
Agent->bEnableAgentPartialResponse = true; Agent->bEnableAgentPartialResponse = true;
} }
} }

View File

@ -127,6 +127,7 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::SendUserTurnStart()
bWaitingForResponse = false; bWaitingForResponse = false;
bFirstAudioResponseLogged = false; bFirstAudioResponseLogged = false;
bAgentResponseStartedFired = false; bAgentResponseStartedFired = false;
LastInterruptEventId = 0; // New user turn — stale interrupt filter no longer valid.
// No log here — turn start is implicit from audio chunks following. // No log here — turn start is implicit from audio chunks following.
} }
@ -563,6 +564,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleAgentResponse(const TShar
// subsequent agent_chat_response_part is guaranteed to belong to a new turn. // subsequent agent_chat_response_part is guaranteed to belong to a new turn.
bAgentResponseStartedFired = false; bAgentResponseStartedFired = false;
// Also reset the interrupt audio filter here. agent_response is the last message
// of the current turn — any audio arriving after this belongs to a new generation
// and must not be filtered by a stale interrupt event_id from this turn.
// This covers the edge case where audio for the next turn arrives before
// agent_chat_response_part (which also resets the filter).
LastInterruptEventId = 0;
// { "type": "agent_response", // { "type": "agent_response",
// "agent_response_event": { "agent_response": "..." } // "agent_response_event": { "agent_response": "..." }
// } // }

View File

@ -601,6 +601,11 @@ public:
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
bool IsAgentSpeaking() const { return bAgentSpeaking; } bool IsAgentSpeaking() const { return bAgentSpeaking; }
/** True when the agent is speaking OR the server hasn't confirmed the full response yet.
* Use this instead of IsAgentSpeaking() when you need to know if the agent MIGHT still
* produce more audio (e.g. during TTS inter-batch gaps). */
bool IsAgentSpeakingOrPending() const { return bAgentSpeaking || (bAgentGenerating && !bAgentResponseReceived); }
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const; const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const;
@ -795,25 +800,27 @@ private:
// Debug: track when the AudioQueue runs dry during speech (one-shot log). // Debug: track when the AudioQueue runs dry during speech (one-shot log).
bool bQueueWasDry = false; bool bQueueWasDry = false;
// Silence detection: how many consecutive ticks with an empty audio queue. // Silence detection: accumulated seconds of empty audio queue.
int32 SilentTickCount = 0; // Time-based (not tick-based) to behave consistently across frame rates.
float SilentTime = 0.0f;
// Generating timeout: how many consecutive ticks bAgentGenerating has been true // Generating timeout: accumulated seconds with bAgentGenerating=true but no audio.
// without any audio arriving. If this reaches HardSilenceTimeoutTicks, bAgentGenerating // If this exceeds GeneratingTimeoutSeconds, bAgentGenerating is force-cleared
// is force-cleared so StartListening() is no longer blocked. This covers the edge case // so StartListening() is no longer blocked.
// where the server sends agent_chat_response_part but the TTS pipeline produces no audio. float GeneratingTime = 0.0f;
int32 GeneratingTickCount = 0;
// Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks // Primary threshold: fire OnAgentStoppedSpeaking after this many seconds of silence
// once the server has confirmed the full response (bAgentResponseReceived=true). // once the server has confirmed the full response (bAgentResponseReceived=true).
// 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream. // 0.5s is enough to bridge brief inter-chunk gaps in the TTS stream.
static constexpr int32 SilenceThresholdTicks = 30; static constexpr float SilenceThresholdSeconds = 0.5f;
// Hard-timeout fallback: fire even without agent_response confirmation after 10s // Hard-timeout fallback: fire even without agent_response confirmation after 10s
// of silence. This covers edge cases where agent_response is very late or missing, // of silence. This covers edge cases where agent_response is very late or missing,
// while being long enough to bridge inter-batch TTS gaps (observed up to ~5s). // while being long enough to bridge inter-batch TTS gaps (observed up to ~5s).
// Previously 2s — raised after logs showed premature firing during multi-batch responses. static constexpr float HardSilenceTimeoutSeconds = 10.0f;
static constexpr int32 HardSilenceTimeoutTicks = 600; // 10s at 60fps
// Generating timeout: same as hard silence timeout.
static constexpr float GeneratingTimeoutSeconds = 10.0f;
// True once the server sends agent_response for the current turn. // True once the server sends agent_response for the current turn.
// The server sends the full text when generation is complete — this is the // The server sends the full text when generation is complete — this is the