Fix audio cutoff and lip sync activation bugs during agent switching

- Fix A→B→A audio cutoff: when switching back to a pending-leave agent,
  cancel the deferred leave instead of force-completing it (was calling
  StopAgentAudio on the agent we're returning to)
- Fix deferred leave firing during TTS gaps: use IsAgentSpeakingOrPending()
  instead of IsAgentSpeaking() — checks bAgentGenerating and
  bAgentResponseReceived to avoid premature leave during inter-batch silence
- Convert silence detection from tick-based to time-based: SilentTickCount
  → SilentTime (float seconds), GeneratingTickCount → GeneratingTime.
  Consistent behavior regardless of frame rate (was 5s@120fps vs 20s@30fps)
- Fix lazy binding: add OnAgentConnected/OnAgentDisconnected in LipSync
  and FacialExpression TickComponent lazy-bind path (bActive stayed false
  forever in packaged builds when component init order differed)
- Fix reconnection: reset bWaitingForAgentResponse and GeneratingTime
  before entering reconnect mode to avoid stale state on new session
- Fix event_id audio filtering: reset LastInterruptEventId in
  HandleAgentResponse and SendUserTurnStart so the first audio chunks of a
  new turn are not silently discarded by a stale interrupt filter
- Preserve retained gaze when switching back to same agent (don't
  CleanupRetainedGaze if PrevRetained == NewAgent)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-12 09:50:43 +01:00
parent aea02abe89
commit e5a32f5997
6 changed files with 106 additions and 45 deletions

View File

@ -177,27 +177,27 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
// Generating timeout (ISSUE-1): if the server sent agent_chat_response_part // Generating timeout (ISSUE-1): if the server sent agent_chat_response_part
// (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false), // (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false),
// force-clear bAgentGenerating after 10s so StartListening() is no longer blocked. // force-clear bAgentGenerating after GeneratingTimeoutSeconds so StartListening()
// Normal path: first audio chunk → EnqueueAgentAudio → bAgentGenerating=false. // is no longer blocked. Time-based to behave consistently across frame rates.
// This fallback covers the rare case where TTS produces nothing (e.g. empty response).
if (bAgentGenerating && !bAgentSpeaking) if (bAgentGenerating && !bAgentSpeaking)
{ {
if (++GeneratingTickCount >= HardSilenceTimeoutTicks) GeneratingTime += DeltaTime;
if (GeneratingTime >= GeneratingTimeoutSeconds)
{ {
bAgentGenerating = false; bAgentGenerating = false;
GeneratingTickCount = 0; GeneratingTime = 0.0f;
if (bDebug) if (bDebug)
{ {
const double T = FPlatformTime::Seconds() - SessionStartTime; const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Warning, UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Warning,
TEXT("[T+%.2fs] [Turn %d] Generating timeout (10s) — no audio arrived. Clearing bAgentGenerating."), TEXT("[T+%.2fs] [Turn %d] Generating timeout (%.0fs) — no audio arrived. Clearing bAgentGenerating."),
T, LastClosedTurnIndex); T, LastClosedTurnIndex, GeneratingTimeoutSeconds);
} }
} }
} }
else else
{ {
GeneratingTickCount = 0; GeneratingTime = 0.0f;
} }
// Pre-buffer timer: start playback after the pre-buffer period expires. // Pre-buffer timer: start playback after the pre-buffer period expires.
@ -321,19 +321,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
FScopeLock Lock(&AudioQueueLock); FScopeLock Lock(&AudioQueueLock);
if (AudioQueue.Num() - AudioQueueReadOffset == 0) if (AudioQueue.Num() - AudioQueueReadOffset == 0)
{ {
SilentTickCount++; SilentTime += DeltaTime;
// Wait for agent_response (confirms the full response is done) before // Wait for agent_response (confirms the full response is done) before
// declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking // declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking
// events when ElevenLabs TTS streams audio in multiple batches with gaps // events when ElevenLabs TTS streams audio in multiple batches with gaps
// (e.g. for long responses) — without this guard, the Blueprint's // (e.g. for long responses) — without this guard, the Blueprint's
// OnAgentStoppedSpeaking handler reopens the mic mid-response. // OnAgentStoppedSpeaking handler reopens the mic mid-response.
const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks; const bool bResponseConfirmed = bAgentResponseReceived && SilentTime >= SilenceThresholdSeconds;
// Hard-timeout fallback: if agent_response never arrives (or is very late), // Hard-timeout fallback: if agent_response never arrives (or is very late),
// stop after 10s of silence to avoid leaving the state machine stuck. // stop after 10s of silence to avoid leaving the state machine stuck.
// 10s was chosen to bridge observed inter-batch TTS gaps of up to ~5s. // Time-based to behave consistently regardless of frame rate.
const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks; const bool bHardTimeout = SilentTime >= HardSilenceTimeoutSeconds;
if (bResponseConfirmed || bHardTimeout) if (bResponseConfirmed || bHardTimeout)
{ {
@ -341,13 +341,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
bAgentSpeaking = false; bAgentSpeaking = false;
bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted. bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted.
bAgentResponseReceived = false; bAgentResponseReceived = false;
SilentTickCount = 0; SilentTime = 0.0f;
bShouldBroadcastStopped = true; bShouldBroadcastStopped = true;
} }
} }
else else
{ {
SilentTickCount = 0; SilentTime = 0.0f;
} }
} }
// Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time. // Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time.
@ -1096,7 +1096,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode,
bIsListening = false; bIsListening = false;
// bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio. // bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio.
bWaitingForAgentResponse = false; bWaitingForAgentResponse = false;
GeneratingTickCount = 0; GeneratingTime = 0.0f;
TurnIndex = 0; TurnIndex = 0;
LastClosedTurnIndex = 0; LastClosedTurnIndex = 0;
CurrentEmotion = EPS_AI_ConvAgent_Emotion::Neutral; CurrentEmotion = EPS_AI_ConvAgent_Emotion::Neutral;
@ -1112,6 +1112,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode,
if (!bIntentionalDisconnect && StatusCode != 1000 if (!bIntentionalDisconnect && StatusCode != 1000
&& MaxReconnectAttempts > 0 && GetOwnerRole() == ROLE_Authority) && MaxReconnectAttempts > 0 && GetOwnerRole() == ROLE_Authority)
{ {
// Clean up stale turn state so the reconnected session starts fresh.
// StopAgentAudio() already ran above, but these fields are not reset by it:
bWaitingForAgentResponse = false;
GeneratingTime = 0.0f;
bWantsReconnect = true; bWantsReconnect = true;
ReconnectAttemptCount = 0; ReconnectAttemptCount = 0;
const double Delay = 1.0; // First attempt after 1 second. const double Delay = 1.0; // First attempt after 1 second.
@ -1612,7 +1617,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
bAgentGenerating = false; // Agent is now speaking — generation phase is over. bAgentGenerating = false; // Agent is now speaking — generation phase is over.
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking. bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
bQueueWasDry = false; bQueueWasDry = false;
SilentTickCount = 0; SilentTime = 0.0f;
// Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement. // Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement.
TurnFirstChunkTime = FPlatformTime::Seconds(); TurnFirstChunkTime = FPlatformTime::Seconds();
TurnFirstChunkBytes = PCMData.Num(); TurnFirstChunkBytes = PCMData.Num();
@ -1717,7 +1722,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
} }
OnAudioPlaybackStarted.Broadcast(); OnAudioPlaybackStarted.Broadcast();
} }
SilentTickCount = 0; SilentTime = 0.0f;
} }
else else
{ {
@ -1745,7 +1750,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
} }
} }
// Reset silence counter — new audio arrived, we're not in a gap anymore // Reset silence counter — new audio arrived, we're not in a gap anymore
SilentTickCount = 0; SilentTime = 0.0f;
} }
} }
@ -1784,7 +1789,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
if (bAgentSpeaking) if (bAgentSpeaking)
{ {
bAgentSpeaking = false; bAgentSpeaking = false;
SilentTickCount = 0; SilentTime = 0.0f;
bWasSpeaking = true; bWasSpeaking = true;
Now = FPlatformTime::Seconds(); Now = FPlatformTime::Seconds();
} }
@ -2433,7 +2438,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::MulticastAgentStoppedSpeaking_Impleme
{ {
if (GetOwnerRole() == ROLE_Authority) return; if (GetOwnerRole() == ROLE_Authority) return;
bAgentSpeaking = false; bAgentSpeaking = false;
SilentTickCount = 0; SilentTime = 0.0f;
OnAgentStoppedSpeaking.Broadcast(); OnAgentStoppedSpeaking.Broadcast();
} }
@ -2711,8 +2716,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
// Audio queue (read without lock for debug display — minor race is acceptable) // Audio queue (read without lock for debug display — minor race is acceptable)
const int32 QueueBytes = FMath::Max(0, AudioQueue.Num() - AudioQueueReadOffset); const int32 QueueBytes = FMath::Max(0, AudioQueue.Num() - AudioQueueReadOffset);
GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, MainColor, GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, MainColor,
FString::Printf(TEXT(" AudioQueue: %d bytes SilentTicks: %d"), FString::Printf(TEXT(" AudioQueue: %d bytes SilentTime: %.2fs"),
QueueBytes, SilentTickCount)); QueueBytes, SilentTime));
// Timing // Timing
const double Now = FPlatformTime::Seconds(); const double Now = FPlatformTime::Seconds();

View File

@ -286,6 +286,15 @@ void UPS_AI_ConvAgent_FacialExpressionComponent::TickComponent(
AgentComponent = Agent; AgentComponent = Agent;
Agent->OnAgentEmotionChanged.AddDynamic( Agent->OnAgentEmotionChanged.AddDynamic(
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnEmotionChanged); this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnEmotionChanged);
// Bind conversation lifecycle — same as BeginPlay path.
// Without these, bActive stays false forever when lazy-bound.
Agent->OnAgentConnected.AddDynamic(
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationConnected);
Agent->OnAgentDisconnected.AddDynamic(
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationDisconnected);
bActive = false;
CurrentActiveAlpha = 0.0f;
} }
} }
} }

View File

@ -182,9 +182,9 @@ void UPS_AI_ConvAgent_InteractionComponent::TickComponent(float DeltaTime, ELeve
CleanupRetainedGaze(Pending); CleanupRetainedGaze(Pending);
PendingLeaveAgent.Reset(); PendingLeaveAgent.Reset();
} }
else if (!Pending->IsAgentSpeaking()) else if (!Pending->IsAgentSpeakingOrPending())
{ {
// Agent finished speaking — leave conversation, retain gaze. // Agent truly finished speaking (not just a TTS inter-batch gap) — leave conversation, retain gaze.
ExecuteLeave(Pending); ExecuteLeave(Pending);
GazeRetainedAgent = Pending; GazeRetainedAgent = Pending;
PendingLeaveAgent.Reset(); PendingLeaveAgent.Reset();
@ -539,23 +539,45 @@ void UPS_AI_ConvAgent_InteractionComponent::SetSelectedAgent(UPS_AI_ConvAgent_El
// player until they walk out of interaction range. // player until they walk out of interaction range.
if (bAutoStartConversation && (OldAgent->IsConnected() || OldAgent->bNetIsConversing)) if (bAutoStartConversation && (OldAgent->IsConnected() || OldAgent->bNetIsConversing))
{ {
// If a previous pending leave exists, force-complete it now. // If a previous pending leave exists, handle it.
if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevPending = PendingLeaveAgent.Get()) if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevPending = PendingLeaveAgent.Get())
{ {
if (PrevPending == NewAgent)
{
// Player is switching back to the pending agent (A→B→A).
// Cancel the deferred leave — don't execute it, the player
// is coming back to this agent and the conversation is still alive.
PendingLeaveAgent.Reset();
if (bDebug)
{
UE_LOG(LogPS_AI_ConvAgent_Select, Log,
TEXT(" Cancelled pending leave (switching back to same agent): %s"),
PrevPending->GetOwner() ? *PrevPending->GetOwner()->GetName() : TEXT("(null)"));
}
}
else
{
// Different agent — force-complete the old pending leave.
ExecuteLeave(PrevPending); ExecuteLeave(PrevPending);
CleanupRetainedGaze(PrevPending); CleanupRetainedGaze(PrevPending);
PendingLeaveAgent.Reset(); PendingLeaveAgent.Reset();
} }
// Similarly, clean up any existing retained gaze. }
// Similarly, clean up any existing retained gaze (unless it's the agent
// we're about to re-select — keep gaze alive during the transition).
if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevRetained = GazeRetainedAgent.Get()) if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevRetained = GazeRetainedAgent.Get())
{
if (PrevRetained != NewAgent)
{ {
CleanupRetainedGaze(PrevRetained); CleanupRetainedGaze(PrevRetained);
GazeRetainedAgent.Reset(); GazeRetainedAgent.Reset();
} }
}
if (OldAgent->IsAgentSpeaking()) if (OldAgent->IsAgentSpeakingOrPending())
{ {
// Agent is still speaking — defer the Leave. // Agent is still speaking (or generating, waiting for more audio) — defer the Leave.
// Gaze and body tracking stay active so the agent keeps // Gaze and body tracking stay active so the agent keeps
// looking at the player while finishing its sentence. // looking at the player while finishing its sentence.
PendingLeaveAgent = OldAgent; PendingLeaveAgent = OldAgent;

View File

@ -758,6 +758,16 @@ void UPS_AI_ConvAgent_LipSyncComponent::TickComponent(float DeltaTime, ELevelTic
this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentInterrupted); this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentInterrupted);
Agent->OnAgentStoppedSpeaking.AddDynamic( Agent->OnAgentStoppedSpeaking.AddDynamic(
this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentStopped); this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentStopped);
// Bind conversation lifecycle — same as BeginPlay path.
// Without these, bActive stays false forever when lazy-bound.
Agent->OnAgentConnected.AddDynamic(
this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationConnected);
Agent->OnAgentDisconnected.AddDynamic(
this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationDisconnected);
bActive = false;
CurrentActiveAlpha = 0.0f;
Agent->bEnableAgentPartialResponse = true; Agent->bEnableAgentPartialResponse = true;
} }
} }

View File

@ -127,6 +127,7 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::SendUserTurnStart()
bWaitingForResponse = false; bWaitingForResponse = false;
bFirstAudioResponseLogged = false; bFirstAudioResponseLogged = false;
bAgentResponseStartedFired = false; bAgentResponseStartedFired = false;
LastInterruptEventId = 0; // New user turn — stale interrupt filter no longer valid.
// No log here — turn start is implicit from audio chunks following. // No log here — turn start is implicit from audio chunks following.
} }
@ -563,6 +564,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleAgentResponse(const TShar
// subsequent agent_chat_response_part is guaranteed to belong to a new turn. // subsequent agent_chat_response_part is guaranteed to belong to a new turn.
bAgentResponseStartedFired = false; bAgentResponseStartedFired = false;
// Also reset the interrupt audio filter here. agent_response is the last message
// of the current turn — any audio arriving after this belongs to a new generation
// and must not be filtered by a stale interrupt event_id from this turn.
// This covers the edge case where audio for the next turn arrives before
// agent_chat_response_part (which also resets the filter).
LastInterruptEventId = 0;
// { "type": "agent_response", // { "type": "agent_response",
// "agent_response_event": { "agent_response": "..." } // "agent_response_event": { "agent_response": "..." }
// } // }

View File

@ -601,6 +601,11 @@ public:
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
bool IsAgentSpeaking() const { return bAgentSpeaking; } bool IsAgentSpeaking() const { return bAgentSpeaking; }
/** True when the agent is speaking OR the server hasn't confirmed the full response yet.
* Use this instead of IsAgentSpeaking() when you need to know if the agent MIGHT still
* produce more audio (e.g. during TTS inter-batch gaps). */
bool IsAgentSpeakingOrPending() const { return bAgentSpeaking || (bAgentGenerating && !bAgentResponseReceived); }
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const; const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const;
@ -795,25 +800,27 @@ private:
// Debug: track when the AudioQueue runs dry during speech (one-shot log). // Debug: track when the AudioQueue runs dry during speech (one-shot log).
bool bQueueWasDry = false; bool bQueueWasDry = false;
// Silence detection: how many consecutive ticks with an empty audio queue. // Silence detection: accumulated seconds of empty audio queue.
int32 SilentTickCount = 0; // Time-based (not tick-based) to behave consistently across frame rates.
float SilentTime = 0.0f;
// Generating timeout: how many consecutive ticks bAgentGenerating has been true // Generating timeout: accumulated seconds with bAgentGenerating=true but no audio.
// without any audio arriving. If this reaches HardSilenceTimeoutTicks, bAgentGenerating // If this exceeds GeneratingTimeoutSeconds, bAgentGenerating is force-cleared
// is force-cleared so StartListening() is no longer blocked. This covers the edge case // so StartListening() is no longer blocked.
// where the server sends agent_chat_response_part but the TTS pipeline produces no audio. float GeneratingTime = 0.0f;
int32 GeneratingTickCount = 0;
// Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks // Primary threshold: fire OnAgentStoppedSpeaking after this many seconds of silence
// once the server has confirmed the full response (bAgentResponseReceived=true). // once the server has confirmed the full response (bAgentResponseReceived=true).
// 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream. // 0.5s is enough to bridge brief inter-chunk gaps in the TTS stream.
static constexpr int32 SilenceThresholdTicks = 30; static constexpr float SilenceThresholdSeconds = 0.5f;
// Hard-timeout fallback: fire even without agent_response confirmation after 10s // Hard-timeout fallback: fire even without agent_response confirmation after 10s
// of silence. This covers edge cases where agent_response is very late or missing, // of silence. This covers edge cases where agent_response is very late or missing,
// while being long enough to bridge inter-batch TTS gaps (observed up to ~5s). // while being long enough to bridge inter-batch TTS gaps (observed up to ~5s).
// Previously 2s — raised after logs showed premature firing during multi-batch responses. static constexpr float HardSilenceTimeoutSeconds = 10.0f;
static constexpr int32 HardSilenceTimeoutTicks = 600; // 10s at 60fps
// Generating timeout: same as hard silence timeout.
static constexpr float GeneratingTimeoutSeconds = 10.0f;
// True once the server sends agent_response for the current turn. // True once the server sends agent_response for the current turn.
// The server sends the full text when generation is complete — this is the // The server sends the full text when generation is complete — this is the