Fix audio cutoff and lip sync activation bugs during agent switching
- Fix A→B→A audio cutoff: when switching back to a pending-leave agent, cancel the deferred leave instead of force-completing it (was calling StopAgentAudio on the agent we're returning to)
- Fix deferred leave firing during TTS gaps: use IsAgentSpeakingOrPending() instead of IsAgentSpeaking() — checks bAgentGenerating and bAgentResponseReceived to avoid premature leave during inter-batch silence
- Convert silence detection from tick-based to time-based: SilentTickCount → SilentTime (float seconds), GeneratingTickCount → GeneratingTime. Consistent behavior regardless of frame rate (was 5s@120fps vs 20s@30fps)
- Fix lazy binding: add OnAgentConnected/OnAgentDisconnected in LipSync and FacialExpression TickComponent lazy-bind path (bActive stayed false forever in packaged builds when component init order differed)
- Fix reconnection: reset bWaitingForAgentResponse and GeneratingTime before entering reconnect mode to avoid stale state on new session
- Fix event_ID audio filtering: reset LastInterruptEventId in HandleAgentResponse and SendUserTurnStart so first audio chunks of a new turn are not silently discarded by stale interrupt filter
- Preserve retained gaze when switching back to same agent (don't CleanupRetainedGaze if PrevRetained == NewAgent)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
aea02abe89
commit
e5a32f5997
@ -177,27 +177,27 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
||||
|
||||
// Generating timeout (ISSUE-1): if the server sent agent_chat_response_part
|
||||
// (bAgentGenerating=true) but no audio ever arrived (bAgentSpeaking=false),
|
||||
// force-clear bAgentGenerating after 10s so StartListening() is no longer blocked.
|
||||
// Normal path: first audio chunk → EnqueueAgentAudio → bAgentGenerating=false.
|
||||
// This fallback covers the rare case where TTS produces nothing (e.g. empty response).
|
||||
// force-clear bAgentGenerating after GeneratingTimeoutSeconds so StartListening()
|
||||
// is no longer blocked. Time-based to behave consistently across frame rates.
|
||||
if (bAgentGenerating && !bAgentSpeaking)
|
||||
{
|
||||
if (++GeneratingTickCount >= HardSilenceTimeoutTicks)
|
||||
GeneratingTime += DeltaTime;
|
||||
if (GeneratingTime >= GeneratingTimeoutSeconds)
|
||||
{
|
||||
bAgentGenerating = false;
|
||||
GeneratingTickCount = 0;
|
||||
GeneratingTime = 0.0f;
|
||||
if (bDebug)
|
||||
{
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Warning,
|
||||
TEXT("[T+%.2fs] [Turn %d] Generating timeout (10s) — no audio arrived. Clearing bAgentGenerating."),
|
||||
T, LastClosedTurnIndex);
|
||||
TEXT("[T+%.2fs] [Turn %d] Generating timeout (%.0fs) — no audio arrived. Clearing bAgentGenerating."),
|
||||
T, LastClosedTurnIndex, GeneratingTimeoutSeconds);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
GeneratingTickCount = 0;
|
||||
GeneratingTime = 0.0f;
|
||||
}
|
||||
|
||||
// Pre-buffer timer: start playback after the pre-buffer period expires.
|
||||
@ -321,19 +321,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
if (AudioQueue.Num() - AudioQueueReadOffset == 0)
|
||||
{
|
||||
SilentTickCount++;
|
||||
SilentTime += DeltaTime;
|
||||
|
||||
// Wait for agent_response (confirms the full response is done) before
|
||||
// declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking
|
||||
// events when ElevenLabs TTS streams audio in multiple batches with gaps
|
||||
// (e.g. for long responses) — without this guard, the Blueprint's
|
||||
// OnAgentStoppedSpeaking handler reopens the mic mid-response.
|
||||
const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks;
|
||||
const bool bResponseConfirmed = bAgentResponseReceived && SilentTime >= SilenceThresholdSeconds;
|
||||
|
||||
// Hard-timeout fallback: if agent_response never arrives (or is very late),
|
||||
// stop after 10s of silence to avoid leaving the state machine stuck.
|
||||
// 10s was chosen to bridge observed inter-batch TTS gaps of up to ~5s.
|
||||
const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks;
|
||||
// Time-based to behave consistently regardless of frame rate.
|
||||
const bool bHardTimeout = SilentTime >= HardSilenceTimeoutSeconds;
|
||||
|
||||
if (bResponseConfirmed || bHardTimeout)
|
||||
{
|
||||
@ -341,13 +341,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
||||
bAgentSpeaking = false;
|
||||
bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted.
|
||||
bAgentResponseReceived = false;
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
bShouldBroadcastStopped = true;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
}
|
||||
}
|
||||
// Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time.
|
||||
@ -1096,7 +1096,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode,
|
||||
bIsListening = false;
|
||||
// bAgentSpeaking / bAgentGenerating / bAgentResponseReceived already cleared by StopAgentAudio.
|
||||
bWaitingForAgentResponse = false;
|
||||
GeneratingTickCount = 0;
|
||||
GeneratingTime = 0.0f;
|
||||
TurnIndex = 0;
|
||||
LastClosedTurnIndex = 0;
|
||||
CurrentEmotion = EPS_AI_ConvAgent_Emotion::Neutral;
|
||||
@ -1112,6 +1112,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleDisconnected(int32 StatusCode,
|
||||
if (!bIntentionalDisconnect && StatusCode != 1000
|
||||
&& MaxReconnectAttempts > 0 && GetOwnerRole() == ROLE_Authority)
|
||||
{
|
||||
// Clean up stale turn state so the reconnected session starts fresh.
|
||||
// StopAgentAudio() already ran above, but these fields are not reset by it:
|
||||
bWaitingForAgentResponse = false;
|
||||
GeneratingTime = 0.0f;
|
||||
|
||||
bWantsReconnect = true;
|
||||
ReconnectAttemptCount = 0;
|
||||
const double Delay = 1.0; // First attempt after 1 second.
|
||||
@ -1612,7 +1617,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
|
||||
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
||||
bQueueWasDry = false;
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
// Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement.
|
||||
TurnFirstChunkTime = FPlatformTime::Seconds();
|
||||
TurnFirstChunkBytes = PCMData.Num();
|
||||
@ -1717,7 +1722,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
}
|
||||
OnAudioPlaybackStarted.Broadcast();
|
||||
}
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -1745,7 +1750,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
}
|
||||
}
|
||||
// Reset silence counter — new audio arrived, we're not in a gap anymore
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
}
|
||||
}
|
||||
|
||||
@ -1784,7 +1789,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
|
||||
if (bAgentSpeaking)
|
||||
{
|
||||
bAgentSpeaking = false;
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
bWasSpeaking = true;
|
||||
Now = FPlatformTime::Seconds();
|
||||
}
|
||||
@ -2433,7 +2438,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::MulticastAgentStoppedSpeaking_Impleme
|
||||
{
|
||||
if (GetOwnerRole() == ROLE_Authority) return;
|
||||
bAgentSpeaking = false;
|
||||
SilentTickCount = 0;
|
||||
SilentTime = 0.0f;
|
||||
OnAgentStoppedSpeaking.Broadcast();
|
||||
}
|
||||
|
||||
@ -2711,8 +2716,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
|
||||
// Audio queue (read without lock for debug display — minor race is acceptable)
|
||||
const int32 QueueBytes = FMath::Max(0, AudioQueue.Num() - AudioQueueReadOffset);
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, MainColor,
|
||||
FString::Printf(TEXT(" AudioQueue: %d bytes SilentTicks: %d"),
|
||||
QueueBytes, SilentTickCount));
|
||||
FString::Printf(TEXT(" AudioQueue: %d bytes SilentTime: %.2fs"),
|
||||
QueueBytes, SilentTime));
|
||||
|
||||
// Timing
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
|
||||
@ -286,6 +286,15 @@ void UPS_AI_ConvAgent_FacialExpressionComponent::TickComponent(
|
||||
AgentComponent = Agent;
|
||||
Agent->OnAgentEmotionChanged.AddDynamic(
|
||||
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnEmotionChanged);
|
||||
|
||||
// Bind conversation lifecycle — same as BeginPlay path.
|
||||
// Without these, bActive stays false forever when lazy-bound.
|
||||
Agent->OnAgentConnected.AddDynamic(
|
||||
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationConnected);
|
||||
Agent->OnAgentDisconnected.AddDynamic(
|
||||
this, &UPS_AI_ConvAgent_FacialExpressionComponent::OnConversationDisconnected);
|
||||
bActive = false;
|
||||
CurrentActiveAlpha = 0.0f;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -182,9 +182,9 @@ void UPS_AI_ConvAgent_InteractionComponent::TickComponent(float DeltaTime, ELeve
|
||||
CleanupRetainedGaze(Pending);
|
||||
PendingLeaveAgent.Reset();
|
||||
}
|
||||
else if (!Pending->IsAgentSpeaking())
|
||||
else if (!Pending->IsAgentSpeakingOrPending())
|
||||
{
|
||||
// Agent finished speaking — leave conversation, retain gaze.
|
||||
// Agent truly finished speaking (not just a TTS inter-batch gap) — leave conversation, retain gaze.
|
||||
ExecuteLeave(Pending);
|
||||
GazeRetainedAgent = Pending;
|
||||
PendingLeaveAgent.Reset();
|
||||
@ -539,23 +539,45 @@ void UPS_AI_ConvAgent_InteractionComponent::SetSelectedAgent(UPS_AI_ConvAgent_El
|
||||
// player until they walk out of interaction range.
|
||||
if (bAutoStartConversation && (OldAgent->IsConnected() || OldAgent->bNetIsConversing))
|
||||
{
|
||||
// If a previous pending leave exists, force-complete it now.
|
||||
// If a previous pending leave exists, handle it.
|
||||
if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevPending = PendingLeaveAgent.Get())
|
||||
{
|
||||
if (PrevPending == NewAgent)
|
||||
{
|
||||
// Player is switching back to the pending agent (A→B→A).
|
||||
// Cancel the deferred leave — don't execute it, the player
|
||||
// is coming back to this agent and the conversation is still alive.
|
||||
PendingLeaveAgent.Reset();
|
||||
|
||||
if (bDebug)
|
||||
{
|
||||
UE_LOG(LogPS_AI_ConvAgent_Select, Log,
|
||||
TEXT(" Cancelled pending leave (switching back to same agent): %s"),
|
||||
PrevPending->GetOwner() ? *PrevPending->GetOwner()->GetName() : TEXT("(null)"));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
// Different agent — force-complete the old pending leave.
|
||||
ExecuteLeave(PrevPending);
|
||||
CleanupRetainedGaze(PrevPending);
|
||||
PendingLeaveAgent.Reset();
|
||||
}
|
||||
// Similarly, clean up any existing retained gaze.
|
||||
}
|
||||
// Similarly, clean up any existing retained gaze (unless it's the agent
|
||||
// we're about to re-select — keep gaze alive during the transition).
|
||||
if (UPS_AI_ConvAgent_ElevenLabsComponent* PrevRetained = GazeRetainedAgent.Get())
|
||||
{
|
||||
if (PrevRetained != NewAgent)
|
||||
{
|
||||
CleanupRetainedGaze(PrevRetained);
|
||||
GazeRetainedAgent.Reset();
|
||||
}
|
||||
}
|
||||
|
||||
if (OldAgent->IsAgentSpeaking())
|
||||
if (OldAgent->IsAgentSpeakingOrPending())
|
||||
{
|
||||
// Agent is still speaking — defer the Leave.
|
||||
// Agent is still speaking (or generating, waiting for more audio) — defer the Leave.
|
||||
// Gaze and body tracking stay active so the agent keeps
|
||||
// looking at the player while finishing its sentence.
|
||||
PendingLeaveAgent = OldAgent;
|
||||
|
||||
@ -758,6 +758,16 @@ void UPS_AI_ConvAgent_LipSyncComponent::TickComponent(float DeltaTime, ELevelTic
|
||||
this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentInterrupted);
|
||||
Agent->OnAgentStoppedSpeaking.AddDynamic(
|
||||
this, &UPS_AI_ConvAgent_LipSyncComponent::OnAgentStopped);
|
||||
|
||||
// Bind conversation lifecycle — same as BeginPlay path.
|
||||
// Without these, bActive stays false forever when lazy-bound.
|
||||
Agent->OnAgentConnected.AddDynamic(
|
||||
this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationConnected);
|
||||
Agent->OnAgentDisconnected.AddDynamic(
|
||||
this, &UPS_AI_ConvAgent_LipSyncComponent::OnConversationDisconnected);
|
||||
bActive = false;
|
||||
CurrentActiveAlpha = 0.0f;
|
||||
|
||||
Agent->bEnableAgentPartialResponse = true;
|
||||
}
|
||||
}
|
||||
|
||||
@ -127,6 +127,7 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::SendUserTurnStart()
|
||||
bWaitingForResponse = false;
|
||||
bFirstAudioResponseLogged = false;
|
||||
bAgentResponseStartedFired = false;
|
||||
LastInterruptEventId = 0; // New user turn — stale interrupt filter no longer valid.
|
||||
|
||||
// No log here — turn start is implicit from audio chunks following.
|
||||
}
|
||||
@ -563,6 +564,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleAgentResponse(const TShar
|
||||
// subsequent agent_chat_response_part is guaranteed to belong to a new turn.
|
||||
bAgentResponseStartedFired = false;
|
||||
|
||||
// Also reset the interrupt audio filter here. agent_response is the last message
|
||||
// of the current turn — any audio arriving after this belongs to a new generation
|
||||
// and must not be filtered by a stale interrupt event_id from this turn.
|
||||
// This covers the edge case where audio for the next turn arrives before
|
||||
// agent_chat_response_part (which also resets the filter).
|
||||
LastInterruptEventId = 0;
|
||||
|
||||
// { "type": "agent_response",
|
||||
// "agent_response_event": { "agent_response": "..." }
|
||||
// }
|
||||
|
||||
@ -601,6 +601,11 @@ public:
|
||||
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
|
||||
bool IsAgentSpeaking() const { return bAgentSpeaking; }
|
||||
|
||||
/** True when the agent is speaking, OR is still generating and the server hasn't confirmed the full response yet.
|
||||
* Use this instead of IsAgentSpeaking() when you need to know if the agent MIGHT still
|
||||
* produce more audio (e.g. during TTS inter-batch gaps). */
|
||||
bool IsAgentSpeakingOrPending() const { return bAgentSpeaking || (bAgentGenerating && !bAgentResponseReceived); }
|
||||
|
||||
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
|
||||
const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const;
|
||||
|
||||
@ -795,25 +800,27 @@ private:
|
||||
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
|
||||
bool bQueueWasDry = false;
|
||||
|
||||
// Silence detection: how many consecutive ticks with an empty audio queue.
|
||||
int32 SilentTickCount = 0;
|
||||
// Silence detection: accumulated seconds of empty audio queue.
|
||||
// Time-based (not tick-based) to behave consistently across frame rates.
|
||||
float SilentTime = 0.0f;
|
||||
|
||||
// Generating timeout: how many consecutive ticks bAgentGenerating has been true
|
||||
// without any audio arriving. If this reaches HardSilenceTimeoutTicks, bAgentGenerating
|
||||
// is force-cleared so StartListening() is no longer blocked. This covers the edge case
|
||||
// where the server sends agent_chat_response_part but the TTS pipeline produces no audio.
|
||||
int32 GeneratingTickCount = 0;
|
||||
// Generating timeout: accumulated seconds with bAgentGenerating=true but no audio.
|
||||
// If this exceeds GeneratingTimeoutSeconds, bAgentGenerating is force-cleared
|
||||
// so StartListening() is no longer blocked.
|
||||
float GeneratingTime = 0.0f;
|
||||
|
||||
// Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks
|
||||
// Primary threshold: fire OnAgentStoppedSpeaking after this many seconds of silence
|
||||
// once the server has confirmed the full response (bAgentResponseReceived=true).
|
||||
// 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream.
|
||||
static constexpr int32 SilenceThresholdTicks = 30;
|
||||
// 0.5s is enough to bridge brief inter-chunk gaps in the TTS stream.
|
||||
static constexpr float SilenceThresholdSeconds = 0.5f;
|
||||
|
||||
// Hard-timeout fallback: fire even without agent_response confirmation after 10s
|
||||
// of silence. This covers edge cases where agent_response is very late or missing,
|
||||
// while being long enough to bridge inter-batch TTS gaps (observed up to ~5s).
|
||||
// Previously 2s — raised after logs showed premature firing during multi-batch responses.
|
||||
static constexpr int32 HardSilenceTimeoutTicks = 600; // 10s at 60fps
|
||||
static constexpr float HardSilenceTimeoutSeconds = 10.0f;
|
||||
|
||||
// Generating timeout: same as hard silence timeout.
|
||||
static constexpr float GeneratingTimeoutSeconds = 10.0f;
|
||||
|
||||
// True once the server sends agent_response for the current turn.
|
||||
// The server sends the full text when generation is complete — this is the
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user