v1.9.0: Fix audio gaps, pre-buffer, and lip sync neutral pose
- Remove silence padding accumulation bug: QueueAudio'd silence was accumulating in USoundWaveProcedural's internal buffer during TTS gaps, delaying real audio by ~800ms. USoundWaveProcedural with INDEFINITELY_LOOPING_DURATION generates silence internally instead.
- Fix pre-buffer bypass: guard OnProceduralUnderflow with bPreBuffering check — the audio component never stops (INDEFINITELY_LOOPING_DURATION) so it was draining AudioQueue during pre-buffering, defeating it entirely.
- Audio pre-buffer default 2000ms (max 4000ms) to absorb ElevenLabs server-side TTS inter-chunk gaps (~2s between chunks confirmed).
- Add diagnostic timestamps [T+Xs] in HandleAudioReceived and AudioQueue DRY/recovered logs for debugging audio pipeline timing.
- Fix lip sync not returning to neutral: add snap-to-zero (< 0.01) in blendshape smoothing pass and clean up PreviousBlendshapes to prevent asymptotic Lerp residuals keeping mouth slightly open.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
7dfffdbad8
commit
c2142f3e6b
Binary file not shown.
@@ -444,6 +444,19 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
|
||||
{
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
const int32 NumSamples = PCMData.Num() / sizeof(int16);
|
||||
const float DurationMs = (static_cast<float>(NumSamples) / 16000.0f) * 1000.0f;
|
||||
int32 QueueBefore;
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
QueueBefore = AudioQueue.Num() / sizeof(int16);
|
||||
}
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Audio chunk received: %d samples (%.0fms) | AudioQueue before: %d samples (%.0fms)"),
|
||||
T, LastClosedTurnIndex, NumSamples, DurationMs,
|
||||
QueueBefore, (static_cast<float>(QueueBefore) / 16000.0f) * 1000.0f);
|
||||
|
||||
EnqueueAgentAudio(PCMData);
|
||||
// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
|
||||
OnAgentAudioData.Broadcast(PCMData);
|
||||
@@ -560,6 +573,19 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
|
||||
// During pre-buffering, do NOT consume data from AudioQueue.
|
||||
// The AudioPlaybackComponent is still "playing" from the previous turn
|
||||
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
|
||||
// keeps firing. Without this guard, the underflow callback would drain
|
||||
// the AudioQueue immediately, defeating the pre-buffer entirely.
|
||||
// The ProceduralSoundWave generates silence internally when we return
|
||||
// nothing — this silence does NOT accumulate, so once bPreBuffering
|
||||
// clears, the buffered data plays immediately.
|
||||
if (bPreBuffering)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (AudioQueue.Num() > 0)
|
||||
{
|
||||
const int32 BytesRequired = SamplesRequired * sizeof(int16);
|
||||
@@ -567,23 +593,39 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
||||
|
||||
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
|
||||
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
|
||||
|
||||
// Log when queue recovers (new data arrived after being dry)
|
||||
if (bQueueWasDry)
|
||||
{
|
||||
bQueueWasDry = false;
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
|
||||
T, LastClosedTurnIndex, AudioQueue.Num());
|
||||
}
|
||||
}
|
||||
else if (bAgentSpeaking)
|
||||
{
|
||||
// Queue is empty but agent is still speaking (TTS inter-batch gap).
|
||||
// Feed a SMALL amount of silence to keep the audio component alive.
|
||||
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
|
||||
// SamplesRequired to avoid queuing large blocks of silence in the
|
||||
// audio component's internal buffer. Without this cap, multiple
|
||||
// underflow calls during a TTS gap accumulate hundreds of ms of silence
|
||||
// that must be played through BEFORE real audio data — causing the
|
||||
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
|
||||
// one small silence block sits ahead of new audio when it arrives.
|
||||
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
|
||||
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
|
||||
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
|
||||
SilenceBuffer.SetNumZeroed(SilenceBytes);
|
||||
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
|
||||
// Log once when queue first runs dry
|
||||
if (!bQueueWasDry)
|
||||
{
|
||||
bQueueWasDry = true;
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogElevenLabsAgent, Warning,
|
||||
TEXT("[T+%.2fs] [Turn %d] AudioQueue DRY — waiting for next TTS chunk (requested %d samples)."),
|
||||
T, LastClosedTurnIndex, SamplesRequired);
|
||||
}
|
||||
|
||||
// Do NOT feed silence via QueueAudio! USoundWaveProcedural with
|
||||
// INDEFINITELY_LOOPING_DURATION generates silence internally when
|
||||
// its buffer is empty — this internal silence does NOT accumulate
|
||||
// in the queue, so new audio data plays immediately when it arrives.
|
||||
//
|
||||
// Previously we QueueAudio'd 32ms silence blocks here, but they
|
||||
// accumulated in the procedural wave's internal buffer during TTS
|
||||
// gaps (1-2s between chunks). When the next chunk arrived, its data
|
||||
// was queued AFTER hundreds of ms of accumulated silence, causing
|
||||
// an audible pause before the real audio played.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -601,6 +643,7 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
||||
bAgentSpeaking = true;
|
||||
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
|
||||
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
||||
bQueueWasDry = false;
|
||||
SilentTickCount = 0;
|
||||
|
||||
const double T = AgentSpeakStart - SessionStartTime;
|
||||
|
||||
@@ -554,6 +554,28 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
||||
{
|
||||
Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
|
||||
}
|
||||
// Snap to zero to prevent the mouth from staying slightly open
|
||||
// after speech ends. Without this, the asymptotic Lerp decay
|
||||
// leaves tiny residual values (e.g. jawOpen=0.005) that keep
|
||||
// the mouth visibly ajar on MetaHuman faces.
|
||||
if (Pair.Value < 0.01f)
|
||||
{
|
||||
Pair.Value = 0.0f;
|
||||
}
|
||||
}
|
||||
// Clean up PreviousBlendshapes: remove entries that have fully decayed
|
||||
// to zero so they don't feed residual values back into the next frame.
|
||||
TArray<FName> KeysToRemove;
|
||||
for (const auto& Pair : CurrentBlendshapes)
|
||||
{
|
||||
if (Pair.Value == 0.0f)
|
||||
{
|
||||
KeysToRemove.Add(Pair.Key);
|
||||
}
|
||||
}
|
||||
for (const FName& Key : KeysToRemove)
|
||||
{
|
||||
CurrentBlendshapes.Remove(Key);
|
||||
}
|
||||
PreviousBlendshapes = CurrentBlendshapes;
|
||||
}
|
||||
|
||||
@@ -143,9 +143,9 @@ public:
|
||||
* Higher values = fewer gaps but more latency on the first word.
|
||||
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ClampMin = "0", ClampMax = "500",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
|
||||
int32 AudioPreBufferMs = 250;
|
||||
meta = (ClampMin = "0", ClampMax = "4000",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
|
||||
int32 AudioPreBufferMs = 2000;
|
||||
|
||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
@@ -377,6 +377,9 @@ private:
|
||||
bool bPreBuffering = false;
|
||||
double PreBufferStartTime = 0.0;
|
||||
|
||||
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
|
||||
bool bQueueWasDry = false;
|
||||
|
||||
// Silence detection: how many consecutive ticks with an empty audio queue.
|
||||
int32 SilentTickCount = 0;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user