v1.9.0: Fix audio gaps, pre-buffer, and lip sync neutral pose

- Remove silence padding accumulation bug: silence pushed via QueueAudio was accumulating in USoundWaveProcedural's internal buffer during TTS gaps, delaying real audio by ~800ms. USoundWaveProcedural with INDEFINITELY_LOOPING_DURATION generates silence internally instead.
- Fix pre-buffer bypass: guard OnProceduralUnderflow with a bPreBuffering check — the audio component never stops (INDEFINITELY_LOOPING_DURATION), so it was draining AudioQueue during pre-buffering, defeating it entirely.
- Audio pre-buffer default 2000ms (max 4000ms) to absorb ElevenLabs server-side TTS inter-chunk gaps (~2s between chunks confirmed).
- Add diagnostic timestamps [T+Xs] in HandleAudioReceived and in the AudioQueue DRY/recovered logs for debugging audio pipeline timing.
- Fix lip sync not returning to neutral: add snap-to-zero (< 0.01) in the blendshape smoothing pass and clean up PreviousBlendshapes to prevent asymptotic Lerp residuals from keeping the mouth slightly open.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 20:37:23 +01:00
parent 7dfffdbad8
commit c2142f3e6b
4 changed files with 85 additions and 17 deletions
--- a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
+++ b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -444,6 +444,19 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
 void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
 {
 	const double T = FPlatformTime::Seconds() - SessionStartTime;
 	const int32 NumSamples = PCMData.Num() / sizeof(int16);
 	const float DurationMs = (static_cast<float>(NumSamples) / 16000.0f) * 1000.0f;
 	int32 QueueBefore;
 	{
 		FScopeLock Lock(&AudioQueueLock);
 		QueueBefore = AudioQueue.Num() / sizeof(int16);
 	}
 	UE_LOG(LogElevenLabsAgent, Log,
 		TEXT("[T+%.2fs] [Turn %d] Audio chunk received: %d samples (%.0fms) | AudioQueue before: %d samples (%.0fms)"),
 		T, LastClosedTurnIndex, NumSamples, DurationMs,
 		QueueBefore, (static_cast<float>(QueueBefore) / 16000.0f) * 1000.0f);
 	EnqueueAgentAudio(PCMData);
 	// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
 	OnAgentAudioData.Broadcast(PCMData);
@@ -560,6 +573,19 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
 {
 	FScopeLock Lock(&AudioQueueLock);
 	// During pre-buffering, do NOT consume data from AudioQueue.
 	// The AudioPlaybackComponent is still "playing" from the previous turn
 	// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
 	// keeps firing. Without this guard, the underflow callback would drain
 	// the AudioQueue immediately, defeating the pre-buffer entirely.
 	// The ProceduralSoundWave generates silence internally when we return
 	// nothing — this silence does NOT accumulate, so once bPreBuffering
 	// clears, the buffered data plays immediately.
 	if (bPreBuffering)
 	{
 		return;
 	}
 	if (AudioQueue.Num() > 0)
 	{
 		const int32 BytesRequired = SamplesRequired * sizeof(int16);
@@ -567,23 +593,39 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
 		InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
 		AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
 		// Log when queue recovers (new data arrived after being dry)
 		if (bQueueWasDry)
 		{
 			bQueueWasDry = false;
 			const double T = FPlatformTime::Seconds() - SessionStartTime;
 			UE_LOG(LogElevenLabsAgent, Log,
 				TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
 				T, LastClosedTurnIndex, AudioQueue.Num());
 		}
 	}
 	else if (bAgentSpeaking)
 	{
-		// Queue is empty but agent is still speaking (TTS inter-batch gap).
+		// Log once when queue first runs dry
-		// Feed a SMALL amount of silence to keep the audio component alive.
+		if (!bQueueWasDry)
-		// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
+		{
-		// SamplesRequired to avoid queuing large blocks of silence in the
+			bQueueWasDry = true;
-		// audio component's internal buffer. Without this cap, multiple
+			const double T = FPlatformTime::Seconds() - SessionStartTime;
-		// underflow calls during a TTS gap accumulate hundreds of ms of silence
+			UE_LOG(LogElevenLabsAgent, Warning,
-		// that must be played through BEFORE real audio data — causing the
+				TEXT("[T+%.2fs] [Turn %d] AudioQueue DRY — waiting for next TTS chunk (requested %d samples)."),
-		// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
+				T, LastClosedTurnIndex, SamplesRequired);
-		// one small silence block sits ahead of new audio when it arrives.
+		}
-		constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
+
-		const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
+		// Do NOT feed silence via QueueAudio! USoundWaveProcedural with
-		const int32 SilenceBytes = SilenceSamples * sizeof(int16);
+		// INDEFINITELY_LOOPING_DURATION generates silence internally when
-		SilenceBuffer.SetNumZeroed(SilenceBytes);
+		// its buffer is empty — this internal silence does NOT accumulate
-		InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
+		// in the queue, so new audio data plays immediately when it arrives.
 		//
 		// Previously we QueueAudio'd 32ms silence blocks here, but they
 		// accumulated in the procedural wave's internal buffer during TTS
 		// gaps (1-2s between chunks). When the next chunk arrived, its data
 		// was queued AFTER hundreds of ms of accumulated silence, causing
 		// an audible pause before the real audio played.
 	}
 }
@@ -601,6 +643,7 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
 		bAgentSpeaking = true;
 		bAgentGenerating = false;    // Agent is now speaking — generation phase is over.
 		bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
 		bQueueWasDry = false;
 		SilentTickCount = 0;
 		const double T = AgentSpeakStart - SessionStartTime;
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
@@ -554,6 +554,28 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
 			{
 				Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
 			}
 			// Snap to zero to prevent the mouth from staying slightly open
 			// after speech ends. Without this, the asymptotic Lerp decay
 			// leaves tiny residual values (e.g. jawOpen=0.005) that keep
 			// the mouth visibly ajar on MetaHuman faces.
 			if (Pair.Value < 0.01f)
 			{
 				Pair.Value = 0.0f;
 			}
 		}
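 		// Drop entries that have fully decayed to zero from CurrentBlendshapes
 		// before it is copied into PreviousBlendshapes below, so they don't
 		// feed residual values back into the next frame.
 		// NOTE(review): this also removes the zeroed keys from
 		// CurrentBlendshapes itself — confirm that whatever reads
 		// CurrentBlendshapes treats a missing key as 0.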
 		TArray<FName> KeysToRemove;
 		for (const auto& Pair : CurrentBlendshapes)
 		{
 			if (Pair.Value == 0.0f)
 			{
 				KeysToRemove.Add(Pair.Key);
 			}
 		}
 		for (const FName& Key : KeysToRemove)
 		{
 			CurrentBlendshapes.Remove(Key);
 		}
 		PreviousBlendshapes = CurrentBlendshapes;
 	}
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -143,9 +143,9 @@ public:
 	 *  Higher values = fewer gaps but more latency on the first word.
 	 *  Set to 0 for immediate playback (may cause mid-sentence pauses). */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
-		meta = (ClampMin = "0", ClampMax = "500",
+		meta = (ClampMin = "0", ClampMax = "4000",
-		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
+		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
-	int32 AudioPreBufferMs = 250;
+	int32 AudioPreBufferMs = 2000;
 	/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
@@ -377,6 +377,9 @@ private:
 	bool bPreBuffering = false;
 	double PreBufferStartTime = 0.0;
 	// Debug: track when the AudioQueue runs dry during speech (one-shot log).
 	bool bQueueWasDry = false;
 	// Silence detection: how many consecutive ticks with an empty audio queue.
 	int32 SilentTickCount = 0;