Lip sync v2: text persistence across TTS chunks, audio pre-buffering, smoothing fixes

- Fix text erasure between TTS audio chunks (bFullTextReceived guard):
  partial text now persists across all chunks of the same utterance instead
  of being erased after chunk 1's queue empties
- Add audio pre-buffering (AudioPreBufferMs, default 250ms) to absorb TTS
  inter-chunk gaps and eliminate mid-sentence audio pauses
- Lip sync pauses viseme queue consumption during pre-buffer to stay in sync
- Inter-frame interpolation (lerp between consumed and next queued frame)
  for smoother mouth transitions instead of 32ms step-wise jumps
- Reduce double-smoothing (blendshape smooth 0.8→0.4, release 0.5→0.65)
- Adjust duration weights (vowels 2.0/1.7, plosives 0.8, silence 1.0)
- UI range refinement (AmplitudeScale 0.5-1.0, SmoothingSpeed 35-65)
- Silence padding capped at 512 samples (32ms) to prevent buffer accumulation
- Audio playback restart on buffer underrun during speech
- Optimized log levels (most debug→Verbose, kept key diagnostics at Log)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 19:34:36 +01:00
parent ce7a146ce9
commit 7dfffdbad8
6 changed files with 1270 additions and 184 deletions

View File

@ -80,6 +80,25 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe
GeneratingTickCount = 0;
}
// Pre-buffer timer: start playback after the pre-buffer period expires.
// If the second TTS chunk didn't arrive in time, start playing with
// whatever we have. The silence padding will bridge any remaining gap.
if (bPreBuffering)
{
const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
if (Elapsed >= static_cast<double>(AudioPreBufferMs))
{
bPreBuffering = false;
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[Turn %d] Pre-buffer timeout (%dms). Starting playback."),
LastClosedTurnIndex, AudioPreBufferMs);
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Play();
}
}
}
// Silence detection.
// ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock.
// OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast
@ -540,13 +559,32 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
USoundWaveProcedural* InProceduralWave, const int32 SamplesRequired)
{
FScopeLock Lock(&AudioQueueLock);
if (AudioQueue.Num() == 0) return;
const int32 BytesRequired = SamplesRequired * sizeof(int16);
const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
if (AudioQueue.Num() > 0)
{
const int32 BytesRequired = SamplesRequired * sizeof(int16);
const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
}
else if (bAgentSpeaking)
{
// Queue is empty but agent is still speaking (TTS inter-batch gap).
// Feed a SMALL amount of silence to keep the audio component alive.
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
// SamplesRequired to avoid queuing large blocks of silence in the
// audio component's internal buffer. Without this cap, multiple
// underflow calls during a TTS gap accumulate hundreds of ms of silence
// that must be played through BEFORE real audio data — causing the
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
// one small silence block sits ahead of new audio when it arrives.
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
SilenceBuffer.SetNumZeroed(SilenceBytes);
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
}
}
void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uint8>& PCMData)
@ -573,10 +611,50 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
OnAgentStartedSpeaking.Broadcast();
if (AudioPreBufferMs > 0)
{
// Pre-buffer: accumulate audio before starting playback.
// This absorbs TTS inter-chunk gaps so chunk 2 arrives before
// chunk 1 finishes playing, eliminating mid-sentence pauses.
bPreBuffering = true;
PreBufferStartTime = FPlatformTime::Seconds();
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[Turn %d] Pre-buffering %dms before starting playback."),
LastClosedTurnIndex, AudioPreBufferMs);
}
else if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Play();
}
}
else if (bPreBuffering)
{
// Second (or later) audio chunk arrived during pre-buffer period.
// We now have both chunks buffered — start playback immediately.
bPreBuffering = false;
const double BufferedMs = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."),
LastClosedTurnIndex, BufferedMs);
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Play();
}
SilentTickCount = 0;
}
else
{
// Already speaking — but the audio component may have stopped due to
// buffer underrun (TTS inter-batch gap). Restart it if needed.
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
UE_LOG(LogElevenLabsAgent, Warning,
TEXT("[Turn %d] Audio component stopped during speech (buffer underrun). Restarting playback."),
LastClosedTurnIndex);
AudioPlaybackComponent->Play();
}
// Reset silence counter — new audio arrived, we're not in a gap anymore
SilentTickCount = 0;
}
}
@ -592,6 +670,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
// while holding it would block the audio thread for the full Blueprint handler duration.
bool bWasSpeaking = false;
double Now = 0.0;
bPreBuffering = false; // Clear pre-buffer state on stop.
{
FScopeLock Lock(&AudioQueueLock);
AudioQueue.Empty();

View File

@ -136,6 +136,17 @@ public:
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
bool bEnableAgentPartialResponse = false;
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
* ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
* Pre-buffering delays playback start so the second chunk arrives before the
* first finishes playing, eliminating the audible gap mid-sentence.
* Higher values = fewer gaps but more latency on the first word.
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "0", ClampMax = "500",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
int32 AudioPreBufferMs = 250;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ClampMin = "0.0",
@ -257,6 +268,11 @@ public:
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
const FElevenLabsConversationInfo& GetConversationInfo() const;
/** True while audio is being pre-buffered (playback hasn't started yet).
* Used by the LipSync component to pause viseme queue consumption. */
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
bool IsPreBuffering() const { return bPreBuffering; }
/** Access the underlying WebSocket proxy (advanced use). */
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
UElevenLabsWebSocketProxy* GetWebSocketProxy() const { return WebSocketProxy; }
@ -353,6 +369,14 @@ private:
TArray<uint8> AudioQueue;
FCriticalSection AudioQueueLock;
// Reusable zero-filled buffer fed to USoundWaveProcedural during TTS gaps
// to keep the audio component alive (prevents stop on buffer underrun).
TArray<uint8> SilenceBuffer;
// Pre-buffer state: delay playback start to absorb TTS inter-chunk gaps.
bool bPreBuffering = false;
double PreBufferStartTime = 0.0;
// Silence detection: how many consecutive ticks with an empty audio queue.
int32 SilentTickCount = 0;

View File

@ -51,11 +51,19 @@ public:
ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
float LipSyncStrength = 1.0f;
/** Scales the audio amplitude driving mouth movement.
* Lower values produce subtler animation, higher values are more pronounced.
* Use this to tone down overly strong lip movement without changing the shape. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
meta = (ClampMin = "0.5", ClampMax = "1.0",
ToolTip = "Audio amplitude scale.\n0.5 = subtle, 0.75 = balanced, 1.0 = full.\nReduces overall mouth movement without affecting viseme shape."))
float AmplitudeScale = 0.75f;
/** How quickly viseme weights interpolate towards new values each frame.
 * Higher values follow the target viseme more tightly (sharper mouth motion);
 * lower values smooth harder at the cost of lip-sync lag. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
	meta = (ClampMin = "35.0", ClampMax = "65.0",
	ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive."))
float SmoothingSpeed = 50.0f;
// ── Events ────────────────────────────────────────────────────────────────
@ -87,6 +95,20 @@ private:
/** Receives raw PCM from the agent component. */
void OnAudioChunkReceived(const TArray<uint8>& PCMData);
/** Receives full text response from the agent component. */
UFUNCTION()
void OnTextResponseReceived(const FString& ResponseText);
/** Receives partial text streaming from the agent component. */
UFUNCTION()
void OnPartialTextReceived(const FString& PartialText);
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
void ConvertTextToVisemes(const FString& Text);
/** Apply text-derived viseme shapes to the remaining queued frames. */
void ApplyTextVisemesToQueue();
/** Extract frequency band energies from the spectrum analyzer. */
void AnalyzeSpectrum();
@ -122,6 +144,13 @@ private:
// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
TMap<FName, float> CurrentBlendshapes;
// Previous frame's blendshape values for additional output smoothing
TMap<FName, float> PreviousBlendshapes;
// Last consumed queue frame — used for inter-frame interpolation
// to create continuous motion instead of 32ms step-wise jumps
TMap<FName, float> LastConsumedVisemes;
// MetaHuman mode: Face mesh has no morph targets, use animation curves instead.
// Set automatically in BeginPlay when TargetMesh has 0 morph targets.
bool bUseCurveMode = false;
@ -129,9 +158,48 @@ private:
// Cache of ARKit→MetaHuman curve name conversions to avoid per-frame string ops.
TMap<FName, FName> CurveNameCache;
// RMS amplitude from the latest audio chunk (0-1 range, drives jaw opening)
float CurrentAmplitude = 0.0f;
// ── Viseme queue ──────────────────────────────────────────────────────────
// Queue of per-window viseme analysis results.
// OnAudioChunkReceived builds one frame per 512-sample window (~32ms).
// TickComponent consumes them at the correct playback rate.
TArray<TMap<FName, float>> VisemeQueue;
// Parallel queue of per-window amplitude values (for text-driven shape replacement)
TArray<float> AmplitudeQueue;
// Timer for consuming queued viseme frames at the FFT window rate
float PlaybackTimer = 0.0f;
// Whether we have pending analysis results to process
bool bHasPendingAnalysis = false;
// ── Text-driven lip sync ──────────────────────────────────────────────────
// Accumulated partial text from streaming (agent_chat_response_part events).
// Built up token-by-token before the audio arrives.
FString AccumulatedText;
// Ordered sequence of OVR viseme names derived from text.
// E.g. "Bonjour" → [PP, oh, nn, CH, ou, RR]
TArray<FName> TextVisemeSequence;
// Whether text-based visemes have been applied to the current queue
bool bTextVisemesApplied = false;
// Set when agent_response arrives (full text for this utterance).
// Prevents resetting AccumulatedText between audio chunks of the
// SAME utterance — only reset once the full response is confirmed.
bool bFullTextReceived = false;
// Wait-for-text mechanism: when audio arrives without text, hold playback
// until text arrives (partial or full) so all frames get proper text visemes.
bool bWaitingForText = false;
double WaitingForTextStartTime = 0.0;
// Cached reference to the agent component on the same Actor
TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
FDelegateHandle AudioDataHandle;