Lip sync v2: text persistence across TTS chunks, audio pre-buffering, smoothing fixes

- Fix text erasure between TTS audio chunks (bFullTextReceived guard):
  partial text now persists across all chunks of the same utterance instead
  of being erased after chunk 1's queue empties
- Add audio pre-buffering (AudioPreBufferMs, default 250ms) to absorb TTS
  inter-chunk gaps and eliminate mid-sentence audio pauses
- Lip sync pauses viseme queue consumption during pre-buffer to stay in sync
- Inter-frame interpolation (lerp between consumed and next queued frame)
  for smoother mouth transitions instead of 32ms step-wise jumps
- Reduce double-smoothing (blendshape smooth 0.8→0.4, release 0.5→0.65)
- Adjust duration weights (vowels 2.0/1.7, plosives 0.8, silence 1.0)
- UI range refinement (AmplitudeScale 0.5-1.0, SmoothingSpeed 35-65)
- Silence padding capped at 512 samples (32ms) to prevent buffer accumulation
- Audio playback restart on buffer underrun during speech
- Optimized log levels (most debug→Verbose, kept key diagnostics at Log)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 19:34:36 +01:00
parent ce7a146ce9
commit 7dfffdbad8
6 changed files with 1270 additions and 184 deletions

View File

@ -80,6 +80,25 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe
GeneratingTickCount = 0;
}
// Pre-buffer timer: start playback after the pre-buffer period expires.
// If the second TTS chunk didn't arrive in time, start playing with
// whatever we have. The silence padding will bridge any remaining gap.
if (bPreBuffering)
{
const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
if (Elapsed >= static_cast<double>(AudioPreBufferMs))
{
bPreBuffering = false;
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[Turn %d] Pre-buffer timeout (%dms). Starting playback."),
LastClosedTurnIndex, AudioPreBufferMs);
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Play();
}
}
}
// Silence detection.
// ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock.
// OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast
@ -540,13 +559,32 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
USoundWaveProcedural* InProceduralWave, const int32 SamplesRequired)
{
FScopeLock Lock(&AudioQueueLock);
if (AudioQueue.Num() == 0) return;
const int32 BytesRequired = SamplesRequired * sizeof(int16);
const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
if (AudioQueue.Num() > 0)
{
const int32 BytesRequired = SamplesRequired * sizeof(int16);
const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
}
else if (bAgentSpeaking)
{
// Queue is empty but agent is still speaking (TTS inter-batch gap).
// Feed a SMALL amount of silence to keep the audio component alive.
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
// SamplesRequired to avoid queuing large blocks of silence in the
// audio component's internal buffer. Without this cap, multiple
// underflow calls during a TTS gap accumulate hundreds of ms of silence
// that must be played through BEFORE real audio data — causing the
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
// one small silence block sits ahead of new audio when it arrives.
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
SilenceBuffer.SetNumZeroed(SilenceBytes);
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
}
}
void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uint8>& PCMData)
@ -573,10 +611,50 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
OnAgentStartedSpeaking.Broadcast();
if (AudioPreBufferMs > 0)
{
// Pre-buffer: accumulate audio before starting playback.
// This absorbs TTS inter-chunk gaps so chunk 2 arrives before
// chunk 1 finishes playing, eliminating mid-sentence pauses.
bPreBuffering = true;
PreBufferStartTime = FPlatformTime::Seconds();
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[Turn %d] Pre-buffering %dms before starting playback."),
LastClosedTurnIndex, AudioPreBufferMs);
}
else if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Play();
}
}
else if (bPreBuffering)
{
// Second (or later) audio chunk arrived during pre-buffer period.
// We now have both chunks buffered — start playback immediately.
bPreBuffering = false;
const double BufferedMs = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."),
LastClosedTurnIndex, BufferedMs);
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Play();
}
SilentTickCount = 0;
}
else
{
// Already speaking — but the audio component may have stopped due to
// buffer underrun (TTS inter-batch gap). Restart it if needed.
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
UE_LOG(LogElevenLabsAgent, Warning,
TEXT("[Turn %d] Audio component stopped during speech (buffer underrun). Restarting playback."),
LastClosedTurnIndex);
AudioPlaybackComponent->Play();
}
// Reset silence counter — new audio arrived, we're not in a gap anymore
SilentTickCount = 0;
}
}
@ -592,6 +670,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
// while holding it would block the audio thread for the full Blueprint handler duration.
bool bWasSpeaking = false;
double Now = 0.0;
bPreBuffering = false; // Clear pre-buffer state on stop.
{
FScopeLock Lock(&AudioQueueLock);
AudioQueue.Empty();

View File

@ -136,6 +136,17 @@ public:
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
bool bEnableAgentPartialResponse = false;
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
* ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
* Pre-buffering delays playback start so the second chunk arrives before the
* first finishes playing, eliminating the audible gap mid-sentence.
* Higher values = fewer gaps but more latency on the first word.
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "0", ClampMax = "500",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
int32 AudioPreBufferMs = 250;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
meta = (ClampMin = "0.0",
@ -257,6 +268,11 @@ public:
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
const FElevenLabsConversationInfo& GetConversationInfo() const;
/** True while audio is being pre-buffered (playback hasn't started yet).
* Used by the LipSync component to pause viseme queue consumption. */
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
bool IsPreBuffering() const { return bPreBuffering; }
/** Access the underlying WebSocket proxy (advanced use). */
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
UElevenLabsWebSocketProxy* GetWebSocketProxy() const { return WebSocketProxy; }
@ -353,6 +369,14 @@ private:
TArray<uint8> AudioQueue;
FCriticalSection AudioQueueLock;
// Reusable zero-filled buffer fed to USoundWaveProcedural during TTS gaps
// to keep the audio component alive (prevents stop on buffer underrun).
TArray<uint8> SilenceBuffer;
// Pre-buffer state: delay playback start to absorb TTS inter-chunk gaps.
bool bPreBuffering = false;
double PreBufferStartTime = 0.0;
// Silence detection: how many consecutive ticks with an empty audio queue.
int32 SilentTickCount = 0;

View File

@ -51,11 +51,19 @@ public:
ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
float LipSyncStrength = 1.0f;
/** Scales the audio amplitude driving mouth movement.
* Lower values produce subtler animation, higher values are more pronounced.
* Use this to tone down overly strong lip movement without changing the shape. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
meta = (ClampMin = "0.5", ClampMax = "1.0",
ToolTip = "Audio amplitude scale.\n0.5 = subtle, 0.75 = balanced, 1.0 = full.\nReduces overall mouth movement without affecting viseme shape."))
float AmplitudeScale = 0.75f;
/** How quickly viseme weights interpolate towards new values each frame.
 * Higher values follow the target viseme more tightly (sharper mouth motion);
 * lower values smooth harder at the cost of lip-sync lag. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
	meta = (ClampMin = "35.0", ClampMax = "65.0",
	ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive."))
float SmoothingSpeed = 50.0f;
// ── Events ────────────────────────────────────────────────────────────────
@ -87,6 +95,20 @@ private:
/** Receives raw PCM from the agent component. */
void OnAudioChunkReceived(const TArray<uint8>& PCMData);
/** Receives full text response from the agent component. */
UFUNCTION()
void OnTextResponseReceived(const FString& ResponseText);
/** Receives partial text streaming from the agent component. */
UFUNCTION()
void OnPartialTextReceived(const FString& PartialText);
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
void ConvertTextToVisemes(const FString& Text);
/** Apply text-derived viseme shapes to the remaining queued frames. */
void ApplyTextVisemesToQueue();
/** Extract frequency band energies from the spectrum analyzer. */
void AnalyzeSpectrum();
@ -122,6 +144,13 @@ private:
// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
TMap<FName, float> CurrentBlendshapes;
// Previous frame's blendshape values for additional output smoothing
TMap<FName, float> PreviousBlendshapes;
// Last consumed queue frame — used for inter-frame interpolation
// to create continuous motion instead of 32ms step-wise jumps
TMap<FName, float> LastConsumedVisemes;
// MetaHuman mode: Face mesh has no morph targets, use animation curves instead.
// Set automatically in BeginPlay when TargetMesh has 0 morph targets.
bool bUseCurveMode = false;
@ -129,9 +158,48 @@ private:
// Cache of ARKit→MetaHuman curve name conversions to avoid per-frame string ops.
TMap<FName, FName> CurveNameCache;
// RMS amplitude from the latest audio chunk (0-1 range, drives jaw opening)
float CurrentAmplitude = 0.0f;
// ── Viseme queue ──────────────────────────────────────────────────────────
// Queue of per-window viseme analysis results.
// OnAudioChunkReceived builds one frame per 512-sample window (~32ms).
// TickComponent consumes them at the correct playback rate.
TArray<TMap<FName, float>> VisemeQueue;
// Parallel queue of per-window amplitude values (for text-driven shape replacement)
TArray<float> AmplitudeQueue;
// Timer for consuming queued viseme frames at the FFT window rate
float PlaybackTimer = 0.0f;
// Whether we have pending analysis results to process
bool bHasPendingAnalysis = false;
// ── Text-driven lip sync ──────────────────────────────────────────────────
// Accumulated partial text from streaming (agent_chat_response_part events).
// Built up token-by-token before the audio arrives.
FString AccumulatedText;
// Ordered sequence of OVR viseme names derived from text.
// E.g. "Bonjour" → [PP, oh, nn, CH, ou, RR]
TArray<FName> TextVisemeSequence;
// Whether text-based visemes have been applied to the current queue
bool bTextVisemesApplied = false;
// Set when agent_response arrives (full text for this utterance).
// Prevents resetting AccumulatedText between audio chunks of the
// SAME utterance — only reset once the full response is confirmed.
bool bFullTextReceived = false;
// Wait-for-text mechanism: when audio arrives without text, hold playback
// until text arrives (partial or full) so all frames get proper text visemes.
bool bWaitingForText = false;
double WaitingForTextStartTime = 0.0;
// Cached reference to the agent component on the same Actor
TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
FDelegateHandle AudioDataHandle;