Lip sync v2: text persistence across TTS chunks, audio pre-buffering, smoothing fixes
- Fix text erasure between TTS audio chunks (bFullTextReceived guard): partial text now persists across all chunks of the same utterance instead of being erased after chunk 1's queue empties
- Add audio pre-buffering (AudioPreBufferMs, default 250ms) to absorb TTS inter-chunk gaps and eliminate mid-sentence audio pauses
- Lip sync pauses viseme queue consumption during pre-buffer to stay in sync
- Inter-frame interpolation (lerp between consumed and next queued frame) for smoother mouth transitions instead of 32ms step-wise jumps
- Reduce double-smoothing (blendshape smooth 0.8→0.4, release 0.5→0.65)
- Adjust duration weights (vowels 2.0/1.7, plosives 0.8, silence 1.0)
- UI range refinement (AmplitudeScale 0.5-1.0, SmoothingSpeed 35-65)
- Silence padding capped at 512 samples (32ms) to prevent buffer accumulation
- Audio playback restart on buffer underrun during speech
- Optimized log levels (most debug→Verbose, kept key diagnostics at Log)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
ce7a146ce9
commit
7dfffdbad8
Binary file not shown.
Binary file not shown.
@ -80,6 +80,25 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe
|
||||
GeneratingTickCount = 0;
|
||||
}
|
||||
|
||||
// Pre-buffer timer: start playback after the pre-buffer period expires.
|
||||
// If the second TTS chunk didn't arrive in time, start playing with
|
||||
// whatever we have. The silence padding will bridge any remaining gap.
|
||||
if (bPreBuffering)
|
||||
{
|
||||
const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
|
||||
if (Elapsed >= static_cast<double>(AudioPreBufferMs))
|
||||
{
|
||||
bPreBuffering = false;
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[Turn %d] Pre-buffer timeout (%dms). Starting playback."),
|
||||
LastClosedTurnIndex, AudioPreBufferMs);
|
||||
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
|
||||
{
|
||||
AudioPlaybackComponent->Play();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Silence detection.
|
||||
// ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock.
|
||||
// OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast
|
||||
@ -540,13 +559,32 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
||||
USoundWaveProcedural* InProceduralWave, const int32 SamplesRequired)
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
if (AudioQueue.Num() == 0) return;
|
||||
|
||||
const int32 BytesRequired = SamplesRequired * sizeof(int16);
|
||||
const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
|
||||
if (AudioQueue.Num() > 0)
|
||||
{
|
||||
const int32 BytesRequired = SamplesRequired * sizeof(int16);
|
||||
const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
|
||||
|
||||
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
|
||||
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
|
||||
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
|
||||
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
|
||||
}
|
||||
else if (bAgentSpeaking)
|
||||
{
|
||||
// Queue is empty but agent is still speaking (TTS inter-batch gap).
|
||||
// Feed a SMALL amount of silence to keep the audio component alive.
|
||||
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
|
||||
// SamplesRequired to avoid queuing large blocks of silence in the
|
||||
// audio component's internal buffer. Without this cap, multiple
|
||||
// underflow calls during a TTS gap accumulate hundreds of ms of silence
|
||||
// that must be played through BEFORE real audio data — causing the
|
||||
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
|
||||
// one small silence block sits ahead of new audio when it arrives.
|
||||
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
|
||||
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
|
||||
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
|
||||
SilenceBuffer.SetNumZeroed(SilenceBytes);
|
||||
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
|
||||
}
|
||||
}
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uint8>& PCMData)
|
||||
@ -573,10 +611,50 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
||||
|
||||
OnAgentStartedSpeaking.Broadcast();
|
||||
|
||||
if (AudioPreBufferMs > 0)
|
||||
{
|
||||
// Pre-buffer: accumulate audio before starting playback.
|
||||
// This absorbs TTS inter-chunk gaps so chunk 2 arrives before
|
||||
// chunk 1 finishes playing, eliminating mid-sentence pauses.
|
||||
bPreBuffering = true;
|
||||
PreBufferStartTime = FPlatformTime::Seconds();
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[Turn %d] Pre-buffering %dms before starting playback."),
|
||||
LastClosedTurnIndex, AudioPreBufferMs);
|
||||
}
|
||||
else if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
|
||||
{
|
||||
AudioPlaybackComponent->Play();
|
||||
}
|
||||
}
|
||||
else if (bPreBuffering)
|
||||
{
|
||||
// Second (or later) audio chunk arrived during pre-buffer period.
|
||||
// We now have both chunks buffered — start playback immediately.
|
||||
bPreBuffering = false;
|
||||
const double BufferedMs = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."),
|
||||
LastClosedTurnIndex, BufferedMs);
|
||||
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
|
||||
{
|
||||
AudioPlaybackComponent->Play();
|
||||
}
|
||||
SilentTickCount = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Already speaking — but the audio component may have stopped due to
|
||||
// buffer underrun (TTS inter-batch gap). Restart it if needed.
|
||||
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Warning,
|
||||
TEXT("[Turn %d] Audio component stopped during speech (buffer underrun). Restarting playback."),
|
||||
LastClosedTurnIndex);
|
||||
AudioPlaybackComponent->Play();
|
||||
}
|
||||
// Reset silence counter — new audio arrived, we're not in a gap anymore
|
||||
SilentTickCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
@ -592,6 +670,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
||||
// while holding it would block the audio thread for the full Blueprint handler duration.
|
||||
bool bWasSpeaking = false;
|
||||
double Now = 0.0;
|
||||
bPreBuffering = false; // Clear pre-buffer state on stop.
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
AudioQueue.Empty();
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@ -136,6 +136,17 @@ public:
|
||||
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
|
||||
bool bEnableAgentPartialResponse = false;
|
||||
|
||||
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
|
||||
* ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
|
||||
* Pre-buffering delays playback start so the second chunk arrives before the
|
||||
* first finishes playing, eliminating the audible gap mid-sentence.
|
||||
* Higher values = fewer gaps but more latency on the first word.
|
||||
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ClampMin = "0", ClampMax = "500",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
|
||||
int32 AudioPreBufferMs = 250;
|
||||
|
||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
meta = (ClampMin = "0.0",
|
||||
@ -257,6 +268,11 @@ public:
|
||||
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
|
||||
const FElevenLabsConversationInfo& GetConversationInfo() const;
|
||||
|
||||
/** True while audio is being pre-buffered (playback hasn't started yet).
|
||||
* Used by the LipSync component to pause viseme queue consumption. */
|
||||
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
|
||||
bool IsPreBuffering() const { return bPreBuffering; }
|
||||
|
||||
/** Access the underlying WebSocket proxy (advanced use). */
|
||||
UFUNCTION(BlueprintPure, Category = "ElevenLabs")
|
||||
UElevenLabsWebSocketProxy* GetWebSocketProxy() const { return WebSocketProxy; }
|
||||
@ -353,6 +369,14 @@ private:
|
||||
TArray<uint8> AudioQueue;
|
||||
FCriticalSection AudioQueueLock;
|
||||
|
||||
// Reusable zero-filled buffer fed to USoundWaveProcedural during TTS gaps
|
||||
// to keep the audio component alive (prevents stop on buffer underrun).
|
||||
TArray<uint8> SilenceBuffer;
|
||||
|
||||
// Pre-buffer state: delay playback start to absorb TTS inter-chunk gaps.
|
||||
bool bPreBuffering = false;
|
||||
double PreBufferStartTime = 0.0;
|
||||
|
||||
// Silence detection: how many consecutive ticks with an empty audio queue.
|
||||
int32 SilentTickCount = 0;
|
||||
|
||||
|
||||
@ -51,11 +51,19 @@ public:
|
||||
ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
|
||||
float LipSyncStrength = 1.0f;
|
||||
|
||||
/** Scales the audio amplitude driving mouth movement.
|
||||
* Lower values produce subtler animation, higher values are more pronounced.
|
||||
* Use this to tone down overly strong lip movement without changing the shape. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||
meta = (ClampMin = "0.5", ClampMax = "1.0",
|
||||
ToolTip = "Audio amplitude scale.\n0.5 = subtle, 0.75 = balanced, 1.0 = full.\nReduces overall mouth movement without affecting viseme shape."))
|
||||
float AmplitudeScale = 0.75f;
|
||||
|
||||
/** How quickly viseme weights interpolate towards new values each frame. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||
meta = (ClampMin = "1.0", ClampMax = "100.0",
|
||||
ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
|
||||
float SmoothingSpeed = 20.0f;
|
||||
meta = (ClampMin = "35.0", ClampMax = "65.0",
|
||||
ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive."))
|
||||
float SmoothingSpeed = 50.0f;
|
||||
|
||||
// ── Events ────────────────────────────────────────────────────────────────
|
||||
|
||||
@ -87,6 +95,20 @@ private:
|
||||
/** Receives raw PCM from the agent component. */
|
||||
void OnAudioChunkReceived(const TArray<uint8>& PCMData);
|
||||
|
||||
/** Receives full text response from the agent component. */
|
||||
UFUNCTION()
|
||||
void OnTextResponseReceived(const FString& ResponseText);
|
||||
|
||||
/** Receives partial text streaming from the agent component. */
|
||||
UFUNCTION()
|
||||
void OnPartialTextReceived(const FString& PartialText);
|
||||
|
||||
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
|
||||
void ConvertTextToVisemes(const FString& Text);
|
||||
|
||||
/** Apply text-derived viseme shapes to the remaining queued frames. */
|
||||
void ApplyTextVisemesToQueue();
|
||||
|
||||
/** Extract frequency band energies from the spectrum analyzer. */
|
||||
void AnalyzeSpectrum();
|
||||
|
||||
@ -122,6 +144,13 @@ private:
|
||||
// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
|
||||
TMap<FName, float> CurrentBlendshapes;
|
||||
|
||||
// Previous frame's blendshape values for additional output smoothing
|
||||
TMap<FName, float> PreviousBlendshapes;
|
||||
|
||||
// Last consumed queue frame — used for inter-frame interpolation
|
||||
// to create continuous motion instead of 32ms step-wise jumps
|
||||
TMap<FName, float> LastConsumedVisemes;
|
||||
|
||||
// MetaHuman mode: Face mesh has no morph targets, use animation curves instead.
|
||||
// Set automatically in BeginPlay when TargetMesh has 0 morph targets.
|
||||
bool bUseCurveMode = false;
|
||||
@ -129,9 +158,48 @@ private:
|
||||
// Cache of ARKit→MetaHuman curve name conversions to avoid per-frame string ops.
|
||||
TMap<FName, FName> CurveNameCache;
|
||||
|
||||
// RMS amplitude from the latest audio chunk (0-1 range, drives jaw opening)
|
||||
float CurrentAmplitude = 0.0f;
|
||||
|
||||
// ── Viseme queue ──────────────────────────────────────────────────────────
|
||||
|
||||
// Queue of per-window viseme analysis results.
|
||||
// OnAudioChunkReceived builds one frame per 512-sample window (~32ms).
|
||||
// TickComponent consumes them at the correct playback rate.
|
||||
TArray<TMap<FName, float>> VisemeQueue;
|
||||
|
||||
// Parallel queue of per-window amplitude values (for text-driven shape replacement)
|
||||
TArray<float> AmplitudeQueue;
|
||||
|
||||
// Timer for consuming queued viseme frames at the FFT window rate
|
||||
float PlaybackTimer = 0.0f;
|
||||
|
||||
// Whether we have pending analysis results to process
|
||||
bool bHasPendingAnalysis = false;
|
||||
|
||||
// ── Text-driven lip sync ──────────────────────────────────────────────────
|
||||
|
||||
// Accumulated partial text from streaming (agent_chat_response_part events).
|
||||
// Built up token-by-token before the audio arrives.
|
||||
FString AccumulatedText;
|
||||
|
||||
// Ordered sequence of OVR viseme names derived from text.
|
||||
// E.g. "Bonjour" → [PP, oh, nn, CH, ou, RR]
|
||||
TArray<FName> TextVisemeSequence;
|
||||
|
||||
// Whether text-based visemes have been applied to the current queue
|
||||
bool bTextVisemesApplied = false;
|
||||
|
||||
// Set when agent_response arrives (full text for this utterance).
|
||||
// Prevents resetting AccumulatedText between audio chunks of the
|
||||
// SAME utterance — only reset once the full response is confirmed.
|
||||
bool bFullTextReceived = false;
|
||||
|
||||
// Wait-for-text mechanism: when audio arrives without text, hold playback
|
||||
// until text arrives (partial or full) so all frames get proper text visemes.
|
||||
bool bWaitingForText = false;
|
||||
double WaitingForTextStartTime = 0.0;
|
||||
|
||||
// Cached reference to the agent component on the same Actor
|
||||
TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
|
||||
FDelegateHandle AudioDataHandle;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user