v1.9.0: Fix audio gaps, pre-buffer, and lip sync neutral pose

- Remove silence padding accumulation bug: silence pushed via QueueAudio
  was accumulating in USoundWaveProcedural's internal buffer during TTS
  gaps, delaying real audio by ~800ms. We no longer queue silence and
  instead rely on USoundWaveProcedural with INDEFINITELY_LOOPING_DURATION
  generating silence internally when its buffer is empty.
- Fix pre-buffer bypass: guard OnProceduralUnderflow with bPreBuffering
  check — the audio component never stops (INDEFINITELY_LOOPING_DURATION)
  so it was draining AudioQueue during pre-buffering, defeating it entirely.
- Raise audio pre-buffer default from 250ms to 2000ms (max from 500ms to
  4000ms) to absorb ElevenLabs server-side TTS inter-chunk gaps (~2s
  between chunks confirmed).
- Add diagnostic timestamps [T+Xs] in HandleAudioReceived and
  AudioQueue DRY/recovered logs for debugging audio pipeline timing.
- Fix lip sync not returning to neutral: add snap-to-zero (< 0.01)
  in blendshape smoothing pass and clean up PreviousBlendshapes to
  prevent asymptotic Lerp residuals keeping mouth slightly open.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 20:37:23 +01:00
parent 7dfffdbad8
commit c2142f3e6b
4 changed files with 85 additions and 17 deletions

View File

@ -444,6 +444,19 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
{
const double T = FPlatformTime::Seconds() - SessionStartTime;
const int32 NumSamples = PCMData.Num() / sizeof(int16);
const float DurationMs = (static_cast<float>(NumSamples) / 16000.0f) * 1000.0f;
int32 QueueBefore;
{
FScopeLock Lock(&AudioQueueLock);
QueueBefore = AudioQueue.Num() / sizeof(int16);
}
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Audio chunk received: %d samples (%.0fms) | AudioQueue before: %d samples (%.0fms)"),
T, LastClosedTurnIndex, NumSamples, DurationMs,
QueueBefore, (static_cast<float>(QueueBefore) / 16000.0f) * 1000.0f);
EnqueueAgentAudio(PCMData);
// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
OnAgentAudioData.Broadcast(PCMData);
@ -560,6 +573,19 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
{
FScopeLock Lock(&AudioQueueLock);
// During pre-buffering, do NOT consume data from AudioQueue.
// The AudioPlaybackComponent is still "playing" from the previous turn
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
// keeps firing. Without this guard, the underflow callback would drain
// the AudioQueue immediately, defeating the pre-buffer entirely.
// The ProceduralSoundWave generates silence internally when we return
// nothing — this silence does NOT accumulate, so once bPreBuffering
// clears, the buffered data plays immediately.
if (bPreBuffering)
{
return;
}
if (AudioQueue.Num() > 0)
{
const int32 BytesRequired = SamplesRequired * sizeof(int16);
@ -567,23 +593,39 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
// Log when queue recovers (new data arrived after being dry)
if (bQueueWasDry)
{
bQueueWasDry = false;
const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
T, LastClosedTurnIndex, AudioQueue.Num());
}
}
else if (bAgentSpeaking)
{
// Queue is empty but agent is still speaking (TTS inter-batch gap).
// Feed a SMALL amount of silence to keep the audio component alive.
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
// SamplesRequired to avoid queuing large blocks of silence in the
// audio component's internal buffer. Without this cap, multiple
// underflow calls during a TTS gap accumulate hundreds of ms of silence
// that must be played through BEFORE real audio data — causing the
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
// one small silence block sits ahead of new audio when it arrives.
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
SilenceBuffer.SetNumZeroed(SilenceBytes);
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
// Log once when queue first runs dry
if (!bQueueWasDry)
{
bQueueWasDry = true;
const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogElevenLabsAgent, Warning,
TEXT("[T+%.2fs] [Turn %d] AudioQueue DRY — waiting for next TTS chunk (requested %d samples)."),
T, LastClosedTurnIndex, SamplesRequired);
}
// Do NOT feed silence via QueueAudio! USoundWaveProcedural with
// INDEFINITELY_LOOPING_DURATION generates silence internally when
// its buffer is empty — this internal silence does NOT accumulate
// in the queue, so new audio data plays immediately when it arrives.
//
// Previously we QueueAudio'd 32ms silence blocks here, but they
// accumulated in the procedural wave's internal buffer during TTS
// gaps (1-2s between chunks). When the next chunk arrived, its data
// was queued AFTER hundreds of ms of accumulated silence, causing
// an audible pause before the real audio played.
}
}
@ -601,6 +643,7 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
bAgentSpeaking = true;
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
bQueueWasDry = false;
SilentTickCount = 0;
const double T = AgentSpeakStart - SessionStartTime;

View File

@ -554,6 +554,28 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
{
Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
}
// Snap to zero to prevent the mouth from staying slightly open
// after speech ends. Without this, the asymptotic Lerp decay
// leaves tiny residual values (e.g. jawOpen=0.005) that keep
// the mouth visibly ajar on MetaHuman faces.
if (Pair.Value < 0.01f)
{
Pair.Value = 0.0f;
}
}
// Prune entries that have fully decayed to zero from CurrentBlendshapes
// before it is copied into PreviousBlendshapes below, so they don't feed
// residual values back into the next frame.
TArray<FName> KeysToRemove;
for (const auto& Pair : CurrentBlendshapes)
{
if (Pair.Value == 0.0f)
{
KeysToRemove.Add(Pair.Key);
}
}
for (const FName& Key : KeysToRemove)
{
CurrentBlendshapes.Remove(Key);
}
PreviousBlendshapes = CurrentBlendshapes;
}

View File

@ -143,9 +143,9 @@ public:
* Higher values = fewer gaps but more latency on the first word.
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "0", ClampMax = "500",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
int32 AudioPreBufferMs = 250;
meta = (ClampMin = "0", ClampMax = "4000",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
int32 AudioPreBufferMs = 2000;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
@ -377,6 +377,9 @@ private:
bool bPreBuffering = false;
double PreBufferStartTime = 0.0;
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
bool bQueueWasDry = false;
// Silence detection: how many consecutive ticks with an empty audio queue.
int32 SilentTickCount = 0;