v1.9.0: Fix audio gaps, pre-buffer, and lip sync neutral pose
- Remove silence padding accumulation bug: QueueAudio'd silence was accumulating in USoundWaveProcedural's internal buffer during TTS gaps, delaying real audio by ~800ms. USoundWaveProcedural with INDEFINITELY_LOOPING_DURATION generates silence internally instead.
- Fix pre-buffer bypass: guard OnProceduralUnderflow with bPreBuffering check — the audio component never stops (INDEFINITELY_LOOPING_DURATION) so it was draining AudioQueue during pre-buffering, defeating it entirely.
- Audio pre-buffer default 2000ms (max 4000ms) to absorb ElevenLabs server-side TTS inter-chunk gaps (~2s between chunks confirmed).
- Add diagnostic timestamps [T+Xs] in HandleAudioReceived and AudioQueue DRY/recovered logs for debugging audio pipeline timing.
- Fix lip sync not returning to neutral: add snap-to-zero (< 0.01) in blendshape smoothing pass and clean up PreviousBlendshapes to prevent asymptotic Lerp residuals keeping mouth slightly open.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
7dfffdbad8
commit
c2142f3e6b
Binary file not shown.
@@ -444,6 +444,19 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
|
||||
{
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
const int32 NumSamples = PCMData.Num() / sizeof(int16);
|
||||
const float DurationMs = (static_cast<float>(NumSamples) / 16000.0f) * 1000.0f;
|
||||
int32 QueueBefore;
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
QueueBefore = AudioQueue.Num() / sizeof(int16);
|
||||
}
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Audio chunk received: %d samples (%.0fms) | AudioQueue before: %d samples (%.0fms)"),
|
||||
T, LastClosedTurnIndex, NumSamples, DurationMs,
|
||||
QueueBefore, (static_cast<float>(QueueBefore) / 16000.0f) * 1000.0f);
|
||||
|
||||
EnqueueAgentAudio(PCMData);
|
||||
// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
|
||||
OnAgentAudioData.Broadcast(PCMData);
|
||||
@@ -560,6 +573,19 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
|
||||
// During pre-buffering, do NOT consume data from AudioQueue.
|
||||
// The AudioPlaybackComponent is still "playing" from the previous turn
|
||||
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
|
||||
// keeps firing. Without this guard, the underflow callback would drain
|
||||
// the AudioQueue immediately, defeating the pre-buffer entirely.
|
||||
// The ProceduralSoundWave generates silence internally when we return
|
||||
// nothing — this silence does NOT accumulate, so once bPreBuffering
|
||||
// clears, the buffered data plays immediately.
|
||||
if (bPreBuffering)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
if (AudioQueue.Num() > 0)
|
||||
{
|
||||
const int32 BytesRequired = SamplesRequired * sizeof(int16);
|
||||
@@ -567,23 +593,39 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
||||
|
||||
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
|
||||
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
|
||||
|
||||
// Log when queue recovers (new data arrived after being dry)
|
||||
if (bQueueWasDry)
|
||||
{
|
||||
bQueueWasDry = false;
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
|
||||
T, LastClosedTurnIndex, AudioQueue.Num());
|
||||
}
|
||||
}
|
||||
else if (bAgentSpeaking)
|
||||
{
|
||||
// Queue is empty but agent is still speaking (TTS inter-batch gap).
|
||||
// Feed a SMALL amount of silence to keep the audio component alive.
|
||||
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
|
||||
// SamplesRequired to avoid queuing large blocks of silence in the
|
||||
// audio component's internal buffer. Without this cap, multiple
|
||||
// underflow calls during a TTS gap accumulate hundreds of ms of silence
|
||||
// that must be played through BEFORE real audio data — causing the
|
||||
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
|
||||
// one small silence block sits ahead of new audio when it arrives.
|
||||
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
|
||||
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
|
||||
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
|
||||
SilenceBuffer.SetNumZeroed(SilenceBytes);
|
||||
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
|
||||
// Log once when queue first runs dry
|
||||
if (!bQueueWasDry)
|
||||
{
|
||||
bQueueWasDry = true;
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogElevenLabsAgent, Warning,
|
||||
TEXT("[T+%.2fs] [Turn %d] AudioQueue DRY — waiting for next TTS chunk (requested %d samples)."),
|
||||
T, LastClosedTurnIndex, SamplesRequired);
|
||||
}
|
||||
|
||||
// Do NOT feed silence via QueueAudio! USoundWaveProcedural with
|
||||
// INDEFINITELY_LOOPING_DURATION generates silence internally when
|
||||
// its buffer is empty — this internal silence does NOT accumulate
|
||||
// in the queue, so new audio data plays immediately when it arrives.
|
||||
//
|
||||
// Previously we QueueAudio'd 32ms silence blocks here, but they
|
||||
// accumulated in the procedural wave's internal buffer during TTS
|
||||
// gaps (1-2s between chunks). When the next chunk arrived, its data
|
||||
// was queued AFTER hundreds of ms of accumulated silence, causing
|
||||
// an audible pause before the real audio played.
|
||||
}
|
||||
}
|
||||
|
||||
@@ -601,6 +643,7 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
||||
bAgentSpeaking = true;
|
||||
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
|
||||
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
||||
bQueueWasDry = false;
|
||||
SilentTickCount = 0;
|
||||
|
||||
const double T = AgentSpeakStart - SessionStartTime;
|
||||
|
||||
@@ -554,6 +554,28 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
||||
{
|
||||
Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
|
||||
}
|
||||
// Snap to zero to prevent the mouth from staying slightly open
|
||||
// after speech ends. Without this, the asymptotic Lerp decay
|
||||
// leaves tiny residual values (e.g. jawOpen=0.005) that keep
|
||||
// the mouth visibly ajar on MetaHuman faces.
|
||||
if (Pair.Value < 0.01f)
|
||||
{
|
||||
Pair.Value = 0.0f;
|
||||
}
|
||||
}
|
||||
// Clean up PreviousBlendshapes: remove entries that have fully decayed
|
||||
// to zero so they don't feed residual values back into the next frame.
|
||||
TArray<FName> KeysToRemove;
|
||||
for (const auto& Pair : CurrentBlendshapes)
|
||||
{
|
||||
if (Pair.Value == 0.0f)
|
||||
{
|
||||
KeysToRemove.Add(Pair.Key);
|
||||
}
|
||||
}
|
||||
for (const FName& Key : KeysToRemove)
|
||||
{
|
||||
CurrentBlendshapes.Remove(Key);
|
||||
}
|
||||
PreviousBlendshapes = CurrentBlendshapes;
|
||||
}
|
||||
|
||||
@@ -143,9 +143,9 @@ public:
|
||||
* Higher values = fewer gaps but more latency on the first word.
|
||||
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ClampMin = "0", ClampMax = "500",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
|
||||
int32 AudioPreBufferMs = 250;
|
||||
meta = (ClampMin = "0", ClampMax = "4000",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
|
||||
int32 AudioPreBufferMs = 2000;
|
||||
|
||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||
@@ -377,6 +377,9 @@ private:
|
||||
bool bPreBuffering = false;
|
||||
double PreBufferStartTime = 0.0;
|
||||
|
||||
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
|
||||
bool bQueueWasDry = false;
|
||||
|
||||
// Silence detection: how many consecutive ticks with an empty audio queue.
|
||||
int32 SilentTickCount = 0;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user