v1.9.0: Fix audio gaps, pre-buffer, and lip sync neutral pose
- Fix silence-padding accumulation bug: silence pushed via QueueAudio was accumulating in USoundWaveProcedural's internal buffer during TTS gaps, delaying real audio by ~800ms. We no longer queue silence; USoundWaveProcedural with INDEFINITELY_LOOPING_DURATION generates silence internally instead.
- Fix pre-buffer bypass: guard OnProceduralUnderflow with a bPreBuffering check. The audio component never stops (INDEFINITELY_LOOPING_DURATION), so the underflow callback was draining AudioQueue during pre-buffering, defeating it entirely.
- Raise the audio pre-buffer default from 250ms to 2000ms (max from 500ms to 4000ms) to absorb ElevenLabs server-side TTS inter-chunk gaps (~2s between chunks confirmed).
- Add diagnostic timestamps [T+Xs] to the HandleAudioReceived log and to the AudioQueue DRY/recovered logs for debugging audio pipeline timing.
- Fix lip sync not returning to neutral: add snap-to-zero (< 0.01) in the blendshape smoothing pass and clean up PreviousBlendshapes, so asymptotic Lerp residuals no longer keep the mouth slightly open.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
7dfffdbad8
commit
c2142f3e6b
Binary file not shown.
@ -444,6 +444,19 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
|
|||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
|
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
|
||||||
{
|
{
|
||||||
|
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||||
|
const int32 NumSamples = PCMData.Num() / sizeof(int16);
|
||||||
|
const float DurationMs = (static_cast<float>(NumSamples) / 16000.0f) * 1000.0f;
|
||||||
|
int32 QueueBefore;
|
||||||
|
{
|
||||||
|
FScopeLock Lock(&AudioQueueLock);
|
||||||
|
QueueBefore = AudioQueue.Num() / sizeof(int16);
|
||||||
|
}
|
||||||
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
|
TEXT("[T+%.2fs] [Turn %d] Audio chunk received: %d samples (%.0fms) | AudioQueue before: %d samples (%.0fms)"),
|
||||||
|
T, LastClosedTurnIndex, NumSamples, DurationMs,
|
||||||
|
QueueBefore, (static_cast<float>(QueueBefore) / 16000.0f) * 1000.0f);
|
||||||
|
|
||||||
EnqueueAgentAudio(PCMData);
|
EnqueueAgentAudio(PCMData);
|
||||||
// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
|
// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
|
||||||
OnAgentAudioData.Broadcast(PCMData);
|
OnAgentAudioData.Broadcast(PCMData);
|
||||||
@ -560,6 +573,19 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
|||||||
{
|
{
|
||||||
FScopeLock Lock(&AudioQueueLock);
|
FScopeLock Lock(&AudioQueueLock);
|
||||||
|
|
||||||
|
// During pre-buffering, do NOT consume data from AudioQueue.
|
||||||
|
// The AudioPlaybackComponent is still "playing" from the previous turn
|
||||||
|
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
|
||||||
|
// keeps firing. Without this guard, the underflow callback would drain
|
||||||
|
// the AudioQueue immediately, defeating the pre-buffer entirely.
|
||||||
|
// The ProceduralSoundWave generates silence internally when we return
|
||||||
|
// nothing — this silence does NOT accumulate, so once bPreBuffering
|
||||||
|
// clears, the buffered data plays immediately.
|
||||||
|
if (bPreBuffering)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (AudioQueue.Num() > 0)
|
if (AudioQueue.Num() > 0)
|
||||||
{
|
{
|
||||||
const int32 BytesRequired = SamplesRequired * sizeof(int16);
|
const int32 BytesRequired = SamplesRequired * sizeof(int16);
|
||||||
@ -567,23 +593,39 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
|||||||
|
|
||||||
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
|
InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
|
||||||
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
|
AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
|
||||||
|
|
||||||
|
// Log when queue recovers (new data arrived after being dry)
|
||||||
|
if (bQueueWasDry)
|
||||||
|
{
|
||||||
|
bQueueWasDry = false;
|
||||||
|
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||||
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
|
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
|
||||||
|
T, LastClosedTurnIndex, AudioQueue.Num());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (bAgentSpeaking)
|
else if (bAgentSpeaking)
|
||||||
{
|
{
|
||||||
// Queue is empty but agent is still speaking (TTS inter-batch gap).
|
// Log once when queue first runs dry
|
||||||
// Feed a SMALL amount of silence to keep the audio component alive.
|
if (!bQueueWasDry)
|
||||||
// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
|
{
|
||||||
// SamplesRequired to avoid queuing large blocks of silence in the
|
bQueueWasDry = true;
|
||||||
// audio component's internal buffer. Without this cap, multiple
|
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||||
// underflow calls during a TTS gap accumulate hundreds of ms of silence
|
UE_LOG(LogElevenLabsAgent, Warning,
|
||||||
// that must be played through BEFORE real audio data — causing the
|
TEXT("[T+%.2fs] [Turn %d] AudioQueue DRY — waiting for next TTS chunk (requested %d samples)."),
|
||||||
// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
|
T, LastClosedTurnIndex, SamplesRequired);
|
||||||
// one small silence block sits ahead of new audio when it arrives.
|
}
|
||||||
constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
|
|
||||||
const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
|
// Do NOT feed silence via QueueAudio! USoundWaveProcedural with
|
||||||
const int32 SilenceBytes = SilenceSamples * sizeof(int16);
|
// INDEFINITELY_LOOPING_DURATION generates silence internally when
|
||||||
SilenceBuffer.SetNumZeroed(SilenceBytes);
|
// its buffer is empty — this internal silence does NOT accumulate
|
||||||
InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
|
// in the queue, so new audio data plays immediately when it arrives.
|
||||||
|
//
|
||||||
|
// Previously we QueueAudio'd 32ms silence blocks here, but they
|
||||||
|
// accumulated in the procedural wave's internal buffer during TTS
|
||||||
|
// gaps (1-2s between chunks). When the next chunk arrived, its data
|
||||||
|
// was queued AFTER hundreds of ms of accumulated silence, causing
|
||||||
|
// an audible pause before the real audio played.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -601,6 +643,7 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
|||||||
bAgentSpeaking = true;
|
bAgentSpeaking = true;
|
||||||
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
|
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
|
||||||
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
||||||
|
bQueueWasDry = false;
|
||||||
SilentTickCount = 0;
|
SilentTickCount = 0;
|
||||||
|
|
||||||
const double T = AgentSpeakStart - SessionStartTime;
|
const double T = AgentSpeakStart - SessionStartTime;
|
||||||
|
|||||||
@ -554,6 +554,28 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
|||||||
{
|
{
|
||||||
Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
|
Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
|
||||||
}
|
}
|
||||||
|
// Snap to zero to prevent the mouth from staying slightly open
|
||||||
|
// after speech ends. Without this, the asymptotic Lerp decay
|
||||||
|
// leaves tiny residual values (e.g. jawOpen=0.005) that keep
|
||||||
|
// the mouth visibly ajar on MetaHuman faces.
|
||||||
|
if (Pair.Value < 0.01f)
|
||||||
|
{
|
||||||
|
Pair.Value = 0.0f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Clean up PreviousBlendshapes: remove entries that have fully decayed
|
||||||
|
// to zero so they don't feed residual values back into the next frame.
|
||||||
|
TArray<FName> KeysToRemove;
|
||||||
|
for (const auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
if (Pair.Value == 0.0f)
|
||||||
|
{
|
||||||
|
KeysToRemove.Add(Pair.Key);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (const FName& Key : KeysToRemove)
|
||||||
|
{
|
||||||
|
CurrentBlendshapes.Remove(Key);
|
||||||
}
|
}
|
||||||
PreviousBlendshapes = CurrentBlendshapes;
|
PreviousBlendshapes = CurrentBlendshapes;
|
||||||
}
|
}
|
||||||
|
|||||||
@ -143,9 +143,9 @@ public:
|
|||||||
* Higher values = fewer gaps but more latency on the first word.
|
* Higher values = fewer gaps but more latency on the first word.
|
||||||
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||||
meta = (ClampMin = "0", ClampMax = "500",
|
meta = (ClampMin = "0", ClampMax = "4000",
|
||||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
|
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
|
||||||
int32 AudioPreBufferMs = 250;
|
int32 AudioPreBufferMs = 2000;
|
||||||
|
|
||||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
|
||||||
@ -377,6 +377,9 @@ private:
|
|||||||
bool bPreBuffering = false;
|
bool bPreBuffering = false;
|
||||||
double PreBufferStartTime = 0.0;
|
double PreBufferStartTime = 0.0;
|
||||||
|
|
||||||
|
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
|
||||||
|
bool bQueueWasDry = false;
|
||||||
|
|
||||||
// Silence detection: how many consecutive ticks with an empty audio queue.
|
// Silence detection: how many consecutive ticks with an empty audio queue.
|
||||||
int32 SilentTickCount = 0;
|
int32 SilentTickCount = 0;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user