diff --git a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
index 432938f..ac0738b 100644
Binary files a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset and b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset differ
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
index 4b25444..d62721f 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -444,6 +444,19 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
 
 void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
 {
+	const double T = FPlatformTime::Seconds() - SessionStartTime;
+	const int32 NumSamples = PCMData.Num() / sizeof(int16);
+	const float DurationMs = (static_cast<float>(NumSamples) / 16000.0f) * 1000.0f;
+	int32 QueueBefore;
+	{
+		FScopeLock Lock(&AudioQueueLock);
+		QueueBefore = AudioQueue.Num() / sizeof(int16);
+	}
+	UE_LOG(LogElevenLabsAgent, Log,
+		TEXT("[T+%.2fs] [Turn %d] Audio chunk received: %d samples (%.0fms) | AudioQueue before: %d samples (%.0fms)"),
+		T, LastClosedTurnIndex, NumSamples, DurationMs,
+		QueueBefore, (static_cast<float>(QueueBefore) / 16000.0f) * 1000.0f);
+
 	EnqueueAgentAudio(PCMData);
 	// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
 	OnAgentAudioData.Broadcast(PCMData);
@@ -560,6 +573,19 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
 {
 	FScopeLock Lock(&AudioQueueLock);
 
+	// During pre-buffering, do NOT consume data from AudioQueue.
+	// The AudioPlaybackComponent is still "playing" from the previous turn
+	// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
+	// keeps firing. Without this guard, the underflow callback would drain
+	// the AudioQueue immediately, defeating the pre-buffer entirely.
+	// The ProceduralSoundWave generates silence internally when we return
+	// nothing — this silence does NOT accumulate, so once bPreBuffering
+	// clears, the buffered data plays immediately.
+	if (bPreBuffering)
+	{
+		return;
+	}
+
 	if (AudioQueue.Num() > 0)
 	{
 		const int32 BytesRequired = SamplesRequired * sizeof(int16);
@@ -567,23 +593,39 @@
 		InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
 		AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
+
+		// Log when queue recovers (new data arrived after being dry)
+		if (bQueueWasDry)
+		{
+			bQueueWasDry = false;
+			const double T = FPlatformTime::Seconds() - SessionStartTime;
+			UE_LOG(LogElevenLabsAgent, Log,
+				TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
+				T, LastClosedTurnIndex, AudioQueue.Num());
+		}
 	}
 	else if (bAgentSpeaking)
 	{
-		// Queue is empty but agent is still speaking (TTS inter-batch gap).
-		// Feed a SMALL amount of silence to keep the audio component alive.
-		// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
-		// SamplesRequired to avoid queuing large blocks of silence in the
-		// audio component's internal buffer. Without this cap, multiple
-		// underflow calls during a TTS gap accumulate hundreds of ms of silence
-		// that must be played through BEFORE real audio data — causing the
-		// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
-		// one small silence block sits ahead of new audio when it arrives.
-		constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
-		const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
-		const int32 SilenceBytes = SilenceSamples * sizeof(int16);
-		SilenceBuffer.SetNumZeroed(SilenceBytes);
-		InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
+		// Log once when queue first runs dry
+		if (!bQueueWasDry)
+		{
+			bQueueWasDry = true;
+			const double T = FPlatformTime::Seconds() - SessionStartTime;
+			UE_LOG(LogElevenLabsAgent, Warning,
+				TEXT("[T+%.2fs] [Turn %d] AudioQueue DRY — waiting for next TTS chunk (requested %d samples)."),
+				T, LastClosedTurnIndex, SamplesRequired);
+		}
+
+		// Do NOT feed silence via QueueAudio! USoundWaveProcedural with
+		// INDEFINITELY_LOOPING_DURATION generates silence internally when
+		// its buffer is empty — this internal silence does NOT accumulate
+		// in the queue, so new audio data plays immediately when it arrives.
+		//
+		// Previously we QueueAudio'd 32ms silence blocks here, but they
+		// accumulated in the procedural wave's internal buffer during TTS
+		// gaps (1-2s between chunks). When the next chunk arrived, its data
+		// was queued AFTER hundreds of ms of accumulated silence, causing
+		// an audible pause before the real audio played.
 	}
 }
 
@@ -601,6 +643,7 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray
+	TArray<FName> KeysToRemove;
+	for (const auto& Pair : CurrentBlendshapes)
+	{
+		if (Pair.Value == 0.0f)
+		{
+			KeysToRemove.Add(Pair.Key);
+		}
+	}
+	for (const FName& Key : KeysToRemove)
+	{
+		CurrentBlendshapes.Remove(Key);
 	}
 	PreviousBlendshapes = CurrentBlendshapes;
 }
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
index e3c1ced..1125f94 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -143,9 +143,9 @@ public:
 	 * Higher values = fewer gaps but more latency on the first word.
 	 * Set to 0 for immediate playback (may cause mid-sentence pauses). */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
-		meta = (ClampMin = "0", ClampMax = "500",
-			ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
-	int32 AudioPreBufferMs = 250;
+		meta = (ClampMin = "0", ClampMax = "4000",
+			ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
+	int32 AudioPreBufferMs = 2000;
 
 	/** Safety timeout: if the server does not start generating a response within this many seconds
 	 after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable.
 	 A normal response starts within 0.1-0.8s.
 	 */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
@@ -377,6 +377,9 @@ private:
 	bool bPreBuffering = false;
 	double PreBufferStartTime = 0.0;
 
+	// Debug: track when the AudioQueue runs dry during speech (one-shot log).
+	bool bQueueWasDry = false;
+
 	// Silence detection: how many consecutive ticks with an empty audio queue.
 	int32 SilentTickCount = 0;