diff --git a/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap b/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap index 1fe726f..b00a002 100644 Binary files a/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap and b/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap differ diff --git a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset index 27f231c..432938f 100644 Binary files a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset and b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index c29d21f..4b25444 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -80,6 +80,25 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe GeneratingTickCount = 0; } + // Pre-buffer timer: start playback after the pre-buffer period expires. + // If the second TTS chunk didn't arrive in time, start playing with + // whatever we have. The silence padding will bridge any remaining gap. + if (bPreBuffering) + { + const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0; + if (Elapsed >= static_cast(AudioPreBufferMs)) + { + bPreBuffering = false; + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[Turn %d] Pre-buffer timeout (%dms). Starting playback."), + LastClosedTurnIndex, AudioPreBufferMs); + if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) + { + AudioPlaybackComponent->Play(); + } + } + } + // Silence detection. 
// ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock. // OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast @@ -540,13 +559,32 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow( USoundWaveProcedural* InProceduralWave, const int32 SamplesRequired) { FScopeLock Lock(&AudioQueueLock); - if (AudioQueue.Num() == 0) return; - const int32 BytesRequired = SamplesRequired * sizeof(int16); - const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired); + if (AudioQueue.Num() > 0) + { + const int32 BytesRequired = SamplesRequired * sizeof(int16); + const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired); - InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush); - AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No); + InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush); + AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No); + } + else if (bAgentSpeaking) + { + // Queue is empty but agent is still speaking (TTS inter-batch gap). + // Feed a SMALL amount of silence to keep the audio component alive. + // IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of + // SamplesRequired to avoid queuing large blocks of silence in the + // audio component's internal buffer. Without this cap, multiple + // underflow calls during a TTS gap accumulate hundreds of ms of silence + // that must be played through BEFORE real audio data — causing the + // audible 1s+ pause between TTS chunks. With 32ms chunks, at most + // one small silence block sits ahead of new audio when it arrives. 
+ constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz + const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples); + const int32 SilenceBytes = SilenceSamples * sizeof(int16); + SilenceBuffer.SetNumZeroed(SilenceBytes); + InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes); + } } void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray& PCMData) @@ -573,10 +611,50 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray 0) + { + // Pre-buffer: accumulate audio before starting playback. + // This absorbs TTS inter-chunk gaps so chunk 2 arrives before + // chunk 1 finishes playing, eliminating mid-sentence pauses. + bPreBuffering = true; + PreBufferStartTime = FPlatformTime::Seconds(); + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[Turn %d] Pre-buffering %dms before starting playback."), + LastClosedTurnIndex, AudioPreBufferMs); + } + else if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) + { + AudioPlaybackComponent->Play(); + } + } + else if (bPreBuffering) + { + // Second (or later) audio chunk arrived during pre-buffer period. + // We now have both chunks buffered — start playback immediately. + bPreBuffering = false; + const double BufferedMs = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0; + UE_LOG(LogElevenLabsAgent, Log, + TEXT("[Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."), + LastClosedTurnIndex, BufferedMs); if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) { AudioPlaybackComponent->Play(); } + SilentTickCount = 0; + } + else + { + // Already speaking — but the audio component may have stopped due to + // buffer underrun (TTS inter-batch gap). Restart it if needed. + if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) + { + UE_LOG(LogElevenLabsAgent, Warning, + TEXT("[Turn %d] Audio component stopped during speech (buffer underrun). 
Restarting playback."), + LastClosedTurnIndex); + AudioPlaybackComponent->Play(); + } + // Reset silence counter — new audio arrived, we're not in a gap anymore + SilentTickCount = 0; } } @@ -592,6 +670,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio() // while holding it would block the audio thread for the full Blueprint handler duration. bool bWasSpeaking = false; double Now = 0.0; + bPreBuffering = false; // Clear pre-buffer state on stop. { FScopeLock Lock(&AudioQueueLock); AudioQueue.Empty(); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp index bcb4adf..3a0e0b1 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp @@ -33,136 +33,136 @@ TMap> UElevenLabsLipSyncComponent::CreateVisemeToBlend // PP — bilabial (P, B, M): lips pressed together { TMap BS; - BS.Add(FName("mouthClose"), 0.7f); - BS.Add(FName("mouthPressLeft"), 0.3f); - BS.Add(FName("mouthPressRight"), 0.3f); + BS.Add(FName("mouthClose"), 0.9f); + BS.Add(FName("mouthPressLeft"), 0.5f); + BS.Add(FName("mouthPressRight"), 0.5f); Map.Add(FName("PP"), BS); } // FF — labiodental (F, V): lower lip tucked under upper teeth { TMap BS; - BS.Add(FName("mouthShrugLower"), 0.5f); - BS.Add(FName("mouthUpperUpLeft"), 0.3f); - BS.Add(FName("mouthUpperUpRight"), 0.3f); - BS.Add(FName("jawOpen"), 0.1f); + BS.Add(FName("mouthShrugLower"), 0.7f); + BS.Add(FName("mouthUpperUpLeft"), 0.4f); + BS.Add(FName("mouthUpperUpRight"), 0.4f); + BS.Add(FName("jawOpen"), 0.15f); Map.Add(FName("FF"), BS); } // TH — dental (TH): tongue between teeth { TMap BS; - BS.Add(FName("tongueOut"), 0.4f); - 
BS.Add(FName("jawOpen"), 0.15f); + BS.Add(FName("tongueOut"), 0.5f); + BS.Add(FName("jawOpen"), 0.2f); Map.Add(FName("TH"), BS); } // DD — alveolar (D, T, N): tongue on alveolar ridge { TMap BS; - BS.Add(FName("jawOpen"), 0.25f); - BS.Add(FName("mouthClose"), 0.2f); - BS.Add(FName("mouthLowerDownLeft"), 0.15f); - BS.Add(FName("mouthLowerDownRight"), 0.15f); + BS.Add(FName("jawOpen"), 0.35f); + BS.Add(FName("mouthClose"), 0.3f); + BS.Add(FName("mouthLowerDownLeft"), 0.25f); + BS.Add(FName("mouthLowerDownRight"), 0.25f); Map.Add(FName("DD"), BS); } // kk — velar (K, G): back of tongue raised { TMap BS; - BS.Add(FName("jawOpen"), 0.25f); - BS.Add(FName("mouthStretchLeft"), 0.15f); - BS.Add(FName("mouthStretchRight"), 0.15f); + BS.Add(FName("jawOpen"), 0.35f); + BS.Add(FName("mouthStretchLeft"), 0.25f); + BS.Add(FName("mouthStretchRight"), 0.25f); Map.Add(FName("kk"), BS); } // CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge { TMap BS; - BS.Add(FName("mouthFunnel"), 0.45f); - BS.Add(FName("jawOpen"), 0.2f); - BS.Add(FName("mouthPucker"), 0.15f); + BS.Add(FName("mouthFunnel"), 0.65f); + BS.Add(FName("jawOpen"), 0.3f); + BS.Add(FName("mouthPucker"), 0.3f); Map.Add(FName("CH"), BS); } // SS — alveolar fricative (S, Z): air through narrow channel { TMap BS; - BS.Add(FName("mouthStretchLeft"), 0.4f); - BS.Add(FName("mouthStretchRight"), 0.4f); - BS.Add(FName("jawOpen"), 0.1f); - BS.Add(FName("mouthSmileLeft"), 0.15f); - BS.Add(FName("mouthSmileRight"), 0.15f); + BS.Add(FName("mouthStretchLeft"), 0.6f); + BS.Add(FName("mouthStretchRight"), 0.6f); + BS.Add(FName("jawOpen"), 0.15f); + BS.Add(FName("mouthSmileLeft"), 0.3f); + BS.Add(FName("mouthSmileRight"), 0.3f); Map.Add(FName("SS"), BS); } // nn — nasal (N, M, NG): soft palate lowered { TMap BS; - BS.Add(FName("jawOpen"), 0.15f); - BS.Add(FName("mouthClose"), 0.2f); - BS.Add(FName("mouthPressLeft"), 0.1f); - BS.Add(FName("mouthPressRight"), 0.1f); + BS.Add(FName("jawOpen"), 0.2f); + 
BS.Add(FName("mouthClose"), 0.35f); + BS.Add(FName("mouthPressLeft"), 0.2f); + BS.Add(FName("mouthPressRight"), 0.2f); Map.Add(FName("nn"), BS); } // RR — retroflex/rhotic (R, L): tongue curled or lateral { TMap BS; - BS.Add(FName("mouthFunnel"), 0.3f); - BS.Add(FName("jawOpen"), 0.2f); - BS.Add(FName("mouthRollLower"), 0.15f); + BS.Add(FName("mouthFunnel"), 0.5f); + BS.Add(FName("jawOpen"), 0.3f); + BS.Add(FName("mouthRollLower"), 0.3f); Map.Add(FName("RR"), BS); } // aa — open vowel (A as in "father"): wide open jaw { TMap BS; - BS.Add(FName("jawOpen"), 0.7f); - BS.Add(FName("mouthLowerDownLeft"), 0.4f); - BS.Add(FName("mouthLowerDownRight"), 0.4f); - BS.Add(FName("mouthShrugUpper"), 0.1f); + BS.Add(FName("jawOpen"), 0.85f); + BS.Add(FName("mouthLowerDownLeft"), 0.5f); + BS.Add(FName("mouthLowerDownRight"), 0.5f); + BS.Add(FName("mouthShrugUpper"), 0.15f); Map.Add(FName("aa"), BS); } // E — mid front vowel (E as in "bed"): mid-open, spread lips { TMap BS; - BS.Add(FName("jawOpen"), 0.4f); - BS.Add(FName("mouthSmileLeft"), 0.3f); - BS.Add(FName("mouthSmileRight"), 0.3f); - BS.Add(FName("mouthLowerDownLeft"), 0.2f); - BS.Add(FName("mouthLowerDownRight"), 0.2f); + BS.Add(FName("jawOpen"), 0.5f); + BS.Add(FName("mouthSmileLeft"), 0.5f); + BS.Add(FName("mouthSmileRight"), 0.5f); + BS.Add(FName("mouthLowerDownLeft"), 0.3f); + BS.Add(FName("mouthLowerDownRight"), 0.3f); Map.Add(FName("E"), BS); } // ih — close front vowel (I as in "sit"): narrow opening, spread lips { TMap BS; - BS.Add(FName("jawOpen"), 0.2f); - BS.Add(FName("mouthSmileLeft"), 0.25f); - BS.Add(FName("mouthSmileRight"), 0.25f); - BS.Add(FName("mouthStretchLeft"), 0.1f); - BS.Add(FName("mouthStretchRight"), 0.1f); + BS.Add(FName("jawOpen"), 0.25f); + BS.Add(FName("mouthSmileLeft"), 0.45f); + BS.Add(FName("mouthSmileRight"), 0.45f); + BS.Add(FName("mouthStretchLeft"), 0.2f); + BS.Add(FName("mouthStretchRight"), 0.2f); Map.Add(FName("ih"), BS); } // oh — mid back vowel (O as in "go"): rounded lips, open jaw 
{ TMap BS; - BS.Add(FName("jawOpen"), 0.5f); - BS.Add(FName("mouthFunnel"), 0.5f); - BS.Add(FName("mouthLowerDownLeft"), 0.2f); - BS.Add(FName("mouthLowerDownRight"), 0.2f); + BS.Add(FName("jawOpen"), 0.6f); + BS.Add(FName("mouthFunnel"), 0.7f); + BS.Add(FName("mouthLowerDownLeft"), 0.3f); + BS.Add(FName("mouthLowerDownRight"), 0.3f); Map.Add(FName("oh"), BS); } // ou — close back vowel (OO as in "boot"): tightly rounded lips { TMap BS; - BS.Add(FName("mouthPucker"), 0.6f); - BS.Add(FName("mouthFunnel"), 0.4f); - BS.Add(FName("jawOpen"), 0.15f); + BS.Add(FName("mouthPucker"), 0.8f); + BS.Add(FName("mouthFunnel"), 0.6f); + BS.Add(FName("jawOpen"), 0.2f); Map.Add(FName("ou"), BS); } @@ -220,7 +220,20 @@ void UElevenLabsLipSyncComponent::BeginPlay() AgentComponent = Agent; AudioDataHandle = Agent->OnAgentAudioData.AddUObject( this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived); - UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName()); + + // Bind to text response delegates for text-driven lip sync. + // Partial text (streaming) provides text BEFORE audio arrives. + // Full text provides the complete sentence (arrives just after audio). 
+ Agent->OnAgentPartialResponse.AddDynamic( + this, &UElevenLabsLipSyncComponent::OnPartialTextReceived); + Agent->OnAgentTextResponse.AddDynamic( + this, &UElevenLabsLipSyncComponent::OnTextResponseReceived); + + // Enable partial response streaming if not already enabled + Agent->bEnableAgentPartialResponse = true; + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName()); } else { @@ -368,10 +381,17 @@ void UElevenLabsLipSyncComponent::BeginPlay() void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason) { // Unbind from agent component - if (AgentComponent.IsValid() && AudioDataHandle.IsValid()) + if (AgentComponent.IsValid()) { - AgentComponent->OnAgentAudioData.Remove(AudioDataHandle); - AudioDataHandle.Reset(); + if (AudioDataHandle.IsValid()) + { + AgentComponent->OnAgentAudioData.Remove(AudioDataHandle); + AudioDataHandle.Reset(); + } + AgentComponent->OnAgentPartialResponse.RemoveDynamic( + this, &UElevenLabsLipSyncComponent::OnPartialTextReceived); + AgentComponent->OnAgentTextResponse.RemoveDynamic( + this, &UElevenLabsLipSyncComponent::OnTextResponseReceived); } AgentComponent.Reset(); SpectrumAnalyzer.Reset(); @@ -388,31 +408,156 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick { Super::TickComponent(DeltaTime, TickType, ThisTickFunction); - // Smooth viseme weights towards targets using exponential interpolation - const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f); + // ── Consume queued viseme analysis frames at the FFT window rate ───────── + // Each 512-sample FFT window at 16kHz = 32ms of audio. + // We consume one queued frame every 32ms to match the original audio timing. + constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s + + // Pre-buffer sync: don't consume viseme queue while the agent component is + // pre-buffering audio. This keeps lip sync in sync with audio playback. 
+ // Without this, the lip sync would start 250ms ahead of the audio. + if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering()) + { + return; + } + + // Wait-for-text: hold playback until text arrives so all frames get proper + // text-driven visemes. Timeout after 500ms and start with spectral shapes. + if (bWaitingForText) + { + const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime; + if (WaitElapsed >= 0.5) + { + // Timeout — start playback with spectral shapes as fallback + bWaitingForText = false; + PlaybackTimer = 0.0f; + UE_LOG(LogElevenLabsLipSync, Warning, + TEXT("Text wait timeout (%.0fms). Starting lip sync with spectral shapes (Queue=%d)."), + WaitElapsed * 1000.0, VisemeQueue.Num()); + } + else + { + // Still waiting — keep timer frozen, skip consumption + PlaybackTimer = 0.0f; + } + } + + PlaybackTimer += DeltaTime; + + while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0) + { + LastConsumedVisemes = VisemeQueue[0]; + TargetVisemes = VisemeQueue[0]; + VisemeQueue.RemoveAt(0); + if (AmplitudeQueue.Num() > 0) AmplitudeQueue.RemoveAt(0); + PlaybackTimer -= WindowDuration; + } + + // ── Inter-frame interpolation ───────────────────────────────────────── + // Instead of holding the same TargetVisemes for 32ms then jumping to the + // next frame, blend smoothly between the last consumed frame and the next + // queued frame. This prevents the "frantic" look from step-wise changes + // and creates continuous, natural-looking mouth motion. 
+ if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0) + { + const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f); + for (const FName& Name : VisemeNames) + { + const float From = LastConsumedVisemes.FindRef(Name); + const float To = VisemeQueue[0].FindRef(Name); + TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T); + } + } + + // If queue runs dry, decay towards silence and reset text state + if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f) + { + for (const FName& Name : VisemeNames) + { + TargetVisemes.FindOrAdd(Name) = 0.0f; + } + TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; + PlaybackTimer = 0.0f; + + // Reset text state — but ONLY after the full response (agent_response) + // has arrived AND text was applied. This prevents destroying text between + // audio chunks of the SAME utterance: partial text arrives once, but + // ElevenLabs splits the audio into 2-3 chunks with gaps. Without + // bFullTextReceived, the text is erased after chunk 1's queue empties, + // leaving chunks 2-3 without text visemes (spectral fallback only). + if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived) + { + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + } + } + + // ── Asymmetric smoothing ───────────────────────────────────────────────── + // At SmoothingSpeed=15: AttackSpeed=15 → alpha=0.24/frame, ~4 frames to 70%. + // ReleaseSpeed=9.75 (0.65x SmoothingSpeed) → alpha≈0.16/frame, ~7 frames to 70%. Mouth opens quickly, + // closes more gradually for natural-looking speech. 
+ const float AttackSpeed = SmoothingSpeed * 1.0f; + const float ReleaseSpeed = SmoothingSpeed * 0.65f; bool bAnyNonZero = false; for (const FName& Name : VisemeNames) { float& Current = SmoothedVisemes.FindOrAdd(Name); - const float Target = TargetVisemes.FindOrAdd(Name); + const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength; - Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha); + const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed; + const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f); + + Current = FMath::Lerp(Current, Target, Alpha); // Snap to zero to avoid infinite tiny values if (Current < 0.001f) Current = 0.0f; if (Current > 0.001f) bAnyNonZero = true; } - // "sil" uses LipSyncStrength=1 always — it's the rest pose - SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp( - SmoothedVisemes.FindOrAdd(FName("sil")), - TargetVisemes.FindOrAdd(FName("sil")), - Alpha); + // Periodic viseme activity log (Verbose — enable with log verbosity for debugging) + static int32 TickLogCount = 0; + if (++TickLogCount % 30 == 1) + { + FName DominantViseme = FName("sil"); + float DominantWeight = 0.0f; + for (const FName& Name : VisemeNames) + { + const float W = SmoothedVisemes.FindOrAdd(Name); + if (W > DominantWeight) + { + DominantWeight = W; + DominantViseme = Name; + } + } + + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("LipSync: Queue=%d Viseme=%s(%.2f)"), + VisemeQueue.Num(), *DominantViseme.ToString(), DominantWeight); + } // Convert visemes to ARKit blendshapes MapVisemesToBlendshapes(); + // ── Additional blendshape-level smoothing ───────────────────────────── + // A second smoothing pass on the final ARKit blendshape values removes + // residual jitter from the OVR→ARKit mapping step. This is lighter than + // the viseme-level smoothing and provides a natural "soft" look. 
+ { + const float BSmoothAlpha = FMath::Clamp(DeltaTime * SmoothingSpeed * 0.4f, 0.0f, 1.0f); + for (auto& Pair : CurrentBlendshapes) + { + const float* Prev = PreviousBlendshapes.Find(Pair.Key); + if (Prev) + { + Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha); + } + } + PreviousBlendshapes = CurrentBlendshapes; + } + // Auto-apply morph targets if a target mesh is set if (TargetMesh) { @@ -438,11 +583,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD const int16* Samples = reinterpret_cast(PCMData.GetData()); const int32 NumSamples = PCMData.Num() / sizeof(int16); - // DEBUG: log first audio chunk received static bool bFirstChunkLogged = false; if (!bFirstChunkLogged) { - UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples); + UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples); bFirstChunkLogged = true; } @@ -452,14 +596,819 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD FloatBuffer.Add(static_cast(Samples[i]) / 32768.0f); } - // Feed to rolling FFT analyzer + // ── STEP 1: ONE spectral analysis for the whole chunk (SHAPE) ───────── + // The FSpectrumAnalyzer's ring buffer returns nearly identical results for + // sequential 512-sample pushes. So we analyze the chunk as a whole to + // determine which mouth shape (viseme blend) to use. 
SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples); + SpectrumAnalyzer->PerformAnalysisIfPossible(true); + AnalyzeSpectrum(); // Sets TargetVisemes with shape-only weights (~1.0) - // Try to perform analysis (returns true when enough data for one FFT window) - if (SpectrumAnalyzer->PerformAnalysisIfPossible(true)) + // Save the spectral shape for this chunk + TMap ChunkShape = TargetVisemes; + + // ── Late start fix: when queue was empty, delay playback to wait for text ── + // Partial text usually arrives 50-100ms before audio, but sometimes audio + // comes first. A small delay gives text time to arrive and be applied to + // the first frames, preventing mute mouth at utterance start. + const bool bQueueWasEmpty = (VisemeQueue.Num() == 0); + + // ── STEP 2: Per-window amplitude + ZCR (DYNAMICS + VARIATION) ───────── + // For each 512-sample window (~32ms), compute: + // - RMS amplitude: captures syllable rhythm (natural opening/closing) + // - Zero-crossing rate: detects sibilants/fricatives within the chunk + // The shape (which visemes) stays constant per chunk, but the amplitude + // (how much) varies per window, creating realistic speech dynamics. + constexpr int32 WindowSize = 512; + int32 WindowsQueued = 0; + float MinAmp = 1.0f, MaxAmp = 0.0f; // For debug logging + + for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize) { - AnalyzeSpectrum(); + // RMS amplitude for this window + float SumSquares = 0.0f; + int32 ZeroCrossings = 0; + for (int32 i = 0; i < WindowSize; ++i) + { + const float S = FloatBuffer[Offset + i]; + SumSquares += S * S; + if (i > 0 && ((S >= 0.0f) != (FloatBuffer[Offset + i - 1] >= 0.0f))) + ZeroCrossings++; + } + + const float WindowRMS = FMath::Sqrt(SumSquares / static_cast(WindowSize)); + const float ZCR = static_cast(ZeroCrossings) / static_cast(WindowSize - 1); + + // Normalize amplitude: typical speech RMS at 16-bit is 0.02-0.15. 
+ // Scale up and apply power curve for dynamic range compression. + // pow(0.4) compresses more than sqrt (0.5): quiet parts become more + // visible while loud parts are slightly reduced. This ensures the + // first part of a TTS response (often quieter) has adequate lip movement. + float Amplitude = FMath::Clamp(WindowRMS * 10.0f, 0.0f, 1.5f); + Amplitude = FMath::Clamp(FMath::Pow(Amplitude, 0.4f), 0.0f, 1.0f); + + // Apply user-configurable amplitude attenuation (AmplitudeScale 0-1). + // This reduces overall mouth movement intensity without changing the + // viseme shape, giving control over "how much" the mouth opens. + Amplitude *= AmplitudeScale; + + MinAmp = FMath::Min(MinAmp, Amplitude); + MaxAmp = FMath::Max(MaxAmp, Amplitude); + + // Build this window's viseme frame + TMap WindowVisemes; + + if (Amplitude < 0.08f) + { + // Silence — mouth closed (between syllables / pauses) + for (const FName& Name : VisemeNames) + WindowVisemes.Add(Name, 0.0f); + WindowVisemes.FindOrAdd(FName("sil")) = 1.0f; + } + else + { + // Active speech — determine shape and scale by amplitude + + // High ZCR (>0.15) suggests fricative/sibilant energy. + // This provides within-chunk shape variation: voiced segments + // use the spectral shape, fricative segments override to SS/FF. 
+ if (ZCR > 0.15f) + { + for (const FName& Name : VisemeNames) + WindowVisemes.Add(Name, 0.0f); + + float SibStrength = FMath::Clamp((ZCR - 0.15f) * 5.0f, 0.0f, 1.0f); + WindowVisemes.FindOrAdd(FName("SS")) = SibStrength * Amplitude; + WindowVisemes.FindOrAdd(FName("FF")) = (1.0f - SibStrength) * Amplitude * 0.5f; + WindowVisemes.FindOrAdd(FName("ih")) = (1.0f - SibStrength) * Amplitude * 0.3f; + + // Blend in the chunk shape at reduced weight for non-sibilant visemes + for (const FName& Name : VisemeNames) + { + if (Name != FName("SS") && Name != FName("FF") && Name != FName("ih") && Name != FName("sil")) + { + WindowVisemes.FindOrAdd(Name) += ChunkShape.FindRef(Name) * Amplitude * (1.0f - SibStrength) * 0.4f; + } + } + } + else + { + // Voiced segment — use chunk spectral shape scaled by amplitude. + // This creates the primary speech animation: syllable rhythm + // from amplitude, mouth shape from spectral analysis. + for (const FName& Name : VisemeNames) + { + if (Name == FName("sil")) + { + WindowVisemes.Add(Name, 0.0f); + } + else + { + WindowVisemes.Add(Name, ChunkShape.FindRef(Name) * Amplitude); + } + } + } + } + + VisemeQueue.Add(WindowVisemes); + AmplitudeQueue.Add(Amplitude); + WindowsQueued++; } + + // ── Pseudo-speech fallback (no text available) ────────────────────── + // When text visemes are not available (server doesn't send partial text, + // or text arrives much later than audio), create natural-looking mouth + // movement by cycling through vowel/consonant shapes at speech rate. + // This is MUCH better than the single spectral shape (one shape for the + // entire chunk). If text arrives later, ApplyTextVisemesToQueue() will + // overwrite these frames with proper text-driven visemes. + if (TextVisemeSequence.Num() == 0 && WindowsQueued > 0) + { + // Vowel/consonant alternation at ~5 syllables/second. + // Each "syllable" = 3 frames vowel + 2 frames consonant = 5 frames × 32ms = 160ms. 
+ static const FName VowelShapes[] = { FName("aa"), FName("oh"), FName("E"), FName("ih"), FName("ou") }; + static const FName ConsonantShapes[] = { FName("nn"), FName("PP"), FName("DD"), FName("kk"), FName("RR") }; + constexpr int32 NumShapes = 5; + constexpr int32 VowelFrames = 3; // ~96ms open + constexpr int32 ConsonantFrames = 2; // ~64ms transition + constexpr int32 SyllableFrames = VowelFrames + ConsonantFrames; // ~160ms + + int32 StartIdx = VisemeQueue.Num() - WindowsQueued; + int32 ActiveCount = 0; + int32 PseudoCount = 0; + + for (int32 Idx = StartIdx; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx) + { + const float Amp = AmplitudeQueue[Idx]; + if (Amp < 0.08f) continue; // Keep silent frames as-is + + const int32 SyllableIdx = ActiveCount / SyllableFrames; + const int32 FrameInSyllable = ActiveCount % SyllableFrames; + const int32 ShapeIdx = SyllableIdx % NumShapes; + + TMap& Frame = VisemeQueue[Idx]; + for (const FName& Name : VisemeNames) + Frame.FindOrAdd(Name) = 0.0f; + + if (FrameInSyllable < VowelFrames) + { + // Vowel phase — mouth open + const FName Vowel = VowelShapes[ShapeIdx]; + Frame.FindOrAdd(Vowel) = Amp; + + // Anticipatory blend in last vowel frame towards consonant + if (FrameInSyllable == VowelFrames - 1) + { + const FName Consonant = ConsonantShapes[ShapeIdx]; + Frame.FindOrAdd(Vowel) = Amp * 0.7f; + Frame.FindOrAdd(Consonant) = Amp * 0.3f; + } + } + else + { + // Consonant/transition phase — mouth partially closed + const FName Consonant = ConsonantShapes[ShapeIdx]; + Frame.FindOrAdd(Consonant) = Amp * 0.7f; + + // Anticipatory blend towards next vowel in last consonant frame + if (FrameInSyllable == SyllableFrames - 1) + { + const int32 NextShapeIdx = (SyllableIdx + 1) % NumShapes; + const FName NextVowel = VowelShapes[NextShapeIdx]; + Frame.FindOrAdd(NextVowel) = Amp * 0.3f; + } + } + + ActiveCount++; + PseudoCount++; + } + + if (PseudoCount > 0) + { + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Pseudo-speech: %d 
active frames (%d syllables)"), + PseudoCount, (PseudoCount + SyllableFrames - 1) / SyllableFrames); + } + } + + // ── Late start fix + wait-for-text ─────────────────────────────────── + // When a new utterance begins (queue was empty): + // 1) Override leading silent frames (TTS fade-in) with minimum amplitude + // 2) If text hasn't arrived yet, hold playback until it does (max 500ms) + // This ensures ALL frames get text-driven visemes from the start. + if (bQueueWasEmpty && WindowsQueued > 0) + { + // Override leading silent frames with minimum amplitude + constexpr float MinStartAmplitude = 0.15f; + int32 FixedCount = 0; + for (int32 Idx = 0; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx) + { + if (AmplitudeQueue[Idx] >= 0.08f) + break; // Stop at first naturally active frame + + AmplitudeQueue[Idx] = MinStartAmplitude; + TMap& Frame = VisemeQueue[Idx]; + for (const FName& Name : VisemeNames) + { + Frame.FindOrAdd(Name) = (Name == FName("sil")) + ? 0.0f + : ChunkShape.FindRef(Name) * MinStartAmplitude; + } + FixedCount++; + } + + if (FixedCount > 0) + { + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Late start fix: overrode %d leading silent frames with min amplitude %.2f"), + FixedCount, MinStartAmplitude); + } + + // If text is already available (from partial responses arriving before audio), + // apply it immediately and start playback. + // Otherwise, hold playback until text arrives (wait-for-text mechanism). + if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3) + { + // Text already available — apply and start playback immediately + ApplyTextVisemesToQueue(); + PlaybackTimer = 0.0f; + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Text already available (%d visemes). 
Starting lip sync immediately."), + TextVisemeSequence.Num()); + } + else + { + // No text yet — hold playback until text arrives or timeout + bWaitingForText = true; + WaitingForTextStartTime = FPlatformTime::Seconds(); + PlaybackTimer = 0.0f; + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Waiting for text before starting lip sync (%d frames queued)."), + WindowsQueued); + } + } + else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0) + { + // Not a new utterance but text is available — apply to new frames + ApplyTextVisemesToQueue(); + } + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Audio chunk: %d samples → %d windows | Amp=[%.2f-%.2f] | Queue=%d (%.1fs) | TextVisemes=%d"), + NumSamples, WindowsQueued, + MinAmp, MaxAmp, VisemeQueue.Num(), + VisemeQueue.Num() * (512.0f / 16000.0f), TextVisemeSequence.Num()); +} + +// ───────────────────────────────────────────────────────────────────────────── +// Text-driven lip sync +// ───────────────────────────────────────────────────────────────────────────── + +void UElevenLabsLipSyncComponent::OnPartialTextReceived(const FString& PartialText) +{ + // If the previous utterance's full text was already received, + // this partial text belongs to a NEW utterance — start fresh. + if (bFullTextReceived) + { + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + } + + // Accumulate streaming text fragments (arrive BEFORE audio) + AccumulatedText += PartialText; + + // Convert accumulated text to viseme sequence progressively + ConvertTextToVisemes(AccumulatedText); + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Partial text: \"%s\" → %d visemes (accumulated: \"%s\")"), + *PartialText, TextVisemeSequence.Num(), *AccumulatedText); + + // If we were waiting for text to arrive before starting playback, + // apply text visemes to queued frames and start consuming. 
+ if (bWaitingForText && TextVisemeSequence.Num() >= 3) + { + if (VisemeQueue.Num() > 0) + { + ApplyTextVisemesToQueue(); + } + bWaitingForText = false; + PlaybackTimer = 0.0f; // Start consuming now + const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime; + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Text arrived after %.0fms wait. Starting lip sync playback."), + WaitElapsed * 1000.0); + } +} + +void UElevenLabsLipSyncComponent::OnTextResponseReceived(const FString& ResponseText) +{ + // Full text arrived — use it as the definitive source + bFullTextReceived = true; + AccumulatedText = ResponseText; + ConvertTextToVisemes(ResponseText); + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num()); + + // Apply to any remaining queued frames + if (VisemeQueue.Num() > 0) + { + ApplyTextVisemesToQueue(); + } + + // If we were waiting for text to arrive before starting playback, start now + if (bWaitingForText) + { + bWaitingForText = false; + PlaybackTimer = 0.0f; + const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime; + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("Full text arrived after %.0fms wait. Starting lip sync playback."), + WaitElapsed * 1000.0); + } + + // Log the viseme sequence for debugging + { + FString VisSeq; + int32 Count = 0; + for (const FName& V : TextVisemeSequence) + { + if (Count > 0) VisSeq += TEXT(" "); + VisSeq += V.ToString(); + if (++Count >= 30) { VisSeq += TEXT(" ..."); break; } + } + UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("Viseme sequence: [%s]"), *VisSeq); + } + + // NOTE: Do NOT reset bTextVisemesApplied here. It's reset in TickComponent + // when the queue empties AFTER the text has been consumed. Resetting it here + // would prevent TickComponent from cleaning up AccumulatedText, which then + // persists and corrupts the next utterance's partial text accumulation. 
+}
+
+namespace
+{
+    /** One multi-character grapheme → viseme rule used by ConvertTextToVisemes. */
+    struct FGraphemeRule
+    {
+        const TCHAR* Grapheme;   // lowercase letter group to match at the cursor
+        int32 Length;            // number of characters in Grapheme; all are consumed on a match
+        bool bWordFinalOnly;     // rule applies only when no letter follows the group
+        FName First;             // viseme emitted on a match
+        FName Second;            // optional second viseme (None when unused)
+    };
+
+    /** True when Rule's grapheme occurs in Text at Pos and, for word-final rules, no letter follows it. */
+    bool GraphemeMatchesAt(const FString& Text, const int32 Pos, const FGraphemeRule& Rule)
+    {
+        if (Pos + Rule.Length > Text.Len())
+        {
+            return false;
+        }
+        for (int32 Offset = 0; Offset < Rule.Length; ++Offset)
+        {
+            if (Text[Pos + Offset] != Rule.Grapheme[Offset])
+            {
+                return false;
+            }
+        }
+        if (Rule.bWordFinalOnly)
+        {
+            const int32 After = Pos + Rule.Length;
+            if (After < Text.Len() && FChar::IsAlpha(Text[After]))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+}
+
+/**
+ * Rebuilds TextVisemeSequence from Text using spelling rules.
+ *
+ * The text is lowercased and scanned left to right. At each position the
+ * multi-character graphemes are tried first (in table order, first match
+ * wins), then soft c/g, then the silent word-final letters, then the
+ * single-character map. Characters with no mapping are skipped. Finally,
+ * runs of "sil" are collapsed and leading/trailing "sil" entries removed.
+ *
+ * @param Text  Text to convert; any previous sequence is discarded.
+ */
+void UElevenLabsLipSyncComponent::ConvertTextToVisemes(const FString& Text)
+{
+    // Viseme names are resolved once instead of being rebuilt from string
+    // literals for every character of every (re)conversion.
+    static const FName Sil("sil");
+    static const FName PP("PP");
+    static const FName FF("FF");
+    static const FName TH("TH");
+    static const FName DD("DD");
+    static const FName KK("kk");
+    static const FName CH("CH");
+    static const FName SS("SS");
+    static const FName NN("nn");
+    static const FName RR("RR");
+    static const FName AA("aa");
+    static const FName EE("E");
+    static const FName IH("ih");
+    static const FName OH("oh");
+    static const FName OU("ou");
+
+    // Multi-character graphemes, tried in this order. 3-letter groups come
+    // first so that e.g. "eau" wins over "au" and "ain" over "ai".
+    static const FGraphemeRule Rules[] =
+    {
+        // ── 3-char graphemes ──────────────────────────────────────────────
+        { TEXT("eau"), 3, false, OH, FName() },   // eau → /o/
+        { TEXT("ain"), 3, false, EE, FName() },   // ain → /ɛ̃/
+        { TEXT("ein"), 3, false, EE, FName() },   // ein → /ɛ̃/
+        { TEXT("oeu"), 3, false, OH, FName() },   // oeu → /ø/
+
+        // ── 2-char graphemes ──────────────────────────────────────────────
+        { TEXT("ou"), 2, false, OU, FName() },    // ou → /u/
+        { TEXT("oi"), 2, false, OU, AA },         // oi → /wa/ (ou + aa)
+        { TEXT("on"), 2, false, OH, FName() },    // on → /ɔ̃/
+        { TEXT("om"), 2, true,  OH, FName() },    // om at end of word → /ɔ̃/
+        { TEXT("an"), 2, false, AA, FName() },    // an → /ɑ̃/
+        { TEXT("am"), 2, false, AA, FName() },    // am → /ɑ̃/
+        { TEXT("en"), 2, false, AA, FName() },    // en → /ɑ̃/ (French)
+        { TEXT("em"), 2, true,  AA, FName() },    // em at end of word → /ɑ̃/
+        { TEXT("in"), 2, false, IH, FName() },    // in → /ɛ̃/
+        { TEXT("im"), 2, false, IH, FName() },    // im → /ɛ̃/
+        { TEXT("un"), 2, false, IH, FName() },    // un → /œ̃/
+        { TEXT("au"), 2, false, OH, FName() },    // au → /o/
+        { TEXT("ai"), 2, false, EE, FName() },    // ai → /ɛ/
+        { TEXT("ei"), 2, false, EE, FName() },    // ei → /ɛ/
+        { TEXT("eu"), 2, false, OH, FName() },    // eu → /ø/
+        { TEXT("ch"), 2, false, CH, FName() },    // ch → /ʃ/
+        { TEXT("sh"), 2, false, CH, FName() },    // sh → /ʃ/
+        { TEXT("gn"), 2, false, NN, FName() },    // gn → /ɲ/
+        { TEXT("ph"), 2, false, FF, FName() },    // ph → /f/
+        { TEXT("th"), 2, false, TH, FName() },    // th → /θ/
+        { TEXT("qu"), 2, false, KK, FName() },    // qu → /k/
+        { TEXT("ll"), 2, false, RR, FName() },    // ll → /l/ (single)
+        { TEXT("ss"), 2, false, SS, FName() },    // ss → /s/
+        { TEXT("mm"), 2, false, PP, FName() },    // mm → /m/
+        { TEXT("nn"), 2, false, NN, FName() },    // nn → /n/
+        { TEXT("tt"), 2, false, DD, FName() },    // tt → /t/
+    };
+
+    TextVisemeSequence.Reset();
+
+    // Lowercase for matching
+    const FString Lower = Text.ToLower();
+    const int32 Len = Lower.Len();
+
+    int32 Pos = 0;
+    while (Pos < Len)
+    {
+        // ── Multi-character graphemes ─────────────────────────────────────
+        bool bMatchedRule = false;
+        for (const FGraphemeRule& Rule : Rules)
+        {
+            if (GraphemeMatchesAt(Lower, Pos, Rule))
+            {
+                TextVisemeSequence.Add(Rule.First);
+                if (!Rule.Second.IsNone())
+                {
+                    TextVisemeSequence.Add(Rule.Second);
+                }
+                Pos += Rule.Length;
+                bMatchedRule = true;
+                break;
+            }
+        }
+        if (bMatchedRule)
+        {
+            continue;
+        }
+
+        const TCHAR C = Lower[Pos];
+        const TCHAR Next = (Pos + 1 < Len) ? Lower[Pos + 1] : 0;
+
+        // ── Soft c / g before e, i, y ─────────────────────────────────────
+        // Only the consonant is consumed; the vowel is processed on the next pass.
+        if ((C == 'c' || C == 'g') && (Next == 'e' || Next == 'i' || Next == 'y'))
+        {
+            TextVisemeSequence.Add(C == 'c' ? SS : CH);   // ce/ci/cy → /s/, ge/gi/gy → /ʒ/
+            ++Pos;
+            continue;
+        }
+
+        // ── French silent letters at end of word ──────────────────────────
+        // Final s, t, d, x, z are typically silent ("vous", "comment", "allez"),
+        // and so is a plain final 'e' that follows a letter (accented variants
+        // are distinct characters and are not affected).
+        const bool bIsWordFinal = (Pos + 1 >= Len) || !FChar::IsAlpha(Lower[Pos + 1]);
+        if (bIsWordFinal && (C == 's' || C == 't' || C == 'd' || C == 'x' || C == 'z'))
+        {
+            ++Pos;
+            continue;
+        }
+        if (C == 'e' && bIsWordFinal && Pos > 0 && FChar::IsAlpha(Lower[Pos - 1]))
+        {
+            ++Pos;
+            continue;
+        }
+
+        // ── Single characters ─────────────────────────────────────────────
+        switch (C)
+        {
+        // Vowels
+        case 'a': case TCHAR(0xE0): case TCHAR(0xE2):                      // a, à, â
+            TextVisemeSequence.Add(AA); break;
+        case 'e': case TCHAR(0xE9): case TCHAR(0xE8): case TCHAR(0xEA):
+        case TCHAR(0xEB):                                                  // e, é, è, ê, ë
+            TextVisemeSequence.Add(EE); break;
+        case 'i': case TCHAR(0xEE): case TCHAR(0xEF):                      // i, î, ï
+            TextVisemeSequence.Add(IH); break;
+        case 'o': case TCHAR(0xF4):                                        // o, ô
+            TextVisemeSequence.Add(OH); break;
+        case 'u': case TCHAR(0xF9): case TCHAR(0xFB): case TCHAR(0xFC):    // u, ù, û, ü
+            TextVisemeSequence.Add(OU); break;
+        case 'y':
+            TextVisemeSequence.Add(IH); break;
+
+        // Consonants
+        case 'b': case 'm': case 'p':
+            TextVisemeSequence.Add(PP); break;
+        case 'c': case 'g': case 'k': case 'q':
+            TextVisemeSequence.Add(KK); break;
+        case 'd': case 't':
+            TextVisemeSequence.Add(DD); break;
+        case 'f': case 'v':
+            TextVisemeSequence.Add(FF); break;
+        case 'h':
+            // Silent in French, aspirated in English — skip
+            break;
+        case 'j':
+            TextVisemeSequence.Add(CH); break;
+        case 'l': case 'r':
+            TextVisemeSequence.Add(RR); break;
+        case 'n':
+            TextVisemeSequence.Add(NN); break;
+        case 's': case 'z': case TCHAR(0xE7):                              // s, z, ç
+            TextVisemeSequence.Add(SS); break;
+        case 'w':
+            TextVisemeSequence.Add(OU); break;
+        case 'x':
+            TextVisemeSequence.Add(KK);
+            TextVisemeSequence.Add(SS); break;
+
+        // Space / punctuation → silence
+        case ' ': case ',': case '.': case '!': case '?': case ';': case ':':
+        case '-': case '\n': case '\r':
+            TextVisemeSequence.Add(Sil); break;
+
+        default:
+            // Unknown character — skip
+            break;
+        }
+
+        ++Pos;
+    }
+
+    // ── Post-processing: collapse and trim silence ────────────────────────
+    // "Bonjour, " generates two sil (comma + space). Keep a single sil between
+    // words and none at the very start or end of the sequence.
+    {
+        TArray<FName> Merged;
+        Merged.Reserve(TextVisemeSequence.Num());
+        for (const FName& V : TextVisemeSequence)
+        {
+            // Skipping sil while Merged is empty drops leading silence;
+            // skipping it after another sil collapses runs.
+            if (V == Sil && (Merged.Num() == 0 || Merged.Last() == Sil))
+            {
+                continue;
+            }
+            Merged.Add(V);
+        }
+        while (Merged.Num() > 0 && Merged.Last() == Sil)
+        {
+            Merged.RemoveAt(Merged.Num() - 1);
+        }
+
+        TextVisemeSequence = MoveTemp(Merged);
+    }
+}
+
+// Duration weights for viseme types.
+// Vowels naturally last longer than consonants in speech.
+// These weights control how many audio frames each viseme occupies.
+/**
+ * Relative duration of a viseme on the text timeline.
+ *
+ * @param Viseme  OVR viseme name.
+ * @return Weight used by ApplyTextVisemesToQueue to decide how many audio
+ *         frames the viseme occupies; 1.0 for "sil" and for any unlisted name.
+ */
+static float GetVisemeDurationWeight(const FName& Viseme)
+{
+    // Names are resolved once; this function runs for every viseme of every
+    // (re)application, so avoid rebuilding FNames from literals on each call.
+    static const FName AA("aa");
+    static const FName OH("oh");
+    static const FName EE("E");
+    static const FName IH("ih");
+    static const FName OU("ou");
+    static const FName RR("RR");
+    static const FName NN("nn");
+    static const FName SS("SS");
+    static const FName FF("FF");
+    static const FName CH("CH");
+    static const FName TH("TH");
+    static const FName PP("PP");
+    static const FName DD("DD");
+    static const FName KK("kk");
+
+    // Open vowels — sustained, mouth held open
+    if (Viseme == AA || Viseme == OH || Viseme == EE)
+        return 2.0f;
+    // Closer vowels
+    if (Viseme == IH || Viseme == OU)
+        return 1.7f;
+
+    // Liquids / nasals — semi-sustained
+    if (Viseme == RR || Viseme == NN)
+        return 1.5f;
+
+    // Fricatives — moderate duration
+    if (Viseme == SS || Viseme == FF || Viseme == CH || Viseme == TH)
+        return 1.2f;
+
+    // Plosives — short closure (not too short, to avoid a frantic look)
+    if (Viseme == PP || Viseme == DD || Viseme == KK)
+        return 0.8f;
+
+    // Silence and anything unrecognised
+    return 1.0f;
+}
+
+/**
+ * Overwrites the shapes of the queued frames with text-derived visemes.
+ *
+ * Frames whose stored amplitude is below the activity threshold are left
+ * untouched. The remaining ("active") frames are spread over the viseme
+ * sequence in proportion to each viseme's duration weight, and each frame is
+ * rebuilt as: text viseme shape × stored amplitude, with a partial blend
+ * towards the next viseme near the end of the current one.
+ *
+ * Does nothing when there is no text, no queued frame, or no active frame.
+ * Sets bTextVisemesApplied when frames were rewritten.
+ */
+void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue()
+{
+    static const FName SilViseme("sil");
+
+    // Frames with an amplitude below this keep their current (silent) shape.
+    constexpr float ActiveAmplitudeThreshold = 0.08f;
+    // Final fraction of each viseme during which the next shape is blended in.
+    constexpr float BlendZone = 0.3f;
+
+    const int32 NumVisemes = TextVisemeSequence.Num();
+    if (NumVisemes == 0 || VisemeQueue.Num() == 0) return;
+
+    // Only frames present in BOTH parallel queues can be rewritten. Counting
+    // and rewriting use the same bound so the timeline always spans exactly
+    // the frames that are actually processed.
+    const int32 FrameCount = FMath::Min(VisemeQueue.Num(), AmplitudeQueue.Num());
+
+    int32 ActiveFrames = 0;
+    for (int32 Idx = 0; Idx < FrameCount; ++Idx)
+    {
+        if (AmplitudeQueue[Idx] >= ActiveAmplitudeThreshold)
+            ActiveFrames++;
+    }
+
+    if (ActiveFrames == 0) return;
+
+    // ── Duration-weighted distribution ────────────────────────────────────
+    // CumulativeWeight[i] = sum of the weights of visemes 0..i-1, so viseme i
+    // occupies [CumulativeWeight[i], CumulativeWeight[i + 1]) on the timeline
+    // and the last entry is the total weighted duration.
+    TArray<float> CumulativeWeight;
+    CumulativeWeight.SetNum(NumVisemes + 1);
+    CumulativeWeight[0] = 0.0f;
+    for (int32 V = 0; V < NumVisemes; ++V)
+    {
+        CumulativeWeight[V + 1] = CumulativeWeight[V] + GetVisemeDurationWeight(TextVisemeSequence[V]);
+    }
+    const float TotalWeight = CumulativeWeight[NumVisemes];
+
+    int32 ActiveIdx = 0;
+    // Timeline positions never decrease from one active frame to the next, so
+    // the current viseme index only ever moves forward and is carried over
+    // between frames instead of being searched from the start every time.
+    int32 VisemeIdx = 0;
+    for (int32 Idx = 0; Idx < FrameCount; ++Idx)
+    {
+        const float Amp = AmplitudeQueue[Idx];
+
+        if (Amp < ActiveAmplitudeThreshold)
+        {
+            // Silent frame — keep as silence
+            continue;
+        }
+
+        // Where are we in the weighted timeline? (0..TotalWeight)
+        const float TimelinePos = (static_cast<float>(ActiveIdx) / static_cast<float>(ActiveFrames)) * TotalWeight;
+
+        // Advance to the viseme whose interval contains TimelinePos
+        // (clamped to the last viseme).
+        while (VisemeIdx < NumVisemes - 1 && TimelinePos >= CumulativeWeight[VisemeIdx + 1])
+        {
+            ++VisemeIdx;
+        }
+
+        const FName TextViseme = TextVisemeSequence[VisemeIdx];
+
+        // Progress within the current viseme (0..1)
+        const float VisemeStart = CumulativeWeight[VisemeIdx];
+        const float VisemeDuration = CumulativeWeight[VisemeIdx + 1] - VisemeStart;
+        const float LocalProgress = (VisemeDuration > 0.01f)
+            ? FMath::Clamp((TimelinePos - VisemeStart) / VisemeDuration, 0.0f, 1.0f)
+            : 0.0f;
+
+        // Viseme that follows (the last viseme is its own successor)
+        const int32 NextIdx = FMath::Min(VisemeIdx + 1, NumVisemes - 1);
+        const FName NextViseme = TextVisemeSequence[NextIdx];
+
+        // Rebuild this frame: text-derived shape × stored amplitude
+        TMap<FName, float>& Frame = VisemeQueue[Idx];
+        for (const FName& Name : VisemeNames)
+        {
+            Frame.FindOrAdd(Name) = 0.0f;
+        }
+
+        if (TextViseme == SilViseme)
+        {
+            // Text-driven silence — mouth closes
+            Frame.FindOrAdd(SilViseme) = 1.0f;
+        }
+        else
+        {
+            // Anticipatory blending: inside the blend zone, shift weight
+            // gradually towards the next shape, unless that shape is the
+            // same viseme or silence.
+            float BlendToNext = 0.0f;
+            if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme && NextViseme != SilViseme)
+            {
+                BlendToNext = (LocalProgress - (1.0f - BlendZone)) / BlendZone;
+            }
+
+            // Primary viseme shape × amplitude
+            Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext * 0.5f);
+
+            // Blend towards next viseme (BlendToNext is only non-zero when
+            // the next viseme is a different, non-silent shape)
+            if (BlendToNext > 0.0f)
+            {
+                Frame.FindOrAdd(NextViseme) += Amp * BlendToNext * 0.5f;
+            }
+        }
+
+        ActiveIdx++;
+    }
+
+    bTextVisemesApplied = true;
+
+    UE_LOG(LogElevenLabsLipSync, Log,
+        TEXT("Applied %d text visemes to %d active frames (of %d total)"),
+        TextVisemeSequence.Num(), ActiveFrames, VisemeQueue.Num());
 }
 
 void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
@@ -478,14 +1427,9 @@ void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
 
     const float TotalEnergy = VoiceEnergy + F1Energy + F2Energy + F3Energy + SibilantEnergy;
 
-    // DEBUG: log energy levels periodically
-    static int32 AnalysisCount = 0;
-    if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
-    {
-        UE_LOG(LogElevenLabsLipSync, Log,
-            TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
-            TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
-    }
+    UE_LOG(LogElevenLabsLipSync,
Verbose, + TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"), + TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy); EstimateVisemes(TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy); } @@ -519,131 +1463,103 @@ void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy, TargetVisemes.FindOrAdd(Name) = 0.0f; } - // Silence threshold — below this, mouth is closed - constexpr float SilenceThreshold = 0.002f; - - if (TotalEnergy < SilenceThreshold) + // Below noise floor → silence shape + if (TotalEnergy < 0.01f) { TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; return; } - // Normalize band energies relative to total + // ── Spectral ratios determine mouth SHAPE (not intensity) ──────────── + // These weights are ~1.0 (full strength). Per-window amplitude in + // OnAudioChunkReceived scales them to create speech dynamics. + // This function produces a "shape template" for the entire audio chunk. const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f); const float NormF1 = F1Energy * InvTotal; const float NormF2 = F2Energy * InvTotal; const float NormF3 = F3Energy * InvTotal; const float NormSibilant = SibilantEnergy * InvTotal; - // Energy-based intensity (how "loud" the speech is — drives overall jaw opening) - // Scale to a usable 0-1 range. The constant is empirically tuned. - const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f); + // Brightness: ratio of high-freq to low-freq energy. + // Low brightness = rounded lips (oh, ou). High = spread lips (E, ih, SS). + const float Brightness = FMath::Clamp( + (NormF2 + NormF3 + NormSibilant * 2.0f) / FMath::Max(NormF1 + 0.01f, 0.01f), + 0.0f, 4.0f) / 4.0f; - // ── Classification based on spectral shape ─────────────────────────────── - // The approach: compute "votes" for each viseme category based on where - // the spectral energy is concentrated. Multiple visemes can be active - // simultaneously (blended). 
+ // ── Primary vowel/consonant shape (mutually exclusive) ─────────────── - // Fricatives / sibilants: high-frequency energy dominates - if (NormSibilant > 0.25f) + if (NormSibilant > 0.2f) { - const float FricativeWeight = NormSibilant * Intensity; - // Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq) + // Sibilant chunk — per-window ZCR will refine this further + float SibWeight = FMath::Clamp(NormSibilant * 2.0f, 0.0f, 1.0f); if (NormF3 > NormF2) { - TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight; + TargetVisemes.FindOrAdd(FName("SS")) = SibWeight; } else { - TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f; - TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f; + TargetVisemes.FindOrAdd(FName("CH")) = SibWeight * 0.7f; + TargetVisemes.FindOrAdd(FName("SS")) = SibWeight * 0.3f; } - // F/V component - TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f; + TargetVisemes.FindOrAdd(FName("FF")) = SibWeight * 0.3f; } - - // Voiced speech: most energy in voice + F1 + F2 - if (NormSibilant < 0.5f) + else if (Brightness > 0.55f) { - const float VoicedWeight = (1.0f - NormSibilant) * Intensity; - - // Open vowels: strong F1 = wide jaw opening - if (NormF1 > 0.3f) + // Bright (front vowel): E or ih — spread lips + if (NormF1 > 0.2f) { - if (NormF2 > 0.35f) - { - // High F2 + high F1 → front open vowel (A as in "cat") - TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1; - } - else - { - // Low F2 + high F1 → back open vowel (O as in "go") - TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f; - TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f; - } + TargetVisemes.FindOrAdd(FName("E")) = 1.0f; + TargetVisemes.FindOrAdd(FName("aa")) = 0.3f; } - - // Mid vowels: moderate F1 - if (NormF1 > 0.15f && NormF1 <= 0.3f) + else { - if (NormF2 > 0.4f) - { - // High F2 → front mid vowel (E as in "bed") - TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f; - } - else - { 
- // Low F2 → rounded mid vowel - TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f; - } - } - - // Close vowels: weak F1 - if (NormF1 <= 0.15f && NormF2 > 0.0f) - { - if (NormF2 > 0.4f) - { - // High F2 → front close vowel (I as in "see") - TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f; - } - else - { - // Low F2 → back close vowel (OO as in "boot") - TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f; - } - } - - // Nasals / liquids: prominent F3 with low sibilant - if (NormF3 > 0.2f && NormSibilant < 0.15f) - { - if (NormF1 < 0.2f) - { - TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f; - } - else - { - TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f; - } - } - - // Plosive detection: very low F1 with moderate energy = lips/tongue closed - if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f) - { - TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f; - TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f; + TargetVisemes.FindOrAdd(FName("ih")) = 0.8f; } } - - // TH detection: moderate sibilant + moderate F3 (dental fricative) - if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f) + else if (Brightness < 0.3f) { - TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f; + // Dark (back vowel): oh or ou — rounded lips + if (NormF1 > 0.2f) + { + TargetVisemes.FindOrAdd(FName("oh")) = 1.0f; + } + else + { + TargetVisemes.FindOrAdd(FName("ou")) = 0.8f; + } + } + else + { + // Neutral / open vowel: aa — wide open jaw + TargetVisemes.FindOrAdd(FName("aa")) = 1.0f; } - // Ensure at least some silence weight when energy is very low - if (Intensity < 0.1f) + // ── Secondary consonant contributions (additive) ───────────────────── + + // Nasals (N, M, NG): prominent F3, low sibilant + if (NormF3 > 0.25f && NormSibilant < 0.15f) { - TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f; + TargetVisemes.FindOrAdd(FName("nn")) = 0.5f; + TargetVisemes.FindOrAdd(FName("RR")) = 
0.2f; + } + + // Plosive hint (P, B): very low F1 + if (NormF1 < 0.08f && NormSibilant < 0.2f) + { + TargetVisemes.FindOrAdd(FName("PP")) = 0.5f; + TargetVisemes.FindOrAdd(FName("DD")) = 0.3f; + } + + // Labiodental (F, V): moderate sibilant + lip involvement + if (NormSibilant > 0.12f && NormSibilant < 0.3f && NormF1 < 0.15f) + { + TargetVisemes.FindOrAdd(FName("FF")) = 0.6f; + } + + // Dental (TH): moderate sibilant + moderate F3 + if (NormSibilant > 0.12f && NormSibilant < 0.35f && NormF3 > 0.15f) + { + TargetVisemes.FindOrAdd(FName("TH")) = 0.4f; } } @@ -686,9 +1602,8 @@ void UElevenLabsLipSyncComponent::ApplyMorphTargets() { if (!TargetMesh) return; - // DEBUG: log blendshape values periodically static int32 ApplyCount = 0; - if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps + if (++ApplyCount % 120 == 1) { FString DebugStr; for (const auto& Pair : CurrentBlendshapes) @@ -708,7 +1623,7 @@ void UElevenLabsLipSyncComponent::ApplyMorphTargets() } if (DebugStr.Len() > 0) { - UE_LOG(LogElevenLabsLipSync, Log, TEXT("%s: %s"), + UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("%s: %s"), bUseCurveMode ? TEXT("Curves") : TEXT("Blendshapes"), *DebugStr); } } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index e5a5fe2..e3c1ced 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -136,6 +136,17 @@ public: meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. 
Each event gives one text chunk, not the accumulated text.")) bool bEnableAgentPartialResponse = false; + /** Pre-buffer delay (ms) before starting audio playback on the first chunk. + * ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them. + * Pre-buffering delays playback start so the second chunk arrives before the + * first finishes playing, eliminating the audible gap mid-sentence. + * Higher values = fewer gaps but more latency on the first word. + * Set to 0 for immediate playback (may cause mid-sentence pauses). */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency", + meta = (ClampMin = "0", ClampMax = "500", + ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness.")) + int32 AudioPreBufferMs = 250; + /** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs", meta = (ClampMin = "0.0", @@ -257,6 +268,11 @@ public: UFUNCTION(BlueprintPure, Category = "ElevenLabs") const FElevenLabsConversationInfo& GetConversationInfo() const; + /** True while audio is being pre-buffered (playback hasn't started yet). + * Used by the LipSync component to pause viseme queue consumption. */ + UFUNCTION(BlueprintPure, Category = "ElevenLabs") + bool IsPreBuffering() const { return bPreBuffering; } + /** Access the underlying WebSocket proxy (advanced use). 
*/ UFUNCTION(BlueprintPure, Category = "ElevenLabs") UElevenLabsWebSocketProxy* GetWebSocketProxy() const { return WebSocketProxy; } @@ -353,6 +369,14 @@ private: TArray AudioQueue; FCriticalSection AudioQueueLock; + // Reusable zero-filled buffer fed to USoundWaveProcedural during TTS gaps + // to keep the audio component alive (prevents stop on buffer underrun). + TArray SilenceBuffer; + + // Pre-buffer state: delay playback start to absorb TTS inter-chunk gaps. + bool bPreBuffering = false; + double PreBufferStartTime = 0.0; + // Silence detection: how many consecutive ticks with an empty audio queue. int32 SilentTickCount = 0; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h index 9a98e1d..abaf230 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h @@ -51,11 +51,19 @@ public: ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler.")) float LipSyncStrength = 1.0f; + /** Scales the audio amplitude driving mouth movement. + * Lower values produce subtler animation, higher values are more pronounced. + * Use this to tone down overly strong lip movement without changing the shape. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync", + meta = (ClampMin = "0.5", ClampMax = "1.0", + ToolTip = "Audio amplitude scale.\n0.5 = subtle, 0.75 = balanced, 1.0 = full.\nReduces overall mouth movement without affecting viseme shape.")) + float AmplitudeScale = 0.75f; + /** How quickly viseme weights interpolate towards new values each frame. 
*/ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync", - meta = (ClampMin = "1.0", ClampMax = "100.0", - ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good.")) - float SmoothingSpeed = 20.0f; + meta = (ClampMin = "35.0", ClampMax = "65.0", + ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive.")) + float SmoothingSpeed = 50.0f; // ── Events ──────────────────────────────────────────────────────────────── @@ -87,6 +95,20 @@ private: /** Receives raw PCM from the agent component. */ void OnAudioChunkReceived(const TArray& PCMData); + /** Receives full text response from the agent component. */ + UFUNCTION() + void OnTextResponseReceived(const FString& ResponseText); + + /** Receives partial text streaming from the agent component. */ + UFUNCTION() + void OnPartialTextReceived(const FString& PartialText); + + /** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */ + void ConvertTextToVisemes(const FString& Text); + + /** Apply text-derived viseme shapes to the remaining queued frames. */ + void ApplyTextVisemesToQueue(); + /** Extract frequency band energies from the spectrum analyzer. */ void AnalyzeSpectrum(); @@ -122,6 +144,13 @@ private: // ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes) TMap CurrentBlendshapes; + // Previous frame's blendshape values for additional output smoothing + TMap PreviousBlendshapes; + + // Last consumed queue frame — used for inter-frame interpolation + // to create continuous motion instead of 32ms step-wise jumps + TMap LastConsumedVisemes; + // MetaHuman mode: Face mesh has no morph targets, use animation curves instead. // Set automatically in BeginPlay when TargetMesh has 0 morph targets. 
bool bUseCurveMode = false; @@ -129,9 +158,48 @@ private: // Cache of ARKit→MetaHuman curve name conversions to avoid per-frame string ops. TMap CurveNameCache; + // RMS amplitude from the latest audio chunk (0-1 range, drives jaw opening) + float CurrentAmplitude = 0.0f; + + // ── Viseme queue ────────────────────────────────────────────────────────── + + // Queue of per-window viseme analysis results. + // OnAudioChunkReceived builds one frame per 512-sample window (~32ms). + // TickComponent consumes them at the correct playback rate. + TArray> VisemeQueue; + + // Parallel queue of per-window amplitude values (for text-driven shape replacement) + TArray AmplitudeQueue; + + // Timer for consuming queued viseme frames at the FFT window rate + float PlaybackTimer = 0.0f; + // Whether we have pending analysis results to process bool bHasPendingAnalysis = false; + // ── Text-driven lip sync ────────────────────────────────────────────────── + + // Accumulated partial text from streaming (agent_chat_response_part events). + // Built up token-by-token before the audio arrives. + FString AccumulatedText; + + // Ordered sequence of OVR viseme names derived from text. + // E.g. "Bonjour" → [PP, oh, nn, CH, ou, RR] + TArray TextVisemeSequence; + + // Whether text-based visemes have been applied to the current queue + bool bTextVisemesApplied = false; + + // Set when agent_response arrives (full text for this utterance). + // Prevents resetting AccumulatedText between audio chunks of the + // SAME utterance — only reset once the full response is confirmed. + bool bFullTextReceived = false; + + // Wait-for-text mechanism: when audio arrives without text, hold playback + // until text arrives (partial or full) so all frames get proper text visemes. + bool bWaitingForText = false; + double WaitingForTextStartTime = 0.0; + // Cached reference to the agent component on the same Actor TWeakObjectPtr AgentComponent; FDelegateHandle AudioDataHandle;