diff --git a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset index 5707f93..f17f98a 100644 Binary files a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset and b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp index 3a5f073..a50f33d 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp @@ -612,104 +612,233 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick PlaybackTimer += DeltaTime; + bool bConsumedFrame = false; + float LastConsumedAmp = 0.0f; + while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0) { LastConsumedVisemes = VisemeQueue[0]; TargetVisemes = VisemeQueue[0]; + if (AmplitudeQueue.Num() > 0) + { + LastConsumedAmp = AmplitudeQueue[0]; + AmplitudeQueue.RemoveAt(0); + } VisemeQueue.RemoveAt(0); - if (AmplitudeQueue.Num() > 0) AmplitudeQueue.RemoveAt(0); PlaybackTimer -= WindowDuration; + bConsumedFrame = true; } - // ── Inter-frame interpolation ───────────────────────────────────────── - // Instead of holding the same TargetVisemes for 32ms then jumping to the - // next frame, blend smoothly between the last consumed frame and the next - // queued frame. This prevents the "frantic" look from step-wise changes - // and creates continuous, natural-looking mouth motion. 
- if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0) + // ── Playback-time envelope update ───────────────────────────────────── + // The AudioEnvelopeValue was originally only set in OnAudioChunkReceived + // (at chunk ARRIVAL time). In timeline mode, we need it to track the + // amplitude of audio being PLAYED (consumed frames), not received. + // Also decay towards 0 when no audio is being consumed (Q=0), + // so the timeline pauses and the mouth closes properly. { - const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f); - for (const FName& Name : VisemeNames) + const float EnvAttackCoeff = 1.0f - FMath::Exp(-WindowDuration + / FMath::Max(0.001f, EnvelopeAttackMs * 0.001f)); + const float EnvReleaseCoeff = 1.0f - FMath::Exp(-WindowDuration + / FMath::Max(0.001f, EnvelopeReleaseMs * 0.001f)); + + if (bConsumedFrame) { - const float From = LastConsumedVisemes.FindRef(Name); - const float To = VisemeQueue[0].FindRef(Name); - TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T); + // Update envelope from consumed frame amplitude (playback-synced) + const float Coeff = (LastConsumedAmp > AudioEnvelopeValue) + ? EnvAttackCoeff : EnvReleaseCoeff; + AudioEnvelopeValue += (LastConsumedAmp - AudioEnvelopeValue) * Coeff; + } + else if (VisemeQueue.Num() == 0) + { + // No audio being played — fast decay to close mouth promptly. + // Uses 40ms time constant (faster than EnvelopeReleaseMs=100ms) + // so the mouth closes in ~120ms instead of ~600ms. 
+ const float FastDecayCoeff = 1.0f - FMath::Exp(-DeltaTime / 0.040f); + AudioEnvelopeValue *= (1.0f - FastDecayCoeff); + if (AudioEnvelopeValue < 0.001f) AudioEnvelopeValue = 0.0f; } } - // If queue runs dry, decay towards silence and reset text state - if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f) - { - for (const FName& Name : VisemeNames) - { - TargetVisemes.FindOrAdd(Name) = 0.0f; - } - TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; - PlaybackTimer = 0.0f; - - // Reset text state — but ONLY after the full response (agent_response) - // has arrived AND text was applied. This prevents destroying text between - // audio chunks of the SAME utterance: partial text arrives once, but - // ElevenLabs splits the audio into 2-3 chunks with gaps. Without - // bFullTextReceived, the text is erased after chunk 1's queue empties, - // leaving chunks 2-3 without text visemes (spectral fallback only). - if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived) - { - AccumulatedText.Reset(); - TextVisemeSequence.Reset(); - bTextVisemesApplied = false; - bFullTextReceived = false; - } - } - - // ── Asymmetric smoothing ───────────────────────────────────────────────── - // At SmoothingSpeed=50: AttackSpeed=50 → alpha=0.83/frame, ~1-2 frames to target. - // ReleaseSpeed=32.5 → alpha=0.54/frame, ~3 frames to 70%. Mouth opens quickly, - // closes more gradually for natural-looking speech. - // - // In pose mode, use gentler smoothing: the spectral analysis oscillates - // between frames and with 50+ simultaneous curves per viseme, rapid - // tracking amplifies noise into visible vibration. Slower smoothing - // absorbs the oscillation while keeping movements deliberate and clean. - const bool bPoseSmoothing = (PoseExtractedCurveMap.Num() > 0); - const float AttackSpeed = SmoothingSpeed * (bPoseSmoothing ? 0.7f : 1.0f); - const float ReleaseSpeed = SmoothingSpeed * (bPoseSmoothing ? 
0.45f : 0.65f); + const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0); bool bAnyNonZero = false; - for (const FName& Name : VisemeNames) + if (bPoseMode && bVisemeTimelineActive && VisemeTimeline.Num() > 0) { - float& Current = SmoothedVisemes.FindOrAdd(Name); - const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength; + // ── POSE MODE: Decoupled viseme timeline ───────────────────────── + // Visemes are evaluated from an independent timeline at render + // framerate, completely decoupled from 32ms audio chunk windows. + // Audio provides only the amplitude envelope for modulation. - const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed; - const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f); + // Pause timeline during silence gaps between TTS chunks. + // This keeps the timeline in sync when ElevenLabs sends audio + // in 2-3 chunks with ~2s gaps between them. + const bool bShouldPause = (VisemeQueue.Num() == 0 && AudioEnvelopeValue < 0.05f); + if (!bShouldPause) + VisemeTimelineCursor += DeltaTime; - Current = FMath::Lerp(Current, Target, Alpha); - - // Snap to zero to avoid infinite tiny values - if (Current < 0.001f) Current = 0.0f; - if (Current > 0.001f) bAnyNonZero = true; - } - - // Periodic viseme activity log (Verbose — enable with log verbosity for debugging) - static int32 TickLogCount = 0; - if (++TickLogCount % 30 == 1) - { - FName DominantViseme = FName("sil"); - float DominantWeight = 0.0f; - for (const FName& Name : VisemeNames) + // Timeline end handling + const float TimelineEnd = VisemeTimeline.Last().StartSec + VisemeTimeline.Last().DurationSec; + if (VisemeTimelineCursor >= TimelineEnd) { - const float W = SmoothedVisemes.FindOrAdd(Name); - if (W > DominantWeight) + // Clamp at end — envelope will close the mouth naturally + VisemeTimelineCursor = TimelineEnd - 0.001f; + + // If queue is also dry, deactivate and reset text state + if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f) { 
- DominantWeight = W; - DominantViseme = Name; + bVisemeTimelineActive = false; + + if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived) + { + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + } } } - UE_LOG(LogElevenLabsLipSync, Verbose, - TEXT("LipSync: Queue=%d Viseme=%s(%.2f)"), - VisemeQueue.Num(), *DominantViseme.ToString(), DominantWeight); + // Dead zone: quadratic suppression of low envelope values + // to eliminate mouth trembling before silence transitions. + float EffectiveEnv = AudioEnvelopeValue; + const float DeadZone = 0.15f; + if (EffectiveEnv < DeadZone) + { + const float DzT = EffectiveEnv / DeadZone; + EffectiveEnv = DzT * DzT * DeadZone; + } + + // Find current viseme entry in timeline + int32 CurrentIdx = VisemeTimeline.Num() - 1; + for (int32 i = 0; i < VisemeTimeline.Num(); ++i) + { + if (VisemeTimelineCursor < VisemeTimeline[i].StartSec + VisemeTimeline[i].DurationSec) + { + CurrentIdx = i; + break; + } + } + + const FVisemeTimelineEntry& Entry = VisemeTimeline[CurrentIdx]; + const float LocalProgress = FMath::Clamp( + (VisemeTimelineCursor - Entry.StartSec) / FMath::Max(0.001f, Entry.DurationSec), + 0.0f, 1.0f); + + const int32 NextIdx = FMath::Min(CurrentIdx + 1, VisemeTimeline.Num() - 1); + const FName& NextViseme = VisemeTimeline[NextIdx].Viseme; + + // Full-duration crossfade with quintic smootherstep (C2 continuous). + // NO hold phase — the mouth is always in motion, transitioning from + // one viseme to the next over the entire duration. This eliminates + // the "static hold then snap" feel of partial crossfades. + // + // Quintic smootherstep: T³(6T²-15T+10) — smoother than smoothstep, + // zero 1st AND 2nd derivative at endpoints = no visible acceleration + // discontinuity at viseme boundaries. 
+ float BlendToNext = 0.0f; + if (NextViseme != Entry.Viseme) + { + const float T = LocalProgress; // 0..1 over full duration + BlendToNext = T * T * T * (T * (T * 6.0f - 15.0f) + 10.0f); + } + + // Set TargetVisemes from timeline × amplitude envelope + for (const FName& Name : VisemeNames) + TargetVisemes.FindOrAdd(Name) = 0.0f; + + const float Amp = FMath::Max(EffectiveEnv, 0.0f); + TargetVisemes.FindOrAdd(Entry.Viseme) = Amp * (1.0f - BlendToNext); + if (BlendToNext > 0.0f) + TargetVisemes.FindOrAdd(NextViseme) = Amp * BlendToNext; + + // Moderate Lerp smoothing — smooths transitions across 50+ pose curves. + // Attack=0.55 per 30fps frame: reaches target in ~3 frames (99ms) + // Release=0.40 per 30fps frame: fades in ~5 frames (165ms) + // Slower than before (was 0.90/0.70) to eliminate saccades while the quintic + // crossfade shapes the transition. Rescaled by DeltaTime so timing is framerate-independent. + const float PoseAttackAlpha = 1.0f - FMath::Pow(1.0f - 0.55f, DeltaTime * 30.0f); + const float PoseReleaseAlpha = 1.0f - FMath::Pow(1.0f - 0.40f, DeltaTime * 30.0f); + + for (const FName& Name : VisemeNames) + { + float& Current = SmoothedVisemes.FindOrAdd(Name); + const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength; + const float Alpha = (Target > Current) ? PoseAttackAlpha : PoseReleaseAlpha; + Current = FMath::Lerp(Current, Target, Alpha); + if (Current < 0.005f) Current = 0.0f; + if (Current > 0.001f) bAnyNonZero = true; + } + } + else + { + // ── NON-POSE MODE (or pose mode without timeline) ──────────────── + // Queue-based system: inter-frame interpolation + asymmetric smoothing. 
+ + // Inter-frame interpolation: blend between last consumed and next queued frame + if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0) + { + const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f); + for (const FName& Name : VisemeNames) + { + const float From = LastConsumedVisemes.FindRef(Name); + const float To = VisemeQueue[0].FindRef(Name); + TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T); + } + } + + // Queue-dry: decay to silence and reset text state + if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f) + { + for (const FName& Name : VisemeNames) + TargetVisemes.FindOrAdd(Name) = 0.0f; + TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; + PlaybackTimer = 0.0f; + + if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived) + { + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + } + } + + // Asymmetric smoothing (fast attack, slow release) + const float AttackSpeed = SmoothingSpeed; + const float ReleaseSpeed = SmoothingSpeed * 0.65f; + + for (const FName& Name : VisemeNames) + { + float& Current = SmoothedVisemes.FindOrAdd(Name); + const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength; + const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed; + const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f); + Current = FMath::Lerp(Current, Target, Alpha); + if (Current < 0.001f) Current = 0.0f; + if (Current > 0.001f) bAnyNonZero = true; + } + } + + // Real-time viseme debug log — every 3 ticks (~100ms at 30fps). + // Shows all active smoothed visemes + envelope to diagnose trembling. 
+ static int32 TickLogCount = 0; + if (++TickLogCount % 3 == 0 && bAnyNonZero && UE_LOG_ACTIVE(LogElevenLabsLipSync, Verbose)) + { + FString ActiveVisemes; + for (const FName& Name : VisemeNames) + { + const float W = SmoothedVisemes.FindOrAdd(Name); + if (W > 0.01f) + { + if (ActiveVisemes.Len() > 0) ActiveVisemes += TEXT(" "); + ActiveVisemes += FString::Printf(TEXT("%s=%.3f"), *Name.ToString(), W); + } + } + if (ActiveVisemes.IsEmpty()) ActiveVisemes = TEXT("(none)"); + + UE_LOG(LogElevenLabsLipSync, Verbose, + TEXT("VISEME Q=%d Env=%.3f TL=%.0fms | %s"), + VisemeQueue.Num(), AudioEnvelopeValue, VisemeTimelineCursor * 1000.0f, *ActiveVisemes); } // Convert visemes to ARKit blendshapes @@ -794,7 +923,14 @@ void UElevenLabsLipSyncComponent::OnAgentStopped() VisemeQueue.Reset(); AmplitudeQueue.Reset(); PlaybackTimer = 0.0f; + AudioEnvelopeValue = 0.0f; bWaitingForText = false; + + // Deactivate viseme timeline (will be rebuilt on next utterance) + bVisemeTimelineActive = false; + VisemeTimeline.Reset(); + VisemeTimelineCursor = 0.0f; + TotalActiveFramesSeen = 0; } void UElevenLabsLipSyncComponent::ResetToNeutral() @@ -803,6 +939,7 @@ void UElevenLabsLipSyncComponent::ResetToNeutral() VisemeQueue.Reset(); AmplitudeQueue.Reset(); PlaybackTimer = 0.0f; + AudioEnvelopeValue = 0.0f; bWaitingForText = false; // Reset text-driven lip sync state for the interrupted utterance @@ -811,6 +948,12 @@ void UElevenLabsLipSyncComponent::ResetToNeutral() bTextVisemesApplied = false; bFullTextReceived = false; + // Reset decoupled viseme timeline + bVisemeTimelineActive = false; + VisemeTimeline.Reset(); + VisemeTimelineCursor = 0.0f; + TotalActiveFramesSeen = 0; + // Snap all visemes to silence immediately (no smoothing delay) for (const FName& Name : VisemeNames) { @@ -883,29 +1026,27 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD if (bPoseMode) { - // ── Pose mode: hybrid amplitude ────────────────────────────────── - // Two amplitude levels serve different purposes: + // ── Pose mode: 
envelope-modulated amplitude ────────────────────── + // An envelope follower applied to per-window RMS creates a smooth + // amplitude curve that tracks speech dynamics: // - // 1. CHUNK-LEVEL RMS → shape intensity (smooth, no per-window jitter). - // All active frames use this amplitude so 50+ pose curves don't - // vibrate from 32ms-level amplitude oscillation. + // - Fast ATTACK: mouth opens quickly on speech onset / louder syllables + // - Slow RELEASE: mouth closes gradually between syllables / pauses + // - No per-window jitter: the envelope smooths 32ms-level oscillation + // that would vibrate 50+ simultaneous pose curves // - // 2. PER-WINDOW RMS → silence detection only (binary: speech or pause). - // Detects intra-chunk silence windows that chunk-level averaging - // would miss. This makes audio pauses (commas, breathing) visible - // as the mouth properly closes during gaps. + // Raw per-window RMS is still used for silence detection (binary gate). + // The envelope value drives the shape intensity (how open the mouth is). 
// - float ChunkSumSq = 0.0f; - for (int32 i = 0; i < NumSamples; ++i) - ChunkSumSq += FloatBuffer[i] * FloatBuffer[i]; - const float ChunkRMS = FMath::Sqrt(ChunkSumSq / FMath::Max(1.0f, static_cast<float>(NumSamples))); - float ChunkAmplitude = FMath::Clamp(ChunkRMS * 10.0f, 0.0f, 1.5f); - ChunkAmplitude = FMath::Clamp(FMath::Pow(ChunkAmplitude, 0.4f), 0.0f, 1.0f); - ChunkAmplitude *= AmplitudeScale; + const float WindowDurationSec = static_cast<float>(WindowSize) / 16000.0f; // ~32ms + const float AttackCoeff = 1.0f - FMath::Exp(-WindowDurationSec + / FMath::Max(0.001f, EnvelopeAttackMs * 0.001f)); + const float ReleaseCoeff = 1.0f - FMath::Exp(-WindowDurationSec + / FMath::Max(0.001f, EnvelopeReleaseMs * 0.001f)); for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize) { - // Per-window amplitude for silence detection + // Per-window RMS amplitude float WindowSumSq = 0.0f; const int32 WindowEnd = FMath::Min(Offset + WindowSize, NumSamples); const int32 WindowLen = WindowEnd - Offset; @@ -916,7 +1057,24 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD WindowAmp = FMath::Clamp(FMath::Pow(WindowAmp, 0.4f), 0.0f, 1.0f); WindowAmp *= AmplitudeScale; - const bool bSilentWindow = (WindowAmp < 0.08f); + // Envelope follower: fast attack, slow release + const float Coeff = (WindowAmp > AudioEnvelopeValue) ? AttackCoeff : ReleaseCoeff; + AudioEnvelopeValue += (WindowAmp - AudioEnvelopeValue) * Coeff; + + // Dead zone: quadratic suppression of low amplitudes. + // Low envelope values (0.08-0.15) produce tiny mouth movements + // that look like trembling before silence. The quadratic curve + // pushes envelope values below ~0.11 under the 0.08 silence gate, creating a + // clean cut into silence instead of gradual fade-to-tremble. 
+ const float DeadZone = 0.15f; + float EffectiveAmp = AudioEnvelopeValue; + if (EffectiveAmp < DeadZone) + { + const float T = EffectiveAmp / DeadZone; // 0..1 + EffectiveAmp = T * T * DeadZone; // Quadratic: low → ~0 + } + + const bool bSilentWindow = (EffectiveAmp < 0.08f); TMap<FName, float> Frame; for (const FName& Name : VisemeNames) @@ -925,17 +1083,18 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD if (bSilentWindow) { Frame.FindOrAdd(FName("sil")) = 1.0f; - AmplitudeQueue.Add(0.0f); // Marked silent for ApplyTextVisemesToQueue + AmplitudeQueue.Add(0.0f); } else { - Frame.FindOrAdd(FName("aa")) = ChunkAmplitude; // Smooth chunk intensity - AmplitudeQueue.Add(ChunkAmplitude); + Frame.FindOrAdd(FName("aa")) = EffectiveAmp; + AmplitudeQueue.Add(EffectiveAmp); + TotalActiveFramesSeen++; } VisemeQueue.Add(Frame); - MinAmp = FMath::Min(MinAmp, bSilentWindow ? 0.0f : ChunkAmplitude); - MaxAmp = FMath::Max(MaxAmp, ChunkAmplitude); + MinAmp = FMath::Min(MinAmp, bSilentWindow ? 0.0f : EffectiveAmp); + MaxAmp = FMath::Max(MaxAmp, EffectiveAmp); WindowsQueued++; } } @@ -1156,7 +1315,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3) { // Text already available — apply and start playback immediately - ApplyTextVisemesToQueue(); + if (bPoseMode) + BuildVisemeTimeline(); + else + ApplyTextVisemesToQueue(); PlaybackTimer = 0.0f; UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("Text already available (%d visemes). 
Starting lip sync immediately."), @@ -1176,7 +1338,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray& PCMD else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0) { // Not a new utterance but text is available — apply to new frames - ApplyTextVisemesToQueue(); + if (bPoseMode) + BuildVisemeTimeline(); + else + ApplyTextVisemesToQueue(); } UE_LOG(LogElevenLabsLipSync, Log, @@ -1216,9 +1381,13 @@ void UElevenLabsLipSyncComponent::OnPartialTextReceived(const FString& PartialTe // apply text visemes to queued frames and start consuming. if (bWaitingForText && TextVisemeSequence.Num() >= 3) { + const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0); if (VisemeQueue.Num() > 0) { - ApplyTextVisemesToQueue(); + if (bPoseMode) + BuildVisemeTimeline(); + else + ApplyTextVisemesToQueue(); } bWaitingForText = false; PlaybackTimer = 0.0f; // Start consuming now @@ -1239,10 +1408,13 @@ void UElevenLabsLipSyncComponent::OnTextResponseReceived(const FString& Response UE_LOG(LogElevenLabsLipSync, Log, TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num()); - // Apply to any remaining queued frames - if (VisemeQueue.Num() > 0) + // Apply to any remaining queued frames (or extend timeline in pose mode) { - ApplyTextVisemesToQueue(); + const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0); + if (bPoseMode) + BuildVisemeTimeline(); + else if (VisemeQueue.Num() > 0) + ApplyTextVisemesToQueue(); } // If we were waiting for text to arrive before starting playback, start now @@ -1775,22 +1947,25 @@ void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue() Frame.FindOrAdd(Name) = 0.0f; } - // Anticipatory blending: in the last 30% of each viseme, - // gradually blend towards the next viseme shape. - const float BlendZone = 0.3f; + // Full crossfade with smoothstep in the last 40% of each viseme. 
+ // The previous 30%/50%-max blend caused a discontinuity at viseme + // boundaries (50/50 → 0/100 jump), visible as trembling on 50+ curves. + // Now: full 0→100% crossfade with smoothstep (ease-in-out) curve + // eliminates any discontinuity at the boundary. + const float BlendZone = 0.4f; float BlendToNext = 0.0f; if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme) { - BlendToNext = (LocalProgress - (1.0f - BlendZone)) / BlendZone; + const float T = (LocalProgress - (1.0f - BlendZone)) / BlendZone; // 0..1 + BlendToNext = T * T * (3.0f - 2.0f * T); // Smoothstep: ease-in-out } - // Primary viseme shape × amplitude - Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext * 0.5f); + // Crossfade: current viseme fades out, next fades in + Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext); - // Blend towards next viseme if (BlendToNext > 0.0f) { - Frame.FindOrAdd(NextViseme) += Amp * BlendToNext * 0.5f; + Frame.FindOrAdd(NextViseme) += Amp * BlendToNext; } ActiveIdx++; @@ -1806,6 +1981,90 @@ void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue() FinalRatio, FinalRatio * 32.0f); } +// ───────────────────────────────────────────────────────────────────────────── +// Decoupled viseme timeline (pose mode) +// ───────────────────────────────────────────────────────────────────────────── + +void UElevenLabsLipSyncComponent::BuildVisemeTimeline() +{ + if (TextVisemeSequence.Num() == 0) return; + + // Use TOTAL active frames seen across all chunks (not just remaining queue). + // Frames already consumed by TickComponent are counted too, so the timeline + // is properly scaled to the full audio duration. 
+ constexpr float WindowDurationSec = 512.0f / 16000.0f; // ~32ms + if (TotalActiveFramesSeen == 0) return; + + const float AudioDurationSec = TotalActiveFramesSeen * WindowDurationSec; + + // ── Subsample viseme sequence to natural speech rate ────────────────── + // Real speech has ~4-5 distinct mouth shapes per second (one per syllable + // nucleus). The text-to-viseme pipeline can produce 15-25 visemes for a + // short phrase, which at 1.2s audio = ~60ms each = saccades. + // Cap at ~10 visemes/sec (100ms minimum) — allows more phoneme detail + // while staying within natural French syllable rate (~8-10 shapes/sec). + constexpr float MinVisemeDurationSec = 0.100f; + const int32 MaxVisemes = FMath::Max(2, FMath::CeilToInt(AudioDurationSec / MinVisemeDurationSec)); + + TArray<FName> FinalSequence; + if (TextVisemeSequence.Num() > MaxVisemes) + { + // Subsample: take evenly-spaced visemes from the full sequence + for (int32 i = 0; i < MaxVisemes; ++i) + { + const int32 Idx = (i * (TextVisemeSequence.Num() - 1)) / FMath::Max(1, MaxVisemes - 1); + const FName& V = TextVisemeSequence[Idx]; + // Skip consecutive duplicates + if (FinalSequence.Num() == 0 || FinalSequence.Last() != V) + FinalSequence.Add(V); + } + } + else + { + FinalSequence = TextVisemeSequence; + } + + if (FinalSequence.Num() == 0) return; + + // Compute natural durations from phoneme weights + float NaturalTotalSec = 0.0f; + for (const FName& V : FinalSequence) + { + NaturalTotalSec += GetVisemeDurationWeight(V) * 0.120f; + } + + // Scale factor: match actual audio duration + const float Scale = (NaturalTotalSec > 0.01f) ? AudioDurationSec / NaturalTotalSec : 1.0f; + + // If timeline is already playing, preserve absolute cursor position. + const float SavedCursor = bVisemeTimelineActive ? 
VisemeTimelineCursor : 0.0f; + + // Build timeline entries with scaled durations + VisemeTimeline.Reset(); + float CursorSec = 0.0f; + for (const FName& V : FinalSequence) + { + FVisemeTimelineEntry Entry; + Entry.Viseme = V; + Entry.StartSec = CursorSec; + Entry.DurationSec = GetVisemeDurationWeight(V) * 0.120f * Scale; + VisemeTimeline.Add(Entry); + CursorSec += Entry.DurationSec; + } + + // Restore cursor: keep absolute position, clamped to new timeline + VisemeTimelineCursor = FMath::Min(SavedCursor, FMath::Max(0.0f, CursorSec - 0.001f)); + + bVisemeTimelineActive = true; + bTextVisemesApplied = true; + + UE_LOG(LogElevenLabsLipSync, Log, + TEXT("Built viseme timeline: %d entries (from %d, max %d), audio=%.0fms, scale=%.2f → %.0fms/viseme avg"), + FinalSequence.Num(), TextVisemeSequence.Num(), MaxVisemes, + AudioDurationSec * 1000.0f, Scale, + (FinalSequence.Num() > 0) ? (CursorSec * 1000.0f / FinalSequence.Num()) : 0.0f); +} + void UElevenLabsLipSyncComponent::AnalyzeSpectrum() { if (!SpectrumAnalyzer) return; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h index 8d987ac..a324697 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h @@ -11,6 +11,15 @@ class UElevenLabsConversationalAgentComponent; class UElevenLabsLipSyncPoseMap; class USkeletalMeshComponent; +/** A single entry in the decoupled viseme timeline. + * Built from text phoneme analysis, played back independently of audio chunks. 
*/ +struct FVisemeTimelineEntry +{ + FName Viseme; + float StartSec = 0.0f; // absolute start time from utterance start + float DurationSec = 0.0f; // length of this viseme's slot (crossfades into the next entry) +}; + // Fired every tick when viseme/blendshape data has been updated. DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady); @@ -66,6 +75,24 @@ public: ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive.")) float SmoothingSpeed = 50.0f; + // ── Audio Envelope ────────────────────────────────────────────────────── + + /** Envelope attack time in milliseconds. + * Controls how fast the mouth opens when speech starts or gets louder. + * Lower = snappier onset, higher = gentler opening. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync", + meta = (ClampMin = "5.0", ClampMax = "100.0", + ToolTip = "Envelope attack (ms).\n10 = snappy, 15 = balanced, 30 = gentle.\nHow fast the mouth opens on speech onset.")) + float EnvelopeAttackMs = 15.0f; + + /** Envelope release time in milliseconds. + * Controls how slowly the mouth closes when speech gets quieter. + * Higher = smoother, more natural decay. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync", + meta = (ClampMin = "20.0", ClampMax = "500.0", + ToolTip = "Envelope release (ms).\n60 = responsive, 100 = balanced, 200 = smooth/cinematic.\nHow slowly the mouth closes between syllables.")) + float EnvelopeReleaseMs = 100.0f; + // ── Phoneme Pose Map ───────────────────────────────────────────────────── /** Optional pose map asset mapping OVR visemes to phoneme AnimSequences. @@ -130,9 +157,14 @@ private: /** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */ void ConvertTextToVisemes(const FString& Text); - /** Apply text-derived viseme shapes to the remaining queued frames. */ + /** Apply text-derived viseme shapes to the remaining queued frames (non-pose mode). 
*/ void ApplyTextVisemesToQueue(); + /** Build a decoupled viseme timeline from text (pose mode). + * Visemes get natural phoneme durations, evaluated continuously in Tick + * instead of being quantized to 32ms audio chunks. */ + void BuildVisemeTimeline(); + /** Extract frequency band energies from the spectrum analyzer. */ void AnalyzeSpectrum(); @@ -198,6 +230,10 @@ private: // Timer for consuming queued viseme frames at the FFT window rate float PlaybackTimer = 0.0f; + // Envelope follower state: smoothed amplitude that tracks speech dynamics + // with fast attack (mouth opens quickly) and slow release (closes gradually). + float AudioEnvelopeValue = 0.0f; + // Whether we have pending analysis results to process bool bHasPendingAnalysis = false; @@ -214,6 +250,16 @@ private: // Whether text-based visemes have been applied to the current queue bool bTextVisemesApplied = false; + // ── Decoupled viseme timeline (pose mode) ──────────────────────────────── + // In pose mode, text visemes are played from an independent timeline + // evaluated each tick at render framerate, instead of being quantized + // to 32ms audio chunk windows. Audio provides only the amplitude envelope. + + TArray<FVisemeTimelineEntry> VisemeTimeline; + float VisemeTimelineCursor = 0.0f; // current playback position (seconds) + bool bVisemeTimelineActive = false; // true when timeline is playing + int32 TotalActiveFramesSeen = 0; // cumulative non-silent frames across all chunks + // Set when agent_response arrives (full text for this utterance). // Prevents resetting AccumulatedText between audio chunks of the // SAME utterance — only reset once the full response is confirmed.