v2.1.0: Decoupled viseme timeline with quintic crossfade and tuned dead zone

Decouple viseme timing from 32ms audio chunks by introducing an independent
FVisemeTimelineEntry timeline evaluated at render framerate. Playback-time
envelope tracking from consumed queue frames replaces arrival-time-only
updates, with fast 40ms decay when queue is empty.

- Viseme subsampling caps at ~10/sec (100ms min) to prevent saccades
- Full-duration quintic smootherstep crossfade (C2 continuous, no hold phase)
- Dead zone lowered to 0.15 for cleaner silence transitions
- TotalActiveFramesSeen cumulative counter for accurate timeline scaling
- Absolute cursor preservation on timeline rebuild
- Moderate Lerp smoothing (attack 0.55, release 0.40)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-24 16:46:36 +01:00
parent aa3010bae8
commit e57be0a1d9
3 changed files with 418 additions and 113 deletions

View File

@ -612,104 +612,233 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
PlaybackTimer += DeltaTime; PlaybackTimer += DeltaTime;
bool bConsumedFrame = false;
float LastConsumedAmp = 0.0f;
while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0) while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0)
{ {
LastConsumedVisemes = VisemeQueue[0]; LastConsumedVisemes = VisemeQueue[0];
TargetVisemes = VisemeQueue[0]; TargetVisemes = VisemeQueue[0];
if (AmplitudeQueue.Num() > 0)
{
LastConsumedAmp = AmplitudeQueue[0];
AmplitudeQueue.RemoveAt(0);
}
VisemeQueue.RemoveAt(0); VisemeQueue.RemoveAt(0);
if (AmplitudeQueue.Num() > 0) AmplitudeQueue.RemoveAt(0);
PlaybackTimer -= WindowDuration; PlaybackTimer -= WindowDuration;
bConsumedFrame = true;
} }
// ── Inter-frame interpolation ───────────────────────────────────────── // ── Playback-time envelope update ─────────────────────────────────────
// Instead of holding the same TargetVisemes for 32ms then jumping to the // The AudioEnvelopeValue was originally only set in OnAudioChunkReceived
// next frame, blend smoothly between the last consumed frame and the next // (at chunk ARRIVAL time). In timeline mode, we need it to track the
// queued frame. This prevents the "frantic" look from step-wise changes // amplitude of audio being PLAYED (consumed frames), not received.
// and creates continuous, natural-looking mouth motion. // Also decay towards 0 when no audio is being consumed (Q=0),
if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0) // so the timeline pauses and the mouth closes properly.
{ {
const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f); const float EnvAttackCoeff = 1.0f - FMath::Exp(-WindowDuration
for (const FName& Name : VisemeNames) / FMath::Max(0.001f, EnvelopeAttackMs * 0.001f));
const float EnvReleaseCoeff = 1.0f - FMath::Exp(-WindowDuration
/ FMath::Max(0.001f, EnvelopeReleaseMs * 0.001f));
if (bConsumedFrame)
{ {
const float From = LastConsumedVisemes.FindRef(Name); // Update envelope from consumed frame amplitude (playback-synced)
const float To = VisemeQueue[0].FindRef(Name); const float Coeff = (LastConsumedAmp > AudioEnvelopeValue)
TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T); ? EnvAttackCoeff : EnvReleaseCoeff;
AudioEnvelopeValue += (LastConsumedAmp - AudioEnvelopeValue) * Coeff;
}
else if (VisemeQueue.Num() == 0)
{
// No audio being played — fast decay to close mouth promptly.
// Uses 40ms time constant (faster than EnvelopeReleaseMs=100ms)
// so the mouth closes in ~120ms instead of ~600ms.
const float FastDecayCoeff = 1.0f - FMath::Exp(-DeltaTime / 0.040f);
AudioEnvelopeValue *= (1.0f - FastDecayCoeff);
if (AudioEnvelopeValue < 0.001f) AudioEnvelopeValue = 0.0f;
} }
} }
// If queue runs dry, decay towards silence and reset text state const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0);
if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
{
for (const FName& Name : VisemeNames)
{
TargetVisemes.FindOrAdd(Name) = 0.0f;
}
TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
PlaybackTimer = 0.0f;
// Reset text state — but ONLY after the full response (agent_response)
// has arrived AND text was applied. This prevents destroying text between
// audio chunks of the SAME utterance: partial text arrives once, but
// ElevenLabs splits the audio into 2-3 chunks with gaps. Without
// bFullTextReceived, the text is erased after chunk 1's queue empties,
// leaving chunks 2-3 without text visemes (spectral fallback only).
if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
{
AccumulatedText.Reset();
TextVisemeSequence.Reset();
bTextVisemesApplied = false;
bFullTextReceived = false;
}
}
// ── Asymmetric smoothing ─────────────────────────────────────────────────
// At SmoothingSpeed=50: AttackSpeed=50 → alpha=0.83/frame, ~1-2 frames to target.
// ReleaseSpeed=32.5 → alpha=0.54/frame, ~3 frames to 70%. Mouth opens quickly,
// closes more gradually for natural-looking speech.
//
// In pose mode, use gentler smoothing: the spectral analysis oscillates
// between frames and with 50+ simultaneous curves per viseme, rapid
// tracking amplifies noise into visible vibration. Slower smoothing
// absorbs the oscillation while keeping movements deliberate and clean.
const bool bPoseSmoothing = (PoseExtractedCurveMap.Num() > 0);
const float AttackSpeed = SmoothingSpeed * (bPoseSmoothing ? 0.7f : 1.0f);
const float ReleaseSpeed = SmoothingSpeed * (bPoseSmoothing ? 0.45f : 0.65f);
bool bAnyNonZero = false; bool bAnyNonZero = false;
for (const FName& Name : VisemeNames) if (bPoseMode && bVisemeTimelineActive && VisemeTimeline.Num() > 0)
{ {
float& Current = SmoothedVisemes.FindOrAdd(Name); // ── POSE MODE: Decoupled viseme timeline ─────────────────────────
const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength; // Visemes are evaluated from an independent timeline at render
// framerate, completely decoupled from 32ms audio chunk windows.
// Audio provides only the amplitude envelope for modulation.
const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed; // Pause timeline during silence gaps between TTS chunks.
const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f); // This keeps the timeline in sync when ElevenLabs sends audio
// in 2-3 chunks with ~2s gaps between them.
const bool bShouldPause = (VisemeQueue.Num() == 0 && AudioEnvelopeValue < 0.05f);
if (!bShouldPause)
VisemeTimelineCursor += DeltaTime;
Current = FMath::Lerp(Current, Target, Alpha); // Timeline end handling
const float TimelineEnd = VisemeTimeline.Last().StartSec + VisemeTimeline.Last().DurationSec;
// Snap to zero to avoid infinite tiny values if (VisemeTimelineCursor >= TimelineEnd)
if (Current < 0.001f) Current = 0.0f;
if (Current > 0.001f) bAnyNonZero = true;
}
// Periodic viseme activity log (Verbose — enable with log verbosity for debugging)
static int32 TickLogCount = 0;
if (++TickLogCount % 30 == 1)
{
FName DominantViseme = FName("sil");
float DominantWeight = 0.0f;
for (const FName& Name : VisemeNames)
{ {
const float W = SmoothedVisemes.FindOrAdd(Name); // Clamp at end — envelope will close the mouth naturally
if (W > DominantWeight) VisemeTimelineCursor = TimelineEnd - 0.001f;
// If queue is also dry, deactivate and reset text state
if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
{ {
DominantWeight = W; bVisemeTimelineActive = false;
DominantViseme = Name;
if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
{
AccumulatedText.Reset();
TextVisemeSequence.Reset();
bTextVisemesApplied = false;
bFullTextReceived = false;
}
} }
} }
UE_LOG(LogElevenLabsLipSync, Verbose, // Dead zone: quadratic suppression of low envelope values
TEXT("LipSync: Queue=%d Viseme=%s(%.2f)"), // to eliminate mouth trembling before silence transitions.
VisemeQueue.Num(), *DominantViseme.ToString(), DominantWeight); float EffectiveEnv = AudioEnvelopeValue;
const float DeadZone = 0.15f;
if (EffectiveEnv < DeadZone)
{
const float DzT = EffectiveEnv / DeadZone;
EffectiveEnv = DzT * DzT * DeadZone;
}
// Find current viseme entry in timeline
int32 CurrentIdx = VisemeTimeline.Num() - 1;
for (int32 i = 0; i < VisemeTimeline.Num(); ++i)
{
if (VisemeTimelineCursor < VisemeTimeline[i].StartSec + VisemeTimeline[i].DurationSec)
{
CurrentIdx = i;
break;
}
}
const FVisemeTimelineEntry& Entry = VisemeTimeline[CurrentIdx];
const float LocalProgress = FMath::Clamp(
(VisemeTimelineCursor - Entry.StartSec) / FMath::Max(0.001f, Entry.DurationSec),
0.0f, 1.0f);
const int32 NextIdx = FMath::Min(CurrentIdx + 1, VisemeTimeline.Num() - 1);
const FName& NextViseme = VisemeTimeline[NextIdx].Viseme;
// Full-duration crossfade with quintic smootherstep (C2 continuous).
// NO hold phase — the mouth is always in motion, transitioning from
// one viseme to the next over the entire duration. This eliminates
// the "static hold then snap" feel of partial crossfades.
//
// Quintic smootherstep: T³(6T²-15T+10) — smoother than smoothstep,
// zero 1st AND 2nd derivative at endpoints = no visible acceleration
// discontinuity at viseme boundaries.
float BlendToNext = 0.0f;
if (NextViseme != Entry.Viseme)
{
const float T = LocalProgress; // 0..1 over full duration
BlendToNext = T * T * T * (T * (T * 6.0f - 15.0f) + 10.0f);
}
// Set TargetVisemes from timeline × amplitude envelope
for (const FName& Name : VisemeNames)
TargetVisemes.FindOrAdd(Name) = 0.0f;
const float Amp = FMath::Max(EffectiveEnv, 0.0f);
TargetVisemes.FindOrAdd(Entry.Viseme) = Amp * (1.0f - BlendToNext);
if (BlendToNext > 0.0f)
TargetVisemes.FindOrAdd(NextViseme) = Amp * BlendToNext;
// Moderate Lerp smoothing — smooths transitions across 50+ pose curves.
// Attack=0.55: reaches target in ~3 frames (99ms at 30fps)
// Release=0.40: fades in ~5 frames (165ms at 30fps)
// Slower than before (was 0.90/0.70) to eliminate saccades while the
// smoothstep crossfade handles the primary transition shape.
const float PoseAttackAlpha = 0.55f;
const float PoseReleaseAlpha = 0.40f;
for (const FName& Name : VisemeNames)
{
float& Current = SmoothedVisemes.FindOrAdd(Name);
const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength;
const float Alpha = (Target > Current) ? PoseAttackAlpha : PoseReleaseAlpha;
Current = FMath::Lerp(Current, Target, Alpha);
if (Current < 0.005f) Current = 0.0f;
if (Current > 0.001f) bAnyNonZero = true;
}
}
else
{
// ── NON-POSE MODE (or pose mode without timeline) ────────────────
// Queue-based system: inter-frame interpolation + asymmetric smoothing.
// Inter-frame interpolation: blend between last consumed and next queued frame
if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0)
{
const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f);
for (const FName& Name : VisemeNames)
{
const float From = LastConsumedVisemes.FindRef(Name);
const float To = VisemeQueue[0].FindRef(Name);
TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T);
}
}
// Queue-dry: decay to silence and reset text state
if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
{
for (const FName& Name : VisemeNames)
TargetVisemes.FindOrAdd(Name) = 0.0f;
TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
PlaybackTimer = 0.0f;
if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
{
AccumulatedText.Reset();
TextVisemeSequence.Reset();
bTextVisemesApplied = false;
bFullTextReceived = false;
}
}
// Asymmetric smoothing (fast attack, slow release)
const float AttackSpeed = SmoothingSpeed;
const float ReleaseSpeed = SmoothingSpeed * 0.65f;
for (const FName& Name : VisemeNames)
{
float& Current = SmoothedVisemes.FindOrAdd(Name);
const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength;
const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed;
const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f);
Current = FMath::Lerp(Current, Target, Alpha);
if (Current < 0.001f) Current = 0.0f;
if (Current > 0.001f) bAnyNonZero = true;
}
}
// Real-time viseme debug log — every 3 ticks (~100ms at 30fps).
// Shows all active smoothed visemes + envelope to diagnose trembling.
static int32 TickLogCount = 0;
if (++TickLogCount % 3 == 0 && bAnyNonZero)
{
FString ActiveVisemes;
for (const FName& Name : VisemeNames)
{
const float W = SmoothedVisemes.FindOrAdd(Name);
if (W > 0.01f)
{
if (ActiveVisemes.Len() > 0) ActiveVisemes += TEXT(" ");
ActiveVisemes += FString::Printf(TEXT("%s=%.3f"), *Name.ToString(), W);
}
}
if (ActiveVisemes.IsEmpty()) ActiveVisemes = TEXT("(none)");
UE_LOG(LogElevenLabsLipSync, Log,
TEXT("VISEME Q=%d Env=%.3f TL=%.0fms | %s"),
VisemeQueue.Num(), AudioEnvelopeValue, VisemeTimelineCursor * 1000.0f, *ActiveVisemes);
} }
// Convert visemes to ARKit blendshapes // Convert visemes to ARKit blendshapes
@ -794,7 +923,14 @@ void UElevenLabsLipSyncComponent::OnAgentStopped()
VisemeQueue.Reset(); VisemeQueue.Reset();
AmplitudeQueue.Reset(); AmplitudeQueue.Reset();
PlaybackTimer = 0.0f; PlaybackTimer = 0.0f;
AudioEnvelopeValue = 0.0f;
bWaitingForText = false; bWaitingForText = false;
// Deactivate viseme timeline (will be rebuilt on next utterance)
bVisemeTimelineActive = false;
VisemeTimeline.Reset();
VisemeTimelineCursor = 0.0f;
TotalActiveFramesSeen = 0;
} }
void UElevenLabsLipSyncComponent::ResetToNeutral() void UElevenLabsLipSyncComponent::ResetToNeutral()
@ -803,6 +939,7 @@ void UElevenLabsLipSyncComponent::ResetToNeutral()
VisemeQueue.Reset(); VisemeQueue.Reset();
AmplitudeQueue.Reset(); AmplitudeQueue.Reset();
PlaybackTimer = 0.0f; PlaybackTimer = 0.0f;
AudioEnvelopeValue = 0.0f;
bWaitingForText = false; bWaitingForText = false;
// Reset text-driven lip sync state for the interrupted utterance // Reset text-driven lip sync state for the interrupted utterance
@ -811,6 +948,12 @@ void UElevenLabsLipSyncComponent::ResetToNeutral()
bTextVisemesApplied = false; bTextVisemesApplied = false;
bFullTextReceived = false; bFullTextReceived = false;
// Reset decoupled viseme timeline
bVisemeTimelineActive = false;
VisemeTimeline.Reset();
VisemeTimelineCursor = 0.0f;
TotalActiveFramesSeen = 0;
// Snap all visemes to silence immediately (no smoothing delay) // Snap all visemes to silence immediately (no smoothing delay)
for (const FName& Name : VisemeNames) for (const FName& Name : VisemeNames)
{ {
@ -883,29 +1026,27 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
if (bPoseMode) if (bPoseMode)
{ {
// ── Pose mode: hybrid amplitude ────────────────────────────────── // ── Pose mode: envelope-modulated amplitude ──────────────────────
// Two amplitude levels serve different purposes: // An envelope follower applied to per-window RMS creates a smooth
// amplitude curve that tracks speech dynamics:
// //
// 1. CHUNK-LEVEL RMS → shape intensity (smooth, no per-window jitter). // - Fast ATTACK: mouth opens quickly on speech onset / louder syllables
// All active frames use this amplitude so 50+ pose curves don't // - Slow RELEASE: mouth closes gradually between syllables / pauses
// vibrate from 32ms-level amplitude oscillation. // - No per-window jitter: the envelope smooths 32ms-level oscillation
// that would vibrate 50+ simultaneous pose curves
// //
// 2. PER-WINDOW RMS → silence detection only (binary: speech or pause). // Raw per-window RMS is still used for silence detection (binary gate).
// Detects intra-chunk silence windows that chunk-level averaging // The envelope value drives the shape intensity (how open the mouth is).
// would miss. This makes audio pauses (commas, breathing) visible
// as the mouth properly closes during gaps.
// //
float ChunkSumSq = 0.0f; const float WindowDurationSec = static_cast<float>(WindowSize) / 16000.0f; // ~32ms
for (int32 i = 0; i < NumSamples; ++i) const float AttackCoeff = 1.0f - FMath::Exp(-WindowDurationSec
ChunkSumSq += FloatBuffer[i] * FloatBuffer[i]; / FMath::Max(0.001f, EnvelopeAttackMs * 0.001f));
const float ChunkRMS = FMath::Sqrt(ChunkSumSq / FMath::Max(1.0f, static_cast<float>(NumSamples))); const float ReleaseCoeff = 1.0f - FMath::Exp(-WindowDurationSec
float ChunkAmplitude = FMath::Clamp(ChunkRMS * 10.0f, 0.0f, 1.5f); / FMath::Max(0.001f, EnvelopeReleaseMs * 0.001f));
ChunkAmplitude = FMath::Clamp(FMath::Pow(ChunkAmplitude, 0.4f), 0.0f, 1.0f);
ChunkAmplitude *= AmplitudeScale;
for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize) for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize)
{ {
// Per-window amplitude for silence detection // Per-window RMS amplitude
float WindowSumSq = 0.0f; float WindowSumSq = 0.0f;
const int32 WindowEnd = FMath::Min(Offset + WindowSize, NumSamples); const int32 WindowEnd = FMath::Min(Offset + WindowSize, NumSamples);
const int32 WindowLen = WindowEnd - Offset; const int32 WindowLen = WindowEnd - Offset;
@ -916,7 +1057,24 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
WindowAmp = FMath::Clamp(FMath::Pow(WindowAmp, 0.4f), 0.0f, 1.0f); WindowAmp = FMath::Clamp(FMath::Pow(WindowAmp, 0.4f), 0.0f, 1.0f);
WindowAmp *= AmplitudeScale; WindowAmp *= AmplitudeScale;
const bool bSilentWindow = (WindowAmp < 0.08f); // Envelope follower: fast attack, slow release
const float Coeff = (WindowAmp > AudioEnvelopeValue) ? AttackCoeff : ReleaseCoeff;
AudioEnvelopeValue += (WindowAmp - AudioEnvelopeValue) * Coeff;
// Dead zone: quadratic suppression of low amplitudes.
// Low envelope values (0.08-0.15) produce tiny mouth movements
// that look like trembling before silence. The quadratic curve
// pushes these below the silence gate, creating a
// clean cut into silence instead of gradual fade-to-tremble.
const float DeadZone = 0.15f;
float EffectiveAmp = AudioEnvelopeValue;
if (EffectiveAmp < DeadZone)
{
const float T = EffectiveAmp / DeadZone; // 0..1
EffectiveAmp = T * T * DeadZone; // Quadratic: low → ~0
}
const bool bSilentWindow = (EffectiveAmp < 0.08f);
TMap<FName, float> Frame; TMap<FName, float> Frame;
for (const FName& Name : VisemeNames) for (const FName& Name : VisemeNames)
@ -925,17 +1083,18 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
if (bSilentWindow) if (bSilentWindow)
{ {
Frame.FindOrAdd(FName("sil")) = 1.0f; Frame.FindOrAdd(FName("sil")) = 1.0f;
AmplitudeQueue.Add(0.0f); // Marked silent for ApplyTextVisemesToQueue AmplitudeQueue.Add(0.0f);
} }
else else
{ {
Frame.FindOrAdd(FName("aa")) = ChunkAmplitude; // Smooth chunk intensity Frame.FindOrAdd(FName("aa")) = EffectiveAmp;
AmplitudeQueue.Add(ChunkAmplitude); AmplitudeQueue.Add(EffectiveAmp);
TotalActiveFramesSeen++;
} }
VisemeQueue.Add(Frame); VisemeQueue.Add(Frame);
MinAmp = FMath::Min(MinAmp, bSilentWindow ? 0.0f : ChunkAmplitude); MinAmp = FMath::Min(MinAmp, bSilentWindow ? 0.0f : EffectiveAmp);
MaxAmp = FMath::Max(MaxAmp, ChunkAmplitude); MaxAmp = FMath::Max(MaxAmp, EffectiveAmp);
WindowsQueued++; WindowsQueued++;
} }
} }
@ -1156,7 +1315,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3) if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3)
{ {
// Text already available — apply and start playback immediately // Text already available — apply and start playback immediately
ApplyTextVisemesToQueue(); if (bPoseMode)
BuildVisemeTimeline();
else
ApplyTextVisemesToQueue();
PlaybackTimer = 0.0f; PlaybackTimer = 0.0f;
UE_LOG(LogElevenLabsLipSync, Verbose, UE_LOG(LogElevenLabsLipSync, Verbose,
TEXT("Text already available (%d visemes). Starting lip sync immediately."), TEXT("Text already available (%d visemes). Starting lip sync immediately."),
@ -1176,7 +1338,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0) else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0)
{ {
// Not a new utterance but text is available — apply to new frames // Not a new utterance but text is available — apply to new frames
ApplyTextVisemesToQueue(); if (bPoseMode)
BuildVisemeTimeline();
else
ApplyTextVisemesToQueue();
} }
UE_LOG(LogElevenLabsLipSync, Log, UE_LOG(LogElevenLabsLipSync, Log,
@ -1216,9 +1381,13 @@ void UElevenLabsLipSyncComponent::OnPartialTextReceived(const FString& PartialTe
// apply text visemes to queued frames and start consuming. // apply text visemes to queued frames and start consuming.
if (bWaitingForText && TextVisemeSequence.Num() >= 3) if (bWaitingForText && TextVisemeSequence.Num() >= 3)
{ {
const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0);
if (VisemeQueue.Num() > 0) if (VisemeQueue.Num() > 0)
{ {
ApplyTextVisemesToQueue(); if (bPoseMode)
BuildVisemeTimeline();
else
ApplyTextVisemesToQueue();
} }
bWaitingForText = false; bWaitingForText = false;
PlaybackTimer = 0.0f; // Start consuming now PlaybackTimer = 0.0f; // Start consuming now
@ -1239,10 +1408,13 @@ void UElevenLabsLipSyncComponent::OnTextResponseReceived(const FString& Response
UE_LOG(LogElevenLabsLipSync, Log, UE_LOG(LogElevenLabsLipSync, Log,
TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num()); TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num());
// Apply to any remaining queued frames // Apply to any remaining queued frames (or extend timeline in pose mode)
if (VisemeQueue.Num() > 0)
{ {
ApplyTextVisemesToQueue(); const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0);
if (bPoseMode)
BuildVisemeTimeline();
else if (VisemeQueue.Num() > 0)
ApplyTextVisemesToQueue();
} }
// If we were waiting for text to arrive before starting playback, start now // If we were waiting for text to arrive before starting playback, start now
@ -1775,22 +1947,25 @@ void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue()
Frame.FindOrAdd(Name) = 0.0f; Frame.FindOrAdd(Name) = 0.0f;
} }
// Anticipatory blending: in the last 30% of each viseme, // Full crossfade with smoothstep in the last 40% of each viseme.
// gradually blend towards the next viseme shape. // The previous 30%/50%-max blend caused a discontinuity at viseme
const float BlendZone = 0.3f; // boundaries (50/50 → 0/100 jump), visible as trembling on 50+ curves.
// Now: full 0→100% crossfade with smoothstep (ease-in-out) curve
// eliminates any discontinuity at the boundary.
const float BlendZone = 0.4f;
float BlendToNext = 0.0f; float BlendToNext = 0.0f;
if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme) if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme)
{ {
BlendToNext = (LocalProgress - (1.0f - BlendZone)) / BlendZone; const float T = (LocalProgress - (1.0f - BlendZone)) / BlendZone; // 0..1
BlendToNext = T * T * (3.0f - 2.0f * T); // Smoothstep: ease-in-out
} }
// Primary viseme shape × amplitude // Crossfade: current viseme fades out, next fades in
Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext * 0.5f); Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext);
// Blend towards next viseme
if (BlendToNext > 0.0f) if (BlendToNext > 0.0f)
{ {
Frame.FindOrAdd(NextViseme) += Amp * BlendToNext * 0.5f; Frame.FindOrAdd(NextViseme) += Amp * BlendToNext;
} }
ActiveIdx++; ActiveIdx++;
@ -1806,6 +1981,90 @@ void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue()
FinalRatio, FinalRatio * 32.0f); FinalRatio, FinalRatio * 32.0f);
} }
// ─────────────────────────────────────────────────────────────────────────────
// Decoupled viseme timeline (pose mode)
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::BuildVisemeTimeline()
{
if (TextVisemeSequence.Num() == 0) return;
// Use TOTAL active frames seen across all chunks (not just remaining queue).
// Frames already consumed by TickComponent are counted too, so the timeline
// is properly scaled to the full audio duration.
constexpr float WindowDurationSec = 512.0f / 16000.0f; // ~32ms
if (TotalActiveFramesSeen == 0) return;
const float AudioDurationSec = TotalActiveFramesSeen * WindowDurationSec;
// ── Subsample viseme sequence to natural speech rate ──────────────────
// Real speech has ~4-5 distinct mouth shapes per second (one per syllable
// nucleus). The text-to-viseme pipeline can produce 15-25 visemes for a
// short phrase, which at 1.2s audio = ~60ms each = saccades.
// Cap at ~10 visemes/sec (100ms minimum) — allows more phoneme detail
// while staying within natural French syllable rate (~8-10 shapes/sec).
constexpr float MinVisemeDurationSec = 0.100f;
const int32 MaxVisemes = FMath::Max(2, FMath::CeilToInt(AudioDurationSec / MinVisemeDurationSec));
TArray<FName> FinalSequence;
if (TextVisemeSequence.Num() > MaxVisemes)
{
// Subsample: take evenly-spaced visemes from the full sequence
for (int32 i = 0; i < MaxVisemes; ++i)
{
const int32 Idx = (i * (TextVisemeSequence.Num() - 1)) / FMath::Max(1, MaxVisemes - 1);
const FName& V = TextVisemeSequence[Idx];
// Skip consecutive duplicates
if (FinalSequence.Num() == 0 || FinalSequence.Last() != V)
FinalSequence.Add(V);
}
}
else
{
FinalSequence = TextVisemeSequence;
}
if (FinalSequence.Num() == 0) return;
// Compute natural durations from phoneme weights
float NaturalTotalSec = 0.0f;
for (const FName& V : FinalSequence)
{
NaturalTotalSec += GetVisemeDurationWeight(V) * 0.120f;
}
// Scale factor: match actual audio duration
const float Scale = (NaturalTotalSec > 0.01f) ? AudioDurationSec / NaturalTotalSec : 1.0f;
// If timeline is already playing, preserve absolute cursor position.
const float SavedCursor = bVisemeTimelineActive ? VisemeTimelineCursor : 0.0f;
// Build timeline entries with scaled durations
VisemeTimeline.Reset();
float CursorSec = 0.0f;
for (const FName& V : FinalSequence)
{
FVisemeTimelineEntry Entry;
Entry.Viseme = V;
Entry.StartSec = CursorSec;
Entry.DurationSec = GetVisemeDurationWeight(V) * 0.120f * Scale;
VisemeTimeline.Add(Entry);
CursorSec += Entry.DurationSec;
}
// Restore cursor: keep absolute position, clamped to new timeline
VisemeTimelineCursor = FMath::Min(SavedCursor, FMath::Max(0.0f, CursorSec - 0.001f));
bVisemeTimelineActive = true;
bTextVisemesApplied = true;
UE_LOG(LogElevenLabsLipSync, Log,
TEXT("Built viseme timeline: %d entries (from %d, max %d), audio=%.0fms, scale=%.2f → %.0fms/viseme avg"),
FinalSequence.Num(), TextVisemeSequence.Num(), MaxVisemes,
AudioDurationSec * 1000.0f, Scale,
(FinalSequence.Num() > 0) ? (CursorSec * 1000.0f / FinalSequence.Num()) : 0.0f);
}
void UElevenLabsLipSyncComponent::AnalyzeSpectrum() void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
{ {
if (!SpectrumAnalyzer) return; if (!SpectrumAnalyzer) return;

View File

@ -11,6 +11,15 @@ class UElevenLabsConversationalAgentComponent;
class UElevenLabsLipSyncPoseMap; class UElevenLabsLipSyncPoseMap;
class USkeletalMeshComponent; class USkeletalMeshComponent;
/** A single entry in the decoupled viseme timeline.
 * Built from text phoneme analysis, played back independently of audio chunks.
 * Entries are laid out back-to-back by BuildVisemeTimeline (StartSec of entry
 * i+1 equals StartSec + DurationSec of entry i). */
struct FVisemeTimelineEntry
{
	// OVR viseme name this entry displays (FName default-constructs to NAME_None).
	FName Viseme;
	// Absolute start time from utterance start, in seconds.
	// In-class initializer: a default-constructed entry must not carry
	// indeterminate float values if read before being assigned.
	float StartSec = 0.0f;
	// How long this viseme is held, in seconds.
	float DurationSec = 0.0f;
};
// Fired every tick when viseme/blendshape data has been updated. // Fired every tick when viseme/blendshape data has been updated.
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady); DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);
@ -66,6 +75,24 @@ public:
ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive.")) ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive."))
float SmoothingSpeed = 50.0f; float SmoothingSpeed = 50.0f;
// ── Audio Envelope ──────────────────────────────────────────────────────
/** Envelope attack time in milliseconds.
* Controls how fast the mouth opens when speech starts or gets louder.
* Lower = snappier onset, higher = gentler opening. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
meta = (ClampMin = "5.0", ClampMax = "100.0",
ToolTip = "Envelope attack (ms).\n10 = snappy, 15 = balanced, 30 = gentle.\nHow fast the mouth opens on speech onset."))
float EnvelopeAttackMs = 15.0f;
/** Envelope release time in milliseconds.
* Controls how slowly the mouth closes when speech gets quieter.
* Higher = smoother, more natural decay. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
meta = (ClampMin = "20.0", ClampMax = "500.0",
ToolTip = "Envelope release (ms).\n60 = responsive, 100 = balanced, 200 = smooth/cinematic.\nHow slowly the mouth closes between syllables."))
float EnvelopeReleaseMs = 100.0f;
// ── Phoneme Pose Map ───────────────────────────────────────────────────── // ── Phoneme Pose Map ─────────────────────────────────────────────────────
/** Optional pose map asset mapping OVR visemes to phoneme AnimSequences. /** Optional pose map asset mapping OVR visemes to phoneme AnimSequences.
@ -130,9 +157,14 @@ private:
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */ /** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
void ConvertTextToVisemes(const FString& Text); void ConvertTextToVisemes(const FString& Text);
/** Apply text-derived viseme shapes to the remaining queued frames. */ /** Apply text-derived viseme shapes to the remaining queued frames (non-pose mode). */
void ApplyTextVisemesToQueue(); void ApplyTextVisemesToQueue();
/** Build a decoupled viseme timeline from text (pose mode).
* Visemes get natural phoneme durations, evaluated continuously in Tick
* instead of being quantized to 32ms audio chunks. */
void BuildVisemeTimeline();
/** Extract frequency band energies from the spectrum analyzer. */ /** Extract frequency band energies from the spectrum analyzer. */
void AnalyzeSpectrum(); void AnalyzeSpectrum();
@ -198,6 +230,10 @@ private:
// Timer for consuming queued viseme frames at the FFT window rate // Timer for consuming queued viseme frames at the FFT window rate
float PlaybackTimer = 0.0f; float PlaybackTimer = 0.0f;
// Envelope follower state: smoothed amplitude that tracks speech dynamics
// with fast attack (mouth opens quickly) and slow release (closes gradually).
float AudioEnvelopeValue = 0.0f;
// Whether we have pending analysis results to process // Whether we have pending analysis results to process
bool bHasPendingAnalysis = false; bool bHasPendingAnalysis = false;
@ -214,6 +250,16 @@ private:
// Whether text-based visemes have been applied to the current queue // Whether text-based visemes have been applied to the current queue
bool bTextVisemesApplied = false; bool bTextVisemesApplied = false;
// ── Decoupled viseme timeline (pose mode) ────────────────────────────────
// In pose mode, text visemes are played from an independent timeline
// evaluated each tick at render framerate, instead of being quantized
// to 32ms audio chunk windows. Audio provides only the amplitude envelope.
TArray<FVisemeTimelineEntry> VisemeTimeline;
float VisemeTimelineCursor = 0.0f; // current playback position (seconds)
bool bVisemeTimelineActive = false; // true when timeline is playing
int32 TotalActiveFramesSeen = 0; // cumulative non-silent frames across all chunks
// Set when agent_response arrives (full text for this utterance). // Set when agent_response arrives (full text for this utterance).
// Prevents resetting AccumulatedText between audio chunks of the // Prevents resetting AccumulatedText between audio chunks of the
// SAME utterance — only reset once the full response is confirmed. // SAME utterance — only reset once the full response is confirmed.