v2.1.0: Decoupled viseme timeline with quintic crossfade and tuned dead zone

Decouple viseme timing from 32ms audio chunks by introducing an independent FVisemeTimelineEntry timeline evaluated at render framerate. Playback-time envelope tracking from consumed queue frames replaces arrival-time-only updates, with fast 40ms decay when the queue is empty.

- Viseme subsampling caps at ~10/sec (100ms min) to prevent saccades
- Full-duration quintic smootherstep crossfade (C2 continuous, no hold phase)
- Dead zone lowered to 0.15 for cleaner silence transitions
- TotalActiveFramesSeen cumulative counter for accurate timeline scaling
- Absolute cursor preservation on timeline rebuild
- Moderate Lerp smoothing (attack 0.55, release 0.40)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 16:46:36 +01:00 · 2026-02-24 16:46:36 +01:00 · e57be0a1d9
commit e57be0a1d9
parent aa3010bae8
3 changed files with 418 additions and 113 deletions
--- a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
+++ b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
@ -612,104 +612,233 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick

 	PlaybackTimer += DeltaTime;

+	bool bConsumedFrame = false;
+	float LastConsumedAmp = 0.0f;
+
 	while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0)
 	{
 		LastConsumedVisemes = VisemeQueue[0];
 		TargetVisemes = VisemeQueue[0];
+		if (AmplitudeQueue.Num() > 0)
+		{
+			LastConsumedAmp = AmplitudeQueue[0];
+			AmplitudeQueue.RemoveAt(0);
+		}
 		VisemeQueue.RemoveAt(0);
-		if (AmplitudeQueue.Num() > 0) AmplitudeQueue.RemoveAt(0);
 		PlaybackTimer -= WindowDuration;
+		bConsumedFrame = true;
 	}

-	// ── Inter-frame interpolation ─────────────────────────────────────────
-	// Instead of holding the same TargetVisemes for 32ms then jumping to the
-	// next frame, blend smoothly between the last consumed frame and the next
-	// queued frame. This prevents the "frantic" look from step-wise changes
-	// and creates continuous, natural-looking mouth motion.
-	if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0)
+	// ── Playback-time envelope update ─────────────────────────────────────
+	// The AudioEnvelopeValue was originally only set in OnAudioChunkReceived
+	// (at chunk ARRIVAL time). In timeline mode, we need it to track the
+	// amplitude of audio being PLAYED (consumed frames), not received.
+	// Also decay towards 0 when no audio is being consumed (Q=0),
+	// so the timeline pauses and the mouth closes properly.
 	{
-		const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f);
-		for (const FName& Name : VisemeNames)
+		const float EnvAttackCoeff = 1.0f - FMath::Exp(-WindowDuration
+			/ FMath::Max(0.001f, EnvelopeAttackMs * 0.001f));
+		const float EnvReleaseCoeff = 1.0f - FMath::Exp(-WindowDuration
+			/ FMath::Max(0.001f, EnvelopeReleaseMs * 0.001f));
+
+		if (bConsumedFrame)
 		{
-			const float From = LastConsumedVisemes.FindRef(Name);
-			const float To = VisemeQueue[0].FindRef(Name);
-			TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T);
+			// Update envelope from consumed frame amplitude (playback-synced)
+			const float Coeff = (LastConsumedAmp > AudioEnvelopeValue)
+				? EnvAttackCoeff : EnvReleaseCoeff;
+			AudioEnvelopeValue += (LastConsumedAmp - AudioEnvelopeValue) * Coeff;
+		}
+		else if (VisemeQueue.Num() == 0)
+		{
+			// No audio being played — fast decay to close mouth promptly.
+			// Uses 40ms time constant (faster than EnvelopeReleaseMs=100ms)
+			// so the mouth closes in ~120ms instead of ~600ms.
+			const float FastDecayCoeff = 1.0f - FMath::Exp(-DeltaTime / 0.040f);
+			AudioEnvelopeValue *= (1.0f - FastDecayCoeff);
+			if (AudioEnvelopeValue < 0.001f) AudioEnvelopeValue = 0.0f;
 		}
 	}

-	// If queue runs dry, decay towards silence and reset text state
-	if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
-	{
-		for (const FName& Name : VisemeNames)
-		{
-			TargetVisemes.FindOrAdd(Name) = 0.0f;
-		}
-		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
-		PlaybackTimer = 0.0f;
-
-		// Reset text state — but ONLY after the full response (agent_response)
-		// has arrived AND text was applied. This prevents destroying text between
-		// audio chunks of the SAME utterance: partial text arrives once, but
-		// ElevenLabs splits the audio into 2-3 chunks with gaps. Without
-		// bFullTextReceived, the text is erased after chunk 1's queue empties,
-		// leaving chunks 2-3 without text visemes (spectral fallback only).
-		if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
-		{
-			AccumulatedText.Reset();
-			TextVisemeSequence.Reset();
-			bTextVisemesApplied = false;
-			bFullTextReceived = false;
-		}
-	}
-
-	// ── Asymmetric smoothing ─────────────────────────────────────────────────
-	// At SmoothingSpeed=50: AttackSpeed=50 → alpha=0.83/frame, ~1-2 frames to target.
-	// ReleaseSpeed=32.5 → alpha=0.54/frame, ~3 frames to 70%. Mouth opens quickly,
-	// closes more gradually for natural-looking speech.
-	//
-	// In pose mode, use gentler smoothing: the spectral analysis oscillates
-	// between frames and with 50+ simultaneous curves per viseme, rapid
-	// tracking amplifies noise into visible vibration. Slower smoothing
-	// absorbs the oscillation while keeping movements deliberate and clean.
-	const bool bPoseSmoothing = (PoseExtractedCurveMap.Num() > 0);
-	const float AttackSpeed = SmoothingSpeed * (bPoseSmoothing ? 0.7f : 1.0f);
-	const float ReleaseSpeed = SmoothingSpeed * (bPoseSmoothing ? 0.45f : 0.65f);
+	const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0);
 	bool bAnyNonZero = false;

-	for (const FName& Name : VisemeNames)
+	if (bPoseMode && bVisemeTimelineActive && VisemeTimeline.Num() > 0)
 	{
-		float& Current = SmoothedVisemes.FindOrAdd(Name);
-		const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength;
+		// ── POSE MODE: Decoupled viseme timeline ─────────────────────────
+		// Visemes are evaluated from an independent timeline at render
+		// framerate, completely decoupled from 32ms audio chunk windows.
+		// Audio provides only the amplitude envelope for modulation.

-		const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed;
-		const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f);
+		// Pause timeline during silence gaps between TTS chunks.
+		// This keeps the timeline in sync when ElevenLabs sends audio
+		// in 2-3 chunks with ~2s gaps between them.
+		const bool bShouldPause = (VisemeQueue.Num() == 0 && AudioEnvelopeValue < 0.05f);
+		if (!bShouldPause)
+			VisemeTimelineCursor += DeltaTime;

-		Current = FMath::Lerp(Current, Target, Alpha);
-
-		// Snap to zero to avoid infinite tiny values
-		if (Current < 0.001f) Current = 0.0f;
-		if (Current > 0.001f) bAnyNonZero = true;
-	}
-
-	// Periodic viseme activity log (Verbose — enable with log verbosity for debugging)
-	static int32 TickLogCount = 0;
-	if (++TickLogCount % 30 == 1)
-	{
-		FName DominantViseme = FName("sil");
-		float DominantWeight = 0.0f;
-		for (const FName& Name : VisemeNames)
+		// Timeline end handling
+		const float TimelineEnd = VisemeTimeline.Last().StartSec + VisemeTimeline.Last().DurationSec;
+		if (VisemeTimelineCursor >= TimelineEnd)
 		{
-			const float W = SmoothedVisemes.FindOrAdd(Name);
-			if (W > DominantWeight)
+			// Clamp at end — envelope will close the mouth naturally
+			VisemeTimelineCursor = TimelineEnd - 0.001f;
+
+			// If queue is also dry, deactivate and reset text state
+			if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
 			{
-				DominantWeight = W;
-				DominantViseme = Name;
+				bVisemeTimelineActive = false;
+
+				if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
+				{
+					AccumulatedText.Reset();
+					TextVisemeSequence.Reset();
+					bTextVisemesApplied = false;
+					bFullTextReceived = false;
+				}
 			}
 		}

-		UE_LOG(LogElevenLabsLipSync, Verbose,
-			TEXT("LipSync: Queue=%d Viseme=%s(%.2f)"),
-			VisemeQueue.Num(), *DominantViseme.ToString(), DominantWeight);
+		// Dead zone: quadratic suppression of low envelope values
+		// to eliminate mouth trembling before silence transitions.
+		float EffectiveEnv = AudioEnvelopeValue;
+		const float DeadZone = 0.15f;
+		if (EffectiveEnv < DeadZone)
+		{
+			const float DzT = EffectiveEnv / DeadZone;
+			EffectiveEnv = DzT * DzT * DeadZone;
+		}
+
+		// Find current viseme entry in timeline
+		int32 CurrentIdx = VisemeTimeline.Num() - 1;
+		for (int32 i = 0; i < VisemeTimeline.Num(); ++i)
+		{
+			if (VisemeTimelineCursor < VisemeTimeline[i].StartSec + VisemeTimeline[i].DurationSec)
+			{
+				CurrentIdx = i;
+				break;
+			}
+		}
+
+		const FVisemeTimelineEntry& Entry = VisemeTimeline[CurrentIdx];
+		const float LocalProgress = FMath::Clamp(
+			(VisemeTimelineCursor - Entry.StartSec) / FMath::Max(0.001f, Entry.DurationSec),
+			0.0f, 1.0f);
+
+		const int32 NextIdx = FMath::Min(CurrentIdx + 1, VisemeTimeline.Num() - 1);
+		const FName& NextViseme = VisemeTimeline[NextIdx].Viseme;
+
+		// Full-duration crossfade with quintic smootherstep (C2 continuous).
+		// NO hold phase — the mouth is always in motion, transitioning from
+		// one viseme to the next over the entire duration. This eliminates
+		// the "static hold then snap" feel of partial crossfades.
+		//
+		// Quintic smootherstep: T³(6T²-15T+10) — smoother than smoothstep,
+		// zero 1st AND 2nd derivative at endpoints = no visible acceleration
+		// discontinuity at viseme boundaries.
+		float BlendToNext = 0.0f;
+		if (NextViseme != Entry.Viseme)
+		{
+			const float T = LocalProgress; // 0..1 over full duration
+			BlendToNext = T * T * T * (T * (T * 6.0f - 15.0f) + 10.0f);
+		}
+
+		// Set TargetVisemes from timeline × amplitude envelope
+		for (const FName& Name : VisemeNames)
+			TargetVisemes.FindOrAdd(Name) = 0.0f;
+
+		const float Amp = FMath::Max(EffectiveEnv, 0.0f);
+		TargetVisemes.FindOrAdd(Entry.Viseme) = Amp * (1.0f - BlendToNext);
+		if (BlendToNext > 0.0f)
+			TargetVisemes.FindOrAdd(NextViseme) = Amp * BlendToNext;
+
+		// Moderate Lerp smoothing — smooths transitions across 50+ pose curves.
+		// Attack=0.55: reaches target in ~3 frames (99ms at 30fps)
+		// Release=0.40: fades in ~5 frames (165ms at 30fps)
+		// Slower than before (was 0.90/0.70) to eliminate saccades while the
+		// smoothstep crossfade handles the primary transition shape.
+		const float PoseAttackAlpha = 0.55f;
+		const float PoseReleaseAlpha = 0.40f;
+
+		for (const FName& Name : VisemeNames)
+		{
+			float& Current = SmoothedVisemes.FindOrAdd(Name);
+			const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength;
+			const float Alpha = (Target > Current) ? PoseAttackAlpha : PoseReleaseAlpha;
+			Current = FMath::Lerp(Current, Target, Alpha);
+			if (Current < 0.005f) Current = 0.0f;
+			if (Current > 0.001f) bAnyNonZero = true;
+		}
+	}
+	else
+	{
+		// ── NON-POSE MODE (or pose mode without timeline) ────────────────
+		// Queue-based system: inter-frame interpolation + asymmetric smoothing.
+
+		// Inter-frame interpolation: blend between last consumed and next queued frame
+		if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0)
+		{
+			const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f);
+			for (const FName& Name : VisemeNames)
+			{
+				const float From = LastConsumedVisemes.FindRef(Name);
+				const float To = VisemeQueue[0].FindRef(Name);
+				TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T);
+			}
+		}
+
+		// Queue-dry: decay to silence and reset text state
+		if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
+		{
+			for (const FName& Name : VisemeNames)
+				TargetVisemes.FindOrAdd(Name) = 0.0f;
+			TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
+			PlaybackTimer = 0.0f;
+
+			if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
+			{
+				AccumulatedText.Reset();
+				TextVisemeSequence.Reset();
+				bTextVisemesApplied = false;
+				bFullTextReceived = false;
+			}
+		}
+
+		// Asymmetric smoothing (fast attack, slow release)
+		const float AttackSpeed = SmoothingSpeed;
+		const float ReleaseSpeed = SmoothingSpeed * 0.65f;
+
+		for (const FName& Name : VisemeNames)
+		{
+			float& Current = SmoothedVisemes.FindOrAdd(Name);
+			const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength;
+			const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed;
+			const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f);
+			Current = FMath::Lerp(Current, Target, Alpha);
+			if (Current < 0.001f) Current = 0.0f;
+			if (Current > 0.001f) bAnyNonZero = true;
+		}
+	}
+
+	// Real-time viseme debug log — every 3 ticks (~100ms at 30fps).
+	// Shows all active smoothed visemes + envelope to diagnose trembling.
+	static int32 TickLogCount = 0;
+	if (++TickLogCount % 3 == 0 && bAnyNonZero)
+	{
+		FString ActiveVisemes;
+		for (const FName& Name : VisemeNames)
+		{
+			const float W = SmoothedVisemes.FindOrAdd(Name);
+			if (W > 0.01f)
+			{
+				if (ActiveVisemes.Len() > 0) ActiveVisemes += TEXT(" ");
+				ActiveVisemes += FString::Printf(TEXT("%s=%.3f"), *Name.ToString(), W);
+			}
+		}
+		if (ActiveVisemes.IsEmpty()) ActiveVisemes = TEXT("(none)");
+
+		UE_LOG(LogElevenLabsLipSync, Log,
+			TEXT("VISEME Q=%d Env=%.3f TL=%.0fms | %s"),
+			VisemeQueue.Num(), AudioEnvelopeValue, VisemeTimelineCursor * 1000.0f, *ActiveVisemes);
 	}

 	// Convert visemes to ARKit blendshapes
@ -794,7 +923,14 @@ void UElevenLabsLipSyncComponent::OnAgentStopped()
 	VisemeQueue.Reset();
 	AmplitudeQueue.Reset();
 	PlaybackTimer = 0.0f;
+	AudioEnvelopeValue = 0.0f;
 	bWaitingForText = false;
+
+	// Deactivate viseme timeline (will be rebuilt on next utterance)
+	bVisemeTimelineActive = false;
+	VisemeTimeline.Reset();
+	VisemeTimelineCursor = 0.0f;
+	TotalActiveFramesSeen = 0;
 }

 void UElevenLabsLipSyncComponent::ResetToNeutral()
@ -803,6 +939,7 @@ void UElevenLabsLipSyncComponent::ResetToNeutral()
 	VisemeQueue.Reset();
 	AmplitudeQueue.Reset();
 	PlaybackTimer = 0.0f;
+	AudioEnvelopeValue = 0.0f;
 	bWaitingForText = false;

 	// Reset text-driven lip sync state for the interrupted utterance
@ -811,6 +948,12 @@ void UElevenLabsLipSyncComponent::ResetToNeutral()
 	bTextVisemesApplied = false;
 	bFullTextReceived = false;

+	// Reset decoupled viseme timeline
+	bVisemeTimelineActive = false;
+	VisemeTimeline.Reset();
+	VisemeTimelineCursor = 0.0f;
+	TotalActiveFramesSeen = 0;
+
 	// Snap all visemes to silence immediately (no smoothing delay)
 	for (const FName& Name : VisemeNames)
 	{
@ -883,29 +1026,27 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD

 	if (bPoseMode)
 	{
-		// ── Pose mode: hybrid amplitude ──────────────────────────────────
-		// Two amplitude levels serve different purposes:
+		// ── Pose mode: envelope-modulated amplitude ──────────────────────
+		// An envelope follower applied to per-window RMS creates a smooth
+		// amplitude curve that tracks speech dynamics:
 		//
-		// 1. CHUNK-LEVEL RMS → shape intensity (smooth, no per-window jitter).
-		//    All active frames use this amplitude so 50+ pose curves don't
-		//    vibrate from 32ms-level amplitude oscillation.
+		//   - Fast ATTACK: mouth opens quickly on speech onset / louder syllables
+		//   - Slow RELEASE: mouth closes gradually between syllables / pauses
+		//   - No per-window jitter: the envelope smooths 32ms-level oscillation
+		//     that would vibrate 50+ simultaneous pose curves
 		//
-		// 2. PER-WINDOW RMS → silence detection only (binary: speech or pause).
-		//    Detects intra-chunk silence windows that chunk-level averaging
-		//    would miss. This makes audio pauses (commas, breathing) visible
-		//    as the mouth properly closes during gaps.
+		// Raw per-window RMS is still used for silence detection (binary gate).
+		// The envelope value drives the shape intensity (how open the mouth is).
 		//
-		float ChunkSumSq = 0.0f;
-		for (int32 i = 0; i < NumSamples; ++i)
-			ChunkSumSq += FloatBuffer[i] * FloatBuffer[i];
-		const float ChunkRMS = FMath::Sqrt(ChunkSumSq / FMath::Max(1.0f, static_cast<float>(NumSamples)));
-		float ChunkAmplitude = FMath::Clamp(ChunkRMS * 10.0f, 0.0f, 1.5f);
-		ChunkAmplitude = FMath::Clamp(FMath::Pow(ChunkAmplitude, 0.4f), 0.0f, 1.0f);
-		ChunkAmplitude *= AmplitudeScale;
+		const float WindowDurationSec = static_cast<float>(WindowSize) / 16000.0f; // ~32ms
+		const float AttackCoeff = 1.0f - FMath::Exp(-WindowDurationSec
+			/ FMath::Max(0.001f, EnvelopeAttackMs * 0.001f));
+		const float ReleaseCoeff = 1.0f - FMath::Exp(-WindowDurationSec
+			/ FMath::Max(0.001f, EnvelopeReleaseMs * 0.001f));

 		for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize)
 		{
-			// Per-window amplitude for silence detection
+			// Per-window RMS amplitude
 			float WindowSumSq = 0.0f;
 			const int32 WindowEnd = FMath::Min(Offset + WindowSize, NumSamples);
 			const int32 WindowLen = WindowEnd - Offset;
@ -916,7 +1057,24 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
 			WindowAmp = FMath::Clamp(FMath::Pow(WindowAmp, 0.4f), 0.0f, 1.0f);
 			WindowAmp *= AmplitudeScale;

-			const bool bSilentWindow = (WindowAmp < 0.08f);
+			// Envelope follower: fast attack, slow release
+			const float Coeff = (WindowAmp > AudioEnvelopeValue) ? AttackCoeff : ReleaseCoeff;
+			AudioEnvelopeValue += (WindowAmp - AudioEnvelopeValue) * Coeff;
+
+			// Dead zone: quadratic suppression of low amplitudes.
+			// Low envelope values (0.08-0.15) produce tiny mouth movements
+			// that look like trembling before silence. The quadratic curve
+			// pushes these below the silence gate, creating a
+			// clean cut into silence instead of gradual fade-to-tremble.
+			const float DeadZone = 0.15f;
+			float EffectiveAmp = AudioEnvelopeValue;
+			if (EffectiveAmp < DeadZone)
+			{
+				const float T = EffectiveAmp / DeadZone;        // 0..1
+				EffectiveAmp = T * T * DeadZone;                 // Quadratic: low → ~0
+			}
+
+			const bool bSilentWindow = (EffectiveAmp < 0.08f);

 			TMap<FName, float> Frame;
 			for (const FName& Name : VisemeNames)
@ -925,17 +1083,18 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
 			if (bSilentWindow)
 			{
 				Frame.FindOrAdd(FName("sil")) = 1.0f;
-				AmplitudeQueue.Add(0.0f); // Marked silent for ApplyTextVisemesToQueue
+				AmplitudeQueue.Add(0.0f);
 			}
 			else
 			{
-				Frame.FindOrAdd(FName("aa")) = ChunkAmplitude; // Smooth chunk intensity
-				AmplitudeQueue.Add(ChunkAmplitude);
+				Frame.FindOrAdd(FName("aa")) = EffectiveAmp;
+				AmplitudeQueue.Add(EffectiveAmp);
+				TotalActiveFramesSeen++;
 			}

 			VisemeQueue.Add(Frame);
-			MinAmp = FMath::Min(MinAmp, bSilentWindow ? 0.0f : ChunkAmplitude);
-			MaxAmp = FMath::Max(MaxAmp, ChunkAmplitude);
+			MinAmp = FMath::Min(MinAmp, bSilentWindow ? 0.0f : EffectiveAmp);
+			MaxAmp = FMath::Max(MaxAmp, EffectiveAmp);
 			WindowsQueued++;
 		}
 	}
@ -1156,7 +1315,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
 		if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3)
 		{
 			// Text already available — apply and start playback immediately
-			ApplyTextVisemesToQueue();
+			if (bPoseMode)
+				BuildVisemeTimeline();
+			else
+				ApplyTextVisemesToQueue();
 			PlaybackTimer = 0.0f;
 			UE_LOG(LogElevenLabsLipSync, Verbose,
 				TEXT("Text already available (%d visemes). Starting lip sync immediately."),
@ -1176,7 +1338,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
 	else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0)
 	{
 		// Not a new utterance but text is available — apply to new frames
-		ApplyTextVisemesToQueue();
+		if (bPoseMode)
+			BuildVisemeTimeline();
+		else
+			ApplyTextVisemesToQueue();
 	}

 	UE_LOG(LogElevenLabsLipSync, Log,
@ -1216,9 +1381,13 @@ void UElevenLabsLipSyncComponent::OnPartialTextReceived(const FString& PartialTe
 	// apply text visemes to queued frames and start consuming.
 	if (bWaitingForText && TextVisemeSequence.Num() >= 3)
 	{
+		const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0);
 		if (VisemeQueue.Num() > 0)
 		{
-			ApplyTextVisemesToQueue();
+			if (bPoseMode)
+				BuildVisemeTimeline();
+			else
+				ApplyTextVisemesToQueue();
 		}
 		bWaitingForText = false;
 		PlaybackTimer = 0.0f; // Start consuming now
@ -1239,10 +1408,13 @@ void UElevenLabsLipSyncComponent::OnTextResponseReceived(const FString& Response
 	UE_LOG(LogElevenLabsLipSync, Log,
 		TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num());

-	// Apply to any remaining queued frames
-	if (VisemeQueue.Num() > 0)
+	// Apply to any remaining queued frames (or extend timeline in pose mode)
 	{
-		ApplyTextVisemesToQueue();
+		const bool bPoseMode = (PoseExtractedCurveMap.Num() > 0);
+		if (bPoseMode)
+			BuildVisemeTimeline();
+		else if (VisemeQueue.Num() > 0)
+			ApplyTextVisemesToQueue();
 	}

 	// If we were waiting for text to arrive before starting playback, start now
@ -1775,22 +1947,25 @@ void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue()
 			Frame.FindOrAdd(Name) = 0.0f;
 		}

-		// Anticipatory blending: in the last 30% of each viseme,
-		// gradually blend towards the next viseme shape.
-		const float BlendZone = 0.3f;
+		// Full crossfade with smoothstep in the last 40% of each viseme.
+		// The previous 30%/50%-max blend caused a discontinuity at viseme
+		// boundaries (50/50 → 0/100 jump), visible as trembling on 50+ curves.
+		// Now: full 0→100% crossfade with smoothstep (ease-in-out) curve
+		// eliminates any discontinuity at the boundary.
+		const float BlendZone = 0.4f;
 		float BlendToNext = 0.0f;
 		if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme)
 		{
-			BlendToNext = (LocalProgress - (1.0f - BlendZone)) / BlendZone;
+			const float T = (LocalProgress - (1.0f - BlendZone)) / BlendZone; // 0..1
+			BlendToNext = T * T * (3.0f - 2.0f * T); // Smoothstep: ease-in-out
 		}

-		// Primary viseme shape × amplitude
-		Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext * 0.5f);
+		// Crossfade: current viseme fades out, next fades in
+		Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext);

-		// Blend towards next viseme
 		if (BlendToNext > 0.0f)
 		{
-			Frame.FindOrAdd(NextViseme) += Amp * BlendToNext * 0.5f;
+			Frame.FindOrAdd(NextViseme) += Amp * BlendToNext;
 		}

 		ActiveIdx++;
@ -1806,6 +1981,90 @@ void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue()
 		FinalRatio, FinalRatio * 32.0f);
 }

+// ─────────────────────────────────────────────────────────────────────────────
+// Decoupled viseme timeline (pose mode)
+// ─────────────────────────────────────────────────────────────────────────────
+// Rebuilds VisemeTimeline from TextVisemeSequence, stretched to the audio seen so far.
+void UElevenLabsLipSyncComponent::BuildVisemeTimeline()
+{
+	if (TextVisemeSequence.Num() == 0) return; // no text visemes yet: leave any existing timeline untouched
+
+	// Scale against the cumulative count of non-silent frames queued so far
+	// (TotalActiveFramesSeen), not the current queue length, so a rebuild
+	// triggered mid-playback still covers frames that were already consumed.
+	constexpr float WindowDurationSec = 512.0f / 16000.0f; // ~32ms; NOTE(review): hard-coded — presumably must match the WindowSize / 16000 used in OnAudioChunkReceived, confirm
+	if (TotalActiveFramesSeen == 0) return; // no active audio yet: nothing to scale against
+
+	const float AudioDurationSec = TotalActiveFramesSeen * WindowDurationSec;
+
+	// ── Subsample viseme sequence to natural speech rate ──────────────────
+	// Real speech has ~4-5 distinct mouth shapes per second (one per syllable
+	// nucleus). The text-to-viseme pipeline can produce 15-25 visemes for a
+	// short phrase, which at 1.2s audio = ~60ms each = saccades.
+	// Cap at ~10 visemes/sec (100ms minimum) — allows more phoneme detail
+	// while staying within natural French syllable rate (~8-10 shapes/sec).
+	constexpr float MinVisemeDurationSec = 0.100f;
+	const int32 MaxVisemes = FMath::Max(2, FMath::CeilToInt(AudioDurationSec / MinVisemeDurationSec)); // never fewer than 2, however short the audio
+
+	TArray<FName> FinalSequence;
+	if (TextVisemeSequence.Num() > MaxVisemes)
+	{
+		// Subsample: take evenly-spaced visemes from the full sequence
+		for (int32 i = 0; i < MaxVisemes; ++i)
+		{
+			const int32 Idx = (i * (TextVisemeSequence.Num() - 1)) / FMath::Max(1, MaxVisemes - 1); // i=0 → first source viseme, i=MaxVisemes-1 → last
+			const FName& V = TextVisemeSequence[Idx];
+			// Skip consecutive duplicates (the result may hold fewer than MaxVisemes entries)
+			if (FinalSequence.Num() == 0 || FinalSequence.Last() != V)
+				FinalSequence.Add(V);
+		}
+	}
+	else
+	{
+		FinalSequence = TextVisemeSequence; // already within the cap: used as-is, consecutive duplicates are kept
+	}
+
+	if (FinalSequence.Num() == 0) return; // defensive: both branches above yield at least one entry
+
+	// Natural (unscaled) total: each viseme contributes its duration weight × 120ms
+	float NaturalTotalSec = 0.0f;
+	for (const FName& V : FinalSequence)
+	{
+		NaturalTotalSec += GetVisemeDurationWeight(V) * 0.120f;
+	}
+
+	// Scale factor so the entry durations sum to the audio duration (1.0 if the natural total is ~0)
+	const float Scale = (NaturalTotalSec > 0.01f) ? AudioDurationSec / NaturalTotalSec : 1.0f;
+
+	// If timeline is already playing, preserve absolute cursor position; otherwise start from 0.
+	const float SavedCursor = bVisemeTimelineActive ? VisemeTimelineCursor : 0.0f;
+
+	// Rebuild from scratch: entries are laid end to end, each starting where the previous one stops
+	VisemeTimeline.Reset();
+	float CursorSec = 0.0f;
+	for (const FName& V : FinalSequence)
+	{
+		FVisemeTimelineEntry Entry;
+		Entry.Viseme = V;
+		Entry.StartSec = CursorSec;
+		Entry.DurationSec = GetVisemeDurationWeight(V) * 0.120f * Scale;
+		VisemeTimeline.Add(Entry);
+		CursorSec += Entry.DurationSec;
+	}
+
+	// Restore cursor: keep absolute position, capped at 1ms before the end of the new timeline
+	VisemeTimelineCursor = FMath::Min(SavedCursor, FMath::Max(0.0f, CursorSec - 0.001f));
+
+	bVisemeTimelineActive = true; // TickComponent's pose-mode timeline path is gated on this flag
+	bTextVisemesApplied = true;
+
+	UE_LOG(LogElevenLabsLipSync, Log,
+		TEXT("Built viseme timeline: %d entries (from %d, max %d), audio=%.0fms, scale=%.2f → %.0fms/viseme avg"),
+		FinalSequence.Num(), TextVisemeSequence.Num(), MaxVisemes,
+		AudioDurationSec * 1000.0f, Scale,
+		(FinalSequence.Num() > 0) ? (CursorSec * 1000.0f / FinalSequence.Num()) : 0.0f);
+}
+
 void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
 {
 	if (!SpectrumAnalyzer) return;
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
@ -11,6 +11,15 @@ class UElevenLabsConversationalAgentComponent;
 class UElevenLabsLipSyncPoseMap;
 class USkeletalMeshComponent;

+/** A single entry in the decoupled viseme timeline: one viseme and the time slot it occupies.
+ *  Built from text phoneme analysis, played back independently of audio chunks. */
+struct FVisemeTimelineEntry
+{
+	FName Viseme;
+	float StartSec = 0.0f;     // start offset from the beginning of the timeline (seconds)
+	float DurationSec = 0.0f;  // length of this entry's time slot (seconds)
+};
+
 // Fired every tick when viseme/blendshape data has been updated.
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);

@ -66,6 +75,24 @@ public:
 		ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive."))
 	float SmoothingSpeed = 50.0f;

+	// ── Audio Envelope ──────────────────────────────────────────────────────
+
+	/** Envelope attack time in milliseconds.
+	 *  Controls how fast the mouth opens when speech starts or gets louder.
+	 *  Lower = snappier onset, higher = gentler opening. */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
+		meta = (ClampMin = "5.0", ClampMax = "100.0",
+		ToolTip = "Envelope attack (ms).\n10 = snappy, 15 = balanced, 30 = gentle.\nHow fast the mouth opens on speech onset."))
+	float EnvelopeAttackMs = 15.0f;
+
+	/** Envelope release time in milliseconds.
+	 *  Controls how slowly the mouth closes when speech gets quieter.
+	 *  Higher = smoother, more natural decay. */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
+		meta = (ClampMin = "20.0", ClampMax = "500.0",
+		ToolTip = "Envelope release (ms).\n60 = responsive, 100 = balanced, 200 = smooth/cinematic.\nHow slowly the mouth closes between syllables."))
+	float EnvelopeReleaseMs = 100.0f;
+
 	// ── Phoneme Pose Map ─────────────────────────────────────────────────────

 	/** Optional pose map asset mapping OVR visemes to phoneme AnimSequences.
@ -130,9 +157,14 @@ private:
 	/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
 	void ConvertTextToVisemes(const FString& Text);

-	/** Apply text-derived viseme shapes to the remaining queued frames. */
+	/** Apply text-derived viseme shapes to the remaining queued frames (non-pose mode). */
 	void ApplyTextVisemesToQueue();

+	/** Build a decoupled viseme timeline from text (pose mode).
+	 *  Visemes get natural phoneme durations, evaluated continuously in Tick
+	 *  instead of being quantized to 32ms audio chunks. */
+	void BuildVisemeTimeline();
+
 	/** Extract frequency band energies from the spectrum analyzer. */
 	void AnalyzeSpectrum();

@ -198,6 +230,10 @@ private:
 	// Timer for consuming queued viseme frames at the FFT window rate
 	float PlaybackTimer = 0.0f;

+	// Envelope follower state: smoothed amplitude that tracks speech dynamics
+	// with fast attack (mouth opens quickly) and slow release (closes gradually).
+	float AudioEnvelopeValue = 0.0f;
+
 	// Whether we have pending analysis results to process
 	bool bHasPendingAnalysis = false;

@ -214,6 +250,16 @@ private:
 	// Whether text-based visemes have been applied to the current queue
 	bool bTextVisemesApplied = false;

+	// ── Decoupled viseme timeline (pose mode) ────────────────────────────────
+	// In pose mode, text visemes are played from an independent timeline
+	// evaluated each tick at render framerate, instead of being quantized
+	// to 32ms audio chunk windows. Audio provides only the amplitude envelope.
+
+	TArray<FVisemeTimelineEntry> VisemeTimeline;
+	float VisemeTimelineCursor = 0.0f;      // current playback position (seconds)
+	bool bVisemeTimelineActive = false;      // true when timeline is playing
+	int32 TotalActiveFramesSeen = 0;        // cumulative non-silent frames across all chunks
+
 	// Set when agent_response arrives (full text for this utterance).
 	// Prevents resetting AccumulatedText between audio chunks of the
 	// SAME utterance — only reset once the full response is confirmed.