diff --git a/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap b/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap
index 1fe726f..b00a002 100644
Binary files a/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap and b/Unreal/PS_AI_Agent/Content/Demo_VoiceOnly.umap differ
diff --git a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
index 27f231c..432938f 100644
Binary files a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset and b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset differ
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
index c29d21f..4b25444 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -80,6 +80,25 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe
 		GeneratingTickCount = 0;
 	}
 
+	// Pre-buffer timer: start playback after the pre-buffer period expires.
+	// If the second TTS chunk didn't arrive in time, start playing with
+	// whatever we have. The silence padding will bridge any remaining gap.
+	if (bPreBuffering)
+	{
+		const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
+		if (Elapsed >= static_cast<double>(AudioPreBufferMs))
+		{
+			bPreBuffering = false;
+			UE_LOG(LogElevenLabsAgent, Log,
+				TEXT("[Turn %d] Pre-buffer timeout (%dms). Starting playback."),
+				LastClosedTurnIndex, AudioPreBufferMs);
+			if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
+			{
+				AudioPlaybackComponent->Play();
+			}
+		}
+	}
+
 	// Silence detection.
 	// ISSUE-8: broadcast OnAgentStoppedSpeaking OUTSIDE AudioQueueLock.
 	// OnProceduralUnderflow (audio thread) also acquires AudioQueueLock — if we broadcast
@@ -540,13 +559,32 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
 	USoundWaveProcedural* InProceduralWave, const int32 SamplesRequired)
 {
 	FScopeLock Lock(&AudioQueueLock);
-	if (AudioQueue.Num() == 0) return;
 
-	const int32 BytesRequired = SamplesRequired * sizeof(int16);
-	const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
+	if (AudioQueue.Num() > 0)
+	{
+		const int32 BytesRequired = SamplesRequired * sizeof(int16);
+		const int32 BytesToPush = FMath::Min(AudioQueue.Num(), BytesRequired);
 
-	InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
-	AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
+		InProceduralWave->QueueAudio(AudioQueue.GetData(), BytesToPush);
+		AudioQueue.RemoveAt(0, BytesToPush, EAllowShrinking::No);
+	}
+	else if (bAgentSpeaking)
+	{
+		// Queue is empty but agent is still speaking (TTS inter-batch gap).
+		// Feed a SMALL amount of silence to keep the audio component alive.
+		// IMPORTANT: we cap at 512 samples (32ms at 16kHz) regardless of
+		// SamplesRequired to avoid queuing large blocks of silence in the
+		// audio component's internal buffer. Without this cap, multiple
+		// underflow calls during a TTS gap accumulate hundreds of ms of silence
+		// that must be played through BEFORE real audio data — causing the
+		// audible 1s+ pause between TTS chunks. With 32ms chunks, at most
+		// one small silence block sits ahead of new audio when it arrives.
+		constexpr int32 MaxSilenceSamples = 512; // 32ms at 16kHz
+		const int32 SilenceSamples = FMath::Min(SamplesRequired, MaxSilenceSamples);
+		const int32 SilenceBytes = SilenceSamples * sizeof(int16);
+		SilenceBuffer.SetNumZeroed(SilenceBytes);
+		InProceduralWave->QueueAudio(SilenceBuffer.GetData(), SilenceBytes);
+	}
 }
 
 void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uint8>& PCMData)
@@ -573,10 +611,50 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
 
 		OnAgentStartedSpeaking.Broadcast();
 
+		if (AudioPreBufferMs > 0)
+		{
+			// Pre-buffer: accumulate audio before starting playback.
+			// This absorbs TTS inter-chunk gaps so chunk 2 arrives before
+			// chunk 1 finishes playing, eliminating mid-sentence pauses.
+			bPreBuffering = true;
+			PreBufferStartTime = FPlatformTime::Seconds();
+			UE_LOG(LogElevenLabsAgent, Log,
+				TEXT("[Turn %d] Pre-buffering %dms before starting playback."),
+				LastClosedTurnIndex, AudioPreBufferMs);
+		}
+		else if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
+		{
+			AudioPlaybackComponent->Play();
+		}
+	}
+	else if (bPreBuffering)
+	{
+		// Second (or later) audio chunk arrived during pre-buffer period.
+		// We now have both chunks buffered — start playback immediately.
+		bPreBuffering = false;
+		const double BufferedMs = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
+		UE_LOG(LogElevenLabsAgent, Log,
+			TEXT("[Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."),
+			LastClosedTurnIndex, BufferedMs);
 		if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
 		{
 			AudioPlaybackComponent->Play();
 		}
+		SilentTickCount = 0;
+	}
+	else
+	{
+		// Already speaking — but the audio component may have stopped due to
+		// buffer underrun (TTS inter-batch gap). Restart it if needed.
+		if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
+		{
+			UE_LOG(LogElevenLabsAgent, Warning,
+				TEXT("[Turn %d] Audio component stopped during speech (buffer underrun). Restarting playback."),
+				LastClosedTurnIndex);
+			AudioPlaybackComponent->Play();
+		}
+		// Reset silence counter — new audio arrived, we're not in a gap anymore
+		SilentTickCount = 0;
 	}
 }
 
@@ -592,6 +670,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
 	// while holding it would block the audio thread for the full Blueprint handler duration.
 	bool bWasSpeaking = false;
 	double Now = 0.0;
+	bPreBuffering = false; // Clear pre-buffer state on stop.
 	{
 		FScopeLock Lock(&AudioQueueLock);
 		AudioQueue.Empty();
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
index bcb4adf..3a0e0b1 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
@@ -33,136 +33,136 @@ TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::CreateVisemeToBlend
 	// PP — bilabial (P, B, M): lips pressed together
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("mouthClose"), 0.7f);
-		BS.Add(FName("mouthPressLeft"), 0.3f);
-		BS.Add(FName("mouthPressRight"), 0.3f);
+		BS.Add(FName("mouthClose"), 0.9f);
+		BS.Add(FName("mouthPressLeft"), 0.5f);
+		BS.Add(FName("mouthPressRight"), 0.5f);
 		Map.Add(FName("PP"), BS);
 	}
 
 	// FF — labiodental (F, V): lower lip tucked under upper teeth
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("mouthShrugLower"), 0.5f);
-		BS.Add(FName("mouthUpperUpLeft"), 0.3f);
-		BS.Add(FName("mouthUpperUpRight"), 0.3f);
-		BS.Add(FName("jawOpen"), 0.1f);
+		BS.Add(FName("mouthShrugLower"), 0.7f);
+		BS.Add(FName("mouthUpperUpLeft"), 0.4f);
+		BS.Add(FName("mouthUpperUpRight"), 0.4f);
+		BS.Add(FName("jawOpen"), 0.15f);
 		Map.Add(FName("FF"), BS);
 	}
 
 	// TH — dental (TH): tongue between teeth
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("tongueOut"), 0.4f);
-		BS.Add(FName("jawOpen"), 0.15f);
+		BS.Add(FName("tongueOut"), 0.5f);
+		BS.Add(FName("jawOpen"), 0.2f);
 		Map.Add(FName("TH"), BS);
 	}
 
 	// DD — alveolar (D, T, N): tongue on alveolar ridge
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.25f);
-		BS.Add(FName("mouthClose"), 0.2f);
-		BS.Add(FName("mouthLowerDownLeft"), 0.15f);
-		BS.Add(FName("mouthLowerDownRight"), 0.15f);
+		BS.Add(FName("jawOpen"), 0.35f);
+		BS.Add(FName("mouthClose"), 0.3f);
+		BS.Add(FName("mouthLowerDownLeft"), 0.25f);
+		BS.Add(FName("mouthLowerDownRight"), 0.25f);
 		Map.Add(FName("DD"), BS);
 	}
 
 	// kk — velar (K, G): back of tongue raised
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.25f);
-		BS.Add(FName("mouthStretchLeft"), 0.15f);
-		BS.Add(FName("mouthStretchRight"), 0.15f);
+		BS.Add(FName("jawOpen"), 0.35f);
+		BS.Add(FName("mouthStretchLeft"), 0.25f);
+		BS.Add(FName("mouthStretchRight"), 0.25f);
 		Map.Add(FName("kk"), BS);
 	}
 
 	// CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("mouthFunnel"), 0.45f);
-		BS.Add(FName("jawOpen"), 0.2f);
-		BS.Add(FName("mouthPucker"), 0.15f);
+		BS.Add(FName("mouthFunnel"), 0.65f);
+		BS.Add(FName("jawOpen"), 0.3f);
+		BS.Add(FName("mouthPucker"), 0.3f);
 		Map.Add(FName("CH"), BS);
 	}
 
 	// SS — alveolar fricative (S, Z): air through narrow channel
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("mouthStretchLeft"), 0.4f);
-		BS.Add(FName("mouthStretchRight"), 0.4f);
-		BS.Add(FName("jawOpen"), 0.1f);
-		BS.Add(FName("mouthSmileLeft"), 0.15f);
-		BS.Add(FName("mouthSmileRight"), 0.15f);
+		BS.Add(FName("mouthStretchLeft"), 0.6f);
+		BS.Add(FName("mouthStretchRight"), 0.6f);
+		BS.Add(FName("jawOpen"), 0.15f);
+		BS.Add(FName("mouthSmileLeft"), 0.3f);
+		BS.Add(FName("mouthSmileRight"), 0.3f);
 		Map.Add(FName("SS"), BS);
 	}
 
 	// nn — nasal (N, M, NG): soft palate lowered
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.15f);
-		BS.Add(FName("mouthClose"), 0.2f);
-		BS.Add(FName("mouthPressLeft"), 0.1f);
-		BS.Add(FName("mouthPressRight"), 0.1f);
+		BS.Add(FName("jawOpen"), 0.2f);
+		BS.Add(FName("mouthClose"), 0.35f);
+		BS.Add(FName("mouthPressLeft"), 0.2f);
+		BS.Add(FName("mouthPressRight"), 0.2f);
 		Map.Add(FName("nn"), BS);
 	}
 
 	// RR — retroflex/rhotic (R, L): tongue curled or lateral
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("mouthFunnel"), 0.3f);
-		BS.Add(FName("jawOpen"), 0.2f);
-		BS.Add(FName("mouthRollLower"), 0.15f);
+		BS.Add(FName("mouthFunnel"), 0.5f);
+		BS.Add(FName("jawOpen"), 0.3f);
+		BS.Add(FName("mouthRollLower"), 0.3f);
 		Map.Add(FName("RR"), BS);
 	}
 
 	// aa — open vowel (A as in "father"): wide open jaw
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.7f);
-		BS.Add(FName("mouthLowerDownLeft"), 0.4f);
-		BS.Add(FName("mouthLowerDownRight"), 0.4f);
-		BS.Add(FName("mouthShrugUpper"), 0.1f);
+		BS.Add(FName("jawOpen"), 0.85f);
+		BS.Add(FName("mouthLowerDownLeft"), 0.5f);
+		BS.Add(FName("mouthLowerDownRight"), 0.5f);
+		BS.Add(FName("mouthShrugUpper"), 0.15f);
 		Map.Add(FName("aa"), BS);
 	}
 
 	// E — mid front vowel (E as in "bed"): mid-open, spread lips
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.4f);
-		BS.Add(FName("mouthSmileLeft"), 0.3f);
-		BS.Add(FName("mouthSmileRight"), 0.3f);
-		BS.Add(FName("mouthLowerDownLeft"), 0.2f);
-		BS.Add(FName("mouthLowerDownRight"), 0.2f);
+		BS.Add(FName("jawOpen"), 0.5f);
+		BS.Add(FName("mouthSmileLeft"), 0.5f);
+		BS.Add(FName("mouthSmileRight"), 0.5f);
+		BS.Add(FName("mouthLowerDownLeft"), 0.3f);
+		BS.Add(FName("mouthLowerDownRight"), 0.3f);
 		Map.Add(FName("E"), BS);
 	}
 
 	// ih — close front vowel (I as in "sit"): narrow opening, spread lips
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.2f);
-		BS.Add(FName("mouthSmileLeft"), 0.25f);
-		BS.Add(FName("mouthSmileRight"), 0.25f);
-		BS.Add(FName("mouthStretchLeft"), 0.1f);
-		BS.Add(FName("mouthStretchRight"), 0.1f);
+		BS.Add(FName("jawOpen"), 0.25f);
+		BS.Add(FName("mouthSmileLeft"), 0.45f);
+		BS.Add(FName("mouthSmileRight"), 0.45f);
+		BS.Add(FName("mouthStretchLeft"), 0.2f);
+		BS.Add(FName("mouthStretchRight"), 0.2f);
 		Map.Add(FName("ih"), BS);
 	}
 
 	// oh — mid back vowel (O as in "go"): rounded lips, open jaw
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("jawOpen"), 0.5f);
-		BS.Add(FName("mouthFunnel"), 0.5f);
-		BS.Add(FName("mouthLowerDownLeft"), 0.2f);
-		BS.Add(FName("mouthLowerDownRight"), 0.2f);
+		BS.Add(FName("jawOpen"), 0.6f);
+		BS.Add(FName("mouthFunnel"), 0.7f);
+		BS.Add(FName("mouthLowerDownLeft"), 0.3f);
+		BS.Add(FName("mouthLowerDownRight"), 0.3f);
 		Map.Add(FName("oh"), BS);
 	}
 
 	// ou — close back vowel (OO as in "boot"): tightly rounded lips
 	{
 		TMap<FName, float> BS;
-		BS.Add(FName("mouthPucker"), 0.6f);
-		BS.Add(FName("mouthFunnel"), 0.4f);
-		BS.Add(FName("jawOpen"), 0.15f);
+		BS.Add(FName("mouthPucker"), 0.8f);
+		BS.Add(FName("mouthFunnel"), 0.6f);
+		BS.Add(FName("jawOpen"), 0.2f);
 		Map.Add(FName("ou"), BS);
 	}
 
@@ -220,7 +220,20 @@ void UElevenLabsLipSyncComponent::BeginPlay()
 		AgentComponent = Agent;
 		AudioDataHandle = Agent->OnAgentAudioData.AddUObject(
 			this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived);
-		UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName());
+
+		// Bind to text response delegates for text-driven lip sync.
+		// Partial text (streaming) provides text BEFORE audio arrives.
+		// Full text provides the complete sentence (arrives just after audio).
+		Agent->OnAgentPartialResponse.AddDynamic(
+			this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
+		Agent->OnAgentTextResponse.AddDynamic(
+			this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
+
+		// Enable partial response streaming if not already enabled
+		Agent->bEnableAgentPartialResponse = true;
+
+		UE_LOG(LogElevenLabsLipSync, Log,
+			TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName());
 	}
 	else
 	{
@@ -368,10 +381,17 @@ void UElevenLabsLipSyncComponent::BeginPlay()
 void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
 {
 	// Unbind from agent component
-	if (AgentComponent.IsValid() && AudioDataHandle.IsValid())
+	if (AgentComponent.IsValid())
 	{
-		AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
-		AudioDataHandle.Reset();
+		if (AudioDataHandle.IsValid())
+		{
+			AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
+			AudioDataHandle.Reset();
+		}
+		AgentComponent->OnAgentPartialResponse.RemoveDynamic(
+			this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
+		AgentComponent->OnAgentTextResponse.RemoveDynamic(
+			this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
 	}
 	AgentComponent.Reset();
 	SpectrumAnalyzer.Reset();
@@ -388,31 +408,156 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
 {
 	Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
 
-	// Smooth viseme weights towards targets using exponential interpolation
-	const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f);
+	// ── Consume queued viseme analysis frames at the FFT window rate ─────────
+	// Each 512-sample FFT window at 16kHz = 32ms of audio.
+	// We consume one queued frame every 32ms to match the original audio timing.
+	constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s
+
+	// Pre-buffer sync: don't consume viseme queue while the agent component is
+	// pre-buffering audio. This keeps lip sync in sync with audio playback.
+	// Without this, the lip sync would start ahead of the audio by the pre-buffer duration (e.g. 250ms).
+	if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
+	{
+		return;
+	}
+
+	// Wait-for-text: hold playback until text arrives so all frames get proper
+	// text-driven visemes. Timeout after 500ms and start with spectral shapes.
+	if (bWaitingForText)
+	{
+		const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime;
+		if (WaitElapsed >= 0.5)
+		{
+			// Timeout — start playback with spectral shapes as fallback
+			bWaitingForText = false;
+			PlaybackTimer = 0.0f;
+			UE_LOG(LogElevenLabsLipSync, Warning,
+				TEXT("Text wait timeout (%.0fms). Starting lip sync with spectral shapes (Queue=%d)."),
+				WaitElapsed * 1000.0, VisemeQueue.Num());
+		}
+		else
+		{
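+			// Still waiting — reset the timer each tick to hold the queue. NOTE(review): DeltaTime is still added below, so a tick with DeltaTime >= WindowDuration consumes a frame anyway.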
+			PlaybackTimer = 0.0f;
+		}
+	}
+
+	PlaybackTimer += DeltaTime;
+
+	while (PlaybackTimer >= WindowDuration && VisemeQueue.Num() > 0)
+	{
+		LastConsumedVisemes = VisemeQueue[0];
+		TargetVisemes = VisemeQueue[0];
+		VisemeQueue.RemoveAt(0);
+		if (AmplitudeQueue.Num() > 0) AmplitudeQueue.RemoveAt(0);
+		PlaybackTimer -= WindowDuration;
+	}
+
+	// ── Inter-frame interpolation ─────────────────────────────────────────
+	// Instead of holding the same TargetVisemes for 32ms then jumping to the
+	// next frame, blend smoothly between the last consumed frame and the next
+	// queued frame. This prevents the "frantic" look from step-wise changes
+	// and creates continuous, natural-looking mouth motion.
+	if (VisemeQueue.Num() > 0 && LastConsumedVisemes.Num() > 0)
+	{
+		const float T = FMath::Clamp(PlaybackTimer / WindowDuration, 0.0f, 1.0f);
+		for (const FName& Name : VisemeNames)
+		{
+			const float From = LastConsumedVisemes.FindRef(Name);
+			const float To = VisemeQueue[0].FindRef(Name);
+			TargetVisemes.FindOrAdd(Name) = FMath::Lerp(From, To, T);
+		}
+	}
+
+	// If queue runs dry, decay towards silence and reset text state
+	if (VisemeQueue.Num() == 0 && PlaybackTimer > WindowDuration * 3.0f)
+	{
+		for (const FName& Name : VisemeNames)
+		{
+			TargetVisemes.FindOrAdd(Name) = 0.0f;
+		}
+		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
+		PlaybackTimer = 0.0f;
+
+		// Reset text state — but ONLY after the full response (agent_response)
+		// has arrived AND text was applied. This prevents destroying text between
+		// audio chunks of the SAME utterance: partial text arrives once, but
+		// ElevenLabs splits the audio into 2-3 chunks with gaps. Without
+		// bFullTextReceived, the text is erased after chunk 1's queue empties,
+		// leaving chunks 2-3 without text visemes (spectral fallback only).
+		if (AccumulatedText.Len() > 0 && bTextVisemesApplied && bFullTextReceived)
+		{
+			AccumulatedText.Reset();
+			TextVisemeSequence.Reset();
+			bTextVisemesApplied = false;
+			bFullTextReceived = false;
+		}
+	}
+
+	// ── Asymmetric smoothing ─────────────────────────────────────────────────
+	// At SmoothingSpeed=15 and a 16ms frame: AttackSpeed=15 → alpha=0.24/frame, ~4 frames to 70%.
+	// ReleaseSpeed=9.75 (0.65×) → alpha≈0.16/frame, ~7 frames to 70%. Mouth opens quickly,
+	// closes more gradually for natural-looking speech.
+	const float AttackSpeed = SmoothingSpeed * 1.0f;
+	const float ReleaseSpeed = SmoothingSpeed * 0.65f;
 	bool bAnyNonZero = false;
 
 	for (const FName& Name : VisemeNames)
 	{
 		float& Current = SmoothedVisemes.FindOrAdd(Name);
-		const float Target = TargetVisemes.FindOrAdd(Name);
+		const float Target = TargetVisemes.FindOrAdd(Name) * LipSyncStrength;
 
-		Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha);
+		const float Speed = (Target > Current) ? AttackSpeed : ReleaseSpeed;
+		const float Alpha = FMath::Clamp(DeltaTime * Speed, 0.0f, 1.0f);
+
+		Current = FMath::Lerp(Current, Target, Alpha);
 
 		// Snap to zero to avoid infinite tiny values
 		if (Current < 0.001f) Current = 0.0f;
 		if (Current > 0.001f) bAnyNonZero = true;
 	}
 
-	// "sil" uses LipSyncStrength=1 always — it's the rest pose
-	SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp(
-		SmoothedVisemes.FindOrAdd(FName("sil")),
-		TargetVisemes.FindOrAdd(FName("sil")),
-		Alpha);
+	// Periodic viseme activity log (Verbose — enable with log verbosity for debugging)
+	static int32 TickLogCount = 0;
+	if (++TickLogCount % 30 == 1)
+	{
+		FName DominantViseme = FName("sil");
+		float DominantWeight = 0.0f;
+		for (const FName& Name : VisemeNames)
+		{
+			const float W = SmoothedVisemes.FindOrAdd(Name);
+			if (W > DominantWeight)
+			{
+				DominantWeight = W;
+				DominantViseme = Name;
+			}
+		}
+
+		UE_LOG(LogElevenLabsLipSync, Verbose,
+			TEXT("LipSync: Queue=%d Viseme=%s(%.2f)"),
+			VisemeQueue.Num(), *DominantViseme.ToString(), DominantWeight);
+	}
 
 	// Convert visemes to ARKit blendshapes
 	MapVisemesToBlendshapes();
 
+	// ── Additional blendshape-level smoothing ─────────────────────────────
+	// A second smoothing pass on the final ARKit blendshape values removes
+	// residual jitter from the OVR→ARKit mapping step. Its alpha is 0.4× the
+	// viseme-level attack alpha, so it responds more slowly and provides a natural "soft" look.
+	{
+		const float BSmoothAlpha = FMath::Clamp(DeltaTime * SmoothingSpeed * 0.4f, 0.0f, 1.0f);
+		for (auto& Pair : CurrentBlendshapes)
+		{
+			const float* Prev = PreviousBlendshapes.Find(Pair.Key);
+			if (Prev)
+			{
+				Pair.Value = FMath::Lerp(*Prev, Pair.Value, BSmoothAlpha);
+			}
+		}
+		PreviousBlendshapes = CurrentBlendshapes;
+	}
+
 	// Auto-apply morph targets if a target mesh is set
 	if (TargetMesh)
 	{
@@ -438,11 +583,10 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
 	const int16* Samples = reinterpret_cast<const int16*>(PCMData.GetData());
 	const int32 NumSamples = PCMData.Num() / sizeof(int16);
 
-	// DEBUG: log first audio chunk received
 	static bool bFirstChunkLogged = false;
 	if (!bFirstChunkLogged)
 	{
-		UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
+		UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
 		bFirstChunkLogged = true;
 	}
 
@@ -452,14 +596,819 @@ void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMD
 		FloatBuffer.Add(static_cast<float>(Samples[i]) / 32768.0f);
 	}
 
-	// Feed to rolling FFT analyzer
+	// ── STEP 1: ONE spectral analysis for the whole chunk (SHAPE) ─────────
+	// The FSpectrumAnalyzer's ring buffer returns nearly identical results for
+	// sequential 512-sample pushes. So we analyze the chunk as a whole to
+	// determine which mouth shape (viseme blend) to use.
 	SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples);
+	SpectrumAnalyzer->PerformAnalysisIfPossible(true);
+	AnalyzeSpectrum(); // Sets TargetVisemes with shape-only weights (~1.0)
 
-	// Try to perform analysis (returns true when enough data for one FFT window)
-	if (SpectrumAnalyzer->PerformAnalysisIfPossible(true))
+	// Save the spectral shape for this chunk
+	TMap<FName, float> ChunkShape = TargetVisemes;
+
+	// ── Late start fix: when queue was empty, delay playback to wait for text ──
+	// Partial text usually arrives 50-100ms before audio, but sometimes audio
+	// comes first. A small delay gives text time to arrive and be applied to
+	// the first frames, preventing mute mouth at utterance start.
+	const bool bQueueWasEmpty = (VisemeQueue.Num() == 0);
+
+	// ── STEP 2: Per-window amplitude + ZCR (DYNAMICS + VARIATION) ─────────
+	// For each 512-sample window (~32ms), compute:
+	//   - RMS amplitude: captures syllable rhythm (natural opening/closing)
+	//   - Zero-crossing rate: detects sibilants/fricatives within the chunk
+	// The shape (which visemes) stays constant per chunk, but the amplitude
+	// (how much) varies per window, creating realistic speech dynamics.
+	constexpr int32 WindowSize = 512;
+	int32 WindowsQueued = 0;
+	float MinAmp = 1.0f, MaxAmp = 0.0f; // For debug logging
+
+	for (int32 Offset = 0; Offset + WindowSize <= NumSamples; Offset += WindowSize)
 	{
-		AnalyzeSpectrum();
+		// RMS amplitude for this window
+		float SumSquares = 0.0f;
+		int32 ZeroCrossings = 0;
+		for (int32 i = 0; i < WindowSize; ++i)
+		{
+			const float S = FloatBuffer[Offset + i];
+			SumSquares += S * S;
+			if (i > 0 && ((S >= 0.0f) != (FloatBuffer[Offset + i - 1] >= 0.0f)))
+				ZeroCrossings++;
+		}
+
+		const float WindowRMS = FMath::Sqrt(SumSquares / static_cast<float>(WindowSize));
+		const float ZCR = static_cast<float>(ZeroCrossings) / static_cast<float>(WindowSize - 1);
+
+		// Normalize amplitude: typical speech RMS at 16-bit is 0.02-0.15.
+		// Scale up and apply power curve for dynamic range compression.
+		// pow(0.4) compresses more than sqrt (0.5): quiet parts become more
+		// visible while loud parts are slightly reduced. This ensures the
+		// first part of a TTS response (often quieter) has adequate lip movement.
+		float Amplitude = FMath::Clamp(WindowRMS * 10.0f, 0.0f, 1.5f);
+		Amplitude = FMath::Clamp(FMath::Pow(Amplitude, 0.4f), 0.0f, 1.0f);
+
+		// Apply user-configurable amplitude attenuation (AmplitudeScale 0-1).
+		// This reduces overall mouth movement intensity without changing the
+		// viseme shape, giving control over "how much" the mouth opens.
+		Amplitude *= AmplitudeScale;
+
+		MinAmp = FMath::Min(MinAmp, Amplitude);
+		MaxAmp = FMath::Max(MaxAmp, Amplitude);
+
+		// Build this window's viseme frame
+		TMap<FName, float> WindowVisemes;
+
+		if (Amplitude < 0.08f)
+		{
+			// Silence — mouth closed (between syllables / pauses)
+			for (const FName& Name : VisemeNames)
+				WindowVisemes.Add(Name, 0.0f);
+			WindowVisemes.FindOrAdd(FName("sil")) = 1.0f;
+		}
+		else
+		{
+			// Active speech — determine shape and scale by amplitude
+
+			// High ZCR (>0.15) suggests fricative/sibilant energy.
+			// This provides within-chunk shape variation: voiced segments
+			// use the spectral shape, fricative segments override to SS/FF.
+			if (ZCR > 0.15f)
+			{
+				for (const FName& Name : VisemeNames)
+					WindowVisemes.Add(Name, 0.0f);
+
+				float SibStrength = FMath::Clamp((ZCR - 0.15f) * 5.0f, 0.0f, 1.0f);
+				WindowVisemes.FindOrAdd(FName("SS")) = SibStrength * Amplitude;
+				WindowVisemes.FindOrAdd(FName("FF")) = (1.0f - SibStrength) * Amplitude * 0.5f;
+				WindowVisemes.FindOrAdd(FName("ih")) = (1.0f - SibStrength) * Amplitude * 0.3f;
+
+				// Blend in the chunk shape at reduced weight for non-sibilant visemes
+				for (const FName& Name : VisemeNames)
+				{
+					if (Name != FName("SS") && Name != FName("FF") && Name != FName("ih") && Name != FName("sil"))
+					{
+						WindowVisemes.FindOrAdd(Name) += ChunkShape.FindRef(Name) * Amplitude * (1.0f - SibStrength) * 0.4f;
+					}
+				}
+			}
+			else
+			{
+				// Voiced segment — use chunk spectral shape scaled by amplitude.
+				// This creates the primary speech animation: syllable rhythm
+				// from amplitude, mouth shape from spectral analysis.
+				for (const FName& Name : VisemeNames)
+				{
+					if (Name == FName("sil"))
+					{
+						WindowVisemes.Add(Name, 0.0f);
+					}
+					else
+					{
+						WindowVisemes.Add(Name, ChunkShape.FindRef(Name) * Amplitude);
+					}
+				}
+			}
+		}
+
+		VisemeQueue.Add(WindowVisemes);
+		AmplitudeQueue.Add(Amplitude);
+		WindowsQueued++;
 	}
+
+	// ── Pseudo-speech fallback (no text available) ──────────────────────
+	// When text visemes are not available (server doesn't send partial text,
+	// or text arrives much later than audio), create natural-looking mouth
+	// movement by cycling through vowel/consonant shapes at speech rate.
+	// This is MUCH better than the single spectral shape (one shape for the
+	// entire chunk). If text arrives later, ApplyTextVisemesToQueue() will
+	// overwrite these frames with proper text-driven visemes.
+	if (TextVisemeSequence.Num() == 0 && WindowsQueued > 0)
+	{
+		// Vowel/consonant alternation at ~6 syllables/second (1 / 160ms ≈ 6.25).
+		// Each "syllable" = 3 frames vowel + 2 frames consonant = 5 frames × 32ms = 160ms.
+		static const FName VowelShapes[] = { FName("aa"), FName("oh"), FName("E"), FName("ih"), FName("ou") };
+		static const FName ConsonantShapes[] = { FName("nn"), FName("PP"), FName("DD"), FName("kk"), FName("RR") };
+		constexpr int32 NumShapes = 5;
+		constexpr int32 VowelFrames = 3;    // ~96ms open
+		constexpr int32 ConsonantFrames = 2; // ~64ms transition
+		constexpr int32 SyllableFrames = VowelFrames + ConsonantFrames; // ~160ms
+
+		int32 StartIdx = VisemeQueue.Num() - WindowsQueued;
+		int32 ActiveCount = 0;
+		int32 PseudoCount = 0;
+
+		for (int32 Idx = StartIdx; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx)
+		{
+			const float Amp = AmplitudeQueue[Idx];
+			if (Amp < 0.08f) continue; // Keep silent frames as-is
+
+			const int32 SyllableIdx = ActiveCount / SyllableFrames;
+			const int32 FrameInSyllable = ActiveCount % SyllableFrames;
+			const int32 ShapeIdx = SyllableIdx % NumShapes;
+
+			TMap<FName, float>& Frame = VisemeQueue[Idx];
+			for (const FName& Name : VisemeNames)
+				Frame.FindOrAdd(Name) = 0.0f;
+
+			if (FrameInSyllable < VowelFrames)
+			{
+				// Vowel phase — mouth open
+				const FName Vowel = VowelShapes[ShapeIdx];
+				Frame.FindOrAdd(Vowel) = Amp;
+
+				// Anticipatory blend in last vowel frame towards consonant
+				if (FrameInSyllable == VowelFrames - 1)
+				{
+					const FName Consonant = ConsonantShapes[ShapeIdx];
+					Frame.FindOrAdd(Vowel) = Amp * 0.7f;
+					Frame.FindOrAdd(Consonant) = Amp * 0.3f;
+				}
+			}
+			else
+			{
+				// Consonant/transition phase — mouth partially closed
+				const FName Consonant = ConsonantShapes[ShapeIdx];
+				Frame.FindOrAdd(Consonant) = Amp * 0.7f;
+
+				// Anticipatory blend towards next vowel in last consonant frame
+				if (FrameInSyllable == SyllableFrames - 1)
+				{
+					const int32 NextShapeIdx = (SyllableIdx + 1) % NumShapes;
+					const FName NextVowel = VowelShapes[NextShapeIdx];
+					Frame.FindOrAdd(NextVowel) = Amp * 0.3f;
+				}
+			}
+
+			ActiveCount++;
+			PseudoCount++;
+		}
+
+		if (PseudoCount > 0)
+		{
+			UE_LOG(LogElevenLabsLipSync, Verbose,
+				TEXT("Pseudo-speech: %d active frames (%d syllables)"),
+				PseudoCount, (PseudoCount + SyllableFrames - 1) / SyllableFrames);
+		}
+	}
+
+	// ── Late start fix + wait-for-text ───────────────────────────────────
+	// When a new utterance begins (queue was empty):
+	// 1) Override leading silent frames (TTS fade-in) with minimum amplitude
+	// 2) If text hasn't arrived yet, hold playback until it does (max 500ms)
+	//    This ensures ALL frames get text-driven visemes from the start.
+	if (bQueueWasEmpty && WindowsQueued > 0)
+	{
+		// Override leading silent frames with minimum amplitude
+		constexpr float MinStartAmplitude = 0.15f;
+		int32 FixedCount = 0;
+		for (int32 Idx = 0; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx)
+		{
+			if (AmplitudeQueue[Idx] >= 0.08f)
+				break; // Stop at first naturally active frame
+
+			AmplitudeQueue[Idx] = MinStartAmplitude;
+			TMap<FName, float>& Frame = VisemeQueue[Idx];
+			for (const FName& Name : VisemeNames)
+			{
+				Frame.FindOrAdd(Name) = (Name == FName("sil"))
+					? 0.0f
+					: ChunkShape.FindRef(Name) * MinStartAmplitude;
+			}
+			FixedCount++;
+		}
+
+		if (FixedCount > 0)
+		{
+			UE_LOG(LogElevenLabsLipSync, Verbose,
+				TEXT("Late start fix: overrode %d leading silent frames with min amplitude %.2f"),
+				FixedCount, MinStartAmplitude);
+		}
+
+		// If text is already available (from partial responses arriving before audio),
+		// apply it immediately and start playback.
+		// Otherwise, hold playback until text arrives (wait-for-text mechanism).
+		if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() >= 3)
+		{
+			// Text already available — apply and start playback immediately
+			ApplyTextVisemesToQueue();
+			PlaybackTimer = 0.0f;
+			UE_LOG(LogElevenLabsLipSync, Verbose,
+				TEXT("Text already available (%d visemes). Starting lip sync immediately."),
+				TextVisemeSequence.Num());
+		}
+		else
+		{
+			// No text yet — hold playback until text arrives or timeout
+			bWaitingForText = true;
+			WaitingForTextStartTime = FPlatformTime::Seconds();
+			PlaybackTimer = 0.0f;
+			UE_LOG(LogElevenLabsLipSync, Log,
+				TEXT("Waiting for text before starting lip sync (%d frames queued)."),
+				WindowsQueued);
+		}
+	}
+	else if (AccumulatedText.Len() > 0 && TextVisemeSequence.Num() > 0)
+	{
+		// Not a new utterance but text is available — apply to new frames
+		ApplyTextVisemesToQueue();
+	}
+
+	UE_LOG(LogElevenLabsLipSync, Log,
+		TEXT("Audio chunk: %d samples → %d windows | Amp=[%.2f-%.2f] | Queue=%d (%.1fs) | TextVisemes=%d"),
+		NumSamples, WindowsQueued,
+		MinAmp, MaxAmp, VisemeQueue.Num(),
+		VisemeQueue.Num() * (512.0f / 16000.0f), TextVisemeSequence.Num());
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Text-driven lip sync
+// ─────────────────────────────────────────────────────────────────────────────
+
+void UElevenLabsLipSyncComponent::OnPartialTextReceived(const FString& PartialText)
+{
+	// bFullTextReceived is set by OnTextResponseReceived, so a fragment arriving
+	// after it belongs to a NEW utterance — discard the old text state first.
+	if (bFullTextReceived)
+	{
+		AccumulatedText.Reset();
+		TextVisemeSequence.Reset();
+		bTextVisemesApplied = false;
+		bFullTextReceived = false;
+	}
+
+	// Append this fragment to the text gathered so far for the current utterance.
+	AccumulatedText += PartialText;
+
+	// Rebuild the whole viseme sequence from the accumulated text (not just the new fragment).
+	ConvertTextToVisemes(AccumulatedText);
+
+	UE_LOG(LogElevenLabsLipSync, Log,
+		TEXT("Partial text: \"%s\" → %d visemes (accumulated: \"%s\")"),
+		*PartialText, TextVisemeSequence.Num(), *AccumulatedText);
+
+	// If playback is being held for text, release it once at least 3 visemes
+	// exist: retarget any queued frames, then clear the wait flag.
+	if (bWaitingForText && TextVisemeSequence.Num() >= 3)
+	{
+		if (VisemeQueue.Num() > 0)
+		{
+			ApplyTextVisemesToQueue();
+		}
+		bWaitingForText = false;
+		PlaybackTimer = 0.0f; // Restart the frame-consumption timer
+		const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime;
+		UE_LOG(LogElevenLabsLipSync, Verbose,
+			TEXT("Text arrived after %.0fms wait. Starting lip sync playback."),
+			WaitElapsed * 1000.0);
+	}
+}
+
+void UElevenLabsLipSyncComponent::OnTextResponseReceived(const FString& ResponseText)
+{
+	// The full response replaces whatever partial fragments were accumulated.
+	bFullTextReceived = true;
+	AccumulatedText = ResponseText;
+	ConvertTextToVisemes(ResponseText);
+
+	UE_LOG(LogElevenLabsLipSync, Log,
+		TEXT("Full text: \"%s\" → %d visemes"), *ResponseText, TextVisemeSequence.Num());
+
+	// Retarget any frames still sitting in the queue with the final text.
+	if (VisemeQueue.Num() > 0)
+	{
+		ApplyTextVisemesToQueue();
+	}
+
+	// Release a pending wait-for-text hold; unlike the partial path, no minimum viseme count applies.
+	if (bWaitingForText)
+	{
+		bWaitingForText = false;
+		PlaybackTimer = 0.0f;
+		const double WaitElapsed = FPlatformTime::Seconds() - WaitingForTextStartTime;
+		UE_LOG(LogElevenLabsLipSync, Verbose,
+			TEXT("Full text arrived after %.0fms wait. Starting lip sync playback."),
+			WaitElapsed * 1000.0);
+	}
+
+	// Log the viseme sequence for debugging (first 30 entries, then " ...")
+	{
+		FString VisSeq;
+		int32 Count = 0;
+		for (const FName& V : TextVisemeSequence)
+		{
+			if (Count > 0) VisSeq += TEXT(" ");
+			VisSeq += V.ToString();
+			if (++Count >= 30) { VisSeq += TEXT(" ..."); break; }
+		}
+		UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("Viseme sequence: [%s]"), *VisSeq);
+	}
+
+	// NOTE: Do NOT reset bTextVisemesApplied here. It's reset in TickComponent
+	// when the queue empties AFTER the text has been consumed. Resetting it here
+	// would prevent TickComponent from cleaning up AccumulatedText, which then
+	// persists and corrupts the next utterance's partial text accumulation.
+}
+
+void UElevenLabsLipSyncComponent::ConvertTextToVisemes(const FString& Text)
+{
+	TextVisemeSequence.Reset();
+
+	// Lowercase once so every rule below can compare against lowercase literals
+	FString Lower = Text.ToLower();
+
+	// Scan left to right, trying the longest grapheme first (3 chars, then 2, then 1).
+	// Rules follow mostly French spelling (see per-rule notes); English text uses the same rules.
+	int32 i = 0;
+	while (i < Lower.Len())
+	{
+		TCHAR C = Lower[i];
+		TCHAR C1 = (i + 1 < Lower.Len()) ? Lower[i + 1] : 0;
+		TCHAR C2 = (i + 2 < Lower.Len()) ? Lower[i + 2] : 0;
+
+		// ── 3-char graphemes ──────────────────────────────────────────────
+		if (C == 'e' && C1 == 'a' && C2 == 'u')
+		{
+			// eau → /o/ (oh)
+			TextVisemeSequence.Add(FName("oh"));
+			i += 3; continue;
+		}
+		if (C == 'a' && C1 == 'i' && C2 == 'n')
+		{
+			// ain → /ɛ̃/ (E nasal)
+			TextVisemeSequence.Add(FName("E"));
+			i += 3; continue;
+		}
+		if (C == 'e' && C1 == 'i' && C2 == 'n')
+		{
+			// ein → /ɛ̃/ (E nasal)
+			TextVisemeSequence.Add(FName("E"));
+			i += 3; continue;
+		}
+		if (C == 'o' && C1 == 'e' && C2 == 'u')
+		{
+			// oeu → /ø/ (oh-like)
+			TextVisemeSequence.Add(FName("oh"));
+			i += 3; continue;
+		}
+
+		// ── 2-char graphemes ──────────────────────────────────────────────
+		if (C == 'o' && C1 == 'u')
+		{
+			// ou → /u/ (ou)
+			TextVisemeSequence.Add(FName("ou"));
+			i += 2; continue;
+		}
+		if (C == 'o' && C1 == 'i')
+		{
+			// oi → /wa/ (ou + aa)
+			TextVisemeSequence.Add(FName("ou"));
+			TextVisemeSequence.Add(FName("aa"));
+			i += 2; continue;
+		}
+		if (C == 'o' && C1 == 'n')
+		{
+			// on → /ɔ̃/ (oh nasal)
+			TextVisemeSequence.Add(FName("oh"));
+			i += 2; continue;
+		}
+		if (C == 'o' && C1 == 'm' && (C2 == 0 || !FChar::IsAlpha(C2)))
+		{
+			// om followed by a non-letter or end of text → /ɔ̃/ (oh nasal)
+			TextVisemeSequence.Add(FName("oh"));
+			i += 2; continue;
+		}
+		if (C == 'a' && (C1 == 'n' || C1 == 'm'))
+		{
+			// an, am → /ɑ̃/ (aa nasal)
+			TextVisemeSequence.Add(FName("aa"));
+			i += 2; continue;
+		}
+		if (C == 'e' && C1 == 'n')
+		{
+			// en → /ɑ̃/ (aa nasal, French)
+			TextVisemeSequence.Add(FName("aa"));
+			i += 2; continue;
+		}
+		if (C == 'e' && C1 == 'm' && (C2 == 0 || !FChar::IsAlpha(C2)))
+		{
+			// em followed by a non-letter or end of text → /ɑ̃/
+			TextVisemeSequence.Add(FName("aa"));
+			i += 2; continue;
+		}
+		if (C == 'i' && (C1 == 'n' || C1 == 'm'))
+		{
+			// in, im → /ɛ̃/ (emitted as ih)
+			TextVisemeSequence.Add(FName("ih"));
+			i += 2; continue;
+		}
+		if (C == 'u' && C1 == 'n')
+		{
+			// un → /œ̃/ (emitted as ih)
+			TextVisemeSequence.Add(FName("ih"));
+			i += 2; continue;
+		}
+		if (C == 'a' && C1 == 'u')
+		{
+			// au → /o/
+			TextVisemeSequence.Add(FName("oh"));
+			i += 2; continue;
+		}
+		if (C == 'a' && C1 == 'i')
+		{
+			// ai → /ɛ/
+			TextVisemeSequence.Add(FName("E"));
+			i += 2; continue;
+		}
+		if (C == 'e' && C1 == 'i')
+		{
+			// ei → /ɛ/
+			TextVisemeSequence.Add(FName("E"));
+			i += 2; continue;
+		}
+		if (C == 'e' && C1 == 'u')
+		{
+			// eu → /ø/
+			TextVisemeSequence.Add(FName("oh"));
+			i += 2; continue;
+		}
+		if (C == 'c' && C1 == 'h')
+		{
+			// ch → /ʃ/
+			TextVisemeSequence.Add(FName("CH"));
+			i += 2; continue;
+		}
+		if (C == 's' && C1 == 'h')
+		{
+			// sh → /ʃ/
+			TextVisemeSequence.Add(FName("CH"));
+			i += 2; continue;
+		}
+		if (C == 'g' && C1 == 'n')
+		{
+			// gn → /ɲ/
+			TextVisemeSequence.Add(FName("nn"));
+			i += 2; continue;
+		}
+		if (C == 'p' && C1 == 'h')
+		{
+			// ph → /f/
+			TextVisemeSequence.Add(FName("FF"));
+			i += 2; continue;
+		}
+		if (C == 't' && C1 == 'h')
+		{
+			// th → /θ/
+			TextVisemeSequence.Add(FName("TH"));
+			i += 2; continue;
+		}
+		if (C == 'q' && C1 == 'u')
+		{
+			// qu → /k/
+			TextVisemeSequence.Add(FName("kk"));
+			i += 2; continue;
+		}
+		if (C == 'l' && C1 == 'l')
+		{
+			// ll → /l/ (one RR viseme, same as a single l)
+			TextVisemeSequence.Add(FName("RR"));
+			i += 2; continue;
+		}
+		if (C == 's' && C1 == 's')
+		{
+			// ss → /s/
+			TextVisemeSequence.Add(FName("SS"));
+			i += 2; continue;
+		}
+		if (C == 'm' && C1 == 'm')
+		{
+			// mm → /m/
+			TextVisemeSequence.Add(FName("PP"));
+			i += 2; continue;
+		}
+		if (C == 'n' && C1 == 'n')
+		{
+			// nn → /n/
+			TextVisemeSequence.Add(FName("nn"));
+			i += 2; continue;
+		}
+		if (C == 't' && C1 == 't')
+		{
+			// tt → /t/
+			TextVisemeSequence.Add(FName("DD"));
+			i += 2; continue;
+		}
+		if (C == 'c' && (C1 == 'e' || C1 == 'i' || C1 == 'y'))
+		{
+			// ce, ci, cy → /s/
+			TextVisemeSequence.Add(FName("SS"));
+			i += 1; continue; // Only consume the 'c', let the vowel be processed next
+		}
+		if (C == 'g' && (C1 == 'e' || C1 == 'i' || C1 == 'y'))
+		{
+			// ge, gi, gy → /ʒ/
+			TextVisemeSequence.Add(FName("CH"));
+			i += 1; continue; // Only consume the 'g', let the vowel be processed next
+		}
+
+		// ── French silent letters at end of word ──────────────────────────
+		// Final s, t, d, x, z emit nothing (typical French; applied to every word, no language check).
+		// Examples: "vous" → /vu/, "comment" → /kɔmɑ̃/, "allez" → /ale/
+		{
+			bool bIsWordFinal = (i + 1 >= Lower.Len()) || !FChar::IsAlpha(Lower[i + 1]);
+
+			// Silent final consonants
+			if (bIsWordFinal && (C == 's' || C == 't' || C == 'd' || C == 'x' || C == 'z'))
+			{
+				i++; continue;
+			}
+
+			// e muet: a plain 'e' that ends a word AND follows a letter emits nothing.
+			// An 'e' at index 0 or preceded by a non-letter is kept and handled by the switch below.
+			// Accented variants (é=0xE9, è=0xE8, ê=0xEA) never equal 'e', so they are always kept.
+			if (C == 'e' && bIsWordFinal && i > 0 && FChar::IsAlpha(Lower[i - 1]))
+			{
+				i++; continue;
+			}
+		}
+
+		// ── Single characters ─────────────────────────────────────────────
+		switch (C)
+		{
+		// Vowels
+		case 'a': case TCHAR(0xE0): case TCHAR(0xE2): // a, à, â
+			TextVisemeSequence.Add(FName("aa")); break;
+		case 'e': case TCHAR(0xE9): case TCHAR(0xE8): case TCHAR(0xEA): // e, é, è, ê
+			TextVisemeSequence.Add(FName("E")); break;
+		case 'i': case TCHAR(0xEE): case TCHAR(0xEF): // i, î, ï
+			TextVisemeSequence.Add(FName("ih")); break;
+		case 'o': case TCHAR(0xF4): // o, ô
+			TextVisemeSequence.Add(FName("oh")); break;
+		case 'u': case TCHAR(0xFB): case TCHAR(0xFC): // u, û, ü
+			TextVisemeSequence.Add(FName("ou")); break;
+		case 'y':
+			TextVisemeSequence.Add(FName("ih")); break;
+
+		// Consonants
+		case 'b':
+			TextVisemeSequence.Add(FName("PP")); break;
+		case 'c': case 'k': case 'q':
+			TextVisemeSequence.Add(FName("kk")); break;
+		case 'd':
+			TextVisemeSequence.Add(FName("DD")); break;
+		case 'f':
+			TextVisemeSequence.Add(FName("FF")); break;
+		case 'g':
+			TextVisemeSequence.Add(FName("kk")); break;
+		case 'h':
+			// Silent in French, aspirated in English — skip
+			break;
+		case 'j':
+			TextVisemeSequence.Add(FName("CH")); break;
+		case 'l':
+			TextVisemeSequence.Add(FName("RR")); break;
+		case 'm':
+			TextVisemeSequence.Add(FName("PP")); break;
+		case 'n':
+			TextVisemeSequence.Add(FName("nn")); break;
+		case 'p':
+			TextVisemeSequence.Add(FName("PP")); break;
+		case 'r':
+			TextVisemeSequence.Add(FName("RR")); break;
+		case 's':
+			TextVisemeSequence.Add(FName("SS")); break;
+		case 't':
+			TextVisemeSequence.Add(FName("DD")); break;
+		case 'v':
+			TextVisemeSequence.Add(FName("FF")); break;
+		case 'w':
+			TextVisemeSequence.Add(FName("ou")); break;
+		case 'x':
+			TextVisemeSequence.Add(FName("kk"));
+			TextVisemeSequence.Add(FName("SS")); break;
+		case 'z':
+			TextVisemeSequence.Add(FName("SS")); break;
+
+		// Space / punctuation → silence
+		case ' ': case ',': case '.': case '!': case '?': case ';': case ':':
+		case '-': case '\n': case '\r':
+			TextVisemeSequence.Add(FName("sil")); break;
+
+		default:
+			// Any other character (digits, symbols, unlisted accents) emits no viseme
+			break;
+		}
+
+		i++;
+	}
+
+	// ── Post-processing: merge consecutive silence entries ────────────────
+	// "Bonjour, " generates two sil (comma + space). Collapse to one.
+	{
+		TArray<FName> Merged;
+		Merged.Reserve(TextVisemeSequence.Num());
+		for (const FName& V : TextVisemeSequence)
+		{
+			if (V == FName("sil") && Merged.Num() > 0 && Merged.Last() == FName("sil"))
+				continue; // Skip duplicate sil
+			Merged.Add(V);
+		}
+		// Then drop any sil at the very start or very end of the sequence
+		while (Merged.Num() > 0 && Merged[0] == FName("sil"))
+			Merged.RemoveAt(0);
+		while (Merged.Num() > 0 && Merged.Last() == FName("sil"))
+			Merged.RemoveAt(Merged.Num() - 1);
+
+		TextVisemeSequence = MoveTemp(Merged);
+	}
+}
+
+// Relative duration weight of a viseme name (vowels high, plosives low).
+// ApplyTextVisemesToQueue uses these weights to decide what share of the
+// active audio frames each text viseme occupies: larger weight, more frames.
+static float GetVisemeDurationWeight(const FName& Viseme)
+{
+	// Shorthand: does the argument equal the given viseme name?
+	const auto Is = [&Viseme](const ANSICHAR* Name) { return Viseme == FName(Name); };
+
+	// Default weight; "sil" and any unrecognised name keep it.
+	float Weight = 1.0f;
+
+	// Open vowels — highest weight
+	if (Is("aa") || Is("oh") || Is("E"))
+		Weight = 2.0f;
+	// Close vowels
+	else if (Is("ih") || Is("ou"))
+		Weight = 1.7f;
+	// Liquids / nasals
+	else if (Is("RR") || Is("nn"))
+		Weight = 1.5f;
+	// Fricatives
+	else if (Is("SS") || Is("FF") || Is("CH") || Is("TH"))
+		Weight = 1.2f;
+	// Plosives — lowest weight
+	else if (Is("PP") || Is("DD") || Is("kk"))
+		Weight = 0.8f;
+
+	return Weight;
+}
+
+void UElevenLabsLipSyncComponent::ApplyTextVisemesToQueue()
+{
+	if (TextVisemeSequence.Num() == 0 || VisemeQueue.Num() == 0) return;
+
+	// Count active frames: queue entries whose amplitude is at least 0.08
+	int32 ActiveFrames = 0;
+	for (int32 Idx = 0; Idx < AmplitudeQueue.Num(); ++Idx)
+	{
+		if (AmplitudeQueue[Idx] >= 0.08f)
+			ActiveFrames++;
+	}
+
+	if (ActiveFrames == 0) return;
+
+	// ── Duration-weighted distribution ────────────────────────────────────
+	// Vowels get more frames than consonants, creating natural timing where
+	// the mouth lingers on open vowels and quickly transitions through plosives.
+
+	// Compute total weighted duration of the viseme sequence
+	float TotalWeight = 0.0f;
+	for (const FName& V : TextVisemeSequence)
+	{
+		TotalWeight += GetVisemeDurationWeight(V);
+	}
+
+	// Build a cumulative weight array for mapping frame index → viseme index.
+	// CumulativeWeight[i] = sum of weights from viseme 0..i-1
+	TArray<float> CumulativeWeight;
+	CumulativeWeight.SetNum(TextVisemeSequence.Num() + 1);
+	CumulativeWeight[0] = 0.0f;
+	for (int32 V = 0; V < TextVisemeSequence.Num(); ++V)
+	{
+		CumulativeWeight[V + 1] = CumulativeWeight[V] + GetVisemeDurationWeight(TextVisemeSequence[V]);
+	}
+
+	// For each active audio frame, find which viseme it maps to based on
+	// its proportional position in the weighted timeline.
+	int32 ActiveIdx = 0;
+	for (int32 Idx = 0; Idx < VisemeQueue.Num() && Idx < AmplitudeQueue.Num(); ++Idx)
+	{
+		const float Amp = AmplitudeQueue[Idx];
+
+		if (Amp < 0.08f)
+		{
+			// Below the activity threshold — leave this frame untouched
+			continue;
+		}
+
+		// Where are we in the weighted timeline? (0..TotalWeight)
+		const float TimelinePos = (static_cast<float>(ActiveIdx) / static_cast<float>(ActiveFrames)) * TotalWeight;
+
+		// Find which viseme interval contains this position (linear scan over the cumulative weights)
+		int32 VisemeIdx = 0;
+		for (int32 V = 0; V < TextVisemeSequence.Num(); ++V)
+		{
+			if (TimelinePos >= CumulativeWeight[V] && TimelinePos < CumulativeWeight[V + 1])
+			{
+				VisemeIdx = V;
+				break;
+			}
+			VisemeIdx = V; // No interval matched yet; ends on the last viseme if none does
+		}
+
+		const FName TextViseme = TextVisemeSequence[VisemeIdx];
+
+		// Blend progress within current viseme (0..1)
+		const float VisemeStart = CumulativeWeight[VisemeIdx];
+		const float VisemeDuration = CumulativeWeight[VisemeIdx + 1] - VisemeStart;
+		const float LocalProgress = (VisemeDuration > 0.01f)
+			? FMath::Clamp((TimelinePos - VisemeStart) / VisemeDuration, 0.0f, 1.0f)
+			: 0.0f;
+
+		// Next viseme for blending during the last 30% of each viseme
+		const int32 NextIdx = FMath::Min(VisemeIdx + 1, TextVisemeSequence.Num() - 1);
+		const FName NextViseme = TextVisemeSequence[NextIdx];
+
+		// Rebuild this frame from scratch: zero every known viseme, then add the text-derived shape
+		TMap<FName, float>& Frame = VisemeQueue[Idx];
+		for (const FName& Name : VisemeNames)
+		{
+			Frame.FindOrAdd(Name) = 0.0f;
+		}
+
+		if (TextViseme == FName("sil"))
+		{
+			// Text-driven silence — mouth closes
+			Frame.FindOrAdd(FName("sil")) = 1.0f;
+		}
+		else
+		{
+			// Anticipatory blending: in the last 30% of each viseme,
+			// gradually blend towards the next viseme shape.
+			const float BlendZone = 0.3f;
+			float BlendToNext = 0.0f;
+			if (LocalProgress > (1.0f - BlendZone) && NextViseme != TextViseme && NextViseme != FName("sil"))
+			{
+				BlendToNext = (LocalProgress - (1.0f - BlendZone)) / BlendZone;
+			}
+
+			// Primary viseme shape × amplitude (gives up at most half its weight to the next viseme)
+			Frame.FindOrAdd(TextViseme) += Amp * (1.0f - BlendToNext * 0.5f);
+
+			// Blend towards next viseme
+			if (BlendToNext > 0.0f && NextViseme != FName("sil"))
+			{
+				Frame.FindOrAdd(NextViseme) += Amp * BlendToNext * 0.5f;
+			}
+		}
+
+		ActiveIdx++;
+	}
+
+	bTextVisemesApplied = true;
+
+	UE_LOG(LogElevenLabsLipSync, Log,
+		TEXT("Applied %d text visemes to %d active frames (of %d total)"),
+		TextVisemeSequence.Num(), ActiveFrames, VisemeQueue.Num());
 }
 
 void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
@@ -478,14 +1427,9 @@ void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
 
 	const float TotalEnergy = VoiceEnergy + F1Energy + F2Energy + F3Energy + SibilantEnergy;
 
-	// DEBUG: log energy levels periodically
-	static int32 AnalysisCount = 0;
-	if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
-	{
-		UE_LOG(LogElevenLabsLipSync, Log,
-			TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
-			TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
-	}
+	UE_LOG(LogElevenLabsLipSync, Verbose,
+		TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
+		TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
 
 	EstimateVisemes(TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
 }
@@ -519,131 +1463,103 @@ void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy,
 		TargetVisemes.FindOrAdd(Name) = 0.0f;
 	}
 
-	// Silence threshold — below this, mouth is closed
-	constexpr float SilenceThreshold = 0.002f;
-
-	if (TotalEnergy < SilenceThreshold)
+	// Below noise floor → silence shape
+	if (TotalEnergy < 0.01f)
 	{
 		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
 		return;
 	}
 
-	// Normalize band energies relative to total
+	// ── Spectral ratios determine mouth SHAPE (not intensity) ────────────
+	// These weights are ~1.0 (full strength). Per-window amplitude in
+	// OnAudioChunkReceived scales them to create speech dynamics.
+	// This function produces a "shape template" for the entire audio chunk.
 	const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f);
 	const float NormF1 = F1Energy * InvTotal;
 	const float NormF2 = F2Energy * InvTotal;
 	const float NormF3 = F3Energy * InvTotal;
 	const float NormSibilant = SibilantEnergy * InvTotal;
 
-	// Energy-based intensity (how "loud" the speech is — drives overall jaw opening)
-	// Scale to a usable 0-1 range. The constant is empirically tuned.
-	const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f);
+	// Brightness: ratio of high-freq to low-freq energy.
+	// Low brightness = rounded lips (oh, ou). High = spread lips (E, ih, SS).
+	const float Brightness = FMath::Clamp(
+		(NormF2 + NormF3 + NormSibilant * 2.0f) / FMath::Max(NormF1 + 0.01f, 0.01f),
+		0.0f, 4.0f) / 4.0f;
 
-	// ── Classification based on spectral shape ───────────────────────────────
-	// The approach: compute "votes" for each viseme category based on where
-	// the spectral energy is concentrated. Multiple visemes can be active
-	// simultaneously (blended).
+	// ── Primary vowel/consonant shape (mutually exclusive) ───────────────
 
-	// Fricatives / sibilants: high-frequency energy dominates
-	if (NormSibilant > 0.25f)
+	if (NormSibilant > 0.2f)
 	{
-		const float FricativeWeight = NormSibilant * Intensity;
-		// Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq)
+		// Sibilant chunk — per-window ZCR will refine this further
+		float SibWeight = FMath::Clamp(NormSibilant * 2.0f, 0.0f, 1.0f);
 		if (NormF3 > NormF2)
 		{
-			TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight;
+			TargetVisemes.FindOrAdd(FName("SS")) = SibWeight;
 		}
 		else
 		{
-			TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f;
-			TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f;
+			TargetVisemes.FindOrAdd(FName("CH")) = SibWeight * 0.7f;
+			TargetVisemes.FindOrAdd(FName("SS")) = SibWeight * 0.3f;
 		}
-		// F/V component
-		TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f;
+		TargetVisemes.FindOrAdd(FName("FF")) = SibWeight * 0.3f;
 	}
-
-	// Voiced speech: most energy in voice + F1 + F2
-	if (NormSibilant < 0.5f)
+	else if (Brightness > 0.55f)
 	{
-		const float VoicedWeight = (1.0f - NormSibilant) * Intensity;
-
-		// Open vowels: strong F1 = wide jaw opening
-		if (NormF1 > 0.3f)
+		// Bright (front vowel): E or ih — spread lips
+		if (NormF1 > 0.2f)
 		{
-			if (NormF2 > 0.35f)
-			{
-				// High F2 + high F1 → front open vowel (A as in "cat")
-				TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1;
-			}
-			else
-			{
-				// Low F2 + high F1 → back open vowel (O as in "go")
-				TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f;
-				TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f;
-			}
+			TargetVisemes.FindOrAdd(FName("E")) = 1.0f;
+			TargetVisemes.FindOrAdd(FName("aa")) = 0.3f;
 		}
-
-		// Mid vowels: moderate F1
-		if (NormF1 > 0.15f && NormF1 <= 0.3f)
+		else
 		{
-			if (NormF2 > 0.4f)
-			{
-				// High F2 → front mid vowel (E as in "bed")
-				TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f;
-			}
-			else
-			{
-				// Low F2 → rounded mid vowel
-				TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f;
-			}
-		}
-
-		// Close vowels: weak F1
-		if (NormF1 <= 0.15f && NormF2 > 0.0f)
-		{
-			if (NormF2 > 0.4f)
-			{
-				// High F2 → front close vowel (I as in "see")
-				TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f;
-			}
-			else
-			{
-				// Low F2 → back close vowel (OO as in "boot")
-				TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f;
-			}
-		}
-
-		// Nasals / liquids: prominent F3 with low sibilant
-		if (NormF3 > 0.2f && NormSibilant < 0.15f)
-		{
-			if (NormF1 < 0.2f)
-			{
-				TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f;
-			}
-			else
-			{
-				TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f;
-			}
-		}
-
-		// Plosive detection: very low F1 with moderate energy = lips/tongue closed
-		if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f)
-		{
-			TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f;
-			TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f;
+			TargetVisemes.FindOrAdd(FName("ih")) = 0.8f;
 		}
 	}
-
-	// TH detection: moderate sibilant + moderate F3 (dental fricative)
-	if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f)
+	else if (Brightness < 0.3f)
 	{
-		TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f;
+		// Dark (back vowel): oh or ou — rounded lips
+		if (NormF1 > 0.2f)
+		{
+			TargetVisemes.FindOrAdd(FName("oh")) = 1.0f;
+		}
+		else
+		{
+			TargetVisemes.FindOrAdd(FName("ou")) = 0.8f;
+		}
+	}
+	else
+	{
+		// Neutral / open vowel: aa — wide open jaw
+		TargetVisemes.FindOrAdd(FName("aa")) = 1.0f;
 	}
 
-	// Ensure at least some silence weight when energy is very low
-	if (Intensity < 0.1f)
+	// ── Secondary consonant contributions (additive) ─────────────────────
+
+	// Nasals (N, M, NG): prominent F3, low sibilant
+	if (NormF3 > 0.25f && NormSibilant < 0.15f)
 	{
-		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f;
+		TargetVisemes.FindOrAdd(FName("nn")) = 0.5f;
+		TargetVisemes.FindOrAdd(FName("RR")) = 0.2f;
+	}
+
+	// Plosive hint (P, B): very low F1
+	if (NormF1 < 0.08f && NormSibilant < 0.2f)
+	{
+		TargetVisemes.FindOrAdd(FName("PP")) = 0.5f;
+		TargetVisemes.FindOrAdd(FName("DD")) = 0.3f;
+	}
+
+	// Labiodental (F, V): moderate sibilant + lip involvement
+	if (NormSibilant > 0.12f && NormSibilant < 0.3f && NormF1 < 0.15f)
+	{
+		TargetVisemes.FindOrAdd(FName("FF")) = 0.6f;
+	}
+
+	// Dental (TH): moderate sibilant + moderate F3
+	if (NormSibilant > 0.12f && NormSibilant < 0.35f && NormF3 > 0.15f)
+	{
+		TargetVisemes.FindOrAdd(FName("TH")) = 0.4f;
 	}
 }
 
@@ -686,9 +1602,8 @@ void UElevenLabsLipSyncComponent::ApplyMorphTargets()
 {
 	if (!TargetMesh) return;
 
-	// DEBUG: log blendshape values periodically
 	static int32 ApplyCount = 0;
-	if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps
+	if (++ApplyCount % 120 == 1)
 	{
 		FString DebugStr;
 		for (const auto& Pair : CurrentBlendshapes)
@@ -708,7 +1623,7 @@ void UElevenLabsLipSyncComponent::ApplyMorphTargets()
 		}
 		if (DebugStr.Len() > 0)
 		{
-			UE_LOG(LogElevenLabsLipSync, Log, TEXT("%s: %s"),
+			UE_LOG(LogElevenLabsLipSync, Verbose, TEXT("%s: %s"),
 				bUseCurveMode ? TEXT("Curves") : TEXT("Blendshapes"), *DebugStr);
 		}
 	}
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
index e5a5fe2..e3c1ced 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -136,6 +136,17 @@ public:
 		meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
 	bool bEnableAgentPartialResponse = false;
 
+	/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
+	 *  ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
+	 *  Pre-buffering delays playback start so the second chunk arrives before the
+	 *  first finishes playing, eliminating the audible gap mid-sentence.
+	 *  Higher values = fewer gaps but more latency on the first word.
+	 *  Set to 0 for immediate playback (may cause mid-sentence pauses). */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
+		meta = (ClampMin = "0", ClampMax = "500",
+		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500 = maximum smoothness."))
+	int32 AudioPreBufferMs = 250;
+
 	/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs",
 		meta = (ClampMin = "0.0",
@@ -257,6 +268,11 @@ public:
 	UFUNCTION(BlueprintPure, Category = "ElevenLabs")
 	const FElevenLabsConversationInfo& GetConversationInfo() const;
 
+	/** True while audio is being pre-buffered (playback hasn't started yet).
+	 *  Used by the LipSync component to pause viseme queue consumption. */
+	UFUNCTION(BlueprintPure, Category = "ElevenLabs")
+	bool IsPreBuffering() const { return bPreBuffering; }
+
 	/** Access the underlying WebSocket proxy (advanced use). */
 	UFUNCTION(BlueprintPure, Category = "ElevenLabs")
 	UElevenLabsWebSocketProxy* GetWebSocketProxy() const { return WebSocketProxy; }
@@ -353,6 +369,14 @@ private:
 	TArray<uint8> AudioQueue;
 	FCriticalSection AudioQueueLock;
 
+	// Reusable zero-filled buffer fed to USoundWaveProcedural during TTS gaps
+	// to keep the audio component alive (prevents stop on buffer underrun).
+	TArray<uint8> SilenceBuffer;
+
+	// Pre-buffer state: delay playback start to absorb TTS inter-chunk gaps.
+	bool bPreBuffering = false;
+	double PreBufferStartTime = 0.0;
+
 	// Silence detection: how many consecutive ticks with an empty audio queue.
 	int32 SilentTickCount = 0;
 
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
index 9a98e1d..abaf230 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
@@ -51,11 +51,19 @@ public:
 		ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
 	float LipSyncStrength = 1.0f;
 
+	/** Scales the audio amplitude driving mouth movement.
+	 *  Lower values produce subtler animation, higher values are more pronounced.
+	 *  Use this to tone down overly strong lip movement without changing the shape. */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
+		meta = (ClampMin = "0.5", ClampMax = "1.0",
+		ToolTip = "Audio amplitude scale.\n0.5 = subtle, 0.75 = balanced, 1.0 = full.\nReduces overall mouth movement without affecting viseme shape."))
+	float AmplitudeScale = 0.75f;
+
 	/** How quickly viseme weights interpolate towards new values each frame. */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
-		meta = (ClampMin = "1.0", ClampMax = "100.0",
-		ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
-	float SmoothingSpeed = 20.0f;
+		meta = (ClampMin = "35.0", ClampMax = "65.0",
+		ToolTip = "Smoothing speed for viseme transitions.\n35 = smooth and soft, 50 = balanced, 65 = sharp and responsive."))
+	float SmoothingSpeed = 50.0f;
 
 	// ── Events ────────────────────────────────────────────────────────────────
 
@@ -87,6 +95,20 @@ private:
 	/** Receives raw PCM from the agent component. */
 	void OnAudioChunkReceived(const TArray<uint8>& PCMData);
 
+	/** Receives full text response from the agent component. */
+	UFUNCTION()
+	void OnTextResponseReceived(const FString& ResponseText);
+
+	/** Receives partial text streaming from the agent component. */
+	UFUNCTION()
+	void OnPartialTextReceived(const FString& PartialText);
+
+	/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
+	void ConvertTextToVisemes(const FString& Text);
+
+	/** Apply text-derived viseme shapes to the remaining queued frames. */
+	void ApplyTextVisemesToQueue();
+
 	/** Extract frequency band energies from the spectrum analyzer. */
 	void AnalyzeSpectrum();
 
@@ -122,6 +144,13 @@ private:
 	// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
 	TMap<FName, float> CurrentBlendshapes;
 
+	// Previous frame's blendshape values for additional output smoothing
+	TMap<FName, float> PreviousBlendshapes;
+
+	// Last consumed queue frame — used for inter-frame interpolation
+	// to create continuous motion instead of 32ms step-wise jumps
+	TMap<FName, float> LastConsumedVisemes;
+
 	// MetaHuman mode: Face mesh has no morph targets, use animation curves instead.
 	// Set automatically in BeginPlay when TargetMesh has 0 morph targets.
 	bool bUseCurveMode = false;
@@ -129,9 +158,48 @@ private:
 	// Cache of ARKit→MetaHuman curve name conversions to avoid per-frame string ops.
 	TMap<FName, FName> CurveNameCache;
 
+	// RMS amplitude from the latest audio chunk (0-1 range, drives jaw opening)
+	float CurrentAmplitude = 0.0f;
+
+	// ── Viseme queue ──────────────────────────────────────────────────────────
+
+	// Queue of per-window viseme analysis results.
+	// OnAudioChunkReceived builds one frame per 512-sample window (~32ms).
+	// TickComponent consumes them at the correct playback rate.
+	TArray<TMap<FName, float>> VisemeQueue;
+
+	// Parallel queue of per-window amplitude values (for text-driven shape replacement)
+	TArray<float> AmplitudeQueue;
+
+	// Timer for consuming queued viseme frames at the FFT window rate
+	float PlaybackTimer = 0.0f;
+
 	// Whether we have pending analysis results to process
 	bool bHasPendingAnalysis = false;
 
+	// ── Text-driven lip sync ──────────────────────────────────────────────────
+
+	// Accumulated partial text from streaming (agent_chat_response_part events).
+	// Built up token-by-token before the audio arrives.
+	FString AccumulatedText;
+
+	// Ordered sequence of OVR viseme names derived from text.
+	// E.g. "Bonjour" → [PP, oh, nn, CH, ou, RR]
+	TArray<FName> TextVisemeSequence;
+
+	// Whether text-based visemes have been applied to the current queue
+	bool bTextVisemesApplied = false;
+
+	// Set when agent_response arrives (full text for this utterance).
+	// Prevents resetting AccumulatedText between audio chunks of the
+	// SAME utterance — only reset once the full response is confirmed.
+	bool bFullTextReceived = false;
+
+	// Wait-for-text mechanism: when audio arrives before any text, hold playback
+	// until text arrives (partial or full) so every queued frame gets text-derived visemes.
+	bool bWaitingForText = false;
+	double WaitingForTextStartTime = 0.0;
+
 	// Cached reference to the agent component on the same Actor
 	TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
 	FDelegateHandle AudioDataHandle;