WIP: Add ElevenLabsLipSyncComponent with spectral analysis lip sync

Real-time lip sync component that performs client-side spectral analysis
on the agent's PCM audio stream (ElevenLabs doesn't provide viseme data).

Pipeline: 512-point FFT (16kHz) → 5 frequency bands → 15 OVR visemes
→ ARKit blendshapes (MetaHuman compatible) → auto-apply morph targets.

Currently uses SetMorphTarget() which may be overridden by MetaHuman's
Face AnimBP — face animation not yet working. Debug logs added to
diagnose: audio flow, spectrum energy, morph target name matching.

Next steps: verify debug output, fix MetaHuman morph target override
(likely needs AnimBP integration like Convai approach).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 11:23:34 +01:00
parent 52f75f884b
commit 224af6a27b
4 changed files with 813 additions and 0 deletions

View File

@ -426,6 +426,8 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
// Called for each PCM audio chunk received from the agent over the WebSocket.
// Queues the chunk for playback, then fans it out to C++ listeners.
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
{
	// Queue the chunk into this component's own playback pipeline first.
	EnqueueAgentAudio(PCMData);
	// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
	OnAgentAudioData.Broadcast(PCMData);
}
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)

View File

@ -0,0 +1,663 @@
// Copyright ASTERION. All Rights Reserved.
#include "ElevenLabsLipSyncComponent.h"
#include "ElevenLabsConversationalAgentComponent.h"
#include "ElevenLabsDefinitions.h"
#include "Components/SkeletalMeshComponent.h"
#include "Engine/SkeletalMesh.h"
#include "Animation/AnimInstance.h"
#include "Animation/MorphTarget.h"
#include "GameFramework/Actor.h"
DEFINE_LOG_CATEGORY_STATIC(LogElevenLabsLipSync, Log, All);
// ─────────────────────────────────────────────────────────────────────────────
// Static data
// ─────────────────────────────────────────────────────────────────────────────
// The 15 standard OVR visemes: silence first, then consonants, then vowels.
// The order here also defines the iteration order used when smoothing and
// mapping visemes elsewhere in this file.
const TArray<FName> UElevenLabsLipSyncComponent::VisemeNames = {
	FName("sil"), FName("PP"), FName("FF"), FName("TH"), FName("DD"),
	FName("kk"), FName("CH"), FName("SS"), FName("nn"), FName("RR"),
	FName("aa"), FName("E"), FName("ih"), FName("oh"), FName("ou")
};
// OVR Viseme → ARKit blendshape mapping.
// Each viseme activates a combination of ARKit morph targets with specific weights.
// These values are tuned for MetaHuman faces and can be adjusted per project.
TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap()
{
	TMap<FName, TMap<FName, float>> Map;

	// sil — silence, mouth at rest
	Map.Add(FName("sil"), {});

	// PP — bilabial (P, B, M): lips pressed together
	Map.Add(FName("PP"), TMap<FName, float>{
		{ FName("mouthClose"), 0.7f },
		{ FName("mouthPressLeft"), 0.3f },
		{ FName("mouthPressRight"), 0.3f } });

	// FF — labiodental (F, V): lower lip tucked under upper teeth
	Map.Add(FName("FF"), TMap<FName, float>{
		{ FName("mouthShrugLower"), 0.5f },
		{ FName("mouthUpperUpLeft"), 0.3f },
		{ FName("mouthUpperUpRight"), 0.3f },
		{ FName("jawOpen"), 0.1f } });

	// TH — dental (TH): tongue between teeth
	Map.Add(FName("TH"), TMap<FName, float>{
		{ FName("tongueOut"), 0.4f },
		{ FName("jawOpen"), 0.15f } });

	// DD — alveolar (D, T, N): tongue on alveolar ridge
	Map.Add(FName("DD"), TMap<FName, float>{
		{ FName("jawOpen"), 0.25f },
		{ FName("mouthClose"), 0.2f },
		{ FName("mouthLowerDownLeft"), 0.15f },
		{ FName("mouthLowerDownRight"), 0.15f } });

	// kk — velar (K, G): back of tongue raised
	Map.Add(FName("kk"), TMap<FName, float>{
		{ FName("jawOpen"), 0.25f },
		{ FName("mouthStretchLeft"), 0.15f },
		{ FName("mouthStretchRight"), 0.15f } });

	// CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge
	Map.Add(FName("CH"), TMap<FName, float>{
		{ FName("mouthFunnel"), 0.45f },
		{ FName("jawOpen"), 0.2f },
		{ FName("mouthPucker"), 0.15f } });

	// SS — alveolar fricative (S, Z): air through narrow channel
	Map.Add(FName("SS"), TMap<FName, float>{
		{ FName("mouthStretchLeft"), 0.4f },
		{ FName("mouthStretchRight"), 0.4f },
		{ FName("jawOpen"), 0.1f },
		{ FName("mouthSmileLeft"), 0.15f },
		{ FName("mouthSmileRight"), 0.15f } });

	// nn — nasal (N, M, NG): soft palate lowered
	Map.Add(FName("nn"), TMap<FName, float>{
		{ FName("jawOpen"), 0.15f },
		{ FName("mouthClose"), 0.2f },
		{ FName("mouthPressLeft"), 0.1f },
		{ FName("mouthPressRight"), 0.1f } });

	// RR — retroflex/rhotic (R, L): tongue curled or lateral
	Map.Add(FName("RR"), TMap<FName, float>{
		{ FName("mouthFunnel"), 0.3f },
		{ FName("jawOpen"), 0.2f },
		{ FName("mouthRollLower"), 0.15f } });

	// aa — open vowel (A as in "father"): wide open jaw
	Map.Add(FName("aa"), TMap<FName, float>{
		{ FName("jawOpen"), 0.7f },
		{ FName("mouthLowerDownLeft"), 0.4f },
		{ FName("mouthLowerDownRight"), 0.4f },
		{ FName("mouthShrugUpper"), 0.1f } });

	// E — mid front vowel (E as in "bed"): mid-open, spread lips
	Map.Add(FName("E"), TMap<FName, float>{
		{ FName("jawOpen"), 0.4f },
		{ FName("mouthSmileLeft"), 0.3f },
		{ FName("mouthSmileRight"), 0.3f },
		{ FName("mouthLowerDownLeft"), 0.2f },
		{ FName("mouthLowerDownRight"), 0.2f } });

	// ih — close front vowel (I as in "sit"): narrow opening, spread lips
	Map.Add(FName("ih"), TMap<FName, float>{
		{ FName("jawOpen"), 0.2f },
		{ FName("mouthSmileLeft"), 0.25f },
		{ FName("mouthSmileRight"), 0.25f },
		{ FName("mouthStretchLeft"), 0.1f },
		{ FName("mouthStretchRight"), 0.1f } });

	// oh — mid back vowel (O as in "go"): rounded lips, open jaw
	Map.Add(FName("oh"), TMap<FName, float>{
		{ FName("jawOpen"), 0.5f },
		{ FName("mouthFunnel"), 0.5f },
		{ FName("mouthLowerDownLeft"), 0.2f },
		{ FName("mouthLowerDownRight"), 0.2f } });

	// ou — close back vowel (OO as in "boot"): tightly rounded lips
	Map.Add(FName("ou"), TMap<FName, float>{
		{ FName("mouthPucker"), 0.6f },
		{ FName("mouthFunnel"), 0.4f },
		{ FName("jawOpen"), 0.15f } });

	return Map;
}
const TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::VisemeToBlendshapeMap =
	UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap();
// ─────────────────────────────────────────────────────────────────────────────
// Constructor / Destructor
// ─────────────────────────────────────────────────────────────────────────────
UElevenLabsLipSyncComponent::UElevenLabsLipSyncComponent()
{
	// Tick at 60 Hz so viseme smoothing stays visually fluid.
	PrimaryComponentTick.bCanEverTick = true;
	PrimaryComponentTick.TickInterval = 1.0f / 60.0f;

	// Seed both viseme maps: every viseme starts at zero except the rest
	// pose "sil", which starts fully active (mouth at rest).
	const FName SilName(TEXT("sil"));
	for (const FName& VisemeName : VisemeNames)
	{
		const float InitialWeight = (VisemeName == SilName) ? 1.0f : 0.0f;
		TargetVisemes.Add(VisemeName, InitialWeight);
		SmoothedVisemes.Add(VisemeName, InitialWeight);
	}
}
UElevenLabsLipSyncComponent::~UElevenLabsLipSyncComponent() = default;
// ─────────────────────────────────────────────────────────────────────────────
// Lifecycle
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::BeginPlay()
{
Super::BeginPlay();
// Create the spectrum analyzer (512-point FFT, Hann window, 16kHz)
Audio::FSpectrumAnalyzerSettings Settings;
Settings.FFTSize = Audio::FSpectrumAnalyzerSettings::EFFTSize::Medium_512;
Settings.WindowType = Audio::EWindowType::Hann;
SpectrumAnalyzer = MakeUnique<Audio::FSpectrumAnalyzer>(
Settings, static_cast<float>(ElevenLabsAudio::SampleRate));
// Auto-discover the agent component on the same actor
AActor* Owner = GetOwner();
if (!Owner) return;
UElevenLabsConversationalAgentComponent* Agent =
Owner->FindComponentByClass<UElevenLabsConversationalAgentComponent>();
if (Agent)
{
AgentComponent = Agent;
AudioDataHandle = Agent->OnAgentAudioData.AddUObject(
this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived);
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName());
}
else
{
UE_LOG(LogElevenLabsLipSync, Warning,
TEXT("No ElevenLabsConversationalAgentComponent found on %s. Lip sync will not work."),
*Owner->GetName());
}
// Auto-detect TargetMesh if not set manually.
// Search for a SkeletalMeshComponent named "Face" (MetaHuman convention),
// then fall back to the first SkeletalMeshComponent found on the actor.
if (!TargetMesh)
{
TArray<USkeletalMeshComponent*> SkeletalMeshes;
Owner->GetComponents<USkeletalMeshComponent>(SkeletalMeshes);
// First pass: look for a component named "Face" (MetaHuman face mesh)
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
{
if (Mesh && Mesh->GetFName().ToString().Contains(TEXT("Face")))
{
TargetMesh = Mesh;
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected face mesh: %s"), *Mesh->GetName());
break;
}
}
// Second pass: fall back to the first skeletal mesh with morph targets
if (!TargetMesh)
{
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
{
if (Mesh)
{
TargetMesh = Mesh;
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected skeletal mesh (fallback): %s"), *Mesh->GetName());
break;
}
}
}
if (!TargetMesh)
{
UE_LOG(LogElevenLabsLipSync, Warning,
TEXT("No SkeletalMeshComponent found on %s. Set TargetMesh manually or use GetCurrentBlendshapes() in Blueprint."),
*Owner->GetName());
}
}
// DEBUG: list available morph targets on the target mesh
if (TargetMesh && TargetMesh->GetSkeletalMeshAsset())
{
const TArray<UMorphTarget*>& MorphTargets = TargetMesh->GetSkeletalMeshAsset()->GetMorphTargets();
UE_LOG(LogElevenLabsLipSync, Log, TEXT("TargetMesh '%s' has %d morph targets."),
*TargetMesh->GetName(), MorphTargets.Num());
// Log first 20 morph target names to verify ARKit naming
FString Names;
int32 Count = 0;
for (const UMorphTarget* MT : MorphTargets)
{
if (MT)
{
if (Count > 0) Names += TEXT(", ");
Names += MT->GetName();
if (++Count >= 20) { Names += TEXT(" ..."); break; }
}
}
if (Count > 0)
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Morph target sample: %s"), *Names);
}
// Verify our blendshape names exist as morph targets on this mesh
TArray<FName> TestNames = { FName("jawOpen"), FName("mouthClose"), FName("mouthFunnel") };
for (const FName& TestName : TestNames)
{
bool bFound = false;
for (const UMorphTarget* MT : MorphTargets)
{
if (MT && MT->GetFName() == TestName)
{
bFound = true;
break;
}
}
UE_LOG(LogElevenLabsLipSync, Log, TEXT(" Morph target '%s': %s"),
*TestName.ToString(), bFound ? TEXT("FOUND") : TEXT("NOT FOUND"));
}
}
}
// Tears down the audio binding and the analyzer before the component dies.
void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
{
	// Detach from the agent's raw-audio delegate first so no further chunks
	// arrive while we release the analyzer.
	if (AudioDataHandle.IsValid() && AgentComponent.IsValid())
	{
		AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
		AudioDataHandle.Reset();
	}
	SpectrumAnalyzer.Reset();
	AgentComponent.Reset();
	Super::EndPlay(EndPlayReason);
}
// ─────────────────────────────────────────────────────────────────────────────
// Tick — smooth visemes and apply morph targets
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick TickType,
FActorComponentTickFunction* ThisTickFunction)
{
Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
// Smooth viseme weights towards targets using exponential interpolation
const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f);
bool bAnyNonZero = false;
for (const FName& Name : VisemeNames)
{
float& Current = SmoothedVisemes.FindOrAdd(Name);
const float Target = TargetVisemes.FindOrAdd(Name);
Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha);
// Snap to zero to avoid infinite tiny values
if (Current < 0.001f) Current = 0.0f;
if (Current > 0.001f) bAnyNonZero = true;
}
// "sil" uses LipSyncStrength=1 always — it's the rest pose
SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp(
SmoothedVisemes.FindOrAdd(FName("sil")),
TargetVisemes.FindOrAdd(FName("sil")),
Alpha);
// Convert visemes to ARKit blendshapes
MapVisemesToBlendshapes();
// Auto-apply morph targets if a target mesh is set
if (TargetMesh)
{
ApplyMorphTargets();
}
// Notify Blueprint listeners
if (bAnyNonZero || CurrentBlendshapes.Num() > 0)
{
OnVisemesReady.Broadcast();
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Audio analysis
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMData)
{
if (!SpectrumAnalyzer) return;
// Convert int16 PCM to float32 [-1, 1]
const int16* Samples = reinterpret_cast<const int16*>(PCMData.GetData());
const int32 NumSamples = PCMData.Num() / sizeof(int16);
// DEBUG: log first audio chunk received
static bool bFirstChunkLogged = false;
if (!bFirstChunkLogged)
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
bFirstChunkLogged = true;
}
FloatBuffer.Reset(NumSamples);
for (int32 i = 0; i < NumSamples; ++i)
{
FloatBuffer.Add(static_cast<float>(Samples[i]) / 32768.0f);
}
// Feed to rolling FFT analyzer
SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples);
// Try to perform analysis (returns true when enough data for one FFT window)
if (SpectrumAnalyzer->PerformAnalysisIfPossible(true))
{
AnalyzeSpectrum();
}
}
// Samples the latest FFT result in five speech-relevant frequency bands and
// hands the band energies to the viseme estimator.
void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
{
	if (!SpectrumAnalyzer)
	{
		return;
	}
	Audio::FSpectrumAnalyzerScopeLock Lock(SpectrumAnalyzer.Get());

	// Extract energy in frequency bands relevant for speech phoneme
	// classification. Band boundaries chosen based on speech formant ranges:
	//   [0] fundamental (80-400 Hz)      [1] F1 → jaw openness
	//   [2] F2 → vowel front/back        [3] F3 → liquids, nasals
	//   [4] sibilant/fricative energy (4-7.5 kHz)
	const float Bands[5] = {
		GetBandEnergy(80.0f, 400.0f),
		GetBandEnergy(300.0f, 800.0f),
		GetBandEnergy(800.0f, 2500.0f),
		GetBandEnergy(2500.0f, 4000.0f),
		GetBandEnergy(4000.0f, 7500.0f)
	};
	float TotalEnergy = 0.0f;
	for (const float Band : Bands)
	{
		TotalEnergy += Band;
	}

	// DEBUG: log energy levels periodically
	static int32 AnalysisCount = 0;
	if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
	{
		UE_LOG(LogElevenLabsLipSync, Log,
			TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
			TotalEnergy, Bands[1], Bands[2], Bands[3], Bands[4]);
	}
	EstimateVisemes(TotalEnergy, Bands[1], Bands[2], Bands[3], Bands[4]);
}
float UElevenLabsLipSyncComponent::GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples) const
{
if (!SpectrumAnalyzer || NumSamples <= 0) return 0.0f;
float Total = 0.0f;
const float Step = (HighFreq - LowFreq) / static_cast<float>(NumSamples);
for (int32 i = 0; i < NumSamples; ++i)
{
const float Freq = LowFreq + Step * (static_cast<float>(i) + 0.5f);
Total += SpectrumAnalyzer->GetMagnitudeForFrequency(Freq);
}
return Total / static_cast<float>(NumSamples);
}
// ─────────────────────────────────────────────────────────────────────────────
// Viseme estimation from spectral analysis
// ─────────────────────────────────────────────────────────────────────────────
// Heuristic phoneme-class estimator: converts the five band energies produced
// by AnalyzeSpectrum() into target weights for the 15 OVR visemes. Several
// visemes may be active at once (weights blend, they are not exclusive).
// All thresholds and constants are empirically tuned. Writes TargetVisemes
// only; smoothing towards these targets happens in TickComponent.
void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy,
	float F1Energy, float F2Energy, float F3Energy, float SibilantEnergy)
{
	// Reset all visemes to zero
	for (const FName& Name : VisemeNames)
	{
		TargetVisemes.FindOrAdd(Name) = 0.0f;
	}

	// Silence threshold — below this, mouth is closed
	constexpr float SilenceThreshold = 0.002f;
	if (TotalEnergy < SilenceThreshold)
	{
		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
		return;
	}

	// Normalize band energies relative to total
	const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f);
	const float NormF1 = F1Energy * InvTotal;
	const float NormF2 = F2Energy * InvTotal;
	const float NormF3 = F3Energy * InvTotal;
	const float NormSibilant = SibilantEnergy * InvTotal;

	// Energy-based intensity (how "loud" the speech is — drives overall jaw opening)
	// Scale to a usable 0-1 range. The constant is empirically tuned.
	const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f);

	// ── Classification based on spectral shape ───────────────────────────────
	// The approach: compute "votes" for each viseme category based on where
	// the spectral energy is concentrated. Multiple visemes can be active
	// simultaneously (blended).

	// Fricatives / sibilants: high-frequency energy dominates
	if (NormSibilant > 0.25f)
	{
		const float FricativeWeight = NormSibilant * Intensity;
		// Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq)
		if (NormF3 > NormF2)
		{
			TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight;
		}
		else
		{
			TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f;
			TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f;
		}
		// F/V component
		TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f;
	}

	// Voiced speech: most energy in voice + F1 + F2
	// NOTE: this range overlaps the fricative branch above (NormSibilant in
	// 0.25-0.5 runs both), which blends fricative and voiced visemes for
	// mixed frames.
	if (NormSibilant < 0.5f)
	{
		const float VoicedWeight = (1.0f - NormSibilant) * Intensity;
		// Open vowels: strong F1 = wide jaw opening
		if (NormF1 > 0.3f)
		{
			if (NormF2 > 0.35f)
			{
				// High F2 + high F1 → front open vowel (A as in "cat")
				TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1;
			}
			else
			{
				// Low F2 + high F1 → back open vowel (O as in "go")
				TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f;
				TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f;
			}
		}
		// Mid vowels: moderate F1
		if (NormF1 > 0.15f && NormF1 <= 0.3f)
		{
			if (NormF2 > 0.4f)
			{
				// High F2 → front mid vowel (E as in "bed")
				TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f;
			}
			else
			{
				// Low F2 → rounded mid vowel
				TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f;
			}
		}
		// Close vowels: weak F1
		if (NormF1 <= 0.15f && NormF2 > 0.0f)
		{
			if (NormF2 > 0.4f)
			{
				// High F2 → front close vowel (I as in "see")
				TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f;
			}
			else
			{
				// Low F2 → back close vowel (OO as in "boot")
				TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f;
			}
		}
		// Nasals / liquids: prominent F3 with low sibilant
		if (NormF3 > 0.2f && NormSibilant < 0.15f)
		{
			if (NormF1 < 0.2f)
			{
				TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f;
			}
			else
			{
				TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f;
			}
		}
		// Plosive detection: very low F1 with moderate energy = lips/tongue closed
		if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f)
		{
			TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f;
			TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f;
		}
	}

	// TH detection: moderate sibilant + moderate F3 (dental fricative)
	if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f)
	{
		TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f;
	}

	// Ensure at least some silence weight when energy is very low
	// (Intensity < 0.1 maps linearly to sil in (0, 1].)
	if (Intensity < 0.1f)
	{
		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f;
	}
}
// ─────────────────────────────────────────────────────────────────────────────
// Viseme → ARKit blendshape mapping
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::MapVisemesToBlendshapes()
{
CurrentBlendshapes.Reset();
// Accumulate blendshape contributions from all active visemes
for (const FName& VisemeName : VisemeNames)
{
const float VisemeWeight = SmoothedVisemes.FindOrAdd(VisemeName);
if (VisemeWeight < 0.001f) continue;
const TMap<FName, float>* Mapping = VisemeToBlendshapeMap.Find(VisemeName);
if (!Mapping) continue;
for (const auto& Pair : *Mapping)
{
float& BS = CurrentBlendshapes.FindOrAdd(Pair.Key);
BS += Pair.Value * VisemeWeight;
}
}
// Clamp all blendshape values to [0, 1]
for (auto& Pair : CurrentBlendshapes)
{
Pair.Value = FMath::Clamp(Pair.Value, 0.0f, 1.0f);
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Morph target application
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::ApplyMorphTargets()
{
if (!TargetMesh) return;
// DEBUG: log blendshape values periodically
static int32 ApplyCount = 0;
if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps
{
FString DebugStr;
for (const auto& Pair : CurrentBlendshapes)
{
if (Pair.Value > 0.01f)
{
DebugStr += FString::Printf(TEXT("%s=%.2f "), *Pair.Key.ToString(), Pair.Value);
}
}
if (DebugStr.Len() > 0)
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Blendshapes: %s"), *DebugStr);
}
}
// Apply morph targets directly.
// NOTE: For MetaHuman, the face AnimBP may override these values.
// In that case, use GetCurrentBlendshapes() in the AnimBP instead.
for (const auto& Pair : CurrentBlendshapes)
{
TargetMesh->SetMorphTarget(Pair.Key, Pair.Value);
}
}

View File

@ -62,6 +62,10 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
*/
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
// Non-dynamic delegate for raw agent audio (high-frequency, C++ consumers only).
// Delivers PCM chunks as int16, 16kHz mono, little-endian.
// Bind with AddUObject(...) and Remove(Handle) in EndPlay; not visible to Blueprint.
DECLARE_MULTICAST_DELEGATE_OneParam(FOnAgentAudioData, const TArray<uint8>& /*PCMData*/);
// ─────────────────────────────────────────────────────────────────────────────
// UElevenLabsConversationalAgentComponent
//
@ -195,6 +199,11 @@ public:
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
FOnAgentResponseTimeout OnAgentResponseTimeout;
	// ── Raw audio data (C++ only, used by LipSync component) ────────────────
	/** Raw PCM audio from the agent (int16, 16kHz mono). Fires for each WebSocket audio chunk.
	 * Used internally by UElevenLabsLipSyncComponent for spectral analysis.
	 * Not a UPROPERTY: this is a non-dynamic delegate, so it is invisible to Blueprint. */
	FOnAgentAudioData OnAgentAudioData;
// ── Control ───────────────────────────────────────────────────────────────
/**

View File

@ -0,0 +1,139 @@
// Copyright ASTERION. All Rights Reserved.
#pragma once
#include "CoreMinimal.h"
#include "Components/ActorComponent.h"
#include "DSP/SpectrumAnalyzer.h"
#include "ElevenLabsLipSyncComponent.generated.h"
class UElevenLabsConversationalAgentComponent;
class USkeletalMeshComponent;
// Fired every tick when viseme/blendshape data has been updated.
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);
/**
* Real-time lip sync component for ElevenLabs Conversational AI.
*
* Attaches to the same Actor as the Conversational Agent component.
* Receives the agent's audio stream, performs spectral analysis,
* estimates 15 OVR viseme weights, maps them to ARKit blendshapes
* (MetaHuman compatible), and optionally auto-applies morph targets.
*
* Usage:
* 1. Add this component alongside the Conversational Agent component.
* 2. (Optional) Set TargetMesh to the MetaHuman Face skeletal mesh.
* 3. Once the conversation starts, lip sync runs automatically.
* 4. (Optional) Bind OnVisemesReady for custom Blueprint handling.
*/
UCLASS(ClassGroup = "ElevenLabs", meta = (BlueprintSpawnableComponent),
	DisplayName = "ElevenLabs Lip Sync")
class PS_AI_AGENT_ELEVENLABS_API UElevenLabsLipSyncComponent : public UActorComponent
{
	GENERATED_BODY()

public:
	UElevenLabsLipSyncComponent();
	// Defined out-of-line (= default) in the .cpp so the TUniquePtr member is
	// destroyed in that translation unit.
	~UElevenLabsLipSyncComponent();

	// ── Configuration ─────────────────────────────────────────────────────────
	/** Target skeletal mesh to auto-apply morph targets. Leave empty to handle
	 * visemes manually via OnVisemesReady + GetCurrentBlendshapes(). */
	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
		meta = (ToolTip = "Skeletal mesh to drive morph targets on.\nLeave empty to read values manually via GetCurrentBlendshapes()."))
	TObjectPtr<USkeletalMeshComponent> TargetMesh;

	/** Overall mouth movement intensity multiplier. */
	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
		meta = (ClampMin = "0.0", ClampMax = "3.0",
			ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
	float LipSyncStrength = 1.0f;

	/** How quickly viseme weights interpolate towards new values each frame. */
	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
		meta = (ClampMin = "1.0", ClampMax = "100.0",
			ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
	float SmoothingSpeed = 20.0f;

	// ── Events ────────────────────────────────────────────────────────────────
	/** Fires every tick when viseme data has been updated.
	 * Use GetCurrentVisemes() or GetCurrentBlendshapes() to read values. */
	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|LipSync",
		meta = (ToolTip = "Fires each frame with updated viseme data.\nCall GetCurrentVisemes() or GetCurrentBlendshapes() to read values."))
	FOnElevenLabsVisemesReady OnVisemesReady;

	// ── Getters ───────────────────────────────────────────────────────────────
	/** Get current OVR viseme weights (15 values: sil, PP, FF, TH, DD, kk, CH, SS, nn, RR, aa, E, ih, oh, ou).
	 * Returns a copy — safe to hold across frames. */
	UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
	TMap<FName, float> GetCurrentVisemes() const { return SmoothedVisemes; }

	/** Get current ARKit blendshape weights (MetaHuman compatible: jawOpen, mouthFunnel, mouthClose, etc.).
	 * Returns a copy — safe to hold across frames. */
	UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
	TMap<FName, float> GetCurrentBlendshapes() const { return CurrentBlendshapes; }

	// ── UActorComponent overrides ─────────────────────────────────────────────
	virtual void BeginPlay() override;
	virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;
	virtual void TickComponent(float DeltaTime, ELevelTick TickType,
		FActorComponentTickFunction* ThisTickFunction) override;

private:
	// ── Audio analysis pipeline ───────────────────────────────────────────────
	/** Receives raw PCM from the agent component. */
	void OnAudioChunkReceived(const TArray<uint8>& PCMData);
	/** Extract frequency band energies from the spectrum analyzer. */
	void AnalyzeSpectrum();
	/** Map frequency band energies to 15 OVR viseme target weights. */
	void EstimateVisemes(float TotalEnergy, float F1Energy, float F2Energy,
		float F3Energy, float SibilantEnergy);
	/** Convert smoothed OVR visemes to ARKit blendshape weights. */
	void MapVisemesToBlendshapes();
	/** Apply CurrentBlendshapes to TargetMesh morph targets. */
	void ApplyMorphTargets();
	/** Sample the spectrum magnitude across a frequency range. */
	float GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples = 8) const;

	// ── State ─────────────────────────────────────────────────────────────────
	TUniquePtr<Audio::FSpectrumAnalyzer> SpectrumAnalyzer;
	// Reused float buffer for int16→float conversion (avoid per-chunk allocations)
	TArray<float> FloatBuffer;
	// Target viseme weights (set by spectral analysis, not yet smoothed)
	TMap<FName, float> TargetVisemes;
	// Smoothed viseme weights (interpolated each tick, exposed via GetCurrentVisemes)
	TMap<FName, float> SmoothedVisemes;
	// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
	TMap<FName, float> CurrentBlendshapes;
	// Whether we have pending analysis results to process
	// NOTE(review): not read or written anywhere in the .cpp — candidate for
	// removal once the WIP debugging phase is over; confirm before deleting.
	bool bHasPendingAnalysis = false;
	// Cached reference to the agent component on the same Actor
	TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
	FDelegateHandle AudioDataHandle;

	// ── Static data ───────────────────────────────────────────────────────────
	/** OVR Viseme names (15 standard visemes). */
	static const TArray<FName> VisemeNames;
	/** Initialize OVR→ARKit blendshape mapping table. */
	static TMap<FName, TMap<FName, float>> CreateVisemeToBlendshapeMap();
	/** Cached mapping: OVR viseme name → { ARKit blendshape name → weight }. */
	static const TMap<FName, TMap<FName, float>> VisemeToBlendshapeMap;
};