WIP: Add ElevenLabsLipSyncComponent with spectral analysis lip sync
Real-time lip sync component that performs client-side spectral analysis on the agent's PCM audio stream (ElevenLabs doesn't provide viseme data). Pipeline: 512-point FFT (16kHz) → 5 frequency bands → 15 OVR visemes → ARKit blendshapes (MetaHuman compatible) → auto-apply morph targets. Currently uses SetMorphTarget() which may be overridden by MetaHuman's Face AnimBP — face animation not yet working. Debug logs added to diagnose: audio flow, spectrum energy, morph target name matching. Next steps: verify debug output, fix MetaHuman morph target override (likely needs AnimBP integration like Convai approach). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
52f75f884b
commit
224af6a27b
@ -426,6 +426,8 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
|
|||||||
// Handles a decoded PCM chunk from the WebSocket: queues it for playback and
// fans it out to raw-audio listeners.
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
{
    EnqueueAgentAudio(PCMData);

    // Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
    OnAgentAudioData.Broadcast(PCMData);
}
|
||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
||||||
|
|||||||
@ -0,0 +1,663 @@
|
|||||||
|
// Copyright ASTERION. All Rights Reserved.
|
||||||
|
|
||||||
|
#include "ElevenLabsLipSyncComponent.h"
|
||||||
|
#include "ElevenLabsConversationalAgentComponent.h"
|
||||||
|
#include "ElevenLabsDefinitions.h"
|
||||||
|
#include "Components/SkeletalMeshComponent.h"
|
||||||
|
#include "Engine/SkeletalMesh.h"
|
||||||
|
#include "Animation/AnimInstance.h"
|
||||||
|
#include "Animation/MorphTarget.h"
|
||||||
|
#include "GameFramework/Actor.h"
|
||||||
|
|
||||||
|
DEFINE_LOG_CATEGORY_STATIC(LogElevenLabsLipSync, Log, All);
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Static data
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// The 15 OVR viseme identifiers, in canonical OVR order:
// silence, consonant groups (PP..RR), then vowels (aa..ou).
const TArray<FName> UElevenLabsLipSyncComponent::VisemeNames = {
    FName("sil"), FName("PP"), FName("FF"), FName("TH"), FName("DD"),
    FName("kk"), FName("CH"), FName("SS"), FName("nn"), FName("RR"),
    FName("aa"), FName("E"), FName("ih"), FName("oh"), FName("ou")
};
|
||||||
|
|
||||||
|
// OVR Viseme → ARKit blendshape mapping.
|
||||||
|
// Each viseme activates a combination of ARKit morph targets with specific weights.
|
||||||
|
// These values are tuned for MetaHuman faces and can be adjusted per project.
|
||||||
|
// OVR Viseme → ARKit blendshape mapping.
// Each viseme activates a combination of ARKit morph targets with specific weights.
// These values are tuned for MetaHuman faces and can be adjusted per project.
TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap()
{
    TMap<FName, TMap<FName, float>> Map;

    // Local helper: register one viseme with its ARKit blendshape recipe.
    const auto AddViseme = [&Map](const TCHAR* Viseme, TMap<FName, float>&& Shapes)
    {
        Map.Add(FName(Viseme), MoveTemp(Shapes));
    };

    // sil — silence, mouth at rest
    AddViseme(TEXT("sil"), {});

    // PP — bilabial (P, B, M): lips pressed together
    AddViseme(TEXT("PP"), {
        { FName("mouthClose"), 0.7f },
        { FName("mouthPressLeft"), 0.3f },
        { FName("mouthPressRight"), 0.3f } });

    // FF — labiodental (F, V): lower lip tucked under upper teeth
    AddViseme(TEXT("FF"), {
        { FName("mouthShrugLower"), 0.5f },
        { FName("mouthUpperUpLeft"), 0.3f },
        { FName("mouthUpperUpRight"), 0.3f },
        { FName("jawOpen"), 0.1f } });

    // TH — dental (TH): tongue between teeth
    AddViseme(TEXT("TH"), {
        { FName("tongueOut"), 0.4f },
        { FName("jawOpen"), 0.15f } });

    // DD — alveolar (D, T, N): tongue on alveolar ridge
    AddViseme(TEXT("DD"), {
        { FName("jawOpen"), 0.25f },
        { FName("mouthClose"), 0.2f },
        { FName("mouthLowerDownLeft"), 0.15f },
        { FName("mouthLowerDownRight"), 0.15f } });

    // kk — velar (K, G): back of tongue raised
    AddViseme(TEXT("kk"), {
        { FName("jawOpen"), 0.25f },
        { FName("mouthStretchLeft"), 0.15f },
        { FName("mouthStretchRight"), 0.15f } });

    // CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge
    AddViseme(TEXT("CH"), {
        { FName("mouthFunnel"), 0.45f },
        { FName("jawOpen"), 0.2f },
        { FName("mouthPucker"), 0.15f } });

    // SS — alveolar fricative (S, Z): air through narrow channel
    AddViseme(TEXT("SS"), {
        { FName("mouthStretchLeft"), 0.4f },
        { FName("mouthStretchRight"), 0.4f },
        { FName("jawOpen"), 0.1f },
        { FName("mouthSmileLeft"), 0.15f },
        { FName("mouthSmileRight"), 0.15f } });

    // nn — nasal (N, M, NG): soft palate lowered
    AddViseme(TEXT("nn"), {
        { FName("jawOpen"), 0.15f },
        { FName("mouthClose"), 0.2f },
        { FName("mouthPressLeft"), 0.1f },
        { FName("mouthPressRight"), 0.1f } });

    // RR — retroflex/rhotic (R, L): tongue curled or lateral
    AddViseme(TEXT("RR"), {
        { FName("mouthFunnel"), 0.3f },
        { FName("jawOpen"), 0.2f },
        { FName("mouthRollLower"), 0.15f } });

    // aa — open vowel (A as in "father"): wide open jaw
    AddViseme(TEXT("aa"), {
        { FName("jawOpen"), 0.7f },
        { FName("mouthLowerDownLeft"), 0.4f },
        { FName("mouthLowerDownRight"), 0.4f },
        { FName("mouthShrugUpper"), 0.1f } });

    // E — mid front vowel (E as in "bed"): mid-open, spread lips
    AddViseme(TEXT("E"), {
        { FName("jawOpen"), 0.4f },
        { FName("mouthSmileLeft"), 0.3f },
        { FName("mouthSmileRight"), 0.3f },
        { FName("mouthLowerDownLeft"), 0.2f },
        { FName("mouthLowerDownRight"), 0.2f } });

    // ih — close front vowel (I as in "sit"): narrow opening, spread lips
    AddViseme(TEXT("ih"), {
        { FName("jawOpen"), 0.2f },
        { FName("mouthSmileLeft"), 0.25f },
        { FName("mouthSmileRight"), 0.25f },
        { FName("mouthStretchLeft"), 0.1f },
        { FName("mouthStretchRight"), 0.1f } });

    // oh — mid back vowel (O as in "go"): rounded lips, open jaw
    AddViseme(TEXT("oh"), {
        { FName("jawOpen"), 0.5f },
        { FName("mouthFunnel"), 0.5f },
        { FName("mouthLowerDownLeft"), 0.2f },
        { FName("mouthLowerDownRight"), 0.2f } });

    // ou — close back vowel (OO as in "boot"): tightly rounded lips
    AddViseme(TEXT("ou"), {
        { FName("mouthPucker"), 0.6f },
        { FName("mouthFunnel"), 0.4f },
        { FName("jawOpen"), 0.15f } });

    return Map;
}
|
||||||
|
|
||||||
|
// Built once at static-initialization time.
// NOTE(review): this constructs FNames before engine start-up — appears to work,
// but consider lazy (function-local static) initialization if init-order issues arise.
const TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::VisemeToBlendshapeMap =
    UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap();
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Constructor / Destructor
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
UElevenLabsLipSyncComponent::UElevenLabsLipSyncComponent()
{
    // Tick at 60 fps so viseme smoothing stays visually fluid.
    PrimaryComponentTick.bCanEverTick = true;
    PrimaryComponentTick.TickInterval = 1.0f / 60.0f;

    // Seed both viseme maps: everything at zero except "sil" (the rest pose),
    // which starts fully active.
    const FName SilName("sil");
    for (const FName& VisemeName : VisemeNames)
    {
        const float InitialWeight = (VisemeName == SilName) ? 1.0f : 0.0f;
        TargetVisemes.Add(VisemeName, InitialWeight);
        SmoothedVisemes.Add(VisemeName, InitialWeight);
    }
}
|
||||||
|
|
||||||
|
// Defaulted out-of-line (declared in the header) so the destructor is emitted
// in this translation unit, where member types are fully defined.
UElevenLabsLipSyncComponent::~UElevenLabsLipSyncComponent() = default;
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Lifecycle
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::BeginPlay()
|
||||||
|
{
|
||||||
|
Super::BeginPlay();
|
||||||
|
|
||||||
|
// Create the spectrum analyzer (512-point FFT, Hann window, 16kHz)
|
||||||
|
Audio::FSpectrumAnalyzerSettings Settings;
|
||||||
|
Settings.FFTSize = Audio::FSpectrumAnalyzerSettings::EFFTSize::Medium_512;
|
||||||
|
Settings.WindowType = Audio::EWindowType::Hann;
|
||||||
|
SpectrumAnalyzer = MakeUnique<Audio::FSpectrumAnalyzer>(
|
||||||
|
Settings, static_cast<float>(ElevenLabsAudio::SampleRate));
|
||||||
|
|
||||||
|
// Auto-discover the agent component on the same actor
|
||||||
|
AActor* Owner = GetOwner();
|
||||||
|
if (!Owner) return;
|
||||||
|
|
||||||
|
UElevenLabsConversationalAgentComponent* Agent =
|
||||||
|
Owner->FindComponentByClass<UElevenLabsConversationalAgentComponent>();
|
||||||
|
|
||||||
|
if (Agent)
|
||||||
|
{
|
||||||
|
AgentComponent = Agent;
|
||||||
|
AudioDataHandle = Agent->OnAgentAudioData.AddUObject(
|
||||||
|
this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived);
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Warning,
|
||||||
|
TEXT("No ElevenLabsConversationalAgentComponent found on %s. Lip sync will not work."),
|
||||||
|
*Owner->GetName());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Auto-detect TargetMesh if not set manually.
|
||||||
|
// Search for a SkeletalMeshComponent named "Face" (MetaHuman convention),
|
||||||
|
// then fall back to the first SkeletalMeshComponent found on the actor.
|
||||||
|
if (!TargetMesh)
|
||||||
|
{
|
||||||
|
TArray<USkeletalMeshComponent*> SkeletalMeshes;
|
||||||
|
Owner->GetComponents<USkeletalMeshComponent>(SkeletalMeshes);
|
||||||
|
|
||||||
|
// First pass: look for a component named "Face" (MetaHuman face mesh)
|
||||||
|
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
|
||||||
|
{
|
||||||
|
if (Mesh && Mesh->GetFName().ToString().Contains(TEXT("Face")))
|
||||||
|
{
|
||||||
|
TargetMesh = Mesh;
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected face mesh: %s"), *Mesh->GetName());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second pass: fall back to the first skeletal mesh with morph targets
|
||||||
|
if (!TargetMesh)
|
||||||
|
{
|
||||||
|
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
|
||||||
|
{
|
||||||
|
if (Mesh)
|
||||||
|
{
|
||||||
|
TargetMesh = Mesh;
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected skeletal mesh (fallback): %s"), *Mesh->GetName());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!TargetMesh)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Warning,
|
||||||
|
TEXT("No SkeletalMeshComponent found on %s. Set TargetMesh manually or use GetCurrentBlendshapes() in Blueprint."),
|
||||||
|
*Owner->GetName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DEBUG: list available morph targets on the target mesh
|
||||||
|
if (TargetMesh && TargetMesh->GetSkeletalMeshAsset())
|
||||||
|
{
|
||||||
|
const TArray<UMorphTarget*>& MorphTargets = TargetMesh->GetSkeletalMeshAsset()->GetMorphTargets();
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("TargetMesh '%s' has %d morph targets."),
|
||||||
|
*TargetMesh->GetName(), MorphTargets.Num());
|
||||||
|
|
||||||
|
// Log first 20 morph target names to verify ARKit naming
|
||||||
|
FString Names;
|
||||||
|
int32 Count = 0;
|
||||||
|
for (const UMorphTarget* MT : MorphTargets)
|
||||||
|
{
|
||||||
|
if (MT)
|
||||||
|
{
|
||||||
|
if (Count > 0) Names += TEXT(", ");
|
||||||
|
Names += MT->GetName();
|
||||||
|
if (++Count >= 20) { Names += TEXT(" ..."); break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Count > 0)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Morph target sample: %s"), *Names);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify our blendshape names exist as morph targets on this mesh
|
||||||
|
TArray<FName> TestNames = { FName("jawOpen"), FName("mouthClose"), FName("mouthFunnel") };
|
||||||
|
for (const FName& TestName : TestNames)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
for (const UMorphTarget* MT : MorphTargets)
|
||||||
|
{
|
||||||
|
if (MT && MT->GetFName() == TestName)
|
||||||
|
{
|
||||||
|
bFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT(" Morph target '%s': %s"),
|
||||||
|
*TestName.ToString(), bFound ? TEXT("FOUND") : TEXT("NOT FOUND"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tears down the audio subscription and analyzer before the component dies,
// so no PCM callback can arrive on a half-destructed object.
void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
{
    // Unbind from agent component (only if both the component and our handle are still valid)
    if (AgentComponent.IsValid() && AudioDataHandle.IsValid())
    {
        AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
        AudioDataHandle.Reset();
    }
    AgentComponent.Reset();
    SpectrumAnalyzer.Reset();

    Super::EndPlay(EndPlayReason);
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Tick — smooth visemes and apply morph targets
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick TickType,
|
||||||
|
FActorComponentTickFunction* ThisTickFunction)
|
||||||
|
{
|
||||||
|
Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
|
||||||
|
|
||||||
|
// Smooth viseme weights towards targets using exponential interpolation
|
||||||
|
const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f);
|
||||||
|
bool bAnyNonZero = false;
|
||||||
|
|
||||||
|
for (const FName& Name : VisemeNames)
|
||||||
|
{
|
||||||
|
float& Current = SmoothedVisemes.FindOrAdd(Name);
|
||||||
|
const float Target = TargetVisemes.FindOrAdd(Name);
|
||||||
|
|
||||||
|
Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha);
|
||||||
|
|
||||||
|
// Snap to zero to avoid infinite tiny values
|
||||||
|
if (Current < 0.001f) Current = 0.0f;
|
||||||
|
if (Current > 0.001f) bAnyNonZero = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// "sil" uses LipSyncStrength=1 always — it's the rest pose
|
||||||
|
SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp(
|
||||||
|
SmoothedVisemes.FindOrAdd(FName("sil")),
|
||||||
|
TargetVisemes.FindOrAdd(FName("sil")),
|
||||||
|
Alpha);
|
||||||
|
|
||||||
|
// Convert visemes to ARKit blendshapes
|
||||||
|
MapVisemesToBlendshapes();
|
||||||
|
|
||||||
|
// Auto-apply morph targets if a target mesh is set
|
||||||
|
if (TargetMesh)
|
||||||
|
{
|
||||||
|
ApplyMorphTargets();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Notify Blueprint listeners
|
||||||
|
if (bAnyNonZero || CurrentBlendshapes.Num() > 0)
|
||||||
|
{
|
||||||
|
OnVisemesReady.Broadcast();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Audio analysis
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMData)
|
||||||
|
{
|
||||||
|
if (!SpectrumAnalyzer) return;
|
||||||
|
|
||||||
|
// Convert int16 PCM to float32 [-1, 1]
|
||||||
|
const int16* Samples = reinterpret_cast<const int16*>(PCMData.GetData());
|
||||||
|
const int32 NumSamples = PCMData.Num() / sizeof(int16);
|
||||||
|
|
||||||
|
// DEBUG: log first audio chunk received
|
||||||
|
static bool bFirstChunkLogged = false;
|
||||||
|
if (!bFirstChunkLogged)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
|
||||||
|
bFirstChunkLogged = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
FloatBuffer.Reset(NumSamples);
|
||||||
|
for (int32 i = 0; i < NumSamples; ++i)
|
||||||
|
{
|
||||||
|
FloatBuffer.Add(static_cast<float>(Samples[i]) / 32768.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Feed to rolling FFT analyzer
|
||||||
|
SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples);
|
||||||
|
|
||||||
|
// Try to perform analysis (returns true when enough data for one FFT window)
|
||||||
|
if (SpectrumAnalyzer->PerformAnalysisIfPossible(true))
|
||||||
|
{
|
||||||
|
AnalyzeSpectrum();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reads the latest FFT frame (under the analyzer's scope lock) and reduces it
// to five speech-relevant band energies, which drive viseme estimation.
void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
{
    if (!SpectrumAnalyzer) return;

    // Lock the analyzer's result buffer for the duration of the band reads below.
    Audio::FSpectrumAnalyzerScopeLock Lock(SpectrumAnalyzer.Get());

    // Extract energy in frequency bands relevant for speech phoneme classification.
    // Band boundaries chosen based on speech formant ranges.
    const float VoiceEnergy = GetBandEnergy(80.0f, 400.0f);       // Fundamental frequency
    const float F1Energy = GetBandEnergy(300.0f, 800.0f);         // First formant → jaw openness
    const float F2Energy = GetBandEnergy(800.0f, 2500.0f);        // Second formant → vowel front/back
    const float F3Energy = GetBandEnergy(2500.0f, 4000.0f);       // Third formant → liquids, nasals
    const float SibilantEnergy = GetBandEnergy(4000.0f, 7500.0f); // Fricative/sibilant energy

    const float TotalEnergy = VoiceEnergy + F1Energy + F2Energy + F3Energy + SibilantEnergy;

    // DEBUG: log energy levels periodically.
    // NOTE(review): function-local static — the counter is shared across all
    // component instances; fine for throttled diagnostic logging.
    static int32 AnalysisCount = 0;
    if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
    {
        UE_LOG(LogElevenLabsLipSync, Log,
            TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
            TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
    }

    EstimateVisemes(TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
}
|
||||||
|
|
||||||
|
float UElevenLabsLipSyncComponent::GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples) const
|
||||||
|
{
|
||||||
|
if (!SpectrumAnalyzer || NumSamples <= 0) return 0.0f;
|
||||||
|
|
||||||
|
float Total = 0.0f;
|
||||||
|
const float Step = (HighFreq - LowFreq) / static_cast<float>(NumSamples);
|
||||||
|
|
||||||
|
for (int32 i = 0; i < NumSamples; ++i)
|
||||||
|
{
|
||||||
|
const float Freq = LowFreq + Step * (static_cast<float>(i) + 0.5f);
|
||||||
|
Total += SpectrumAnalyzer->GetMagnitudeForFrequency(Freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Total / static_cast<float>(NumSamples);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Viseme estimation from spectral analysis
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Heuristic phoneme classification from five spectral band energies.
// Writes TargetVisemes; multiple visemes may be active simultaneously and are
// later blended by TickComponent via SmoothedVisemes.
// NOTE(review): all thresholds below are empirically tuned — confirm against
// real agent audio before adjusting.
void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy,
    float F1Energy, float F2Energy, float F3Energy, float SibilantEnergy)
{
    // Reset all visemes to zero
    for (const FName& Name : VisemeNames)
    {
        TargetVisemes.FindOrAdd(Name) = 0.0f;
    }

    // Silence threshold — below this, mouth is closed
    constexpr float SilenceThreshold = 0.002f;

    if (TotalEnergy < SilenceThreshold)
    {
        TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
        return;
    }

    // Normalize band energies relative to total (Max guards divide-by-zero)
    const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f);
    const float NormF1 = F1Energy * InvTotal;
    const float NormF2 = F2Energy * InvTotal;
    const float NormF3 = F3Energy * InvTotal;
    const float NormSibilant = SibilantEnergy * InvTotal;

    // Energy-based intensity (how "loud" the speech is — drives overall jaw opening)
    // Scale to a usable 0-1 range. The constant is empirically tuned.
    const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f);

    // ── Classification based on spectral shape ───────────────────────────────
    // The approach: compute "votes" for each viseme category based on where
    // the spectral energy is concentrated. Multiple visemes can be active
    // simultaneously (blended).

    // Fricatives / sibilants: high-frequency energy dominates
    if (NormSibilant > 0.25f)
    {
        const float FricativeWeight = NormSibilant * Intensity;
        // Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq)
        if (NormF3 > NormF2)
        {
            TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight;
        }
        else
        {
            TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f;
            TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f;
        }
        // F/V component
        TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f;
    }

    // Voiced speech: most energy in voice + F1 + F2.
    // Overlaps the fricative branch for 0.25 < NormSibilant < 0.5, so both can
    // contribute — an intentional blend.
    if (NormSibilant < 0.5f)
    {
        const float VoicedWeight = (1.0f - NormSibilant) * Intensity;

        // Open vowels: strong F1 = wide jaw opening
        if (NormF1 > 0.3f)
        {
            if (NormF2 > 0.35f)
            {
                // High F2 + high F1 → front open vowel (A as in "cat")
                TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1;
            }
            else
            {
                // Low F2 + high F1 → back open vowel (O as in "go")
                TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f;
                TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f;
            }
        }

        // Mid vowels: moderate F1
        if (NormF1 > 0.15f && NormF1 <= 0.3f)
        {
            if (NormF2 > 0.4f)
            {
                // High F2 → front mid vowel (E as in "bed")
                TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f;
            }
            else
            {
                // Low F2 → rounded mid vowel
                TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f;
            }
        }

        // Close vowels: weak F1
        if (NormF1 <= 0.15f && NormF2 > 0.0f)
        {
            if (NormF2 > 0.4f)
            {
                // High F2 → front close vowel (I as in "see")
                TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f;
            }
            else
            {
                // Low F2 → back close vowel (OO as in "boot")
                TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f;
            }
        }

        // Nasals / liquids: prominent F3 with low sibilant
        if (NormF3 > 0.2f && NormSibilant < 0.15f)
        {
            if (NormF1 < 0.2f)
            {
                TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f;
            }
            else
            {
                TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f;
            }
        }

        // Plosive detection: very low F1 with moderate energy = lips/tongue closed
        if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f)
        {
            TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f;
            TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f;
        }
    }

    // TH detection: moderate sibilant + moderate F3 (dental fricative)
    if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f)
    {
        TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f;
    }

    // Ensure at least some silence weight when energy is very low
    if (Intensity < 0.1f)
    {
        TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f;
    }
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Viseme → ARKit blendshape mapping
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::MapVisemesToBlendshapes()
|
||||||
|
{
|
||||||
|
CurrentBlendshapes.Reset();
|
||||||
|
|
||||||
|
// Accumulate blendshape contributions from all active visemes
|
||||||
|
for (const FName& VisemeName : VisemeNames)
|
||||||
|
{
|
||||||
|
const float VisemeWeight = SmoothedVisemes.FindOrAdd(VisemeName);
|
||||||
|
if (VisemeWeight < 0.001f) continue;
|
||||||
|
|
||||||
|
const TMap<FName, float>* Mapping = VisemeToBlendshapeMap.Find(VisemeName);
|
||||||
|
if (!Mapping) continue;
|
||||||
|
|
||||||
|
for (const auto& Pair : *Mapping)
|
||||||
|
{
|
||||||
|
float& BS = CurrentBlendshapes.FindOrAdd(Pair.Key);
|
||||||
|
BS += Pair.Value * VisemeWeight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clamp all blendshape values to [0, 1]
|
||||||
|
for (auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
Pair.Value = FMath::Clamp(Pair.Value, 0.0f, 1.0f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Morph target application
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::ApplyMorphTargets()
|
||||||
|
{
|
||||||
|
if (!TargetMesh) return;
|
||||||
|
|
||||||
|
// DEBUG: log blendshape values periodically
|
||||||
|
static int32 ApplyCount = 0;
|
||||||
|
if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps
|
||||||
|
{
|
||||||
|
FString DebugStr;
|
||||||
|
for (const auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
if (Pair.Value > 0.01f)
|
||||||
|
{
|
||||||
|
DebugStr += FString::Printf(TEXT("%s=%.2f "), *Pair.Key.ToString(), Pair.Value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (DebugStr.Len() > 0)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Blendshapes: %s"), *DebugStr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply morph targets directly.
|
||||||
|
// NOTE: For MetaHuman, the face AnimBP may override these values.
|
||||||
|
// In that case, use GetCurrentBlendshapes() in the AnimBP instead.
|
||||||
|
for (const auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
TargetMesh->SetMorphTarget(Pair.Key, Pair.Value);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -62,6 +62,10 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
|
|||||||
*/
|
*/
|
||||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
|
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
|
||||||
|
|
||||||
|
// Non-dynamic delegate for raw agent audio (high-frequency, C++ consumers only).
// Delivers PCM chunks as int16, 16kHz mono, little-endian.
// Declared with DECLARE_MULTICAST_DELEGATE (not the DYNAMIC variant) so it is
// not exposed to Blueprint and can pass the payload by const reference.
DECLARE_MULTICAST_DELEGATE_OneParam(FOnAgentAudioData, const TArray<uint8>& /*PCMData*/);
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// UElevenLabsConversationalAgentComponent
|
// UElevenLabsConversationalAgentComponent
|
||||||
//
|
//
|
||||||
@ -195,6 +199,11 @@ public:
|
|||||||
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
|
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
|
||||||
FOnAgentResponseTimeout OnAgentResponseTimeout;
|
FOnAgentResponseTimeout OnAgentResponseTimeout;
|
||||||
|
|
||||||
|
// ── Raw audio data (C++ only, used by LipSync component) ────────────────
|
||||||
|
/** Raw PCM audio from the agent (int16, 16kHz mono). Fires for each WebSocket audio chunk.
|
||||||
|
* Used internally by UElevenLabsLipSyncComponent for spectral analysis. */
|
||||||
|
FOnAgentAudioData OnAgentAudioData;
|
||||||
|
|
||||||
// ── Control ───────────────────────────────────────────────────────────────
|
// ── Control ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -0,0 +1,139 @@
|
|||||||
|
// Copyright ASTERION. All Rights Reserved.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "CoreMinimal.h"
|
||||||
|
#include "Components/ActorComponent.h"
|
||||||
|
#include "DSP/SpectrumAnalyzer.h"
|
||||||
|
#include "ElevenLabsLipSyncComponent.generated.h"
|
||||||
|
|
||||||
|
class UElevenLabsConversationalAgentComponent;
|
||||||
|
class USkeletalMeshComponent;
|
||||||
|
|
||||||
|
// Fired every tick when viseme/blendshape data has been updated.
|
||||||
|
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Real-time lip sync component for ElevenLabs Conversational AI.
|
||||||
|
*
|
||||||
|
* Attaches to the same Actor as the Conversational Agent component.
|
||||||
|
* Receives the agent's audio stream, performs spectral analysis,
|
||||||
|
* estimates 15 OVR viseme weights, maps them to ARKit blendshapes
|
||||||
|
* (MetaHuman compatible), and optionally auto-applies morph targets.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* 1. Add this component alongside the Conversational Agent component.
|
||||||
|
* 2. (Optional) Set TargetMesh to the MetaHuman Face skeletal mesh.
|
||||||
|
* 3. Conversation starts → lip sync works automatically.
|
||||||
|
* 4. (Optional) Bind OnVisemesReady for custom Blueprint handling.
|
||||||
|
*/
|
||||||
|
UCLASS(ClassGroup = "ElevenLabs", meta = (BlueprintSpawnableComponent),
|
||||||
|
DisplayName = "ElevenLabs Lip Sync")
|
||||||
|
class PS_AI_AGENT_ELEVENLABS_API UElevenLabsLipSyncComponent : public UActorComponent
|
||||||
|
{
|
||||||
|
GENERATED_BODY()
|
||||||
|
|
||||||
|
public:
|
||||||
|
UElevenLabsLipSyncComponent();
|
||||||
|
~UElevenLabsLipSyncComponent();
|
||||||
|
|
||||||
|
// ── Configuration ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Target skeletal mesh to auto-apply morph targets. Leave empty to handle
|
||||||
|
* visemes manually via OnVisemesReady + GetCurrentBlendshapes(). */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ToolTip = "Skeletal mesh to drive morph targets on.\nLeave empty to read values manually via GetCurrentBlendshapes()."))
|
||||||
|
TObjectPtr<USkeletalMeshComponent> TargetMesh;
|
||||||
|
|
||||||
|
/** Overall mouth movement intensity multiplier. */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ClampMin = "0.0", ClampMax = "3.0",
|
||||||
|
ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
|
||||||
|
float LipSyncStrength = 1.0f;
|
||||||
|
|
||||||
|
/** How quickly viseme weights interpolate towards new values each frame. */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ClampMin = "1.0", ClampMax = "100.0",
|
||||||
|
ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
|
||||||
|
float SmoothingSpeed = 20.0f;
|
||||||
|
|
||||||
|
// ── Events ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Fires every tick when viseme data has been updated.
|
||||||
|
* Use GetCurrentVisemes() or GetCurrentBlendshapes() to read values. */
|
||||||
|
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ToolTip = "Fires each frame with updated viseme data.\nCall GetCurrentVisemes() or GetCurrentBlendshapes() to read values."))
|
||||||
|
FOnElevenLabsVisemesReady OnVisemesReady;
|
||||||
|
|
||||||
|
// ── Getters ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Get current OVR viseme weights (15 values: sil, PP, FF, TH, DD, kk, CH, SS, nn, RR, aa, E, ih, oh, ou). */
|
||||||
|
UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
|
||||||
|
TMap<FName, float> GetCurrentVisemes() const { return SmoothedVisemes; }
|
||||||
|
|
||||||
|
/** Get current ARKit blendshape weights (MetaHuman compatible: jawOpen, mouthFunnel, mouthClose, etc.). */
|
||||||
|
UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
|
||||||
|
TMap<FName, float> GetCurrentBlendshapes() const { return CurrentBlendshapes; }
|
||||||
|
|
||||||
|
// ── UActorComponent overrides ─────────────────────────────────────────────
|
||||||
|
virtual void BeginPlay() override;
|
||||||
|
virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;
|
||||||
|
virtual void TickComponent(float DeltaTime, ELevelTick TickType,
|
||||||
|
FActorComponentTickFunction* ThisTickFunction) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// ── Audio analysis pipeline ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Receives raw PCM from the agent component. */
|
||||||
|
void OnAudioChunkReceived(const TArray<uint8>& PCMData);
|
||||||
|
|
||||||
|
/** Extract frequency band energies from the spectrum analyzer. */
|
||||||
|
void AnalyzeSpectrum();
|
||||||
|
|
||||||
|
/** Map frequency band energies to 15 OVR viseme target weights. */
|
||||||
|
void EstimateVisemes(float TotalEnergy, float F1Energy, float F2Energy,
|
||||||
|
float F3Energy, float SibilantEnergy);
|
||||||
|
|
||||||
|
/** Convert smoothed OVR visemes to ARKit blendshape weights. */
|
||||||
|
void MapVisemesToBlendshapes();
|
||||||
|
|
||||||
|
/** Apply CurrentBlendshapes to TargetMesh morph targets. */
|
||||||
|
void ApplyMorphTargets();
|
||||||
|
|
||||||
|
/** Sample the spectrum magnitude across a frequency range. */
|
||||||
|
float GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples = 8) const;
|
||||||
|
|
||||||
|
// ── State ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
TUniquePtr<Audio::FSpectrumAnalyzer> SpectrumAnalyzer;
|
||||||
|
|
||||||
|
// Reused float buffer for int16→float conversion (avoid per-chunk allocations)
|
||||||
|
TArray<float> FloatBuffer;
|
||||||
|
|
||||||
|
// Target viseme weights (set by spectral analysis, not yet smoothed)
|
||||||
|
TMap<FName, float> TargetVisemes;
|
||||||
|
|
||||||
|
// Smoothed viseme weights (interpolated each tick, exposed via GetCurrentVisemes)
|
||||||
|
TMap<FName, float> SmoothedVisemes;
|
||||||
|
|
||||||
|
// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
|
||||||
|
TMap<FName, float> CurrentBlendshapes;
|
||||||
|
|
||||||
|
// Whether we have pending analysis results to process
|
||||||
|
bool bHasPendingAnalysis = false;
|
||||||
|
|
||||||
|
// Cached reference to the agent component on the same Actor
|
||||||
|
TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
|
||||||
|
FDelegateHandle AudioDataHandle;
|
||||||
|
|
||||||
|
// ── Static data ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** OVR Viseme names (15 standard visemes). */
|
||||||
|
static const TArray<FName> VisemeNames;
|
||||||
|
|
||||||
|
/** Initialize OVR→ARKit blendshape mapping table. */
|
||||||
|
static TMap<FName, TMap<FName, float>> CreateVisemeToBlendshapeMap();
|
||||||
|
|
||||||
|
/** Cached mapping: OVR viseme name → { ARKit blendshape name → weight }. */
|
||||||
|
static const TMap<FName, TMap<FName, float>> VisemeToBlendshapeMap;
|
||||||
|
};
|
||||||
Loading…
x
Reference in New Issue
Block a user