From 224af6a27b191be76b2be55d96171ae271c9f220 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Sun, 22 Feb 2026 11:23:34 +0100 Subject: [PATCH] WIP: Add ElevenLabsLipSyncComponent with spectral analysis lip sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-time lip sync component that performs client-side spectral analysis on the agent's PCM audio stream (ElevenLabs doesn't provide viseme data). Pipeline: 512-point FFT (16kHz) → 5 frequency bands → 15 OVR visemes → ARKit blendshapes (MetaHuman compatible) → auto-apply morph targets. Currently uses SetMorphTarget() which may be overridden by MetaHuman's Face AnimBP — face animation not yet working. Debug logs added to diagnose: audio flow, spectrum energy, morph target name matching. Next steps: verify debug output, fix MetaHuman morph target override (likely needs AnimBP integration like Convai approach). Co-Authored-By: Claude Opus 4.6 --- ...ElevenLabsConversationalAgentComponent.cpp | 2 + .../Private/ElevenLabsLipSyncComponent.cpp | 663 ++++++++++++++++++ .../ElevenLabsConversationalAgentComponent.h | 9 + .../Public/ElevenLabsLipSyncComponent.h | 139 ++++ 4 files changed, 813 insertions(+) create mode 100644 Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp create mode 100644 Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index 812bb1b..c29d21f 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ 
b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -426,6 +426,8 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
 void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
 {
     EnqueueAgentAudio(PCMData);
+    // Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
+    OnAgentAudioData.Broadcast(PCMData);
 }
 
 void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
new file mode 100644
index 0000000..8531cf7
--- /dev/null
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
@@ -0,0 +1,663 @@
+// Copyright ASTERION. All Rights Reserved.
+
+#include "ElevenLabsLipSyncComponent.h"
+#include "ElevenLabsConversationalAgentComponent.h"
+#include "ElevenLabsDefinitions.h"
+#include "Components/SkeletalMeshComponent.h"
+#include "Engine/SkeletalMesh.h"
+#include "Animation/AnimInstance.h"
+#include "Animation/MorphTarget.h"
+#include "GameFramework/Actor.h"
+
+DEFINE_LOG_CATEGORY_STATIC(LogElevenLabsLipSync, Log, All);
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Static data
+// ─────────────────────────────────────────────────────────────────────────────
+
+// The 15 OVR viseme names. Every per-viseme loop in this file iterates this
+// array, so it also fixes the order in which visemes are processed.
+const TArray<FName> UElevenLabsLipSyncComponent::VisemeNames = {
+    FName("sil"), FName("PP"), FName("FF"), FName("TH"), FName("DD"),
+    FName("kk"), FName("CH"), FName("SS"), FName("nn"), FName("RR"),
+    FName("aa"), FName("E"), FName("ih"), FName("oh"), FName("ou")
+};
+
+// OVR Viseme → ARKit blendshape mapping.
+// Each viseme activates a combination of ARKit morph targets with specific weights.
+// These values are tuned for MetaHuman faces and can be adjusted per project.
+//
+// Returns a map: viseme name → { blendshape name → weight at full viseme strength }.
+// "sil" maps to an empty set, i.e. it contributes no blendshape at all.
+TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap()
+{
+    TMap<FName, TMap<FName, float>> Map;
+
+    // sil — silence, mouth at rest
+    Map.Add(FName("sil"), {});
+
+    // PP — bilabial (P, B, M): lips pressed together
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("mouthClose"), 0.7f);
+        BS.Add(FName("mouthPressLeft"), 0.3f);
+        BS.Add(FName("mouthPressRight"), 0.3f);
+        Map.Add(FName("PP"), BS);
+    }
+
+    // FF — labiodental (F, V): lower lip tucked under upper teeth
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("mouthShrugLower"), 0.5f);
+        BS.Add(FName("mouthUpperUpLeft"), 0.3f);
+        BS.Add(FName("mouthUpperUpRight"), 0.3f);
+        BS.Add(FName("jawOpen"), 0.1f);
+        Map.Add(FName("FF"), BS);
+    }
+
+    // TH — dental (TH): tongue between teeth
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("tongueOut"), 0.4f);
+        BS.Add(FName("jawOpen"), 0.15f);
+        Map.Add(FName("TH"), BS);
+    }
+
+    // DD — alveolar (D, T, N): tongue on alveolar ridge
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.25f);
+        BS.Add(FName("mouthClose"), 0.2f);
+        BS.Add(FName("mouthLowerDownLeft"), 0.15f);
+        BS.Add(FName("mouthLowerDownRight"), 0.15f);
+        Map.Add(FName("DD"), BS);
+    }
+
+    // kk — velar (K, G): back of tongue raised
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.25f);
+        BS.Add(FName("mouthStretchLeft"), 0.15f);
+        BS.Add(FName("mouthStretchRight"), 0.15f);
+        Map.Add(FName("kk"), BS);
+    }
+
+    // CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("mouthFunnel"), 0.45f);
+        BS.Add(FName("jawOpen"), 0.2f);
+        BS.Add(FName("mouthPucker"), 0.15f);
+        Map.Add(FName("CH"), BS);
+    }
+
+    // SS — alveolar fricative (S, Z): air through narrow channel
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("mouthStretchLeft"), 0.4f);
+        BS.Add(FName("mouthStretchRight"), 0.4f);
+        BS.Add(FName("jawOpen"), 0.1f);
+        BS.Add(FName("mouthSmileLeft"), 0.15f);
+        BS.Add(FName("mouthSmileRight"), 0.15f);
+        Map.Add(FName("SS"), BS);
+    }
+
+    // nn — nasal (N, M, NG): soft palate lowered
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.15f);
+        BS.Add(FName("mouthClose"), 0.2f);
+        BS.Add(FName("mouthPressLeft"), 0.1f);
+        BS.Add(FName("mouthPressRight"), 0.1f);
+        Map.Add(FName("nn"), BS);
+    }
+
+    // RR — retroflex/rhotic (R, L): tongue curled or lateral
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("mouthFunnel"), 0.3f);
+        BS.Add(FName("jawOpen"), 0.2f);
+        BS.Add(FName("mouthRollLower"), 0.15f);
+        Map.Add(FName("RR"), BS);
+    }
+
+    // aa — open vowel (A as in "father"): wide open jaw
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.7f);
+        BS.Add(FName("mouthLowerDownLeft"), 0.4f);
+        BS.Add(FName("mouthLowerDownRight"), 0.4f);
+        BS.Add(FName("mouthShrugUpper"), 0.1f);
+        Map.Add(FName("aa"), BS);
+    }
+
+    // E — mid front vowel (E as in "bed"): mid-open, spread lips
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.4f);
+        BS.Add(FName("mouthSmileLeft"), 0.3f);
+        BS.Add(FName("mouthSmileRight"), 0.3f);
+        BS.Add(FName("mouthLowerDownLeft"), 0.2f);
+        BS.Add(FName("mouthLowerDownRight"), 0.2f);
+        Map.Add(FName("E"), BS);
+    }
+
+    // ih — close front vowel (I as in "sit"): narrow opening, spread lips
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.2f);
+        BS.Add(FName("mouthSmileLeft"), 0.25f);
+        BS.Add(FName("mouthSmileRight"), 0.25f);
+        BS.Add(FName("mouthStretchLeft"), 0.1f);
+        BS.Add(FName("mouthStretchRight"), 0.1f);
+        Map.Add(FName("ih"), BS);
+    }
+
+    // oh — mid back vowel (O as in "go"): rounded lips, open jaw
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("jawOpen"), 0.5f);
+        BS.Add(FName("mouthFunnel"), 0.5f);
+        BS.Add(FName("mouthLowerDownLeft"), 0.2f);
+        BS.Add(FName("mouthLowerDownRight"), 0.2f);
+        Map.Add(FName("oh"), BS);
+    }
+
+    // ou — close back vowel (OO as in "boot"): tightly rounded lips
+    {
+        TMap<FName, float> BS;
+        BS.Add(FName("mouthPucker"), 0.6f);
+        BS.Add(FName("mouthFunnel"), 0.4f);
+        BS.Add(FName("jawOpen"), 0.15f);
+        Map.Add(FName("ou"), BS);
+    }
+
+    return Map;
+}
+
+const TMap<FName, TMap<FName, float>>
UElevenLabsLipSyncComponent::VisemeToBlendshapeMap =
+    UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap();
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Constructor / Destructor
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Enables ticking at a fixed 60 Hz interval and seeds both viseme maps with
+// every known viseme at 0, except "sil" (rest pose) which starts at 1.
+UElevenLabsLipSyncComponent::UElevenLabsLipSyncComponent()
+{
+    PrimaryComponentTick.bCanEverTick = true;
+    PrimaryComponentTick.TickInterval = 1.0f / 60.0f; // 60 fps for smooth animation
+
+    // Initialize viseme maps with all names at zero
+    for (const FName& Name : VisemeNames)
+    {
+        TargetVisemes.Add(Name, 0.0f);
+        SmoothedVisemes.Add(Name, 0.0f);
+    }
+    TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
+    SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f;
+}
+
+UElevenLabsLipSyncComponent::~UElevenLabsLipSyncComponent() = default;
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Lifecycle
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Creates the spectrum analyzer, binds to the agent component found on the
+// owning actor, auto-detects TargetMesh when it was not set, and logs a sample
+// of the mesh's morph target names for diagnostics.
+void UElevenLabsLipSyncComponent::BeginPlay()
+{
+    Super::BeginPlay();
+
+    // Create the spectrum analyzer (512-point FFT, Hann window, 16kHz)
+    Audio::FSpectrumAnalyzerSettings Settings;
+    Settings.FFTSize = Audio::FSpectrumAnalyzerSettings::EFFTSize::Medium_512;
+    Settings.WindowType = Audio::EWindowType::Hann;
+    SpectrumAnalyzer = MakeUnique<Audio::FSpectrumAnalyzer>(
+        Settings, static_cast<float>(ElevenLabsAudio::SampleRate));
+
+    // Auto-discover the agent component on the same actor
+    AActor* Owner = GetOwner();
+    if (!Owner) return;
+
+    UElevenLabsConversationalAgentComponent* Agent =
+        Owner->FindComponentByClass<UElevenLabsConversationalAgentComponent>();
+
+    if (Agent)
+    {
+        AgentComponent = Agent;
+        // Keep the handle so EndPlay can remove exactly this binding.
+        AudioDataHandle = Agent->OnAgentAudioData.AddUObject(
+            this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived);
+        UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName());
+    }
+    else
+    {
+        UE_LOG(LogElevenLabsLipSync, Warning,
+            TEXT("No ElevenLabsConversationalAgentComponent found on %s. Lip sync will not work."),
+            *Owner->GetName());
+    }
+
+    // Auto-detect TargetMesh if not set manually.
+    // Search for a SkeletalMeshComponent named "Face" (MetaHuman convention),
+    // then fall back to the first SkeletalMeshComponent found on the actor.
+    if (!TargetMesh)
+    {
+        TArray<USkeletalMeshComponent*> SkeletalMeshes;
+        Owner->GetComponents(SkeletalMeshes);
+
+        // First pass: look for a component named "Face" (MetaHuman face mesh)
+        for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
+        {
+            if (Mesh && Mesh->GetFName().ToString().Contains(TEXT("Face")))
+            {
+                TargetMesh = Mesh;
+                UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected face mesh: %s"), *Mesh->GetName());
+                break;
+            }
+        }
+
+        // Second pass: fall back to the first valid skeletal mesh component.
+        // NOTE: this does not check that the mesh actually has morph targets;
+        // the diagnostic log below shows whether the expected names exist.
+        if (!TargetMesh)
+        {
+            for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
+            {
+                if (Mesh)
+                {
+                    TargetMesh = Mesh;
+                    UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected skeletal mesh (fallback): %s"), *Mesh->GetName());
+                    break;
+                }
+            }
+        }
+
+        if (!TargetMesh)
+        {
+            UE_LOG(LogElevenLabsLipSync, Warning,
+                TEXT("No SkeletalMeshComponent found on %s. Set TargetMesh manually or use GetCurrentBlendshapes() in Blueprint."),
+                *Owner->GetName());
+        }
+    }
+
+    // DEBUG: list available morph targets on the target mesh
+    if (TargetMesh && TargetMesh->GetSkeletalMeshAsset())
+    {
+        const TArray<TObjectPtr<UMorphTarget>>& MorphTargets = TargetMesh->GetSkeletalMeshAsset()->GetMorphTargets();
+        UE_LOG(LogElevenLabsLipSync, Log, TEXT("TargetMesh '%s' has %d morph targets."),
+            *TargetMesh->GetName(), MorphTargets.Num());
+
+        // Log first 20 morph target names to verify ARKit naming
+        FString Names;
+        int32 Count = 0;
+        for (const UMorphTarget* MT : MorphTargets)
+        {
+            if (MT)
+            {
+                if (Count > 0) Names += TEXT(", ");
+                Names += MT->GetName();
+                if (++Count >= 20) { Names += TEXT(" ..."); break; }
+            }
+        }
+        if (Count > 0)
+        {
+            UE_LOG(LogElevenLabsLipSync, Log, TEXT("Morph target sample: %s"), *Names);
+        }
+
+        // Verify our blendshape names exist as morph targets on this mesh
+        TArray<FName> TestNames = { FName("jawOpen"), FName("mouthClose"), FName("mouthFunnel") };
+        for (const FName& TestName : TestNames)
+        {
+            bool bFound = false;
+            for (const UMorphTarget* MT : MorphTargets)
+            {
+                if (MT && MT->GetFName() == TestName)
+                {
+                    bFound = true;
+                    break;
+                }
+            }
+            UE_LOG(LogElevenLabsLipSync, Log, TEXT("  Morph target '%s': %s"),
+                *TestName.ToString(), bFound ?
TEXT("FOUND") : TEXT("NOT FOUND"));
+        }
+    }
+}
+
+// Unbinds from the agent's audio delegate and releases the analyzer before
+// forwarding to the base class.
+void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
+{
+    // Unbind from agent component
+    if (AgentComponent.IsValid() && AudioDataHandle.IsValid())
+    {
+        AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
+        AudioDataHandle.Reset();
+    }
+    AgentComponent.Reset();
+    SpectrumAnalyzer.Reset();
+
+    Super::EndPlay(EndPlayReason);
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Tick — smooth visemes and apply morph targets
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Per-tick update: moves every smoothed viseme towards its target, rebuilds the
+// blendshape map, pushes it to TargetMesh (if any) and notifies listeners.
+void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick TickType,
+    FActorComponentTickFunction* ThisTickFunction)
+{
+    Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
+
+    // Smooth viseme weights towards targets using exponential interpolation
+    const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f);
+    const FName SilViseme("sil");
+    bool bAnyNonZero = false;
+
+    for (const FName& Name : VisemeNames)
+    {
+        float& Current = SmoothedVisemes.FindOrAdd(Name);
+        const float Target = TargetVisemes.FindOrAdd(Name);
+
+        // "sil" uses LipSyncStrength=1 always — it's the rest pose.
+        // It is interpolated exactly once, here. The previous code lerped it in
+        // this loop (scaled by LipSyncStrength) and then lerped the result a
+        // second time, so "sil" moved faster than every other viseme and was
+        // still affected by LipSyncStrength.
+        const float Strength = (Name == SilViseme) ? 1.0f : LipSyncStrength;
+
+        Current = FMath::Lerp(Current, Target * Strength, Alpha);
+
+        // Snap to zero to avoid infinite tiny values
+        if (Current < 0.001f) Current = 0.0f;
+        if (Current > 0.001f) bAnyNonZero = true;
+    }
+
+    // Convert visemes to ARKit blendshapes
+    MapVisemesToBlendshapes();
+
+    // Auto-apply morph targets if a target mesh is set
+    if (TargetMesh)
+    {
+        ApplyMorphTargets();
+    }
+
+    // Notify Blueprint listeners
+    if (bAnyNonZero || CurrentBlendshapes.Num() > 0)
+    {
+        OnVisemesReady.Broadcast();
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Audio analysis
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Delegate callback for each raw PCM chunk from the agent component.
+// Interprets the bytes as int16 samples, converts them to float, pushes them
+// into the spectrum analyzer and, if an analysis was performed, updates the
+// target visemes via AnalyzeSpectrum().
+//
+// @param PCMData  Raw bytes; every two bytes are read as one int16 sample.
+//                 A trailing odd byte is ignored by the integer division.
+void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMData)
+{
+    if (!SpectrumAnalyzer) return;
+
+    // Convert int16 PCM to float32 [-1, 1]
+    const int16* Samples = reinterpret_cast<const int16*>(PCMData.GetData());
+    const int32 NumSamples = PCMData.Num() / static_cast<int32>(sizeof(int16));
+
+    // DEBUG: log first audio chunk received
+    // NOTE: function-local static, so only the first chunk seen by any
+    // instance of this component in the process is logged.
+    static bool bFirstChunkLogged = false;
+    if (!bFirstChunkLogged)
+    {
+        UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
+        bFirstChunkLogged = true;
+    }
+
+    // Nothing to convert or analyze for an empty (or single-byte) chunk.
+    if (NumSamples <= 0) return;
+
+    FloatBuffer.Reset(NumSamples);
+    for (int32 i = 0; i < NumSamples; ++i)
+    {
+        FloatBuffer.Add(static_cast<float>(Samples[i]) / 32768.0f);
+    }
+
+    // Feed to rolling FFT analyzer
+    SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples);
+
+    // Try to perform analysis (returns true when enough data for one FFT window)
+    if (SpectrumAnalyzer->PerformAnalysisIfPossible(true))
+    {
+        AnalyzeSpectrum();
+    }
+}
+
+// Reads band energies from the analyzer's latest spectrum (under the analyzer's
+// scope lock) and forwards them to EstimateVisemes().
+void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
+{
+    if (!SpectrumAnalyzer) return;
+
+    Audio::FSpectrumAnalyzerScopeLock Lock(SpectrumAnalyzer.Get());
+
+    // Extract energy in frequency bands relevant for speech phoneme classification.
+    // Band boundaries chosen based on speech formant ranges.
+    const float VoiceEnergy    = GetBandEnergy(80.0f, 400.0f);     // Fundamental frequency
+    const float F1Energy       = GetBandEnergy(300.0f, 800.0f);    // First formant → jaw openness
+    const float F2Energy       = GetBandEnergy(800.0f, 2500.0f);   // Second formant → vowel front/back
+    const float F3Energy       = GetBandEnergy(2500.0f, 4000.0f);  // Third formant → liquids, nasals
+    const float SibilantEnergy = GetBandEnergy(4000.0f, 7500.0f);  // Fricative/sibilant energy
+
+    // VoiceEnergy only contributes to the total; it is not passed on separately.
+    const float TotalEnergy = VoiceEnergy + F1Energy + F2Energy + F3Energy + SibilantEnergy;
+
+    // DEBUG: log energy levels periodically
+    // NOTE: function-local static, so the counter is shared by all instances.
+    static int32 AnalysisCount = 0;
+    if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
+    {
+        UE_LOG(LogElevenLabsLipSync, Log,
+            TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
+            TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
+    }
+
+    EstimateVisemes(TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
+}
+
+// Average spectrum magnitude over [LowFreq, HighFreq].
+// The range is split into NumSamples equal steps and the magnitude is sampled
+// at the centre of each step.
+//
+// @param LowFreq     Lower bound of the band, in Hz.
+// @param HighFreq    Upper bound of the band, in Hz.
+// @param NumSamples  Number of sample points across the band.
+// @return            Mean of the sampled magnitudes; 0 when there is no
+//                    analyzer or NumSamples <= 0.
+float UElevenLabsLipSyncComponent::GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples) const
+{
+    if (!SpectrumAnalyzer || NumSamples <= 0) return 0.0f;
+
+    float Total = 0.0f;
+    const float Step = (HighFreq - LowFreq) / static_cast<float>(NumSamples);
+
+    for (int32 i = 0; i < NumSamples; ++i)
+    {
+        // +0.5 puts the sample at the centre of step i rather than its edge.
+        const float Freq = LowFreq + Step * (static_cast<float>(i) + 0.5f);
+        Total += SpectrumAnalyzer->GetMagnitudeForFrequency(Freq);
+    }
+
+    return Total / static_cast<float>(NumSamples);
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Viseme estimation from spectral analysis
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Rewrites TargetVisemes from the band energies of the latest analysis.
+// All targets are cleared first; the heuristics then assign weights to the
+// visemes whose spectral signature matches.
+void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy,
+    float F1Energy, float F2Energy, float F3Energy, float SibilantEnergy)
+{
+    // Reset all visemes to zero
+    for (const FName& Name : VisemeNames)
+    {
+        TargetVisemes.FindOrAdd(Name) = 0.0f;
+    }
+
+    // Silence threshold — below this, mouth is closed
+    constexpr float SilenceThreshold = 0.002f;
+
+    if (TotalEnergy < SilenceThreshold)
+    {
+        TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
+        return;
+    }
+
+    // Normalize band energies relative to total
+    const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f);
+    const float NormF1 = F1Energy * InvTotal;
+    const float NormF2 = F2Energy * InvTotal;
+    const float NormF3 = F3Energy * InvTotal;
+    const float NormSibilant = SibilantEnergy * InvTotal;
+
+    // Energy-based intensity (how "loud" the speech is — drives overall jaw opening)
+    // Scale to a usable 0-1 range. The constant is empirically tuned.
+    const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f);
+
+    // ── Classification based on spectral shape ───────────────────────────────
+    // The approach: compute "votes" for each viseme category based on where
+    // the spectral energy is concentrated. Multiple visemes can be active
+    // simultaneously (blended).
+    //
+    // NOTE(review): "kk" is listed in VisemeNames and has a blendshape mapping,
+    // but no branch below ever assigns it, so it always stays at 0.
+
+    // Fricatives / sibilants: high-frequency energy dominates
+    if (NormSibilant > 0.25f)
+    {
+        const float FricativeWeight = NormSibilant * Intensity;
+        // Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq)
+        if (NormF3 > NormF2)
+        {
+            TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight;
+        }
+        else
+        {
+            TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f;
+            TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f;
+        }
+        // F/V component
+        TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f;
+    }
+
+    // Voiced speech: most energy in voice + F1 + F2
+    if (NormSibilant < 0.5f)
+    {
+        const float VoicedWeight = (1.0f - NormSibilant) * Intensity;
+
+        // Open vowels: strong F1 = wide jaw opening
+        if (NormF1 > 0.3f)
+        {
+            if (NormF2 > 0.35f)
+            {
+                // High F2 + high F1 → front open vowel (A as in "cat")
+                TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1;
+            }
+            else
+            {
+                // Low F2 + high F1 → back open vowel (O as in "go")
+                TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f;
+                TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f;
+            }
+        }
+
+        // Mid vowels: moderate F1
+        if (NormF1 > 0.15f && NormF1 <= 0.3f)
+        {
+            if (NormF2 > 0.4f)
+            {
+                // High F2 → front mid vowel (E as in "bed")
+                TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f;
+            }
+            else
+            {
+                // Low F2 → rounded mid vowel
+                TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f;
+            }
+        }
+
+        // Close vowels: weak F1
+        if (NormF1 <= 0.15f && NormF2 > 0.0f)
+        {
+            if (NormF2 > 0.4f)
+            {
+                // High F2 → front close vowel (I as in "see")
+                TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f;
+            }
+            else
+            {
+                // Low F2 → back close vowel (OO as in "boot")
+                TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f;
+            }
+        }
+
+        // Nasals / liquids: prominent F3 with low sibilant
+        if (NormF3 > 0.2f && NormSibilant < 0.15f)
+        {
+            if (NormF1 < 0.2f)
+            {
+                TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f;
+            }
+            else
+            {
+                TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f;
+            }
+        }
+
+        // Plosive detection: very low F1 with moderate energy = lips/tongue closed
+        if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f)
+        {
+            TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f;
+            TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f;
+        }
+    }
+
+    // TH detection: moderate sibilant + moderate F3 (dental fricative)
+    if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f)
+    {
+        TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f;
+    }
+
+    // Ensure at least some silence weight when energy is very low
+    if (Intensity < 0.1f)
+    {
+        TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f;
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Viseme → ARKit blendshape mapping
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Rebuilds CurrentBlendshapes from SmoothedVisemes: each active viseme adds
+// (mapping weight × viseme weight) to its blendshapes, then every value is
+// clamped to [0, 1].
+void UElevenLabsLipSyncComponent::MapVisemesToBlendshapes()
+{
+    // Zero the existing entries instead of Reset(): a blendshape that was driven
+    // on an earlier tick must stay in the map with value 0 so that
+    // ApplyMorphTargets() writes that 0 to the mesh. With Reset(), a blendshape
+    // whose visemes dropped below the 0.001 cut-off simply vanished from the
+    // map and the mesh kept its last value — a full pose when Alpha reaches 1.
+    for (auto& Pair : CurrentBlendshapes)
+    {
+        Pair.Value = 0.0f;
+    }
+
+    // Accumulate blendshape contributions from all active visemes
+    for (const FName& VisemeName : VisemeNames)
+    {
+        const float VisemeWeight = SmoothedVisemes.FindOrAdd(VisemeName);
+        if (VisemeWeight < 0.001f) continue;
+
+        const TMap<FName, float>* Mapping = VisemeToBlendshapeMap.Find(VisemeName);
+        if (!Mapping) continue;
+
+        for (const auto& Pair : *Mapping)
+        {
+            float& BS = CurrentBlendshapes.FindOrAdd(Pair.Key);
+            BS += Pair.Value * VisemeWeight;
+        }
+    }
+
+    // Clamp all blendshape values to [0, 1]
+    for (auto& Pair : CurrentBlendshapes)
+    {
+        Pair.Value = FMath::Clamp(Pair.Value, 0.0f, 1.0f);
+    }
+}
+
+// ─────────────────────────────────────────────────────────────────────────────
+// Morph target application
+// ─────────────────────────────────────────────────────────────────────────────
+
+// Writes every entry of CurrentBlendshapes to TargetMesh as a morph target
+// weight. Does nothing when no target mesh is set.
+void UElevenLabsLipSyncComponent::ApplyMorphTargets()
+{
+    if (!TargetMesh) return;
+
+    // DEBUG: log blendshape values periodically
+    // NOTE: function-local static, so the counter is shared by all instances.
+    static int32 ApplyCount = 0;
+    if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps
+    {
+        FString DebugStr;
+        for (const auto& Pair : CurrentBlendshapes)
+        {
+            if (Pair.Value > 0.01f)
+            {
+                DebugStr += FString::Printf(TEXT("%s=%.2f "), *Pair.Key.ToString(), Pair.Value);
+            }
+        }
+        if (DebugStr.Len() > 0)
+        {
+            UE_LOG(LogElevenLabsLipSync, Log, TEXT("Blendshapes: %s"), *DebugStr);
+        }
+    }
+
+    // Apply morph targets directly.
+    // NOTE: For MetaHuman, the face AnimBP may override these values.
+    // In that case, use GetCurrentBlendshapes() in the AnimBP instead.
+    for (const auto& Pair : CurrentBlendshapes)
+    {
+        TargetMesh->SetMorphTarget(Pair.Key, Pair.Value);
+    }
+}
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
index 4513c4f..e5a5fe2 100644
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -62,6 +62,10 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
  */
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
 
+// Non-dynamic delegate for raw agent audio (high-frequency, C++ consumers only).
+// Delivers PCM chunks as int16, 16kHz mono, little-endian.
+DECLARE_MULTICAST_DELEGATE_OneParam(FOnAgentAudioData, const TArray<uint8>& /*PCMData*/);
+
 // ─────────────────────────────────────────────────────────────────────────────
 // UElevenLabsConversationalAgentComponent
 //
@@ -195,6 +199,11 @@ public:
         meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
     FOnAgentResponseTimeout OnAgentResponseTimeout;
 
+    // ── Raw audio data (C++ only, used by LipSync component) ────────────────
+    /** Raw PCM audio from the agent (int16, 16kHz mono). Fires for each WebSocket audio chunk.
+     *  Used internally by UElevenLabsLipSyncComponent for spectral analysis. */
+    FOnAgentAudioData OnAgentAudioData;
+
     // ── Control ───────────────────────────────────────────────────────────────
 
     /**
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
new file mode 100644
index 0000000..156de05
--- /dev/null
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
@@ -0,0 +1,139 @@
+// Copyright ASTERION. All Rights Reserved.
+
+#pragma once
+
+#include "CoreMinimal.h"
+#include "Components/ActorComponent.h"
+#include "DSP/SpectrumAnalyzer.h"
+#include "ElevenLabsLipSyncComponent.generated.h"
+
+class UElevenLabsConversationalAgentComponent;
+class USkeletalMeshComponent;
+
+// Fired every tick when viseme/blendshape data has been updated.
+DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);
+
+/**
+ * Real-time lip sync component for ElevenLabs Conversational AI.
+ *
+ * Attaches to the same Actor as the Conversational Agent component.
+ * Receives the agent's audio stream, performs spectral analysis,
+ * estimates 15 OVR viseme weights, maps them to ARKit blendshapes
+ * (MetaHuman compatible), and optionally auto-applies morph targets.
+ *
+ * Usage:
+ *   1. Add this component alongside the Conversational Agent component.
+ *   2. (Optional) Set TargetMesh to the MetaHuman Face skeletal mesh.
+ *   3. Conversation starts → lip sync works automatically.
+ *   4. (Optional) Bind OnVisemesReady for custom Blueprint handling.
+ */
+UCLASS(ClassGroup = "ElevenLabs", meta = (BlueprintSpawnableComponent),
+    DisplayName = "ElevenLabs Lip Sync")
+class PS_AI_AGENT_ELEVENLABS_API UElevenLabsLipSyncComponent : public UActorComponent
+{
+    GENERATED_BODY()
+
+public:
+    UElevenLabsLipSyncComponent();
+    ~UElevenLabsLipSyncComponent();
+
+    // ── Configuration ─────────────────────────────────────────────────────────
+
+    /** Target skeletal mesh to auto-apply morph targets. Leave empty to handle
+     *  visemes manually via OnVisemesReady + GetCurrentBlendshapes(). */
+    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
+        meta = (ToolTip = "Skeletal mesh to drive morph targets on.\nLeave empty to read values manually via GetCurrentBlendshapes()."))
+    TObjectPtr<USkeletalMeshComponent> TargetMesh;
+
+    /** Overall mouth movement intensity multiplier. */
+    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
+        meta = (ClampMin = "0.0", ClampMax = "3.0",
+            ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
+    float LipSyncStrength = 1.0f;
+
+    /** How quickly viseme weights interpolate towards new values each frame. */
+    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
+        meta = (ClampMin = "1.0", ClampMax = "100.0",
+            ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
+    float SmoothingSpeed = 20.0f;
+
+    // ── Events ────────────────────────────────────────────────────────────────
+
+    /** Fires every tick when viseme data has been updated.
+     *  Use GetCurrentVisemes() or GetCurrentBlendshapes() to read values. */
+    UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|LipSync",
+        meta = (ToolTip = "Fires each frame with updated viseme data.\nCall GetCurrentVisemes() or GetCurrentBlendshapes() to read values."))
+    FOnElevenLabsVisemesReady OnVisemesReady;
+
+    // ── Getters ───────────────────────────────────────────────────────────────
+
+    /** Get current OVR viseme weights (15 values: sil, PP, FF, TH, DD, kk, CH, SS, nn, RR, aa, E, ih, oh, ou).
+     *  Returns a copy of the smoothed viseme map. */
+    UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
+    TMap<FName, float> GetCurrentVisemes() const { return SmoothedVisemes; }
+
+    /** Get current ARKit blendshape weights (MetaHuman compatible: jawOpen, mouthFunnel, mouthClose, etc.).
+     *  Returns a copy of the blendshape map. */
+    UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
+    TMap<FName, float> GetCurrentBlendshapes() const { return CurrentBlendshapes; }
+
+    // ── UActorComponent overrides ─────────────────────────────────────────────
+    virtual void BeginPlay() override;
+    virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;
+    virtual void TickComponent(float DeltaTime, ELevelTick TickType,
+        FActorComponentTickFunction* ThisTickFunction) override;
+
+private:
+    // ── Audio analysis pipeline ───────────────────────────────────────────────
+
+    /** Receives raw PCM from the agent component. */
+    void OnAudioChunkReceived(const TArray<uint8>& PCMData);
+
+    /** Extract frequency band energies from the spectrum analyzer. */
+    void AnalyzeSpectrum();
+
+    /** Map frequency band energies to 15 OVR viseme target weights. */
+    void EstimateVisemes(float TotalEnergy, float F1Energy, float F2Energy,
+        float F3Energy, float SibilantEnergy);
+
+    /** Convert smoothed OVR visemes to ARKit blendshape weights. */
+    void MapVisemesToBlendshapes();
+
+    /** Apply CurrentBlendshapes to TargetMesh morph targets. */
+    void ApplyMorphTargets();
+
+    /** Sample the spectrum magnitude across a frequency range. */
+    float GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples = 8) const;
+
+    // ── State ─────────────────────────────────────────────────────────────────
+
+    // Spectrum analyzer fed with the agent's audio.
+    TUniquePtr<Audio::FSpectrumAnalyzer> SpectrumAnalyzer;
+
+    // Reused float buffer for int16→float conversion (avoid per-chunk allocations)
+    TArray<float> FloatBuffer;
+
+    // Target viseme weights (set by spectral analysis, not yet smoothed)
+    TMap<FName, float> TargetVisemes;
+
+    // Smoothed viseme weights (interpolated each tick, exposed via GetCurrentVisemes)
+    TMap<FName, float> SmoothedVisemes;
+
+    // ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
+    TMap<FName, float> CurrentBlendshapes;
+
+    // Whether we have pending analysis results to process
+    // NOTE(review): nothing in this patch reads or writes this flag — confirm it is still needed.
+    bool bHasPendingAnalysis = false;
+
+    // Cached reference to the agent component on the same Actor
+    TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
+    // Handle of our binding on the agent's OnAgentAudioData delegate.
+    FDelegateHandle AudioDataHandle;
+
+    // ── Static data ───────────────────────────────────────────────────────────
+
+    /** OVR Viseme names (15 standard visemes). */
+    static const TArray<FName> VisemeNames;
+
+    /** Initialize OVR→ARKit blendshape mapping table. */
+    static TMap<FName, TMap<FName, float>> CreateVisemeToBlendshapeMap();
+
+    /** Cached mapping: OVR viseme name → { ARKit blendshape name → weight }. */
+    static const TMap<FName, TMap<FName, float>> VisemeToBlendshapeMap;
+};