WIP: Add ElevenLabsLipSyncComponent with spectral analysis lip sync

Real-time lip sync component that performs client-side spectral analysis
on the agent's PCM audio stream (ElevenLabs doesn't provide viseme data).

Pipeline: 512-point FFT (16kHz) → 5 frequency bands → 15 OVR visemes
→ ARKit blendshapes (MetaHuman compatible) → auto-apply morph targets.

Currently uses SetMorphTarget() which may be overridden by MetaHuman's
Face AnimBP — face animation not yet working. Debug logs added to
diagnose: audio flow, spectrum energy, morph target name matching.

Next steps: verify debug output, fix MetaHuman morph target override
(likely needs AnimBP integration like Convai approach).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 11:23:34 +01:00
parent 52f75f884b
commit 224af6a27b
4 changed files with 813 additions and 0 deletions

View File

@ -426,6 +426,8 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
// Called for each PCM audio chunk received from the agent over the WebSocket.
// Queues the chunk for playback, then fans it out to C++ listeners.
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
{
	// Queue the chunk into this component's own playback pipeline first.
	EnqueueAgentAudio(PCMData);
	// Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
	OnAgentAudioData.Broadcast(PCMData);
}
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)

View File

@ -0,0 +1,663 @@
// Copyright ASTERION. All Rights Reserved.
#include "ElevenLabsLipSyncComponent.h"
#include "ElevenLabsConversationalAgentComponent.h"
#include "ElevenLabsDefinitions.h"
#include "Components/SkeletalMeshComponent.h"
#include "Engine/SkeletalMesh.h"
#include "Animation/AnimInstance.h"
#include "Animation/MorphTarget.h"
#include "GameFramework/Actor.h"
DEFINE_LOG_CATEGORY_STATIC(LogElevenLabsLipSync, Log, All);
// ─────────────────────────────────────────────────────────────────────────────
// Static data
// ─────────────────────────────────────────────────────────────────────────────
// The 15 standard OVR visemes: silence first, then consonants, then vowels.
// The order here also defines the iteration order used when smoothing and
// mapping visemes elsewhere in this file.
const TArray<FName> UElevenLabsLipSyncComponent::VisemeNames = {
	FName("sil"), FName("PP"), FName("FF"), FName("TH"), FName("DD"),
	FName("kk"), FName("CH"), FName("SS"), FName("nn"), FName("RR"),
	FName("aa"), FName("E"), FName("ih"), FName("oh"), FName("ou")
};
// OVR Viseme → ARKit blendshape mapping.
// Each viseme activates a combination of ARKit morph targets with specific weights.
// These values are tuned for MetaHuman faces and can be adjusted per project.
TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap()
{
	TMap<FName, TMap<FName, float>> Map;

	// sil — silence, mouth at rest
	Map.Add(FName("sil"), {});

	// PP — bilabial (P, B, M): lips pressed together
	Map.Add(FName("PP"), TMap<FName, float>{
		{ FName("mouthClose"), 0.7f },
		{ FName("mouthPressLeft"), 0.3f },
		{ FName("mouthPressRight"), 0.3f } });

	// FF — labiodental (F, V): lower lip tucked under upper teeth
	Map.Add(FName("FF"), TMap<FName, float>{
		{ FName("mouthShrugLower"), 0.5f },
		{ FName("mouthUpperUpLeft"), 0.3f },
		{ FName("mouthUpperUpRight"), 0.3f },
		{ FName("jawOpen"), 0.1f } });

	// TH — dental (TH): tongue between teeth
	Map.Add(FName("TH"), TMap<FName, float>{
		{ FName("tongueOut"), 0.4f },
		{ FName("jawOpen"), 0.15f } });

	// DD — alveolar (D, T, N): tongue on alveolar ridge
	Map.Add(FName("DD"), TMap<FName, float>{
		{ FName("jawOpen"), 0.25f },
		{ FName("mouthClose"), 0.2f },
		{ FName("mouthLowerDownLeft"), 0.15f },
		{ FName("mouthLowerDownRight"), 0.15f } });

	// kk — velar (K, G): back of tongue raised
	Map.Add(FName("kk"), TMap<FName, float>{
		{ FName("jawOpen"), 0.25f },
		{ FName("mouthStretchLeft"), 0.15f },
		{ FName("mouthStretchRight"), 0.15f } });

	// CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge
	Map.Add(FName("CH"), TMap<FName, float>{
		{ FName("mouthFunnel"), 0.45f },
		{ FName("jawOpen"), 0.2f },
		{ FName("mouthPucker"), 0.15f } });

	// SS — alveolar fricative (S, Z): air through narrow channel
	Map.Add(FName("SS"), TMap<FName, float>{
		{ FName("mouthStretchLeft"), 0.4f },
		{ FName("mouthStretchRight"), 0.4f },
		{ FName("jawOpen"), 0.1f },
		{ FName("mouthSmileLeft"), 0.15f },
		{ FName("mouthSmileRight"), 0.15f } });

	// nn — nasal (N, M, NG): soft palate lowered
	Map.Add(FName("nn"), TMap<FName, float>{
		{ FName("jawOpen"), 0.15f },
		{ FName("mouthClose"), 0.2f },
		{ FName("mouthPressLeft"), 0.1f },
		{ FName("mouthPressRight"), 0.1f } });

	// RR — retroflex/rhotic (R, L): tongue curled or lateral
	Map.Add(FName("RR"), TMap<FName, float>{
		{ FName("mouthFunnel"), 0.3f },
		{ FName("jawOpen"), 0.2f },
		{ FName("mouthRollLower"), 0.15f } });

	// aa — open vowel (A as in "father"): wide open jaw
	Map.Add(FName("aa"), TMap<FName, float>{
		{ FName("jawOpen"), 0.7f },
		{ FName("mouthLowerDownLeft"), 0.4f },
		{ FName("mouthLowerDownRight"), 0.4f },
		{ FName("mouthShrugUpper"), 0.1f } });

	// E — mid front vowel (E as in "bed"): mid-open, spread lips
	Map.Add(FName("E"), TMap<FName, float>{
		{ FName("jawOpen"), 0.4f },
		{ FName("mouthSmileLeft"), 0.3f },
		{ FName("mouthSmileRight"), 0.3f },
		{ FName("mouthLowerDownLeft"), 0.2f },
		{ FName("mouthLowerDownRight"), 0.2f } });

	// ih — close front vowel (I as in "sit"): narrow opening, spread lips
	Map.Add(FName("ih"), TMap<FName, float>{
		{ FName("jawOpen"), 0.2f },
		{ FName("mouthSmileLeft"), 0.25f },
		{ FName("mouthSmileRight"), 0.25f },
		{ FName("mouthStretchLeft"), 0.1f },
		{ FName("mouthStretchRight"), 0.1f } });

	// oh — mid back vowel (O as in "go"): rounded lips, open jaw
	Map.Add(FName("oh"), TMap<FName, float>{
		{ FName("jawOpen"), 0.5f },
		{ FName("mouthFunnel"), 0.5f },
		{ FName("mouthLowerDownLeft"), 0.2f },
		{ FName("mouthLowerDownRight"), 0.2f } });

	// ou — close back vowel (OO as in "boot"): tightly rounded lips
	Map.Add(FName("ou"), TMap<FName, float>{
		{ FName("mouthPucker"), 0.6f },
		{ FName("mouthFunnel"), 0.4f },
		{ FName("jawOpen"), 0.15f } });

	return Map;
}
const TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::VisemeToBlendshapeMap =
	UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap();
// ─────────────────────────────────────────────────────────────────────────────
// Constructor / Destructor
// ─────────────────────────────────────────────────────────────────────────────
UElevenLabsLipSyncComponent::UElevenLabsLipSyncComponent()
{
	// Tick at 60 Hz so viseme smoothing stays visually fluid.
	PrimaryComponentTick.bCanEverTick = true;
	PrimaryComponentTick.TickInterval = 1.0f / 60.0f;

	// Seed both viseme maps: every viseme starts at zero except the rest
	// pose "sil", which starts fully active (mouth at rest).
	const FName SilName(TEXT("sil"));
	for (const FName& VisemeName : VisemeNames)
	{
		const float InitialWeight = (VisemeName == SilName) ? 1.0f : 0.0f;
		TargetVisemes.Add(VisemeName, InitialWeight);
		SmoothedVisemes.Add(VisemeName, InitialWeight);
	}
}
UElevenLabsLipSyncComponent::~UElevenLabsLipSyncComponent() = default;
// ─────────────────────────────────────────────────────────────────────────────
// Lifecycle
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::BeginPlay()
{
Super::BeginPlay();
// Create the spectrum analyzer (512-point FFT, Hann window, 16kHz)
Audio::FSpectrumAnalyzerSettings Settings;
Settings.FFTSize = Audio::FSpectrumAnalyzerSettings::EFFTSize::Medium_512;
Settings.WindowType = Audio::EWindowType::Hann;
SpectrumAnalyzer = MakeUnique<Audio::FSpectrumAnalyzer>(
Settings, static_cast<float>(ElevenLabsAudio::SampleRate));
// Auto-discover the agent component on the same actor
AActor* Owner = GetOwner();
if (!Owner) return;
UElevenLabsConversationalAgentComponent* Agent =
Owner->FindComponentByClass<UElevenLabsConversationalAgentComponent>();
if (Agent)
{
AgentComponent = Agent;
AudioDataHandle = Agent->OnAgentAudioData.AddUObject(
this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived);
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName());
}
else
{
UE_LOG(LogElevenLabsLipSync, Warning,
TEXT("No ElevenLabsConversationalAgentComponent found on %s. Lip sync will not work."),
*Owner->GetName());
}
// Auto-detect TargetMesh if not set manually.
// Search for a SkeletalMeshComponent named "Face" (MetaHuman convention),
// then fall back to the first SkeletalMeshComponent found on the actor.
if (!TargetMesh)
{
TArray<USkeletalMeshComponent*> SkeletalMeshes;
Owner->GetComponents<USkeletalMeshComponent>(SkeletalMeshes);
// First pass: look for a component named "Face" (MetaHuman face mesh)
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
{
if (Mesh && Mesh->GetFName().ToString().Contains(TEXT("Face")))
{
TargetMesh = Mesh;
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected face mesh: %s"), *Mesh->GetName());
break;
}
}
// Second pass: fall back to the first skeletal mesh with morph targets
if (!TargetMesh)
{
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
{
if (Mesh)
{
TargetMesh = Mesh;
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected skeletal mesh (fallback): %s"), *Mesh->GetName());
break;
}
}
}
if (!TargetMesh)
{
UE_LOG(LogElevenLabsLipSync, Warning,
TEXT("No SkeletalMeshComponent found on %s. Set TargetMesh manually or use GetCurrentBlendshapes() in Blueprint."),
*Owner->GetName());
}
}
// DEBUG: list available morph targets on the target mesh
if (TargetMesh && TargetMesh->GetSkeletalMeshAsset())
{
const TArray<UMorphTarget*>& MorphTargets = TargetMesh->GetSkeletalMeshAsset()->GetMorphTargets();
UE_LOG(LogElevenLabsLipSync, Log, TEXT("TargetMesh '%s' has %d morph targets."),
*TargetMesh->GetName(), MorphTargets.Num());
// Log first 20 morph target names to verify ARKit naming
FString Names;
int32 Count = 0;
for (const UMorphTarget* MT : MorphTargets)
{
if (MT)
{
if (Count > 0) Names += TEXT(", ");
Names += MT->GetName();
if (++Count >= 20) { Names += TEXT(" ..."); break; }
}
}
if (Count > 0)
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Morph target sample: %s"), *Names);
}
// Verify our blendshape names exist as morph targets on this mesh
TArray<FName> TestNames = { FName("jawOpen"), FName("mouthClose"), FName("mouthFunnel") };
for (const FName& TestName : TestNames)
{
bool bFound = false;
for (const UMorphTarget* MT : MorphTargets)
{
if (MT && MT->GetFName() == TestName)
{
bFound = true;
break;
}
}
UE_LOG(LogElevenLabsLipSync, Log, TEXT(" Morph target '%s': %s"),
*TestName.ToString(), bFound ? TEXT("FOUND") : TEXT("NOT FOUND"));
}
}
}
// Tears down the audio binding and the analyzer before the component dies.
void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
{
	// Detach from the agent's raw-audio delegate first so no further chunks
	// arrive while we release the analyzer.
	if (AudioDataHandle.IsValid() && AgentComponent.IsValid())
	{
		AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
		AudioDataHandle.Reset();
	}
	SpectrumAnalyzer.Reset();
	AgentComponent.Reset();
	Super::EndPlay(EndPlayReason);
}
// ─────────────────────────────────────────────────────────────────────────────
// Tick — smooth visemes and apply morph targets
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick TickType,
FActorComponentTickFunction* ThisTickFunction)
{
Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
// Smooth viseme weights towards targets using exponential interpolation
const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f);
bool bAnyNonZero = false;
for (const FName& Name : VisemeNames)
{
float& Current = SmoothedVisemes.FindOrAdd(Name);
const float Target = TargetVisemes.FindOrAdd(Name);
Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha);
// Snap to zero to avoid infinite tiny values
if (Current < 0.001f) Current = 0.0f;
if (Current > 0.001f) bAnyNonZero = true;
}
// "sil" uses LipSyncStrength=1 always — it's the rest pose
SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp(
SmoothedVisemes.FindOrAdd(FName("sil")),
TargetVisemes.FindOrAdd(FName("sil")),
Alpha);
// Convert visemes to ARKit blendshapes
MapVisemesToBlendshapes();
// Auto-apply morph targets if a target mesh is set
if (TargetMesh)
{
ApplyMorphTargets();
}
// Notify Blueprint listeners
if (bAnyNonZero || CurrentBlendshapes.Num() > 0)
{
OnVisemesReady.Broadcast();
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Audio analysis
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMData)
{
if (!SpectrumAnalyzer) return;
// Convert int16 PCM to float32 [-1, 1]
const int16* Samples = reinterpret_cast<const int16*>(PCMData.GetData());
const int32 NumSamples = PCMData.Num() / sizeof(int16);
// DEBUG: log first audio chunk received
static bool bFirstChunkLogged = false;
if (!bFirstChunkLogged)
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
bFirstChunkLogged = true;
}
FloatBuffer.Reset(NumSamples);
for (int32 i = 0; i < NumSamples; ++i)
{
FloatBuffer.Add(static_cast<float>(Samples[i]) / 32768.0f);
}
// Feed to rolling FFT analyzer
SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples);
// Try to perform analysis (returns true when enough data for one FFT window)
if (SpectrumAnalyzer->PerformAnalysisIfPossible(true))
{
AnalyzeSpectrum();
}
}
// Samples the latest FFT result in five speech-relevant frequency bands and
// hands the band energies to the viseme estimator.
void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
{
	if (!SpectrumAnalyzer)
	{
		return;
	}
	Audio::FSpectrumAnalyzerScopeLock Lock(SpectrumAnalyzer.Get());

	// Extract energy in frequency bands relevant for speech phoneme
	// classification. Band boundaries chosen based on speech formant ranges:
	//   [0] fundamental (80-400 Hz)      [1] F1 → jaw openness
	//   [2] F2 → vowel front/back        [3] F3 → liquids, nasals
	//   [4] sibilant/fricative energy (4-7.5 kHz)
	const float Bands[5] = {
		GetBandEnergy(80.0f, 400.0f),
		GetBandEnergy(300.0f, 800.0f),
		GetBandEnergy(800.0f, 2500.0f),
		GetBandEnergy(2500.0f, 4000.0f),
		GetBandEnergy(4000.0f, 7500.0f)
	};
	float TotalEnergy = 0.0f;
	for (const float Band : Bands)
	{
		TotalEnergy += Band;
	}

	// DEBUG: log energy levels periodically
	static int32 AnalysisCount = 0;
	if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
	{
		UE_LOG(LogElevenLabsLipSync, Log,
			TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
			TotalEnergy, Bands[1], Bands[2], Bands[3], Bands[4]);
	}
	EstimateVisemes(TotalEnergy, Bands[1], Bands[2], Bands[3], Bands[4]);
}
float UElevenLabsLipSyncComponent::GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples) const
{
if (!SpectrumAnalyzer || NumSamples <= 0) return 0.0f;
float Total = 0.0f;
const float Step = (HighFreq - LowFreq) / static_cast<float>(NumSamples);
for (int32 i = 0; i < NumSamples; ++i)
{
const float Freq = LowFreq + Step * (static_cast<float>(i) + 0.5f);
Total += SpectrumAnalyzer->GetMagnitudeForFrequency(Freq);
}
return Total / static_cast<float>(NumSamples);
}
// ─────────────────────────────────────────────────────────────────────────────
// Viseme estimation from spectral analysis
// ─────────────────────────────────────────────────────────────────────────────
// Heuristic phoneme-class estimator: converts the five band energies produced
// by AnalyzeSpectrum() into target weights for the 15 OVR visemes. Several
// visemes may be active at once (weights blend, they are not exclusive).
// All thresholds and constants are empirically tuned. Writes TargetVisemes
// only; smoothing towards these targets happens in TickComponent.
void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy,
	float F1Energy, float F2Energy, float F3Energy, float SibilantEnergy)
{
	// Reset all visemes to zero
	for (const FName& Name : VisemeNames)
	{
		TargetVisemes.FindOrAdd(Name) = 0.0f;
	}

	// Silence threshold — below this, mouth is closed
	constexpr float SilenceThreshold = 0.002f;
	if (TotalEnergy < SilenceThreshold)
	{
		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
		return;
	}

	// Normalize band energies relative to total
	const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f);
	const float NormF1 = F1Energy * InvTotal;
	const float NormF2 = F2Energy * InvTotal;
	const float NormF3 = F3Energy * InvTotal;
	const float NormSibilant = SibilantEnergy * InvTotal;

	// Energy-based intensity (how "loud" the speech is — drives overall jaw opening)
	// Scale to a usable 0-1 range. The constant is empirically tuned.
	const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f);

	// ── Classification based on spectral shape ───────────────────────────────
	// The approach: compute "votes" for each viseme category based on where
	// the spectral energy is concentrated. Multiple visemes can be active
	// simultaneously (blended).

	// Fricatives / sibilants: high-frequency energy dominates
	if (NormSibilant > 0.25f)
	{
		const float FricativeWeight = NormSibilant * Intensity;
		// Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq)
		if (NormF3 > NormF2)
		{
			TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight;
		}
		else
		{
			TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f;
			TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f;
		}
		// F/V component
		TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f;
	}

	// Voiced speech: most energy in voice + F1 + F2
	// NOTE: this range overlaps the fricative branch above (NormSibilant in
	// 0.25-0.5 runs both), which blends fricative and voiced visemes for
	// mixed frames.
	if (NormSibilant < 0.5f)
	{
		const float VoicedWeight = (1.0f - NormSibilant) * Intensity;
		// Open vowels: strong F1 = wide jaw opening
		if (NormF1 > 0.3f)
		{
			if (NormF2 > 0.35f)
			{
				// High F2 + high F1 → front open vowel (A as in "cat")
				TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1;
			}
			else
			{
				// Low F2 + high F1 → back open vowel (O as in "go")
				TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f;
				TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f;
			}
		}
		// Mid vowels: moderate F1
		if (NormF1 > 0.15f && NormF1 <= 0.3f)
		{
			if (NormF2 > 0.4f)
			{
				// High F2 → front mid vowel (E as in "bed")
				TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f;
			}
			else
			{
				// Low F2 → rounded mid vowel
				TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f;
			}
		}
		// Close vowels: weak F1
		if (NormF1 <= 0.15f && NormF2 > 0.0f)
		{
			if (NormF2 > 0.4f)
			{
				// High F2 → front close vowel (I as in "see")
				TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f;
			}
			else
			{
				// Low F2 → back close vowel (OO as in "boot")
				TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f;
			}
		}
		// Nasals / liquids: prominent F3 with low sibilant
		if (NormF3 > 0.2f && NormSibilant < 0.15f)
		{
			if (NormF1 < 0.2f)
			{
				TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f;
			}
			else
			{
				TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f;
			}
		}
		// Plosive detection: very low F1 with moderate energy = lips/tongue closed
		if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f)
		{
			TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f;
			TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f;
		}
	}

	// TH detection: moderate sibilant + moderate F3 (dental fricative)
	if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f)
	{
		TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f;
	}

	// Ensure at least some silence weight when energy is very low
	// (Intensity < 0.1 maps linearly to sil in (0, 1].)
	if (Intensity < 0.1f)
	{
		TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f;
	}
}
// ─────────────────────────────────────────────────────────────────────────────
// Viseme → ARKit blendshape mapping
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::MapVisemesToBlendshapes()
{
CurrentBlendshapes.Reset();
// Accumulate blendshape contributions from all active visemes
for (const FName& VisemeName : VisemeNames)
{
const float VisemeWeight = SmoothedVisemes.FindOrAdd(VisemeName);
if (VisemeWeight < 0.001f) continue;
const TMap<FName, float>* Mapping = VisemeToBlendshapeMap.Find(VisemeName);
if (!Mapping) continue;
for (const auto& Pair : *Mapping)
{
float& BS = CurrentBlendshapes.FindOrAdd(Pair.Key);
BS += Pair.Value * VisemeWeight;
}
}
// Clamp all blendshape values to [0, 1]
for (auto& Pair : CurrentBlendshapes)
{
Pair.Value = FMath::Clamp(Pair.Value, 0.0f, 1.0f);
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Morph target application
// ─────────────────────────────────────────────────────────────────────────────
void UElevenLabsLipSyncComponent::ApplyMorphTargets()
{
if (!TargetMesh) return;
// DEBUG: log blendshape values periodically
static int32 ApplyCount = 0;
if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps
{
FString DebugStr;
for (const auto& Pair : CurrentBlendshapes)
{
if (Pair.Value > 0.01f)
{
DebugStr += FString::Printf(TEXT("%s=%.2f "), *Pair.Key.ToString(), Pair.Value);
}
}
if (DebugStr.Len() > 0)
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Blendshapes: %s"), *DebugStr);
}
}
// Apply morph targets directly.
// NOTE: For MetaHuman, the face AnimBP may override these values.
// In that case, use GetCurrentBlendshapes() in the AnimBP instead.
for (const auto& Pair : CurrentBlendshapes)
{
TargetMesh->SetMorphTarget(Pair.Key, Pair.Value);
}
}

View File

@ -62,6 +62,10 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
*/
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
// Non-dynamic delegate for raw agent audio (high-frequency, C++ consumers only).
// Delivers PCM chunks as int16, 16kHz mono, little-endian.
// Bind with AddUObject(...) and Remove(Handle) in EndPlay; not visible to Blueprint.
DECLARE_MULTICAST_DELEGATE_OneParam(FOnAgentAudioData, const TArray<uint8>& /*PCMData*/);
// ─────────────────────────────────────────────────────────────────────────────
// UElevenLabsConversationalAgentComponent
//
@ -195,6 +199,11 @@ public:
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
FOnAgentResponseTimeout OnAgentResponseTimeout;
	// ── Raw audio data (C++ only, used by LipSync component) ────────────────
	/** Raw PCM audio from the agent (int16, 16kHz mono). Fires for each WebSocket audio chunk.
	 * Used internally by UElevenLabsLipSyncComponent for spectral analysis.
	 * Not a UPROPERTY: this is a non-dynamic delegate, so it is invisible to Blueprint. */
	FOnAgentAudioData OnAgentAudioData;
// ── Control ───────────────────────────────────────────────────────────────
/**

View File

@ -0,0 +1,139 @@
// Copyright ASTERION. All Rights Reserved.
#pragma once
#include "CoreMinimal.h"
#include "Components/ActorComponent.h"
#include "DSP/SpectrumAnalyzer.h"
#include "ElevenLabsLipSyncComponent.generated.h"
class UElevenLabsConversationalAgentComponent;
class USkeletalMeshComponent;
// Fired every tick when viseme/blendshape data has been updated.
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);
/**
* Real-time lip sync component for ElevenLabs Conversational AI.
*
* Attaches to the same Actor as the Conversational Agent component.
* Receives the agent's audio stream, performs spectral analysis,
* estimates 15 OVR viseme weights, maps them to ARKit blendshapes
* (MetaHuman compatible), and optionally auto-applies morph targets.
*
* Usage:
* 1. Add this component alongside the Conversational Agent component.
* 2. (Optional) Set TargetMesh to the MetaHuman Face skeletal mesh.
* 3. Once the conversation starts, lip sync runs automatically.
* 4. (Optional) Bind OnVisemesReady for custom Blueprint handling.
*/
UCLASS(ClassGroup = "ElevenLabs", meta = (BlueprintSpawnableComponent),
	DisplayName = "ElevenLabs Lip Sync")
class PS_AI_AGENT_ELEVENLABS_API UElevenLabsLipSyncComponent : public UActorComponent
{
	GENERATED_BODY()

public:
	UElevenLabsLipSyncComponent();
	// Defined out-of-line (= default) in the .cpp so the TUniquePtr member is
	// destroyed in that translation unit.
	~UElevenLabsLipSyncComponent();

	// ── Configuration ─────────────────────────────────────────────────────────
	/** Target skeletal mesh to auto-apply morph targets. Leave empty to handle
	 * visemes manually via OnVisemesReady + GetCurrentBlendshapes(). */
	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
		meta = (ToolTip = "Skeletal mesh to drive morph targets on.\nLeave empty to read values manually via GetCurrentBlendshapes()."))
	TObjectPtr<USkeletalMeshComponent> TargetMesh;

	/** Overall mouth movement intensity multiplier. */
	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
		meta = (ClampMin = "0.0", ClampMax = "3.0",
			ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
	float LipSyncStrength = 1.0f;

	/** How quickly viseme weights interpolate towards new values each frame. */
	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
		meta = (ClampMin = "1.0", ClampMax = "100.0",
			ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
	float SmoothingSpeed = 20.0f;

	// ── Events ────────────────────────────────────────────────────────────────
	/** Fires every tick when viseme data has been updated.
	 * Use GetCurrentVisemes() or GetCurrentBlendshapes() to read values. */
	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|LipSync",
		meta = (ToolTip = "Fires each frame with updated viseme data.\nCall GetCurrentVisemes() or GetCurrentBlendshapes() to read values."))
	FOnElevenLabsVisemesReady OnVisemesReady;

	// ── Getters ───────────────────────────────────────────────────────────────
	/** Get current OVR viseme weights (15 values: sil, PP, FF, TH, DD, kk, CH, SS, nn, RR, aa, E, ih, oh, ou).
	 * Returns a copy — safe to hold across frames. */
	UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
	TMap<FName, float> GetCurrentVisemes() const { return SmoothedVisemes; }

	/** Get current ARKit blendshape weights (MetaHuman compatible: jawOpen, mouthFunnel, mouthClose, etc.).
	 * Returns a copy — safe to hold across frames. */
	UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
	TMap<FName, float> GetCurrentBlendshapes() const { return CurrentBlendshapes; }

	// ── UActorComponent overrides ─────────────────────────────────────────────
	virtual void BeginPlay() override;
	virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;
	virtual void TickComponent(float DeltaTime, ELevelTick TickType,
		FActorComponentTickFunction* ThisTickFunction) override;

private:
	// ── Audio analysis pipeline ───────────────────────────────────────────────
	/** Receives raw PCM from the agent component. */
	void OnAudioChunkReceived(const TArray<uint8>& PCMData);
	/** Extract frequency band energies from the spectrum analyzer. */
	void AnalyzeSpectrum();
	/** Map frequency band energies to 15 OVR viseme target weights. */
	void EstimateVisemes(float TotalEnergy, float F1Energy, float F2Energy,
		float F3Energy, float SibilantEnergy);
	/** Convert smoothed OVR visemes to ARKit blendshape weights. */
	void MapVisemesToBlendshapes();
	/** Apply CurrentBlendshapes to TargetMesh morph targets. */
	void ApplyMorphTargets();
	/** Sample the spectrum magnitude across a frequency range. */
	float GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples = 8) const;

	// ── State ─────────────────────────────────────────────────────────────────
	TUniquePtr<Audio::FSpectrumAnalyzer> SpectrumAnalyzer;
	// Reused float buffer for int16→float conversion (avoid per-chunk allocations)
	TArray<float> FloatBuffer;
	// Target viseme weights (set by spectral analysis, not yet smoothed)
	TMap<FName, float> TargetVisemes;
	// Smoothed viseme weights (interpolated each tick, exposed via GetCurrentVisemes)
	TMap<FName, float> SmoothedVisemes;
	// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
	TMap<FName, float> CurrentBlendshapes;
	// Whether we have pending analysis results to process
	// NOTE(review): not read or written anywhere in the .cpp — candidate for
	// removal once the WIP debugging phase is over; confirm before deleting.
	bool bHasPendingAnalysis = false;
	// Cached reference to the agent component on the same Actor
	TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
	FDelegateHandle AudioDataHandle;

	// ── Static data ───────────────────────────────────────────────────────────
	/** OVR Viseme names (15 standard visemes). */
	static const TArray<FName> VisemeNames;
	/** Initialize OVR→ARKit blendshape mapping table. */
	static TMap<FName, TMap<FName, float>> CreateVisemeToBlendshapeMap();
	/** Cached mapping: OVR viseme name → { ARKit blendshape name → weight }. */
	static const TMap<FName, TMap<FName, float>> VisemeToBlendshapeMap;
};