WIP: Add ElevenLabsLipSyncComponent with spectral analysis lip sync
Real-time lip sync component that performs client-side spectral analysis on the agent's PCM audio stream (ElevenLabs doesn't provide viseme data). Pipeline: 512-point FFT (16kHz) → 5 frequency bands → 15 OVR visemes → ARKit blendshapes (MetaHuman compatible) → auto-apply morph targets. Currently uses SetMorphTarget() which may be overridden by MetaHuman's Face AnimBP — face animation not yet working. Debug logs added to diagnose: audio flow, spectrum energy, morph target name matching. Next steps: verify debug output, fix MetaHuman morph target override (likely needs AnimBP integration like Convai approach). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
52f75f884b
commit
224af6a27b
@ -426,6 +426,8 @@ void UElevenLabsConversationalAgentComponent::HandleError(const FString& ErrorMe
|
|||||||
// Handles a decoded PCM chunk from the WebSocket: queues it for playback and
// fans it out to raw-audio listeners.
void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData)
{
    EnqueueAgentAudio(PCMData);

    // Forward raw PCM to any listeners (e.g. LipSync component for spectral analysis).
    OnAgentAudioData.Broadcast(PCMData);
}
|
||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
|
||||||
|
|||||||
@ -0,0 +1,663 @@
|
|||||||
|
// Copyright ASTERION. All Rights Reserved.
|
||||||
|
|
||||||
|
#include "ElevenLabsLipSyncComponent.h"
|
||||||
|
#include "ElevenLabsConversationalAgentComponent.h"
|
||||||
|
#include "ElevenLabsDefinitions.h"
|
||||||
|
#include "Components/SkeletalMeshComponent.h"
|
||||||
|
#include "Engine/SkeletalMesh.h"
|
||||||
|
#include "Animation/AnimInstance.h"
|
||||||
|
#include "Animation/MorphTarget.h"
|
||||||
|
#include "GameFramework/Actor.h"
|
||||||
|
|
||||||
|
DEFINE_LOG_CATEGORY_STATIC(LogElevenLabsLipSync, Log, All);
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Static data
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// The 15 OVR viseme identifiers, in canonical OVR order:
// silence, consonant groups (PP..RR), then vowels (aa..ou).
const TArray<FName> UElevenLabsLipSyncComponent::VisemeNames = {
    FName("sil"), FName("PP"), FName("FF"), FName("TH"), FName("DD"),
    FName("kk"), FName("CH"), FName("SS"), FName("nn"), FName("RR"),
    FName("aa"), FName("E"), FName("ih"), FName("oh"), FName("ou")
};
|
||||||
|
|
||||||
|
// OVR Viseme → ARKit blendshape mapping.
|
||||||
|
// Each viseme activates a combination of ARKit morph targets with specific weights.
|
||||||
|
// These values are tuned for MetaHuman faces and can be adjusted per project.
|
||||||
|
// OVR Viseme → ARKit blendshape mapping.
// Each viseme activates a combination of ARKit morph targets with specific weights.
// These values are tuned for MetaHuman faces and can be adjusted per project.
TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap()
{
    TMap<FName, TMap<FName, float>> Map;

    // Local helper: register one viseme with its ARKit blendshape recipe.
    const auto AddViseme = [&Map](const TCHAR* Viseme, TMap<FName, float>&& Shapes)
    {
        Map.Add(FName(Viseme), MoveTemp(Shapes));
    };

    // sil — silence, mouth at rest
    AddViseme(TEXT("sil"), {});

    // PP — bilabial (P, B, M): lips pressed together
    AddViseme(TEXT("PP"), {
        { FName("mouthClose"), 0.7f },
        { FName("mouthPressLeft"), 0.3f },
        { FName("mouthPressRight"), 0.3f } });

    // FF — labiodental (F, V): lower lip tucked under upper teeth
    AddViseme(TEXT("FF"), {
        { FName("mouthShrugLower"), 0.5f },
        { FName("mouthUpperUpLeft"), 0.3f },
        { FName("mouthUpperUpRight"), 0.3f },
        { FName("jawOpen"), 0.1f } });

    // TH — dental (TH): tongue between teeth
    AddViseme(TEXT("TH"), {
        { FName("tongueOut"), 0.4f },
        { FName("jawOpen"), 0.15f } });

    // DD — alveolar (D, T, N): tongue on alveolar ridge
    AddViseme(TEXT("DD"), {
        { FName("jawOpen"), 0.25f },
        { FName("mouthClose"), 0.2f },
        { FName("mouthLowerDownLeft"), 0.15f },
        { FName("mouthLowerDownRight"), 0.15f } });

    // kk — velar (K, G): back of tongue raised
    AddViseme(TEXT("kk"), {
        { FName("jawOpen"), 0.25f },
        { FName("mouthStretchLeft"), 0.15f },
        { FName("mouthStretchRight"), 0.15f } });

    // CH — postalveolar (CH, SH, J): tongue bunched behind alveolar ridge
    AddViseme(TEXT("CH"), {
        { FName("mouthFunnel"), 0.45f },
        { FName("jawOpen"), 0.2f },
        { FName("mouthPucker"), 0.15f } });

    // SS — alveolar fricative (S, Z): air through narrow channel
    AddViseme(TEXT("SS"), {
        { FName("mouthStretchLeft"), 0.4f },
        { FName("mouthStretchRight"), 0.4f },
        { FName("jawOpen"), 0.1f },
        { FName("mouthSmileLeft"), 0.15f },
        { FName("mouthSmileRight"), 0.15f } });

    // nn — nasal (N, M, NG): soft palate lowered
    AddViseme(TEXT("nn"), {
        { FName("jawOpen"), 0.15f },
        { FName("mouthClose"), 0.2f },
        { FName("mouthPressLeft"), 0.1f },
        { FName("mouthPressRight"), 0.1f } });

    // RR — retroflex/rhotic (R, L): tongue curled or lateral
    AddViseme(TEXT("RR"), {
        { FName("mouthFunnel"), 0.3f },
        { FName("jawOpen"), 0.2f },
        { FName("mouthRollLower"), 0.15f } });

    // aa — open vowel (A as in "father"): wide open jaw
    AddViseme(TEXT("aa"), {
        { FName("jawOpen"), 0.7f },
        { FName("mouthLowerDownLeft"), 0.4f },
        { FName("mouthLowerDownRight"), 0.4f },
        { FName("mouthShrugUpper"), 0.1f } });

    // E — mid front vowel (E as in "bed"): mid-open, spread lips
    AddViseme(TEXT("E"), {
        { FName("jawOpen"), 0.4f },
        { FName("mouthSmileLeft"), 0.3f },
        { FName("mouthSmileRight"), 0.3f },
        { FName("mouthLowerDownLeft"), 0.2f },
        { FName("mouthLowerDownRight"), 0.2f } });

    // ih — close front vowel (I as in "sit"): narrow opening, spread lips
    AddViseme(TEXT("ih"), {
        { FName("jawOpen"), 0.2f },
        { FName("mouthSmileLeft"), 0.25f },
        { FName("mouthSmileRight"), 0.25f },
        { FName("mouthStretchLeft"), 0.1f },
        { FName("mouthStretchRight"), 0.1f } });

    // oh — mid back vowel (O as in "go"): rounded lips, open jaw
    AddViseme(TEXT("oh"), {
        { FName("jawOpen"), 0.5f },
        { FName("mouthFunnel"), 0.5f },
        { FName("mouthLowerDownLeft"), 0.2f },
        { FName("mouthLowerDownRight"), 0.2f } });

    // ou — close back vowel (OO as in "boot"): tightly rounded lips
    AddViseme(TEXT("ou"), {
        { FName("mouthPucker"), 0.6f },
        { FName("mouthFunnel"), 0.4f },
        { FName("jawOpen"), 0.15f } });

    return Map;
}
|
||||||
|
|
||||||
|
// Built once at static-initialization time.
// NOTE(review): this constructs FNames before engine start-up — appears to work,
// but consider lazy (function-local static) initialization if init-order issues arise.
const TMap<FName, TMap<FName, float>> UElevenLabsLipSyncComponent::VisemeToBlendshapeMap =
    UElevenLabsLipSyncComponent::CreateVisemeToBlendshapeMap();
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Constructor / Destructor
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
UElevenLabsLipSyncComponent::UElevenLabsLipSyncComponent()
{
    // Tick at 60 fps so viseme smoothing stays visually fluid.
    PrimaryComponentTick.bCanEverTick = true;
    PrimaryComponentTick.TickInterval = 1.0f / 60.0f;

    // Seed both viseme maps: everything at zero except "sil" (the rest pose),
    // which starts fully active.
    const FName SilName("sil");
    for (const FName& VisemeName : VisemeNames)
    {
        const float InitialWeight = (VisemeName == SilName) ? 1.0f : 0.0f;
        TargetVisemes.Add(VisemeName, InitialWeight);
        SmoothedVisemes.Add(VisemeName, InitialWeight);
    }
}
|
||||||
|
|
||||||
|
// Defaulted out-of-line (declared in the header) so the destructor is emitted
// in this translation unit, where member types are fully defined.
UElevenLabsLipSyncComponent::~UElevenLabsLipSyncComponent() = default;
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Lifecycle
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::BeginPlay()
|
||||||
|
{
|
||||||
|
Super::BeginPlay();
|
||||||
|
|
||||||
|
// Create the spectrum analyzer (512-point FFT, Hann window, 16kHz)
|
||||||
|
Audio::FSpectrumAnalyzerSettings Settings;
|
||||||
|
Settings.FFTSize = Audio::FSpectrumAnalyzerSettings::EFFTSize::Medium_512;
|
||||||
|
Settings.WindowType = Audio::EWindowType::Hann;
|
||||||
|
SpectrumAnalyzer = MakeUnique<Audio::FSpectrumAnalyzer>(
|
||||||
|
Settings, static_cast<float>(ElevenLabsAudio::SampleRate));
|
||||||
|
|
||||||
|
// Auto-discover the agent component on the same actor
|
||||||
|
AActor* Owner = GetOwner();
|
||||||
|
if (!Owner) return;
|
||||||
|
|
||||||
|
UElevenLabsConversationalAgentComponent* Agent =
|
||||||
|
Owner->FindComponentByClass<UElevenLabsConversationalAgentComponent>();
|
||||||
|
|
||||||
|
if (Agent)
|
||||||
|
{
|
||||||
|
AgentComponent = Agent;
|
||||||
|
AudioDataHandle = Agent->OnAgentAudioData.AddUObject(
|
||||||
|
this, &UElevenLabsLipSyncComponent::OnAudioChunkReceived);
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Lip sync bound to agent component on %s."), *Owner->GetName());
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Warning,
|
||||||
|
TEXT("No ElevenLabsConversationalAgentComponent found on %s. Lip sync will not work."),
|
||||||
|
*Owner->GetName());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Auto-detect TargetMesh if not set manually.
|
||||||
|
// Search for a SkeletalMeshComponent named "Face" (MetaHuman convention),
|
||||||
|
// then fall back to the first SkeletalMeshComponent found on the actor.
|
||||||
|
if (!TargetMesh)
|
||||||
|
{
|
||||||
|
TArray<USkeletalMeshComponent*> SkeletalMeshes;
|
||||||
|
Owner->GetComponents<USkeletalMeshComponent>(SkeletalMeshes);
|
||||||
|
|
||||||
|
// First pass: look for a component named "Face" (MetaHuman face mesh)
|
||||||
|
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
|
||||||
|
{
|
||||||
|
if (Mesh && Mesh->GetFName().ToString().Contains(TEXT("Face")))
|
||||||
|
{
|
||||||
|
TargetMesh = Mesh;
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected face mesh: %s"), *Mesh->GetName());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second pass: fall back to the first skeletal mesh with morph targets
|
||||||
|
if (!TargetMesh)
|
||||||
|
{
|
||||||
|
for (USkeletalMeshComponent* Mesh : SkeletalMeshes)
|
||||||
|
{
|
||||||
|
if (Mesh)
|
||||||
|
{
|
||||||
|
TargetMesh = Mesh;
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Auto-detected skeletal mesh (fallback): %s"), *Mesh->GetName());
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!TargetMesh)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Warning,
|
||||||
|
TEXT("No SkeletalMeshComponent found on %s. Set TargetMesh manually or use GetCurrentBlendshapes() in Blueprint."),
|
||||||
|
*Owner->GetName());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// DEBUG: list available morph targets on the target mesh
|
||||||
|
if (TargetMesh && TargetMesh->GetSkeletalMeshAsset())
|
||||||
|
{
|
||||||
|
const TArray<UMorphTarget*>& MorphTargets = TargetMesh->GetSkeletalMeshAsset()->GetMorphTargets();
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("TargetMesh '%s' has %d morph targets."),
|
||||||
|
*TargetMesh->GetName(), MorphTargets.Num());
|
||||||
|
|
||||||
|
// Log first 20 morph target names to verify ARKit naming
|
||||||
|
FString Names;
|
||||||
|
int32 Count = 0;
|
||||||
|
for (const UMorphTarget* MT : MorphTargets)
|
||||||
|
{
|
||||||
|
if (MT)
|
||||||
|
{
|
||||||
|
if (Count > 0) Names += TEXT(", ");
|
||||||
|
Names += MT->GetName();
|
||||||
|
if (++Count >= 20) { Names += TEXT(" ..."); break; }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (Count > 0)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Morph target sample: %s"), *Names);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Verify our blendshape names exist as morph targets on this mesh
|
||||||
|
TArray<FName> TestNames = { FName("jawOpen"), FName("mouthClose"), FName("mouthFunnel") };
|
||||||
|
for (const FName& TestName : TestNames)
|
||||||
|
{
|
||||||
|
bool bFound = false;
|
||||||
|
for (const UMorphTarget* MT : MorphTargets)
|
||||||
|
{
|
||||||
|
if (MT && MT->GetFName() == TestName)
|
||||||
|
{
|
||||||
|
bFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT(" Morph target '%s': %s"),
|
||||||
|
*TestName.ToString(), bFound ? TEXT("FOUND") : TEXT("NOT FOUND"));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Tears down the audio subscription and analyzer before the component dies,
// so no PCM callback can arrive on a half-destructed object.
void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
{
    // Unbind from agent component (only if both the component and our handle are still valid)
    if (AgentComponent.IsValid() && AudioDataHandle.IsValid())
    {
        AgentComponent->OnAgentAudioData.Remove(AudioDataHandle);
        AudioDataHandle.Reset();
    }
    AgentComponent.Reset();
    SpectrumAnalyzer.Reset();

    Super::EndPlay(EndPlayReason);
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Tick — smooth visemes and apply morph targets
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick TickType,
|
||||||
|
FActorComponentTickFunction* ThisTickFunction)
|
||||||
|
{
|
||||||
|
Super::TickComponent(DeltaTime, TickType, ThisTickFunction);
|
||||||
|
|
||||||
|
// Smooth viseme weights towards targets using exponential interpolation
|
||||||
|
const float Alpha = FMath::Clamp(DeltaTime * SmoothingSpeed, 0.0f, 1.0f);
|
||||||
|
bool bAnyNonZero = false;
|
||||||
|
|
||||||
|
for (const FName& Name : VisemeNames)
|
||||||
|
{
|
||||||
|
float& Current = SmoothedVisemes.FindOrAdd(Name);
|
||||||
|
const float Target = TargetVisemes.FindOrAdd(Name);
|
||||||
|
|
||||||
|
Current = FMath::Lerp(Current, Target * LipSyncStrength, Alpha);
|
||||||
|
|
||||||
|
// Snap to zero to avoid infinite tiny values
|
||||||
|
if (Current < 0.001f) Current = 0.0f;
|
||||||
|
if (Current > 0.001f) bAnyNonZero = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// "sil" uses LipSyncStrength=1 always — it's the rest pose
|
||||||
|
SmoothedVisemes.FindOrAdd(FName("sil")) = FMath::Lerp(
|
||||||
|
SmoothedVisemes.FindOrAdd(FName("sil")),
|
||||||
|
TargetVisemes.FindOrAdd(FName("sil")),
|
||||||
|
Alpha);
|
||||||
|
|
||||||
|
// Convert visemes to ARKit blendshapes
|
||||||
|
MapVisemesToBlendshapes();
|
||||||
|
|
||||||
|
// Auto-apply morph targets if a target mesh is set
|
||||||
|
if (TargetMesh)
|
||||||
|
{
|
||||||
|
ApplyMorphTargets();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Notify Blueprint listeners
|
||||||
|
if (bAnyNonZero || CurrentBlendshapes.Num() > 0)
|
||||||
|
{
|
||||||
|
OnVisemesReady.Broadcast();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Audio analysis
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::OnAudioChunkReceived(const TArray<uint8>& PCMData)
|
||||||
|
{
|
||||||
|
if (!SpectrumAnalyzer) return;
|
||||||
|
|
||||||
|
// Convert int16 PCM to float32 [-1, 1]
|
||||||
|
const int16* Samples = reinterpret_cast<const int16*>(PCMData.GetData());
|
||||||
|
const int32 NumSamples = PCMData.Num() / sizeof(int16);
|
||||||
|
|
||||||
|
// DEBUG: log first audio chunk received
|
||||||
|
static bool bFirstChunkLogged = false;
|
||||||
|
if (!bFirstChunkLogged)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("First audio chunk received: %d bytes (%d samples)"), PCMData.Num(), NumSamples);
|
||||||
|
bFirstChunkLogged = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
FloatBuffer.Reset(NumSamples);
|
||||||
|
for (int32 i = 0; i < NumSamples; ++i)
|
||||||
|
{
|
||||||
|
FloatBuffer.Add(static_cast<float>(Samples[i]) / 32768.0f);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Feed to rolling FFT analyzer
|
||||||
|
SpectrumAnalyzer->PushAudio(FloatBuffer.GetData(), NumSamples);
|
||||||
|
|
||||||
|
// Try to perform analysis (returns true when enough data for one FFT window)
|
||||||
|
if (SpectrumAnalyzer->PerformAnalysisIfPossible(true))
|
||||||
|
{
|
||||||
|
AnalyzeSpectrum();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reads the latest FFT frame (under the analyzer's scope lock) and reduces it
// to five speech-relevant band energies, which drive viseme estimation.
void UElevenLabsLipSyncComponent::AnalyzeSpectrum()
{
    if (!SpectrumAnalyzer) return;

    // Lock the analyzer's result buffer for the duration of the band reads below.
    Audio::FSpectrumAnalyzerScopeLock Lock(SpectrumAnalyzer.Get());

    // Extract energy in frequency bands relevant for speech phoneme classification.
    // Band boundaries chosen based on speech formant ranges.
    const float VoiceEnergy = GetBandEnergy(80.0f, 400.0f);       // Fundamental frequency
    const float F1Energy = GetBandEnergy(300.0f, 800.0f);         // First formant → jaw openness
    const float F2Energy = GetBandEnergy(800.0f, 2500.0f);        // Second formant → vowel front/back
    const float F3Energy = GetBandEnergy(2500.0f, 4000.0f);       // Third formant → liquids, nasals
    const float SibilantEnergy = GetBandEnergy(4000.0f, 7500.0f); // Fricative/sibilant energy

    const float TotalEnergy = VoiceEnergy + F1Energy + F2Energy + F3Energy + SibilantEnergy;

    // DEBUG: log energy levels periodically.
    // NOTE(review): function-local static — the counter is shared across all
    // component instances; fine for throttled diagnostic logging.
    static int32 AnalysisCount = 0;
    if (++AnalysisCount % 50 == 1) // Log every ~50 analyses
    {
        UE_LOG(LogElevenLabsLipSync, Log,
            TEXT("Spectrum: Total=%.4f F1=%.4f F2=%.4f F3=%.4f Sibilant=%.4f"),
            TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
    }

    EstimateVisemes(TotalEnergy, F1Energy, F2Energy, F3Energy, SibilantEnergy);
}
|
||||||
|
|
||||||
|
float UElevenLabsLipSyncComponent::GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples) const
|
||||||
|
{
|
||||||
|
if (!SpectrumAnalyzer || NumSamples <= 0) return 0.0f;
|
||||||
|
|
||||||
|
float Total = 0.0f;
|
||||||
|
const float Step = (HighFreq - LowFreq) / static_cast<float>(NumSamples);
|
||||||
|
|
||||||
|
for (int32 i = 0; i < NumSamples; ++i)
|
||||||
|
{
|
||||||
|
const float Freq = LowFreq + Step * (static_cast<float>(i) + 0.5f);
|
||||||
|
Total += SpectrumAnalyzer->GetMagnitudeForFrequency(Freq);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Total / static_cast<float>(NumSamples);
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Viseme estimation from spectral analysis
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
// Heuristic phoneme classification from five spectral band energies.
// Writes TargetVisemes; multiple visemes may be active simultaneously and are
// later blended by TickComponent via SmoothedVisemes.
// NOTE(review): all thresholds below are empirically tuned — confirm against
// real agent audio before adjusting.
void UElevenLabsLipSyncComponent::EstimateVisemes(float TotalEnergy,
    float F1Energy, float F2Energy, float F3Energy, float SibilantEnergy)
{
    // Reset all visemes to zero
    for (const FName& Name : VisemeNames)
    {
        TargetVisemes.FindOrAdd(Name) = 0.0f;
    }

    // Silence threshold — below this, mouth is closed
    constexpr float SilenceThreshold = 0.002f;

    if (TotalEnergy < SilenceThreshold)
    {
        TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
        return;
    }

    // Normalize band energies relative to total (Max guards divide-by-zero)
    const float InvTotal = 1.0f / FMath::Max(TotalEnergy, 0.0001f);
    const float NormF1 = F1Energy * InvTotal;
    const float NormF2 = F2Energy * InvTotal;
    const float NormF3 = F3Energy * InvTotal;
    const float NormSibilant = SibilantEnergy * InvTotal;

    // Energy-based intensity (how "loud" the speech is — drives overall jaw opening)
    // Scale to a usable 0-1 range. The constant is empirically tuned.
    const float Intensity = FMath::Clamp(TotalEnergy * 25.0f, 0.0f, 1.0f);

    // ── Classification based on spectral shape ───────────────────────────────
    // The approach: compute "votes" for each viseme category based on where
    // the spectral energy is concentrated. Multiple visemes can be active
    // simultaneously (blended).

    // Fricatives / sibilants: high-frequency energy dominates
    if (NormSibilant > 0.25f)
    {
        const float FricativeWeight = NormSibilant * Intensity;
        // Distinguish S/Z (narrow, higher freq) from SH/CH (broader, lower freq)
        if (NormF3 > NormF2)
        {
            TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight;
        }
        else
        {
            TargetVisemes.FindOrAdd(FName("CH")) = FricativeWeight * 0.7f;
            TargetVisemes.FindOrAdd(FName("SS")) = FricativeWeight * 0.3f;
        }
        // F/V component
        TargetVisemes.FindOrAdd(FName("FF")) = FricativeWeight * 0.3f;
    }

    // Voiced speech: most energy in voice + F1 + F2.
    // Overlaps the fricative branch for 0.25 < NormSibilant < 0.5, so both can
    // contribute — an intentional blend.
    if (NormSibilant < 0.5f)
    {
        const float VoicedWeight = (1.0f - NormSibilant) * Intensity;

        // Open vowels: strong F1 = wide jaw opening
        if (NormF1 > 0.3f)
        {
            if (NormF2 > 0.35f)
            {
                // High F2 + high F1 → front open vowel (A as in "cat")
                TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1;
            }
            else
            {
                // Low F2 + high F1 → back open vowel (O as in "go")
                TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * NormF1 * 0.7f;
                TargetVisemes.FindOrAdd(FName("aa")) = VoicedWeight * NormF1 * 0.3f;
            }
        }

        // Mid vowels: moderate F1
        if (NormF1 > 0.15f && NormF1 <= 0.3f)
        {
            if (NormF2 > 0.4f)
            {
                // High F2 → front mid vowel (E as in "bed")
                TargetVisemes.FindOrAdd(FName("E")) = VoicedWeight * 0.7f;
            }
            else
            {
                // Low F2 → rounded mid vowel
                TargetVisemes.FindOrAdd(FName("oh")) = VoicedWeight * 0.5f;
            }
        }

        // Close vowels: weak F1
        if (NormF1 <= 0.15f && NormF2 > 0.0f)
        {
            if (NormF2 > 0.4f)
            {
                // High F2 → front close vowel (I as in "see")
                TargetVisemes.FindOrAdd(FName("ih")) = VoicedWeight * 0.6f;
            }
            else
            {
                // Low F2 → back close vowel (OO as in "boot")
                TargetVisemes.FindOrAdd(FName("ou")) = VoicedWeight * 0.6f;
            }
        }

        // Nasals / liquids: prominent F3 with low sibilant
        if (NormF3 > 0.2f && NormSibilant < 0.15f)
        {
            if (NormF1 < 0.2f)
            {
                TargetVisemes.FindOrAdd(FName("nn")) = VoicedWeight * 0.4f;
            }
            else
            {
                TargetVisemes.FindOrAdd(FName("RR")) = VoicedWeight * 0.3f;
            }
        }

        // Plosive detection: very low F1 with moderate energy = lips/tongue closed
        if (NormF1 < 0.1f && Intensity > 0.3f && NormSibilant < 0.2f)
        {
            TargetVisemes.FindOrAdd(FName("PP")) = VoicedWeight * 0.3f;
            TargetVisemes.FindOrAdd(FName("DD")) = VoicedWeight * 0.2f;
        }
    }

    // TH detection: moderate sibilant + moderate F3 (dental fricative)
    if (NormSibilant > 0.15f && NormSibilant < 0.35f && NormF3 > 0.15f)
    {
        TargetVisemes.FindOrAdd(FName("TH")) = Intensity * 0.3f;
    }

    // Ensure at least some silence weight when energy is very low
    if (Intensity < 0.1f)
    {
        TargetVisemes.FindOrAdd(FName("sil")) = 1.0f - Intensity * 10.0f;
    }
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Viseme → ARKit blendshape mapping
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::MapVisemesToBlendshapes()
|
||||||
|
{
|
||||||
|
CurrentBlendshapes.Reset();
|
||||||
|
|
||||||
|
// Accumulate blendshape contributions from all active visemes
|
||||||
|
for (const FName& VisemeName : VisemeNames)
|
||||||
|
{
|
||||||
|
const float VisemeWeight = SmoothedVisemes.FindOrAdd(VisemeName);
|
||||||
|
if (VisemeWeight < 0.001f) continue;
|
||||||
|
|
||||||
|
const TMap<FName, float>* Mapping = VisemeToBlendshapeMap.Find(VisemeName);
|
||||||
|
if (!Mapping) continue;
|
||||||
|
|
||||||
|
for (const auto& Pair : *Mapping)
|
||||||
|
{
|
||||||
|
float& BS = CurrentBlendshapes.FindOrAdd(Pair.Key);
|
||||||
|
BS += Pair.Value * VisemeWeight;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clamp all blendshape values to [0, 1]
|
||||||
|
for (auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
Pair.Value = FMath::Clamp(Pair.Value, 0.0f, 1.0f);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Morph target application
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::ApplyMorphTargets()
|
||||||
|
{
|
||||||
|
if (!TargetMesh) return;
|
||||||
|
|
||||||
|
// DEBUG: log blendshape values periodically
|
||||||
|
static int32 ApplyCount = 0;
|
||||||
|
if (++ApplyCount % 120 == 1) // Log every ~2s at 60fps
|
||||||
|
{
|
||||||
|
FString DebugStr;
|
||||||
|
for (const auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
if (Pair.Value > 0.01f)
|
||||||
|
{
|
||||||
|
DebugStr += FString::Printf(TEXT("%s=%.2f "), *Pair.Key.ToString(), Pair.Value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (DebugStr.Len() > 0)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Blendshapes: %s"), *DebugStr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply morph targets directly.
|
||||||
|
// NOTE: For MetaHuman, the face AnimBP may override these values.
|
||||||
|
// In that case, use GetCurrentBlendshapes() in the AnimBP instead.
|
||||||
|
for (const auto& Pair : CurrentBlendshapes)
|
||||||
|
{
|
||||||
|
TargetMesh->SetMorphTarget(Pair.Key, Pair.Value);
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -62,6 +62,10 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
|
|||||||
*/
|
*/
|
||||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
|
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);
|
||||||
|
|
||||||
|
// Non-dynamic delegate for raw agent audio (high-frequency, C++ consumers only).
// Delivers PCM chunks as int16, 16kHz mono, little-endian.
// Declared with DECLARE_MULTICAST_DELEGATE (not the DYNAMIC variant) so it is
// not exposed to Blueprint and can pass the payload by const reference.
DECLARE_MULTICAST_DELEGATE_OneParam(FOnAgentAudioData, const TArray<uint8>& /*PCMData*/);
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// UElevenLabsConversationalAgentComponent
|
// UElevenLabsConversationalAgentComponent
|
||||||
//
|
//
|
||||||
@ -195,6 +199,11 @@ public:
|
|||||||
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
|
meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
|
||||||
FOnAgentResponseTimeout OnAgentResponseTimeout;
|
FOnAgentResponseTimeout OnAgentResponseTimeout;
|
||||||
|
|
||||||
|
// ── Raw audio data (C++ only, used by LipSync component) ────────────────
|
||||||
|
/** Raw PCM audio from the agent (int16, 16kHz mono). Fires for each WebSocket audio chunk.
|
||||||
|
* Used internally by UElevenLabsLipSyncComponent for spectral analysis. */
|
||||||
|
FOnAgentAudioData OnAgentAudioData;
|
||||||
|
|
||||||
// ── Control ───────────────────────────────────────────────────────────────
|
// ── Control ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@ -0,0 +1,139 @@
|
|||||||
|
// Copyright ASTERION. All Rights Reserved.
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "CoreMinimal.h"
|
||||||
|
#include "Components/ActorComponent.h"
|
||||||
|
#include "DSP/SpectrumAnalyzer.h"
|
||||||
|
#include "ElevenLabsLipSyncComponent.generated.h"
|
||||||
|
|
||||||
|
class UElevenLabsConversationalAgentComponent;
|
||||||
|
class USkeletalMeshComponent;
|
||||||
|
|
||||||
|
// Fired every tick when viseme/blendshape data has been updated.
|
||||||
|
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsVisemesReady);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Real-time lip sync component for ElevenLabs Conversational AI.
|
||||||
|
*
|
||||||
|
* Attaches to the same Actor as the Conversational Agent component.
|
||||||
|
* Receives the agent's audio stream, performs spectral analysis,
|
||||||
|
* estimates 15 OVR viseme weights, maps them to ARKit blendshapes
|
||||||
|
* (MetaHuman compatible), and optionally auto-applies morph targets.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* 1. Add this component alongside the Conversational Agent component.
|
||||||
|
* 2. (Optional) Set TargetMesh to the MetaHuman Face skeletal mesh.
|
||||||
|
* 3. Conversation starts → lip sync works automatically.
|
||||||
|
* 4. (Optional) Bind OnVisemesReady for custom Blueprint handling.
|
||||||
|
*/
|
||||||
|
UCLASS(ClassGroup = "ElevenLabs", meta = (BlueprintSpawnableComponent),
|
||||||
|
DisplayName = "ElevenLabs Lip Sync")
|
||||||
|
class PS_AI_AGENT_ELEVENLABS_API UElevenLabsLipSyncComponent : public UActorComponent
|
||||||
|
{
|
||||||
|
GENERATED_BODY()
|
||||||
|
|
||||||
|
public:
|
||||||
|
UElevenLabsLipSyncComponent();
|
||||||
|
~UElevenLabsLipSyncComponent();
|
||||||
|
|
||||||
|
// ── Configuration ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Target skeletal mesh to auto-apply morph targets. Leave empty to handle
|
||||||
|
* visemes manually via OnVisemesReady + GetCurrentBlendshapes(). */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ToolTip = "Skeletal mesh to drive morph targets on.\nLeave empty to read values manually via GetCurrentBlendshapes()."))
|
||||||
|
TObjectPtr<USkeletalMeshComponent> TargetMesh;
|
||||||
|
|
||||||
|
/** Overall mouth movement intensity multiplier. */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ClampMin = "0.0", ClampMax = "3.0",
|
||||||
|
ToolTip = "Lip sync intensity.\n1.0 = normal, higher = more expressive, lower = subtler."))
|
||||||
|
float LipSyncStrength = 1.0f;
|
||||||
|
|
||||||
|
/** How quickly viseme weights interpolate towards new values each frame. */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ClampMin = "1.0", ClampMax = "100.0",
|
||||||
|
ToolTip = "Smoothing speed for viseme transitions.\nLower = smoother but laggy, higher = responsive but jittery.\n15-25 is usually good."))
|
||||||
|
float SmoothingSpeed = 20.0f;
|
||||||
|
|
||||||
|
// ── Events ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Fires every tick when viseme data has been updated.
|
||||||
|
* Use GetCurrentVisemes() or GetCurrentBlendshapes() to read values. */
|
||||||
|
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|LipSync",
|
||||||
|
meta = (ToolTip = "Fires each frame with updated viseme data.\nCall GetCurrentVisemes() or GetCurrentBlendshapes() to read values."))
|
||||||
|
FOnElevenLabsVisemesReady OnVisemesReady;
|
||||||
|
|
||||||
|
// ── Getters ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Get current OVR viseme weights (15 values: sil, PP, FF, TH, DD, kk, CH, SS, nn, RR, aa, E, ih, oh, ou). */
|
||||||
|
UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
|
||||||
|
TMap<FName, float> GetCurrentVisemes() const { return SmoothedVisemes; }
|
||||||
|
|
||||||
|
/** Get current ARKit blendshape weights (MetaHuman compatible: jawOpen, mouthFunnel, mouthClose, etc.). */
|
||||||
|
UFUNCTION(BlueprintCallable, Category = "ElevenLabs|LipSync")
|
||||||
|
TMap<FName, float> GetCurrentBlendshapes() const { return CurrentBlendshapes; }
|
||||||
|
|
||||||
|
// ── UActorComponent overrides ─────────────────────────────────────────────
|
||||||
|
virtual void BeginPlay() override;
|
||||||
|
virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;
|
||||||
|
virtual void TickComponent(float DeltaTime, ELevelTick TickType,
|
||||||
|
FActorComponentTickFunction* ThisTickFunction) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// ── Audio analysis pipeline ───────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Receives raw PCM from the agent component. */
|
||||||
|
void OnAudioChunkReceived(const TArray<uint8>& PCMData);
|
||||||
|
|
||||||
|
/** Extract frequency band energies from the spectrum analyzer. */
|
||||||
|
void AnalyzeSpectrum();
|
||||||
|
|
||||||
|
/** Map frequency band energies to 15 OVR viseme target weights. */
|
||||||
|
void EstimateVisemes(float TotalEnergy, float F1Energy, float F2Energy,
|
||||||
|
float F3Energy, float SibilantEnergy);
|
||||||
|
|
||||||
|
/** Convert smoothed OVR visemes to ARKit blendshape weights. */
|
||||||
|
void MapVisemesToBlendshapes();
|
||||||
|
|
||||||
|
/** Apply CurrentBlendshapes to TargetMesh morph targets. */
|
||||||
|
void ApplyMorphTargets();
|
||||||
|
|
||||||
|
/** Sample the spectrum magnitude across a frequency range. */
|
||||||
|
float GetBandEnergy(float LowFreq, float HighFreq, int32 NumSamples = 8) const;
|
||||||
|
|
||||||
|
// ── State ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
TUniquePtr<Audio::FSpectrumAnalyzer> SpectrumAnalyzer;
|
||||||
|
|
||||||
|
// Reused float buffer for int16→float conversion (avoid per-chunk allocations)
|
||||||
|
TArray<float> FloatBuffer;
|
||||||
|
|
||||||
|
// Target viseme weights (set by spectral analysis, not yet smoothed)
|
||||||
|
TMap<FName, float> TargetVisemes;
|
||||||
|
|
||||||
|
// Smoothed viseme weights (interpolated each tick, exposed via GetCurrentVisemes)
|
||||||
|
TMap<FName, float> SmoothedVisemes;
|
||||||
|
|
||||||
|
// ARKit blendshape weights derived from SmoothedVisemes (exposed via GetCurrentBlendshapes)
|
||||||
|
TMap<FName, float> CurrentBlendshapes;
|
||||||
|
|
||||||
|
// Whether we have pending analysis results to process
|
||||||
|
bool bHasPendingAnalysis = false;
|
||||||
|
|
||||||
|
// Cached reference to the agent component on the same Actor
|
||||||
|
TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
|
||||||
|
FDelegateHandle AudioDataHandle;
|
||||||
|
|
||||||
|
// ── Static data ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** OVR Viseme names (15 standard visemes). */
|
||||||
|
static const TArray<FName> VisemeNames;
|
||||||
|
|
||||||
|
/** Initialize OVR→ARKit blendshape mapping table. */
|
||||||
|
static TMap<FName, TMap<FName, float>> CreateVisemeToBlendshapeMap();
|
||||||
|
|
||||||
|
/** Cached mapping: OVR viseme name → { ARKit blendshape name → weight }. */
|
||||||
|
static const TMap<FName, TMap<FName, float>> VisemeToBlendshapeMap;
|
||||||
|
};
|
||||||
Loading…
x
Reference in New Issue
Block a user