WIP: Emotion facial expressions + client tool support

- ElevenLabs client tool call/result WebSocket support (set_emotion)
- EElevenLabsEmotion + EElevenLabsEmotionIntensity enums
- Emotion poses in PoseMap data asset (Normal/Medium/Extreme per emotion)
- Standalone ElevenLabsFacialExpressionComponent (separated from LipSync)
- Two-layer architecture: emotion base (eyes/brows/cheeks) + lip sync on top (mouth)
- Smooth emotion blending (~500ms configurable transitions)
- LipSync reads emotion curves from FacialExpressionComponent via GetCurrentEmotionCurves()

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
commit f57bb65297 (parent e57be0a1d9)
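How the two layers fit together at runtime: the agent component receives the set_emotion client tool call and broadcasts OnAgentEmotionChanged; the facial expression component blends the matching pose curves as a base layer (eyes, brows, cheeks); the lip sync component merges those curves and overrides only the mouth area. A minimal wiring sketch (AMyMetaHuman and its members are hypothetical; the component classes are the ones added in this commit):

// Illustrative setup: all three components live on one actor, so they can
// find each other via FindComponentByClass (which is how they bind internally).
AMyMetaHuman::AMyMetaHuman() // hypothetical character class
{
    Agent            = CreateDefaultSubobject<UElevenLabsConversationalAgentComponent>(TEXT("Agent"));
    FacialExpression = CreateDefaultSubobject<UElevenLabsFacialExpressionComponent>(TEXT("FacialExpression"));
    LipSync          = CreateDefaultSubobject<UElevenLabsLipSyncComponent>(TEXT("LipSync"));
    // Assign the same PoseMap asset and TargetMesh to FacialExpression and LipSync
    // (in the editor or in BeginPlay); emotion data then flows
    // Agent -> FacialExpressionComponent -> LipSyncComponent.
}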
@@ -179,6 +179,8 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
            &UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
        WebSocketProxy->OnAgentResponsePart.AddDynamic(this,
            &UElevenLabsConversationalAgentComponent::HandleAgentResponsePart);
        WebSocketProxy->OnClientToolCall.AddDynamic(this,
            &UElevenLabsConversationalAgentComponent::HandleClientToolCall);
    }

    // Pass configuration to the proxy before connecting.
@@ -429,6 +431,8 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
    GeneratingTickCount = 0;
    TurnIndex = 0;
    LastClosedTurnIndex = 0;
    CurrentEmotion = EElevenLabsEmotion::Neutral;
    CurrentEmotionIntensity = EElevenLabsEmotionIntensity::Medium;
    {
        FScopeLock Lock(&MicSendLock);
        MicAccumulationBuffer.Reset();
@@ -540,6 +544,77 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponsePart(const FStr
    }
}

void UElevenLabsConversationalAgentComponent::HandleClientToolCall(const FElevenLabsClientToolCall& ToolCall)
{
    // Built-in handler for the "set_emotion" tool: parse emotion + intensity, auto-respond, broadcast.
    if (ToolCall.ToolName == TEXT("set_emotion"))
    {
        // Parse emotion
        EElevenLabsEmotion NewEmotion = EElevenLabsEmotion::Neutral;
        const FString* EmotionStr = ToolCall.Parameters.Find(TEXT("emotion"));
        if (EmotionStr)
        {
            const FString Lower = EmotionStr->ToLower();
            if (Lower == TEXT("joy") || Lower == TEXT("happy") || Lower == TEXT("happiness"))
                NewEmotion = EElevenLabsEmotion::Joy;
            else if (Lower == TEXT("sadness") || Lower == TEXT("sad"))
                NewEmotion = EElevenLabsEmotion::Sadness;
            else if (Lower == TEXT("anger") || Lower == TEXT("angry"))
                NewEmotion = EElevenLabsEmotion::Anger;
            else if (Lower == TEXT("surprise") || Lower == TEXT("surprised"))
                NewEmotion = EElevenLabsEmotion::Surprise;
            else if (Lower == TEXT("fear") || Lower == TEXT("afraid") || Lower == TEXT("scared"))
                NewEmotion = EElevenLabsEmotion::Fear;
            else if (Lower == TEXT("disgust") || Lower == TEXT("disgusted"))
                NewEmotion = EElevenLabsEmotion::Disgust;
            else if (Lower == TEXT("neutral"))
                NewEmotion = EElevenLabsEmotion::Neutral;
            else
                UE_LOG(LogElevenLabsAgent, Warning, TEXT("Unknown emotion '%s', defaulting to Neutral."), **EmotionStr);
        }

        // Parse intensity (default: medium)
        EElevenLabsEmotionIntensity NewIntensity = EElevenLabsEmotionIntensity::Medium;
        const FString* IntensityStr = ToolCall.Parameters.Find(TEXT("intensity"));
        if (IntensityStr)
        {
            const FString Lower = IntensityStr->ToLower();
            if (Lower == TEXT("low") || Lower == TEXT("subtle") || Lower == TEXT("light"))
                NewIntensity = EElevenLabsEmotionIntensity::Low;
            else if (Lower == TEXT("medium") || Lower == TEXT("moderate") || Lower == TEXT("normal"))
                NewIntensity = EElevenLabsEmotionIntensity::Medium;
            else if (Lower == TEXT("high") || Lower == TEXT("strong") || Lower == TEXT("extreme") || Lower == TEXT("intense"))
                NewIntensity = EElevenLabsEmotionIntensity::High;
            else
                UE_LOG(LogElevenLabsAgent, Warning, TEXT("Unknown intensity '%s', defaulting to Medium."), **IntensityStr);
        }

        CurrentEmotion = NewEmotion;
        CurrentEmotionIntensity = NewIntensity;
        const double T = FPlatformTime::Seconds() - SessionStartTime;
        UE_LOG(LogElevenLabsAgent, Log, TEXT("[T+%.2fs] Agent emotion changed to: %s (%s)"),
            T, *UEnum::GetValueAsString(NewEmotion), *UEnum::GetValueAsString(NewIntensity));

        OnAgentEmotionChanged.Broadcast(NewEmotion, NewIntensity);

        // Auto-respond to the tool call so the agent can continue.
        if (WebSocketProxy)
        {
            WebSocketProxy->SendClientToolResult(
                ToolCall.ToolCallId,
                FString::Printf(TEXT("emotion set to %s (%s)"),
                    *UEnum::GetValueAsString(NewEmotion),
                    *UEnum::GetValueAsString(NewIntensity)),
                false);
        }
    }
    else
    {
        // Unknown tool — forward to Blueprint for custom handling.
        OnAgentClientToolCall.Broadcast(ToolCall);
    }
}
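For this handler to fire, a matching client tool must be configured on the agent on the ElevenLabs side. A sketch of what that definition could look like (the exact configuration schema is an assumption; only the tool name and the two string parameters are fixed by the handler above):

// {
//   "name": "set_emotion",
//   "description": "Set the character's facial emotion before speaking.",
//   "parameters": {
//     "emotion":   "joy | sadness | anger | surprise | fear | disgust | neutral",
//     "intensity": "low | medium | high (optional, defaults to medium)"
//   }
// }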

// ─────────────────────────────────────────────────────────────────────────────
// Audio playback
// ─────────────────────────────────────────────────────────────────────────────
@@ -0,0 +1,239 @@
// Copyright ASTERION. All Rights Reserved.

#include "ElevenLabsFacialExpressionComponent.h"
#include "ElevenLabsConversationalAgentComponent.h"
#include "ElevenLabsLipSyncPoseMap.h"
#include "Animation/AnimSequence.h"
#include "Components/SkeletalMeshComponent.h"

DEFINE_LOG_CATEGORY_STATIC(LogElevenLabsFacialExpr, Log, All);

// ─────────────────────────────────────────────────────────────────────────────
// Construction
// ─────────────────────────────────────────────────────────────────────────────

UElevenLabsFacialExpressionComponent::UElevenLabsFacialExpressionComponent()
{
    PrimaryComponentTick.bCanEverTick = true;
    PrimaryComponentTick.TickGroup = TG_PrePhysics;
}

// ─────────────────────────────────────────────────────────────────────────────
// BeginPlay / EndPlay
// ─────────────────────────────────────────────────────────────────────────────

void UElevenLabsFacialExpressionComponent::BeginPlay()
{
    Super::BeginPlay();

    // Find the agent component on the same actor
    AActor* Owner = GetOwner();
    if (!Owner)
    {
        UE_LOG(LogElevenLabsFacialExpr, Warning, TEXT("No owner actor — facial expressions disabled."));
        return;
    }

    auto* Agent = Owner->FindComponentByClass<UElevenLabsConversationalAgentComponent>();
    if (Agent)
    {
        AgentComponent = Agent;
        Agent->OnAgentEmotionChanged.AddDynamic(
            this, &UElevenLabsFacialExpressionComponent::OnEmotionChanged);

        UE_LOG(LogElevenLabsFacialExpr, Log,
            TEXT("Facial expression bound to agent component on %s."), *Owner->GetName());
    }
    else
    {
        UE_LOG(LogElevenLabsFacialExpr, Warning,
            TEXT("No ElevenLabsConversationalAgentComponent found on %s — "
                 "facial expression will not respond to emotion changes."),
            *Owner->GetName());
    }

    // Extract emotion curves from PoseMap
    InitializeEmotionPoses();
}

void UElevenLabsFacialExpressionComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
{
    if (AgentComponent.IsValid())
    {
        AgentComponent->OnAgentEmotionChanged.RemoveDynamic(
            this, &UElevenLabsFacialExpressionComponent::OnEmotionChanged);
    }

    Super::EndPlay(EndPlayReason);
}

// ─────────────────────────────────────────────────────────────────────────────
// Emotion pose initialization
// ─────────────────────────────────────────────────────────────────────────────

void UElevenLabsFacialExpressionComponent::InitializeEmotionPoses()
{
    EmotionCurveMap.Reset();

    if (!PoseMap || PoseMap->EmotionPoses.Num() == 0)
    {
        UE_LOG(LogElevenLabsFacialExpr, Log,
            TEXT("No emotion poses assigned in PoseMap — facial expressions disabled."));
        return;
    }

    int32 EmotionCount = 0;
    for (const auto& EmotionPair : PoseMap->EmotionPoses)
    {
        const EElevenLabsEmotion Emotion = EmotionPair.Key;
        const FElevenLabsEmotionPoseSet& PoseSet = EmotionPair.Value;

        auto& IntensityMap = EmotionCurveMap.FindOrAdd(Emotion);

        if (PoseSet.Normal)
        {
            IntensityMap.Add(EElevenLabsEmotionIntensity::Low, ExtractCurvesFromAnim(PoseSet.Normal));
            ++EmotionCount;
        }
        if (PoseSet.Medium)
        {
            IntensityMap.Add(EElevenLabsEmotionIntensity::Medium, ExtractCurvesFromAnim(PoseSet.Medium));
            ++EmotionCount;
        }
        if (PoseSet.Extreme)
        {
            IntensityMap.Add(EElevenLabsEmotionIntensity::High, ExtractCurvesFromAnim(PoseSet.Extreme));
            ++EmotionCount;
        }
    }

    UE_LOG(LogElevenLabsFacialExpr, Log,
        TEXT("=== Emotion poses: %d emotions, %d total anim slots loaded ==="),
        PoseMap->EmotionPoses.Num(), EmotionCount);
}

TMap<FName, float> UElevenLabsFacialExpressionComponent::ExtractCurvesFromAnim(UAnimSequence* AnimSeq)
{
    TMap<FName, float> CurveValues;
    if (!AnimSeq) return CurveValues;

    const IAnimationDataModel* DataModel = AnimSeq->GetDataModel();
    if (!DataModel) return CurveValues;

    const TArray<FFloatCurve>& FloatCurves = DataModel->GetFloatCurves();
    for (const FFloatCurve& Curve : FloatCurves)
    {
        const FName CurveName = Curve.GetName();
        const float Value = Curve.FloatCurve.Eval(0.0f);
        if (FMath::Abs(Value) < 0.001f) continue;
        CurveValues.Add(CurveName, Value);
    }

    UE_LOG(LogElevenLabsFacialExpr, Log,
        TEXT("Emotion anim '%s': Extracted %d non-zero curves."),
        *AnimSeq->GetName(), CurveValues.Num());
    return CurveValues;
}

// ─────────────────────────────────────────────────────────────────────────────
// Emotion change handler
// ─────────────────────────────────────────────────────────────────────────────

void UElevenLabsFacialExpressionComponent::OnEmotionChanged(
    EElevenLabsEmotion Emotion, EElevenLabsEmotionIntensity Intensity)
{
    if (Emotion == ActiveEmotion && Intensity == ActiveEmotionIntensity)
        return; // No change

    ActiveEmotion = Emotion;
    ActiveEmotionIntensity = Intensity;

    // Look up target emotion curves
    TargetEmotionCurves.Reset();
    const auto* IntensityMap = EmotionCurveMap.Find(Emotion);
    if (IntensityMap)
    {
        const auto* Curves = IntensityMap->Find(Intensity);
        if (Curves)
        {
            TargetEmotionCurves = *Curves;
        }
        else
        {
            // Fallback: try Medium, then Low, then High
            static const EElevenLabsEmotionIntensity Fallbacks[] = {
                EElevenLabsEmotionIntensity::Medium,
                EElevenLabsEmotionIntensity::Low,
                EElevenLabsEmotionIntensity::High
            };
            for (EElevenLabsEmotionIntensity Fb : Fallbacks)
            {
                Curves = IntensityMap->Find(Fb);
                if (Curves) { TargetEmotionCurves = *Curves; break; }
            }
        }
    }

    // Start blending from current to target
    EmotionBlendAlpha = 0.0f;

    UE_LOG(LogElevenLabsFacialExpr, Log,
        TEXT("Emotion target set: %s (%s) — %d curves, blending over %.1fs..."),
        *UEnum::GetValueAsString(Emotion), *UEnum::GetValueAsString(Intensity),
        TargetEmotionCurves.Num(), EmotionBlendDuration);
}

// ─────────────────────────────────────────────────────────────────────────────
// Tick — smooth emotion blending
// ─────────────────────────────────────────────────────────────────────────────

void UElevenLabsFacialExpressionComponent::TickComponent(
    float DeltaTime, ELevelTick TickType, FActorComponentTickFunction* ThisTickFunction)
{
    Super::TickComponent(DeltaTime, TickType, ThisTickFunction);

    if (EmotionCurveMap.Num() == 0)
        return; // No emotion data loaded

    // Advance blend alpha
    if (EmotionBlendAlpha < 1.0f)
    {
        const float BlendSpeed = 1.0f / FMath::Max(0.05f, EmotionBlendDuration);
        EmotionBlendAlpha = FMath::Min(1.0f, EmotionBlendAlpha + DeltaTime * BlendSpeed);
    }

    // Blend CurrentEmotionCurves toward TargetEmotionCurves
    {
        TSet<FName> AllCurves;
        for (const auto& P : CurrentEmotionCurves) AllCurves.Add(P.Key);
        for (const auto& P : TargetEmotionCurves) AllCurves.Add(P.Key);

        for (const FName& CurveName : AllCurves)
        {
            const float Current = CurrentEmotionCurves.Contains(CurveName)
                ? CurrentEmotionCurves[CurveName] : 0.0f;
            const float Target = TargetEmotionCurves.Contains(CurveName)
                ? TargetEmotionCurves[CurveName] : 0.0f;
            const float Blended = FMath::Lerp(Current, Target, EmotionBlendAlpha);

            if (FMath::Abs(Blended) > 0.001f)
                CurrentEmotionCurves.FindOrAdd(CurveName) = Blended;
            else
                CurrentEmotionCurves.Remove(CurveName);
        }
    }
}
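A tuning note on the blend above: EmotionBlendAlpha ramps linearly over EmotionBlendDuration, but because CurrentEmotionCurves is re-lerped in place every tick, the steps compound. At 60 FPS with the default 0.5 s duration, roughly 90% of the change lands in the first third of the window, so the felt transition is snappier than EmotionBlendDuration suggests (and the exact shape is frame-rate dependent). The curves still land exactly on TargetEmotionCurves once the alpha reaches 1.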

// ─────────────────────────────────────────────────────────────────────────────
// Mouth curve classification
// ─────────────────────────────────────────────────────────────────────────────

bool UElevenLabsFacialExpressionComponent::IsMouthCurve(const FName& CurveName)
{
    const FString Name = CurveName.ToString().ToLower();
    return Name.Contains(TEXT("jaw"))
        || Name.Contains(TEXT("mouth"))
        || Name.Contains(TEXT("lips"))
        || Name.Contains(TEXT("tongue"))
        || Name.Contains(TEXT("cheekpuff"));
}
@@ -3,7 +3,7 @@
#include "ElevenLabsLipSyncComponent.h"
#include "ElevenLabsLipSyncPoseMap.h"
#include "ElevenLabsConversationalAgentComponent.h"
#include "ElevenLabsDefinitions.h"
#include "ElevenLabsFacialExpressionComponent.h"
#include "Components/SkeletalMeshComponent.h"
#include "Engine/SkeletalMesh.h"
#include "Animation/MorphTarget.h"
@@ -539,6 +539,7 @@ void UElevenLabsLipSyncComponent::InitializePoseMappings()
        UE_LOG(LogElevenLabsLipSync, Log,
            TEXT("No phoneme pose AnimSequences assigned — using hardcoded ARKit mapping."));
    }

}

void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReason)
@@ -2274,6 +2275,44 @@ void UElevenLabsLipSyncComponent::MapVisemesToBlendshapes()
        }
    }

    // ── Merge emotion base layer from FacialExpressionComponent ──────────
    // Emotion provides the base expression (eyes, brows, cheeks).
    // Lip sync overrides only mouth-area curves.
    if (AActor* Owner = GetOwner())
    {
        if (auto* FaceExpr = Owner->FindComponentByClass<UElevenLabsFacialExpressionComponent>())
        {
            const TMap<FName, float>& EmotionCurves = FaceExpr->GetCurrentEmotionCurves();
            if (EmotionCurves.Num() > 0)
            {
                // Collect which curves lip sync is actively driving (mouth area)
                TSet<FName> LipSyncMouthCurves;
                for (const auto& Pair : CurrentBlendshapes)
                {
                    if (UElevenLabsFacialExpressionComponent::IsMouthCurve(Pair.Key) && Pair.Value > 0.01f)
                        LipSyncMouthCurves.Add(Pair.Key);
                }

                // Add non-mouth emotion curves (eyes, brows, cheeks, nose)
                for (const auto& Pair : EmotionCurves)
                {
                    if (!UElevenLabsFacialExpressionComponent::IsMouthCurve(Pair.Key))
                    {
                        // Emotion controls non-mouth curves exclusively
                        CurrentBlendshapes.FindOrAdd(Pair.Key) = Pair.Value;
                    }
                    else if (!LipSyncMouthCurves.Contains(Pair.Key))
                    {
                        // Mouth curves from emotion only if lip sync has nothing active there
                        // (e.g. during silence, the emotion's mouth pose shows through)
                        CurrentBlendshapes.FindOrAdd(Pair.Key) = Pair.Value;
                    }
                    // Otherwise: lip sync already has a value for this mouth curve — keep it
                }
            }
        }
    }

    // Clamp all values. Use wider range for pose data (CTRL curves can exceed 1.0).
    const float MaxClamp = bUsePoseMapping ? 2.0f : 1.0f;
    for (auto& Pair : CurrentBlendshapes)
@@ -391,6 +391,10 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
        // Silently ignore — corrected text after interruption.
        UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
    }
    else if (MsgType == ElevenLabsMessageType::ClientToolCall)
    {
        HandleClientToolCall(Root);
    }
    else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
    {
        HandleInterruption(Root);
@@ -658,6 +662,64 @@ void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>
    OnInterrupted.Broadcast();
}

void UElevenLabsWebSocketProxy::HandleClientToolCall(const TSharedPtr<FJsonObject>& Root)
{
    // Incoming: { "type": "client_tool_call", "client_tool_call": {
    //              "tool_name": "set_emotion", "tool_call_id": "abc123",
    //              "parameters": { "emotion": "surprise" } } }
    const TSharedPtr<FJsonObject>* ToolCallObj = nullptr;
    if (!Root->TryGetObjectField(TEXT("client_tool_call"), ToolCallObj) || !ToolCallObj)
    {
        UE_LOG(LogElevenLabsWS, Warning, TEXT("client_tool_call: missing client_tool_call object."));
        return;
    }

    FElevenLabsClientToolCall ToolCall;
    (*ToolCallObj)->TryGetStringField(TEXT("tool_name"), ToolCall.ToolName);
    (*ToolCallObj)->TryGetStringField(TEXT("tool_call_id"), ToolCall.ToolCallId);

    // Extract parameters as string key-value pairs
    const TSharedPtr<FJsonObject>* ParamsObj = nullptr;
    if ((*ToolCallObj)->TryGetObjectField(TEXT("parameters"), ParamsObj) && ParamsObj)
    {
        for (const auto& Pair : (*ParamsObj)->Values)
        {
            FString Value;
            if (Pair.Value->TryGetString(Value))
            {
                ToolCall.Parameters.Add(Pair.Key, Value);
            }
            else
            {
                // For non-string values, serialize to string
                ToolCall.Parameters.Add(Pair.Key, Pair.Value->AsString());
            }
        }
    }

    const double T = FPlatformTime::Seconds() - SessionStartTime;
    UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] Client tool call: %s (id=%s, %d params)"),
        T, *ToolCall.ToolName, *ToolCall.ToolCallId, ToolCall.Parameters.Num());

    OnClientToolCall.Broadcast(ToolCall);
}

void UElevenLabsWebSocketProxy::SendClientToolResult(const FString& ToolCallId, const FString& Result, bool bIsError)
{
    // Outgoing: { "type": "client_tool_result", "tool_call_id": "abc123",
    //             "result": "emotion set to surprise", "is_error": false }
    TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
    Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::ClientToolResult);
    Msg->SetStringField(TEXT("tool_call_id"), ToolCallId);
    Msg->SetStringField(TEXT("result"), Result);
    Msg->SetBoolField(TEXT("is_error"), bIsError);
    SendJsonMessage(Msg);

    const double T = FPlatformTime::Seconds() - SessionStartTime;
    UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] Sent client_tool_result for %s: %s (error=%s)"),
        T, *ToolCallId, *Result, bIsError ? TEXT("true") : TEXT("false"));
}

void UElevenLabsWebSocketProxy::HandlePing(const TSharedPtr<FJsonObject>& Root)
{
    // Reply with a pong to keep the connection alive.
@@ -62,6 +62,23 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentPartialResponse,
 */
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentResponseTimeout);

/**
 * Fired when the agent sets an emotion via the "set_emotion" client tool.
 * Use this to drive facial expressions on your character (MetaHuman blendshapes, etc.).
 * The emotion changes BEFORE the corresponding audio arrives, giving time to blend.
 */
DECLARE_DYNAMIC_MULTICAST_DELEGATE_TwoParams(FOnAgentEmotionChanged,
    EElevenLabsEmotion, Emotion,
    EElevenLabsEmotionIntensity, Intensity);

/**
 * Fired for any client tool call that is NOT automatically handled (i.e. not "set_emotion").
 * Use this to implement custom client tools in Blueprint.
 * You MUST call SendClientToolResult on the WebSocketProxy to acknowledge the call.
 */
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnAgentClientToolCall,
    const FElevenLabsClientToolCall&, ToolCall);

// Non-dynamic delegate for raw agent audio (high-frequency, C++ consumers only).
// Delivers PCM chunks as int16, 16kHz mono, little-endian.
DECLARE_MULTICAST_DELEGATE_OneParam(FOnAgentAudioData, const TArray<uint8>& /*PCMData*/);
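A consumer sketch for the emotion delegate (AMyCharacter and HandleEmotion are hypothetical; the handler must be a UFUNCTION for AddDynamic to bind):

void AMyCharacter::BeginPlay()
{
    Super::BeginPlay();
    if (auto* Agent = FindComponentByClass<UElevenLabsConversationalAgentComponent>())
    {
        Agent->OnAgentEmotionChanged.AddDynamic(this, &AMyCharacter::HandleEmotion);
    }
}

// Declared UFUNCTION() in the header. Fires before the emotional audio arrives.
void AMyCharacter::HandleEmotion(EElevenLabsEmotion Emotion, EElevenLabsEmotionIntensity Intensity)
{
    // e.g. swap idle animation sets or tint a UI element per emotion.
}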
@@ -208,6 +225,24 @@ public:
        meta = (ToolTip = "Fires if the server doesn't respond within ResponseTimeoutSeconds.\nUse to show 'try again' or re-open the mic automatically."))
    FOnAgentResponseTimeout OnAgentResponseTimeout;

    /** Fired when the agent changes emotion via the "set_emotion" client tool. The emotion is set BEFORE the corresponding audio arrives, giving you time to smoothly blend facial expressions. */
    UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
        meta = (ToolTip = "Fires when the agent sets an emotion (joy, sadness, surprise, fear, anger, disgust).\nDriven by the 'set_emotion' client tool. Arrives before the audio."))
    FOnAgentEmotionChanged OnAgentEmotionChanged;

    /** Fired for client tool calls that are NOT automatically handled (i.e. not "set_emotion"). You must call GetWebSocketProxy()->SendClientToolResult() to respond. */
    UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events",
        meta = (ToolTip = "Fires for custom client tool calls (not set_emotion).\nYou must respond via GetWebSocketProxy()->SendClientToolResult()."))
    FOnAgentClientToolCall OnAgentClientToolCall;

    /** The current emotion of the agent, as set by the "set_emotion" client tool. Defaults to Neutral. */
    UPROPERTY(BlueprintReadOnly, Category = "ElevenLabs")
    EElevenLabsEmotion CurrentEmotion = EElevenLabsEmotion::Neutral;

    /** The current emotion intensity. Defaults to Medium. */
    UPROPERTY(BlueprintReadOnly, Category = "ElevenLabs")
    EElevenLabsEmotionIntensity CurrentEmotionIntensity = EElevenLabsEmotionIntensity::Medium;

    // ── Raw audio data (C++ only, used by LipSync component) ────────────────
    /** Raw PCM audio from the agent (int16, 16kHz mono). Fires for each WebSocket audio chunk.
     * Used internally by UElevenLabsLipSyncComponent for spectral analysis. */
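A sketch of a custom tool handler bound to OnAgentClientToolCall (the open_door tool, TryOpenNearestDoor, and the Agent member are invented; SendClientToolResult and GetWebSocketProxy are part of this commit):

// UFUNCTION() handler bound to OnAgentClientToolCall.
void AMyCharacter::HandleCustomTool(const FElevenLabsClientToolCall& ToolCall)
{
    if (ToolCall.ToolName == TEXT("open_door")) // hypothetical example tool
    {
        const bool bOk = TryOpenNearestDoor(); // hypothetical game logic
        Agent->GetWebSocketProxy()->SendClientToolResult(
            ToolCall.ToolCallId,
            bOk ? TEXT("door opened") : TEXT("no door nearby"),
            /*bIsError=*/!bOk);
    }
}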
@@ -312,6 +347,9 @@ private:
    UFUNCTION()
    void HandleAgentResponsePart(const FString& PartialText);

    UFUNCTION()
    void HandleClientToolCall(const FElevenLabsClientToolCall& ToolCall);

    // ── Audio playback ────────────────────────────────────────────────────────
    void InitAudioPlayback();
    void EnqueueAgentAudio(const TArray<uint8>& PCMData);

@@ -108,3 +108,50 @@ struct PS_AI_AGENT_ELEVENLABS_API FElevenLabsTranscriptSegment
    UPROPERTY(BlueprintReadOnly, Category = "ElevenLabs")
    bool bIsFinal = false;
};

// ─────────────────────────────────────────────────────────────────────────────
// Agent emotion (driven by client tool "set_emotion" from the LLM)
// ─────────────────────────────────────────────────────────────────────────────
UENUM(BlueprintType)
enum class EElevenLabsEmotion : uint8
{
    Neutral  UMETA(DisplayName = "Neutral"),
    Joy      UMETA(DisplayName = "Joy"),
    Sadness  UMETA(DisplayName = "Sadness"),
    Anger    UMETA(DisplayName = "Anger"),
    Surprise UMETA(DisplayName = "Surprise"),
    Fear     UMETA(DisplayName = "Fear"),
    Disgust  UMETA(DisplayName = "Disgust"),
};

// ─────────────────────────────────────────────────────────────────────────────
// Emotion intensity (maps to Normal/Medium/Extreme pose variants)
// ─────────────────────────────────────────────────────────────────────────────
UENUM(BlueprintType)
enum class EElevenLabsEmotionIntensity : uint8
{
    Low    UMETA(DisplayName = "Low (Normal)"),
    Medium UMETA(DisplayName = "Medium"),
    High   UMETA(DisplayName = "High (Extreme)"),
};

// ─────────────────────────────────────────────────────────────────────────────
// Client tool call received from ElevenLabs server
// ─────────────────────────────────────────────────────────────────────────────
USTRUCT(BlueprintType)
struct PS_AI_AGENT_ELEVENLABS_API FElevenLabsClientToolCall
{
    GENERATED_BODY()

    /** Name of the tool the agent wants to invoke (e.g. "set_emotion"). */
    UPROPERTY(BlueprintReadOnly, Category = "ElevenLabs")
    FString ToolName;

    /** Unique ID for this tool invocation — must be echoed back in client_tool_result. */
    UPROPERTY(BlueprintReadOnly, Category = "ElevenLabs")
    FString ToolCallId;

    /** Raw JSON parameters as key-value string pairs. */
    UPROPERTY(BlueprintReadOnly, Category = "ElevenLabs")
    TMap<FString, FString> Parameters;
};
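For orientation, the client_tool_call example from the proxy arrives in this struct as follows (the intensity entry is optional and shown for illustration):

// ToolCall.ToolName   == "set_emotion"
// ToolCall.ToolCallId == "abc123"
// ToolCall.Parameters == { {"emotion", "surprise"}, {"intensity", "high"} }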
@@ -0,0 +1,116 @@
// Copyright ASTERION. All Rights Reserved.

#pragma once

#include "CoreMinimal.h"
#include "Components/ActorComponent.h"
#include "ElevenLabsDefinitions.h"
#include "ElevenLabsFacialExpressionComponent.generated.h"

class UElevenLabsConversationalAgentComponent;
class UElevenLabsLipSyncPoseMap;
class USkeletalMeshComponent;

// ─────────────────────────────────────────────────────────────────────────────
// UElevenLabsFacialExpressionComponent
//
// Drives emotion-based facial expressions on a MetaHuman (or any skeletal mesh)
// as a BASE layer. Lip sync (from ElevenLabsLipSyncComponent) modulates on top,
// overriding only mouth-area curves.
//
// Workflow:
// 1. Assign a PoseMap data asset with Emotion Poses filled in.
// 2. Assign the TargetMesh (same mesh as the LipSync component).
// 3. The component listens to OnAgentEmotionChanged from the agent component.
// 4. Emotion curves are smoothly blended (~500ms transitions).
// 5. The LipSync component reads GetCurrentEmotionCurves() to merge as base layer.
// ─────────────────────────────────────────────────────────────────────────────
UCLASS(ClassGroup = "ElevenLabs", meta = (BlueprintSpawnableComponent),
    DisplayName = "ElevenLabs Facial Expression")
class PS_AI_AGENT_ELEVENLABS_API UElevenLabsFacialExpressionComponent : public UActorComponent
{
    GENERATED_BODY()

public:
    UElevenLabsFacialExpressionComponent();

    // ── Configuration ─────────────────────────────────────────────────────────

    /** Pose map asset containing emotion AnimSequences (Normal / Medium / Extreme per emotion).
     * Can be the same PoseMap asset used by the LipSync component. */
    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|FacialExpression",
        meta = (ToolTip = "Pose map with Emotion Poses filled in.\nCan be the same asset as the LipSync component."))
    TObjectPtr<UElevenLabsLipSyncPoseMap> PoseMap;

    /** Skeletal mesh to apply emotion curves to.
     * Should be the same mesh as the LipSync component's TargetMesh. */
    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|FacialExpression",
        meta = (ToolTip = "Skeletal mesh for emotion curves.\nShould match the LipSync component's TargetMesh."))
    TObjectPtr<USkeletalMeshComponent> TargetMesh;

    /** Emotion transition duration in seconds. */
    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|FacialExpression",
        meta = (ClampMin = "0.1", ClampMax = "3.0",
            ToolTip = "How long (seconds) to blend between emotions.\n0.5 = snappy, 1.5 = smooth."))
    float EmotionBlendDuration = 0.5f;

    // ── Getters ───────────────────────────────────────────────────────────────

    /** Get the current smoothed emotion curves (for the LipSync component to merge). */
    UFUNCTION(BlueprintCallable, Category = "ElevenLabs|FacialExpression")
    const TMap<FName, float>& GetCurrentEmotionCurves() const { return CurrentEmotionCurves; }

    /** Get the active emotion. */
    UFUNCTION(BlueprintPure, Category = "ElevenLabs|FacialExpression")
    EElevenLabsEmotion GetActiveEmotion() const { return ActiveEmotion; }

    /** Get the active emotion intensity. */
    UFUNCTION(BlueprintPure, Category = "ElevenLabs|FacialExpression")
    EElevenLabsEmotionIntensity GetActiveIntensity() const { return ActiveEmotionIntensity; }

    /** Check if a curve name belongs to the mouth area (overridden by lip sync). */
    UFUNCTION(BlueprintPure, Category = "ElevenLabs|FacialExpression")
    static bool IsMouthCurve(const FName& CurveName);

    // ── UActorComponent overrides ─────────────────────────────────────────────
    virtual void BeginPlay() override;
    virtual void EndPlay(const EEndPlayReason::Type EndPlayReason) override;
    virtual void TickComponent(float DeltaTime, ELevelTick TickType,
        FActorComponentTickFunction* ThisTickFunction) override;

private:
    // ── Event handlers ────────────────────────────────────────────────────────

    /** Called when the agent changes emotion via client tool. */
    UFUNCTION()
    void OnEmotionChanged(EElevenLabsEmotion Emotion, EElevenLabsEmotionIntensity Intensity);

    // ── Curve extraction ──────────────────────────────────────────────────────

    /** Extract curve values at t=0 from an AnimSequence. */
    TMap<FName, float> ExtractCurvesFromAnim(UAnimSequence* AnimSeq);

    /** Initialize emotion curve data from PoseMap at BeginPlay. */
    void InitializeEmotionPoses();

    // ── State ─────────────────────────────────────────────────────────────────

    /** Extracted curve data: Emotion → Intensity → { CurveName → Value }. */
    TMap<EElevenLabsEmotion, TMap<EElevenLabsEmotionIntensity, TMap<FName, float>>> EmotionCurveMap;

    /** Current smoothed emotion curves (blended each tick). */
    TMap<FName, float> CurrentEmotionCurves;

    /** Target emotion curves (set when emotion changes, blended toward). */
    TMap<FName, float> TargetEmotionCurves;

    /** Current blend progress (0 = old emotion, 1 = target emotion). */
    float EmotionBlendAlpha = 1.0f;

    /** Active emotion (for change detection). */
    EElevenLabsEmotion ActiveEmotion = EElevenLabsEmotion::Neutral;
    EElevenLabsEmotionIntensity ActiveEmotionIntensity = EElevenLabsEmotionIntensity::Medium;

    /** Cached reference to the agent component on the same Actor. */
    TWeakObjectPtr<UElevenLabsConversationalAgentComponent> AgentComponent;
};
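Following the workflow comment above, a consumer can poll the getters at runtime. A small debug sketch (AMyMetaHuman and its FacialExpression member are hypothetical):

void AMyMetaHuman::Tick(float DeltaSeconds)
{
    Super::Tick(DeltaSeconds);
    if (FacialExpression) // hypothetical UElevenLabsFacialExpressionComponent* member
    {
        UE_LOG(LogTemp, Verbose, TEXT("Emotion %s (%s), %d live curves"),
            *UEnum::GetValueAsString(FacialExpression->GetActiveEmotion()),
            *UEnum::GetValueAsString(FacialExpression->GetActiveIntensity()),
            FacialExpression->GetCurrentEmotionCurves().Num());
    }
}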
@@ -5,10 +5,35 @@
#include "CoreMinimal.h"
#include "Engine/DataAsset.h"
#include "Engine/AssetManager.h"
#include "ElevenLabsDefinitions.h"
#include "ElevenLabsLipSyncPoseMap.generated.h"

class UAnimSequence;

// ─────────────────────────────────────────────────────────────────────────────
// Emotion pose set: 3 intensity levels (Normal / Medium / Extreme)
// ─────────────────────────────────────────────────────────────────────────────
USTRUCT(BlueprintType)
struct PS_AI_AGENT_ELEVENLABS_API FElevenLabsEmotionPoseSet
{
    GENERATED_BODY()

    /** Low intensity expression (subtle). E.g. MHF_Happy_N */
    UPROPERTY(EditAnywhere, BlueprintReadWrite,
        meta = (ToolTip = "Low intensity (Normal). E.g. MHF_Happy_N"))
    TObjectPtr<UAnimSequence> Normal;

    /** Medium intensity expression. E.g. MHF_Happy_M */
    UPROPERTY(EditAnywhere, BlueprintReadWrite,
        meta = (ToolTip = "Medium intensity. E.g. MHF_Happy_M"))
    TObjectPtr<UAnimSequence> Medium;

    /** High intensity expression (extreme). E.g. MHF_Happy_E */
    UPROPERTY(EditAnywhere, BlueprintReadWrite,
        meta = (ToolTip = "High intensity (Extreme). E.g. MHF_Happy_E"))
    TObjectPtr<UAnimSequence> Extreme;
};

/**
 * Reusable data asset that maps OVR visemes to phoneme pose AnimSequences.
 *
@@ -103,4 +128,17 @@ public:
    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Phoneme Poses",
        meta = (ToolTip = "Close back vowel (OO). E.g. MHF_OU"))
    TObjectPtr<UAnimSequence> PoseOU;

    // ── Emotion Poses ────────────────────────────────────────────────────────
    //
    // Facial expression animations for each emotion, with 3 intensity levels.
    // These are applied as a BASE layer (eyes, eyebrows, cheeks).
    // Lip sync MODULATES on top, overriding only mouth-area curves.

    /** Map of emotions to their pose sets (Normal / Medium / Extreme).
     * Add entries for each emotion your agent uses (Joy, Sadness, Anger, Surprise, Fear, Disgust).
     * Neutral is optional — absence means no base expression. */
    UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Emotion Poses",
        meta = (ToolTip = "Emotion → AnimSequence mapping with 3 intensity levels.\nThese drive the base facial expression (eyes, brows, cheeks).\nLip sync overrides the mouth area on top."))
    TMap<EElevenLabsEmotion, FElevenLabsEmotionPoseSet> EmotionPoses;
};
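An illustrative fill for EmotionPoses: the Happy names follow the MHF_Happy_N/M/E convention from the tooltips above; the remaining asset names are assumptions:

//   Joy      -> Normal: MHF_Happy_N,     Medium: MHF_Happy_M,     Extreme: MHF_Happy_E
//   Sadness  -> Normal: MHF_Sad_N,       Medium: MHF_Sad_M,       Extreme: MHF_Sad_E        (assumed names)
//   Surprise -> Normal: MHF_Surprised_N, Medium: MHF_Surprised_M, Extreme: MHF_Surprised_E  (assumed names)
//   Neutral  -> omitted: no base expression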
@@ -48,6 +48,10 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponsePart,
    const FString&, PartialText);

/** Fired when the server sends a client_tool_call — the agent wants the client to execute a tool. */
DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsClientToolCall,
    const FElevenLabsClientToolCall&, ToolCall);


// ─────────────────────────────────────────────────────────────────────────────
// WebSocket Proxy
@@ -103,6 +107,10 @@ public:
    UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
    FOnElevenLabsAgentResponsePart OnAgentResponsePart;

    /** Fired when the agent invokes a client tool. Handle the call and reply with SendClientToolResult. */
    UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
    FOnElevenLabsClientToolCall OnClientToolCall;

    // ── Lifecycle ─────────────────────────────────────────────────────────────

    /**
@@ -172,6 +180,17 @@ public:
    UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
    void SendInterrupt();

    /**
     * Send the result of a client tool call back to ElevenLabs.
     * Must be called after receiving an OnClientToolCall event.
     *
     * @param ToolCallId The tool_call_id from the original client_tool_call.
     * @param Result     A string result to return to the agent.
     * @param bIsError   True if the tool execution failed.
     */
    UFUNCTION(BlueprintCallable, Category = "ElevenLabs")
    void SendClientToolResult(const FString& ToolCallId, const FString& Result, bool bIsError = false);

    // ── Info ──────────────────────────────────────────────────────────────────

    UFUNCTION(BlueprintPure, Category = "ElevenLabs")
@@ -193,6 +212,7 @@ private:
    void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
    void HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Payload);
    void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
    void HandleClientToolCall(const TSharedPtr<FJsonObject>& Payload);
    void HandlePing(const TSharedPtr<FJsonObject>& Payload);

    /** Build and send a JSON text frame to the server. */