Add Expressive Mode support for ElevenLabs V3 Conversational TTS

- Add bExpressiveMode toggle and editable ExpressiveModePromptFragment
  with audio tag instructions ([laughs], [whispers], [sighs], [slow], [excited])
- BuildAgentPayload: append prompt fragment, set expressive_mode API field
  on agent config, auto-override TTS model to eleven_v3_conversational
- OnFetchAgent: strip expressive fragment from prompt (exact + marker fallback),
  read expressive_mode bool from API, auto-detect V3 model
- TTS model combo: inject asset's current model if absent from /v1/models list
  (covers agent-only models like eleven_v3_conversational)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-12 11:06:20 +01:00
parent e5a32f5997
commit 18996a7254
2 changed files with 111 additions and 4 deletions

View File

@ -237,6 +237,35 @@ public:
"- \"high\" for strong reactions (big laugh, deep sadness, shock)\n\n" "- \"high\" for strong reactions (big laugh, deep sadness, shock)\n\n"
"Always return to neutral when the emotional moment passes."); "Always return to neutral when the emotional moment passes.");
// ── Expressive Mode (V3 Conversational) ─────────────────────────────────
/** Enable Expressive Mode (requires TTS model "eleven_v3_conversational").
* When enabled, the agent uses emotionally intelligent speech with
* natural intonation and expressive audio tags like [laughs], [whispers].
* A prompt fragment is appended to instruct the LLM to use these tags. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Expressive Mode",
meta = (ToolTip = "Enable Expressive Mode.\nRequires V3 Conversational TTS model.\nAdds expressive audio tag instructions to the prompt."))
bool bExpressiveMode = false;
/** System prompt fragment appended when bExpressiveMode is true.
* Instructs the LLM to use inline audio tags for emotional delivery.
 * Editable for customization; stripped automatically on Fetch. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Expressive Mode",
meta = (MultiLine = "true", EditCondition = "bExpressiveMode",
ToolTip = "Prompt instructions for expressive audio tags.\nAppended to CharacterPrompt when creating/updating the agent."))
FString ExpressiveModePromptFragment = TEXT(
"## Expressive Speech\n"
"You can use expressive audio tags in your responses for precise vocal control. "
"Each tag affects approximately the next 4-5 words before returning to normal delivery.\n\n"
"Available tags:\n"
"- [laughs] for moments of humor or amusement\n"
"- [whispers] for confidential or intimate moments\n"
"- [sighs] for resignation or relief\n"
"- [slow] when emphasizing important information\n"
"- [excited] for enthusiastic or energetic delivery\n\n"
"Example: \"That's great to hear! [laughs] I'm glad we could sort that out for you.\"\n\n"
"Use these tags naturally and sparingly to enhance expressiveness without overusing them.");
// ── Action Tool ───────────────────────────────────────────────────────── // ── Action Tool ─────────────────────────────────────────────────────────
/** Include a configurable "perform_action" client tool in the agent configuration. /** Include a configurable "perform_action" client tool in the agent configuration.

View File

@ -615,6 +615,18 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchModelsClicked(
if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = Pinned->GetEditedAsset()) if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = Pinned->GetEditedAsset())
{ {
int32 Idx = Pinned->ModelIDs.IndexOfByKey(Asset->TTSModelID); int32 Idx = Pinned->ModelIDs.IndexOfByKey(Asset->TTSModelID);
// Agent-only models (e.g. eleven_v3_conversational) may not appear
// in the general /v1/models list. Inject the asset's current model
// so the combo always reflects the actual value.
if (Idx == INDEX_NONE && !Asset->TTSModelID.IsEmpty())
{
FString DisplayStr = FString::Printf(TEXT("%s"), *Asset->TTSModelID);
Pinned->ModelDisplayNames.Add(MakeShareable(new FString(DisplayStr)));
Pinned->ModelIDs.Add(Asset->TTSModelID);
Idx = Pinned->ModelIDs.Num() - 1;
}
if (Idx != INDEX_NONE && Pinned->ModelComboBox.IsValid()) if (Idx != INDEX_NONE && Pinned->ModelComboBox.IsValid())
{ {
Pinned->ModelComboBox->SetSelectedItem(Pinned->ModelDisplayNames[Idx]); Pinned->ModelComboBox->SetSelectedItem(Pinned->ModelDisplayNames[Idx]);
@ -1318,6 +1330,33 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked()
} }
} }
// 5. Expressive Mode fragment
{
const FString& Fragment = Asset->ExpressiveModePromptFragment;
if (!Fragment.IsEmpty())
{
int32 Idx = Prompt.Find(Fragment, ESearchCase::CaseSensitive);
if (Idx != INDEX_NONE)
{
Prompt.LeftInline(Idx);
Prompt.TrimEndInline();
Asset->bExpressiveMode = true;
}
}
// Fallback: marker-based stripping
if (!Asset->bExpressiveMode)
{
const FString Marker = TEXT("## Expressive Speech");
int32 Idx = Prompt.Find(Marker, ESearchCase::CaseSensitive);
if (Idx != INDEX_NONE)
{
Prompt.LeftInline(Idx);
Prompt.TrimEndInline();
Asset->bExpressiveMode = true;
}
}
}
Asset->CharacterPrompt = Prompt; Asset->CharacterPrompt = Prompt;
} }
@ -1348,6 +1387,13 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked()
{ {
Asset->MaxTurns = MaxTurns; Asset->MaxTurns = MaxTurns;
} }
// expressive_mode (V3 Conversational)
bool bExpressive = false;
if ((*AgentObj)->TryGetBoolField(TEXT("expressive_mode"), bExpressive))
{
Asset->bExpressiveMode = bExpressive;
}
} }
// conversation_config.tts // conversation_config.tts
@ -1364,6 +1410,12 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked()
if ((*TTSObj)->TryGetStringField(TEXT("model_id"), ModelID)) if ((*TTSObj)->TryGetStringField(TEXT("model_id"), ModelID))
{ {
Asset->TTSModelID = ModelID; Asset->TTSModelID = ModelID;
// Auto-detect Expressive Mode from V3 Conversational model
if (ModelID == TEXT("eleven_v3_conversational"))
{
Asset->bExpressiveMode = true;
}
} }
double Stability = 0.5; double Stability = 0.5;
@ -1563,14 +1615,15 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
UE_LOG(LogPS_AI_AgentConfigEditor, Log, UE_LOG(LogPS_AI_AgentConfigEditor, Log,
TEXT("BuildAgentPayload: CharacterPrompt=%d chars, bMultilingual=%d, bAutoLangInstr=%d, Language='%s', " TEXT("BuildAgentPayload: CharacterPrompt=%d chars, bMultilingual=%d, bAutoLangInstr=%d, Language='%s', "
"LangFragment=%d chars, MultiFragment=%d chars, bEmotionTool=%d"), "LangFragment=%d chars, MultiFragment=%d chars, bEmotionTool=%d, bExpressiveMode=%d"),
Asset->CharacterPrompt.Len(), Asset->CharacterPrompt.Len(),
Asset->bMultilingual, Asset->bMultilingual,
Asset->bAutoLanguageInstruction, Asset->bAutoLanguageInstruction,
*Asset->Language, *Asset->Language,
Asset->LanguagePromptFragment.Len(), Asset->LanguagePromptFragment.Len(),
Asset->MultilingualPromptFragment.Len(), Asset->MultilingualPromptFragment.Len(),
Asset->bIncludeEmotionTool); Asset->bIncludeEmotionTool,
Asset->bExpressiveMode);
// Language handling: multilingual mode vs fixed-language mode. // Language handling: multilingual mode vs fixed-language mode.
// The ElevenLabs "language" field only controls STT/TTS — the LLM defaults to // The ElevenLabs "language" field only controls STT/TTS — the LLM defaults to
@ -1618,6 +1671,14 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ActionToolPromptFragment from ActionSet")); UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ActionToolPromptFragment from ActionSet"));
} }
// Append expressive mode instructions (V3 Conversational audio tags).
if (Asset->bExpressiveMode && !Asset->ExpressiveModePromptFragment.IsEmpty())
{
FullPrompt += TEXT("\n\n");
FullPrompt += Asset->ExpressiveModePromptFragment;
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ExpressiveModePromptFragment"));
}
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT("BuildAgentPayload: FullPrompt = %d chars"), FullPrompt.Len()); UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT("BuildAgentPayload: FullPrompt = %d chars"), FullPrompt.Len());
// prompt object (includes LLM selection + tools) // prompt object (includes LLM selection + tools)
@ -1664,6 +1725,10 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
{ {
AgentObj->SetNumberField(TEXT("max_tokens"), Asset->MaxTurns); AgentObj->SetNumberField(TEXT("max_tokens"), Asset->MaxTurns);
} }
if (Asset->bExpressiveMode)
{
AgentObj->SetBoolField(TEXT("expressive_mode"), true);
}
// tts // tts
TSharedPtr<FJsonObject> TTSObj = MakeShareable(new FJsonObject()); TSharedPtr<FJsonObject> TTSObj = MakeShareable(new FJsonObject());
@ -1678,14 +1743,27 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
// Monolingual models (e.g. eleven_monolingual_v1) only support English. // Monolingual models (e.g. eleven_monolingual_v1) only support English.
FString ResolvedModelID = Asset->TTSModelID; FString ResolvedModelID = Asset->TTSModelID;
// Expressive mode requires V3 Conversational — override if needed.
if (Asset->bExpressiveMode)
{
if (ResolvedModelID != TEXT("eleven_v3_conversational"))
{
UE_LOG(LogPS_AI_AgentConfigEditor, Warning,
TEXT("Expressive mode: overriding TTS model '%s' → eleven_v3_conversational (required for audio tags)."),
*ResolvedModelID);
ResolvedModelID = TEXT("eleven_v3_conversational");
}
}
auto IsMultilingualModel = [](const FString& ModelID) -> bool auto IsMultilingualModel = [](const FString& ModelID) -> bool
{ {
return ModelID.Contains(TEXT("multilingual")) return ModelID.Contains(TEXT("multilingual"))
|| ModelID.Contains(TEXT("turbo")) || ModelID.Contains(TEXT("turbo"))
|| ModelID.Contains(TEXT("flash")); || ModelID.Contains(TEXT("flash"))
|| ModelID.Contains(TEXT("v3")); // V3 models are multilingual
}; };
if (Asset->bMultilingual) if (!Asset->bExpressiveMode && Asset->bMultilingual)
{ {
// Multilingual mode: force a multilingual TTS model. // Multilingual mode: force a multilingual TTS model.
if (ResolvedModelID.IsEmpty() || !IsMultilingualModel(ResolvedModelID)) if (ResolvedModelID.IsEmpty() || !IsMultilingualModel(ResolvedModelID))