Add Expressive Mode support for ElevenLabs V3 Conversational TTS
- Add bExpressiveMode toggle and editable ExpressiveModePromptFragment with audio tag instructions ([laughs], [whispers], [sighs], [slow], [excited])
- BuildAgentPayload: append prompt fragment, set expressive_mode API field on agent config, auto-override TTS model to eleven_v3_conversational
- OnFetchAgent: strip expressive fragment from prompt (exact + marker fallback), read expressive_mode bool from API, auto-detect V3 model
- TTS model combo: inject asset's current model if absent from /v1/models list (covers agent-only models like eleven_v3_conversational)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e5a32f5997
commit
18996a7254
@ -237,6 +237,35 @@ public:
|
|||||||
"- \"high\" for strong reactions (big laugh, deep sadness, shock)\n\n"
|
"- \"high\" for strong reactions (big laugh, deep sadness, shock)\n\n"
|
||||||
"Always return to neutral when the emotional moment passes.");
|
"Always return to neutral when the emotional moment passes.");
|
||||||
|
|
||||||
|
// ── Expressive Mode (V3 Conversational) ─────────────────────────────────
|
||||||
|
|
||||||
|
/** Enable Expressive Mode (requires TTS model "eleven_v3_conversational").
|
||||||
|
* When enabled, the agent uses emotionally intelligent speech with
|
||||||
|
* natural intonation and expressive audio tags like [laughs], [whispers].
|
||||||
|
* A prompt fragment is appended to instruct the LLM to use these tags. */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Expressive Mode",
|
||||||
|
meta = (ToolTip = "Enable Expressive Mode.\nRequires V3 Conversational TTS model.\nAdds expressive audio tag instructions to the prompt."))
|
||||||
|
bool bExpressiveMode = false;
|
||||||
|
|
||||||
|
/** System prompt fragment appended when bExpressiveMode is true.
|
||||||
|
* Instructs the LLM to use inline audio tags for emotional delivery.
|
||||||
|
* Editable for customization — stripped automatically on Fetch. */
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Expressive Mode",
|
||||||
|
meta = (MultiLine = "true", EditCondition = "bExpressiveMode",
|
||||||
|
ToolTip = "Prompt instructions for expressive audio tags.\nAppended to CharacterPrompt when creating/updating the agent."))
|
||||||
|
FString ExpressiveModePromptFragment = TEXT(
|
||||||
|
"## Expressive Speech\n"
|
||||||
|
"You can use expressive audio tags in your responses for precise vocal control. "
|
||||||
|
"Each tag affects approximately the next 4-5 words before returning to normal delivery.\n\n"
|
||||||
|
"Available tags:\n"
|
||||||
|
"- [laughs] for moments of humor or amusement\n"
|
||||||
|
"- [whispers] for confidential or intimate moments\n"
|
||||||
|
"- [sighs] for resignation or relief\n"
|
||||||
|
"- [slow] when emphasizing important information\n"
|
||||||
|
"- [excited] for enthusiastic or energetic delivery\n\n"
|
||||||
|
"Example: \"That's great to hear! [laughs] I'm glad we could sort that out for you.\"\n\n"
|
||||||
|
"Use these tags naturally and sparingly to enhance expressiveness without overusing them.");
|
||||||
|
|
||||||
// ── Action Tool ─────────────────────────────────────────────────────────
|
// ── Action Tool ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
/** Include a configurable "perform_action" client tool in the agent configuration.
|
/** Include a configurable "perform_action" client tool in the agent configuration.
|
||||||
|
|||||||
@ -615,6 +615,18 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchModelsClicked(
|
|||||||
if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = Pinned->GetEditedAsset())
|
if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = Pinned->GetEditedAsset())
|
||||||
{
|
{
|
||||||
int32 Idx = Pinned->ModelIDs.IndexOfByKey(Asset->TTSModelID);
|
int32 Idx = Pinned->ModelIDs.IndexOfByKey(Asset->TTSModelID);
|
||||||
|
|
||||||
|
// Agent-only models (e.g. eleven_v3_conversational) may not appear
|
||||||
|
// in the general /v1/models list. Inject the asset's current model
|
||||||
|
// so the combo always reflects the actual value.
|
||||||
|
if (Idx == INDEX_NONE && !Asset->TTSModelID.IsEmpty())
|
||||||
|
{
|
||||||
|
FString DisplayStr = FString::Printf(TEXT("%s"), *Asset->TTSModelID);
|
||||||
|
Pinned->ModelDisplayNames.Add(MakeShareable(new FString(DisplayStr)));
|
||||||
|
Pinned->ModelIDs.Add(Asset->TTSModelID);
|
||||||
|
Idx = Pinned->ModelIDs.Num() - 1;
|
||||||
|
}
|
||||||
|
|
||||||
if (Idx != INDEX_NONE && Pinned->ModelComboBox.IsValid())
|
if (Idx != INDEX_NONE && Pinned->ModelComboBox.IsValid())
|
||||||
{
|
{
|
||||||
Pinned->ModelComboBox->SetSelectedItem(Pinned->ModelDisplayNames[Idx]);
|
Pinned->ModelComboBox->SetSelectedItem(Pinned->ModelDisplayNames[Idx]);
|
||||||
@ -1318,6 +1330,33 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked()
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 5. Expressive Mode fragment
|
||||||
|
{
|
||||||
|
const FString& Fragment = Asset->ExpressiveModePromptFragment;
|
||||||
|
if (!Fragment.IsEmpty())
|
||||||
|
{
|
||||||
|
int32 Idx = Prompt.Find(Fragment, ESearchCase::CaseSensitive);
|
||||||
|
if (Idx != INDEX_NONE)
|
||||||
|
{
|
||||||
|
Prompt.LeftInline(Idx);
|
||||||
|
Prompt.TrimEndInline();
|
||||||
|
Asset->bExpressiveMode = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Fallback: marker-based stripping
|
||||||
|
if (!Asset->bExpressiveMode)
|
||||||
|
{
|
||||||
|
const FString Marker = TEXT("## Expressive Speech");
|
||||||
|
int32 Idx = Prompt.Find(Marker, ESearchCase::CaseSensitive);
|
||||||
|
if (Idx != INDEX_NONE)
|
||||||
|
{
|
||||||
|
Prompt.LeftInline(Idx);
|
||||||
|
Prompt.TrimEndInline();
|
||||||
|
Asset->bExpressiveMode = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Asset->CharacterPrompt = Prompt;
|
Asset->CharacterPrompt = Prompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1348,6 +1387,13 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked()
|
|||||||
{
|
{
|
||||||
Asset->MaxTurns = MaxTurns;
|
Asset->MaxTurns = MaxTurns;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// expressive_mode (V3 Conversational)
|
||||||
|
bool bExpressive = false;
|
||||||
|
if ((*AgentObj)->TryGetBoolField(TEXT("expressive_mode"), bExpressive))
|
||||||
|
{
|
||||||
|
Asset->bExpressiveMode = bExpressive;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// conversation_config.tts
|
// conversation_config.tts
|
||||||
@ -1364,6 +1410,12 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked()
|
|||||||
if ((*TTSObj)->TryGetStringField(TEXT("model_id"), ModelID))
|
if ((*TTSObj)->TryGetStringField(TEXT("model_id"), ModelID))
|
||||||
{
|
{
|
||||||
Asset->TTSModelID = ModelID;
|
Asset->TTSModelID = ModelID;
|
||||||
|
|
||||||
|
// Auto-detect Expressive Mode from V3 Conversational model
|
||||||
|
if (ModelID == TEXT("eleven_v3_conversational"))
|
||||||
|
{
|
||||||
|
Asset->bExpressiveMode = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double Stability = 0.5;
|
double Stability = 0.5;
|
||||||
@ -1563,14 +1615,15 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
|
|||||||
|
|
||||||
UE_LOG(LogPS_AI_AgentConfigEditor, Log,
|
UE_LOG(LogPS_AI_AgentConfigEditor, Log,
|
||||||
TEXT("BuildAgentPayload: CharacterPrompt=%d chars, bMultilingual=%d, bAutoLangInstr=%d, Language='%s', "
|
TEXT("BuildAgentPayload: CharacterPrompt=%d chars, bMultilingual=%d, bAutoLangInstr=%d, Language='%s', "
|
||||||
"LangFragment=%d chars, MultiFragment=%d chars, bEmotionTool=%d"),
|
"LangFragment=%d chars, MultiFragment=%d chars, bEmotionTool=%d, bExpressiveMode=%d"),
|
||||||
Asset->CharacterPrompt.Len(),
|
Asset->CharacterPrompt.Len(),
|
||||||
Asset->bMultilingual,
|
Asset->bMultilingual,
|
||||||
Asset->bAutoLanguageInstruction,
|
Asset->bAutoLanguageInstruction,
|
||||||
*Asset->Language,
|
*Asset->Language,
|
||||||
Asset->LanguagePromptFragment.Len(),
|
Asset->LanguagePromptFragment.Len(),
|
||||||
Asset->MultilingualPromptFragment.Len(),
|
Asset->MultilingualPromptFragment.Len(),
|
||||||
Asset->bIncludeEmotionTool);
|
Asset->bIncludeEmotionTool,
|
||||||
|
Asset->bExpressiveMode);
|
||||||
|
|
||||||
// Language handling: multilingual mode vs fixed-language mode.
|
// Language handling: multilingual mode vs fixed-language mode.
|
||||||
// The ElevenLabs "language" field only controls STT/TTS — the LLM defaults to
|
// The ElevenLabs "language" field only controls STT/TTS — the LLM defaults to
|
||||||
@ -1618,6 +1671,14 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
|
|||||||
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ActionToolPromptFragment from ActionSet"));
|
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ActionToolPromptFragment from ActionSet"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Append expressive mode instructions (V3 Conversational audio tags).
|
||||||
|
if (Asset->bExpressiveMode && !Asset->ExpressiveModePromptFragment.IsEmpty())
|
||||||
|
{
|
||||||
|
FullPrompt += TEXT("\n\n");
|
||||||
|
FullPrompt += Asset->ExpressiveModePromptFragment;
|
||||||
|
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ExpressiveModePromptFragment"));
|
||||||
|
}
|
||||||
|
|
||||||
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT("BuildAgentPayload: FullPrompt = %d chars"), FullPrompt.Len());
|
UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT("BuildAgentPayload: FullPrompt = %d chars"), FullPrompt.Len());
|
||||||
|
|
||||||
// prompt object (includes LLM selection + tools)
|
// prompt object (includes LLM selection + tools)
|
||||||
@ -1664,6 +1725,10 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
|
|||||||
{
|
{
|
||||||
AgentObj->SetNumberField(TEXT("max_tokens"), Asset->MaxTurns);
|
AgentObj->SetNumberField(TEXT("max_tokens"), Asset->MaxTurns);
|
||||||
}
|
}
|
||||||
|
if (Asset->bExpressiveMode)
|
||||||
|
{
|
||||||
|
AgentObj->SetBoolField(TEXT("expressive_mode"), true);
|
||||||
|
}
|
||||||
|
|
||||||
// tts
|
// tts
|
||||||
TSharedPtr<FJsonObject> TTSObj = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> TTSObj = MakeShareable(new FJsonObject());
|
||||||
@ -1678,14 +1743,27 @@ TSharedPtr<FJsonObject> FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu
|
|||||||
// Monolingual models (e.g. eleven_monolingual_v1) only support English.
|
// Monolingual models (e.g. eleven_monolingual_v1) only support English.
|
||||||
FString ResolvedModelID = Asset->TTSModelID;
|
FString ResolvedModelID = Asset->TTSModelID;
|
||||||
|
|
||||||
|
// Expressive mode requires V3 Conversational — override if needed.
|
||||||
|
if (Asset->bExpressiveMode)
|
||||||
|
{
|
||||||
|
if (ResolvedModelID != TEXT("eleven_v3_conversational"))
|
||||||
|
{
|
||||||
|
UE_LOG(LogPS_AI_AgentConfigEditor, Warning,
|
||||||
|
TEXT("Expressive mode: overriding TTS model '%s' → eleven_v3_conversational (required for audio tags)."),
|
||||||
|
*ResolvedModelID);
|
||||||
|
ResolvedModelID = TEXT("eleven_v3_conversational");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
auto IsMultilingualModel = [](const FString& ModelID) -> bool
|
auto IsMultilingualModel = [](const FString& ModelID) -> bool
|
||||||
{
|
{
|
||||||
return ModelID.Contains(TEXT("multilingual"))
|
return ModelID.Contains(TEXT("multilingual"))
|
||||||
|| ModelID.Contains(TEXT("turbo"))
|
|| ModelID.Contains(TEXT("turbo"))
|
||||||
|| ModelID.Contains(TEXT("flash"));
|
|| ModelID.Contains(TEXT("flash"))
|
||||||
|
|| ModelID.Contains(TEXT("v3")); // V3 models are multilingual
|
||||||
};
|
};
|
||||||
|
|
||||||
if (Asset->bMultilingual)
|
if (!Asset->bExpressiveMode && Asset->bMultilingual)
|
||||||
{
|
{
|
||||||
// Multilingual mode: force a multilingual TTS model.
|
// Multilingual mode: force a multilingual TTS model.
|
||||||
if (ResolvedModelID.IsEmpty() || !IsMultilingualModel(ResolvedModelID))
|
if (ResolvedModelID.IsEmpty() || !IsMultilingualModel(ResolvedModelID))
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user