From 18996a7254e51842054217c381f85f96913bae04 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 12 Mar 2026 11:06:20 +0100 Subject: [PATCH] Add Expressive Mode support for ElevenLabs V3 Conversational TTS - Add bExpressiveMode toggle and editable ExpressiveModePromptFragment with audio tag instructions ([laughs], [whispers], [sighs], [slow], [excited]) - BuildAgentPayload: append prompt fragment, set expressive_mode API field on agent config, auto-override TTS model to eleven_v3_conversational - OnFetchAgent: strip expressive fragment from prompt (exact + marker fallback), read expressive_mode bool from API, auto-detect V3 model - TTS model combo: inject asset's current model if absent from /v1/models list (covers agent-only models like eleven_v3_conversational) Co-Authored-By: Claude Opus 4.6 --- .../PS_AI_ConvAgent_AgentConfig_ElevenLabs.h | 29 +++++++ ...nt_AgentConfigCustomization_ElevenLabs.cpp | 86 ++++++++++++++++++- 2 files changed, 111 insertions(+), 4 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h index c31ae08..47615d5 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h @@ -237,6 +237,35 @@ public: "- \"high\" for strong reactions (big laugh, deep sadness, shock)\n\n" "Always return to neutral when the emotional moment passes."); + // ── Expressive Mode (V3 Conversational) ───────────────────────────────── + + /** Enable Expressive Mode (requires TTS model "eleven_v3_conversational"). + * When enabled, the agent uses emotionally intelligent speech with + * natural intonation and expressive audio tags like [laughs], [whispers]. + * A prompt fragment is appended to instruct the LLM to use these tags. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Expressive Mode", + meta = (ToolTip = "Enable Expressive Mode.\nRequires V3 Conversational TTS model.\nAdds expressive audio tag instructions to the prompt.")) + bool bExpressiveMode = false; + + /** System prompt fragment appended when bExpressiveMode is true. + * Instructs the LLM to use inline audio tags for emotional delivery. + * Editable for customization — stripped automatically on Fetch. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Expressive Mode", + meta = (MultiLine = "true", EditCondition = "bExpressiveMode", + ToolTip = "Prompt instructions for expressive audio tags.\nAppended to CharacterPrompt when creating/updating the agent.")) + FString ExpressiveModePromptFragment = TEXT( + "## Expressive Speech\n" + "You can use expressive audio tags in your responses for precise vocal control. " + "Each tag affects approximately the next 4-5 words before returning to normal delivery.\n\n" + "Available tags:\n" + "- [laughs] for moments of humor or amusement\n" + "- [whispers] for confidential or intimate moments\n" + "- [sighs] for resignation or relief\n" + "- [slow] when emphasizing important information\n" + "- [excited] for enthusiastic or energetic delivery\n\n" + "Example: \"That's great to hear! [laughs] I'm glad we could sort that out for you.\"\n\n" + "Use these tags naturally and sparingly to enhance expressiveness without overusing them."); + // ── Action Tool ───────────────────────────────────────────────────────── /** Include a configurable "perform_action" client tool in the agent configuration. diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp index 23f5425..6f0a49e 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp @@ -615,6 +615,18 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchModelsClicked( if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = Pinned->GetEditedAsset()) { int32 Idx = Pinned->ModelIDs.IndexOfByKey(Asset->TTSModelID); + + // Agent-only models (e.g. eleven_v3_conversational) may not appear + // in the general /v1/models list. Inject the asset's current model + // so the combo always reflects the actual value. + if (Idx == INDEX_NONE && !Asset->TTSModelID.IsEmpty()) + { + FString DisplayStr = FString::Printf(TEXT("%s"), *Asset->TTSModelID); + Pinned->ModelDisplayNames.Add(MakeShareable(new FString(DisplayStr))); + Pinned->ModelIDs.Add(Asset->TTSModelID); + Idx = Pinned->ModelIDs.Num() - 1; + } + if (Idx != INDEX_NONE && Pinned->ModelComboBox.IsValid()) { Pinned->ModelComboBox->SetSelectedItem(Pinned->ModelDisplayNames[Idx]); @@ -1318,6 +1330,33 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked() } } + // 5. Expressive Mode fragment + { + const FString& Fragment = Asset->ExpressiveModePromptFragment; + if (!Fragment.IsEmpty()) + { + int32 Idx = Prompt.Find(Fragment, ESearchCase::CaseSensitive); + if (Idx != INDEX_NONE) + { + Prompt.LeftInline(Idx); + Prompt.TrimEndInline(); + Asset->bExpressiveMode = true; + } + } + // Fallback: marker-based stripping + if (!Asset->bExpressiveMode) + { + const FString Marker = TEXT("## Expressive Speech"); + int32 Idx = Prompt.Find(Marker, ESearchCase::CaseSensitive); + if (Idx != INDEX_NONE) + { + Prompt.LeftInline(Idx); + Prompt.TrimEndInline(); + Asset->bExpressiveMode = true; + } + } + } + Asset->CharacterPrompt = Prompt; } @@ -1348,6 +1387,13 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked() { Asset->MaxTurns = MaxTurns; } + + // expressive_mode (V3 Conversational) + bool bExpressive = false; + if ((*AgentObj)->TryGetBoolField(TEXT("expressive_mode"), bExpressive)) + { + Asset->bExpressiveMode = bExpressive; + } } // conversation_config.tts @@ -1364,6 +1410,12 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchAgentClicked() if ((*TTSObj)->TryGetStringField(TEXT("model_id"), ModelID)) { Asset->TTSModelID = ModelID; + + // Auto-detect Expressive Mode from V3 Conversational model + if (ModelID == TEXT("eleven_v3_conversational")) + { + Asset->bExpressiveMode = true; + } } double Stability = 0.5; @@ -1563,14 +1615,15 @@ TSharedPtr FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT("BuildAgentPayload: CharacterPrompt=%d chars, bMultilingual=%d, bAutoLangInstr=%d, Language='%s', " - "LangFragment=%d chars, MultiFragment=%d chars, bEmotionTool=%d"), + "LangFragment=%d chars, MultiFragment=%d chars, bEmotionTool=%d, bExpressiveMode=%d"), Asset->CharacterPrompt.Len(), Asset->bMultilingual, Asset->bAutoLanguageInstruction, *Asset->Language, Asset->LanguagePromptFragment.Len(), Asset->MultilingualPromptFragment.Len(), - Asset->bIncludeEmotionTool); + Asset->bIncludeEmotionTool, + Asset->bExpressiveMode); // Language handling: multilingual mode vs fixed-language mode. // The ElevenLabs "language" field only controls STT/TTS — the LLM defaults to @@ -1618,6 +1671,14 @@ TSharedPtr FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ActionToolPromptFragment from ActionSet")); } + // Append expressive mode instructions (V3 Conversational audio tags). + if (Asset->bExpressiveMode && !Asset->ExpressiveModePromptFragment.IsEmpty()) + { + FullPrompt += TEXT("\n\n"); + FullPrompt += Asset->ExpressiveModePromptFragment; + UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT(" → Appended ExpressiveModePromptFragment")); + } + UE_LOG(LogPS_AI_AgentConfigEditor, Log, TEXT("BuildAgentPayload: FullPrompt = %d chars"), FullPrompt.Len()); // prompt object (includes LLM selection + tools) @@ -1664,6 +1725,10 @@ TSharedPtr FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu { AgentObj->SetNumberField(TEXT("max_tokens"), Asset->MaxTurns); } + if (Asset->bExpressiveMode) + { + AgentObj->SetBoolField(TEXT("expressive_mode"), true); + } // tts TSharedPtr TTSObj = MakeShareable(new FJsonObject()); @@ -1678,14 +1743,27 @@ TSharedPtr FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::Bu // Monolingual models (e.g. eleven_monolingual_v1) only support English. FString ResolvedModelID = Asset->TTSModelID; + // Expressive mode requires V3 Conversational — override if needed. + if (Asset->bExpressiveMode) + { + if (ResolvedModelID != TEXT("eleven_v3_conversational")) + { + UE_LOG(LogPS_AI_AgentConfigEditor, Warning, + TEXT("Expressive mode: overriding TTS model '%s' → eleven_v3_conversational (required for audio tags)."), + *ResolvedModelID); + ResolvedModelID = TEXT("eleven_v3_conversational"); + } + } + auto IsMultilingualModel = [](const FString& ModelID) -> bool { return ModelID.Contains(TEXT("multilingual")) || ModelID.Contains(TEXT("turbo")) - || ModelID.Contains(TEXT("flash")); + || ModelID.Contains(TEXT("flash")) + || ModelID.Contains(TEXT("v3")); // V3 models are multilingual }; - if (Asset->bMultilingual) + if (!Asset->bExpressiveMode && Asset->bMultilingual) { // Multilingual mode: force a multilingual TTS model. if (ResolvedModelID.IsEmpty() || !IsMultilingualModel(ResolvedModelID))