From 4456dfa9dc523d8f1f156420a509bd196aebb593 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Fri, 6 Mar 2026 16:43:20 +0100 Subject: [PATCH] Add turn eagerness, speculative turn, adaptive pre-buffer, and latency HUD improvements - Add TurnEagerness (Eager/Normal/Patient) and bSpeculativeTurn to agent config data asset, sent as conversation_config_override at WebSocket connection time - Add adaptive pre-buffer system: measures inter-chunk TTS timing and decreases pre-buffer when chunks arrive fast enough (decrease-only, resets each conversation) - New UPROPERTY: bAdaptivePreBuffer toggle, AudioPreBufferMs as starting/worst-case value - Rework latency HUD: TTS+Net, PreBuf actual/target with trend indicator, Gen>Ear, WS Ping, server region display - Fetch ElevenLabs server region from REST API x-region header - Add editor Detail Customization: TurnEagerness dropdown + SpeculativeTurn checkbox in AgentConfig with LLM picker and Language picker Co-Authored-By: Claude Opus 4.6 --- Unreal/PS_AI_Agent/Config/DefaultEngine.ini | 5 +- .../PS_AI_ConvAgent_ElevenLabsComponent.cpp | 259 +++++++++++++++--- ...AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp | 91 ++++-- .../PS_AI_ConvAgent/Public/PS_AI_ConvAgent.h | 53 +++- .../PS_AI_ConvAgent_AgentConfig_ElevenLabs.h | 19 ++ .../Public/PS_AI_ConvAgent_Definitions.h | 14 + .../PS_AI_ConvAgent_ElevenLabsComponent.h | 53 +++- ...S_AI_ConvAgent_WebSocket_ElevenLabsProxy.h | 24 ++ ...nt_AgentConfigCustomization_ElevenLabs.cpp | 127 +++++++-- 9 files changed, 540 insertions(+), 105 deletions(-) diff --git a/Unreal/PS_AI_Agent/Config/DefaultEngine.ini b/Unreal/PS_AI_Agent/Config/DefaultEngine.ini index 5897085..3a95522 100644 --- a/Unreal/PS_AI_Agent/Config/DefaultEngine.ini +++ b/Unreal/PS_AI_Agent/Config/DefaultEngine.ini @@ -1,8 +1,8 @@ [/Script/EngineSettings.GameMapsSettings] -GameDefaultMap=/Game/voidMap.voidMap -EditorStartupMap=/Game/voidMap.voidMap +GameDefaultMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman 
+EditorStartupMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman [/Script/Engine.RendererSettings] r.AllowStaticLighting=False @@ -182,4 +182,5 @@ ManualIPAddress= [/Script/PS_AI_ConvAgent.PS_AI_ConvAgent_Settings_ElevenLabs] API_Key=7b73c4244ccbec394cc010aaab01b0ec59ce0b11fc636ce4828354f675ca14a5 +ServerRegion=Global diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp index f047027..45d81ce 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp @@ -17,6 +17,9 @@ #include "GameFramework/PlayerController.h" #include "Net/UnrealNetwork.h" #include "VoiceModule.h" +#include "HttpModule.h" +#include "Interfaces/IHttpRequest.h" +#include "Interfaces/IHttpResponse.h" DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_ConvAgent_ElevenLabs, Log, All); @@ -147,15 +150,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel if (bPreBuffering) { const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0; - if (Elapsed >= static_cast(AudioPreBufferMs)) + const int32 EffPreBuf = (AudioPreBufferMs > 0) + ? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0; + if (Elapsed >= static_cast(EffPreBuf)) { bPreBuffering = false; if (bDebug) { const double Tpb = FPlatformTime::Seconds() - SessionStartTime; UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, - TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms). Starting playback."), - Tpb, LastClosedTurnIndex, AudioPreBufferMs); + TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms adaptive). Starting playback."), + Tpb, LastClosedTurnIndex, EffPreBuf); } // Only start playback if the agent is still speaking. 
// If silence detection already set bAgentSpeaking=false, this is stale. @@ -292,6 +297,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel // Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time. if (bShouldBroadcastStopped) { + // Adapt pre-buffer for next turn based on this turn's signals. + ApplyPreBufferAdaptation(); + if (bHardTimeoutFired && bDebug) { const double Tht = FPlatformTime::Seconds() - SessionStartTime; @@ -321,7 +329,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel { const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread(); const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency; - if (bShowLatency) + // Only draw on the active (connected) Authority component. + // Multiple agents in the scene would overwrite each other's HUD at the same + // BaseKey, causing visible blinking between their values. + if (bShowLatency && IsConnected() && GetOwnerRole() == ROLE_Authority) { DrawLatencyHUD(); } @@ -388,6 +399,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StartConversation_Internal() // Pass configuration to the proxy before connecting. WebSocketProxy->TurnMode = TurnMode; + if (AgentConfig) + { + WebSocketProxy->TurnEagerness = AgentConfig->TurnEagerness; + WebSocketProxy->bSpeculativeTurn = AgentConfig->bSpeculativeTurn; + } // Resolve AgentID by priority: AgentConfig > component string > project default. FString ResolvedAgentID = AgentID; @@ -834,6 +850,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen SessionStartTime = FPlatformTime::Seconds(); TurnIndex = 0; LastClosedTurnIndex = 0; + + // Initialize adaptive pre-buffer from designer settings. + AdaptivePreBufferMs = AudioPreBufferMs; // Start at the designer's value. + PreBufferTrend = 0; + TurnIdealPreBufferMs = -1; + bTurnGapMeasured = false; + UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("[T+0.00s] Agent connected. 
ConversationID=%s"), *Info.ConversationID); OnAgentConnected.Broadcast(Info); @@ -852,6 +875,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen } } + // Probe server region once per session (only when latency HUD is enabled). + if (ServerRegion.IsEmpty() && GetOwnerRole() == ROLE_Authority) + { + const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread(); + const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency; + if (bShowLatency) + { + FetchServerRegion(); + } + } + // In Client turn mode (push-to-talk), the user controls listening manually via // StartListening()/StopListening(). Auto-starting would leave the mic open // permanently and interfere with push-to-talk — the T-release StopListening() @@ -1081,21 +1115,28 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted() // In Server VAD mode, StopListening() is not called — the server detects // end of user speech and immediately starts generating. If TurnEndTime was // not set by StopListening since the last generation (i.e. it's stale or 0), - // use Now as the best client-side approximation. + // use the proxy's LastUserTranscriptTime as the best approximation: + // user_transcript arrives after server VAD + ASR, just before LLM starts. const bool bFreshTurnEnd = (TurnEndTime > GenerationStartTime) && (GenerationStartTime > 0.0); if (!bFreshTurnEnd) { - TurnEndTime = Now; + const double TranscriptTime = WebSocketProxy ? WebSocketProxy->GetLastUserTranscriptTime() : 0.0; + TurnEndTime = (TranscriptTime > 0.0) ? TranscriptTime : Now; } - // Reset all latency measurements — new response cycle starts here. - // All metrics are anchored to GenerationStartTime (= now), which is the closest - // client-side proxy for "user stopped speaking" in Server VAD mode. - CurrentLatencies = FDebugLatencies(); + // New response cycle starts here. All client-side metrics are anchored to + // GenerationStartTime (= now). 
Do NOT zero CurrentLatencies — the per-field + // assignments in EnqueueAgentAudio() overwrite naturally, so the HUD shows the + // previous turn's values until the new turn's measurements arrive (no "---" blink). GenerationStartTime = Now; const double T = Now - SessionStartTime; const double LatencyFromTurnEnd = Now - TurnEndTime; + + // LLM latency: time from user_transcript received to first text token arriving. + // In Server VAD mode, this approximates LLM TTFT + network (post-ASR). + // In Client turn mode, this is the full ASR + LLM latency. + CurrentLatencies.TurnEndToTextMs = static_cast(LatencyFromTurnEnd * 1000.0); if (bIsListening) { // In Server VAD + interruption mode, keep the mic open so the server can @@ -1321,7 +1362,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow( AudioQueueReadOffset = 0; } - // Log when queue recovers (new data arrived after being dry) + // Queue recovered: was dry, now has data again. + // Only flag as underrun if the gap was long enough to be audible. + // Short gaps (<200ms) are handled seamlessly by USoundWaveProcedural's + // internal silence — no need to increase the pre-buffer for those. if (bQueueWasDry) { bQueueWasDry = false; @@ -1329,7 +1373,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow( { const double T = FPlatformTime::Seconds() - SessionStartTime; UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, - TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."), + TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered (%d bytes remaining)."), T, LastClosedTurnIndex, AudioQueue.Num() - AudioQueueReadOffset); } } @@ -1371,6 +1415,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking. bQueueWasDry = false; SilentTickCount = 0; + // Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement. 
+ TurnFirstChunkTime = FPlatformTime::Seconds(); + TurnFirstChunkBytes = PCMData.Num(); + TurnIdealPreBufferMs = -1; + bTurnGapMeasured = false; // Latency capture (always, for HUD display). if (GenerationStartTime > 0.0) @@ -1393,7 +1442,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray MulticastAgentStartedSpeaking(); } - if (AudioPreBufferMs > 0) + const int32 EffectivePreBufferMs = (AudioPreBufferMs > 0) + ? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0; + if (EffectivePreBufferMs > 0) { // Pre-buffer: accumulate audio before starting playback. // This absorbs TTS inter-chunk gaps so chunk 2 arrives before @@ -1404,8 +1455,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray { const double Tpb2 = FPlatformTime::Seconds() - SessionStartTime; UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, - TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms before starting playback."), - Tpb2, LastClosedTurnIndex, AudioPreBufferMs); + TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms (adaptive) before starting playback."), + Tpb2, LastClosedTurnIndex, EffectivePreBufferMs); } } else @@ -1433,14 +1484,25 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray if (GetOwnerRole() == ROLE_Authority) { bPreBuffering = false; + // Measure inter-chunk gap for adaptive pre-buffer. + if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0) + { + const double NowGap = FPlatformTime::Seconds(); + const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0; + // Chunk 1 audio duration: 16kHz 16-bit mono = 32000 bytes/sec. + const double Chunk1AudioMs = (TurnFirstChunkBytes > 0) + ? 
(static_cast(TurnFirstChunkBytes) / 32.0) : 0.0; + TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs)); + bTurnGapMeasured = true; + } if (bDebug) { const double NowPb = FPlatformTime::Seconds(); const double BufferedMs = (NowPb - PreBufferStartTime) * 1000.0; const double Tpb3 = NowPb - SessionStartTime; UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, - TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."), - Tpb3, LastClosedTurnIndex, BufferedMs); + TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered, ideal=%dms). Starting playback."), + Tpb3, LastClosedTurnIndex, BufferedMs, TurnIdealPreBufferMs); } if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) { @@ -1467,6 +1529,23 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray { AudioPlaybackComponent->Play(); } + // Measure inter-chunk gap for adaptive pre-buffer (first gap only). + if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0 && GetOwnerRole() == ROLE_Authority) + { + const double NowGap = FPlatformTime::Seconds(); + const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0; + const double Chunk1AudioMs = (TurnFirstChunkBytes > 0) + ? (static_cast(TurnFirstChunkBytes) / 32.0) : 0.0; + TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs)); + bTurnGapMeasured = true; + if (bDebug) + { + const double T = NowGap - SessionStartTime; + UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, + TEXT("[T+%.2fs] [Turn %d] Inter-chunk gap: %.0fms, chunk1 audio: %.0fms → ideal pre-buffer: %dms"), + T, LastClosedTurnIndex, InterChunkGapMs, Chunk1AudioMs, TurnIdealPreBufferMs); + } + } // Reset silence counter — new audio arrived, we're not in a gap anymore SilentTickCount = 0; } @@ -1516,6 +1595,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio() // Broadcast outside the lock. 
if (bWasSpeaking) { + // Adapt pre-buffer for next turn based on this turn's signals. + ApplyPreBufferAdaptation(); + if (bDebug) { const double T = Now - SessionStartTime; @@ -1536,6 +1618,52 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio() } } +void UPS_AI_ConvAgent_ElevenLabsComponent::ApplyPreBufferAdaptation() +{ + // Only adapt on Authority (where the WebSocket lives and measurements are taken). + if (GetOwnerRole() != ROLE_Authority) return; + // Adaptive mode must be enabled, and pre-buffering must be active. + if (!bAdaptivePreBuffer || AudioPreBufferMs == 0) return; + // No measurement this turn (single-chunk response or no second chunk arrived). + if (TurnIdealPreBufferMs < 0) { PreBufferTrend = 0; return; } + + const int32 Prev = AdaptivePreBufferMs; + + // DECREASE-ONLY: the measured ideal tells us the minimum pre-buffer needed. + // If the ideal is lower than our current value, the connection is fast enough + // that we can reduce the pre-buffer and save latency. + // If the ideal is higher (e.g. natural speech pause, slow network), we do NOT + // increase — USoundWaveProcedural handles gaps seamlessly in most cases. + // The user sets AudioPreBufferMs as the "worst case" starting value; + // the system only optimizes downward from there. Resets each conversation. + if (TurnIdealPreBufferMs < AdaptivePreBufferMs) + { + // Ideal is lower — decrease toward it (EMA 30% per turn, with 50ms margin). + const int32 TargetMs = FMath::Max(AdaptivePreBufferMinMs, TurnIdealPreBufferMs + 50); + AdaptivePreBufferMs = FMath::Max(AdaptivePreBufferMinMs, + FMath::RoundToInt32(AdaptivePreBufferMs * 0.7f + TargetMs * 0.3f)); + PreBufferTrend = (AdaptivePreBufferMs < Prev) ? -1 : 0; + } + else + { + // Ideal >= current — connection is same or worse, keep current value. + PreBufferTrend = 0; + } + + // Reset measurement for next turn. 
+ const int32 IdealForLog = TurnIdealPreBufferMs; + TurnIdealPreBufferMs = -1; + bTurnGapMeasured = false; + + if (bDebug && Prev != AdaptivePreBufferMs) + { + const double T = FPlatformTime::Seconds() - SessionStartTime; + UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, + TEXT("[T+%.2fs] [Turn %d] Adaptive pre-buffer: %d ms -> %d ms (ideal=%dms)"), + T, LastClosedTurnIndex, Prev, AdaptivePreBufferMs, IdealForLog); + } +} + // ───────────────────────────────────────────────────────────────────────────── // Microphone → WebSocket // ───────────────────────────────────────────────────────────────────────────── @@ -2404,6 +2532,42 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT(""))); } +// ───────────────────────────────────────────────────────────────────────────── +// Server region detection (one-shot HTTP probe) +// ───────────────────────────────────────────────────────────────────────────── +void UPS_AI_ConvAgent_ElevenLabsComponent::FetchServerRegion() +{ + const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings(); + if (!Settings || Settings->API_Key.IsEmpty()) return; + + auto Request = FHttpModule::Get().CreateRequest(); + Request->SetURL(Settings->GetAPIBaseURL() + TEXT("/v1/models")); + Request->SetVerb(TEXT("GET")); + Request->SetHeader(TEXT("xi-api-key"), Settings->API_Key); + + TWeakObjectPtr WeakThis(this); + Request->OnProcessRequestComplete().BindLambda( + [WeakThis](FHttpRequestPtr /*Req*/, FHttpResponsePtr Resp, bool bSuccess) + { + if (!bSuccess || !Resp.IsValid()) return; + const FString Region = Resp->GetHeader(TEXT("x-region")); + if (Region.IsEmpty()) return; + + AsyncTask(ENamedThreads::GameThread, [WeakThis, Region]() + { + if (WeakThis.IsValid()) + { + WeakThis->ServerRegion = Region; + UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("ElevenLabs server region: %s"), *Region); + } + }); + }); + Request->ProcessRequest(); +} + +// 
───────────────────────────────────────────────────────────────────────────── +// Latency debug HUD +// ───────────────────────────────────────────────────────────────────────────── void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const { if (!GEngine) return; @@ -2412,29 +2576,62 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const const int32 BaseKey = 93700; const float DisplayTime = 1.0f; // long enough to avoid flicker between ticks - const FColor TitleColor = FColor::Cyan; - const FColor ValueColor = FColor::White; + const FColor TitleColor = FColor::Cyan; + const FColor ValueColor = FColor::White; const FColor HighlightColor = FColor::Yellow; - // Helper: format a single metric — shows "---" when not yet captured this turn auto Fmt = [](float Ms) -> FString { return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---")); }; - // Title — all times measured from agent_response_started - GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor, - TEXT("=== Latency (from gen start) ===")); + int32 Row = 0; - // 1. Gen → Audio: generation start → first audio chunk (LLM + TTS) - GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor, - FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs))); + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor, + TEXT("=== Voice-to-Voice Latency ===")); - // 2. Pre-buffer wait before playback - GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, - FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); + // Client-side breakdown: TTS+Net + Pre-buffer = Gen>Ear + // Note: LLM latency is only visible on ElevenLabs dashboard (server-side). + // In Server VAD mode, no reliable client-side "end of user speech" marker exists. 
+ GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor, + FString::Printf(TEXT(" TTS+Net: %s"), *Fmt(CurrentLatencies.GenToAudioMs))); - // 3. Gen → Ear: generation start → playback starts (user-perceived total) - GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, + // Pre-buffer display depends on adaptive mode. + if (bAdaptivePreBuffer && AudioPreBufferMs > 0) + { + // Adaptive ON: show actual wait + adaptive target with trend arrow. + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor, + FString::Printf(TEXT(" PreBuf actual: %s"), *Fmt(CurrentLatencies.PreBufferMs))); + + const TCHAR* TrendArrow = (PreBufferTrend > 0) ? TEXT(" ^") + : (PreBufferTrend < 0) ? TEXT(" v") + : TEXT(""); + const FColor AdaptiveColor = (PreBufferTrend > 0) ? FColor::Red + : (PreBufferTrend < 0) ? FColor::Green + : ValueColor; + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, AdaptiveColor, + FString::Printf(TEXT(" PreBuf target: %d ms%s"), AdaptivePreBufferMs, TrendArrow)); + } + else + { + // Adaptive OFF (or pre-buffer disabled): show fixed pre-buffer value. + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor, + FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); + } + + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, HighlightColor, FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs))); + + // Connection section + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor, + TEXT("--- Connection ---")); + + const int32 PingMs = WebSocketProxy ? WebSocketProxy->GetLastPingMs() : -1; + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor, + FString::Printf(TEXT(" WS Ping: %s"), + (PingMs >= 0) ? 
*FString::Printf(TEXT("%d ms"), PingMs) : TEXT("---"))); + + GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor, + FString::Printf(TEXT(" Region: %s"), + ServerRegion.IsEmpty() ? TEXT("---") : *ServerRegion)); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp index 51ad395..bde5581 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.cpp @@ -207,41 +207,58 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected() // This produces smooth continuous audio chunks without the fragmentation caused by // explicit optimize_streaming_latency or enable_intermediate_response overrides. // - // In Client (push-to-talk) mode only, we override turn_timeout to reduce latency. - // In Server VAD mode, the config override is empty (matches C++ sample exactly). + // Build turn configuration based on mode + latency settings. TSharedPtr ConversationConfigOverride = MakeShareable(new FJsonObject()); - if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client) { - // turn_timeout: how long the server waits after VAD detects silence before - // processing the user's turn. Default is ~3s. In push-to-talk mode this - // directly adds latency — the server waits after the user releases T. - // 1s is safe without speculative_turn (which was removed — see history below). - // - // History: - // turn_timeout=1 was problematic when combined with speculative_turn=true - // (server silently dropped turns 3+). Without speculative_turn, 1s is safe - // and halves the per-turn latency. 
TSharedPtr TurnObj = MakeShareable(new FJsonObject()); - TurnObj->SetNumberField(TEXT("turn_timeout"), 1); + bool bHasTurnOverrides = false; - TSharedPtr AgentObj = MakeShareable(new FJsonObject()); - AgentObj->SetObjectField(TEXT("turn"), TurnObj); + // In Client (push-to-talk) mode, reduce turn_timeout to minimize latency. + if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client) + { + TurnObj->SetNumberField(TEXT("turn_timeout"), 1); + bHasTurnOverrides = true; + } - ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj); + // turn_eagerness: controls how quickly the server interprets pauses as end-of-speech. + // "eager" = fastest (may cut user off), "normal" = balanced, "patient" = waits longer. + if (TurnEagerness != EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal) + { + FString EagernessStr; + switch (TurnEagerness) + { + case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Eager: EagernessStr = TEXT("eager"); break; + case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Patient: EagernessStr = TEXT("patient"); break; + default: EagernessStr = TEXT("normal"); break; + } + TurnObj->SetStringField(TEXT("turn_eagerness"), EagernessStr); + bHasTurnOverrides = true; + } + + // speculative_turn: start generating a response before confirming end-of-speech. + // Reduces latency but may cause occasional false starts (discarded if user continues). + if (bSpeculativeTurn) + { + TurnObj->SetBoolField(TEXT("speculative_turn"), true); + bHasTurnOverrides = true; + } + + if (bHasTurnOverrides) + { + TSharedPtr AgentObj = MakeShareable(new FJsonObject()); + AgentObj->SetObjectField(TEXT("turn"), TurnObj); + ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj); + } } // NOTE: We intentionally do NOT send these overrides (matching C++ sample): // - // - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes - // the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely. 
- // With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering). - // With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps). - // Server default (omitted): produces smooth continuous audio (no gaps in C++ sample). + // - tts.optimize_streaming_latency: deprecated by ElevenLabs. Sending any value + // changes TTS chunking behaviour. Server default (omitted) is optimal. // // - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks - // before finishing generation → fragmented audio. When omitted (C++ sample), the - // LLM completes its response first → continuous TTS chunks. + // before finishing generation → fragmented audio. Omitted = server default. // // - custom_llm_extra_body (empty object): Even an empty object might override the // agent's configured custom_llm_extra_body with nothing. Omit entirely. @@ -259,12 +276,15 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected() FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter); { const UPS_AI_ConvAgent_Settings_ElevenLabs* S = FPS_AI_ConvAgentModule::Get().GetSettings(); - if (S->bVerboseLogging) + if (S && S->bVerboseLogging) { UE_LOG(LogPS_AI_ConvAgent_WS_ElevenLabs, Verbose, TEXT("Sending initiation: %s"), *InitJson); } } - WebSocket->Send(InitJson); + if (WebSocket.IsValid()) + { + WebSocket->Send(InitJson); + } } void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnectionError(const FString& Error) @@ -507,6 +527,10 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleTranscript(const TSharedP return; } + // Record arrival time for latency measurement (ASR+LLM breakdown). + // user_transcript arrives after server VAD + ASR, just before LLM starts. 
+ LastUserTranscriptTime = FPlatformTime::Seconds(); + FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs Segment; Segment.Speaker = TEXT("user"); (*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text); @@ -679,6 +703,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandlePing(const TSharedPtrTryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent) { (*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID); + + // Extract server-reported WS round-trip latency. + int32 PingValue = 0; + if ((*PingEvent)->TryGetNumberField(TEXT("ping_ms"), PingValue)) + { + LastPingMs.store(PingValue, std::memory_order_relaxed); + } } TSharedPtr Pong = MakeShareable(new FJsonObject()); @@ -718,7 +749,7 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr { const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings(); - // Custom URL override takes full precedence + // Custom URL override takes full precedence (advanced / proxy use case) if (!Settings->CustomWebSocketURL.IsEmpty()) { return Settings->CustomWebSocketURL; @@ -730,9 +761,9 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr return FString(); } - // Official ElevenLabs Conversational AI WebSocket endpoint - // wss://api.elevenlabs.io/v1/convai/conversation?agent_id= + // Build URL from the region-aware base: wss:///v1/convai/conversation?agent_id= + const FString BaseURL = Settings->GetWSBaseURL(); return FString::Printf( - TEXT("wss://api.elevenlabs.io/v1/convai/conversation?agent_id=%s"), - *AgentIDOverride); + TEXT("%s/v1/convai/conversation?agent_id=%s"), + *BaseURL, *AgentIDOverride); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent.h index 2dfaee7..81d62e6 100644 --- 
a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent.h @@ -6,6 +6,22 @@ #include "Modules/ModuleManager.h" #include "PS_AI_ConvAgent.generated.h" +// ───────────────────────────────────────────────────────────────────────────── +// ElevenLabs server region +// ───────────────────────────────────────────────────────────────────────────── +UENUM() +enum class EPS_AI_ConvAgent_ElevenLabsRegion : uint8 +{ + /** Automatic global routing (default). Server chosen by ElevenLabs based on client location. */ + Global UMETA(DisplayName = "Global (auto)"), + /** Force US servers: api.us.elevenlabs.io */ + US UMETA(DisplayName = "US"), + /** Force EU servers (Enterprise only): api.eu.residency.elevenlabs.io */ + EU UMETA(DisplayName = "EU (Enterprise)"), + /** Force India servers (Enterprise only): api.in.residency.elevenlabs.io */ + India UMETA(DisplayName = "India (Enterprise)") +}; + // ───────────────────────────────────────────────────────────────────────────── // Settings object – exposed in Project Settings → Plugins → PS AI ConvAgent - ElevenLabs // ───────────────────────────────────────────────────────────────────────────── @@ -24,8 +40,17 @@ public: FString API_Key; /** - * Override the ElevenLabs WebSocket base URL. Leave empty to use the default: - * wss://api.elevenlabs.io/v1/convai/conversation + * Server region for ElevenLabs API. + * - Global (default): automatic routing based on client location. + * - US: force US servers (api.us.elevenlabs.io). + * - EU / India: Enterprise-only data residency endpoints. + */ + UPROPERTY(Config, EditAnywhere, Category = "PS AI ConvAgent|ElevenLabs API") + EPS_AI_ConvAgent_ElevenLabsRegion ServerRegion = EPS_AI_ConvAgent_ElevenLabsRegion::Global; + + /** + * Override the ElevenLabs WebSocket URL entirely. Leave empty to use ServerRegion setting. 
+ * Example: wss://custom-proxy.example.com/v1/convai/conversation?agent_id=YOUR_ID */ UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API") FString CustomWebSocketURL; @@ -33,6 +58,30 @@ public: /** Log verbose WebSocket messages to the Output Log (useful during development). */ UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API") bool bVerboseLogging = false; + + /** Return the API base URL (https) for the selected region. */ + FString GetAPIBaseURL() const + { + switch (ServerRegion) + { + case EPS_AI_ConvAgent_ElevenLabsRegion::US: return TEXT("https://api.us.elevenlabs.io"); + case EPS_AI_ConvAgent_ElevenLabsRegion::EU: return TEXT("https://api.eu.residency.elevenlabs.io"); + case EPS_AI_ConvAgent_ElevenLabsRegion::India: return TEXT("https://api.in.residency.elevenlabs.io"); + default: return TEXT("https://api.elevenlabs.io"); + } + } + + /** Return the WebSocket base URL (wss) for the selected region. 
*/ + FString GetWSBaseURL() const + { + switch (ServerRegion) + { + case EPS_AI_ConvAgent_ElevenLabsRegion::US: return TEXT("wss://api.us.elevenlabs.io"); + case EPS_AI_ConvAgent_ElevenLabsRegion::EU: return TEXT("wss://api.eu.residency.elevenlabs.io"); + case EPS_AI_ConvAgent_ElevenLabsRegion::India: return TEXT("wss://api.in.residency.elevenlabs.io"); + default: return TEXT("wss://api.elevenlabs.io"); + } + } }; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h index 8c12ca1..4a1b5d0 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_AgentConfig_ElevenLabs.h @@ -4,6 +4,7 @@ #include "CoreMinimal.h" #include "Engine/DataAsset.h" +#include "PS_AI_ConvAgent_Definitions.h" #include "PS_AI_ConvAgent_AgentConfig_ElevenLabs.generated.h" /** @@ -186,6 +187,24 @@ public: ToolTip = "Max conversation turns.\n0 = unlimited.")) int32 MaxTurns = 0; + // ── Latency / Turn-taking ─────────────────────────────────────────────── + + /** How quickly the server detects end-of-speech and starts responding. + * Eager = fastest response, may cut the user off during pauses. + * Normal = balanced (default). Patient = waits longer for user to finish. + * Sent as conversation_config_override at WebSocket connection time. 
*/ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency", + meta = (ToolTip = "Controls how quickly the server detects end-of-speech.\n- Eager: fastest response, may interrupt mid-pause.\n- Normal: balanced (default).\n- Patient: waits longer for user to finish.")) + EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal; + + /** Enable speculative turn processing: the server starts generating a response + * before it's certain the user has finished speaking. If the user continues, + * the speculative response is discarded. Reduces perceived latency. + * May cause occasional false starts — disable if the agent interrupts too often. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency", + meta = (ToolTip = "Start generating a response before confirming end-of-speech.\nReduces latency but may cause occasional false starts.\nDisable if the agent interrupts the user too often.")) + bool bSpeculativeTurn = false; + // ── Emotion Tool ───────────────────────────────────────────────────────── /** Include the built-in "set_emotion" client tool in the agent configuration. 
diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_Definitions.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_Definitions.h index eed1065..72c11d5 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_Definitions.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_Definitions.h @@ -29,6 +29,20 @@ enum class EPS_AI_ConvAgent_TurnMode_ElevenLabs : uint8 Client UMETA(DisplayName = "Client Controlled"), }; +// ───────────────────────────────────────────────────────────────────────────── +// Agent turn eagerness — controls how quickly the server detects end of speech +// ───────────────────────────────────────────────────────────────────────────── +UENUM(BlueprintType) +enum class EPS_AI_ConvAgent_TurnEagerness_ElevenLabs : uint8 +{ + /** Quick response at the earliest opportunity. Best for customer service. */ + Eager UMETA(DisplayName = "Eager"), + /** Balanced turn-taking for general scenarios (default). */ + Normal UMETA(DisplayName = "Normal"), + /** Longer wait for user to finish. Best for information collection. 
*/ + Patient UMETA(DisplayName = "Patient"), +}; + // ───────────────────────────────────────────────────────────────────────────── // WebSocket message type helpers (internal, not exposed to Blueprint) // ───────────────────────────────────────────────────────────────────────────── diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h index 055ea00..5b616c7 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h @@ -185,14 +185,24 @@ public: meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text.")) bool bEnableAgentPartialResponse = false; - /** Pre-buffer delay (ms) before starting audio playback on the first chunk. - * Delays playback start so early TTS chunks can accumulate, preventing - * mid-sentence pauses when the second chunk hasn't arrived yet. - * Set to 0 for immediate playback. */ + /** Pre-buffer delay (ms) before starting audio playback on the first TTS chunk. + * Set this to your "worst case" value (e.g. 300-1000ms depending on connection). + * When adaptive mode is on, the system starts here and can only decrease + * (never increase) as it measures that chunks arrive fast enough. + * Set to 0 to disable pre-buffering entirely. 
*/ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency", meta = (ClampMin = "0", ClampMax = "4000", - ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback.")) - int32 AudioPreBufferMs = 2000; + ToolTip = "Pre-buffer delay (ms) — your safe 'worst case' value.\nAdaptive mode can only decrease from here, never increase.\nSet 0 to disable pre-buffering entirely.")) + int32 AudioPreBufferMs = 300; + + /** Enable adaptive pre-buffer: measures inter-chunk timing and automatically + * lowers the pre-buffer when TTS chunks arrive fast enough. + * The system can only decrease from AudioPreBufferMs — never increase. + * Resets to AudioPreBufferMs at the start of each conversation. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency", + meta = (EditCondition = "AudioPreBufferMs > 0", + ToolTip = "Automatically lower pre-buffer when connection is good.\nCan only decrease, never increase beyond AudioPreBufferMs.\nResets each conversation.")) + bool bAdaptivePreBuffer = true; /** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs", @@ -640,18 +650,23 @@ private: double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating. double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). - // Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted. - // All anchored to GenerationStartTime (agent_response_started event), which is - // the closest client-side proxy for "user stopped speaking" in Server VAD mode. - // Zero means "not yet measured this turn". 
+ // Current-turn latency measurements (ms). Overwritten per-field as each + // measurement is captured — NOT reset to zero between turns, so the HUD + // always shows the most recent value instead of blinking "---". + // All anchored to GenerationStartTime (agent_response_started event). struct FDebugLatencies { - float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS) + float TurnEndToTextMs = 0.0f; // user turn end → first text from LLM (≈ ASR + LLM TTFT) + float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (≈ TTS + network) float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived) }; FDebugLatencies CurrentLatencies; + // ElevenLabs server region (from x-region header on REST API). Fetched once per session. + FString ServerRegion; + void FetchServerRegion(); + // Accumulates incoming PCM bytes until the audio component needs data. // Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every // underflow callback (~60Hz). Compacted periodically when read offset @@ -664,6 +679,22 @@ private: bool bPreBuffering = false; double PreBufferStartTime = 0.0; + // ── Adaptive pre-buffer ───────────────────────────────────────────────── + // Runtime pre-buffer duration (ms). Equals AudioPreBufferMs when adaptive is off. + // When adaptive is on: initialized from AudioPreBufferMs, adjusted based on + // measured inter-chunk timing (not queue-dry detection). + int32 AdaptivePreBufferMs = 300; + static constexpr int32 AdaptivePreBufferMinMs = 50; + // Direction of last adaptation: +1=raised, -1=lowered, 0=stable. Used by HUD. + int32 PreBufferTrend = 0; + void ApplyPreBufferAdaptation(); + // Per-turn inter-chunk timing measurement (game thread only). + // Set when the second TTS chunk arrives, consumed at turn end. + double TurnFirstChunkTime = 0.0; // When chunk 1 arrived. 
+ int32 TurnFirstChunkBytes = 0; // Bytes in chunk 1 (to estimate audio duration). + int32 TurnIdealPreBufferMs = -1; // Computed ideal pre-buffer. -1 = not measured. + bool bTurnGapMeasured = false; // True after first inter-chunk gap is measured. + // Debug: track when the AudioQueue runs dry during speech (one-shot log). bool bQueueWasDry = false; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.h index 3945801..8970bea 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_WebSocket_ElevenLabsProxy.h @@ -197,6 +197,18 @@ public: UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const { return ConversationInfo; } + /** Latest WebSocket round-trip latency reported by the server (ms). + * Returns -1 if no ping has been received yet. Thread-safe. */ + int32 GetLastPingMs() const { return LastPingMs.load(std::memory_order_relaxed); } + + /** Timestamp of the last user audio chunk sent to the server. + * Used as a proxy for "user stopped speaking" in Server VAD mode. */ + double GetLastAudioChunkSentTime() const { return LastAudioChunkSentTime; } + + /** Timestamp of the last user_transcript received from the server. + * Marks when server finished ASR — best anchor for LLM latency measurement. 
 */ + double GetLastUserTranscriptTime() const { return LastUserTranscriptTime; } + // ───────────────────────────────────────────────────────────────────────── // Internal // ───────────────────────────────────────────────────────────────────────── @@ -235,10 +247,16 @@ private: TArray<uint8> BinaryFrameBuffer; // ── Latency tracking ───────────────────────────────────────────────────── + // Server-reported WebSocket round-trip latency from ping events (~every 2s). + // Atomic: written from WS callback thread, read from game thread (HUD). + std::atomic<int32> LastPingMs{-1}; + // Timestamp of the last audio chunk sent (user speech). double LastAudioChunkSentTime = 0.0; // Timestamp when user turn ended (StopListening). double UserTurnEndTime = 0.0; + // Timestamp of the last user_transcript received (server finished ASR). + double LastUserTranscriptTime = 0.0; // Whether we are waiting for the first response after user stopped speaking. // Atomic: defensive — documents thread-safety contract. std::atomic<bool> bWaitingForResponse{false}; @@ -264,4 +282,10 @@ public: // Set by UPS_AI_ConvAgent_ElevenLabsComponent before calling Connect(). // Controls turn_timeout in conversation_initiation_client_data. EPS_AI_ConvAgent_TurnMode_ElevenLabs TurnMode = EPS_AI_ConvAgent_TurnMode_ElevenLabs::Server; + + // Controls how eagerly the server interprets pauses as end-of-speech. + EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal; + + // Start generating before confirming end-of-speech (reduces latency, may cause false starts). 
+ bool bSpeculativeTurn = false; }; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp index c09bb8f..58d372c 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgentEditor/Private/PS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs.cpp @@ -22,33 +22,43 @@ DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_AgentConfigEditor, Log, All); -// Approximate LLM latencies as shown on the ElevenLabs dashboard. -// The API does not expose this data — values are indicative and may change. +// Approximate LLM latencies as shown on the ElevenLabs dashboard (March 2026). +// The /v1/convai/llm/list API does NOT expose latency — values are indicative. // Update this table periodically to stay current. 
static FString GetLLMLatencyHint(const FString& ModelID) { struct FLatencyEntry { const TCHAR* ID; const TCHAR* Latency; }; static const FLatencyEntry Entries[] = { - // OpenAI - { TEXT("gpt-4o-mini"), TEXT("~350ms") }, - { TEXT("gpt-4o"), TEXT("~700ms") }, - { TEXT("gpt-4"), TEXT("~900ms") }, - { TEXT("gpt-4-turbo"), TEXT("~650ms") }, - // Anthropic - { TEXT("claude-sonnet-4-5"), TEXT("~750ms") }, - { TEXT("claude-haiku-4-5"), TEXT("~350ms") }, - { TEXT("claude-3-5-sonnet"), TEXT("~700ms") }, - // Google - { TEXT("gemini-1.5-pro"), TEXT("~500ms") }, - { TEXT("gemini-2.0-flash"), TEXT("~300ms") }, - { TEXT("gemini-2.5-flash"), TEXT("~250ms") }, - // xAI - { TEXT("grok-beta"), TEXT("~500ms") }, - // ElevenLabs-hosted - { TEXT("qwen3-30b-a3b"), TEXT("~207ms") }, - { TEXT("glm-4.5-air"), TEXT("~980ms") }, - { TEXT("gpt-oss-120b"), TEXT("~331ms") }, + // ── ElevenLabs-hosted ───────────────────────────────────────────── + { TEXT("glm-4.5-air"), TEXT("~949ms") }, + { TEXT("qwen3-30b-a3b"), TEXT("~189ms") }, + { TEXT("gpt-oss-120b"), TEXT("~321ms") }, + // ── Google ──────────────────────────────────────────────────────── + { TEXT("gemini-3-pro"), TEXT("~3.5s") }, + { TEXT("gemini-3-flash"), TEXT("~1.4s") }, + { TEXT("gemini-2.5-flash"), TEXT("~967ms") }, + { TEXT("gemini-2.5-flash-lite"), TEXT("~605ms") }, + // ── OpenAI ──────────────────────────────────────────────────────── + { TEXT("gpt-5"), TEXT("~1.1s") }, + { TEXT("gpt-5.1"), TEXT("~980ms") }, + { TEXT("gpt-5.2"), TEXT("~795ms") }, + { TEXT("gpt-5-mini"), TEXT("~884ms") }, + { TEXT("gpt-5-nano"), TEXT("~734ms") }, + { TEXT("gpt-4.1"), TEXT("~870ms") }, + { TEXT("gpt-4.1-mini"), TEXT("~916ms") }, + { TEXT("gpt-4.1-nano"), TEXT("~574ms") }, + { TEXT("gpt-4o"), TEXT("~728ms") }, + { TEXT("gpt-4o-mini"), TEXT("~767ms") }, + { TEXT("gpt-4-turbo"), TEXT("~1.5s") }, + { TEXT("gpt-3.5-turbo"), TEXT("~458ms") }, + // ── Anthropic ───────────────────────────────────────────────────── + { TEXT("claude-sonnet-4-5"), 
TEXT("~1.4s") }, + { TEXT("claude-sonnet-4"), TEXT("~1.1s") }, + { TEXT("claude-haiku-4-5"), TEXT("~644ms") }, + { TEXT("claude-3.7-sonnet"), TEXT("~1.2s") }, + { TEXT("claude-3-haiku"), TEXT("~484ms") }, + { TEXT("claude-3-5-sonnet"), TEXT("~1.2s") }, }; for (const auto& E : Entries) @@ -58,6 +68,22 @@ static FString GetLLMLatencyHint(const FString& ModelID) return FString(); } +// Infer provider from model ID prefix for display grouping. +static FString GetLLMProvider(const FString& ModelID) +{ + if (ModelID.StartsWith(TEXT("gpt-")) || ModelID.StartsWith(TEXT("o1")) || ModelID.StartsWith(TEXT("o3"))) + return TEXT("OpenAI"); + if (ModelID.StartsWith(TEXT("claude-"))) + return TEXT("Anthropic"); + if (ModelID.StartsWith(TEXT("gemini-"))) + return TEXT("Google"); + if (ModelID.StartsWith(TEXT("grok"))) + return TEXT("xAI"); + if (ModelID == TEXT("glm-4.5-air") || ModelID == TEXT("qwen3-30b-a3b") || ModelID == TEXT("gpt-oss-120b")) + return TEXT("ElevenLabs"); + return FString(); +} + // Language code → display name. Shared by BuildAgentPayload (to resolve // {Language} placeholder) and the fetch handler (to strip the resolved fragment). static FString GetLanguageDisplayName(const FString& LangCode) @@ -332,9 +358,11 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::CustomizeDetails( .Font(IDetailLayoutBuilder::GetDetailFont()) ] .ValueContent() + .MaxDesiredWidth(600.f) [ SNew(SBox) .MinDesiredHeight(200.f) + .MinDesiredWidth(400.f) [ SNew(SMultiLineEditableTextBox) .Font(IDetailLayoutBuilder::GetDetailFont()) @@ -679,6 +707,10 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked() Pinned->LLMDisplayNames.Reset(); Pinned->LLMModelIDs.Reset(); + // Collect models grouped by provider for sorted display. 
+ struct FLLMEntry { FString ModelID; FString Provider; FString Display; bool bCheckpoint; }; + TArray<FLLMEntry> AllEntries; + for (const auto& LLMVal : *LLMs) { const TSharedPtr<FJsonObject>* LLMObj = nullptr; @@ -703,28 +735,62 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked() } } - // Check if it's a checkpoint model (sub-version). bool bIsCheckpoint = false; (*LLMObj)->TryGetBoolField(TEXT("is_checkpoint"), bIsCheckpoint); - // Build display string: "model-id (~350ms)" or " model-id (checkpoint, ~350ms)" const FString Latency = GetLLMLatencyHint(ModelID); + const FString Provider = GetLLMProvider(ModelID); + + // Build display: " model-id (checkpoint, ~350ms)" for checkpoints, + // "model-id (~350ms)" for main models. FString Display; if (bIsCheckpoint) { Display = Latency.IsEmpty() - ? FString::Printf(TEXT(" %s (checkpoint)"), *ModelID) - : FString::Printf(TEXT(" %s (checkpoint, %s)"), *ModelID, *Latency); + ? FString::Printf(TEXT(" %s (checkpoint)"), *ModelID) + : FString::Printf(TEXT(" %s (checkpoint, %s)"), *ModelID, *Latency); } else { Display = Latency.IsEmpty() - ? ModelID - : FString::Printf(TEXT("%s (%s)"), *ModelID, *Latency); + ? FString::Printf(TEXT(" %s"), *ModelID) + : FString::Printf(TEXT(" %s (%s)"), *ModelID, *Latency); } - Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Display))); - Pinned->LLMModelIDs.Add(ModelID); + AllEntries.Add({ ModelID, Provider, Display, bIsCheckpoint }); + } + + // Sort by provider order (ElevenLabs, Google, OpenAI, Anthropic, Other), + // then main models before checkpoints, then alphabetically. 
+ static const TArray<FString> ProviderOrder = { + TEXT("ElevenLabs"), TEXT("Google"), TEXT("OpenAI"), TEXT("Anthropic"), TEXT("xAI") + }; + AllEntries.Sort([](const FLLMEntry& A, const FLLMEntry& B) + { + int32 IdxA = ProviderOrder.IndexOfByKey(A.Provider); + int32 IdxB = ProviderOrder.IndexOfByKey(B.Provider); + if (IdxA == INDEX_NONE) IdxA = ProviderOrder.Num(); + if (IdxB == INDEX_NONE) IdxB = ProviderOrder.Num(); + if (IdxA != IdxB) return IdxA < IdxB; + if (A.bCheckpoint != B.bCheckpoint) return !A.bCheckpoint; // main first + return A.ModelID < B.ModelID; + }); + + // Insert provider headers as non-selectable separator entries. + FString LastProvider; + for (const auto& Entry : AllEntries) + { + const FString& Prov = Entry.Provider.IsEmpty() ? TEXT("Other") : Entry.Provider; + if (Prov != LastProvider) + { + // Header line: "── OpenAI ──" (not selectable — mapped to empty ModelID) + FString Header = FString::Printf(TEXT("── %s ──"), *Prov); + Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Header))); + Pinned->LLMModelIDs.Add(FString()); // empty = separator + LastProvider = Prov; + } + Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Entry.Display))); + Pinned->LLMModelIDs.Add(Entry.ModelID); } // Pre-select the currently set LLMModel if it exists in the list. @@ -767,6 +833,9 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnLLMSelected( int32 Idx = LLMDisplayNames.IndexOfByKey(NewSelection); if (Idx == INDEX_NONE) return; + // Separator headers have empty ModelID — ignore selection. + if (LLMModelIDs[Idx].IsEmpty()) return; + if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = GetEditedAsset()) { Asset->Modify();