Add turn eagerness, speculative turn, adaptive pre-buffer, and latency HUD improvements

- Add TurnEagerness (Eager/Normal/Patient) and bSpeculativeTurn to agent config
  data asset, sent as conversation_config_override at WebSocket connection time
- Add adaptive pre-buffer system: measures inter-chunk TTS timing and decreases
  pre-buffer when chunks arrive fast enough (decrease-only, resets each conversation)
- New UPROPERTY: bAdaptivePreBuffer toggle, AudioPreBufferMs as starting/worst-case value
- Rework latency HUD: TTS+Net, PreBuf actual/target with trend indicator, Gen>Ear,
  WS Ping, server region display
- Fetch ElevenLabs server region from REST API x-region header
- Add editor Detail Customization: TurnEagerness dropdown + SpeculativeTurn checkbox
  in AgentConfig with LLM picker and Language picker

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-06 16:43:20 +01:00
parent 2169c58cd7
commit 4456dfa9dc
9 changed files with 540 additions and 105 deletions

View File

@ -1,8 +1,8 @@
[/Script/EngineSettings.GameMapsSettings] [/Script/EngineSettings.GameMapsSettings]
GameDefaultMap=/Game/voidMap.voidMap GameDefaultMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman
EditorStartupMap=/Game/voidMap.voidMap EditorStartupMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman
[/Script/Engine.RendererSettings] [/Script/Engine.RendererSettings]
r.AllowStaticLighting=False r.AllowStaticLighting=False
@ -182,4 +182,5 @@ ManualIPAddress=
[/Script/PS_AI_ConvAgent.PS_AI_ConvAgent_Settings_ElevenLabs] [/Script/PS_AI_ConvAgent.PS_AI_ConvAgent_Settings_ElevenLabs]
API_Key=7b73c4244ccbec394cc010aaab01b0ec59ce0b11fc636ce4828354f675ca14a5 API_Key=7b73c4244ccbec394cc010aaab01b0ec59ce0b11fc636ce4828354f675ca14a5
ServerRegion=Global

View File

@ -17,6 +17,9 @@
#include "GameFramework/PlayerController.h" #include "GameFramework/PlayerController.h"
#include "Net/UnrealNetwork.h" #include "Net/UnrealNetwork.h"
#include "VoiceModule.h" #include "VoiceModule.h"
#include "HttpModule.h"
#include "Interfaces/IHttpRequest.h"
#include "Interfaces/IHttpResponse.h"
DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_ConvAgent_ElevenLabs, Log, All); DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_ConvAgent_ElevenLabs, Log, All);
@ -147,15 +150,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
if (bPreBuffering) if (bPreBuffering)
{ {
const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0; const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
if (Elapsed >= static_cast<double>(AudioPreBufferMs)) const int32 EffPreBuf = (AudioPreBufferMs > 0)
? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0;
if (Elapsed >= static_cast<double>(EffPreBuf))
{ {
bPreBuffering = false; bPreBuffering = false;
if (bDebug) if (bDebug)
{ {
const double Tpb = FPlatformTime::Seconds() - SessionStartTime; const double Tpb = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms). Starting playback."), TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms adaptive). Starting playback."),
Tpb, LastClosedTurnIndex, AudioPreBufferMs); Tpb, LastClosedTurnIndex, EffPreBuf);
} }
// Only start playback if the agent is still speaking. // Only start playback if the agent is still speaking.
// If silence detection already set bAgentSpeaking=false, this is stale. // If silence detection already set bAgentSpeaking=false, this is stale.
@ -292,6 +297,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
// Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time. // Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time.
if (bShouldBroadcastStopped) if (bShouldBroadcastStopped)
{ {
// Adapt pre-buffer for next turn based on this turn's signals.
ApplyPreBufferAdaptation();
if (bHardTimeoutFired && bDebug) if (bHardTimeoutFired && bDebug)
{ {
const double Tht = FPlatformTime::Seconds() - SessionStartTime; const double Tht = FPlatformTime::Seconds() - SessionStartTime;
@ -321,7 +329,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
{ {
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread(); const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency; const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
if (bShowLatency) // Only draw on the active (connected) Authority component.
// Multiple agents in the scene would overwrite each other's HUD at the same
// BaseKey, causing visible blinking between their values.
if (bShowLatency && IsConnected() && GetOwnerRole() == ROLE_Authority)
{ {
DrawLatencyHUD(); DrawLatencyHUD();
} }
@ -388,6 +399,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StartConversation_Internal()
// Pass configuration to the proxy before connecting. // Pass configuration to the proxy before connecting.
WebSocketProxy->TurnMode = TurnMode; WebSocketProxy->TurnMode = TurnMode;
if (AgentConfig)
{
WebSocketProxy->TurnEagerness = AgentConfig->TurnEagerness;
WebSocketProxy->bSpeculativeTurn = AgentConfig->bSpeculativeTurn;
}
// Resolve AgentID by priority: AgentConfig > component string > project default. // Resolve AgentID by priority: AgentConfig > component string > project default.
FString ResolvedAgentID = AgentID; FString ResolvedAgentID = AgentID;
@ -834,6 +850,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen
SessionStartTime = FPlatformTime::Seconds(); SessionStartTime = FPlatformTime::Seconds();
TurnIndex = 0; TurnIndex = 0;
LastClosedTurnIndex = 0; LastClosedTurnIndex = 0;
// Initialize adaptive pre-buffer from designer settings.
AdaptivePreBufferMs = AudioPreBufferMs; // Start at the designer's value.
PreBufferTrend = 0;
TurnIdealPreBufferMs = -1;
bTurnGapMeasured = false;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("[T+0.00s] Agent connected. ConversationID=%s"), *Info.ConversationID); UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("[T+0.00s] Agent connected. ConversationID=%s"), *Info.ConversationID);
OnAgentConnected.Broadcast(Info); OnAgentConnected.Broadcast(Info);
@ -852,6 +875,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen
} }
} }
// Probe server region once per session (only when latency HUD is enabled).
if (ServerRegion.IsEmpty() && GetOwnerRole() == ROLE_Authority)
{
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
if (bShowLatency)
{
FetchServerRegion();
}
}
// In Client turn mode (push-to-talk), the user controls listening manually via // In Client turn mode (push-to-talk), the user controls listening manually via
// StartListening()/StopListening(). Auto-starting would leave the mic open // StartListening()/StopListening(). Auto-starting would leave the mic open
// permanently and interfere with push-to-talk — the T-release StopListening() // permanently and interfere with push-to-talk — the T-release StopListening()
@ -1081,21 +1115,28 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
// In Server VAD mode, StopListening() is not called — the server detects // In Server VAD mode, StopListening() is not called — the server detects
// end of user speech and immediately starts generating. If TurnEndTime was // end of user speech and immediately starts generating. If TurnEndTime was
// not set by StopListening since the last generation (i.e. it's stale or 0), // not set by StopListening since the last generation (i.e. it's stale or 0),
// use Now as the best client-side approximation. // use the proxy's LastUserTranscriptTime as the best approximation:
// user_transcript arrives after server VAD + ASR, just before LLM starts.
const bool bFreshTurnEnd = (TurnEndTime > GenerationStartTime) && (GenerationStartTime > 0.0); const bool bFreshTurnEnd = (TurnEndTime > GenerationStartTime) && (GenerationStartTime > 0.0);
if (!bFreshTurnEnd) if (!bFreshTurnEnd)
{ {
TurnEndTime = Now; const double TranscriptTime = WebSocketProxy ? WebSocketProxy->GetLastUserTranscriptTime() : 0.0;
TurnEndTime = (TranscriptTime > 0.0) ? TranscriptTime : Now;
} }
// Reset all latency measurements — new response cycle starts here. // New response cycle starts here. All client-side metrics are anchored to
// All metrics are anchored to GenerationStartTime (= now), which is the closest // GenerationStartTime (= now). Do NOT zero CurrentLatencies — the per-field
// client-side proxy for "user stopped speaking" in Server VAD mode. // assignments in EnqueueAgentAudio() overwrite naturally, so the HUD shows the
CurrentLatencies = FDebugLatencies(); // previous turn's values until the new turn's measurements arrive (no "---" blink).
GenerationStartTime = Now; GenerationStartTime = Now;
const double T = Now - SessionStartTime; const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = Now - TurnEndTime; const double LatencyFromTurnEnd = Now - TurnEndTime;
// LLM latency: time from user_transcript received to first text token arriving.
// In Server VAD mode, this approximates LLM TTFT + network (post-ASR).
// In Client turn mode, this is the full ASR + LLM latency.
CurrentLatencies.TurnEndToTextMs = static_cast<float>(LatencyFromTurnEnd * 1000.0);
if (bIsListening) if (bIsListening)
{ {
// In Server VAD + interruption mode, keep the mic open so the server can // In Server VAD + interruption mode, keep the mic open so the server can
@ -1321,7 +1362,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow(
AudioQueueReadOffset = 0; AudioQueueReadOffset = 0;
} }
// Log when queue recovers (new data arrived after being dry) // Queue recovered: was dry, now has data again.
// Only flag as underrun if the gap was long enough to be audible.
// Short gaps (<200ms) are handled seamlessly by USoundWaveProcedural's
// internal silence — no need to increase the pre-buffer for those.
if (bQueueWasDry) if (bQueueWasDry)
{ {
bQueueWasDry = false; bQueueWasDry = false;
@ -1329,7 +1373,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow(
{ {
const double T = FPlatformTime::Seconds() - SessionStartTime; const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."), TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered (%d bytes remaining)."),
T, LastClosedTurnIndex, AudioQueue.Num() - AudioQueueReadOffset); T, LastClosedTurnIndex, AudioQueue.Num() - AudioQueueReadOffset);
} }
} }
@ -1371,6 +1415,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking. bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
bQueueWasDry = false; bQueueWasDry = false;
SilentTickCount = 0; SilentTickCount = 0;
// Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement.
TurnFirstChunkTime = FPlatformTime::Seconds();
TurnFirstChunkBytes = PCMData.Num();
TurnIdealPreBufferMs = -1;
bTurnGapMeasured = false;
// Latency capture (always, for HUD display). // Latency capture (always, for HUD display).
if (GenerationStartTime > 0.0) if (GenerationStartTime > 0.0)
@ -1393,7 +1442,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
MulticastAgentStartedSpeaking(); MulticastAgentStartedSpeaking();
} }
if (AudioPreBufferMs > 0) const int32 EffectivePreBufferMs = (AudioPreBufferMs > 0)
? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0;
if (EffectivePreBufferMs > 0)
{ {
// Pre-buffer: accumulate audio before starting playback. // Pre-buffer: accumulate audio before starting playback.
// This absorbs TTS inter-chunk gaps so chunk 2 arrives before // This absorbs TTS inter-chunk gaps so chunk 2 arrives before
@ -1404,8 +1455,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
{ {
const double Tpb2 = FPlatformTime::Seconds() - SessionStartTime; const double Tpb2 = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms before starting playback."), TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms (adaptive) before starting playback."),
Tpb2, LastClosedTurnIndex, AudioPreBufferMs); Tpb2, LastClosedTurnIndex, EffectivePreBufferMs);
} }
} }
else else
@ -1433,14 +1484,25 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
if (GetOwnerRole() == ROLE_Authority) if (GetOwnerRole() == ROLE_Authority)
{ {
bPreBuffering = false; bPreBuffering = false;
// Measure inter-chunk gap for adaptive pre-buffer.
if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0)
{
const double NowGap = FPlatformTime::Seconds();
const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0;
// Chunk 1 audio duration: 16kHz 16-bit mono = 32000 bytes/sec.
const double Chunk1AudioMs = (TurnFirstChunkBytes > 0)
? (static_cast<double>(TurnFirstChunkBytes) / 32.0) : 0.0;
TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs));
bTurnGapMeasured = true;
}
if (bDebug) if (bDebug)
{ {
const double NowPb = FPlatformTime::Seconds(); const double NowPb = FPlatformTime::Seconds();
const double BufferedMs = (NowPb - PreBufferStartTime) * 1000.0; const double BufferedMs = (NowPb - PreBufferStartTime) * 1000.0;
const double Tpb3 = NowPb - SessionStartTime; const double Tpb3 = NowPb - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."), TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered, ideal=%dms). Starting playback."),
Tpb3, LastClosedTurnIndex, BufferedMs); Tpb3, LastClosedTurnIndex, BufferedMs, TurnIdealPreBufferMs);
} }
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying()) if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{ {
@ -1467,6 +1529,23 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
{ {
AudioPlaybackComponent->Play(); AudioPlaybackComponent->Play();
} }
// Measure inter-chunk gap for adaptive pre-buffer (first gap only).
if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0 && GetOwnerRole() == ROLE_Authority)
{
const double NowGap = FPlatformTime::Seconds();
const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0;
const double Chunk1AudioMs = (TurnFirstChunkBytes > 0)
? (static_cast<double>(TurnFirstChunkBytes) / 32.0) : 0.0;
TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs));
bTurnGapMeasured = true;
if (bDebug)
{
const double T = NowGap - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Inter-chunk gap: %.0fms, chunk1 audio: %.0fms → ideal pre-buffer: %dms"),
T, LastClosedTurnIndex, InterChunkGapMs, Chunk1AudioMs, TurnIdealPreBufferMs);
}
}
// Reset silence counter — new audio arrived, we're not in a gap anymore // Reset silence counter — new audio arrived, we're not in a gap anymore
SilentTickCount = 0; SilentTickCount = 0;
} }
@ -1516,6 +1595,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
// Broadcast outside the lock. // Broadcast outside the lock.
if (bWasSpeaking) if (bWasSpeaking)
{ {
// Adapt pre-buffer for next turn based on this turn's signals.
ApplyPreBufferAdaptation();
if (bDebug) if (bDebug)
{ {
const double T = Now - SessionStartTime; const double T = Now - SessionStartTime;
@ -1536,6 +1618,52 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
} }
} }
void UPS_AI_ConvAgent_ElevenLabsComponent::ApplyPreBufferAdaptation()
{
// Only adapt on Authority (where the WebSocket lives and measurements are taken).
if (GetOwnerRole() != ROLE_Authority) return;
// Adaptive mode must be enabled, and pre-buffering must be active.
if (!bAdaptivePreBuffer || AudioPreBufferMs == 0) return;
// No measurement this turn (single-chunk response or no second chunk arrived).
if (TurnIdealPreBufferMs < 0) { PreBufferTrend = 0; return; }
const int32 Prev = AdaptivePreBufferMs;
// DECREASE-ONLY: the measured ideal tells us the minimum pre-buffer needed.
// If the ideal is lower than our current value, the connection is fast enough
// that we can reduce the pre-buffer and save latency.
// If the ideal is higher (e.g. natural speech pause, slow network), we do NOT
// increase — USoundWaveProcedural handles gaps seamlessly in most cases.
// The user sets AudioPreBufferMs as the "worst case" starting value;
// the system only optimizes downward from there. Resets each conversation.
if (TurnIdealPreBufferMs < AdaptivePreBufferMs)
{
// Ideal is lower — decrease toward it (EMA 30% per turn, with 50ms margin).
const int32 TargetMs = FMath::Max(AdaptivePreBufferMinMs, TurnIdealPreBufferMs + 50);
AdaptivePreBufferMs = FMath::Max(AdaptivePreBufferMinMs,
FMath::RoundToInt32(AdaptivePreBufferMs * 0.7f + TargetMs * 0.3f));
PreBufferTrend = (AdaptivePreBufferMs < Prev) ? -1 : 0;
}
else
{
// Ideal >= current — connection is same or worse, keep current value.
PreBufferTrend = 0;
}
// Reset measurement for next turn.
const int32 IdealForLog = TurnIdealPreBufferMs;
TurnIdealPreBufferMs = -1;
bTurnGapMeasured = false;
if (bDebug && Prev != AdaptivePreBufferMs)
{
const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Adaptive pre-buffer: %d ms -> %d ms (ideal=%dms)"),
T, LastClosedTurnIndex, Prev, AdaptivePreBufferMs, IdealForLog);
}
}
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
// Microphone → WebSocket // Microphone → WebSocket
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
@ -2404,6 +2532,42 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT(""))); bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT("")));
} }
// ─────────────────────────────────────────────────────────────────────────────
// Server region detection (one-shot HTTP probe)
// ─────────────────────────────────────────────────────────────────────────────
void UPS_AI_ConvAgent_ElevenLabsComponent::FetchServerRegion()
{
const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings();
if (!Settings || Settings->API_Key.IsEmpty()) return;
auto Request = FHttpModule::Get().CreateRequest();
Request->SetURL(Settings->GetAPIBaseURL() + TEXT("/v1/models"));
Request->SetVerb(TEXT("GET"));
Request->SetHeader(TEXT("xi-api-key"), Settings->API_Key);
TWeakObjectPtr<UPS_AI_ConvAgent_ElevenLabsComponent> WeakThis(this);
Request->OnProcessRequestComplete().BindLambda(
[WeakThis](FHttpRequestPtr /*Req*/, FHttpResponsePtr Resp, bool bSuccess)
{
if (!bSuccess || !Resp.IsValid()) return;
const FString Region = Resp->GetHeader(TEXT("x-region"));
if (Region.IsEmpty()) return;
AsyncTask(ENamedThreads::GameThread, [WeakThis, Region]()
{
if (WeakThis.IsValid())
{
WeakThis->ServerRegion = Region;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("ElevenLabs server region: %s"), *Region);
}
});
});
Request->ProcessRequest();
}
// ─────────────────────────────────────────────────────────────────────────────
// Latency debug HUD
// ─────────────────────────────────────────────────────────────────────────────
void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
{ {
if (!GEngine) return; if (!GEngine) return;
@ -2412,29 +2576,62 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
const int32 BaseKey = 93700; const int32 BaseKey = 93700;
const float DisplayTime = 1.0f; // long enough to avoid flicker between ticks const float DisplayTime = 1.0f; // long enough to avoid flicker between ticks
const FColor TitleColor = FColor::Cyan; const FColor TitleColor = FColor::Cyan;
const FColor ValueColor = FColor::White; const FColor ValueColor = FColor::White;
const FColor HighlightColor = FColor::Yellow; const FColor HighlightColor = FColor::Yellow;
// Helper: format a single metric — shows "---" when not yet captured this turn
auto Fmt = [](float Ms) -> FString auto Fmt = [](float Ms) -> FString
{ {
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---")); return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
}; };
// Title — all times measured from agent_response_started int32 Row = 0;
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
TEXT("=== Latency (from gen start) ==="));
// 1. Gen → Audio: generation start → first audio chunk (LLM + TTS) GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor,
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor, TEXT("=== Voice-to-Voice Latency ==="));
FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
// 2. Pre-buffer wait before playback // Client-side breakdown: TTS+Net + Pre-buffer = Gen>Ear
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, // Note: LLM latency is only visible on ElevenLabs dashboard (server-side).
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); // In Server VAD mode, no reliable client-side "end of user speech" marker exists.
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" TTS+Net: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
// 3. Gen → Ear: generation start → playback starts (user-perceived total) // Pre-buffer display depends on adaptive mode.
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, if (bAdaptivePreBuffer && AudioPreBufferMs > 0)
{
// Adaptive ON: show actual wait + adaptive target with trend arrow.
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" PreBuf actual: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
const TCHAR* TrendArrow = (PreBufferTrend > 0) ? TEXT(" ^")
: (PreBufferTrend < 0) ? TEXT(" v")
: TEXT("");
const FColor AdaptiveColor = (PreBufferTrend > 0) ? FColor::Red
: (PreBufferTrend < 0) ? FColor::Green
: ValueColor;
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, AdaptiveColor,
FString::Printf(TEXT(" PreBuf target: %d ms%s"), AdaptivePreBufferMs, TrendArrow));
}
else
{
// Adaptive OFF (or pre-buffer disabled): show fixed pre-buffer value.
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
}
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, HighlightColor,
FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs))); FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs)));
// Connection section
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor,
TEXT("--- Connection ---"));
const int32 PingMs = WebSocketProxy ? WebSocketProxy->GetLastPingMs() : -1;
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" WS Ping: %s"),
(PingMs >= 0) ? *FString::Printf(TEXT("%d ms"), PingMs) : TEXT("---")));
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" Region: %s"),
ServerRegion.IsEmpty() ? TEXT("---") : *ServerRegion));
} }

View File

@ -207,41 +207,58 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected()
// This produces smooth continuous audio chunks without the fragmentation caused by // This produces smooth continuous audio chunks without the fragmentation caused by
// explicit optimize_streaming_latency or enable_intermediate_response overrides. // explicit optimize_streaming_latency or enable_intermediate_response overrides.
// //
// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency. // Build turn configuration based on mode + latency settings.
// In Server VAD mode, the config override is empty (matches C++ sample exactly).
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject()); TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client)
{ {
// turn_timeout: how long the server waits after VAD detects silence before
// processing the user's turn. Default is ~3s. In push-to-talk mode this
// directly adds latency — the server waits after the user releases T.
// 1s is safe without speculative_turn (which was removed — see history below).
//
// History:
// turn_timeout=1 was problematic when combined with speculative_turn=true
// (server silently dropped turns 3+). Without speculative_turn, 1s is safe
// and halves the per-turn latency.
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject()); TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
TurnObj->SetNumberField(TEXT("turn_timeout"), 1); bool bHasTurnOverrides = false;
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject()); // In Client (push-to-talk) mode, reduce turn_timeout to minimize latency.
AgentObj->SetObjectField(TEXT("turn"), TurnObj); if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client)
{
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
bHasTurnOverrides = true;
}
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj); // turn_eagerness: controls how quickly the server interprets pauses as end-of-speech.
// "eager" = fastest (may cut user off), "normal" = balanced, "patient" = waits longer.
if (TurnEagerness != EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal)
{
FString EagernessStr;
switch (TurnEagerness)
{
case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Eager: EagernessStr = TEXT("eager"); break;
case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Patient: EagernessStr = TEXT("patient"); break;
default: EagernessStr = TEXT("normal"); break;
}
TurnObj->SetStringField(TEXT("turn_eagerness"), EagernessStr);
bHasTurnOverrides = true;
}
// speculative_turn: start generating a response before confirming end-of-speech.
// Reduces latency but may cause occasional false starts (discarded if user continues).
if (bSpeculativeTurn)
{
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
bHasTurnOverrides = true;
}
if (bHasTurnOverrides)
{
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
}
} }
// NOTE: We intentionally do NOT send these overrides (matching C++ sample): // NOTE: We intentionally do NOT send these overrides (matching C++ sample):
// //
// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes // - tts.optimize_streaming_latency: deprecated by ElevenLabs. Sending any value
// the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely. // changes TTS chunking behaviour. Server default (omitted) is optimal.
// With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
// With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
// Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
// //
// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks // - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
// before finishing generation → fragmented audio. When omitted (C++ sample), the // before finishing generation → fragmented audio. Omitted = server default.
// LLM completes its response first → continuous TTS chunks.
// //
// - custom_llm_extra_body (empty object): Even an empty object might override the // - custom_llm_extra_body (empty object): Even an empty object might override the
// agent's configured custom_llm_extra_body with nothing. Omit entirely. // agent's configured custom_llm_extra_body with nothing. Omit entirely.
@ -259,12 +276,15 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected()
FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter); FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
{ {
const UPS_AI_ConvAgent_Settings_ElevenLabs* S = FPS_AI_ConvAgentModule::Get().GetSettings(); const UPS_AI_ConvAgent_Settings_ElevenLabs* S = FPS_AI_ConvAgentModule::Get().GetSettings();
if (S->bVerboseLogging) if (S && S->bVerboseLogging)
{ {
UE_LOG(LogPS_AI_ConvAgent_WS_ElevenLabs, Verbose, TEXT("Sending initiation: %s"), *InitJson); UE_LOG(LogPS_AI_ConvAgent_WS_ElevenLabs, Verbose, TEXT("Sending initiation: %s"), *InitJson);
} }
} }
WebSocket->Send(InitJson); if (WebSocket.IsValid())
{
WebSocket->Send(InitJson);
}
} }
void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnectionError(const FString& Error) void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnectionError(const FString& Error)
@ -507,6 +527,10 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleTranscript(const TSharedP
return; return;
} }
// Record arrival time for latency measurement (ASR+LLM breakdown).
// user_transcript arrives after server VAD + ASR, just before LLM starts.
LastUserTranscriptTime = FPlatformTime::Seconds();
FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs Segment; FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs Segment;
Segment.Speaker = TEXT("user"); Segment.Speaker = TEXT("user");
(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text); (*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
@ -679,6 +703,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandlePing(const TSharedPtr<FJs
if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent) if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
{ {
(*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID); (*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID);
// Extract server-reported WS round-trip latency.
int32 PingValue = 0;
if ((*PingEvent)->TryGetNumberField(TEXT("ping_ms"), PingValue))
{
LastPingMs.store(PingValue, std::memory_order_relaxed);
}
} }
TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject()); TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
@ -718,7 +749,7 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr
{ {
const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings(); const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings();
// Custom URL override takes full precedence // Custom URL override takes full precedence (advanced / proxy use case)
if (!Settings->CustomWebSocketURL.IsEmpty()) if (!Settings->CustomWebSocketURL.IsEmpty())
{ {
return Settings->CustomWebSocketURL; return Settings->CustomWebSocketURL;
@ -730,9 +761,9 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr
return FString(); return FString();
} }
// Official ElevenLabs Conversational AI WebSocket endpoint // Build URL from the region-aware base: wss://<regional-host>/v1/convai/conversation?agent_id=<ID>
// wss://api.elevenlabs.io/v1/convai/conversation?agent_id=<ID> const FString BaseURL = Settings->GetWSBaseURL();
return FString::Printf( return FString::Printf(
TEXT("wss://api.elevenlabs.io/v1/convai/conversation?agent_id=%s"), TEXT("%s/v1/convai/conversation?agent_id=%s"),
*AgentIDOverride); *BaseURL, *AgentIDOverride);
} }

View File

@ -6,6 +6,22 @@
#include "Modules/ModuleManager.h" #include "Modules/ModuleManager.h"
#include "PS_AI_ConvAgent.generated.h" #include "PS_AI_ConvAgent.generated.h"
// ─────────────────────────────────────────────────────────────────────────────
// ElevenLabs server region
// ─────────────────────────────────────────────────────────────────────────────
/**
 * Selects which ElevenLabs host the plugin connects to.
 * The enumerator-to-host mapping is implemented by GetAPIBaseURL() and
 * GetWSBaseURL() on the settings object; any value without a dedicated host
 * falls back to the global one there.
 */
UENUM()
enum class EPS_AI_ConvAgent_ElevenLabsRegion : uint8
{
/** Automatic global routing (default): api.elevenlabs.io. Server chosen by ElevenLabs based on client location. */
Global UMETA(DisplayName = "Global (auto)"),
/** Force US servers: api.us.elevenlabs.io */
US UMETA(DisplayName = "US"),
/** Force EU servers (Enterprise only): api.eu.residency.elevenlabs.io */
EU UMETA(DisplayName = "EU (Enterprise)"),
/** Force India servers (Enterprise only): api.in.residency.elevenlabs.io */
India UMETA(DisplayName = "India (Enterprise)")
};
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
// Settings object exposed in Project Settings → Plugins → PS AI ConvAgent - ElevenLabs // Settings object exposed in Project Settings → Plugins → PS AI ConvAgent - ElevenLabs
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
@ -24,8 +40,17 @@ public:
FString API_Key; FString API_Key;
/** /**
* Override the ElevenLabs WebSocket base URL. Leave empty to use the default: * Server region for ElevenLabs API.
* wss://api.elevenlabs.io/v1/convai/conversation * - Global (default): automatic routing based on client location.
* - US: force US servers (api.us.elevenlabs.io).
* - EU / India: Enterprise-only data residency endpoints.
*/
UPROPERTY(Config, EditAnywhere, Category = "PS AI ConvAgent|ElevenLabs API")
EPS_AI_ConvAgent_ElevenLabsRegion ServerRegion = EPS_AI_ConvAgent_ElevenLabsRegion::Global;
/**
* Override the ElevenLabs WebSocket URL entirely. Leave empty to use ServerRegion setting.
* Example: wss://custom-proxy.example.com/v1/convai/conversation?agent_id=YOUR_ID
*/ */
UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API") UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API")
FString CustomWebSocketURL; FString CustomWebSocketURL;
@ -33,6 +58,30 @@ public:
/** Log verbose WebSocket messages to the Output Log (useful during development). */ /** Log verbose WebSocket messages to the Output Log (useful during development). */
UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API") UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API")
bool bVerboseLogging = false; bool bVerboseLogging = false;
/**
 * HTTPS base URL of the ElevenLabs REST API for the configured ServerRegion.
 * Global, and any value without a dedicated host, resolves to api.elevenlabs.io.
 * The returned string carries no trailing slash and no path.
 */
FString GetAPIBaseURL() const
{
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::US)
	{
		return TEXT("https://api.us.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::EU)
	{
		return TEXT("https://api.eu.residency.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::India)
	{
		return TEXT("https://api.in.residency.elevenlabs.io");
	}

	// Global (and anything unrecognised) uses the auto-routed host.
	return TEXT("https://api.elevenlabs.io");
}
/**
 * Secure WebSocket (wss) base URL for the configured ServerRegion.
 * Global, and any value without a dedicated host, resolves to api.elevenlabs.io.
 * The returned string carries no trailing slash and no path.
 */
FString GetWSBaseURL() const
{
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::US)
	{
		return TEXT("wss://api.us.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::EU)
	{
		return TEXT("wss://api.eu.residency.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::India)
	{
		return TEXT("wss://api.in.residency.elevenlabs.io");
	}

	// Global (and anything unrecognised) uses the auto-routed host.
	return TEXT("wss://api.elevenlabs.io");
}
}; };

View File

@ -4,6 +4,7 @@
#include "CoreMinimal.h" #include "CoreMinimal.h"
#include "Engine/DataAsset.h" #include "Engine/DataAsset.h"
#include "PS_AI_ConvAgent_Definitions.h"
#include "PS_AI_ConvAgent_AgentConfig_ElevenLabs.generated.h" #include "PS_AI_ConvAgent_AgentConfig_ElevenLabs.generated.h"
/** /**
@ -186,6 +187,24 @@ public:
ToolTip = "Max conversation turns.\n0 = unlimited.")) ToolTip = "Max conversation turns.\n0 = unlimited."))
int32 MaxTurns = 0; int32 MaxTurns = 0;
// ── Latency / Turn-taking ───────────────────────────────────────────────
/** How quickly the server detects end-of-speech and starts responding.
* Eager = fastest response, may cut the user off during pauses.
* Normal = balanced (default). Patient = waits longer for user to finish.
* Sent as conversation_config_override at WebSocket connection time. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency",
meta = (ToolTip = "Controls how quickly the server detects end-of-speech.\n- Eager: fastest response, may interrupt mid-pause.\n- Normal: balanced (default).\n- Patient: waits longer for user to finish."))
EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal;
/** Enable speculative turn processing: the server starts generating a response
* before it's certain the user has finished speaking. If the user continues,
* the speculative response is discarded. Reduces perceived latency.
* May cause occasional false starts; disable if the agent interrupts too often. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency",
meta = (ToolTip = "Start generating a response before confirming end-of-speech.\nReduces latency but may cause occasional false starts.\nDisable if the agent interrupts the user too often."))
bool bSpeculativeTurn = false;
// ── Emotion Tool ───────────────────────────────────────────────────────── // ── Emotion Tool ─────────────────────────────────────────────────────────
/** Include the built-in "set_emotion" client tool in the agent configuration. /** Include the built-in "set_emotion" client tool in the agent configuration.

View File

@ -29,6 +29,20 @@ enum class EPS_AI_ConvAgent_TurnMode_ElevenLabs : uint8
Client UMETA(DisplayName = "Client Controlled"), Client UMETA(DisplayName = "Client Controlled"),
}; };
// ─────────────────────────────────────────────────────────────────────────────
// Agent turn eagerness — controls how quickly the server detects end of speech
// ─────────────────────────────────────────────────────────────────────────────
/**
 * How readily the server treats a pause in user speech as the end of the turn.
 * Normal is the default used by both the agent config asset and the WebSocket proxy.
 */
UENUM(BlueprintType)
enum class EPS_AI_ConvAgent_TurnEagerness_ElevenLabs : uint8
{
/** Quick response at the earliest opportunity; may cut the user off during pauses. Best for customer service. */
Eager UMETA(DisplayName = "Eager"),
/** Balanced turn-taking for general scenarios (default). */
Normal UMETA(DisplayName = "Normal"),
/** Longer wait for user to finish. Best for information collection. */
Patient UMETA(DisplayName = "Patient"),
};
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────
// WebSocket message type helpers (internal, not exposed to Blueprint) // WebSocket message type helpers (internal, not exposed to Blueprint)
// ───────────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────────

View File

@ -185,14 +185,24 @@ public:
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text.")) meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
bool bEnableAgentPartialResponse = false; bool bEnableAgentPartialResponse = false;
/** Pre-buffer delay (ms) before starting audio playback on the first chunk. /** Pre-buffer delay (ms) before starting audio playback on the first TTS chunk.
* Delays playback start so early TTS chunks can accumulate, preventing * Set this to your "worst case" value (e.g. 300-1000ms depending on connection).
* mid-sentence pauses when the second chunk hasn't arrived yet. * When adaptive mode is on, the system starts here and can only decrease
* Set to 0 for immediate playback. */ * (never increase) as it measures that chunks arrive fast enough.
* Set to 0 to disable pre-buffering entirely. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency", UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency",
meta = (ClampMin = "0", ClampMax = "4000", meta = (ClampMin = "0", ClampMax = "4000",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback.")) ToolTip = "Pre-buffer delay (ms) — your safe 'worst case' value.\nAdaptive mode can only decrease from here, never increase.\nSet 0 to disable pre-buffering entirely."))
int32 AudioPreBufferMs = 2000; int32 AudioPreBufferMs = 300;
/** Enable adaptive pre-buffer: measures inter-chunk timing and automatically
* lowers the pre-buffer when TTS chunks arrive fast enough.
* The system can only decrease from AudioPreBufferMs, never increase.
* Resets to AudioPreBufferMs at the start of each conversation. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency",
meta = (EditCondition = "AudioPreBufferMs > 0",
ToolTip = "Automatically lower pre-buffer when connection is good.\nCan only decrease, never increase beyond AudioPreBufferMs.\nResets each conversation."))
bool bAdaptivePreBuffer = true;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */ /** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs", UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs",
@ -640,18 +650,23 @@ private:
double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating. double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating.
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
// Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted. // Current-turn latency measurements (ms). Overwritten per-field as each
// All anchored to GenerationStartTime (agent_response_started event), which is // measurement is captured — NOT reset to zero between turns, so the HUD
// the closest client-side proxy for "user stopped speaking" in Server VAD mode. // always shows the most recent value instead of blinking "---".
// Zero means "not yet measured this turn". // All anchored to GenerationStartTime (agent_response_started event).
struct FDebugLatencies struct FDebugLatencies
{ {
float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS) float TurnEndToTextMs = 0.0f; // user turn end → first text from LLM (≈ ASR + LLM TTFT)
float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (≈ TTS + network)
float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts
float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived) float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived)
}; };
FDebugLatencies CurrentLatencies; FDebugLatencies CurrentLatencies;
// ElevenLabs server region (from x-region header on REST API). Fetched once per session.
FString ServerRegion;
void FetchServerRegion();
// Accumulates incoming PCM bytes until the audio component needs data. // Accumulates incoming PCM bytes until the audio component needs data.
// Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every // Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every
// underflow callback (~60Hz). Compacted periodically when read offset // underflow callback (~60Hz). Compacted periodically when read offset
@ -664,6 +679,22 @@ private:
bool bPreBuffering = false; bool bPreBuffering = false;
double PreBufferStartTime = 0.0; double PreBufferStartTime = 0.0;
// ── Adaptive pre-buffer ─────────────────────────────────────────────────
// Runtime pre-buffer duration (ms). Equals AudioPreBufferMs when adaptive is off.
// When adaptive is on: initialized from AudioPreBufferMs, adjusted based on
// measured inter-chunk timing (not queue-dry detection).
int32 AdaptivePreBufferMs = 300;
static constexpr int32 AdaptivePreBufferMinMs = 50;
// Direction of last adaptation: +1=raised, -1=lowered, 0=stable. Used by HUD.
int32 PreBufferTrend = 0;
void ApplyPreBufferAdaptation();
// Per-turn inter-chunk timing measurement (game thread only).
// Set when the second TTS chunk arrives, consumed at turn end.
double TurnFirstChunkTime = 0.0; // When chunk 1 arrived.
int32 TurnFirstChunkBytes = 0; // Bytes in chunk 1 (to estimate audio duration).
int32 TurnIdealPreBufferMs = -1; // Computed ideal pre-buffer. -1 = not measured.
bool bTurnGapMeasured = false; // True after first inter-chunk gap is measured.
// Debug: track when the AudioQueue runs dry during speech (one-shot log). // Debug: track when the AudioQueue runs dry during speech (one-shot log).
bool bQueueWasDry = false; bool bQueueWasDry = false;

View File

@ -197,6 +197,18 @@ public:
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs") UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const { return ConversationInfo; } const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const { return ConversationInfo; }
/** Latest WebSocket round-trip latency reported by the server (ms).
* Returns -1 if no ping has been received yet. Thread-safe. */
int32 GetLastPingMs() const { return LastPingMs.load(std::memory_order_relaxed); }
/** Timestamp of the last user audio chunk sent to the server.
* Used as a proxy for "user stopped speaking" in Server VAD mode. */
double GetLastAudioChunkSentTime() const { return LastAudioChunkSentTime; }
/** Timestamp of the last user_transcript received from the server.
* Marks when the server finished ASR; the best anchor for LLM latency measurement. */
double GetLastUserTranscriptTime() const { return LastUserTranscriptTime; }
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
// Internal // Internal
// ───────────────────────────────────────────────────────────────────────── // ─────────────────────────────────────────────────────────────────────────
@ -235,10 +247,16 @@ private:
TArray<uint8> BinaryFrameBuffer; TArray<uint8> BinaryFrameBuffer;
// ── Latency tracking ───────────────────────────────────────────────────── // ── Latency tracking ─────────────────────────────────────────────────────
// Server-reported WebSocket round-trip latency from ping events (~every 2s).
// Atomic: written from WS callback thread, read from game thread (HUD).
std::atomic<int32> LastPingMs{-1};
// Timestamp of the last audio chunk sent (user speech). // Timestamp of the last audio chunk sent (user speech).
double LastAudioChunkSentTime = 0.0; double LastAudioChunkSentTime = 0.0;
// Timestamp when user turn ended (StopListening). // Timestamp when user turn ended (StopListening).
double UserTurnEndTime = 0.0; double UserTurnEndTime = 0.0;
// Timestamp of the last user_transcript received (server finished ASR).
double LastUserTranscriptTime = 0.0;
// Whether we are waiting for the first response after user stopped speaking. // Whether we are waiting for the first response after user stopped speaking.
// Atomic: defensive — documents thread-safety contract. // Atomic: defensive — documents thread-safety contract.
std::atomic<bool> bWaitingForResponse{false}; std::atomic<bool> bWaitingForResponse{false};
@ -264,4 +282,10 @@ public:
// Set by UPS_AI_ConvAgent_ElevenLabsComponent before calling Connect(). // Set by UPS_AI_ConvAgent_ElevenLabsComponent before calling Connect().
// Controls turn_timeout in conversation_initiation_client_data. // Controls turn_timeout in conversation_initiation_client_data.
EPS_AI_ConvAgent_TurnMode_ElevenLabs TurnMode = EPS_AI_ConvAgent_TurnMode_ElevenLabs::Server; EPS_AI_ConvAgent_TurnMode_ElevenLabs TurnMode = EPS_AI_ConvAgent_TurnMode_ElevenLabs::Server;
// Controls how eagerly the server interprets pauses as end-of-speech.
EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal;
// Start generating before confirming end-of-speech (reduces latency, may cause false starts).
bool bSpeculativeTurn = false;
}; };

View File

@ -22,33 +22,43 @@
DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_AgentConfigEditor, Log, All); DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_AgentConfigEditor, Log, All);
// Approximate LLM latencies as shown on the ElevenLabs dashboard. // Approximate LLM latencies as shown on the ElevenLabs dashboard (March 2026).
// The API does not expose this data — values are indicative and may change. // The /v1/convai/llm/list API does NOT expose latency — values are indicative.
// Update this table periodically to stay current. // Update this table periodically to stay current.
static FString GetLLMLatencyHint(const FString& ModelID) static FString GetLLMLatencyHint(const FString& ModelID)
{ {
struct FLatencyEntry { const TCHAR* ID; const TCHAR* Latency; }; struct FLatencyEntry { const TCHAR* ID; const TCHAR* Latency; };
static const FLatencyEntry Entries[] = static const FLatencyEntry Entries[] =
{ {
// OpenAI // ── ElevenLabs-hosted ─────────────────────────────────────────────
{ TEXT("gpt-4o-mini"), TEXT("~350ms") }, { TEXT("glm-4.5-air"), TEXT("~949ms") },
{ TEXT("gpt-4o"), TEXT("~700ms") }, { TEXT("qwen3-30b-a3b"), TEXT("~189ms") },
{ TEXT("gpt-4"), TEXT("~900ms") }, { TEXT("gpt-oss-120b"), TEXT("~321ms") },
{ TEXT("gpt-4-turbo"), TEXT("~650ms") }, // ── Google ────────────────────────────────────────────────────────
// Anthropic { TEXT("gemini-3-pro"), TEXT("~3.5s") },
{ TEXT("claude-sonnet-4-5"), TEXT("~750ms") }, { TEXT("gemini-3-flash"), TEXT("~1.4s") },
{ TEXT("claude-haiku-4-5"), TEXT("~350ms") }, { TEXT("gemini-2.5-flash"), TEXT("~967ms") },
{ TEXT("claude-3-5-sonnet"), TEXT("~700ms") }, { TEXT("gemini-2.5-flash-lite"), TEXT("~605ms") },
// Google // ── OpenAI ────────────────────────────────────────────────────────
{ TEXT("gemini-1.5-pro"), TEXT("~500ms") }, { TEXT("gpt-5"), TEXT("~1.1s") },
{ TEXT("gemini-2.0-flash"), TEXT("~300ms") }, { TEXT("gpt-5.1"), TEXT("~980ms") },
{ TEXT("gemini-2.5-flash"), TEXT("~250ms") }, { TEXT("gpt-5.2"), TEXT("~795ms") },
// xAI { TEXT("gpt-5-mini"), TEXT("~884ms") },
{ TEXT("grok-beta"), TEXT("~500ms") }, { TEXT("gpt-5-nano"), TEXT("~734ms") },
// ElevenLabs-hosted { TEXT("gpt-4.1"), TEXT("~870ms") },
{ TEXT("qwen3-30b-a3b"), TEXT("~207ms") }, { TEXT("gpt-4.1-mini"), TEXT("~916ms") },
{ TEXT("glm-4.5-air"), TEXT("~980ms") }, { TEXT("gpt-4.1-nano"), TEXT("~574ms") },
{ TEXT("gpt-oss-120b"), TEXT("~331ms") }, { TEXT("gpt-4o"), TEXT("~728ms") },
{ TEXT("gpt-4o-mini"), TEXT("~767ms") },
{ TEXT("gpt-4-turbo"), TEXT("~1.5s") },
{ TEXT("gpt-3.5-turbo"), TEXT("~458ms") },
// ── Anthropic ─────────────────────────────────────────────────────
{ TEXT("claude-sonnet-4-5"), TEXT("~1.4s") },
{ TEXT("claude-sonnet-4"), TEXT("~1.1s") },
{ TEXT("claude-haiku-4-5"), TEXT("~644ms") },
{ TEXT("claude-3.7-sonnet"), TEXT("~1.2s") },
{ TEXT("claude-3-haiku"), TEXT("~484ms") },
{ TEXT("claude-3-5-sonnet"), TEXT("~1.2s") },
}; };
for (const auto& E : Entries) for (const auto& E : Entries)
@ -58,6 +68,22 @@ static FString GetLLMLatencyHint(const FString& ModelID)
return FString(); return FString();
} }
// Infer the provider of a model from its ID, for display grouping in the LLM picker.
// Returns "ElevenLabs", "OpenAI", "Anthropic", "Google" or "xAI", or an empty
// string when the ID matches no known pattern (the picker lists those under "Other").
static FString GetLLMProvider(const FString& ModelID)
{
	// ElevenLabs-hosted models are identified by exact ID and must be tested
	// BEFORE the prefix rules: "gpt-oss-120b" also starts with "gpt-" and would
	// otherwise be reported as OpenAI, leaving its ElevenLabs check unreachable.
	if (ModelID == TEXT("glm-4.5-air") || ModelID == TEXT("qwen3-30b-a3b") || ModelID == TEXT("gpt-oss-120b"))
	{
		return TEXT("ElevenLabs");
	}

	// Remaining providers are recognised by model-ID prefix.
	if (ModelID.StartsWith(TEXT("gpt-")) || ModelID.StartsWith(TEXT("o1")) || ModelID.StartsWith(TEXT("o3")))
	{
		return TEXT("OpenAI");
	}
	if (ModelID.StartsWith(TEXT("claude-")))
	{
		return TEXT("Anthropic");
	}
	if (ModelID.StartsWith(TEXT("gemini-")))
	{
		return TEXT("Google");
	}
	if (ModelID.StartsWith(TEXT("grok")))
	{
		return TEXT("xAI");
	}

	// Unknown provider: the caller decides how to label it.
	return FString();
}
// Language code → display name. Shared by BuildAgentPayload (to resolve // Language code → display name. Shared by BuildAgentPayload (to resolve
// {Language} placeholder) and the fetch handler (to strip the resolved fragment). // {Language} placeholder) and the fetch handler (to strip the resolved fragment).
static FString GetLanguageDisplayName(const FString& LangCode) static FString GetLanguageDisplayName(const FString& LangCode)
@ -332,9 +358,11 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::CustomizeDetails(
.Font(IDetailLayoutBuilder::GetDetailFont()) .Font(IDetailLayoutBuilder::GetDetailFont())
] ]
.ValueContent() .ValueContent()
.MaxDesiredWidth(600.f)
[ [
SNew(SBox) SNew(SBox)
.MinDesiredHeight(200.f) .MinDesiredHeight(200.f)
.MinDesiredWidth(400.f)
[ [
SNew(SMultiLineEditableTextBox) SNew(SMultiLineEditableTextBox)
.Font(IDetailLayoutBuilder::GetDetailFont()) .Font(IDetailLayoutBuilder::GetDetailFont())
@ -679,6 +707,10 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
Pinned->LLMDisplayNames.Reset(); Pinned->LLMDisplayNames.Reset();
Pinned->LLMModelIDs.Reset(); Pinned->LLMModelIDs.Reset();
// Collect models grouped by provider for sorted display.
struct FLLMEntry { FString ModelID; FString Provider; FString Display; bool bCheckpoint; };
TArray<FLLMEntry> AllEntries;
for (const auto& LLMVal : *LLMs) for (const auto& LLMVal : *LLMs)
{ {
const TSharedPtr<FJsonObject>* LLMObj = nullptr; const TSharedPtr<FJsonObject>* LLMObj = nullptr;
@ -703,28 +735,62 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
} }
} }
// Check if it's a checkpoint model (sub-version).
bool bIsCheckpoint = false; bool bIsCheckpoint = false;
(*LLMObj)->TryGetBoolField(TEXT("is_checkpoint"), bIsCheckpoint); (*LLMObj)->TryGetBoolField(TEXT("is_checkpoint"), bIsCheckpoint);
// Build display string: "model-id (~350ms)" or " model-id (checkpoint, ~350ms)"
const FString Latency = GetLLMLatencyHint(ModelID); const FString Latency = GetLLMLatencyHint(ModelID);
const FString Provider = GetLLMProvider(ModelID);
// Build display: " model-id (checkpoint, ~350ms)" for checkpoints,
// "model-id (~350ms)" for main models.
FString Display; FString Display;
if (bIsCheckpoint) if (bIsCheckpoint)
{ {
Display = Latency.IsEmpty() Display = Latency.IsEmpty()
? FString::Printf(TEXT(" %s (checkpoint)"), *ModelID) ? FString::Printf(TEXT(" %s (checkpoint)"), *ModelID)
: FString::Printf(TEXT(" %s (checkpoint, %s)"), *ModelID, *Latency); : FString::Printf(TEXT(" %s (checkpoint, %s)"), *ModelID, *Latency);
} }
else else
{ {
Display = Latency.IsEmpty() Display = Latency.IsEmpty()
? ModelID ? FString::Printf(TEXT(" %s"), *ModelID)
: FString::Printf(TEXT("%s (%s)"), *ModelID, *Latency); : FString::Printf(TEXT(" %s (%s)"), *ModelID, *Latency);
} }
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Display))); AllEntries.Add({ ModelID, Provider, Display, bIsCheckpoint });
Pinned->LLMModelIDs.Add(ModelID); }
// Sort by provider order (ElevenLabs, Google, OpenAI, Anthropic, xAI, then any
// unrecognised provider last), then main models before checkpoints, then by model ID.
static const TArray<FString> ProviderOrder = {
TEXT("ElevenLabs"), TEXT("Google"), TEXT("OpenAI"), TEXT("Anthropic"), TEXT("xAI")
};
AllEntries.Sort([](const FLLMEntry& A, const FLLMEntry& B)
{
int32 IdxA = ProviderOrder.IndexOfByKey(A.Provider);
int32 IdxB = ProviderOrder.IndexOfByKey(B.Provider);
if (IdxA == INDEX_NONE) IdxA = ProviderOrder.Num();
if (IdxB == INDEX_NONE) IdxB = ProviderOrder.Num();
if (IdxA != IdxB) return IdxA < IdxB;
if (A.bCheckpoint != B.bCheckpoint) return !A.bCheckpoint; // main first
return A.ModelID < B.ModelID;
});
// Insert provider headers as non-selectable separator entries.
FString LastProvider;
for (const auto& Entry : AllEntries)
{
const FString& Prov = Entry.Provider.IsEmpty() ? TEXT("Other") : Entry.Provider;
if (Prov != LastProvider)
{
// Header line: "── OpenAI ──" (not selectable — mapped to empty ModelID)
FString Header = FString::Printf(TEXT("── %s ──"), *Prov);
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Header)));
Pinned->LLMModelIDs.Add(FString()); // empty = separator
LastProvider = Prov;
}
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Entry.Display)));
Pinned->LLMModelIDs.Add(Entry.ModelID);
} }
// Pre-select the currently set LLMModel if it exists in the list. // Pre-select the currently set LLMModel if it exists in the list.
@ -767,6 +833,9 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnLLMSelected(
int32 Idx = LLMDisplayNames.IndexOfByKey(NewSelection); int32 Idx = LLMDisplayNames.IndexOfByKey(NewSelection);
if (Idx == INDEX_NONE) return; if (Idx == INDEX_NONE) return;
// Separator headers have empty ModelID — ignore selection.
if (LLMModelIDs[Idx].IsEmpty()) return;
if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = GetEditedAsset()) if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = GetEditedAsset())
{ {
Asset->Modify(); Asset->Modify();