Compare commits

..

3 Commits

Author SHA1 Message Date
eaa52a5c5f Unreal data updated 2026-03-06 17:07:30 +01:00
28aed55cd3 Allow switching agents mid-conversation by looking at another agent
Conversation lock no longer prevents switching to a different agent.
When in an active conversation, the player can look at another nearby
agent for ConversationSwitchDelay seconds (default 1s) to switch.
Looking at empty space keeps the current agent selected (no deselect).
Works in multiplayer — each player has independent switch tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-06 17:07:03 +01:00
4456dfa9dc Add turn eagerness, speculative turn, adaptive pre-buffer, and latency HUD improvements
- Add TurnEagerness (Eager/Normal/Patient) and bSpeculativeTurn to agent config
  data asset, sent as conversation_config_override at WebSocket connection time
- Add adaptive pre-buffer system: measures inter-chunk TTS timing and decreases
  pre-buffer when chunks arrive fast enough (decrease-only, resets each conversation)
- New UPROPERTY: bAdaptivePreBuffer toggle, AudioPreBufferMs as starting/worst-case value
- Rework latency HUD: TTS+Net, PreBuf actual/target with trend indicator, Gen>Ear,
  WS Ping, server region display
- Fetch ElevenLabs server region from REST API x-region header
- Add editor Detail Customization: TurnEagerness dropdown + SpeculativeTurn checkbox
  in AgentConfig with LLM picker and Language picker

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-06 16:43:20 +01:00
15 changed files with 613 additions and 117 deletions

View File

@ -1,8 +1,8 @@
[/Script/EngineSettings.GameMapsSettings]
GameDefaultMap=/Game/voidMap.voidMap
EditorStartupMap=/Game/voidMap.voidMap
GameDefaultMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman
EditorStartupMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman
[/Script/Engine.RendererSettings]
r.AllowStaticLighting=False
@ -182,4 +182,5 @@ ManualIPAddress=
[/Script/PS_AI_ConvAgent.PS_AI_ConvAgent_Settings_ElevenLabs]
API_Key=7b73c4244ccbec394cc010aaab01b0ec59ce0b11fc636ce4828354f675ca14a5
ServerRegion=Global

Binary file not shown.

Binary file not shown.

View File

@ -17,6 +17,9 @@
#include "GameFramework/PlayerController.h"
#include "Net/UnrealNetwork.h"
#include "VoiceModule.h"
#include "HttpModule.h"
#include "Interfaces/IHttpRequest.h"
#include "Interfaces/IHttpResponse.h"
DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_ConvAgent_ElevenLabs, Log, All);
@ -147,15 +150,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
if (bPreBuffering)
{
const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
if (Elapsed >= static_cast<double>(AudioPreBufferMs))
const int32 EffPreBuf = (AudioPreBufferMs > 0)
? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0;
if (Elapsed >= static_cast<double>(EffPreBuf))
{
bPreBuffering = false;
if (bDebug)
{
const double Tpb = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms). Starting playback."),
Tpb, LastClosedTurnIndex, AudioPreBufferMs);
TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms adaptive). Starting playback."),
Tpb, LastClosedTurnIndex, EffPreBuf);
}
// Only start playback if the agent is still speaking.
// If silence detection already set bAgentSpeaking=false, this is stale.
@ -292,6 +297,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
// Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time.
if (bShouldBroadcastStopped)
{
// Adapt pre-buffer for next turn based on this turn's signals.
ApplyPreBufferAdaptation();
if (bHardTimeoutFired && bDebug)
{
const double Tht = FPlatformTime::Seconds() - SessionStartTime;
@ -321,7 +329,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
{
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
if (bShowLatency)
// Only draw on the active (connected) Authority component.
// Multiple agents in the scene would overwrite each other's HUD at the same
// BaseKey, causing visible blinking between their values.
if (bShowLatency && IsConnected() && GetOwnerRole() == ROLE_Authority)
{
DrawLatencyHUD();
}
@ -388,6 +399,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StartConversation_Internal()
// Pass configuration to the proxy before connecting.
WebSocketProxy->TurnMode = TurnMode;
if (AgentConfig)
{
WebSocketProxy->TurnEagerness = AgentConfig->TurnEagerness;
WebSocketProxy->bSpeculativeTurn = AgentConfig->bSpeculativeTurn;
}
// Resolve AgentID by priority: AgentConfig > component string > project default.
FString ResolvedAgentID = AgentID;
@ -834,6 +850,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen
SessionStartTime = FPlatformTime::Seconds();
TurnIndex = 0;
LastClosedTurnIndex = 0;
// Initialize adaptive pre-buffer from designer settings.
AdaptivePreBufferMs = AudioPreBufferMs; // Start at the designer's value.
PreBufferTrend = 0;
TurnIdealPreBufferMs = -1;
bTurnGapMeasured = false;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("[T+0.00s] Agent connected. ConversationID=%s"), *Info.ConversationID);
OnAgentConnected.Broadcast(Info);
@ -852,6 +875,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen
}
}
// Probe server region once per session (only when latency HUD is enabled).
if (ServerRegion.IsEmpty() && GetOwnerRole() == ROLE_Authority)
{
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
if (bShowLatency)
{
FetchServerRegion();
}
}
// In Client turn mode (push-to-talk), the user controls listening manually via
// StartListening()/StopListening(). Auto-starting would leave the mic open
// permanently and interfere with push-to-talk — the T-release StopListening()
@ -1081,21 +1115,28 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
// In Server VAD mode, StopListening() is not called — the server detects
// end of user speech and immediately starts generating. If TurnEndTime was
// not set by StopListening since the last generation (i.e. it's stale or 0),
// use Now as the best client-side approximation.
// use the proxy's LastUserTranscriptTime as the best approximation:
// user_transcript arrives after server VAD + ASR, just before LLM starts.
const bool bFreshTurnEnd = (TurnEndTime > GenerationStartTime) && (GenerationStartTime > 0.0);
if (!bFreshTurnEnd)
{
TurnEndTime = Now;
const double TranscriptTime = WebSocketProxy ? WebSocketProxy->GetLastUserTranscriptTime() : 0.0;
TurnEndTime = (TranscriptTime > 0.0) ? TranscriptTime : Now;
}
// Reset all latency measurements — new response cycle starts here.
// All metrics are anchored to GenerationStartTime (= now), which is the closest
// client-side proxy for "user stopped speaking" in Server VAD mode.
CurrentLatencies = FDebugLatencies();
// New response cycle starts here. All client-side metrics are anchored to
// GenerationStartTime (= now). Do NOT zero CurrentLatencies — the per-field
// assignments in EnqueueAgentAudio() overwrite naturally, so the HUD shows the
// previous turn's values until the new turn's measurements arrive (no "---" blink).
GenerationStartTime = Now;
const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = Now - TurnEndTime;
// LLM latency: time from user_transcript received to first text token arriving.
// In Server VAD mode, this approximates LLM TTFT + network (post-ASR).
// In Client turn mode, this is the full ASR + LLM latency.
CurrentLatencies.TurnEndToTextMs = static_cast<float>(LatencyFromTurnEnd * 1000.0);
if (bIsListening)
{
// In Server VAD + interruption mode, keep the mic open so the server can
@ -1321,7 +1362,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow(
AudioQueueReadOffset = 0;
}
// Log when queue recovers (new data arrived after being dry)
// Queue recovered: was dry, now has data again.
// Only flag as underrun if the gap was long enough to be audible.
// Short gaps (<200ms) are handled seamlessly by USoundWaveProcedural's
// internal silence — no need to increase the pre-buffer for those.
if (bQueueWasDry)
{
bQueueWasDry = false;
@ -1329,7 +1373,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow(
{
const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered (%d bytes remaining)."),
T, LastClosedTurnIndex, AudioQueue.Num() - AudioQueueReadOffset);
}
}
@ -1371,6 +1415,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
bQueueWasDry = false;
SilentTickCount = 0;
// Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement.
TurnFirstChunkTime = FPlatformTime::Seconds();
TurnFirstChunkBytes = PCMData.Num();
TurnIdealPreBufferMs = -1;
bTurnGapMeasured = false;
// Latency capture (always, for HUD display).
if (GenerationStartTime > 0.0)
@ -1393,7 +1442,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
MulticastAgentStartedSpeaking();
}
if (AudioPreBufferMs > 0)
const int32 EffectivePreBufferMs = (AudioPreBufferMs > 0)
? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0;
if (EffectivePreBufferMs > 0)
{
// Pre-buffer: accumulate audio before starting playback.
// This absorbs TTS inter-chunk gaps so chunk 2 arrives before
@ -1404,8 +1455,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
{
const double Tpb2 = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms before starting playback."),
Tpb2, LastClosedTurnIndex, AudioPreBufferMs);
TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms (adaptive) before starting playback."),
Tpb2, LastClosedTurnIndex, EffectivePreBufferMs);
}
}
else
@ -1433,14 +1484,25 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
if (GetOwnerRole() == ROLE_Authority)
{
bPreBuffering = false;
// Measure inter-chunk gap for adaptive pre-buffer.
if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0)
{
const double NowGap = FPlatformTime::Seconds();
const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0;
// Chunk 1 audio duration: 16kHz 16-bit mono = 32000 bytes/sec.
const double Chunk1AudioMs = (TurnFirstChunkBytes > 0)
? (static_cast<double>(TurnFirstChunkBytes) / 32.0) : 0.0;
TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs));
bTurnGapMeasured = true;
}
if (bDebug)
{
const double NowPb = FPlatformTime::Seconds();
const double BufferedMs = (NowPb - PreBufferStartTime) * 1000.0;
const double Tpb3 = NowPb - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."),
Tpb3, LastClosedTurnIndex, BufferedMs);
TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered, ideal=%dms). Starting playback."),
Tpb3, LastClosedTurnIndex, BufferedMs, TurnIdealPreBufferMs);
}
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
{
@ -1467,6 +1529,23 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
{
AudioPlaybackComponent->Play();
}
// Measure inter-chunk gap for adaptive pre-buffer (first gap only).
if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0 && GetOwnerRole() == ROLE_Authority)
{
const double NowGap = FPlatformTime::Seconds();
const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0;
const double Chunk1AudioMs = (TurnFirstChunkBytes > 0)
? (static_cast<double>(TurnFirstChunkBytes) / 32.0) : 0.0;
TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs));
bTurnGapMeasured = true;
if (bDebug)
{
const double T = NowGap - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Inter-chunk gap: %.0fms, chunk1 audio: %.0fms → ideal pre-buffer: %dms"),
T, LastClosedTurnIndex, InterChunkGapMs, Chunk1AudioMs, TurnIdealPreBufferMs);
}
}
// Reset silence counter — new audio arrived, we're not in a gap anymore
SilentTickCount = 0;
}
@ -1516,6 +1595,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
// Broadcast outside the lock.
if (bWasSpeaking)
{
// Adapt pre-buffer for next turn based on this turn's signals.
ApplyPreBufferAdaptation();
if (bDebug)
{
const double T = Now - SessionStartTime;
@ -1536,6 +1618,52 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
}
}
void UPS_AI_ConvAgent_ElevenLabsComponent::ApplyPreBufferAdaptation()
{
// Only adapt on Authority (where the WebSocket lives and measurements are taken).
if (GetOwnerRole() != ROLE_Authority) return;
// Adaptive mode must be enabled, and pre-buffering must be active.
if (!bAdaptivePreBuffer || AudioPreBufferMs == 0) return;
// No measurement this turn (single-chunk response or no second chunk arrived).
if (TurnIdealPreBufferMs < 0) { PreBufferTrend = 0; return; }
const int32 Prev = AdaptivePreBufferMs;
// DECREASE-ONLY: the measured ideal tells us the minimum pre-buffer needed.
// If the ideal is lower than our current value, the connection is fast enough
// that we can reduce the pre-buffer and save latency.
// If the ideal is higher (e.g. natural speech pause, slow network), we do NOT
// increase — USoundWaveProcedural handles gaps seamlessly in most cases.
// The user sets AudioPreBufferMs as the "worst case" starting value;
// the system only optimizes downward from there. Resets each conversation.
if (TurnIdealPreBufferMs < AdaptivePreBufferMs)
{
// Ideal is lower — decrease toward it (EMA 30% per turn, with 50ms margin).
const int32 TargetMs = FMath::Max(AdaptivePreBufferMinMs, TurnIdealPreBufferMs + 50);
AdaptivePreBufferMs = FMath::Max(AdaptivePreBufferMinMs,
FMath::RoundToInt32(AdaptivePreBufferMs * 0.7f + TargetMs * 0.3f));
PreBufferTrend = (AdaptivePreBufferMs < Prev) ? -1 : 0;
}
else
{
// Ideal >= current — connection is same or worse, keep current value.
PreBufferTrend = 0;
}
// Reset measurement for next turn.
const int32 IdealForLog = TurnIdealPreBufferMs;
TurnIdealPreBufferMs = -1;
bTurnGapMeasured = false;
if (bDebug && Prev != AdaptivePreBufferMs)
{
const double T = FPlatformTime::Seconds() - SessionStartTime;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Adaptive pre-buffer: %d ms -> %d ms (ideal=%dms)"),
T, LastClosedTurnIndex, Prev, AdaptivePreBufferMs, IdealForLog);
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Microphone → WebSocket
// ─────────────────────────────────────────────────────────────────────────────
@ -2404,6 +2532,42 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT("")));
}
// ─────────────────────────────────────────────────────────────────────────────
// Server region detection (one-shot HTTP probe)
// ─────────────────────────────────────────────────────────────────────────────
// Fire-and-forget HTTP probe that discovers which ElevenLabs server region is
// serving this client. Sends an authenticated GET to a lightweight REST
// endpoint (/v1/models) and reads the "x-region" response header; the result
// is stored in ServerRegion for display on the latency HUD. No retry, no
// error reporting — on any failure ServerRegion simply stays empty.
void UPS_AI_ConvAgent_ElevenLabsComponent::FetchServerRegion()
{
// Requires the plugin settings object and a configured API key; silently
// skip the probe otherwise.
const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings();
if (!Settings || Settings->API_Key.IsEmpty()) return;
auto Request = FHttpModule::Get().CreateRequest();
// Region-aware base URL — the regional host is what determines the header value.
Request->SetURL(Settings->GetAPIBaseURL() + TEXT("/v1/models"));
Request->SetVerb(TEXT("GET"));
Request->SetHeader(TEXT("xi-api-key"), Settings->API_Key);
// Weak self-reference: the component may be destroyed (level change, actor
// despawn) before the HTTP response arrives; the lambda must not dangle.
TWeakObjectPtr<UPS_AI_ConvAgent_ElevenLabsComponent> WeakThis(this);
Request->OnProcessRequestComplete().BindLambda(
[WeakThis](FHttpRequestPtr /*Req*/, FHttpResponsePtr Resp, bool bSuccess)
{
if (!bSuccess || !Resp.IsValid()) return;
const FString Region = Resp->GetHeader(TEXT("x-region"));
if (Region.IsEmpty()) return;
// Hop to the game thread before touching UObject state.
// NOTE(review): HTTP completion delegates usually already fire on the
// game thread when the module is ticked there — the extra AsyncTask is
// then redundant but harmless; confirm against the project's HTTP
// threading settings before removing it.
AsyncTask(ENamedThreads::GameThread, [WeakThis, Region]()
{
if (WeakThis.IsValid())
{
WeakThis->ServerRegion = Region;
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("ElevenLabs server region: %s"), *Region);
}
});
});
Request->ProcessRequest();
}
// ─────────────────────────────────────────────────────────────────────────────
// Latency debug HUD
// ─────────────────────────────────────────────────────────────────────────────
void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
{
if (!GEngine) return;
@ -2412,29 +2576,62 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
const int32 BaseKey = 93700;
const float DisplayTime = 1.0f; // long enough to avoid flicker between ticks
const FColor TitleColor = FColor::Cyan;
const FColor ValueColor = FColor::White;
const FColor TitleColor = FColor::Cyan;
const FColor ValueColor = FColor::White;
const FColor HighlightColor = FColor::Yellow;
// Helper: format a single metric — shows "---" when not yet captured this turn
auto Fmt = [](float Ms) -> FString
{
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
};
// Title — all times measured from agent_response_started
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
TEXT("=== Latency (from gen start) ==="));
int32 Row = 0;
// 1. Gen → Audio: generation start → first audio chunk (LLM + TTS)
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor,
TEXT("=== Voice-to-Voice Latency ==="));
// 2. Pre-buffer wait before playback
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
// Client-side breakdown: TTS+Net + Pre-buffer = Gen>Ear
// Note: LLM latency is only visible on ElevenLabs dashboard (server-side).
// In Server VAD mode, no reliable client-side "end of user speech" marker exists.
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" TTS+Net: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
// 3. Gen → Ear: generation start → playback starts (user-perceived total)
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
// Pre-buffer display depends on adaptive mode.
if (bAdaptivePreBuffer && AudioPreBufferMs > 0)
{
// Adaptive ON: show actual wait + adaptive target with trend arrow.
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" PreBuf actual: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
const TCHAR* TrendArrow = (PreBufferTrend > 0) ? TEXT(" ^")
: (PreBufferTrend < 0) ? TEXT(" v")
: TEXT("");
const FColor AdaptiveColor = (PreBufferTrend > 0) ? FColor::Red
: (PreBufferTrend < 0) ? FColor::Green
: ValueColor;
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, AdaptiveColor,
FString::Printf(TEXT(" PreBuf target: %d ms%s"), AdaptivePreBufferMs, TrendArrow));
}
else
{
// Adaptive OFF (or pre-buffer disabled): show fixed pre-buffer value.
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
}
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, HighlightColor,
FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs)));
// Connection section
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor,
TEXT("--- Connection ---"));
const int32 PingMs = WebSocketProxy ? WebSocketProxy->GetLastPingMs() : -1;
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" WS Ping: %s"),
(PingMs >= 0) ? *FString::Printf(TEXT("%d ms"), PingMs) : TEXT("---")));
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
FString::Printf(TEXT(" Region: %s"),
ServerRegion.IsEmpty() ? TEXT("---") : *ServerRegion));
}

View File

@ -165,7 +165,7 @@ void UPS_AI_ConvAgent_InteractionComponent::TickComponent(float DeltaTime, ELeve
// ─────────────────────────────────────────────────────────────────────────────
// Selection evaluation
// ─────────────────────────────────────────────────────────────────────────────
UPS_AI_ConvAgent_ElevenLabsComponent* UPS_AI_ConvAgent_InteractionComponent::EvaluateBestAgent() const
UPS_AI_ConvAgent_ElevenLabsComponent* UPS_AI_ConvAgent_InteractionComponent::EvaluateBestAgent()
{
UWorld* World = GetWorld();
if (!World) return nullptr;
@ -190,23 +190,25 @@ UPS_AI_ConvAgent_ElevenLabsComponent* UPS_AI_ConvAgent_InteractionComponent::Eva
UPS_AI_ConvAgent_ElevenLabsComponent* CurrentAgent = SelectedAgent.Get();
// ── Conversation lock ──────────────────────────────────────────────
// While we're actively conversing with an agent, keep it selected as
// long as it's within interaction distance — ignore the view cone.
// This prevents deselect/reselect flicker when the player turns quickly
// (which would cause spurious OnAgentConnected re-broadcasts in
// persistent session mode).
if (CurrentAgent && CurrentAgent->bNetIsConversing)
// While we're actively conversing with an agent, keep it selected UNLESS
// the player is looking directly at a DIFFERENT agent within range.
// This allows switching between nearby agents by looking at them, while
// preventing deselect when looking at empty space (no agent in view cone).
// If no other agent is in the view cone, the current agent stays selected
// regardless of look direction — only distance can break the lock.
const bool bConversationLocked = CurrentAgent && CurrentAgent->bNetIsConversing;
bool bCurrentAgentInRange = false;
if (bConversationLocked)
{
if (AActor* AgentActor = CurrentAgent->GetOwner())
{
const FVector AgentLoc = AgentActor->GetActorLocation()
+ FVector(0.0f, 0.0f, AgentEyeLevelOffset);
const float DistSq = (AgentLoc - ViewLocation).SizeSquared();
if (DistSq <= MaxDistSq)
{
return CurrentAgent; // Keep conversing agent selected.
}
bCurrentAgentInRange = (DistSq <= MaxDistSq);
}
// If current agent is out of range, fall through to normal evaluation
// (which will select a new agent or nullptr).
}
for (UPS_AI_ConvAgent_ElevenLabsComponent* Agent : Agents)
@ -259,6 +261,51 @@ UPS_AI_ConvAgent_ElevenLabsComponent* UPS_AI_ConvAgent_InteractionComponent::Eva
}
}
// ── Conversation lock fallback ────────────────────────────────────
// If we're in conversation and the current agent is still in range:
// - No other agent in view cone → keep current agent (don't deselect).
// - Different agent in view cone → switch after ConversationSwitchDelay.
if (bConversationLocked && bCurrentAgentInRange)
{
if (!BestCandidate || BestCandidate == CurrentAgent)
{
// Looking at current agent or empty space → keep current, reset switch timer.
PendingSwitchAgent.Reset();
PendingSwitchStartTime = 0.0;
return CurrentAgent;
}
// Player is looking at a different agent. Apply switch delay.
if (ConversationSwitchDelay <= 0.0f)
{
// Instant switch (delay = 0).
PendingSwitchAgent.Reset();
PendingSwitchStartTime = 0.0;
return BestCandidate;
}
// Start or continue the switch timer.
if (PendingSwitchAgent.Get() != BestCandidate)
{
// Player started looking at a new candidate — reset timer.
PendingSwitchAgent = BestCandidate;
PendingSwitchStartTime = FPlatformTime::Seconds();
return CurrentAgent; // Not yet — keep current.
}
// Same candidate as before — check if delay has elapsed.
const double Elapsed = FPlatformTime::Seconds() - PendingSwitchStartTime;
if (Elapsed < static_cast<double>(ConversationSwitchDelay))
{
return CurrentAgent; // Still waiting.
}
// Delay elapsed → allow the switch.
PendingSwitchAgent.Reset();
PendingSwitchStartTime = 0.0;
return BestCandidate;
}
return BestCandidate;
}

View File

@ -207,41 +207,58 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected()
// This produces smooth continuous audio chunks without the fragmentation caused by
// explicit optimize_streaming_latency or enable_intermediate_response overrides.
//
// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
// In Server VAD mode, the config override is empty (matches C++ sample exactly).
// Build turn configuration based on mode + latency settings.
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client)
{
// turn_timeout: how long the server waits after VAD detects silence before
// processing the user's turn. Default is ~3s. In push-to-talk mode this
// directly adds latency — the server waits after the user releases T.
// 1s is safe without speculative_turn (which was removed — see history below).
//
// History:
// turn_timeout=1 was problematic when combined with speculative_turn=true
// (server silently dropped turns 3+). Without speculative_turn, 1s is safe
// and halves the per-turn latency.
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
bool bHasTurnOverrides = false;
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
// In Client (push-to-talk) mode, reduce turn_timeout to minimize latency.
if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client)
{
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
bHasTurnOverrides = true;
}
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
// turn_eagerness: controls how quickly the server interprets pauses as end-of-speech.
// "eager" = fastest (may cut user off), "normal" = balanced, "patient" = waits longer.
if (TurnEagerness != EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal)
{
FString EagernessStr;
switch (TurnEagerness)
{
case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Eager: EagernessStr = TEXT("eager"); break;
case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Patient: EagernessStr = TEXT("patient"); break;
default: EagernessStr = TEXT("normal"); break;
}
TurnObj->SetStringField(TEXT("turn_eagerness"), EagernessStr);
bHasTurnOverrides = true;
}
// speculative_turn: start generating a response before confirming end-of-speech.
// Reduces latency but may cause occasional false starts (discarded if user continues).
if (bSpeculativeTurn)
{
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
bHasTurnOverrides = true;
}
if (bHasTurnOverrides)
{
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
}
}
// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
//
// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
// the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
// With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
// With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
// Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
// - tts.optimize_streaming_latency: deprecated by ElevenLabs. Sending any value
// changes TTS chunking behaviour. Server default (omitted) is optimal.
//
// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
// before finishing generation → fragmented audio. When omitted (C++ sample), the
// LLM completes its response first → continuous TTS chunks.
// before finishing generation → fragmented audio. Omitted = server default.
//
// - custom_llm_extra_body (empty object): Even an empty object might override the
// agent's configured custom_llm_extra_body with nothing. Omit entirely.
@ -259,12 +276,15 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected()
FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
{
const UPS_AI_ConvAgent_Settings_ElevenLabs* S = FPS_AI_ConvAgentModule::Get().GetSettings();
if (S->bVerboseLogging)
if (S && S->bVerboseLogging)
{
UE_LOG(LogPS_AI_ConvAgent_WS_ElevenLabs, Verbose, TEXT("Sending initiation: %s"), *InitJson);
}
}
WebSocket->Send(InitJson);
if (WebSocket.IsValid())
{
WebSocket->Send(InitJson);
}
}
void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnectionError(const FString& Error)
@ -507,6 +527,10 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleTranscript(const TSharedP
return;
}
// Record arrival time for latency measurement (ASR+LLM breakdown).
// user_transcript arrives after server VAD + ASR, just before LLM starts.
LastUserTranscriptTime = FPlatformTime::Seconds();
FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs Segment;
Segment.Speaker = TEXT("user");
(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
@ -679,6 +703,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandlePing(const TSharedPtr<FJs
if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
{
(*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID);
// Extract server-reported WS round-trip latency.
int32 PingValue = 0;
if ((*PingEvent)->TryGetNumberField(TEXT("ping_ms"), PingValue))
{
LastPingMs.store(PingValue, std::memory_order_relaxed);
}
}
TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
@ -718,7 +749,7 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr
{
const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings();
// Custom URL override takes full precedence
// Custom URL override takes full precedence (advanced / proxy use case)
if (!Settings->CustomWebSocketURL.IsEmpty())
{
return Settings->CustomWebSocketURL;
@ -730,9 +761,9 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr
return FString();
}
// Official ElevenLabs Conversational AI WebSocket endpoint
// wss://api.elevenlabs.io/v1/convai/conversation?agent_id=<ID>
// Build URL from the region-aware base: wss://<regional-host>/v1/convai/conversation?agent_id=<ID>
const FString BaseURL = Settings->GetWSBaseURL();
return FString::Printf(
TEXT("wss://api.elevenlabs.io/v1/convai/conversation?agent_id=%s"),
*AgentIDOverride);
TEXT("%s/v1/convai/conversation?agent_id=%s"),
*BaseURL, *AgentIDOverride);
}

View File

@ -6,6 +6,22 @@
#include "Modules/ModuleManager.h"
#include "PS_AI_ConvAgent.generated.h"
// ─────────────────────────────────────────────────────────────────────────────
// ElevenLabs server region
// ─────────────────────────────────────────────────────────────────────────────
// Selects which ElevenLabs API host the plugin talks to; the enum-to-host
// mapping lives in GetAPIBaseURL()/GetWSBaseURL() on the settings object.
// NOTE(review): this enum is persisted via a Config UPROPERTY — keep existing
// entry names stable so saved project settings keep resolving; verify the
// serialization behavior before renaming or reordering entries.
UENUM()
enum class EPS_AI_ConvAgent_ElevenLabsRegion : uint8
{
/** Automatic global routing (default). Server chosen by ElevenLabs based on client location. */
Global UMETA(DisplayName = "Global (auto)"),
/** Force US servers: api.us.elevenlabs.io */
US UMETA(DisplayName = "US"),
/** Force EU servers (Enterprise only): api.eu.residency.elevenlabs.io */
EU UMETA(DisplayName = "EU (Enterprise)"),
/** Force India servers (Enterprise only): api.in.residency.elevenlabs.io */
India UMETA(DisplayName = "India (Enterprise)")
};
// ─────────────────────────────────────────────────────────────────────────────
// Settings object exposed in Project Settings → Plugins → PS AI ConvAgent - ElevenLabs
// ─────────────────────────────────────────────────────────────────────────────
@ -24,8 +40,17 @@ public:
FString API_Key;
/**
* Override the ElevenLabs WebSocket base URL. Leave empty to use the default:
* wss://api.elevenlabs.io/v1/convai/conversation
* Server region for ElevenLabs API.
* - Global (default): automatic routing based on client location.
* - US: force US servers (api.us.elevenlabs.io).
* - EU / India: Enterprise-only data residency endpoints.
*/
UPROPERTY(Config, EditAnywhere, Category = "PS AI ConvAgent|ElevenLabs API")
EPS_AI_ConvAgent_ElevenLabsRegion ServerRegion = EPS_AI_ConvAgent_ElevenLabsRegion::Global;
/**
* Override the ElevenLabs WebSocket URL entirely. Leave empty to use ServerRegion setting.
* Example: wss://custom-proxy.example.com/v1/convai/conversation?agent_id=YOUR_ID
*/
UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API")
FString CustomWebSocketURL;
@ -33,6 +58,30 @@ public:
/** Log verbose WebSocket messages to the Output Log (useful during development). */
UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API")
bool bVerboseLogging = false;
/** Return the API base URL (https) for the selected region.
 * Global (and any unrecognized value) falls through to the default endpoint. */
FString GetAPIBaseURL() const
{
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::US)
	{
		return TEXT("https://api.us.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::EU)
	{
		return TEXT("https://api.eu.residency.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::India)
	{
		return TEXT("https://api.in.residency.elevenlabs.io");
	}
	// Global / default: automatic routing by ElevenLabs.
	return TEXT("https://api.elevenlabs.io");
}
/** Return the WebSocket base URL (wss) for the selected region.
 * Global (and any unrecognized value) falls through to the default endpoint. */
FString GetWSBaseURL() const
{
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::US)
	{
		return TEXT("wss://api.us.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::EU)
	{
		return TEXT("wss://api.eu.residency.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::India)
	{
		return TEXT("wss://api.in.residency.elevenlabs.io");
	}
	// Global / default: automatic routing by ElevenLabs.
	return TEXT("wss://api.elevenlabs.io");
}
};

View File

@ -4,6 +4,7 @@
#include "CoreMinimal.h"
#include "Engine/DataAsset.h"
#include "PS_AI_ConvAgent_Definitions.h"
#include "PS_AI_ConvAgent_AgentConfig_ElevenLabs.generated.h"
/**
@ -186,6 +187,24 @@ public:
ToolTip = "Max conversation turns.\n0 = unlimited."))
int32 MaxTurns = 0;
// ── Latency / Turn-taking ───────────────────────────────────────────────
/** How quickly the server detects end-of-speech and starts responding.
* Eager = fastest response, may cut the user off during pauses.
* Normal = balanced (default). Patient = waits longer for user to finish.
* Sent as conversation_config_override at WebSocket connection time. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency",
meta = (ToolTip = "Controls how quickly the server detects end-of-speech.\n- Eager: fastest response, may interrupt mid-pause.\n- Normal: balanced (default).\n- Patient: waits longer for user to finish."))
EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal;
/** Enable speculative turn processing: the server starts generating a response
* before it's certain the user has finished speaking. If the user continues,
* the speculative response is discarded. Reduces perceived latency.
* May cause occasional false starts — disable if the agent interrupts too often. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency",
meta = (ToolTip = "Start generating a response before confirming end-of-speech.\nReduces latency but may cause occasional false starts.\nDisable if the agent interrupts the user too often."))
bool bSpeculativeTurn = false;
// ── Emotion Tool ─────────────────────────────────────────────────────────
/** Include the built-in "set_emotion" client tool in the agent configuration.

View File

@ -29,6 +29,20 @@ enum class EPS_AI_ConvAgent_TurnMode_ElevenLabs : uint8
Client UMETA(DisplayName = "Client Controlled"),
};
// ─────────────────────────────────────────────────────────────────────────────
// Agent turn eagerness — controls how quickly the server detects end of speech
// ─────────────────────────────────────────────────────────────────────────────
// Forwarded to the server as part of conversation_config_override when the
// WebSocket connection is established (see the AgentConfig TurnEagerness property).
UENUM(BlueprintType)
enum class EPS_AI_ConvAgent_TurnEagerness_ElevenLabs : uint8
{
/** Quick response at the earliest opportunity. Best for customer service. */
Eager UMETA(DisplayName = "Eager"),
/** Balanced turn-taking for general scenarios (default). */
Normal UMETA(DisplayName = "Normal"),
/** Longer wait for user to finish. Best for information collection. */
Patient UMETA(DisplayName = "Patient"),
};
// ─────────────────────────────────────────────────────────────────────────────
// WebSocket message type helpers (internal, not exposed to Blueprint)
// ─────────────────────────────────────────────────────────────────────────────

View File

@ -185,14 +185,24 @@ public:
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
bool bEnableAgentPartialResponse = false;
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
* Delays playback start so early TTS chunks can accumulate, preventing
* mid-sentence pauses when the second chunk hasn't arrived yet.
* Set to 0 for immediate playback. */
/** Pre-buffer delay (ms) before starting audio playback on the first TTS chunk.
* Set this to your "worst case" value (e.g. 300-1000ms depending on connection).
* When adaptive mode is on, the system starts here and can only decrease
* (never increase) as it measures that chunks arrive fast enough.
* Set to 0 to disable pre-buffering entirely. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency",
meta = (ClampMin = "0", ClampMax = "4000",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
int32 AudioPreBufferMs = 2000;
ToolTip = "Pre-buffer delay (ms) — your safe 'worst case' value.\nAdaptive mode can only decrease from here, never increase.\nSet 0 to disable pre-buffering entirely."))
int32 AudioPreBufferMs = 300;
/** Enable adaptive pre-buffer: measures inter-chunk timing and automatically
* lowers the pre-buffer when TTS chunks arrive fast enough.
* The system can only decrease from AudioPreBufferMs — never increase.
* Resets to AudioPreBufferMs at the start of each conversation. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency",
meta = (EditCondition = "AudioPreBufferMs > 0",
ToolTip = "Automatically lower pre-buffer when connection is good.\nCan only decrease, never increase beyond AudioPreBufferMs.\nResets each conversation."))
bool bAdaptivePreBuffer = true;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs",
@ -640,18 +650,23 @@ private:
double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating.
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
// Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted.
// All anchored to GenerationStartTime (agent_response_started event), which is
// the closest client-side proxy for "user stopped speaking" in Server VAD mode.
// Zero means "not yet measured this turn".
// Current-turn latency measurements (ms). Overwritten per-field as each
// measurement is captured — NOT reset to zero between turns, so the HUD
// always shows the most recent value instead of blinking "---".
// All anchored to GenerationStartTime (agent_response_started event).
struct FDebugLatencies
{
float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS)
float TurnEndToTextMs = 0.0f; // user turn end → first text from LLM (≈ ASR + LLM TTFT)
float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (≈ TTS + network)
float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts
float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived)
};
FDebugLatencies CurrentLatencies;
// ElevenLabs server region (from x-region header on REST API). Fetched once per session.
FString ServerRegion;
void FetchServerRegion();
// Accumulates incoming PCM bytes until the audio component needs data.
// Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every
// underflow callback (~60Hz). Compacted periodically when read offset
@ -664,6 +679,22 @@ private:
bool bPreBuffering = false;
double PreBufferStartTime = 0.0;
// ── Adaptive pre-buffer ─────────────────────────────────────────────────
// Runtime pre-buffer duration (ms). Equals AudioPreBufferMs when adaptive is off.
// When adaptive is on: initialized from AudioPreBufferMs, adjusted based on
// measured inter-chunk timing (not queue-dry detection).
int32 AdaptivePreBufferMs = 300;
static constexpr int32 AdaptivePreBufferMinMs = 50;
// Direction of last adaptation: +1=raised, -1=lowered, 0=stable. Used by HUD.
int32 PreBufferTrend = 0;
void ApplyPreBufferAdaptation();
// Per-turn inter-chunk timing measurement (game thread only).
// Set when the second TTS chunk arrives, consumed at turn end.
double TurnFirstChunkTime = 0.0; // When chunk 1 arrived.
int32 TurnFirstChunkBytes = 0; // Bytes in chunk 1 (to estimate audio duration).
int32 TurnIdealPreBufferMs = -1; // Computed ideal pre-buffer. -1 = not measured.
bool bTurnGapMeasured = false; // True after first inter-chunk gap is measured.
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
bool bQueueWasDry = false;

View File

@ -117,6 +117,14 @@ public:
// ── Conversation management ──────────────────────────────────────────────
/** How long (seconds) the player must look at a different agent before switching
* during an active conversation. Prevents accidental switches when glancing around.
* Only applies when a conversation is active (bNetIsConversing). */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|Interaction",
meta = (ClampMin = "0.0", ClampMax = "5.0",
ToolTip = "Seconds the player must look at another agent before switching mid-conversation.\nPrevents accidental switches when glancing around.\n0 = instant switch."))
float ConversationSwitchDelay = 1.0f;
/** Automatically start the WebSocket conversation when an agent is selected
* (enters range + view cone). When false, selecting an agent only manages
gaze and visual awareness — the conversation must be started explicitly
@ -251,7 +259,7 @@ private:
// ── Selection logic ──────────────────────────────────────────────────────
/** Evaluate all registered agents, return the best candidate (or null). */
UPS_AI_ConvAgent_ElevenLabsComponent* EvaluateBestAgent() const;
UPS_AI_ConvAgent_ElevenLabsComponent* EvaluateBestAgent();
/** Apply a new selection — fire events, reroute mic. */
void SetSelectedAgent(UPS_AI_ConvAgent_ElevenLabsComponent* NewAgent);
@ -296,4 +304,10 @@ private:
FTimerHandle GazeAttachTimerHandle;
FTimerHandle GazeDetachTimerHandle;
// ── Conversation switch delay ────────────────────────────────────────
// Tracks how long the player has been looking at a different agent
// while in an active conversation. Switch only happens after the delay.
TWeakObjectPtr<UPS_AI_ConvAgent_ElevenLabsComponent> PendingSwitchAgent;
double PendingSwitchStartTime = 0.0;
};

View File

@ -197,6 +197,18 @@ public:
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const { return ConversationInfo; }
/** Latest WebSocket round-trip latency reported by the server (ms).
* Returns -1 if no ping has been received yet. Thread-safe. */
int32 GetLastPingMs() const { return LastPingMs.load(std::memory_order_relaxed); }
/** Timestamp of the last user audio chunk sent to the server.
* Used as a proxy for "user stopped speaking" in Server VAD mode. */
double GetLastAudioChunkSentTime() const { return LastAudioChunkSentTime; }
/** Timestamp of the last user_transcript received from the server.
* Marks when server finished ASR — best anchor for LLM latency measurement. */
double GetLastUserTranscriptTime() const { return LastUserTranscriptTime; }
// ─────────────────────────────────────────────────────────────────────────
// Internal
// ─────────────────────────────────────────────────────────────────────────
@ -235,10 +247,16 @@ private:
TArray<uint8> BinaryFrameBuffer;
// ── Latency tracking ─────────────────────────────────────────────────────
// Server-reported WebSocket round-trip latency from ping events (~every 2s).
// Atomic: written from WS callback thread, read from game thread (HUD).
std::atomic<int32> LastPingMs{-1};
// Timestamp of the last audio chunk sent (user speech).
double LastAudioChunkSentTime = 0.0;
// Timestamp when user turn ended (StopListening).
double UserTurnEndTime = 0.0;
// Timestamp of the last user_transcript received (server finished ASR).
double LastUserTranscriptTime = 0.0;
// Whether we are waiting for the first response after user stopped speaking.
// Atomic: defensive — documents thread-safety contract.
std::atomic<bool> bWaitingForResponse{false};
@ -264,4 +282,10 @@ public:
// Set by UPS_AI_ConvAgent_ElevenLabsComponent before calling Connect().
// Controls turn_timeout in conversation_initiation_client_data.
EPS_AI_ConvAgent_TurnMode_ElevenLabs TurnMode = EPS_AI_ConvAgent_TurnMode_ElevenLabs::Server;
// Controls how eagerly the server interprets pauses as end-of-speech.
EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal;
// Start generating before confirming end-of-speech (reduces latency, may cause false starts).
bool bSpeculativeTurn = false;
};

View File

@ -22,33 +22,43 @@
DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_AgentConfigEditor, Log, All);
// Approximate LLM latencies as shown on the ElevenLabs dashboard.
// The API does not expose this data — values are indicative and may change.
// Approximate LLM latencies as shown on the ElevenLabs dashboard (March 2026).
// The /v1/convai/llm/list API does NOT expose latency — values are indicative.
// Update this table periodically to stay current.
static FString GetLLMLatencyHint(const FString& ModelID)
{
struct FLatencyEntry { const TCHAR* ID; const TCHAR* Latency; };
static const FLatencyEntry Entries[] =
{
// OpenAI
{ TEXT("gpt-4o-mini"), TEXT("~350ms") },
{ TEXT("gpt-4o"), TEXT("~700ms") },
{ TEXT("gpt-4"), TEXT("~900ms") },
{ TEXT("gpt-4-turbo"), TEXT("~650ms") },
// Anthropic
{ TEXT("claude-sonnet-4-5"), TEXT("~750ms") },
{ TEXT("claude-haiku-4-5"), TEXT("~350ms") },
{ TEXT("claude-3-5-sonnet"), TEXT("~700ms") },
// Google
{ TEXT("gemini-1.5-pro"), TEXT("~500ms") },
{ TEXT("gemini-2.0-flash"), TEXT("~300ms") },
{ TEXT("gemini-2.5-flash"), TEXT("~250ms") },
// xAI
{ TEXT("grok-beta"), TEXT("~500ms") },
// ElevenLabs-hosted
{ TEXT("qwen3-30b-a3b"), TEXT("~207ms") },
{ TEXT("glm-4.5-air"), TEXT("~980ms") },
{ TEXT("gpt-oss-120b"), TEXT("~331ms") },
// ── ElevenLabs-hosted ─────────────────────────────────────────────
{ TEXT("glm-4.5-air"), TEXT("~949ms") },
{ TEXT("qwen3-30b-a3b"), TEXT("~189ms") },
{ TEXT("gpt-oss-120b"), TEXT("~321ms") },
// ── Google ────────────────────────────────────────────────────────
{ TEXT("gemini-3-pro"), TEXT("~3.5s") },
{ TEXT("gemini-3-flash"), TEXT("~1.4s") },
{ TEXT("gemini-2.5-flash"), TEXT("~967ms") },
{ TEXT("gemini-2.5-flash-lite"), TEXT("~605ms") },
// ── OpenAI ────────────────────────────────────────────────────────
{ TEXT("gpt-5"), TEXT("~1.1s") },
{ TEXT("gpt-5.1"), TEXT("~980ms") },
{ TEXT("gpt-5.2"), TEXT("~795ms") },
{ TEXT("gpt-5-mini"), TEXT("~884ms") },
{ TEXT("gpt-5-nano"), TEXT("~734ms") },
{ TEXT("gpt-4.1"), TEXT("~870ms") },
{ TEXT("gpt-4.1-mini"), TEXT("~916ms") },
{ TEXT("gpt-4.1-nano"), TEXT("~574ms") },
{ TEXT("gpt-4o"), TEXT("~728ms") },
{ TEXT("gpt-4o-mini"), TEXT("~767ms") },
{ TEXT("gpt-4-turbo"), TEXT("~1.5s") },
{ TEXT("gpt-3.5-turbo"), TEXT("~458ms") },
// ── Anthropic ─────────────────────────────────────────────────────
{ TEXT("claude-sonnet-4-5"), TEXT("~1.4s") },
{ TEXT("claude-sonnet-4"), TEXT("~1.1s") },
{ TEXT("claude-haiku-4-5"), TEXT("~644ms") },
{ TEXT("claude-3.7-sonnet"), TEXT("~1.2s") },
{ TEXT("claude-3-haiku"), TEXT("~484ms") },
{ TEXT("claude-3-5-sonnet"), TEXT("~1.2s") },
};
for (const auto& E : Entries)
@ -58,6 +68,22 @@ static FString GetLLMLatencyHint(const FString& ModelID)
return FString();
}
// Infer provider from model ID for display grouping in the LLM picker.
// ElevenLabs-hosted models are matched by exact ID FIRST: "gpt-oss-120b"
// would otherwise hit the "gpt-" prefix test and be misreported as OpenAI,
// which contradicts the latency table and provider sort order above.
// Returns an empty string for unknown providers (grouped under "Other").
static FString GetLLMProvider(const FString& ModelID)
{
	// Exact-match ElevenLabs-hosted IDs before any prefix heuristics.
	if (ModelID == TEXT("glm-4.5-air") || ModelID == TEXT("qwen3-30b-a3b") || ModelID == TEXT("gpt-oss-120b"))
		return TEXT("ElevenLabs");
	if (ModelID.StartsWith(TEXT("gpt-")) || ModelID.StartsWith(TEXT("o1")) || ModelID.StartsWith(TEXT("o3")))
		return TEXT("OpenAI");
	if (ModelID.StartsWith(TEXT("claude-")))
		return TEXT("Anthropic");
	if (ModelID.StartsWith(TEXT("gemini-")))
		return TEXT("Google");
	if (ModelID.StartsWith(TEXT("grok")))
		return TEXT("xAI");
	return FString();
}
// Language code → display name. Shared by BuildAgentPayload (to resolve
// {Language} placeholder) and the fetch handler (to strip the resolved fragment).
static FString GetLanguageDisplayName(const FString& LangCode)
@ -332,9 +358,11 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::CustomizeDetails(
.Font(IDetailLayoutBuilder::GetDetailFont())
]
.ValueContent()
.MaxDesiredWidth(600.f)
[
SNew(SBox)
.MinDesiredHeight(200.f)
.MinDesiredWidth(400.f)
[
SNew(SMultiLineEditableTextBox)
.Font(IDetailLayoutBuilder::GetDetailFont())
@ -679,6 +707,10 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
Pinned->LLMDisplayNames.Reset();
Pinned->LLMModelIDs.Reset();
// Collect models grouped by provider for sorted display.
struct FLLMEntry { FString ModelID; FString Provider; FString Display; bool bCheckpoint; };
TArray<FLLMEntry> AllEntries;
for (const auto& LLMVal : *LLMs)
{
const TSharedPtr<FJsonObject>* LLMObj = nullptr;
@ -703,28 +735,62 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
}
}
// Check if it's a checkpoint model (sub-version).
bool bIsCheckpoint = false;
(*LLMObj)->TryGetBoolField(TEXT("is_checkpoint"), bIsCheckpoint);
// Build display string: "model-id (~350ms)" or " model-id (checkpoint, ~350ms)"
const FString Latency = GetLLMLatencyHint(ModelID);
const FString Provider = GetLLMProvider(ModelID);
// Build display: " model-id (checkpoint, ~350ms)" for checkpoints,
// "model-id (~350ms)" for main models.
FString Display;
if (bIsCheckpoint)
{
Display = Latency.IsEmpty()
? FString::Printf(TEXT(" %s (checkpoint)"), *ModelID)
: FString::Printf(TEXT(" %s (checkpoint, %s)"), *ModelID, *Latency);
? FString::Printf(TEXT(" %s (checkpoint)"), *ModelID)
: FString::Printf(TEXT(" %s (checkpoint, %s)"), *ModelID, *Latency);
}
else
{
Display = Latency.IsEmpty()
? ModelID
: FString::Printf(TEXT("%s (%s)"), *ModelID, *Latency);
? FString::Printf(TEXT(" %s"), *ModelID)
: FString::Printf(TEXT(" %s (%s)"), *ModelID, *Latency);
}
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Display)));
Pinned->LLMModelIDs.Add(ModelID);
AllEntries.Add({ ModelID, Provider, Display, bIsCheckpoint });
}
// Sort by provider order (ElevenLabs, Google, OpenAI, Anthropic, Other),
// then main models before checkpoints, then alphabetically.
static const TArray<FString> ProviderOrder = {
TEXT("ElevenLabs"), TEXT("Google"), TEXT("OpenAI"), TEXT("Anthropic"), TEXT("xAI")
};
AllEntries.Sort([](const FLLMEntry& A, const FLLMEntry& B)
{
int32 IdxA = ProviderOrder.IndexOfByKey(A.Provider);
int32 IdxB = ProviderOrder.IndexOfByKey(B.Provider);
if (IdxA == INDEX_NONE) IdxA = ProviderOrder.Num();
if (IdxB == INDEX_NONE) IdxB = ProviderOrder.Num();
if (IdxA != IdxB) return IdxA < IdxB;
if (A.bCheckpoint != B.bCheckpoint) return !A.bCheckpoint; // main first
return A.ModelID < B.ModelID;
});
// Insert provider headers as non-selectable separator entries.
FString LastProvider;
for (const auto& Entry : AllEntries)
{
const FString& Prov = Entry.Provider.IsEmpty() ? TEXT("Other") : Entry.Provider;
if (Prov != LastProvider)
{
// Header line: "── OpenAI ──" (not selectable — mapped to empty ModelID)
FString Header = FString::Printf(TEXT("── %s ──"), *Prov);
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Header)));
Pinned->LLMModelIDs.Add(FString()); // empty = separator
LastProvider = Prov;
}
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Entry.Display)));
Pinned->LLMModelIDs.Add(Entry.ModelID);
}
// Pre-select the currently set LLMModel if it exists in the list.
@ -767,6 +833,9 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnLLMSelected(
int32 Idx = LLMDisplayNames.IndexOfByKey(NewSelection);
if (Idx == INDEX_NONE) return;
// Separator headers have empty ModelID — ignore selection.
if (LLMModelIDs[Idx].IsEmpty()) return;
if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = GetEditedAsset())
{
Asset->Modify();