Add turn eagerness, speculative turn, adaptive pre-buffer, and latency HUD improvements
- Add TurnEagerness (Eager/Normal/Patient) and bSpeculativeTurn to the agent config data asset; both are sent as conversation_config_override at WebSocket connection time
- Add adaptive pre-buffer system: measures inter-chunk TTS timing and decreases the pre-buffer when chunks arrive fast enough (decrease-only, resets each conversation)
- New UPROPERTY: bAdaptivePreBuffer toggle; AudioPreBufferMs is now the starting/worst-case value
- Rework latency HUD: TTS+Net, PreBuf actual/target with trend indicator, Gen>Ear, WS Ping, server region display
- Fetch ElevenLabs server region from the REST API x-region header
- Add editor Detail Customization: TurnEagerness dropdown + SpeculativeTurn checkbox in AgentConfig, with LLM picker and Language picker

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2169c58cd7
commit
4456dfa9dc
@ -1,8 +1,8 @@
|
||||
|
||||
|
||||
[/Script/EngineSettings.GameMapsSettings]
|
||||
GameDefaultMap=/Game/voidMap.voidMap
|
||||
EditorStartupMap=/Game/voidMap.voidMap
|
||||
GameDefaultMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman
|
||||
EditorStartupMap=/PS_AI_ConvAgent/Demo_Metahuman.Demo_Metahuman
|
||||
|
||||
[/Script/Engine.RendererSettings]
|
||||
r.AllowStaticLighting=False
|
||||
@ -182,4 +182,5 @@ ManualIPAddress=
|
||||
|
||||
[/Script/PS_AI_ConvAgent.PS_AI_ConvAgent_Settings_ElevenLabs]
; SECURITY: do not commit a real API key to version control. The key that was
; previously stored on this line should be treated as compromised and revoked.
; Supply the value from an untracked, per-user config instead.
API_Key=
ServerRegion=Global
|
||||
|
||||
|
||||
@ -17,6 +17,9 @@
|
||||
#include "GameFramework/PlayerController.h"
|
||||
#include "Net/UnrealNetwork.h"
|
||||
#include "VoiceModule.h"
|
||||
#include "HttpModule.h"
|
||||
#include "Interfaces/IHttpRequest.h"
|
||||
#include "Interfaces/IHttpResponse.h"
|
||||
|
||||
DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_ConvAgent_ElevenLabs, Log, All);
|
||||
|
||||
@ -147,15 +150,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
||||
if (bPreBuffering)
|
||||
{
|
||||
const double Elapsed = (FPlatformTime::Seconds() - PreBufferStartTime) * 1000.0;
|
||||
if (Elapsed >= static_cast<double>(AudioPreBufferMs))
|
||||
const int32 EffPreBuf = (AudioPreBufferMs > 0)
|
||||
? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0;
|
||||
if (Elapsed >= static_cast<double>(EffPreBuf))
|
||||
{
|
||||
bPreBuffering = false;
|
||||
if (bDebug)
|
||||
{
|
||||
const double Tpb = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms). Starting playback."),
|
||||
Tpb, LastClosedTurnIndex, AudioPreBufferMs);
|
||||
TEXT("[T+%.2fs] [Turn %d] Pre-buffer timeout (%dms adaptive). Starting playback."),
|
||||
Tpb, LastClosedTurnIndex, EffPreBuf);
|
||||
}
|
||||
// Only start playback if the agent is still speaking.
|
||||
// If silence detection already set bAgentSpeaking=false, this is stale.
|
||||
@ -292,6 +297,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
||||
// Broadcast OUTSIDE the lock — Blueprint handlers can execute for arbitrary time.
|
||||
if (bShouldBroadcastStopped)
|
||||
{
|
||||
// Adapt pre-buffer for next turn based on this turn's signals.
|
||||
ApplyPreBufferAdaptation();
|
||||
|
||||
if (bHardTimeoutFired && bDebug)
|
||||
{
|
||||
const double Tht = FPlatformTime::Seconds() - SessionStartTime;
|
||||
@ -321,7 +329,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
||||
{
|
||||
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
|
||||
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
|
||||
if (bShowLatency)
|
||||
// Only draw on the active (connected) Authority component.
|
||||
// Multiple agents in the scene would overwrite each other's HUD at the same
|
||||
// BaseKey, causing visible blinking between their values.
|
||||
if (bShowLatency && IsConnected() && GetOwnerRole() == ROLE_Authority)
|
||||
{
|
||||
DrawLatencyHUD();
|
||||
}
|
||||
@ -388,6 +399,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StartConversation_Internal()
|
||||
|
||||
// Pass configuration to the proxy before connecting.
|
||||
WebSocketProxy->TurnMode = TurnMode;
|
||||
if (AgentConfig)
|
||||
{
|
||||
WebSocketProxy->TurnEagerness = AgentConfig->TurnEagerness;
|
||||
WebSocketProxy->bSpeculativeTurn = AgentConfig->bSpeculativeTurn;
|
||||
}
|
||||
|
||||
// Resolve AgentID by priority: AgentConfig > component string > project default.
|
||||
FString ResolvedAgentID = AgentID;
|
||||
@ -834,6 +850,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen
|
||||
SessionStartTime = FPlatformTime::Seconds();
|
||||
TurnIndex = 0;
|
||||
LastClosedTurnIndex = 0;
|
||||
|
||||
// Initialize adaptive pre-buffer from designer settings.
|
||||
AdaptivePreBufferMs = AudioPreBufferMs; // Start at the designer's value.
|
||||
PreBufferTrend = 0;
|
||||
TurnIdealPreBufferMs = -1;
|
||||
bTurnGapMeasured = false;
|
||||
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("[T+0.00s] Agent connected. ConversationID=%s"), *Info.ConversationID);
|
||||
OnAgentConnected.Broadcast(Info);
|
||||
|
||||
@ -852,6 +875,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleConnected(const FPS_AI_ConvAgen
|
||||
}
|
||||
}
|
||||
|
||||
// Probe server region once per session (only when latency HUD is enabled).
|
||||
if (ServerRegion.IsEmpty() && GetOwnerRole() == ROLE_Authority)
|
||||
{
|
||||
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
|
||||
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
|
||||
if (bShowLatency)
|
||||
{
|
||||
FetchServerRegion();
|
||||
}
|
||||
}
|
||||
|
||||
// In Client turn mode (push-to-talk), the user controls listening manually via
|
||||
// StartListening()/StopListening(). Auto-starting would leave the mic open
|
||||
// permanently and interfere with push-to-talk — the T-release StopListening()
|
||||
@ -1081,21 +1115,28 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
|
||||
// In Server VAD mode, StopListening() is not called — the server detects
|
||||
// end of user speech and immediately starts generating. If TurnEndTime was
|
||||
// not set by StopListening since the last generation (i.e. it's stale or 0),
|
||||
// use Now as the best client-side approximation.
|
||||
// use the proxy's LastUserTranscriptTime as the best approximation:
|
||||
// user_transcript arrives after server VAD + ASR, just before LLM starts.
|
||||
const bool bFreshTurnEnd = (TurnEndTime > GenerationStartTime) && (GenerationStartTime > 0.0);
|
||||
if (!bFreshTurnEnd)
|
||||
{
|
||||
TurnEndTime = Now;
|
||||
const double TranscriptTime = WebSocketProxy ? WebSocketProxy->GetLastUserTranscriptTime() : 0.0;
|
||||
TurnEndTime = (TranscriptTime > 0.0) ? TranscriptTime : Now;
|
||||
}
|
||||
|
||||
// Reset all latency measurements — new response cycle starts here.
|
||||
// All metrics are anchored to GenerationStartTime (= now), which is the closest
|
||||
// client-side proxy for "user stopped speaking" in Server VAD mode.
|
||||
CurrentLatencies = FDebugLatencies();
|
||||
// New response cycle starts here. All client-side metrics are anchored to
|
||||
// GenerationStartTime (= now). Do NOT zero CurrentLatencies — the per-field
|
||||
// assignments in EnqueueAgentAudio() overwrite naturally, so the HUD shows the
|
||||
// previous turn's values until the new turn's measurements arrive (no "---" blink).
|
||||
GenerationStartTime = Now;
|
||||
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = Now - TurnEndTime;
|
||||
|
||||
// LLM latency: time from user_transcript received to first text token arriving.
|
||||
// In Server VAD mode, this approximates LLM TTFT + network (post-ASR).
|
||||
// In Client turn mode, this is the full ASR + LLM latency.
|
||||
CurrentLatencies.TurnEndToTextMs = static_cast<float>(LatencyFromTurnEnd * 1000.0);
|
||||
if (bIsListening)
|
||||
{
|
||||
// In Server VAD + interruption mode, keep the mic open so the server can
|
||||
@ -1321,7 +1362,10 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow(
|
||||
AudioQueueReadOffset = 0;
|
||||
}
|
||||
|
||||
// Log when queue recovers (new data arrived after being dry)
|
||||
// Queue recovered: was dry, now has data again.
|
||||
// Only flag as underrun if the gap was long enough to be audible.
|
||||
// Short gaps (<200ms) are handled seamlessly by USoundWaveProcedural's
|
||||
// internal silence — no need to increase the pre-buffer for those.
|
||||
if (bQueueWasDry)
|
||||
{
|
||||
bQueueWasDry = false;
|
||||
@ -1329,7 +1373,7 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::OnProceduralUnderflow(
|
||||
{
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered — feeding real data again (%d bytes remaining)."),
|
||||
TEXT("[T+%.2fs] [Turn %d] AudioQueue recovered (%d bytes remaining)."),
|
||||
T, LastClosedTurnIndex, AudioQueue.Num() - AudioQueueReadOffset);
|
||||
}
|
||||
}
|
||||
@ -1371,6 +1415,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
||||
bQueueWasDry = false;
|
||||
SilentTickCount = 0;
|
||||
// Adaptive pre-buffer: record first chunk timing for inter-chunk gap measurement.
|
||||
TurnFirstChunkTime = FPlatformTime::Seconds();
|
||||
TurnFirstChunkBytes = PCMData.Num();
|
||||
TurnIdealPreBufferMs = -1;
|
||||
bTurnGapMeasured = false;
|
||||
|
||||
// Latency capture (always, for HUD display).
|
||||
if (GenerationStartTime > 0.0)
|
||||
@ -1393,7 +1442,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
MulticastAgentStartedSpeaking();
|
||||
}
|
||||
|
||||
if (AudioPreBufferMs > 0)
|
||||
const int32 EffectivePreBufferMs = (AudioPreBufferMs > 0)
|
||||
? (bAdaptivePreBuffer ? AdaptivePreBufferMs : AudioPreBufferMs) : 0;
|
||||
if (EffectivePreBufferMs > 0)
|
||||
{
|
||||
// Pre-buffer: accumulate audio before starting playback.
|
||||
// This absorbs TTS inter-chunk gaps so chunk 2 arrives before
|
||||
@ -1404,8 +1455,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
{
|
||||
const double Tpb2 = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms before starting playback."),
|
||||
Tpb2, LastClosedTurnIndex, AudioPreBufferMs);
|
||||
TEXT("[T+%.2fs] [Turn %d] Pre-buffering %dms (adaptive) before starting playback."),
|
||||
Tpb2, LastClosedTurnIndex, EffectivePreBufferMs);
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -1433,14 +1484,25 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
if (GetOwnerRole() == ROLE_Authority)
|
||||
{
|
||||
bPreBuffering = false;
|
||||
// Measure inter-chunk gap for adaptive pre-buffer.
|
||||
if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0)
|
||||
{
|
||||
const double NowGap = FPlatformTime::Seconds();
|
||||
const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0;
|
||||
// Chunk 1 audio duration: 16kHz 16-bit mono = 32000 bytes/sec.
|
||||
const double Chunk1AudioMs = (TurnFirstChunkBytes > 0)
|
||||
? (static_cast<double>(TurnFirstChunkBytes) / 32.0) : 0.0;
|
||||
TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs));
|
||||
bTurnGapMeasured = true;
|
||||
}
|
||||
if (bDebug)
|
||||
{
|
||||
const double NowPb = FPlatformTime::Seconds();
|
||||
const double BufferedMs = (NowPb - PreBufferStartTime) * 1000.0;
|
||||
const double Tpb3 = NowPb - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered). Starting playback."),
|
||||
Tpb3, LastClosedTurnIndex, BufferedMs);
|
||||
TEXT("[T+%.2fs] [Turn %d] Pre-buffer: second chunk arrived (%.0fms buffered, ideal=%dms). Starting playback."),
|
||||
Tpb3, LastClosedTurnIndex, BufferedMs, TurnIdealPreBufferMs);
|
||||
}
|
||||
if (AudioPlaybackComponent && !AudioPlaybackComponent->IsPlaying())
|
||||
{
|
||||
@ -1467,6 +1529,23 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
||||
{
|
||||
AudioPlaybackComponent->Play();
|
||||
}
|
||||
// Measure inter-chunk gap for adaptive pre-buffer (first gap only).
|
||||
if (!bTurnGapMeasured && TurnFirstChunkTime > 0.0 && GetOwnerRole() == ROLE_Authority)
|
||||
{
|
||||
const double NowGap = FPlatformTime::Seconds();
|
||||
const double InterChunkGapMs = (NowGap - TurnFirstChunkTime) * 1000.0;
|
||||
const double Chunk1AudioMs = (TurnFirstChunkBytes > 0)
|
||||
? (static_cast<double>(TurnFirstChunkBytes) / 32.0) : 0.0;
|
||||
TurnIdealPreBufferMs = FMath::Max(0, FMath::RoundToInt32(InterChunkGapMs - Chunk1AudioMs));
|
||||
bTurnGapMeasured = true;
|
||||
if (bDebug)
|
||||
{
|
||||
const double T = NowGap - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Inter-chunk gap: %.0fms, chunk1 audio: %.0fms → ideal pre-buffer: %dms"),
|
||||
T, LastClosedTurnIndex, InterChunkGapMs, Chunk1AudioMs, TurnIdealPreBufferMs);
|
||||
}
|
||||
}
|
||||
// Reset silence counter — new audio arrived, we're not in a gap anymore
|
||||
SilentTickCount = 0;
|
||||
}
|
||||
@ -1516,6 +1595,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
|
||||
// Broadcast outside the lock.
|
||||
if (bWasSpeaking)
|
||||
{
|
||||
// Adapt pre-buffer for next turn based on this turn's signals.
|
||||
ApplyPreBufferAdaptation();
|
||||
|
||||
if (bDebug)
|
||||
{
|
||||
const double T = Now - SessionStartTime;
|
||||
@ -1536,6 +1618,52 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
|
||||
}
|
||||
}
|
||||
|
||||
void UPS_AI_ConvAgent_ElevenLabsComponent::ApplyPreBufferAdaptation()
|
||||
{
|
||||
// Only adapt on Authority (where the WebSocket lives and measurements are taken).
|
||||
if (GetOwnerRole() != ROLE_Authority) return;
|
||||
// Adaptive mode must be enabled, and pre-buffering must be active.
|
||||
if (!bAdaptivePreBuffer || AudioPreBufferMs == 0) return;
|
||||
// No measurement this turn (single-chunk response or no second chunk arrived).
|
||||
if (TurnIdealPreBufferMs < 0) { PreBufferTrend = 0; return; }
|
||||
|
||||
const int32 Prev = AdaptivePreBufferMs;
|
||||
|
||||
// DECREASE-ONLY: the measured ideal tells us the minimum pre-buffer needed.
|
||||
// If the ideal is lower than our current value, the connection is fast enough
|
||||
// that we can reduce the pre-buffer and save latency.
|
||||
// If the ideal is higher (e.g. natural speech pause, slow network), we do NOT
|
||||
// increase — USoundWaveProcedural handles gaps seamlessly in most cases.
|
||||
// The user sets AudioPreBufferMs as the "worst case" starting value;
|
||||
// the system only optimizes downward from there. Resets each conversation.
|
||||
if (TurnIdealPreBufferMs < AdaptivePreBufferMs)
|
||||
{
|
||||
// Ideal is lower — decrease toward it (EMA 30% per turn, with 50ms margin).
|
||||
const int32 TargetMs = FMath::Max(AdaptivePreBufferMinMs, TurnIdealPreBufferMs + 50);
|
||||
AdaptivePreBufferMs = FMath::Max(AdaptivePreBufferMinMs,
|
||||
FMath::RoundToInt32(AdaptivePreBufferMs * 0.7f + TargetMs * 0.3f));
|
||||
PreBufferTrend = (AdaptivePreBufferMs < Prev) ? -1 : 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
// Ideal >= current — connection is same or worse, keep current value.
|
||||
PreBufferTrend = 0;
|
||||
}
|
||||
|
||||
// Reset measurement for next turn.
|
||||
const int32 IdealForLog = TurnIdealPreBufferMs;
|
||||
TurnIdealPreBufferMs = -1;
|
||||
bTurnGapMeasured = false;
|
||||
|
||||
if (bDebug && Prev != AdaptivePreBufferMs)
|
||||
{
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Adaptive pre-buffer: %d ms -> %d ms (ideal=%dms)"),
|
||||
T, LastClosedTurnIndex, Prev, AdaptivePreBufferMs, IdealForLog);
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Microphone → WebSocket
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
@ -2404,6 +2532,42 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
|
||||
bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT("")));
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Server region detection (one-shot HTTP probe)
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
void UPS_AI_ConvAgent_ElevenLabsComponent::FetchServerRegion()
|
||||
{
|
||||
const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings();
|
||||
if (!Settings || Settings->API_Key.IsEmpty()) return;
|
||||
|
||||
auto Request = FHttpModule::Get().CreateRequest();
|
||||
Request->SetURL(Settings->GetAPIBaseURL() + TEXT("/v1/models"));
|
||||
Request->SetVerb(TEXT("GET"));
|
||||
Request->SetHeader(TEXT("xi-api-key"), Settings->API_Key);
|
||||
|
||||
TWeakObjectPtr<UPS_AI_ConvAgent_ElevenLabsComponent> WeakThis(this);
|
||||
Request->OnProcessRequestComplete().BindLambda(
|
||||
[WeakThis](FHttpRequestPtr /*Req*/, FHttpResponsePtr Resp, bool bSuccess)
|
||||
{
|
||||
if (!bSuccess || !Resp.IsValid()) return;
|
||||
const FString Region = Resp->GetHeader(TEXT("x-region"));
|
||||
if (Region.IsEmpty()) return;
|
||||
|
||||
AsyncTask(ENamedThreads::GameThread, [WeakThis, Region]()
|
||||
{
|
||||
if (WeakThis.IsValid())
|
||||
{
|
||||
WeakThis->ServerRegion = Region;
|
||||
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("ElevenLabs server region: %s"), *Region);
|
||||
}
|
||||
});
|
||||
});
|
||||
Request->ProcessRequest();
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Latency debug HUD
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
|
||||
{
|
||||
if (!GEngine) return;
|
||||
@ -2416,25 +2580,58 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
|
||||
const FColor ValueColor = FColor::White;
|
||||
const FColor HighlightColor = FColor::Yellow;
|
||||
|
||||
// Helper: format a single metric — shows "---" when not yet captured this turn
|
||||
auto Fmt = [](float Ms) -> FString
|
||||
{
|
||||
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
|
||||
};
|
||||
|
||||
// Title — all times measured from agent_response_started
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
|
||||
TEXT("=== Latency (from gen start) ==="));
|
||||
int32 Row = 0;
|
||||
|
||||
// 1. Gen → Audio: generation start → first audio chunk (LLM + TTS)
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
|
||||
FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor,
|
||||
TEXT("=== Voice-to-Voice Latency ==="));
|
||||
|
||||
// 2. Pre-buffer wait before playback
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
|
||||
// Client-side breakdown: TTS+Net + Pre-buffer = Gen>Ear
|
||||
// Note: LLM latency is only visible on ElevenLabs dashboard (server-side).
|
||||
// In Server VAD mode, no reliable client-side "end of user speech" marker exists.
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
|
||||
FString::Printf(TEXT(" TTS+Net: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
|
||||
|
||||
// Pre-buffer display depends on adaptive mode.
|
||||
if (bAdaptivePreBuffer && AudioPreBufferMs > 0)
|
||||
{
|
||||
// Adaptive ON: show actual wait + adaptive target with trend arrow.
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
|
||||
FString::Printf(TEXT(" PreBuf actual: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
|
||||
|
||||
const TCHAR* TrendArrow = (PreBufferTrend > 0) ? TEXT(" ^")
|
||||
: (PreBufferTrend < 0) ? TEXT(" v")
|
||||
: TEXT("");
|
||||
const FColor AdaptiveColor = (PreBufferTrend > 0) ? FColor::Red
|
||||
: (PreBufferTrend < 0) ? FColor::Green
|
||||
: ValueColor;
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, AdaptiveColor,
|
||||
FString::Printf(TEXT(" PreBuf target: %d ms%s"), AdaptivePreBufferMs, TrendArrow));
|
||||
}
|
||||
else
|
||||
{
|
||||
// Adaptive OFF (or pre-buffer disabled): show fixed pre-buffer value.
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
|
||||
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
|
||||
}
|
||||
|
||||
// 3. Gen → Ear: generation start → playback starts (user-perceived total)
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, HighlightColor,
|
||||
FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs)));
|
||||
|
||||
// Connection section
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, TitleColor,
|
||||
TEXT("--- Connection ---"));
|
||||
|
||||
const int32 PingMs = WebSocketProxy ? WebSocketProxy->GetLastPingMs() : -1;
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
|
||||
FString::Printf(TEXT(" WS Ping: %s"),
|
||||
(PingMs >= 0) ? *FString::Printf(TEXT("%d ms"), PingMs) : TEXT("---")));
|
||||
|
||||
GEngine->AddOnScreenDebugMessage(BaseKey + Row++, DisplayTime, ValueColor,
|
||||
FString::Printf(TEXT(" Region: %s"),
|
||||
ServerRegion.IsEmpty() ? TEXT("---") : *ServerRegion));
|
||||
}
|
||||
|
||||
@ -207,41 +207,58 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected()
|
||||
// This produces smooth continuous audio chunks without the fragmentation caused by
|
||||
// explicit optimize_streaming_latency or enable_intermediate_response overrides.
|
||||
//
|
||||
// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
|
||||
// In Server VAD mode, the config override is empty (matches C++ sample exactly).
|
||||
// Build turn configuration based on mode + latency settings.
|
||||
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
|
||||
|
||||
{
|
||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||
bool bHasTurnOverrides = false;
|
||||
|
||||
// In Client (push-to-talk) mode, reduce turn_timeout to minimize latency.
|
||||
if (TurnMode == EPS_AI_ConvAgent_TurnMode_ElevenLabs::Client)
|
||||
{
|
||||
// turn_timeout: how long the server waits after VAD detects silence before
|
||||
// processing the user's turn. Default is ~3s. In push-to-talk mode this
|
||||
// directly adds latency — the server waits after the user releases T.
|
||||
// 1s is safe without speculative_turn (which was removed — see history below).
|
||||
//
|
||||
// History:
|
||||
// turn_timeout=1 was problematic when combined with speculative_turn=true
|
||||
// (server silently dropped turns 3+). Without speculative_turn, 1s is safe
|
||||
// and halves the per-turn latency.
|
||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
||||
bHasTurnOverrides = true;
|
||||
}
|
||||
|
||||
// turn_eagerness: controls how quickly the server interprets pauses as end-of-speech.
|
||||
// "eager" = fastest (may cut user off), "normal" = balanced, "patient" = waits longer.
|
||||
if (TurnEagerness != EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal)
|
||||
{
|
||||
FString EagernessStr;
|
||||
switch (TurnEagerness)
|
||||
{
|
||||
case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Eager: EagernessStr = TEXT("eager"); break;
|
||||
case EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Patient: EagernessStr = TEXT("patient"); break;
|
||||
default: EagernessStr = TEXT("normal"); break;
|
||||
}
|
||||
TurnObj->SetStringField(TEXT("turn_eagerness"), EagernessStr);
|
||||
bHasTurnOverrides = true;
|
||||
}
|
||||
|
||||
// speculative_turn: start generating a response before confirming end-of-speech.
|
||||
// Reduces latency but may cause occasional false starts (discarded if user continues).
|
||||
if (bSpeculativeTurn)
|
||||
{
|
||||
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
|
||||
bHasTurnOverrides = true;
|
||||
}
|
||||
|
||||
if (bHasTurnOverrides)
|
||||
{
|
||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
||||
|
||||
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
|
||||
//
|
||||
// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
|
||||
// the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
|
||||
// With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
|
||||
// With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
|
||||
// Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
|
||||
// - tts.optimize_streaming_latency: deprecated by ElevenLabs. Sending any value
|
||||
// changes TTS chunking behaviour. Server default (omitted) is optimal.
|
||||
//
|
||||
// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
|
||||
// before finishing generation → fragmented audio. When omitted (C++ sample), the
|
||||
// LLM completes its response first → continuous TTS chunks.
|
||||
// before finishing generation → fragmented audio. Omitted = server default.
|
||||
//
|
||||
// - custom_llm_extra_body (empty object): Even an empty object might override the
|
||||
// agent's configured custom_llm_extra_body with nothing. Omit entirely.
|
||||
@ -259,12 +276,15 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnected()
|
||||
FJsonSerializer::Serialize(InitMsg.ToSharedRef(), InitWriter);
|
||||
{
|
||||
const UPS_AI_ConvAgent_Settings_ElevenLabs* S = FPS_AI_ConvAgentModule::Get().GetSettings();
|
||||
if (S->bVerboseLogging)
|
||||
if (S && S->bVerboseLogging)
|
||||
{
|
||||
UE_LOG(LogPS_AI_ConvAgent_WS_ElevenLabs, Verbose, TEXT("Sending initiation: %s"), *InitJson);
|
||||
}
|
||||
}
|
||||
if (WebSocket.IsValid())
|
||||
{
|
||||
WebSocket->Send(InitJson);
|
||||
}
|
||||
}
|
||||
|
||||
void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::OnWsConnectionError(const FString& Error)
|
||||
@ -507,6 +527,10 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandleTranscript(const TSharedP
|
||||
return;
|
||||
}
|
||||
|
||||
// Record arrival time for latency measurement (ASR+LLM breakdown).
|
||||
// user_transcript arrives after server VAD + ASR, just before LLM starts.
|
||||
LastUserTranscriptTime = FPlatformTime::Seconds();
|
||||
|
||||
FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs Segment;
|
||||
Segment.Speaker = TEXT("user");
|
||||
(*TranscriptEvent)->TryGetStringField(TEXT("user_transcript"), Segment.Text);
|
||||
@ -679,6 +703,13 @@ void UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::HandlePing(const TSharedPtr<FJs
|
||||
if (Root->TryGetObjectField(TEXT("ping_event"), PingEvent) && PingEvent)
|
||||
{
|
||||
(*PingEvent)->TryGetNumberField(TEXT("event_id"), EventID);
|
||||
|
||||
// Extract server-reported WS round-trip latency.
|
||||
int32 PingValue = 0;
|
||||
if ((*PingEvent)->TryGetNumberField(TEXT("ping_ms"), PingValue))
|
||||
{
|
||||
LastPingMs.store(PingValue, std::memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
TSharedPtr<FJsonObject> Pong = MakeShareable(new FJsonObject());
|
||||
@ -718,7 +749,7 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr
|
||||
{
|
||||
const UPS_AI_ConvAgent_Settings_ElevenLabs* Settings = FPS_AI_ConvAgentModule::Get().GetSettings();
|
||||
|
||||
// Custom URL override takes full precedence
|
||||
// Custom URL override takes full precedence (advanced / proxy use case)
|
||||
if (!Settings->CustomWebSocketURL.IsEmpty())
|
||||
{
|
||||
return Settings->CustomWebSocketURL;
|
||||
@ -730,9 +761,9 @@ FString UPS_AI_ConvAgent_WebSocket_ElevenLabsProxy::BuildWebSocketURL(const FStr
|
||||
return FString();
|
||||
}
|
||||
|
||||
// Official ElevenLabs Conversational AI WebSocket endpoint
|
||||
// wss://api.elevenlabs.io/v1/convai/conversation?agent_id=<ID>
|
||||
// Build URL from the region-aware base: wss://<regional-host>/v1/convai/conversation?agent_id=<ID>
|
||||
const FString BaseURL = Settings->GetWSBaseURL();
|
||||
return FString::Printf(
|
||||
TEXT("wss://api.elevenlabs.io/v1/convai/conversation?agent_id=%s"),
|
||||
*AgentIDOverride);
|
||||
TEXT("%s/v1/convai/conversation?agent_id=%s"),
|
||||
*BaseURL, *AgentIDOverride);
|
||||
}
|
||||
|
||||
@ -6,6 +6,22 @@
|
||||
#include "Modules/ModuleManager.h"
|
||||
#include "PS_AI_ConvAgent.generated.h"
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// ElevenLabs server region
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
UENUM()
|
||||
enum class EPS_AI_ConvAgent_ElevenLabsRegion : uint8
|
||||
{
|
||||
/** Automatic global routing (default). Server chosen by ElevenLabs based on client location. */
|
||||
Global UMETA(DisplayName = "Global (auto)"),
|
||||
/** Force US servers: api.us.elevenlabs.io */
|
||||
US UMETA(DisplayName = "US"),
|
||||
/** Force EU servers (Enterprise only): api.eu.residency.elevenlabs.io */
|
||||
EU UMETA(DisplayName = "EU (Enterprise)"),
|
||||
/** Force India servers (Enterprise only): api.in.residency.elevenlabs.io */
|
||||
India UMETA(DisplayName = "India (Enterprise)")
|
||||
};
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Settings object – exposed in Project Settings → Plugins → PS AI ConvAgent - ElevenLabs
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
@ -24,8 +40,17 @@ public:
|
||||
FString API_Key;
|
||||
|
||||
/**
|
||||
* Override the ElevenLabs WebSocket base URL. Leave empty to use the default:
|
||||
* wss://api.elevenlabs.io/v1/convai/conversation
|
||||
* Server region for ElevenLabs API.
|
||||
* - Global (default): automatic routing based on client location.
|
||||
* - US: force US servers (api.us.elevenlabs.io).
|
||||
* - EU / India: Enterprise-only data residency endpoints.
|
||||
*/
|
||||
UPROPERTY(Config, EditAnywhere, Category = "PS AI ConvAgent|ElevenLabs API")
|
||||
EPS_AI_ConvAgent_ElevenLabsRegion ServerRegion = EPS_AI_ConvAgent_ElevenLabsRegion::Global;
|
||||
|
||||
/**
|
||||
* Override the ElevenLabs WebSocket URL entirely. Leave empty to use ServerRegion setting.
|
||||
* Example: wss://custom-proxy.example.com/v1/convai/conversation?agent_id=YOUR_ID
|
||||
*/
|
||||
UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API")
|
||||
FString CustomWebSocketURL;
|
||||
@ -33,6 +58,30 @@ public:
|
||||
/** Log verbose WebSocket messages to the Output Log (useful during development). */
|
||||
UPROPERTY(Config, EditAnywhere, AdvancedDisplay, Category = "PS AI ConvAgent|ElevenLabs API")
|
||||
bool bVerboseLogging = false;
|
||||
|
||||
/** Return the API base URL (https) for the selected region. */
|
||||
FString GetAPIBaseURL() const
{
	// Explicitly pinned regions each map to a dedicated HTTPS host.
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::US)
	{
		return TEXT("https://api.us.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::EU)
	{
		return TEXT("https://api.eu.residency.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::India)
	{
		return TEXT("https://api.in.residency.elevenlabs.io");
	}

	// Global (and any value not handled above) uses the default host.
	return TEXT("https://api.elevenlabs.io");
}
|
||||
|
||||
/** Return the WebSocket base URL (wss) for the selected region. */
|
||||
FString GetWSBaseURL() const
{
	// Explicitly pinned regions each map to a dedicated secure-WebSocket host.
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::US)
	{
		return TEXT("wss://api.us.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::EU)
	{
		return TEXT("wss://api.eu.residency.elevenlabs.io");
	}
	if (ServerRegion == EPS_AI_ConvAgent_ElevenLabsRegion::India)
	{
		return TEXT("wss://api.in.residency.elevenlabs.io");
	}

	// Global (and any value not handled above) uses the default host.
	return TEXT("wss://api.elevenlabs.io");
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
@ -4,6 +4,7 @@
|
||||
|
||||
#include "CoreMinimal.h"
|
||||
#include "Engine/DataAsset.h"
|
||||
#include "PS_AI_ConvAgent_Definitions.h"
|
||||
#include "PS_AI_ConvAgent_AgentConfig_ElevenLabs.generated.h"
|
||||
|
||||
/**
|
||||
@ -186,6 +187,24 @@ public:
|
||||
ToolTip = "Max conversation turns.\n0 = unlimited."))
|
||||
int32 MaxTurns = 0;
|
||||
|
||||
// ── Latency / Turn-taking ───────────────────────────────────────────────
|
||||
|
||||
/** How quickly the server detects end-of-speech and starts responding.
|
||||
* Eager = fastest response, may cut the user off during pauses.
|
||||
* Normal = balanced (default). Patient = waits longer for user to finish.
|
||||
* Sent as conversation_config_override at WebSocket connection time. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency",
|
||||
meta = (ToolTip = "Controls how quickly the server detects end-of-speech.\n- Eager: fastest response, may interrupt mid-pause.\n- Normal: balanced (default).\n- Patient: waits longer for user to finish."))
|
||||
EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal;
|
||||
|
||||
/** Enable speculative turn processing: the server starts generating a response
|
||||
* before it's certain the user has finished speaking. If the user continues,
|
||||
* the speculative response is discarded. Reduces perceived latency.
|
||||
* May cause occasional false starts — disable if the agent interrupts too often. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "Latency",
|
||||
meta = (ToolTip = "Start generating a response before confirming end-of-speech.\nReduces latency but may cause occasional false starts.\nDisable if the agent interrupts the user too often."))
|
||||
bool bSpeculativeTurn = false;
|
||||
|
||||
// ── Emotion Tool ─────────────────────────────────────────────────────────
|
||||
|
||||
/** Include the built-in "set_emotion" client tool in the agent configuration.
|
||||
|
||||
@ -29,6 +29,20 @@ enum class EPS_AI_ConvAgent_TurnMode_ElevenLabs : uint8
|
||||
Client UMETA(DisplayName = "Client Controlled"),
|
||||
};
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Agent turn eagerness — controls how quickly the server detects end of speech
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
UENUM(BlueprintType)
enum class EPS_AI_ConvAgent_TurnEagerness_ElevenLabs : uint8
{
	/** Responds at the earliest opportunity. Best for customer service. */
	Eager = 0 UMETA(DisplayName = "Eager"),

	/** Balanced turn-taking for general scenarios (default). */
	Normal = 1 UMETA(DisplayName = "Normal"),

	/** Waits longer for the user to finish. Best for information collection. */
	Patient = 2 UMETA(DisplayName = "Patient"),
};
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// WebSocket message type helpers (internal, not exposed to Blueprint)
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@ -185,14 +185,24 @@ public:
|
||||
meta = (ToolTip = "Fire OnAgentPartialResponse with streaming text fragments as the LLM generates them.\nIdeal for real-time subtitles. Each event gives one text chunk, not the accumulated text."))
|
||||
bool bEnableAgentPartialResponse = false;
|
||||
|
||||
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
|
||||
* Delays playback start so early TTS chunks can accumulate, preventing
|
||||
* mid-sentence pauses when the second chunk hasn't arrived yet.
|
||||
* Set to 0 for immediate playback. */
|
||||
/** Pre-buffer delay (ms) before starting audio playback on the first TTS chunk.
|
||||
* Set this to your "worst case" value (e.g. 300-1000ms depending on connection).
|
||||
* When adaptive mode is on, the system starts here and can only decrease
|
||||
* (never increase) as it measures that chunks arrive fast enough.
|
||||
* Set to 0 to disable pre-buffering entirely. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency",
|
||||
meta = (ClampMin = "0", ClampMax = "4000",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
|
||||
int32 AudioPreBufferMs = 2000;
|
||||
ToolTip = "Pre-buffer delay (ms) — your safe 'worst case' value.\nAdaptive mode can only decrease from here, never increase.\nSet 0 to disable pre-buffering entirely."))
|
||||
int32 AudioPreBufferMs = 300;
|
||||
|
||||
/** Enable adaptive pre-buffer: measures inter-chunk timing and automatically
|
||||
* lowers the pre-buffer when TTS chunks arrive fast enough.
|
||||
* The system can only decrease from AudioPreBufferMs — never increase.
|
||||
* Resets to AudioPreBufferMs at the start of each conversation. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs|Latency",
|
||||
meta = (EditCondition = "AudioPreBufferMs > 0",
|
||||
ToolTip = "Automatically lower pre-buffer when connection is good.\nCan only decrease, never increase beyond AudioPreBufferMs.\nResets each conversation."))
|
||||
bool bAdaptivePreBuffer = true;
|
||||
|
||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|ElevenLabs",
|
||||
@ -640,18 +650,23 @@ private:
|
||||
double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating.
|
||||
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
|
||||
|
||||
// Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted.
|
||||
// All anchored to GenerationStartTime (agent_response_started event), which is
|
||||
// the closest client-side proxy for "user stopped speaking" in Server VAD mode.
|
||||
// Zero means "not yet measured this turn".
|
||||
// Current-turn latency measurements (ms). Overwritten per-field as each
|
||||
// measurement is captured — NOT reset to zero between turns, so the HUD
|
||||
// always shows the most recent value instead of blinking "---".
|
||||
// All anchored to GenerationStartTime (agent_response_started event).
|
||||
// Per-turn latency measurements, all in milliseconds and all defaulting to 0.
// Each field is declared exactly once: the stale duplicate GenToAudioMs
// declaration (a redeclaration error) has been removed.
struct FDebugLatencies
{
	float TurnEndToTextMs = 0.0f; // user turn end → first text from LLM (≈ ASR + LLM TTFT)
	float GenToAudioMs = 0.0f;    // agent_response_started → first audio chunk (≈ TTS + network)
	float PreBufferMs = 0.0f;     // Pre-buffer wait before playback starts
	float GenToEarMs = 0.0f;      // agent_response_started → playback starts (user-perceived)
};
|
||||
FDebugLatencies CurrentLatencies;
|
||||
|
||||
// ElevenLabs server region (from x-region header on REST API). Fetched once per session.
|
||||
FString ServerRegion;
|
||||
void FetchServerRegion();
|
||||
|
||||
// Accumulates incoming PCM bytes until the audio component needs data.
|
||||
// Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every
|
||||
// underflow callback (~60Hz). Compacted periodically when read offset
|
||||
@ -664,6 +679,22 @@ private:
|
||||
bool bPreBuffering = false;
|
||||
double PreBufferStartTime = 0.0;
|
||||
|
||||
// ── Adaptive pre-buffer ─────────────────────────────────────────────────
|
||||
// Runtime pre-buffer duration (ms). Equals AudioPreBufferMs when adaptive is off.
|
||||
// When adaptive is on: initialized from AudioPreBufferMs, adjusted based on
|
||||
// measured inter-chunk timing (not queue-dry detection).
|
||||
int32 AdaptivePreBufferMs = 300;
|
||||
static constexpr int32 AdaptivePreBufferMinMs = 50;
|
||||
// Direction of last adaptation: +1=raised, -1=lowered, 0=stable. Used by HUD.
|
||||
int32 PreBufferTrend = 0;
|
||||
void ApplyPreBufferAdaptation();
|
||||
// Per-turn inter-chunk timing measurement (game thread only).
|
||||
// Set when the second TTS chunk arrives, consumed at turn end.
|
||||
double TurnFirstChunkTime = 0.0; // When chunk 1 arrived.
|
||||
int32 TurnFirstChunkBytes = 0; // Bytes in chunk 1 (to estimate audio duration).
|
||||
int32 TurnIdealPreBufferMs = -1; // Computed ideal pre-buffer. -1 = not measured.
|
||||
bool bTurnGapMeasured = false; // True after first inter-chunk gap is measured.
|
||||
|
||||
// Debug: track when the AudioQueue runs dry during speech (one-shot log).
|
||||
bool bQueueWasDry = false;
|
||||
|
||||
|
||||
@ -197,6 +197,18 @@ public:
|
||||
UFUNCTION(BlueprintPure, Category = "PS AI ConvAgent|ElevenLabs")
|
||||
const FPS_AI_ConvAgent_ConversationInfo_ElevenLabs& GetConversationInfo() const { return ConversationInfo; }
|
||||
|
||||
/** Latest WebSocket round-trip latency reported by the server (ms).
|
||||
* Returns -1 if no ping has been received yet. Thread-safe. */
|
||||
int32 GetLastPingMs() const { return LastPingMs.load(std::memory_order_relaxed); }
|
||||
|
||||
/** Timestamp of the last user audio chunk sent to the server.
|
||||
* Used as a proxy for "user stopped speaking" in Server VAD mode. */
|
||||
double GetLastAudioChunkSentTime() const { return LastAudioChunkSentTime; }
|
||||
|
||||
/** Timestamp of the last user_transcript received from the server.
|
||||
* Marks when server finished ASR — best anchor for LLM latency measurement. */
|
||||
double GetLastUserTranscriptTime() const { return LastUserTranscriptTime; }
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
// Internal
|
||||
// ─────────────────────────────────────────────────────────────────────────
|
||||
@ -235,10 +247,16 @@ private:
|
||||
TArray<uint8> BinaryFrameBuffer;
|
||||
|
||||
// ── Latency tracking ─────────────────────────────────────────────────────
|
||||
// Server-reported WebSocket round-trip latency from ping events (~every 2s).
|
||||
// Atomic: written from WS callback thread, read from game thread (HUD).
|
||||
std::atomic<int32> LastPingMs{-1};
|
||||
|
||||
// Timestamp of the last audio chunk sent (user speech).
|
||||
double LastAudioChunkSentTime = 0.0;
|
||||
// Timestamp when user turn ended (StopListening).
|
||||
double UserTurnEndTime = 0.0;
|
||||
// Timestamp of the last user_transcript received (server finished ASR).
|
||||
double LastUserTranscriptTime = 0.0;
|
||||
// Whether we are waiting for the first response after user stopped speaking.
|
||||
// Atomic: defensive — documents thread-safety contract.
|
||||
std::atomic<bool> bWaitingForResponse{false};
|
||||
@ -264,4 +282,10 @@ public:
|
||||
// Set by UPS_AI_ConvAgent_ElevenLabsComponent before calling Connect().
|
||||
// Controls turn_timeout in conversation_initiation_client_data.
|
||||
EPS_AI_ConvAgent_TurnMode_ElevenLabs TurnMode = EPS_AI_ConvAgent_TurnMode_ElevenLabs::Server;
|
||||
|
||||
// Controls how eagerly the server interprets pauses as end-of-speech.
|
||||
EPS_AI_ConvAgent_TurnEagerness_ElevenLabs TurnEagerness = EPS_AI_ConvAgent_TurnEagerness_ElevenLabs::Normal;
|
||||
|
||||
// Start generating before confirming end-of-speech (reduces latency, may cause false starts).
|
||||
bool bSpeculativeTurn = false;
|
||||
};
|
||||
|
||||
@ -22,33 +22,43 @@
|
||||
|
||||
DEFINE_LOG_CATEGORY_STATIC(LogPS_AI_AgentConfigEditor, Log, All);
|
||||
|
||||
// Approximate LLM latencies as shown on the ElevenLabs dashboard.
|
||||
// The API does not expose this data — values are indicative and may change.
|
||||
// Approximate LLM latencies as shown on the ElevenLabs dashboard (March 2026).
|
||||
// The /v1/convai/llm/list API does NOT expose latency — values are indicative.
|
||||
// Update this table periodically to stay current.
|
||||
static FString GetLLMLatencyHint(const FString& ModelID)
|
||||
{
|
||||
struct FLatencyEntry { const TCHAR* ID; const TCHAR* Latency; };
|
||||
static const FLatencyEntry Entries[] =
|
||||
{
|
||||
// OpenAI
|
||||
{ TEXT("gpt-4o-mini"), TEXT("~350ms") },
|
||||
{ TEXT("gpt-4o"), TEXT("~700ms") },
|
||||
{ TEXT("gpt-4"), TEXT("~900ms") },
|
||||
{ TEXT("gpt-4-turbo"), TEXT("~650ms") },
|
||||
// Anthropic
|
||||
{ TEXT("claude-sonnet-4-5"), TEXT("~750ms") },
|
||||
{ TEXT("claude-haiku-4-5"), TEXT("~350ms") },
|
||||
{ TEXT("claude-3-5-sonnet"), TEXT("~700ms") },
|
||||
// Google
|
||||
{ TEXT("gemini-1.5-pro"), TEXT("~500ms") },
|
||||
{ TEXT("gemini-2.0-flash"), TEXT("~300ms") },
|
||||
{ TEXT("gemini-2.5-flash"), TEXT("~250ms") },
|
||||
// xAI
|
||||
{ TEXT("grok-beta"), TEXT("~500ms") },
|
||||
// ElevenLabs-hosted
|
||||
{ TEXT("qwen3-30b-a3b"), TEXT("~207ms") },
|
||||
{ TEXT("glm-4.5-air"), TEXT("~980ms") },
|
||||
{ TEXT("gpt-oss-120b"), TEXT("~331ms") },
|
||||
// ── ElevenLabs-hosted ─────────────────────────────────────────────
|
||||
{ TEXT("glm-4.5-air"), TEXT("~949ms") },
|
||||
{ TEXT("qwen3-30b-a3b"), TEXT("~189ms") },
|
||||
{ TEXT("gpt-oss-120b"), TEXT("~321ms") },
|
||||
// ── Google ────────────────────────────────────────────────────────
|
||||
{ TEXT("gemini-3-pro"), TEXT("~3.5s") },
|
||||
{ TEXT("gemini-3-flash"), TEXT("~1.4s") },
|
||||
{ TEXT("gemini-2.5-flash"), TEXT("~967ms") },
|
||||
{ TEXT("gemini-2.5-flash-lite"), TEXT("~605ms") },
|
||||
// ── OpenAI ────────────────────────────────────────────────────────
|
||||
{ TEXT("gpt-5"), TEXT("~1.1s") },
|
||||
{ TEXT("gpt-5.1"), TEXT("~980ms") },
|
||||
{ TEXT("gpt-5.2"), TEXT("~795ms") },
|
||||
{ TEXT("gpt-5-mini"), TEXT("~884ms") },
|
||||
{ TEXT("gpt-5-nano"), TEXT("~734ms") },
|
||||
{ TEXT("gpt-4.1"), TEXT("~870ms") },
|
||||
{ TEXT("gpt-4.1-mini"), TEXT("~916ms") },
|
||||
{ TEXT("gpt-4.1-nano"), TEXT("~574ms") },
|
||||
{ TEXT("gpt-4o"), TEXT("~728ms") },
|
||||
{ TEXT("gpt-4o-mini"), TEXT("~767ms") },
|
||||
{ TEXT("gpt-4-turbo"), TEXT("~1.5s") },
|
||||
{ TEXT("gpt-3.5-turbo"), TEXT("~458ms") },
|
||||
// ── Anthropic ─────────────────────────────────────────────────────
|
||||
{ TEXT("claude-sonnet-4-5"), TEXT("~1.4s") },
|
||||
{ TEXT("claude-sonnet-4"), TEXT("~1.1s") },
|
||||
{ TEXT("claude-haiku-4-5"), TEXT("~644ms") },
|
||||
{ TEXT("claude-3.7-sonnet"), TEXT("~1.2s") },
|
||||
{ TEXT("claude-3-haiku"), TEXT("~484ms") },
|
||||
{ TEXT("claude-3-5-sonnet"), TEXT("~1.2s") },
|
||||
};
|
||||
|
||||
for (const auto& E : Entries)
|
||||
@ -58,6 +68,22 @@ static FString GetLLMLatencyHint(const FString& ModelID)
|
||||
return FString();
|
||||
}
|
||||
|
||||
// Infer provider from model ID prefix for display grouping.
|
||||
static FString GetLLMProvider(const FString& ModelID)
{
	// ElevenLabs-hosted models are matched by exact ID and must be tested
	// BEFORE the prefix rules: "gpt-oss-120b" also starts with "gpt-", so
	// checking the OpenAI prefix first would misfile it under OpenAI.
	if (ModelID == TEXT("glm-4.5-air") || ModelID == TEXT("qwen3-30b-a3b") || ModelID == TEXT("gpt-oss-120b"))
	{
		return TEXT("ElevenLabs");
	}

	// Remaining providers are inferred from the model ID prefix.
	if (ModelID.StartsWith(TEXT("gpt-")) || ModelID.StartsWith(TEXT("o1")) || ModelID.StartsWith(TEXT("o3")))
	{
		return TEXT("OpenAI");
	}
	if (ModelID.StartsWith(TEXT("claude-")))
	{
		return TEXT("Anthropic");
	}
	if (ModelID.StartsWith(TEXT("gemini-")))
	{
		return TEXT("Google");
	}
	if (ModelID.StartsWith(TEXT("grok")))
	{
		return TEXT("xAI");
	}

	// Unknown provider: an empty string is returned for unrecognized IDs.
	return FString();
}
|
||||
|
||||
// Language code → display name. Shared by BuildAgentPayload (to resolve
|
||||
// {Language} placeholder) and the fetch handler (to strip the resolved fragment).
|
||||
static FString GetLanguageDisplayName(const FString& LangCode)
|
||||
@ -332,9 +358,11 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::CustomizeDetails(
|
||||
.Font(IDetailLayoutBuilder::GetDetailFont())
|
||||
]
|
||||
.ValueContent()
|
||||
.MaxDesiredWidth(600.f)
|
||||
[
|
||||
SNew(SBox)
|
||||
.MinDesiredHeight(200.f)
|
||||
.MinDesiredWidth(400.f)
|
||||
[
|
||||
SNew(SMultiLineEditableTextBox)
|
||||
.Font(IDetailLayoutBuilder::GetDetailFont())
|
||||
@ -679,6 +707,10 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
|
||||
Pinned->LLMDisplayNames.Reset();
|
||||
Pinned->LLMModelIDs.Reset();
|
||||
|
||||
// Collect models grouped by provider for sorted display.
|
||||
struct FLLMEntry { FString ModelID; FString Provider; FString Display; bool bCheckpoint; };
|
||||
TArray<FLLMEntry> AllEntries;
|
||||
|
||||
for (const auto& LLMVal : *LLMs)
|
||||
{
|
||||
const TSharedPtr<FJsonObject>* LLMObj = nullptr;
|
||||
@ -703,12 +735,14 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
|
||||
}
|
||||
}
|
||||
|
||||
// Check if it's a checkpoint model (sub-version).
|
||||
bool bIsCheckpoint = false;
|
||||
(*LLMObj)->TryGetBoolField(TEXT("is_checkpoint"), bIsCheckpoint);
|
||||
|
||||
// Build display string: "model-id (~350ms)" or " model-id (checkpoint, ~350ms)"
|
||||
const FString Latency = GetLLMLatencyHint(ModelID);
|
||||
const FString Provider = GetLLMProvider(ModelID);
|
||||
|
||||
// Build display: " model-id (checkpoint, ~350ms)" for checkpoints,
|
||||
// "model-id (~350ms)" for main models.
|
||||
FString Display;
|
||||
if (bIsCheckpoint)
|
||||
{
|
||||
@ -719,12 +753,44 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnFetchLLMsClicked()
|
||||
else
|
||||
{
|
||||
Display = Latency.IsEmpty()
|
||||
? ModelID
|
||||
: FString::Printf(TEXT("%s (%s)"), *ModelID, *Latency);
|
||||
? FString::Printf(TEXT(" %s"), *ModelID)
|
||||
: FString::Printf(TEXT(" %s (%s)"), *ModelID, *Latency);
|
||||
}
|
||||
|
||||
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Display)));
|
||||
Pinned->LLMModelIDs.Add(ModelID);
|
||||
AllEntries.Add({ ModelID, Provider, Display, bIsCheckpoint });
|
||||
}
|
||||
|
||||
// Sort by provider order (ElevenLabs, Google, OpenAI, Anthropic, xAI, Other),
|
||||
// then main models before checkpoints, then alphabetically.
|
||||
static const TArray<FString> ProviderOrder = {
|
||||
TEXT("ElevenLabs"), TEXT("Google"), TEXT("OpenAI"), TEXT("Anthropic"), TEXT("xAI")
|
||||
};
|
||||
AllEntries.Sort([](const FLLMEntry& A, const FLLMEntry& B)
|
||||
{
|
||||
int32 IdxA = ProviderOrder.IndexOfByKey(A.Provider);
|
||||
int32 IdxB = ProviderOrder.IndexOfByKey(B.Provider);
|
||||
if (IdxA == INDEX_NONE) IdxA = ProviderOrder.Num();
|
||||
if (IdxB == INDEX_NONE) IdxB = ProviderOrder.Num();
|
||||
if (IdxA != IdxB) return IdxA < IdxB;
|
||||
if (A.bCheckpoint != B.bCheckpoint) return !A.bCheckpoint; // main first
|
||||
return A.ModelID < B.ModelID;
|
||||
});
|
||||
|
||||
// Insert provider headers as non-selectable separator entries.
|
||||
FString LastProvider;
|
||||
for (const auto& Entry : AllEntries)
|
||||
{
|
||||
const FString& Prov = Entry.Provider.IsEmpty() ? TEXT("Other") : Entry.Provider;
|
||||
if (Prov != LastProvider)
|
||||
{
|
||||
// Header line: "── OpenAI ──" (not selectable — mapped to empty ModelID)
|
||||
FString Header = FString::Printf(TEXT("── %s ──"), *Prov);
|
||||
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Header)));
|
||||
Pinned->LLMModelIDs.Add(FString()); // empty = separator
|
||||
LastProvider = Prov;
|
||||
}
|
||||
Pinned->LLMDisplayNames.Add(MakeShareable(new FString(Entry.Display)));
|
||||
Pinned->LLMModelIDs.Add(Entry.ModelID);
|
||||
}
|
||||
|
||||
// Pre-select the currently set LLMModel if it exists in the list.
|
||||
@ -767,6 +833,9 @@ void FPS_AI_ConvAgent_AgentConfigCustomization_ElevenLabs::OnLLMSelected(
|
||||
int32 Idx = LLMDisplayNames.IndexOfByKey(NewSelection);
|
||||
if (Idx == INDEX_NONE) return;
|
||||
|
||||
// Separator headers have empty ModelID — ignore selection.
|
||||
if (LLMModelIDs[Idx].IsEmpty()) return;
|
||||
|
||||
if (UPS_AI_ConvAgent_AgentConfig_ElevenLabs* Asset = GetEditedAsset())
|
||||
{
|
||||
Asset->Modify();
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user