Improve latency debug HUD: separate toggle, per-turn reset, multi-line display

- Add bDebugLatency property + CVar (ps.ai.ConvAgent.Debug.Latency)
  independent from bDebug to save HUD space
- Reset latencies to zero each turn (StopListening) instead of persisting
- Add UserSpeechMs and PreBufferMs to the latency struct
- Move latency captures outside bDebug guard (always measured)
- Replace single-line latency in DrawDebugHUD with dedicated DrawLatencyHUD()
  showing 6 metrics on separate lines with color coding

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-05 18:06:39 +01:00
parent d60f8d8484
commit 955c97e0dd
2 changed files with 109 additions and 31 deletions

View File

@ -26,6 +26,12 @@ static TAutoConsoleVariable<int32> CVarDebugElevenLabs(
TEXT("Debug HUD for ElevenLabs. -1=use property, 0=off, 1-3=verbosity."),
ECVF_Default);
static TAutoConsoleVariable<int32> CVarDebugLatency(
TEXT("ps.ai.ConvAgent.Debug.Latency"),
-1,
TEXT("Latency debug HUD. -1=use property, 0=off, 1=on."),
ECVF_Default);
// ─────────────────────────────────────────────────────────────────────────────
// Constructor
// ─────────────────────────────────────────────────────────────────────────────
@ -160,9 +166,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
AudioPlaybackComponent->Play();
}
PlaybackStartTime = FPlatformTime::Seconds();
if (bDebug && TurnEndTime > 0.0)
if (TurnEndTime > 0.0)
{
LastLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
}
if (PreBufferStartTime > 0.0)
{
CurrentLatencies.PreBufferMs = static_cast<float>((PlaybackStartTime - PreBufferStartTime) * 1000.0);
}
OnAudioPlaybackStarted.Broadcast();
}
@ -308,6 +318,14 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
DrawDebugHUD();
}
}
{
const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread();
const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency;
if (bShowLatency)
{
DrawLatencyHUD();
}
}
}
// ─────────────────────────────────────────────────────────────────────────────
@ -576,6 +594,14 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopListening()
}
TurnEndTime = FPlatformTime::Seconds();
// Reset latency measurements for this new response cycle.
CurrentLatencies = FDebugLatencies();
if (TurnStartTime > 0.0)
{
CurrentLatencies.UserSpeechMs = static_cast<float>((TurnEndTime - TurnStartTime) * 1000.0);
}
// Start the response timeout clock — but only when the server hasn't already started
// generating. When StopListening() is called from HandleAgentResponseStarted() as part
// of collision avoidance, bAgentGenerating is already true, meaning the server IS already
@ -1058,9 +1084,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
const double Now = FPlatformTime::Seconds();
GenerationStartTime = Now;
if (bDebug && TurnEndTime > 0.0)
if (TurnEndTime > 0.0)
{
LastLatencies.STTToGenMs = static_cast<float>((Now - TurnEndTime) * 1000.0);
CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnEndTime) * 1000.0);
}
const double T = Now - SessionStartTime;
@ -1341,6 +1367,12 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
bQueueWasDry = false;
SilentTickCount = 0;
// Latency capture (always, for HUD display).
if (TurnEndTime > 0.0)
CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnEndTime) * 1000.0);
if (GenerationStartTime > 0.0)
CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
if (bDebug)
{
const double T = AgentSpeakStart - SessionStartTime;
@ -1348,12 +1380,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Agent speaking — first audio chunk. (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
// Update latency snapshot for HUD display.
if (TurnEndTime > 0.0)
LastLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnEndTime) * 1000.0);
if (GenerationStartTime > 0.0)
LastLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
}
OnAgentStartedSpeaking.Broadcast();
@ -1386,10 +1412,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
AudioPlaybackComponent->Play();
}
PlaybackStartTime = FPlatformTime::Seconds();
if (bDebug && TurnEndTime > 0.0)
if (TurnEndTime > 0.0)
{
LastLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
}
// No pre-buffer in this path (immediate playback).
OnAudioPlaybackStarted.Broadcast();
}
}
@ -1417,9 +1444,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
AudioPlaybackComponent->Play();
}
PlaybackStartTime = FPlatformTime::Seconds();
if (bDebug && TurnEndTime > 0.0)
if (TurnEndTime > 0.0)
{
LastLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
}
if (PreBufferStartTime > 0.0)
{
CurrentLatencies.PreBufferMs = static_cast<float>((PlaybackStartTime - PreBufferStartTime) * 1000.0);
}
OnAudioPlaybackStarted.Broadcast();
}
@ -2362,19 +2393,57 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
NetConnectedPawns.Num(), *SpeakerName));
}
// Latencies (from last completed turn)
if (LastLatencies.TotalMs > 0.0f)
{
GEngine->AddOnScreenDebugMessage(BaseKey + 8, DisplayTime, MainColor,
FString::Printf(TEXT(" Latency: total=%.0fms (stt>gen=%.0fms gen>audio=%.0fms) ear=%.0fms"),
LastLatencies.TotalMs, LastLatencies.STTToGenMs,
LastLatencies.GenToAudioMs, LastLatencies.EndToEarMs));
}
// Reconnection
GEngine->AddOnScreenDebugMessage(BaseKey + 9, DisplayTime,
GEngine->AddOnScreenDebugMessage(BaseKey + 8, DisplayTime,
bWantsReconnect ? FColor::Red : MainColor,
FString::Printf(TEXT(" Reconnect: %d/%d attempts%s"),
ReconnectAttemptCount, MaxReconnectAttempts,
bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT("")));
}
void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
{
if (!GEngine) return;
// Separate BaseKey range so it never collides with DrawDebugHUD
const int32 BaseKey = 93700;
const float DisplayTime = 0.0f; // refreshed every tick
const FColor TitleColor = FColor::Cyan;
const FColor ValueColor = FColor::White;
const FColor HighlightColor = FColor::Yellow;
// Helper: format a single metric — shows "---" when not yet captured this turn
auto Fmt = [](float Ms) -> FString
{
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
};
// Title
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
TEXT("=== Latency HUD ==="));
// 1. User speech duration
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
FString::Printf(TEXT(" User speech: %s"), *Fmt(CurrentLatencies.UserSpeechMs)));
// 2. STT + Network: end of speech → server starts generating
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
FString::Printf(TEXT(" STT + Network: %s"), *Fmt(CurrentLatencies.STTToGenMs)));
// 3. LLM + TTS: server generating → first audio chunk received
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, ValueColor,
FString::Printf(TEXT(" LLM + TTS: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
// 4. Total round-trip: end of speech → first audio chunk
GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, HighlightColor,
FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs)));
// 5. Pre-buffer wait before playback
GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, ValueColor,
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
// 6. End-to-ear: end of speech → audio playback starts (user-perceived latency)
GEngine->AddOnScreenDebugMessage(BaseKey + 6, DisplayTime, HighlightColor,
FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs)));
}

View File

@ -231,6 +231,11 @@ public:
meta = (ClampMin = "0", ClampMax = "3", EditCondition = "bDebug"))
int32 DebugVerbosity = 1;
/** Show a separate latency debug HUD with detailed per-turn timing breakdown.
* Independent from bDebug can be enabled alone via CVar ps.ai.ConvAgent.Debug.Latency. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|Debug")
bool bDebugLatency = false;
// ── Events ────────────────────────────────────────────────────────────────
/** Fired when the WebSocket connection is established and the conversation session is ready. Provides the ConversationID and AgentID. */
@ -635,16 +640,19 @@ private:
double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating.
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
// Last-turn latency snapshot (ms) — updated per turn, displayed on debug HUD.
// Persists between turns so the HUD always shows the most recent measurement.
// Current-turn latency measurements (ms). Reset when the user starts a new turn
// (StopListening). Each field is populated as the corresponding event fires.
// Zero means "not yet measured this turn".
struct FDebugLatencies
{
float STTToGenMs = 0.0f; // TurnEnd → server starts generating
float GenToAudioMs = 0.0f; // Server generating → first audio chunk
float TotalMs = 0.0f; // TurnEnd → first audio chunk
float EndToEarMs = 0.0f; // TurnEnd → audio playback starts (user-perceived)
float UserSpeechMs = 0.0f; // How long the user spoke (TurnStart → TurnEnd)
float STTToGenMs = 0.0f; // TurnEnd → server starts generating (network + STT)
float GenToAudioMs = 0.0f; // Server generating → first audio chunk (LLM + TTS)
float TotalMs = 0.0f; // TurnEnd → first audio chunk (full round-trip)
float PreBufferMs = 0.0f; // Pre-buffer wait before playback
float EndToEarMs = 0.0f; // TurnEnd → audio playback starts (user-perceived)
};
FDebugLatencies LastLatencies;
FDebugLatencies CurrentLatencies;
// Accumulates incoming PCM bytes until the audio component needs data.
// Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every
@ -747,4 +755,5 @@ private:
/** Draw on-screen debug info (called from TickComponent when bDebug). */
void DrawDebugHUD() const;
void DrawLatencyHUD() const;
};