From 5fad6376bc8c89142cf7d24f9b4b93c175f1a81c Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 5 Mar 2026 18:39:49 +0100 Subject: [PATCH] Fix latency HUD: anchor all metrics to agent_response_started MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit user_transcript arrives AFTER agent_response_started in Server VAD mode (the server detects end of speech via VAD, starts generating immediately, and STT completes later). This caused Transcript>Gen to show stale values (19s) and Total < Gen>Audio (impossible). Now all metrics are anchored to GenerationStartTime (agent_response_started), which is the closest client-side proxy for "user stopped speaking": - Gen>Audio: generation start → first audio chunk (LLM + TTS) - Pre-buffer: wait before playback - Gen>Ear: generation start → playback starts (user-perceived) Removed STTToGenMs, TotalMs, EndToEarMs, UserSpeechMs (all depended on unreliable timestamps). Simpler, always correct, 3 clear metrics. Co-Authored-By: Claude Opus 4.6 --- .../PS_AI_ConvAgent_ElevenLabsComponent.cpp | 63 +++++-------------- .../PS_AI_ConvAgent_ElevenLabsComponent.h | 7 +-- 2 files changed, 19 insertions(+), 51 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp index 258dee5..f047027 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp @@ -166,9 +166,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnStartTime > 0.0) + if (GenerationStartTime > 0.0) { - CurrentLatencies.EndToEarMs = 
static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); + CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0); } if (PreBufferStartTime > 0.0) { @@ -1017,20 +1017,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAudioReceived(const TArray<uint8>& AudioData) - if (TurnStartTime > 0.0 && Now > TurnStartTime) - { - CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnStartTime) * 1000.0); - } - GenerationStartTime = Now; const double T = Now - SessionStartTime; @@ -1394,8 +1373,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>& AudioData) SilentTickCount = 0; // Latency capture (always, for HUD display). - if (TurnStartTime > 0.0) - CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnStartTime) * 1000.0); if (GenerationStartTime > 0.0) CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0); @@ -1438,9 +1415,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>& AudioData) AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnStartTime > 0.0) + if (GenerationStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); + CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0); } // No pre-buffer in this path (immediate playback). OnAudioPlaybackStarted.Broadcast(); @@ -1470,9 +1447,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>& AudioData) AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnStartTime > 0.0) + if (GenerationStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); + CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0); } if (PreBufferStartTime > 0.0) { @@ -2445,27 +2422,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const return (Ms > 0.0f) ? 
FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---")); }; - // Title — all times measured from first user_transcript received + // Title — all times measured from agent_response_started GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor, - TEXT("=== Latency (from transcript) ===")); + TEXT("=== Latency (from gen start) ===")); - // 1. Transcript → Generation: LLM think time after STT completed + // 1. Gen → Audio: generation start → first audio chunk (LLM + TTS) GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor, - FString::Printf(TEXT(" Transcript>Gen: %s"), *Fmt(CurrentLatencies.STTToGenMs))); - - // 2. Generation → Audio: LLM + TTS processing - GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs))); - // 3. Total: transcript → first audio chunk (full pipeline) - GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, - FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs))); - - // 4. Pre-buffer wait before playback - GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, ValueColor, + // 2. Pre-buffer wait before playback + GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); - // 5. End-to-ear: transcript → audio playback starts (user-perceived) - GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, HighlightColor, - FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs))); + // 3. 
Gen → Ear: generation start → playback starts (user-perceived total) + GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, + FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs))); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h index c3ccfee..055ea00 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h @@ -641,15 +641,14 @@ private: double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). // Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted. - // Each field is populated as the corresponding event fires. + // All anchored to GenerationStartTime (agent_response_started event), which is + // the closest client-side proxy for "user stopped speaking" in Server VAD mode. // Zero means "not yet measured this turn". struct FDebugLatencies { - float STTToGenMs = 0.0f; // First user_transcript → agent_response_started (LLM think time) float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS) - float TotalMs = 0.0f; // First user_transcript → first audio chunk (full pipeline) float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts - float EndToEarMs = 0.0f; // First user_transcript → audio playback starts (user-perceived) + float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived) }; FDebugLatencies CurrentLatencies;