From 0b190c3149f5b89c59de3fcfc55eccf72dbb9045 Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 5 Mar 2026 18:32:59 +0100 Subject: [PATCH] Rework latency HUD: base all measurements on first user_transcript MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous approach used TurnEndTime (from StopListening) which was never set in Server VAD mode. Now all latency measurements are anchored to TurnStartTime, captured when the first user_transcript arrives from the ElevenLabs server — the earliest client-side confirmation of user speech. Timeline: [user speaks] → STT → user_transcript(=T0) → agent_response_started → audio → playback Metrics shown: - Transcript>Gen: T0 → generation start (LLM think time) - Gen>Audio: generation start → first audio chunk (LLM + TTS) - Total: T0 → first audio chunk (full pipeline) - Pre-buffer: wait before playback - End-to-Ear: T0 → playback starts (user-perceived) Removed UserSpeechMs (unmeasurable without client-side VAD). 
Co-Authored-By: Claude Opus 4.6 --- .../PS_AI_ConvAgent_ElevenLabsComponent.cpp | 69 +++++++++++-------- .../PS_AI_ConvAgent_ElevenLabsComponent.h | 15 ++-- 2 files changed, 47 insertions(+), 37 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp index ed75d81..258dee5 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp @@ -166,9 +166,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnEndTime > 0.0) + if (TurnStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0); + CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); } if (PreBufferStartTime > 0.0) { @@ -279,7 +279,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel bHardTimeoutFired = bHardTimeout && !bAgentResponseReceived; bAgentSpeaking = false; bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted. - TurnStartTime = FPlatformTime::Seconds(); // Mark start of next potential user turn. bAgentResponseReceived = false; SilentTickCount = 0; bShouldBroadcastStopped = true; @@ -1018,6 +1017,20 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAudioReceived(const TArray 0.0 && TurnEndTime > TurnStartTime) + + // Transcript→Gen: time between first user_transcript received and generation start. + // In Server VAD, this captures the LLM "think time" after STT completed. 
+ // TurnStartTime is set in HandleTranscript when the first user_transcript arrives. + if (TurnStartTime > 0.0 && Now > TurnStartTime) { - CurrentLatencies.UserSpeechMs = static_cast<float>((TurnEndTime - TurnStartTime) * 1000.0); + CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnStartTime) * 1000.0); } GenerationStartTime = Now; - CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnEndTime) * 1000.0); const double T = Now - SessionStartTime; const double LatencyFromTurnEnd = Now - TurnEndTime; @@ -1378,8 +1394,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray SilentTickCount = 0; // Latency capture (always, for HUD display). - if (TurnEndTime > 0.0) - CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnEndTime) * 1000.0); + if (TurnStartTime > 0.0) + CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnStartTime) * 1000.0); if (GenerationStartTime > 0.0) CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0); @@ -1422,9 +1438,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnEndTime > 0.0) + if (TurnStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0); + CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); } // No pre-buffer in this path (immediate playback). 
OnAudioPlaybackStarted.Broadcast(); @@ -1454,9 +1470,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnEndTime > 0.0) + if (TurnStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0); + CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); } if (PreBufferStartTime > 0.0) { @@ -1517,7 +1533,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio() SilentTickCount = 0; bWasSpeaking = true; Now = FPlatformTime::Seconds(); - TurnStartTime = Now; // Mark start of next potential user turn. } } 
LLM + TTS: server generating → first audio chunk received - GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, ValueColor, - FString::Printf(TEXT(" LLM + TTS: %s"), *Fmt(CurrentLatencies.GenToAudioMs))); - - // 4. Total round-trip: end of speech → first audio chunk - GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, HighlightColor, + // 3. Total: transcript → first audio chunk (full pipeline) + GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs))); - // 5. Pre-buffer wait before playback - GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, ValueColor, + // 4. Pre-buffer wait before playback + GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, ValueColor, FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); - // 6. End-to-ear: end of speech → audio playback starts (user-perceived latency) - GEngine->AddOnScreenDebugMessage(BaseKey + 6, DisplayTime, HighlightColor, + // 5. End-to-ear: transcript → audio playback starts (user-perceived) + GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, HighlightColor, FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs))); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h index 4c42d1d..c3ccfee 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h @@ -640,17 +640,16 @@ private: double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating. double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). 
- // Current-turn latency measurements (ms). Reset when the user starts a new turn - // (StopListening). Each field is populated as the corresponding event fires. + // Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted. + // Each field is populated as the corresponding event fires. // Zero means "not yet measured this turn". struct FDebugLatencies { - float UserSpeechMs = 0.0f; // How long the user spoke (TurnStart → TurnEnd) - float STTToGenMs = 0.0f; // TurnEnd → server starts generating (network + STT) - float GenToAudioMs = 0.0f; // Server generating → first audio chunk (LLM + TTS) - float TotalMs = 0.0f; // TurnEnd → first audio chunk (full round-trip) - float PreBufferMs = 0.0f; // Pre-buffer wait before playback - float EndToEarMs = 0.0f; // TurnEnd → audio playback starts (user-perceived) + float STTToGenMs = 0.0f; // First user_transcript → agent_response_started (LLM think time) + float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS) + float TotalMs = 0.0f; // First user_transcript → first audio chunk (full pipeline) + float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts + float EndToEarMs = 0.0f; // First user_transcript → audio playback starts (user-perceived) }; FDebugLatencies CurrentLatencies;