From 5fad6376bc8c89142cf7d24f9b4b93c175f1a81c Mon Sep 17 00:00:00 2001 From: "j.foucher" Date: Thu, 5 Mar 2026 18:39:49 +0100 Subject: [PATCH] Fix latency HUD: anchor all metrics to agent_response_started MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit user_transcript arrives AFTER agent_response_started in Server VAD mode (the server detects end of speech via VAD, starts generating immediately, and STT completes later). This caused Transcript>Gen to show stale values (19s) and Total < Gen>Audio (impossible). Now all metrics are anchored to GenerationStartTime (agent_response_started), which is the closest client-side proxy for "user stopped speaking": - Gen>Audio: generation start → first audio chunk (LLM + TTS) - Pre-buffer: wait before playback - Gen>Ear: generation start → playback starts (user-perceived) Removed STTToGenMs, TotalMs, EndToEarMs, UserSpeechMs (all depended on unreliable timestamps). Simpler, always correct, 3 clear metrics. Co-Authored-By: Claude Opus 4.6 --- .../PS_AI_ConvAgent_ElevenLabsComponent.cpp | 63 +++++-------------- .../PS_AI_ConvAgent_ElevenLabsComponent.h | 7 +-- 2 files changed, 19 insertions(+), 51 deletions(-) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp index 258dee5..f047027 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp @@ -166,9 +166,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnStartTime > 0.0) + if (GenerationStartTime > 0.0) { - CurrentLatencies.EndToEarMs = 
static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); + CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0); } if (PreBufferStartTime > 0.0) { @@ -1017,20 +1017,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAudioReceived(const TArray<uint8>& AudioData) - if (TurnStartTime > 0.0 && Now > TurnStartTime) - { - CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnStartTime) * 1000.0); - } - GenerationStartTime = Now; const double T = Now - SessionStartTime; @@ -1394,8 +1373,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>& AudioData) SilentTickCount = 0; // Latency capture (always, for HUD display). - if (TurnStartTime > 0.0) - CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnStartTime) * 1000.0); if (GenerationStartTime > 0.0) CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0); @@ -1438,9 +1415,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>& AudioData) AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnStartTime > 0.0) + if (GenerationStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); + CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0); } // No pre-buffer in this path (immediate playback). OnAudioPlaybackStarted.Broadcast(); @@ -1470,9 +1447,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>& AudioData) AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (TurnStartTime > 0.0) + if (GenerationStartTime > 0.0) { - CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0); + CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0); } if (PreBufferStartTime > 0.0) { @@ -2445,27 +2422,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const return (Ms > 0.0f) ? 
FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---")); }; - // Title — all times measured from first user_transcript received + // Title — all times measured from agent_response_started GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor, - TEXT("=== Latency (from transcript) ===")); + TEXT("=== Latency (from gen start) ===")); - // 1. Transcript → Generation: LLM think time after STT completed + // 1. Gen → Audio: generation start → first audio chunk (LLM + TTS) GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor, - FString::Printf(TEXT(" Transcript>Gen: %s"), *Fmt(CurrentLatencies.STTToGenMs))); - - // 2. Generation → Audio: LLM + TTS processing - GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs))); - // 3. Total: transcript → first audio chunk (full pipeline) - GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, - FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs))); - - // 4. Pre-buffer wait before playback - GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, ValueColor, + // 2. Pre-buffer wait before playback + GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); - // 5. End-to-ear: transcript → audio playback starts (user-perceived) - GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, HighlightColor, - FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs))); + // 3. 
Gen → Ear: generation start → playback starts (user-perceived total) + GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor, + FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs))); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h index c3ccfee..055ea00 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h @@ -641,15 +641,14 @@ private: double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). // Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted. - // Each field is populated as the corresponding event fires. + // All anchored to GenerationStartTime (agent_response_started event), which is + // the closest client-side proxy for "user stopped speaking" in Server VAD mode. // Zero means "not yet measured this turn". struct FDebugLatencies { - float STTToGenMs = 0.0f; // First user_transcript → agent_response_started (LLM think time) float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS) - float TotalMs = 0.0f; // First user_transcript → first audio chunk (full pipeline) float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts - float EndToEarMs = 0.0f; // First user_transcript → audio playback starts (user-perceived) + float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived) }; FDebugLatencies CurrentLatencies;