Fix latency HUD: anchor all metrics to agent_response_started

In Server VAD mode, user_transcript arrives AFTER agent_response_started:
the server detects end of speech via VAD, starts generating immediately,
and STT completes later. This caused Transcript>Gen to show stale values
from the previous turn (e.g. 19 s) and Total < Gen>Audio, which is
impossible for the full-pipeline metric.
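
A minimal standalone sketch (hypothetical timestamps, not the component
code) of how a late user_transcript produces the inverted reading:

    #include <cstdio>

    int main()
    {
        // Server VAD delivery order: agent_response_started fires before the
        // user_transcript of the same turn, so TurnStartTime lands AFTER
        // GenerationStartTime and anything anchored to it is skewed.
        const double GenerationStartTime = 10.0; // agent_response_started
        const double TurnStartTime       = 10.4; // user_transcript (STT finishes late)
        const double AgentSpeakStart     = 10.9; // first audio chunk received

        const double GenToAudioMs = (AgentSpeakStart - GenerationStartTime) * 1000.0; // 900 ms
        const double TotalMs      = (AgentSpeakStart - TurnStartTime) * 1000.0;       // 500 ms

        // Prints "Total 500 ms < Gen>Audio 900 ms": the impossible reading.
        std::printf("Total %.0f ms < Gen>Audio %.0f ms\n", TotalMs, GenToAudioMs);
        return 0;
    }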

Now all metrics are anchored to GenerationStartTime (agent_response_started),
which is the closest client-side proxy for "user stopped speaking":

- Gen>Audio: generation start → first audio chunk (LLM + TTS)
- Pre-buffer: wait before playback
- Gen>Ear: generation start → playback starts (user-perceived)
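
A sketch of how the three surviving metrics compose on the pre-buffered
path (hypothetical values; this assumes the pre-buffer wait starts when
the first chunk arrives, whereas the component measures it from
PreBufferStartTime):

    #include <cassert>

    int main()
    {
        const double GenerationStartTime = 10.0; // agent_response_started
        const double FirstAudioTime      = 10.9; // first audio chunk arrives
        const double PlaybackStartTime   = 11.1; // playback begins after pre-buffer

        const double GenToAudioMs = (FirstAudioTime - GenerationStartTime) * 1000.0;    // 900 ms
        const double PreBufferMs  = (PlaybackStartTime - FirstAudioTime) * 1000.0;      // 200 ms
        const double GenToEarMs   = (PlaybackStartTime - GenerationStartTime) * 1000.0; // 1100 ms

        // With a single shared anchor the stages add up, so Gen>Ear can never
        // be smaller than Gen>Audio: the invariant the old TotalMs violated.
        assert(GenToEarMs >= GenToAudioMs);
        return 0;
    }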

Removed STTToGenMs, TotalMs, EndToEarMs, UserSpeechMs (all depended on
unreliable timestamps). Simpler, always correct, 3 clear metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
j.foucher 2026-03-05 18:39:49 +01:00
parent 0b190c3149
commit 5fad6376bc
2 changed files with 19 additions and 51 deletions

@@ -166,9 +166,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
             AudioPlaybackComponent->Play();
         }
         PlaybackStartTime = FPlatformTime::Seconds();
-        if (TurnStartTime > 0.0)
+        if (GenerationStartTime > 0.0)
         {
-            CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
+            CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0);
         }
         if (PreBufferStartTime > 0.0)
         {
@@ -1017,20 +1017,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAudioReceived(const TArray<uint
 void UPS_AI_ConvAgent_ElevenLabsComponent::HandleTranscript(const FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs& Segment)
 {
-    // In Server VAD mode, user_transcript is the first server confirmation that the
-    // user spoke. Capture the time of the FIRST transcript this turn as TurnStartTime.
-    // This is after STT completes (not real speech start), but it's the best client-side
-    // approximation. The gap between TurnStartTime and GenerationStartTime then measures
-    // the LLM think time (transcript received → agent starts generating).
-    if (Segment.Speaker == TEXT("user"))
-    {
-        // "First transcript this turn" = TurnStartTime hasn't been updated since last generation.
-        if (GenerationStartTime <= 0.0 || TurnStartTime <= GenerationStartTime)
-        {
-            TurnStartTime = FPlatformTime::Seconds();
-        }
-    }
-
     if (bEnableUserTranscript)
     {
         OnAgentTranscript.Broadcast(Segment);
     }
@@ -1103,16 +1089,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
     }
 
     // Reset all latency measurements — new response cycle starts here.
+    // All metrics are anchored to GenerationStartTime (= now), which is the closest
+    // client-side proxy for "user stopped speaking" in Server VAD mode.
     CurrentLatencies = FDebugLatencies();
-
-    // Transcript→Gen: time between first user_transcript received and generation start.
-    // In Server VAD, this captures the LLM "think time" after STT completed.
-    // TurnStartTime is set in HandleTranscript when the first user_transcript arrives.
-    if (TurnStartTime > 0.0 && Now > TurnStartTime)
-    {
-        CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnStartTime) * 1000.0);
-    }
-
     GenerationStartTime = Now;
 
     const double T = Now - SessionStartTime;
@@ -1394,8 +1373,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
     SilentTickCount = 0;
 
     // Latency capture (always, for HUD display).
-    if (TurnStartTime > 0.0)
-        CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnStartTime) * 1000.0);
     if (GenerationStartTime > 0.0)
         CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
 
@@ -1438,9 +1415,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
             AudioPlaybackComponent->Play();
         }
         PlaybackStartTime = FPlatformTime::Seconds();
-        if (TurnStartTime > 0.0)
+        if (GenerationStartTime > 0.0)
        {
-            CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
+            CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0);
         }
         // No pre-buffer in this path (immediate playback).
         OnAudioPlaybackStarted.Broadcast();
@@ -1470,9 +1447,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
             AudioPlaybackComponent->Play();
         }
         PlaybackStartTime = FPlatformTime::Seconds();
-        if (TurnStartTime > 0.0)
+        if (GenerationStartTime > 0.0)
         {
-            CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
+            CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0);
         }
         if (PreBufferStartTime > 0.0)
         {
@@ -2445,27 +2422,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
         return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
     };
 
-    // Title — all times measured from first user_transcript received
+    // Title — all times measured from agent_response_started
     GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
-        TEXT("=== Latency (from transcript) ==="));
+        TEXT("=== Latency (from gen start) ==="));
 
-    // 1. Transcript → Generation: LLM think time after STT completed
+    // 1. Gen → Audio: generation start → first audio chunk (LLM + TTS)
     GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
-        FString::Printf(TEXT(" Transcript>Gen: %s"), *Fmt(CurrentLatencies.STTToGenMs)));
-
-    // 2. Generation → Audio: LLM + TTS processing
-    GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
         FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
 
-    // 3. Total: transcript → first audio chunk (full pipeline)
-    GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
-        FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs)));
-
-    // 4. Pre-buffer wait before playback
-    GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, ValueColor,
+    // 2. Pre-buffer wait before playback
+    GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
         FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
 
-    // 5. End-to-ear: transcript → audio playback starts (user-perceived)
-    GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, HighlightColor,
-        FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs)));
+    // 3. Gen → Ear: generation start → playback starts (user-perceived total)
+    GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
+        FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs)));
 }

@@ -641,15 +641,14 @@ private:
     double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
 
     // Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted.
-    // Each field is populated as the corresponding event fires.
+    // All anchored to GenerationStartTime (agent_response_started event), which is
+    // the closest client-side proxy for "user stopped speaking" in Server VAD mode.
    // Zero means "not yet measured this turn".
     struct FDebugLatencies
     {
-        float STTToGenMs = 0.0f; // First user_transcript → agent_response_started (LLM think time)
         float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS)
-        float TotalMs = 0.0f; // First user_transcript → first audio chunk (full pipeline)
         float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts
-        float EndToEarMs = 0.0f; // First user_transcript → audio playback starts (user-perceived)
+        float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived)
     };
 
     FDebugLatencies CurrentLatencies;