Rework latency HUD: base all measurements on first user_transcript

Previous approach used TurnEndTime (set in StopListening), which was never
set in Server VAD mode. Now all latency measurements are anchored to
TurnStartTime, captured when the first user_transcript arrives from the
ElevenLabs server — the earliest client-side confirmation that the user spoke.

Timeline: [user speaks] → STT → user_transcript(=T0) → agent_response_started → audio → playback

Metrics shown:
- Transcript>Gen: T0 → generation start (LLM think time)
- Gen>Audio: generation start → first audio chunk (LLM + TTS)
- Total: T0 → first audio chunk (full pipeline)
- Pre-buffer: time spent buffering audio before playback begins
- End-to-Ear: T0 → playback starts (user-perceived)

Removed UserSpeechMs (unmeasurable without client-side VAD).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-05 18:32:59 +01:00
parent 56b072c45e
commit 0b190c3149
2 changed files with 47 additions and 37 deletions

View File

@ -166,9 +166,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
AudioPlaybackComponent->Play(); AudioPlaybackComponent->Play();
} }
PlaybackStartTime = FPlatformTime::Seconds(); PlaybackStartTime = FPlatformTime::Seconds();
if (TurnEndTime > 0.0) if (TurnStartTime > 0.0)
{ {
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0); CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
} }
if (PreBufferStartTime > 0.0) if (PreBufferStartTime > 0.0)
{ {
@ -279,7 +279,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
bHardTimeoutFired = bHardTimeout && !bAgentResponseReceived; bHardTimeoutFired = bHardTimeout && !bAgentResponseReceived;
bAgentSpeaking = false; bAgentSpeaking = false;
bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted. bPreBuffering = false; // Cancel pending pre-buffer to prevent stale OnAudioPlaybackStarted.
TurnStartTime = FPlatformTime::Seconds(); // Mark start of next potential user turn.
bAgentResponseReceived = false; bAgentResponseReceived = false;
SilentTickCount = 0; SilentTickCount = 0;
bShouldBroadcastStopped = true; bShouldBroadcastStopped = true;
@ -1018,6 +1017,20 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAudioReceived(const TArray<uint
void UPS_AI_ConvAgent_ElevenLabsComponent::HandleTranscript(const FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs& Segment) void UPS_AI_ConvAgent_ElevenLabsComponent::HandleTranscript(const FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs& Segment)
{ {
// In Server VAD mode, user_transcript is the first server confirmation that the
// user spoke. Capture the time of the FIRST transcript this turn as TurnStartTime.
// This is after STT completes (not real speech start), but it's the best client-side
// approximation. The gap between TurnStartTime and GenerationStartTime then measures
// the LLM think time (transcript received → agent starts generating).
if (Segment.Speaker == TEXT("user"))
{
// "First transcript this turn" = TurnStartTime hasn't been updated since last generation.
if (GenerationStartTime <= 0.0 || TurnStartTime <= GenerationStartTime)
{
TurnStartTime = FPlatformTime::Seconds();
}
}
if (bEnableUserTranscript) if (bEnableUserTranscript)
{ {
OnAgentTranscript.Broadcast(Segment); OnAgentTranscript.Broadcast(Segment);
@ -1091,13 +1104,16 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
// Reset all latency measurements — new response cycle starts here. // Reset all latency measurements — new response cycle starts here.
CurrentLatencies = FDebugLatencies(); CurrentLatencies = FDebugLatencies();
if (TurnStartTime > 0.0 && TurnEndTime > TurnStartTime)
// Transcript→Gen: time between first user_transcript received and generation start.
// In Server VAD, this captures the LLM "think time" after STT completed.
// TurnStartTime is set in HandleTranscript when the first user_transcript arrives.
if (TurnStartTime > 0.0 && Now > TurnStartTime)
{ {
CurrentLatencies.UserSpeechMs = static_cast<float>((TurnEndTime - TurnStartTime) * 1000.0); CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnStartTime) * 1000.0);
} }
GenerationStartTime = Now; GenerationStartTime = Now;
CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnEndTime) * 1000.0);
const double T = Now - SessionStartTime; const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = Now - TurnEndTime; const double LatencyFromTurnEnd = Now - TurnEndTime;
@ -1378,8 +1394,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
SilentTickCount = 0; SilentTickCount = 0;
// Latency capture (always, for HUD display). // Latency capture (always, for HUD display).
if (TurnEndTime > 0.0) if (TurnStartTime > 0.0)
CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnEndTime) * 1000.0); CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnStartTime) * 1000.0);
if (GenerationStartTime > 0.0) if (GenerationStartTime > 0.0)
CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0); CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
@ -1422,9 +1438,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
AudioPlaybackComponent->Play(); AudioPlaybackComponent->Play();
} }
PlaybackStartTime = FPlatformTime::Seconds(); PlaybackStartTime = FPlatformTime::Seconds();
if (TurnEndTime > 0.0) if (TurnStartTime > 0.0)
{ {
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0); CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
} }
// No pre-buffer in this path (immediate playback). // No pre-buffer in this path (immediate playback).
OnAudioPlaybackStarted.Broadcast(); OnAudioPlaybackStarted.Broadcast();
@ -1454,9 +1470,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
AudioPlaybackComponent->Play(); AudioPlaybackComponent->Play();
} }
PlaybackStartTime = FPlatformTime::Seconds(); PlaybackStartTime = FPlatformTime::Seconds();
if (TurnEndTime > 0.0) if (TurnStartTime > 0.0)
{ {
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0); CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
} }
if (PreBufferStartTime > 0.0) if (PreBufferStartTime > 0.0)
{ {
@ -1517,7 +1533,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopAgentAudio()
SilentTickCount = 0; SilentTickCount = 0;
bWasSpeaking = true; bWasSpeaking = true;
Now = FPlatformTime::Seconds(); Now = FPlatformTime::Seconds();
TurnStartTime = Now; // Mark start of next potential user turn.
} }
} }
@ -2430,31 +2445,27 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---")); return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
}; };
// Title // Title — all times measured from first user_transcript received
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor, GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
TEXT("=== Latency HUD ===")); TEXT("=== Latency (from transcript) ==="));
// 1. User speech duration // 1. Transcript → Generation: LLM think time after STT completed
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor, GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
FString::Printf(TEXT(" User speech: %s"), *Fmt(CurrentLatencies.UserSpeechMs))); FString::Printf(TEXT(" Transcript>Gen: %s"), *Fmt(CurrentLatencies.STTToGenMs)));
// 2. STT + Network: end of speech → server starts generating // 2. Generation → Audio: LLM + TTS processing
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
FString::Printf(TEXT(" STT + Network: %s"), *Fmt(CurrentLatencies.STTToGenMs))); FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
// 3. LLM + TTS: server generating → first audio chunk received // 3. Total: transcript → first audio chunk (full pipeline)
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, ValueColor, GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
FString::Printf(TEXT(" LLM + TTS: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
// 4. Total round-trip: end of speech → first audio chunk
GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, HighlightColor,
FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs))); FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs)));
// 5. Pre-buffer wait before playback // 4. Pre-buffer wait before playback
GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, ValueColor, GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, ValueColor,
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
// 6. End-to-ear: end of speech → audio playback starts (user-perceived latency) // 5. End-to-ear: transcript → audio playback starts (user-perceived)
GEngine->AddOnScreenDebugMessage(BaseKey + 6, DisplayTime, HighlightColor, GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, HighlightColor,
FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs))); FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs)));
} }

View File

@ -640,17 +640,16 @@ private:
double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating. double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating.
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
// Current-turn latency measurements (ms). Reset when the user starts a new turn // Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted.
// (StopListening). Each field is populated as the corresponding event fires. // Each field is populated as the corresponding event fires.
// Zero means "not yet measured this turn". // Zero means "not yet measured this turn".
struct FDebugLatencies struct FDebugLatencies
{ {
float UserSpeechMs = 0.0f; // How long the user spoke (TurnStart → TurnEnd) float STTToGenMs = 0.0f; // First user_transcript → agent_response_started (LLM think time)
float STTToGenMs = 0.0f; // TurnEnd → server starts generating (network + STT) float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS)
float GenToAudioMs = 0.0f; // Server generating → first audio chunk (LLM + TTS) float TotalMs = 0.0f; // First user_transcript → first audio chunk (full pipeline)
float TotalMs = 0.0f; // TurnEnd → first audio chunk (full round-trip) float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts
float PreBufferMs = 0.0f; // Pre-buffer wait before playback float EndToEarMs = 0.0f; // First user_transcript → audio playback starts (user-perceived)
float EndToEarMs = 0.0f; // TurnEnd → audio playback starts (user-perceived)
}; };
FDebugLatencies CurrentLatencies; FDebugLatencies CurrentLatencies;