Add response latency metrics to ElevenLabs debug HUD

Track 4 latencies per conversation turn (computed only when bDebug is active):
- STT→Gen: user stops talking → server starts generating
- Gen→Audio: server generating → first audio chunk received
- Total: user stops talking → first audio chunk (end-to-end)
- End-to-Ear: user stops talking → audio playback starts (includes pre-buffer)

New timestamps: GenerationStartTime (set in HandleAgentResponseStarted) and
PlaybackStartTime (set at the 3 OnAudioPlaybackStarted broadcast sites). The
latency values persist on the HUD between turns; the two new timestamps are
reset when a new turn starts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-03-05 17:51:25 +01:00
parent 6d4ef21269
commit d60f8d8484
2 changed files with 52 additions and 1 deletions

View File

@ -159,6 +159,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
{
AudioPlaybackComponent->Play();
}
PlaybackStartTime = FPlatformTime::Seconds();
if (bDebug && TurnEndTime > 0.0)
{
LastLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
}
OnAudioPlaybackStarted.Broadcast();
}
}
@ -449,6 +454,8 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StartListening()
}
}
bWaitingForAgentResponse = false; // New user turn — cancel any pending response timeout.
GenerationStartTime = 0.0;
PlaybackStartTime = 0.0;
++TurnIndex;
bIsListening = true;
TurnStartTime = FPlatformTime::Seconds();
@ -1050,6 +1057,12 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
}
const double Now = FPlatformTime::Seconds();
GenerationStartTime = Now;
if (bDebug && TurnEndTime > 0.0)
{
LastLatencies.STTToGenMs = static_cast<float>((Now - TurnEndTime) * 1000.0);
}
const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
if (bIsListening)
@ -1335,6 +1348,12 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log,
TEXT("[T+%.2fs] [Turn %d] Agent speaking — first audio chunk. (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
// Update latency snapshot for HUD display.
if (TurnEndTime > 0.0)
LastLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnEndTime) * 1000.0);
if (GenerationStartTime > 0.0)
LastLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
}
OnAgentStartedSpeaking.Broadcast();
@ -1366,6 +1385,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
{
AudioPlaybackComponent->Play();
}
PlaybackStartTime = FPlatformTime::Seconds();
if (bDebug && TurnEndTime > 0.0)
{
LastLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
}
OnAudioPlaybackStarted.Broadcast();
}
}
@ -1392,6 +1416,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
{
AudioPlaybackComponent->Play();
}
PlaybackStartTime = FPlatformTime::Seconds();
if (bDebug && TurnEndTime > 0.0)
{
LastLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnEndTime) * 1000.0);
}
OnAudioPlaybackStarted.Broadcast();
}
SilentTickCount = 0;
@ -2333,8 +2362,17 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const
NetConnectedPawns.Num(), *SpeakerName));
}
// Latencies (from last completed turn)
if (LastLatencies.TotalMs > 0.0f)
{
GEngine->AddOnScreenDebugMessage(BaseKey + 8, DisplayTime, MainColor,
FString::Printf(TEXT(" Latency: total=%.0fms (stt>gen=%.0fms gen>audio=%.0fms) ear=%.0fms"),
LastLatencies.TotalMs, LastLatencies.STTToGenMs,
LastLatencies.GenToAudioMs, LastLatencies.EndToEarMs));
}
// Reconnection
GEngine->AddOnScreenDebugMessage(BaseKey + 8, DisplayTime,
GEngine->AddOnScreenDebugMessage(BaseKey + 9, DisplayTime,
bWantsReconnect ? FColor::Red : MainColor,
FString::Printf(TEXT(" Reconnect: %d/%d attempts%s"),
ReconnectAttemptCount, MaxReconnectAttempts,

View File

@ -632,6 +632,19 @@ private:
double TurnStartTime = 0.0; // Set in StartListening — when mic opens.
double TurnEndTime = 0.0; // Set in StopListening — when mic closes.
double AgentSpeakStart = 0.0; // Set in EnqueueAgentAudio (first chunk) — when audio begins.
double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating.
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
// Last-turn latency snapshot (ms) — updated per turn, displayed on debug HUD.
// Persists between turns so the HUD always shows the most recent measurement.
struct FDebugLatencies
{
	// Every field is a duration in milliseconds and stays at zero until a
	// measurement is written into it.

	// Mic close (TurnEndTime) to the moment the server reports it started generating.
	float STTToGenMs{0.0f};

	// Server generation start to arrival of the first audio chunk.
	float GenToAudioMs{0.0f};

	// Mic close (TurnEndTime) to arrival of the first audio chunk.
	float TotalMs{0.0f};

	// Mic close (TurnEndTime) to the point audio playback actually starts.
	float EndToEarMs{0.0f};
};
FDebugLatencies LastLatencies;
// Accumulates incoming PCM bytes until the audio component needs data.
// Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every