Fix latency HUD: anchor all metrics to agent_response_started
user_transcript arrives AFTER agent_response_started in Server VAD mode (the server detects end of speech via VAD, starts generating immediately, and STT completes later). This caused Transcript>Gen to show stale values (19s) and Total < Gen>Audio (impossible).

Now all metrics are anchored to GenerationStartTime (agent_response_started), which is the closest client-side proxy for "user stopped speaking":

- Gen>Audio: generation start → first audio chunk (LLM + TTS)
- Pre-buffer: wait before playback
- Gen>Ear: generation start → playback starts (user-perceived)

Removed STTToGenMs, TotalMs, EndToEarMs, UserSpeechMs (all depended on unreliable timestamps). Simpler, always correct, 3 clear metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0b190c3149
commit
5fad6376bc
@ -166,9 +166,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel
|
|||||||
AudioPlaybackComponent->Play();
|
AudioPlaybackComponent->Play();
|
||||||
}
|
}
|
||||||
PlaybackStartTime = FPlatformTime::Seconds();
|
PlaybackStartTime = FPlatformTime::Seconds();
|
||||||
if (TurnStartTime > 0.0)
|
if (GenerationStartTime > 0.0)
|
||||||
{
|
{
|
||||||
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
|
CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0);
|
||||||
}
|
}
|
||||||
if (PreBufferStartTime > 0.0)
|
if (PreBufferStartTime > 0.0)
|
||||||
{
|
{
|
||||||
@ -1017,20 +1017,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAudioReceived(const TArray<uint
|
|||||||
|
|
||||||
void UPS_AI_ConvAgent_ElevenLabsComponent::HandleTranscript(const FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs& Segment)
|
void UPS_AI_ConvAgent_ElevenLabsComponent::HandleTranscript(const FPS_AI_ConvAgent_TranscriptSegment_ElevenLabs& Segment)
|
||||||
{
|
{
|
||||||
// In Server VAD mode, user_transcript is the first server confirmation that the
|
|
||||||
// user spoke. Capture the time of the FIRST transcript this turn as TurnStartTime.
|
|
||||||
// This is after STT completes (not real speech start), but it's the best client-side
|
|
||||||
// approximation. The gap between TurnStartTime and GenerationStartTime then measures
|
|
||||||
// the LLM think time (transcript received → agent starts generating).
|
|
||||||
if (Segment.Speaker == TEXT("user"))
|
|
||||||
{
|
|
||||||
// "First transcript this turn" = TurnStartTime hasn't been updated since last generation.
|
|
||||||
if (GenerationStartTime <= 0.0 || TurnStartTime <= GenerationStartTime)
|
|
||||||
{
|
|
||||||
TurnStartTime = FPlatformTime::Seconds();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (bEnableUserTranscript)
|
if (bEnableUserTranscript)
|
||||||
{
|
{
|
||||||
OnAgentTranscript.Broadcast(Segment);
|
OnAgentTranscript.Broadcast(Segment);
|
||||||
@ -1103,16 +1089,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted()
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Reset all latency measurements — new response cycle starts here.
|
// Reset all latency measurements — new response cycle starts here.
|
||||||
|
// All metrics are anchored to GenerationStartTime (= now), which is the closest
|
||||||
|
// client-side proxy for "user stopped speaking" in Server VAD mode.
|
||||||
CurrentLatencies = FDebugLatencies();
|
CurrentLatencies = FDebugLatencies();
|
||||||
|
|
||||||
// Transcript→Gen: time between first user_transcript received and generation start.
|
|
||||||
// In Server VAD, this captures the LLM "think time" after STT completed.
|
|
||||||
// TurnStartTime is set in HandleTranscript when the first user_transcript arrives.
|
|
||||||
if (TurnStartTime > 0.0 && Now > TurnStartTime)
|
|
||||||
{
|
|
||||||
CurrentLatencies.STTToGenMs = static_cast<float>((Now - TurnStartTime) * 1000.0);
|
|
||||||
}
|
|
||||||
|
|
||||||
GenerationStartTime = Now;
|
GenerationStartTime = Now;
|
||||||
|
|
||||||
const double T = Now - SessionStartTime;
|
const double T = Now - SessionStartTime;
|
||||||
@ -1394,8 +1373,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
|||||||
SilentTickCount = 0;
|
SilentTickCount = 0;
|
||||||
|
|
||||||
// Latency capture (always, for HUD display).
|
// Latency capture (always, for HUD display).
|
||||||
if (TurnStartTime > 0.0)
|
|
||||||
CurrentLatencies.TotalMs = static_cast<float>((AgentSpeakStart - TurnStartTime) * 1000.0);
|
|
||||||
if (GenerationStartTime > 0.0)
|
if (GenerationStartTime > 0.0)
|
||||||
CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
|
CurrentLatencies.GenToAudioMs = static_cast<float>((AgentSpeakStart - GenerationStartTime) * 1000.0);
|
||||||
|
|
||||||
@ -1438,9 +1415,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
|||||||
AudioPlaybackComponent->Play();
|
AudioPlaybackComponent->Play();
|
||||||
}
|
}
|
||||||
PlaybackStartTime = FPlatformTime::Seconds();
|
PlaybackStartTime = FPlatformTime::Seconds();
|
||||||
if (TurnStartTime > 0.0)
|
if (GenerationStartTime > 0.0)
|
||||||
{
|
{
|
||||||
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
|
CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0);
|
||||||
}
|
}
|
||||||
// No pre-buffer in this path (immediate playback).
|
// No pre-buffer in this path (immediate playback).
|
||||||
OnAudioPlaybackStarted.Broadcast();
|
OnAudioPlaybackStarted.Broadcast();
|
||||||
@ -1470,9 +1447,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray<uint8>
|
|||||||
AudioPlaybackComponent->Play();
|
AudioPlaybackComponent->Play();
|
||||||
}
|
}
|
||||||
PlaybackStartTime = FPlatformTime::Seconds();
|
PlaybackStartTime = FPlatformTime::Seconds();
|
||||||
if (TurnStartTime > 0.0)
|
if (GenerationStartTime > 0.0)
|
||||||
{
|
{
|
||||||
CurrentLatencies.EndToEarMs = static_cast<float>((PlaybackStartTime - TurnStartTime) * 1000.0);
|
CurrentLatencies.GenToEarMs = static_cast<float>((PlaybackStartTime - GenerationStartTime) * 1000.0);
|
||||||
}
|
}
|
||||||
if (PreBufferStartTime > 0.0)
|
if (PreBufferStartTime > 0.0)
|
||||||
{
|
{
|
||||||
@ -2445,27 +2422,19 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const
|
|||||||
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
|
return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---"));
|
||||||
};
|
};
|
||||||
|
|
||||||
// Title — all times measured from first user_transcript received
|
// Title — all times measured from agent_response_started
|
||||||
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
|
GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor,
|
||||||
TEXT("=== Latency (from transcript) ==="));
|
TEXT("=== Latency (from gen start) ==="));
|
||||||
|
|
||||||
// 1. Transcript → Generation: LLM think time after STT completed
|
// 1. Gen → Audio: generation start → first audio chunk (LLM + TTS)
|
||||||
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
|
GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor,
|
||||||
FString::Printf(TEXT(" Transcript>Gen: %s"), *Fmt(CurrentLatencies.STTToGenMs)));
|
|
||||||
|
|
||||||
// 2. Generation → Audio: LLM + TTS processing
|
|
||||||
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
|
|
||||||
FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
|
FString::Printf(TEXT(" Gen>Audio: %s"), *Fmt(CurrentLatencies.GenToAudioMs)));
|
||||||
|
|
||||||
// 3. Total: transcript → first audio chunk (full pipeline)
|
// 2. Pre-buffer wait before playback
|
||||||
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
|
GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor,
|
||||||
FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs)));
|
|
||||||
|
|
||||||
// 4. Pre-buffer wait before playback
|
|
||||||
GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, ValueColor,
|
|
||||||
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
|
FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs)));
|
||||||
|
|
||||||
// 5. End-to-ear: transcript → audio playback starts (user-perceived)
|
// 3. Gen → Ear: generation start → playback starts (user-perceived total)
|
||||||
GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, HighlightColor,
|
GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, HighlightColor,
|
||||||
FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs)));
|
FString::Printf(TEXT(" Gen>Ear: %s"), *Fmt(CurrentLatencies.GenToEarMs)));
|
||||||
}
|
}
|
||||||
|
|||||||
@ -641,15 +641,14 @@ private:
|
|||||||
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
|
double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer).
|
||||||
|
|
||||||
// Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted.
|
// Current-turn latency measurements (ms). Reset in HandleAgentResponseStarted.
|
||||||
// Each field is populated as the corresponding event fires.
|
// All anchored to GenerationStartTime (agent_response_started event), which is
|
||||||
|
// the closest client-side proxy for "user stopped speaking" in Server VAD mode.
|
||||||
// Zero means "not yet measured this turn".
|
// Zero means "not yet measured this turn".
|
||||||
struct FDebugLatencies
|
struct FDebugLatencies
|
||||||
{
|
{
|
||||||
float STTToGenMs = 0.0f; // First user_transcript → agent_response_started (LLM think time)
|
|
||||||
float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS)
|
float GenToAudioMs = 0.0f; // agent_response_started → first audio chunk (LLM + TTS)
|
||||||
float TotalMs = 0.0f; // First user_transcript → first audio chunk (full pipeline)
|
|
||||||
float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts
|
float PreBufferMs = 0.0f; // Pre-buffer wait before playback starts
|
||||||
float EndToEarMs = 0.0f; // First user_transcript → audio playback starts (user-perceived)
|
float GenToEarMs = 0.0f; // agent_response_started → playback starts (user-perceived)
|
||||||
};
|
};
|
||||||
FDebugLatencies CurrentLatencies;
|
FDebugLatencies CurrentLatencies;
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user