diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp index 895bf07..9bcb28f 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Private/PS_AI_ConvAgent_ElevenLabsComponent.cpp @@ -26,6 +26,12 @@ static TAutoConsoleVariable CVarDebugElevenLabs( TEXT("Debug HUD for ElevenLabs. -1=use property, 0=off, 1-3=verbosity."), ECVF_Default); +static TAutoConsoleVariable CVarDebugLatency( + TEXT("ps.ai.ConvAgent.Debug.Latency"), + -1, + TEXT("Latency debug HUD. -1=use property, 0=off, 1=on."), + ECVF_Default); + // ───────────────────────────────────────────────────────────────────────────── // Constructor // ───────────────────────────────────────────────────────────────────────────── @@ -160,9 +166,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (bDebug && TurnEndTime > 0.0) + if (TurnEndTime > 0.0) { - LastLatencies.EndToEarMs = static_cast((PlaybackStartTime - TurnEndTime) * 1000.0); + CurrentLatencies.EndToEarMs = static_cast((PlaybackStartTime - TurnEndTime) * 1000.0); + } + if (PreBufferStartTime > 0.0) + { + CurrentLatencies.PreBufferMs = static_cast((PlaybackStartTime - PreBufferStartTime) * 1000.0); } OnAudioPlaybackStarted.Broadcast(); } @@ -308,6 +318,14 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::TickComponent(float DeltaTime, ELevel DrawDebugHUD(); } } + { + const int32 CVarVal = CVarDebugLatency.GetValueOnGameThread(); + const bool bShowLatency = (CVarVal >= 0) ? (CVarVal > 0) : bDebugLatency; + if (bShowLatency) + { + DrawLatencyHUD(); + } + } } // ───────────────────────────────────────────────────────────────────────────── @@ -576,6 +594,14 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::StopListening() } TurnEndTime = FPlatformTime::Seconds(); + + // Reset latency measurements for this new response cycle. + CurrentLatencies = FDebugLatencies(); + if (TurnStartTime > 0.0) + { + CurrentLatencies.UserSpeechMs = static_cast((TurnEndTime - TurnStartTime) * 1000.0); + } + // Start the response timeout clock — but only when the server hasn't already started // generating. When StopListening() is called from HandleAgentResponseStarted() as part // of collision avoidance, bAgentGenerating is already true, meaning the server IS already @@ -1058,9 +1084,9 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::HandleAgentResponseStarted() const double Now = FPlatformTime::Seconds(); GenerationStartTime = Now; - if (bDebug && TurnEndTime > 0.0) + if (TurnEndTime > 0.0) { - LastLatencies.STTToGenMs = static_cast((Now - TurnEndTime) * 1000.0); + CurrentLatencies.STTToGenMs = static_cast((Now - TurnEndTime) * 1000.0); } const double T = Now - SessionStartTime; @@ -1341,6 +1367,12 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray bQueueWasDry = false; SilentTickCount = 0; + // Latency capture (always, for HUD display). + if (TurnEndTime > 0.0) + CurrentLatencies.TotalMs = static_cast((AgentSpeakStart - TurnEndTime) * 1000.0); + if (GenerationStartTime > 0.0) + CurrentLatencies.GenToAudioMs = static_cast((AgentSpeakStart - GenerationStartTime) * 1000.0); + if (bDebug) { const double T = AgentSpeakStart - SessionStartTime; @@ -1348,12 +1380,6 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray UE_LOG(LogPS_AI_ConvAgent_ElevenLabs, Log, TEXT("[T+%.2fs] [Turn %d] Agent speaking — first audio chunk. (%.2fs after turn end)"), T, LastClosedTurnIndex, LatencyFromTurnEnd); - - // Update latency snapshot for HUD display. - if (TurnEndTime > 0.0) - LastLatencies.TotalMs = static_cast((AgentSpeakStart - TurnEndTime) * 1000.0); - if (GenerationStartTime > 0.0) - LastLatencies.GenToAudioMs = static_cast((AgentSpeakStart - GenerationStartTime) * 1000.0); } OnAgentStartedSpeaking.Broadcast(); @@ -1386,10 +1412,11 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (bDebug && TurnEndTime > 0.0) + if (TurnEndTime > 0.0) { - LastLatencies.EndToEarMs = static_cast((PlaybackStartTime - TurnEndTime) * 1000.0); + CurrentLatencies.EndToEarMs = static_cast((PlaybackStartTime - TurnEndTime) * 1000.0); } + // No pre-buffer in this path (immediate playback). OnAudioPlaybackStarted.Broadcast(); } } @@ -1417,9 +1444,13 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::EnqueueAgentAudio(const TArray AudioPlaybackComponent->Play(); } PlaybackStartTime = FPlatformTime::Seconds(); - if (bDebug && TurnEndTime > 0.0) + if (TurnEndTime > 0.0) { - LastLatencies.EndToEarMs = static_cast((PlaybackStartTime - TurnEndTime) * 1000.0); + CurrentLatencies.EndToEarMs = static_cast((PlaybackStartTime - TurnEndTime) * 1000.0); + } + if (PreBufferStartTime > 0.0) + { + CurrentLatencies.PreBufferMs = static_cast((PlaybackStartTime - PreBufferStartTime) * 1000.0); } OnAudioPlaybackStarted.Broadcast(); } @@ -2362,19 +2393,57 @@ void UPS_AI_ConvAgent_ElevenLabsComponent::DrawDebugHUD() const NetConnectedPawns.Num(), *SpeakerName)); } - // Latencies (from last completed turn) - if (LastLatencies.TotalMs > 0.0f) - { - GEngine->AddOnScreenDebugMessage(BaseKey + 8, DisplayTime, MainColor, - FString::Printf(TEXT(" Latency: total=%.0fms (stt>gen=%.0fms gen>audio=%.0fms) ear=%.0fms"), - LastLatencies.TotalMs, LastLatencies.STTToGenMs, - LastLatencies.GenToAudioMs, LastLatencies.EndToEarMs)); - } - // Reconnection - GEngine->AddOnScreenDebugMessage(BaseKey + 9, DisplayTime, + GEngine->AddOnScreenDebugMessage(BaseKey + 8, DisplayTime, bWantsReconnect ? FColor::Red : MainColor, FString::Printf(TEXT(" Reconnect: %d/%d attempts%s"), ReconnectAttemptCount, MaxReconnectAttempts, bWantsReconnect ? TEXT(" (ACTIVE)") : TEXT(""))); } + +void UPS_AI_ConvAgent_ElevenLabsComponent::DrawLatencyHUD() const +{ + if (!GEngine) return; + + // Separate BaseKey range so it never collides with DrawDebugHUD + const int32 BaseKey = 93700; + const float DisplayTime = 0.0f; // refreshed every tick + + const FColor TitleColor = FColor::Cyan; + const FColor ValueColor = FColor::White; + const FColor HighlightColor = FColor::Yellow; + + // Helper: format a single metric — shows "---" when not yet captured this turn + auto Fmt = [](float Ms) -> FString + { + return (Ms > 0.0f) ? FString::Printf(TEXT("%.0f ms"), Ms) : FString(TEXT("---")); + }; + + // Title + GEngine->AddOnScreenDebugMessage(BaseKey, DisplayTime, TitleColor, + TEXT("=== Latency HUD ===")); + + // 1. User speech duration + GEngine->AddOnScreenDebugMessage(BaseKey + 1, DisplayTime, ValueColor, + FString::Printf(TEXT(" User speech: %s"), *Fmt(CurrentLatencies.UserSpeechMs))); + + // 2. STT + Network: end of speech → server starts generating + GEngine->AddOnScreenDebugMessage(BaseKey + 2, DisplayTime, ValueColor, + FString::Printf(TEXT(" STT + Network: %s"), *Fmt(CurrentLatencies.STTToGenMs))); + + // 3. LLM + TTS: server generating → first audio chunk received + GEngine->AddOnScreenDebugMessage(BaseKey + 3, DisplayTime, ValueColor, + FString::Printf(TEXT(" LLM + TTS: %s"), *Fmt(CurrentLatencies.GenToAudioMs))); + + // 4. Total round-trip: end of speech → first audio chunk + GEngine->AddOnScreenDebugMessage(BaseKey + 4, DisplayTime, HighlightColor, + FString::Printf(TEXT(" Total: %s"), *Fmt(CurrentLatencies.TotalMs))); + + // 5. Pre-buffer wait before playback + GEngine->AddOnScreenDebugMessage(BaseKey + 5, DisplayTime, ValueColor, + FString::Printf(TEXT(" Pre-buffer: %s"), *Fmt(CurrentLatencies.PreBufferMs))); + + // 6. End-to-ear: end of speech → audio playback starts (user-perceived latency) + GEngine->AddOnScreenDebugMessage(BaseKey + 6, DisplayTime, HighlightColor, + FString::Printf(TEXT(" End-to-Ear: %s"), *Fmt(CurrentLatencies.EndToEarMs))); +} diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h index 9bff26b..4c42d1d 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_ConvAgent/Source/PS_AI_ConvAgent/Public/PS_AI_ConvAgent_ElevenLabsComponent.h @@ -231,6 +231,11 @@ public: meta = (ClampMin = "0", ClampMax = "3", EditCondition = "bDebug")) int32 DebugVerbosity = 1; + /** Show a separate latency debug HUD with detailed per-turn timing breakdown. + * Independent from bDebug — can be enabled alone via CVar ps.ai.ConvAgent.Debug.Latency. */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "PS AI ConvAgent|Debug") + bool bDebugLatency = false; + // ── Events ──────────────────────────────────────────────────────────────── /** Fired when the WebSocket connection is established and the conversation session is ready. Provides the ConversationID and AgentID. */ @@ -635,16 +640,19 @@ private: double GenerationStartTime = 0.0; // Set in HandleAgentResponseStarted — server starts generating. double PlaybackStartTime = 0.0; // Set when audio playback actually starts (post pre-buffer). - // Last-turn latency snapshot (ms) — updated per turn, displayed on debug HUD. - // Persists between turns so the HUD always shows the most recent measurement. + // Current-turn latency measurements (ms). Reset when the user starts a new turn + // (StopListening). Each field is populated as the corresponding event fires. + // Zero means "not yet measured this turn". struct FDebugLatencies { - float STTToGenMs = 0.0f; // TurnEnd → server starts generating - float GenToAudioMs = 0.0f; // Server generating → first audio chunk - float TotalMs = 0.0f; // TurnEnd → first audio chunk - float EndToEarMs = 0.0f; // TurnEnd → audio playback starts (user-perceived) + float UserSpeechMs = 0.0f; // How long the user spoke (TurnStart → TurnEnd) + float STTToGenMs = 0.0f; // TurnEnd → server starts generating (network + STT) + float GenToAudioMs = 0.0f; // Server generating → first audio chunk (LLM + TTS) + float TotalMs = 0.0f; // TurnEnd → first audio chunk (full round-trip) + float PreBufferMs = 0.0f; // Pre-buffer wait before playback + float EndToEarMs = 0.0f; // TurnEnd → audio playback starts (user-perceived) }; - FDebugLatencies LastLatencies; + FDebugLatencies CurrentLatencies; // Accumulates incoming PCM bytes until the audio component needs data. // Uses a read offset instead of RemoveAt(0,N) to avoid O(n) memmove every @@ -747,4 +755,5 @@ private: /** Draw on-screen debug info (called from TickComponent when bDebug). */ void DrawDebugHUD() const; + void DrawLatencyHUD() const; };