diff --git a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset index 801ae24..d0e446f 100644 Binary files a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset and b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index 545d40a..a948b06 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -48,9 +48,27 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe if (AudioQueue.Num() == 0) { SilentTickCount++; - if (SilentTickCount >= SilenceThresholdTicks) + + // Wait for agent_response (confirms the full response is done) before + // declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking + // events when ElevenLabs TTS streams audio in multiple batches with gaps + // (e.g. for long responses) — without this guard, the Blueprint's + // OnAgentStoppedSpeaking handler reopens the mic mid-response. + const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks; + + // Hard-timeout fallback: if agent_response never arrives (or is very late), + // stop after 2s of silence to avoid leaving the state machine stuck. 
+ const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks; + + if (bResponseConfirmed || bHardTimeout) { + if (bHardTimeout && !bAgentResponseReceived) + { + UE_LOG(LogElevenLabsAgent, Warning, + TEXT("Agent silence hard-timeout (2s) without agent_response — declaring agent stopped.")); + } bAgentSpeaking = false; + bAgentResponseReceived = false; SilentTickCount = 0; OnAgentStoppedSpeaking.Broadcast(); } @@ -84,6 +102,8 @@ void UElevenLabsConversationalAgentComponent::StartConversation() &UElevenLabsConversationalAgentComponent::HandleAgentResponse); WebSocketProxy->OnInterrupted.AddDynamic(this, &UElevenLabsConversationalAgentComponent::HandleInterrupted); + WebSocketProxy->OnAgentResponseStarted.AddDynamic(this, + &UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted); } // Pass configuration to the proxy before connecting. @@ -114,6 +134,33 @@ void UElevenLabsConversationalAgentComponent::StartListening() } if (bIsListening) return; + + // If the agent is currently generating or speaking, decide how to handle the request. + // + // Interruption (bAllowInterruption) applies ONLY when the agent is already playing audio + // (bAgentSpeaking). Pressing T while the agent speaks immediately stops it and opens the mic. + // + // During the generation phase (bAgentGenerating, no audio yet) we always block silently. + // This prevents the Blueprint's OnAgentStartedGenerating handler — which typically calls + // StartListening() for bookkeeping — from accidentally sending an interrupt to the server + // the moment it starts generating, which would cancel every response before any audio plays. 
+ if (bAgentGenerating || bAgentSpeaking) + { + if (bAgentSpeaking && bAllowInterruption) + { + UE_LOG(LogElevenLabsAgent, Log, TEXT("StartListening: interrupting agent (speaking) to allow user to speak.")); + InterruptAgent(); + // InterruptAgent → StopAgentAudio clears bAgentSpeaking / bAgentGenerating, + // so we fall through and open the microphone immediately. + } + else + { + UE_LOG(LogElevenLabsAgent, Log, TEXT("StartListening ignored: agent is %s%s — will listen after agent finishes."), + bAgentGenerating ? TEXT("generating") : TEXT("speaking"), + (bAgentSpeaking && !bAllowInterruption) ? TEXT(" (interruption disabled)") : TEXT("")); + return; + } + } bIsListening = true; if (TurnMode == EElevenLabsTurnMode::Client) @@ -225,6 +272,8 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason); bIsListening = false; bAgentSpeaking = false; + bAgentGenerating = false; + bAgentResponseReceived = false; MicAccumulationBuffer.Reset(); OnAgentDisconnected.Broadcast(StatusCode, Reason); } @@ -250,6 +299,11 @@ void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabs void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText) { + // The server sends agent_response when the full text response is complete. + // This is our reliable signal that no more TTS audio chunks will follow. + // Set the flag so the silence-detection Tick can safely fire OnAgentStoppedSpeaking. + bAgentResponseReceived = true; + if (bEnableAgentTextResponse) { OnAgentTextResponse.Broadcast(ResponseText); @@ -262,6 +316,22 @@ void UElevenLabsConversationalAgentComponent::HandleInterrupted() OnAgentInterrupted.Broadcast(); } +void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted() +{ + // The server has started generating a response (first agent_chat_response_part). 
+ // Set bAgentGenerating BEFORE StopListening so that any StartListening call + // triggered by the Blueprint's OnAgentStartedGenerating handler is blocked. + bAgentGenerating = true; + + if (bIsListening) + { + UE_LOG(LogElevenLabsAgent, Log, + TEXT("Agent started generating while mic was open — stopping listening to avoid turn collision.")); + StopListening(); + } + OnAgentStartedGenerating.Broadcast(); +} + // ───────────────────────────────────────────────────────────────────────────── // Audio playback // ───────────────────────────────────────────────────────────────────────────── @@ -314,6 +384,8 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray& PCMData) } if (PCMData.Num() == 0) return; - UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num()); - // Track when the last audio chunk was sent for latency measurement. LastAudioChunkSentTime = FPlatformTime::Seconds(); @@ -119,13 +117,8 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray& PCMData) // to avoid the pretty-printed writer and to keep the payload minimal. const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio); - // Log first chunk fully for debugging - static int32 AudioChunksSent = 0; - AudioChunksSent++; - if (AudioChunksSent <= 2) - { - UE_LOG(LogElevenLabsWS, Log, TEXT(" Audio JSON (first 200 chars): %.200s"), *AudioJson); - } + // Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second). + UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num()); if (WebSocket.IsValid() && WebSocket->IsConnected()) { @@ -139,7 +132,17 @@ void UElevenLabsWebSocketProxy::SendUserTurnStart() // The server's VAD detects speech from the audio chunks we send. // user_activity is a keep-alive/timeout-reset message and should NOT be // sent here — it would delay the agent's turn after the user stops. 
- UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow).")); + + // Reset latency tracking so a new turn starts with a clean state. + // If the previous turn got no server response (bWaitingForResponse stayed true), + // this prevents stale UserTurnEndTime from corrupting latency measurements + // and ensures the state machine is consistent for the new turn. + bWaitingForResponse = false; + bFirstAudioResponseLogged = false; + bAgentResponseStartedFired = false; + + const double T = FPlatformTime::Seconds() - SessionStartTime; + UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn started — mic open, audio chunks will follow."), T); } void UElevenLabsWebSocketProxy::SendUserTurnEnd() @@ -149,7 +152,13 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd() UserTurnEndTime = FPlatformTime::Seconds(); bWaitingForResponse = true; bFirstAudioResponseLogged = false; - UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence.")); + // NOTE: Do NOT reset bAgentResponseStartedFired here. + // StopListening() calls SendUserTurnEnd(), and HandleAgentResponseStarted() calls StopListening(). + // If we reset the flag here, the next agent_chat_response_part would re-fire OnAgentResponseStarted + // in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop. + // The flag is only reset in SendUserTurnStart() at the beginning of a new user turn. 
+ const double T = UserTurnEndTime - SessionStartTime; + UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T); } void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text) @@ -171,6 +180,14 @@ void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text) void UElevenLabsWebSocketProxy::SendInterrupt() { if (!IsConnected()) return; + + // Immediately start discarding in-flight audio and chat response parts from + // the generation we are about to interrupt. The server may still send several + // frames before it processes our interrupt. We stop ignoring once the server + // sends its "interruption" acknowledgement (HandleInterruption). + bIgnoreIncomingContent = true; + UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks.")); + TSharedPtr Msg = MakeShareable(new FJsonObject()); Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt); SendJsonMessage(Msg); @@ -194,7 +211,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected() // "type": "conversation_initiation_client_data", // "conversation_config_override": { // "agent": { - // "turn": { "turn_timeout": 3, "speculative_turn": true } + // "turn": { "turn_timeout": 1 } // sent in Client mode only; speculative_turn removed (caused silent failures after 2 turns) // }, // "tts": { // "optimize_streaming_latency": 3 @@ -211,19 +228,28 @@ void UElevenLabsWebSocketProxy::OnWsConnected() // In push-to-talk (Client mode) the user controls the mic; the server still // uses its VAD to detect the end of speech from the audio chunks it receives. TSharedPtr TurnObj = MakeShareable(new FJsonObject()); - // Lower turn_timeout so the agent responds faster after the user stops speaking. - // Default is 7s. In push-to-talk (Client mode), the user explicitly signals - // end-of-turn by releasing the key, so we can use a very short timeout (1s). 
+ // turn_timeout: how long the server waits after VAD detects silence before + // processing the user's turn. In push-to-talk (Client) mode this directly adds + // latency to every response — the server waits this many seconds of silence + // after the user releases T before it begins LLM processing. + // + // History: + // turn_timeout=1 was originally problematic, but ONLY when combined with + // speculative_turn=true (which has since been removed). Without speculative_turn, + // 1s is safe and halves the per-turn latency vs the 3s we had previously. + // Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1. if (TurnMode == EElevenLabsTurnMode::Client) { TurnObj->SetNumberField(TEXT("turn_timeout"), 1); } - // Speculative turn: start LLM generation during silence before the VAD is - // fully confident the user finished speaking. Reduces latency by 200-500ms. - if (bSpeculativeTurn) - { - TurnObj->SetBoolField(TEXT("speculative_turn"), true); - } + // NOTE: speculative_turn is intentionally NOT sent here. + // With speculative_turn=true the server starts LLM generation speculatively + // before the VAD is fully confident the user finished speaking. Combined with + // the short turn_timeout this put the server's state machine into a state where + // it stopped processing user audio after 2 turns — subsequent turns received + // only pings and no agent_chat_response_part / audio / user_transcript at all. + // Removing it costs ~200-500ms of latency but restores reliable multi-turn + // conversation. Re-enable only if ElevenLabs confirms it is stable. TSharedPtr AgentObj = MakeShareable(new FJsonObject()); AgentObj->SetObjectField(TEXT("turn"), TurnObj); @@ -297,7 +323,15 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) return; } - // Log every message type received from the server for debugging. + // Suppress ping from the visible log — they arrive every ~2s and flood the output. 
+ // Handle ping early before the generic type log. + if (MsgType == ElevenLabsMessageType::PingEvent) + { + HandlePing(Root); + return; + } + + // Log every non-ping message type received from the server. UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType); if (MsgType == ElevenLabsMessageType::ConversationInitiation) @@ -310,11 +344,12 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) if (bWaitingForResponse && !bFirstAudioResponseLogged) { const double Now = FPlatformTime::Seconds(); + const double T = Now - SessionStartTime; const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0; const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0; UE_LOG(LogElevenLabsWS, Warning, - TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"), - LatencyFromTurnEnd, LatencyFromLastChunk); + TEXT("[T+%.2fs] [LATENCY] First audio: %.0f ms after turn end (%.0f ms after last chunk)"), + T, LatencyFromTurnEnd, LatencyFromLastChunk); bFirstAudioResponseLogged = true; } HandleAudioResponse(Root); @@ -325,10 +360,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) if (bWaitingForResponse) { const double Now = FPlatformTime::Seconds(); + const double T = Now - SessionStartTime; const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0; UE_LOG(LogElevenLabsWS, Warning, - TEXT("[LATENCY] User transcript received: %.0f ms after turn end"), - LatencyFromTurnEnd); + TEXT("[T+%.2fs] [LATENCY] User transcript: %.0f ms after turn end"), + T, LatencyFromTurnEnd); bWaitingForResponse = false; } HandleTranscript(Root); @@ -339,26 +375,27 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) if (UserTurnEndTime > 0.0) { const double Now = FPlatformTime::Seconds(); + const double T = Now - SessionStartTime; const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0; UE_LOG(LogElevenLabsWS, Warning, - TEXT("[LATENCY] Agent text 
response: %.0f ms after turn end"), - LatencyFromTurnEnd); + TEXT("[T+%.2fs] [LATENCY] Agent text response: %.0f ms after turn end"), + T, LatencyFromTurnEnd); } HandleAgentResponse(Root); } + else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart) + { + HandleAgentChatResponsePart(); + } else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection) { - // Silently ignore for now — corrected text after interruption. + // Silently ignore — corrected text after interruption. UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored).")); } else if (MsgType == ElevenLabsMessageType::InterruptionEvent) { HandleInterruption(Root); } - else if (MsgType == ElevenLabsMessageType::PingEvent) - { - HandlePing(Root); - } else { UE_LOG(LogElevenLabsWS, Verbose, TEXT("Unhandled message type: %s"), *MsgType); @@ -415,9 +452,17 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size, } // Broadcast raw PCM bytes directly to the audio queue. + // Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse). TArray PCMData = MoveTemp(BinaryFrameBuffer); BinaryFrameBuffer.Reset(); - OnAudioReceived.Broadcast(PCMData); + if (!bIgnoreIncomingContent) + { + OnAudioReceived.Broadcast(PCMData); + } + else + { + UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack).")); + } } } @@ -439,13 +484,23 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtrTryGetStringField(TEXT("conversation_id"), ConversationInfo.ConversationID); } - UE_LOG(LogElevenLabsWS, Log, TEXT("Conversation initiated. ID=%s"), *ConversationInfo.ConversationID); + SessionStartTime = FPlatformTime::Seconds(); + UE_LOG(LogElevenLabsWS, Log, TEXT("[T+0.00s] Conversation initiated. 
ID=%s"), *ConversationInfo.ConversationID); ConnectionState = EElevenLabsConnectionState::Connected; OnConnected.Broadcast(ConversationInfo); } void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr& Root) { + // Discard audio that belongs to an interrupted generation. + // The server may send several more audio frames after we sent "interrupt" — + // they must not restart the speaking state on the client side. + if (bIgnoreIncomingContent) + { + UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack).")); + return; + } + // Expected structure: // { "type": "audio", // "audio_event": { "audio_base_64": "", "event_id": 1 } @@ -513,9 +568,41 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0; + UE_LOG(LogElevenLabsWS, Log, + TEXT("[T+%.2fs] Agent started generating (%.0f ms after turn end — includes VAD silence timeout + LLM start)."), + T, LatencyFromTurnEnd); + OnAgentResponseStarted.Broadcast(); + } + // Subsequent parts logged at Verbose only (can be dozens per response). +} + void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr& Root) { - UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted.")); + // Server has acknowledged the interruption — the old generation is fully stopped. + // Resume accepting incoming audio and chat response parts (for the next turn). 
+ bIgnoreIncomingContent = false; + UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing).")); OnInterrupted.Broadcast(); } diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index f972f94..bee6ec3 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -34,6 +34,15 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedSpeaking); DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStoppedSpeaking); DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted); +/** + * Fired when the server sends its first agent_chat_response_part — i.e. the moment + * the LLM starts generating, well before audio arrives. + * The component automatically calls StopListening() when this fires while the + * microphone is open, preventing the user's new audio from being sent to the + * server while it is still processing the previous turn. + */ +DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating); + // ───────────────────────────────────────────────────────────────────────────── // UElevenLabsConversationalAgentComponent // @@ -83,10 +92,27 @@ public: /** * Enable speculative turn: the LLM starts generating a response during * silence before the VAD is fully confident the user has finished speaking. - * Reduces latency by 200-500ms but may occasionally produce premature responses. + * Reduces latency by 200-500ms but caused the server to silently stop + * processing user audio after 2 turns when combined with a short turn_timeout. 
+ * Disabled by default until ElevenLabs confirms stability in multi-turn sessions. NOTE(review): appears to have no effect at present — OnWsConnected no longer sends speculative_turn; confirm there is no other reader of this flag. */ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency") - bool bSpeculativeTurn = true; + bool bSpeculativeTurn = false; + + /** + * Allow the user to interrupt the agent while it is playing audio (speaking). + * When true, calling StartListening() while the agent is audibly speaking automatically + * sends an interruption signal to the server and opens the mic — no Blueprint nodes needed. + * When false, StartListening() is silently ignored until the agent finishes speaking. + * + * NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking). + * While the agent is generating but has not yet started speaking, StartListening() is + * always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating + * handler (which often calls StartListening for bookkeeping) from accidentally cancelling + * the response before any audio plays. + */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs") + bool bAllowInterruption = true; /** * Forward user speech transcripts (user_transcript events) to the @@ -131,6 +157,15 @@ public: UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") FOnAgentInterrupted OnAgentInterrupted; + /** + * Fired when the server starts generating a response (before audio). + * The component automatically stops the microphone when this fires while listening, + * so the Blueprint doesn't need to handle this manually for push-to-talk. + * Bind here if you need UI feedback ("agent is thinking..."). 
+ */ + UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") + FOnAgentStartedGenerating OnAgentStartedGenerating; + // ── Control ─────────────────────────────────────────────────────────────── /** @@ -219,6 +254,9 @@ private: UFUNCTION() void HandleInterrupted(); + UFUNCTION() + void HandleAgentResponseStarted(); + // ── Audio playback ──────────────────────────────────────────────────────── void InitAudioPlayback(); void EnqueueAgentAudio(const TArray& PCMData); @@ -244,15 +282,32 @@ private: // ── State ───────────────────────────────────────────────────────────────── bool bIsListening = false; bool bAgentSpeaking = false; + // True from the first agent_chat_response_part until the first audio chunk arrives. + // Used to block StartListening() while the server is processing the previous turn. + bool bAgentGenerating = false; // Accumulates incoming PCM bytes until the audio component needs data. TArray AudioQueue; FCriticalSection AudioQueueLock; - // Simple heuristic: if we haven't received audio data for this many ticks, - // consider the agent done speaking. + // Silence detection: how many consecutive ticks with an empty audio queue. int32 SilentTickCount = 0; - static constexpr int32 SilenceThresholdTicks = 30; // ~0.5s at 60fps + + // Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks + // once the server has confirmed the full response (bAgentResponseReceived=true). + // 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream. + static constexpr int32 SilenceThresholdTicks = 30; + + // Hard-timeout fallback: fire even without agent_response confirmation after 2s + // of silence (handles edge cases where agent_response is very late or missing). + static constexpr int32 HardSilenceTimeoutTicks = 120; // 2s at 60fps + + // True once the server sends agent_response for the current turn. 
+ // The server sends the full text when generation is complete — this is the + // reliable signal that no more audio chunks will follow for this utterance. + // We wait for this before declaring the agent "stopped speaking" to avoid + // premature OnAgentStoppedSpeaking events during multi-chunk TTS streaming. + bool bAgentResponseReceived = false; // ── Microphone accumulation ─────────────────────────────────────────────── // WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono). diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h index b2af349..09d7ba1 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h @@ -49,8 +49,9 @@ namespace ElevenLabsMessageType static const FString AudioResponse = TEXT("audio"); // User speech-to-text transcript (speaker is always the user) static const FString UserTranscript = TEXT("user_transcript"); - static const FString AgentResponse = TEXT("agent_response"); - static const FString AgentResponseCorrection= TEXT("agent_response_correction"); + static const FString AgentResponse = TEXT("agent_response"); + static const FString AgentChatResponsePart = TEXT("agent_chat_response_part"); // intermediate LLM token stream + static const FString AgentResponseCorrection = TEXT("agent_response_correction"); static const FString InterruptionEvent = TEXT("interruption"); static const FString PingEvent = TEXT("ping"); static const FString ClientToolCall = TEXT("client_tool_call"); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h 
b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h index 13fb2e8..09a4d28 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h @@ -36,6 +36,13 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponse, /** Fired when the agent interrupts the user. */ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsInterrupted); +/** + * Fired when the server starts generating a response (first agent_chat_response_part received). + * This fires BEFORE audio arrives — useful to detect that the server is processing + * the previous turn while the client may have restarted listening (auto-restart scenario). + */ +DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted); + // ───────────────────────────────────────────────────────────────────────────── // WebSocket Proxy @@ -79,6 +86,14 @@ public: UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") FOnElevenLabsInterrupted OnInterrupted; + /** + * Fired on the first agent_chat_response_part per turn — i.e. the moment the server + * starts generating. Fires well before audio. The component uses this to stop the + * microphone if it was restarted before the server finished processing the previous turn. 
+ */ + UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") + FOnElevenLabsAgentResponseStarted OnAgentResponseStarted; + // ── Lifecycle ───────────────────────────────────────────────────────────── /** @@ -167,6 +182,7 @@ private: void HandleAudioResponse(const TSharedPtr& Payload); void HandleTranscript(const TSharedPtr& Payload); void HandleAgentResponse(const TSharedPtr& Payload); + void HandleAgentChatResponsePart(); void HandleInterruption(const TSharedPtr& Payload); void HandlePing(const TSharedPtr& Payload); @@ -193,6 +209,19 @@ private: bool bWaitingForResponse = false; // Whether we already logged the first audio response latency for this turn. bool bFirstAudioResponseLogged = false; + // Whether OnAgentResponseStarted has already been fired for the current turn. + // Reset at turn start so only the first agent_chat_response_part fires the event. + bool bAgentResponseStartedFired = false; + + // Timestamp when the conversation was initiated (conversation_initiation_metadata received). + // Used to compute [T+Xs] session-relative timestamps in all log messages. + double SessionStartTime = 0.0; + + // Set to true in SendInterrupt() so that in-flight audio frames and + // agent_chat_response_part messages from the interrupted generation are silently + // discarded instead of re-triggering the speaking/generating state. + // Cleared when the server sends its "interruption" acknowledgement. + bool bIgnoreIncomingContent = false; public: // Set by UElevenLabsConversationalAgentComponent before calling Connect(). diff --git a/build.bat b/build.bat new file mode 100644 index 0000000..ee648b3 --- /dev/null +++ b/build.bat @@ -0,0 +1,35 @@ +@echo off +chcp 65001 >nul +title Build PS_AI_Agent + +echo ============================================================ +echo PS_AI_Agent - Compilation plugin ElevenLabs (UE 5.5) +echo ============================================================ +echo. +echo ATTENTION : Ferme l'Unreal Editor avant de continuer ! 
+echo (Les DLL seraient verrouillees et la compilation echouerait) +echo. +pause + +echo. +echo Compilation en cours... +echo (Seuls les .cpp modifies sont recompiles, ~16s) +echo. + +powershell.exe -Command "& 'C:\Program Files\Epic Games\UE_5.5\Engine\Build\BatchFiles\RunUAT.bat' BuildEditor -project='C:\ASTERION\GIT\PS_AI_Agent\Unreal\PS_AI_Agent\PS_AI_Agent.uproject' -notools -noP4 2>&1" + +echo. +if %ERRORLEVEL% == 0 ( + echo ============================================================ + echo SUCCES - Compilation terminee sans erreur. + echo Tu peux relancer l'Unreal Editor. + echo ============================================================ +) else ( + echo ============================================================ + echo ECHEC - Erreur de compilation (code %ERRORLEVEL%) + echo Consulte le log ci-dessus pour le detail. + echo ============================================================ +) + +echo. +pause