Debug in progress

2026-02-20 15:18:03 +01:00
parent 9f28ed7457
commit 0dc9d67308
7 changed files with 326 additions and 44 deletions
--- a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
+++ b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -48,9 +48,27 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe
 		if (AudioQueue.Num() == 0)
 		{
 			SilentTickCount++;
-			if (SilentTickCount >= SilenceThresholdTicks)
+
+			// Wait for agent_response (confirms the full response is done) before
+			// declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking
+			// events when ElevenLabs TTS streams audio in multiple batches with gaps
+			// (e.g. for long responses) — without this guard, the Blueprint's
+			// OnAgentStoppedSpeaking handler reopens the mic mid-response.
+			const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks;
+
+			// Hard-timeout fallback: if agent_response never arrives (or is very late),
+			// stop after HardSilenceTimeoutTicks silent ticks (~2s at 60fps) to avoid leaving the state machine stuck.
+			const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks;
+
+			if (bResponseConfirmed || bHardTimeout)
 			{
+				if (bHardTimeout && !bAgentResponseReceived)
+				{
+					UE_LOG(LogElevenLabsAgent, Warning,
+						TEXT("Agent silence hard-timeout (2s) without agent_response — declaring agent stopped."));
+				}
 				bAgentSpeaking = false;
+				bAgentResponseReceived = false;
 				SilentTickCount = 0;
 				OnAgentStoppedSpeaking.Broadcast();
 			}
@@ -84,6 +102,8 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
 			&UElevenLabsConversationalAgentComponent::HandleAgentResponse);
 		WebSocketProxy->OnInterrupted.AddDynamic(this,
 			&UElevenLabsConversationalAgentComponent::HandleInterrupted);
+		WebSocketProxy->OnAgentResponseStarted.AddDynamic(this,
+			&UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
 	}

 	// Pass configuration to the proxy before connecting.
@@ -114,6 +134,33 @@ void UElevenLabsConversationalAgentComponent::StartListening()
 	}

 	if (bIsListening) return;
+
+	// If the agent is currently generating or speaking, decide how to handle the request.
+	//
+	// Interruption (bAllowInterruption) applies ONLY when the agent is already playing audio
+	// (bAgentSpeaking). Pressing T while the agent speaks immediately stops it and opens the mic.
+	//
+	// During the generation phase (bAgentGenerating, no audio yet) we always block silently.
+	// This prevents the Blueprint's OnAgentStartedGenerating handler — which typically calls
+	// StartListening() for bookkeeping — from accidentally sending an interrupt to the server
+	// the moment it starts generating, which would cancel every response before any audio plays.
+	if (bAgentGenerating || bAgentSpeaking)
+	{
+		if (bAgentSpeaking && bAllowInterruption)
+		{
+			UE_LOG(LogElevenLabsAgent, Log, TEXT("StartListening: interrupting agent (speaking) to allow user to speak."));
+			InterruptAgent();
+			// InterruptAgent → StopAgentAudio clears bAgentSpeaking / bAgentGenerating,
+			// so we fall through and open the microphone immediately.
+		}
+		else
+		{
+			UE_LOG(LogElevenLabsAgent, Log, TEXT("StartListening ignored: agent is %s%s — will listen after agent finishes."),
+				bAgentGenerating ? TEXT("generating") : TEXT("speaking"),
+				(bAgentSpeaking && !bAllowInterruption) ? TEXT(" (interruption disabled)") : TEXT(""));
+			return;
+		}
+	}
 	bIsListening = true;

 	if (TurnMode == EElevenLabsTurnMode::Client)
@@ -225,6 +272,8 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
 	UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason);
 	bIsListening = false;
 	bAgentSpeaking = false;
+	bAgentGenerating = false;
+	bAgentResponseReceived = false;
 	MicAccumulationBuffer.Reset();
 	OnAgentDisconnected.Broadcast(StatusCode, Reason);
 }
@@ -250,6 +299,11 @@ void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabs

 void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
 {
+	// The server sends agent_response when the full text response is complete.
+	// This is our reliable signal that no more TTS audio chunks will follow.
+	// Set the flag so the silence-detection Tick can safely fire OnAgentStoppedSpeaking.
+	bAgentResponseReceived = true;
+
 	if (bEnableAgentTextResponse)
 	{
 		OnAgentTextResponse.Broadcast(ResponseText);
@@ -262,6 +316,22 @@ void UElevenLabsConversationalAgentComponent::HandleInterrupted()
 	OnAgentInterrupted.Broadcast();
 }

+void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
+{
+	// Handler bound to WebSocketProxy->OnAgentResponseStarted in StartConversation().
+	// bAgentGenerating is raised first because StartListening() returns early while it is
+	// set, so a Blueprint OnAgentStartedGenerating handler cannot reopen the mic from here.
+	bAgentGenerating = true;
+
+	if (bIsListening)
+	{
+		UE_LOG(LogElevenLabsAgent, Log,
+			TEXT("Agent started generating while mic was open — stopping listening to avoid turn collision."));
+		StopListening();
+	}
+	OnAgentStartedGenerating.Broadcast();
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Audio playback
 // ─────────────────────────────────────────────────────────────────────────────
@@ -314,6 +384,8 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
 	if (!bAgentSpeaking)
 	{
 		bAgentSpeaking = true;
+		bAgentGenerating = false;    // Agent is now speaking — generation phase is over.
+		bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
 		SilentTickCount = 0;
 		OnAgentStartedSpeaking.Broadcast();

@@ -334,6 +406,9 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
 	FScopeLock Lock(&AudioQueueLock);
 	AudioQueue.Empty();

+	bAgentGenerating = false;       // Always clear — covers interruptions during generation phase.
+	bAgentResponseReceived = false; // Reset — next response will re-confirm when done.
+
 	if (bAgentSpeaking)
 	{
 		bAgentSpeaking = false;
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -104,8 +104,6 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 	}
 	if (PCMData.Num() == 0) return;

-	UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
-
 	// Track when the last audio chunk was sent for latency measurement.
 	LastAudioChunkSentTime = FPlatformTime::Seconds();

@@ -119,13 +117,8 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 	// to avoid the pretty-printed writer and to keep the payload minimal.
 	const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio);

-	// Log first chunk fully for debugging
-	static int32 AudioChunksSent = 0;
-	AudioChunksSent++;
-	if (AudioChunksSent <= 2)
-	{
-		UE_LOG(LogElevenLabsWS, Log, TEXT("  Audio JSON (first 200 chars): %.200s"), *AudioJson);
-	}
+	// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
+	UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());

 	if (WebSocket.IsValid() && WebSocket->IsConnected())
 	{
@@ -139,7 +132,17 @@ void UElevenLabsWebSocketProxy::SendUserTurnStart()
 	// The server's VAD detects speech from the audio chunks we send.
 	// user_activity is a keep-alive/timeout-reset message and should NOT be
 	// sent here — it would delay the agent's turn after the user stops.
-	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow)."));
+
+	// Reset latency tracking so a new turn starts with a clean state.
+	// If the previous turn got no server response (bWaitingForResponse stayed true),
+	// this prevents stale UserTurnEndTime from corrupting latency measurements
+	// and ensures the state machine is consistent for the new turn.
+	bWaitingForResponse = false;
+	bFirstAudioResponseLogged = false;
+	bAgentResponseStartedFired = false;
+
+	const double T = FPlatformTime::Seconds() - SessionStartTime;
+	UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn started — mic open, audio chunks will follow."), T);
 }

 void UElevenLabsWebSocketProxy::SendUserTurnEnd()
@@ -149,7 +152,13 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
 	UserTurnEndTime = FPlatformTime::Seconds();
 	bWaitingForResponse = true;
 	bFirstAudioResponseLogged = false;
-	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
+	// NOTE: Do NOT reset bAgentResponseStartedFired here.
+	// StopListening() calls SendUserTurnEnd(), and HandleAgentResponseStarted() calls StopListening().
+	// If we reset the flag here, the next agent_chat_response_part would re-fire OnAgentResponseStarted
+	// in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop.
+	// The flag is only reset in SendUserTurnStart() at the beginning of a new user turn.
+	const double T = UserTurnEndTime - SessionStartTime;
+	UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T);
 }

 void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
@@ -171,6 +180,14 @@ void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
 void UElevenLabsWebSocketProxy::SendInterrupt()
 {
 	if (!IsConnected()) return;
+
+	// Immediately start discarding in-flight audio and chat response parts from
+	// the generation we are about to interrupt. The server may still send several
+	// frames before it processes our interrupt. We stop ignoring once the server
+	// sends its "interruption" acknowledgement (HandleInterruption).
+	bIgnoreIncomingContent = true;
+	UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks."));
+
 	TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
 	Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt);
 	SendJsonMessage(Msg);
@@ -194,7 +211,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	//   "type": "conversation_initiation_client_data",
 	//   "conversation_config_override": {
 	//     "agent": {
-	//       "turn": { "turn_timeout": 3, "speculative_turn": true }
+	//       "turn": { "turn_timeout": 1 }   // Client mode only; speculative_turn removed (caused silent failures after 2 turns)
 	//     },
 	//     "tts": {
 	//       "optimize_streaming_latency": 3
@@ -211,19 +228,28 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	// In push-to-talk (Client mode) the user controls the mic; the server still
 	// uses its VAD to detect the end of speech from the audio chunks it receives.
 	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
-	// Lower turn_timeout so the agent responds faster after the user stops speaking.
-	// Default is 7s. In push-to-talk (Client mode), the user explicitly signals
-	// end-of-turn by releasing the key, so we can use a very short timeout (1s).
+	// turn_timeout: how long the server waits after VAD detects silence before
+	// processing the user's turn. In push-to-talk (Client) mode this directly adds
+	// latency to every response — the server waits this many seconds of silence
+	// after the user releases T before it begins LLM processing.
+	//
+	// History:
+	//   turn_timeout=1 was originally problematic, but ONLY when combined with
+	//   speculative_turn=true (which has since been removed). Without speculative_turn,
+	//   1s is safe and halves the per-turn latency vs the 3s we had previously.
+	//   Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
 	if (TurnMode == EElevenLabsTurnMode::Client)
 	{
 		TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
 	}
-	// Speculative turn: start LLM generation during silence before the VAD is
-	// fully confident the user finished speaking. Reduces latency by 200-500ms.
-	if (bSpeculativeTurn)
-	{
-		TurnObj->SetBoolField(TEXT("speculative_turn"), true);
-	}
+	// NOTE: speculative_turn is intentionally NOT sent here.
+	// With speculative_turn=true the server starts LLM generation speculatively
+	// before the VAD is fully confident the user finished speaking.  Combined with
+	// the short turn_timeout this put the server's state machine into a state where
+	// it stopped processing user audio after 2 turns — subsequent turns received
+	// only pings and no agent_chat_response_part / audio / user_transcript at all.
+	// Removing it costs ~200-500ms of latency but restores reliable multi-turn
+	// conversation.  Re-enable only if ElevenLabs confirms it is stable.

 	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
 	AgentObj->SetObjectField(TEXT("turn"), TurnObj);
@@ -297,7 +323,15 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 		return;
 	}

-	// Log every message type received from the server for debugging.
+	// Suppress ping from the visible log — they arrive every ~2s and flood the output.
+	// Handle ping early before the generic type log.
+	if (MsgType == ElevenLabsMessageType::PingEvent)
+	{
+		HandlePing(Root);
+		return;
+	}
+
+	// Log every non-ping message type received from the server.
 	UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType);

 	if (MsgType == ElevenLabsMessageType::ConversationInitiation)
@@ -310,11 +344,12 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 		if (bWaitingForResponse && !bFirstAudioResponseLogged)
 		{
 			const double Now = FPlatformTime::Seconds();
+			const double T = Now - SessionStartTime;
 			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
 			const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
 			UE_LOG(LogElevenLabsWS, Warning,
-				TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
-				LatencyFromTurnEnd, LatencyFromLastChunk);
+				TEXT("[T+%.2fs] [LATENCY] First audio: %.0f ms after turn end (%.0f ms after last chunk)"),
+				T, LatencyFromTurnEnd, LatencyFromLastChunk);
 			bFirstAudioResponseLogged = true;
 		}
 		HandleAudioResponse(Root);
@@ -325,10 +360,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 		if (bWaitingForResponse)
 		{
 			const double Now = FPlatformTime::Seconds();
+			const double T = Now - SessionStartTime;
 			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
 			UE_LOG(LogElevenLabsWS, Warning,
-				TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
-				LatencyFromTurnEnd);
+				TEXT("[T+%.2fs] [LATENCY] User transcript: %.0f ms after turn end"),
+				T, LatencyFromTurnEnd);
 			bWaitingForResponse = false;
 		}
 		HandleTranscript(Root);
@@ -339,26 +375,27 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 		if (UserTurnEndTime > 0.0)
 		{
 			const double Now = FPlatformTime::Seconds();
+			const double T = Now - SessionStartTime;
 			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
 			UE_LOG(LogElevenLabsWS, Warning,
-				TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
-				LatencyFromTurnEnd);
+				TEXT("[T+%.2fs] [LATENCY] Agent text response: %.0f ms after turn end"),
+				T, LatencyFromTurnEnd);
 		}
 		HandleAgentResponse(Root);
 	}
+	else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart)
+	{
+		HandleAgentChatResponsePart();
+	}
 	else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
 	{
-		// Silently ignore for now — corrected text after interruption.
+		// Silently ignore — corrected text after interruption.
 		UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
 	}
 	else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
 	{
 		HandleInterruption(Root);
 	}
-	else if (MsgType == ElevenLabsMessageType::PingEvent)
-	{
-		HandlePing(Root);
-	}
 	else
 	{
 		UE_LOG(LogElevenLabsWS, Verbose, TEXT("Unhandled message type: %s"), *MsgType);
@@ -415,9 +452,17 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size,
 		}

 		// Broadcast raw PCM bytes directly to the audio queue.
+		// Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse).
 		TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
 		BinaryFrameBuffer.Reset();
-		OnAudioReceived.Broadcast(PCMData);
+		if (!bIgnoreIncomingContent)
+		{
+			OnAudioReceived.Broadcast(PCMData);
+		}
+		else
+		{
+			UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack)."));
+		}
 	}
 }

@@ -439,13 +484,23 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtr<FJ
 		(*MetaObj)->TryGetStringField(TEXT("conversation_id"), ConversationInfo.ConversationID);
 	}

-	UE_LOG(LogElevenLabsWS, Log, TEXT("Conversation initiated. ID=%s"), *ConversationInfo.ConversationID);
+	SessionStartTime = FPlatformTime::Seconds();
+	UE_LOG(LogElevenLabsWS, Log, TEXT("[T+0.00s] Conversation initiated. ID=%s"), *ConversationInfo.ConversationID);
 	ConnectionState = EElevenLabsConnectionState::Connected;
 	OnConnected.Broadcast(ConversationInfo);
 }

 void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
 {
+	// Discard audio that belongs to an interrupted generation.
+	// The server may send several more audio frames after we sent "interrupt" —
+	// they must not restart the speaking state on the client side.
+	if (bIgnoreIncomingContent)
+	{
+		UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack)."));
+		return;
+	}
+
 	// Expected structure:
 	// { "type": "audio",
 	//   "audio_event": { "audio_base_64": "<base64 PCM>", "event_id": 1 }
@@ -513,9 +568,41 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject
 	OnAgentResponse.Broadcast(ResponseText);
 }

+void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
+{
+	// Called from OnWsMessage for every agent_chat_response_part message.
+	// While an interrupt awaits the server's ack, parts of the interrupted generation are dropped:
+	// letting them through would fire OnAgentResponseStarted again (bAgentResponseStartedFired is
+	// reset in SendUserTurnStart) and make the component close the microphone the user just opened.
+	if (bIgnoreIncomingContent)
+	{
+		UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_chat_response_part (interrupt pending server ack)."));
+		return;
+	}
+
+	// agent_chat_response_part = the server is actively generating a response (LLM token stream).
+	// Only the first part after SendUserTurnStart() fires OnAgentResponseStarted; the component
+	// reacts to that event by stopping the microphone if it is currently open.
+	if (!bAgentResponseStartedFired)
+	{
+		bAgentResponseStartedFired = true;
+		const double Now = FPlatformTime::Seconds();
+		const double T = Now - SessionStartTime;
+		const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0; // 0 when no turn end was recorded
+		UE_LOG(LogElevenLabsWS, Log,
+			TEXT("[T+%.2fs] Agent started generating (%.0f ms after turn end — includes VAD silence timeout + LLM start)."),
+			T, LatencyFromTurnEnd);
+		OnAgentResponseStarted.Broadcast();
+	}
+	// Later parts of the same turn are ignored here without any logging (can be dozens per response).
+}
+
 void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
 {
-	UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted."));
+	// Server "interruption" message: stop discarding audio and chat parts (flag raised in SendInterrupt).
+	// NOTE(review): this is the only place the flag is cleared — confirm a missing ack or a reconnect cannot leave it set.
+	bIgnoreIncomingContent = false;
+	UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing)."));
 	OnInterrupted.Broadcast();
 }

--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -34,6 +34,15 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedSpeaking);
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStoppedSpeaking);
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted);

+/**
+ * Fired when the server sends its first agent_chat_response_part — i.e. the moment
+ * the LLM starts generating, well before audio arrives.
+ * The component automatically calls StopListening() when this fires while the
+ * microphone is open, preventing the user's new audio from being sent to the
+ * server while it is still processing the previous turn.
+ */
+DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating);
+
 // ─────────────────────────────────────────────────────────────────────────────
 // UElevenLabsConversationalAgentComponent
 //
@@ -83,10 +92,27 @@ public:
 	/**
 	 * Enable speculative turn: the LLM starts generating a response during
 	 * silence before the VAD is fully confident the user has finished speaking.
-	 * Reduces latency by 200-500ms but may occasionally produce premature responses.
+	 * Reduces latency by 200-500ms but caused the server to silently stop
+	 * processing user audio after 2 turns when combined with a short turn_timeout.
+	 * Defaults to false. NOTE(review): the proxy no longer sends speculative_turn in its init message, so this flag appears to have no effect — confirm.
 	 */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
-	bool bSpeculativeTurn = true;
+	bool bSpeculativeTurn = false;
+
+	/**
+	 * Allow the user to interrupt the agent while it is playing audio (speaking).
+	 * When true, calling StartListening() while the agent is audibly speaking automatically
+	 * sends an interruption signal to the server and opens the mic — no Blueprint nodes needed.
+	 * When false, StartListening() is silently ignored until the agent finishes speaking.
+	 *
+	 * NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking).
+	 * While the agent is generating but has not yet started speaking, StartListening() is
+	 * always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating
+	 * handler (which often calls StartListening for bookkeeping) from accidentally cancelling
+	 * the response before any audio plays.
+	 */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
+	bool bAllowInterruption = true;

 	/**
 	 * Forward user speech transcripts (user_transcript events) to the
@@ -131,6 +157,15 @@ public:
 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
 	FOnAgentInterrupted OnAgentInterrupted;

+	/**
+	 * Fired when the server starts generating a response (before audio).
+	 * The component automatically stops the microphone when this fires while listening,
+	 * so the Blueprint doesn't need to handle this manually for push-to-talk.
+	 * Bind here if you need UI feedback ("agent is thinking...").
+	 */
+	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
+	FOnAgentStartedGenerating OnAgentStartedGenerating;
+
 	// ── Control ───────────────────────────────────────────────────────────────

 	/**
@@ -219,6 +254,9 @@ private:
 	UFUNCTION()
 	void HandleInterrupted();

+	UFUNCTION()
+	void HandleAgentResponseStarted();
+
 	// ── Audio playback ────────────────────────────────────────────────────────
 	void InitAudioPlayback();
 	void EnqueueAgentAudio(const TArray<uint8>& PCMData);
@@ -244,15 +282,32 @@ private:
 	// ── State ─────────────────────────────────────────────────────────────────
 	bool bIsListening = false;
 	bool bAgentSpeaking = false;
+	// True from the first agent_chat_response_part until the first audio chunk arrives.
+	// Used to block StartListening() while the server is processing the previous turn.
+	bool bAgentGenerating = false;

 	// Accumulates incoming PCM bytes until the audio component needs data.
 	TArray<uint8> AudioQueue;
 	FCriticalSection AudioQueueLock;

-	// Simple heuristic: if we haven't received audio data for this many ticks,
-	// consider the agent done speaking.
+	// Silence detection: how many consecutive ticks with an empty audio queue.
 	int32 SilentTickCount = 0;
-	static constexpr int32 SilenceThresholdTicks = 30; // ~0.5s at 60fps
+
+	// Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks
+	// once the server has confirmed the full response (bAgentResponseReceived=true).
+	// 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream.
+	static constexpr int32 SilenceThresholdTicks = 30;
+
+	// Hard-timeout fallback: fire even without agent_response confirmation after 2s
+	// of silence (handles edge cases where agent_response is very late or missing).
+	static constexpr int32 HardSilenceTimeoutTicks = 120; // 2s at 60fps
+
+	// True once the server sends agent_response for the current turn.
+	// The server sends the full text when generation is complete — this is the
+	// reliable signal that no more audio chunks will follow for this utterance.
+	// We wait for this before declaring the agent "stopped speaking" to avoid
+	// premature OnAgentStoppedSpeaking events during multi-chunk TTS streaming.
+	bool bAgentResponseReceived = false;

 	// ── Microphone accumulation ───────────────────────────────────────────────
 	// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsDefinitions.h
@@ -49,8 +49,9 @@ namespace ElevenLabsMessageType
 	static const FString AudioResponse			= TEXT("audio");
 	// User speech-to-text transcript (speaker is always the user)
 	static const FString UserTranscript			= TEXT("user_transcript");
-	static const FString AgentResponse			= TEXT("agent_response");
-	static const FString AgentResponseCorrection= TEXT("agent_response_correction");
+	static const FString AgentResponse				= TEXT("agent_response");
+	static const FString AgentChatResponsePart		= TEXT("agent_chat_response_part"); // intermediate LLM token stream
+	static const FString AgentResponseCorrection	= TEXT("agent_response_correction");
 	static const FString InterruptionEvent		= TEXT("interruption");
 	static const FString PingEvent				= TEXT("ping");
 	static const FString ClientToolCall			= TEXT("client_tool_call");
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -36,6 +36,13 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponse,
 /** Fired when the agent interrupts the user. */
 DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsInterrupted);

+/**
+ * Fired when the server starts generating a response (first agent_chat_response_part received).
+ * This fires BEFORE audio arrives — useful to detect that the server is processing
+ * the previous turn while the client may have restarted listening (auto-restart scenario).
+ */
+DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
+

 // ─────────────────────────────────────────────────────────────────────────────
 // WebSocket Proxy
@@ -79,6 +86,14 @@ public:
 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
 	FOnElevenLabsInterrupted OnInterrupted;

+	/**
+	 * Fired on the first agent_chat_response_part per turn — i.e. the moment the server
+	 * starts generating. Fires well before audio.  The component uses this to stop the
+	 * microphone if it was restarted before the server finished processing the previous turn.
+	 */
+	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
+	FOnElevenLabsAgentResponseStarted OnAgentResponseStarted;
+
 	// ── Lifecycle ─────────────────────────────────────────────────────────────

 	/**
@@ -167,6 +182,7 @@ private:
 	void HandleAudioResponse(const TSharedPtr<FJsonObject>& Payload);
 	void HandleTranscript(const TSharedPtr<FJsonObject>& Payload);
 	void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
+	void HandleAgentChatResponsePart();
 	void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
 	void HandlePing(const TSharedPtr<FJsonObject>& Payload);

@@ -193,6 +209,19 @@ private:
 	bool bWaitingForResponse = false;
 	// Whether we already logged the first audio response latency for this turn.
 	bool bFirstAudioResponseLogged = false;
+	// Whether OnAgentResponseStarted has already been fired for the current turn.
+	// Reset at turn start so only the first agent_chat_response_part fires the event.
+	bool bAgentResponseStartedFired = false;
+
+	// Timestamp when the conversation was initiated (conversation_initiation_metadata received).
+	// Used to compute [T+Xs] session-relative timestamps in all log messages.
+	double SessionStartTime = 0.0;
+
+	// Set to true in SendInterrupt() so that in-flight audio frames and
+	// agent_chat_response_part messages from the interrupted generation are silently
+	// discarded instead of re-triggering the speaking/generating state.
+	// Cleared when the server sends its "interruption" acknowledgement.
+	bool bIgnoreIncomingContent = false;

 public:
 	// Set by UElevenLabsConversationalAgentComponent before calling Connect().
--- a/build.bat
+++ b/build.bat
@@ -0,0 +1,35 @@
+@echo off
+chcp 65001 >nul
+title Build PS_AI_Agent
+rem Builds the PS_AI_Agent editor target with RunUAT BuildEditor. The engine and project paths below are absolute and machine-specific.
+echo ============================================================
+echo  PS_AI_Agent - Compilation plugin ElevenLabs (UE 5.5)
+echo ============================================================
+echo.
+echo  ATTENTION : Ferme l'Unreal Editor avant de continuer !
+echo  (Les DLL seraient verrouillees et la compilation echouerait)
+echo.
+pause
+
+echo.
+echo  Compilation en cours...
+echo  (Seuls les .cpp modifies sont recompiles, ~16s)
+echo.
+rem NOTE(review): powershell -Command is documented to collapse a child exit code to 0 or 1, so the code printed on failure may not be RunUAT's own. Confirm if the real code matters.
+powershell.exe -Command "& 'C:\Program Files\Epic Games\UE_5.5\Engine\Build\BatchFiles\RunUAT.bat' BuildEditor -project='C:\ASTERION\GIT\PS_AI_Agent\Unreal\PS_AI_Agent\PS_AI_Agent.uproject' -notools -noP4 2>&1"
+rem In the failure branch the closing parenthesis of the message is caret-escaped so that it does not end the else block early.
+echo.
+if %ERRORLEVEL% == 0 (
+    echo ============================================================
+    echo  SUCCES - Compilation terminee sans erreur.
+    echo  Tu peux relancer l'Unreal Editor.
+    echo ============================================================
+) else (
+    echo ============================================================
+    echo  ECHEC - Erreur de compilation (code %ERRORLEVEL%^)
+    echo  Consulte le log ci-dessus pour le detail.
+    echo ============================================================
+)
+
+echo.
+pause