v1.8.0: Server VAD interruption, partial response fix, configurable chunk size

- Server VAD + interruption: the mic stays open while the agent speaks; the server detects the user's voice and triggers the interruption automatically. Echo suppression is disabled in this mode so that audio reaches the server.
- Fix agent_chat_response_part parsing: the ElevenLabs API now uses text_response_part.text instead of agent_chat_response_part_event.agent_response_part. Added a fallback for the legacy format.
- Expose MicChunkDurationMs as a UPROPERTY (20-500 ms, default 100 ms) instead of a compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-22 09:13:02 +01:00
parent 152fc6196d
commit 20a6e30377
4 changed files with 82 additions and 28 deletions
--- a/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
+++ b/Unreal/PS_AI_Agent/Content/test_AI_Actor.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -249,7 +249,17 @@ void UElevenLabsConversationalAgentComponent::StartListening()
 		&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
 	// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
 	// capture entirely (before resampling) while the agent is speaking.
-	Mic->EchoSuppressFlag = &bAgentSpeaking;
+	// In Server VAD + interruption mode, disable echo suppression so the server
 	// receives the user's voice even during agent playback — the server's own VAD
 	// handles echo filtering and interruption detection.
 	if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
 	{
 		Mic->EchoSuppressFlag = nullptr;
 	}
 	else
 	{
 		Mic->EchoSuppressFlag = &bAgentSpeaking;
 	}
 	Mic->StartCapture();
 	const double T = TurnStartTime - SessionStartTime;
@@ -460,22 +470,26 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
 	const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
 	if (bIsListening)
 	{
-		// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
+		// In Server VAD + interruption mode, keep the mic open so the server can
-		// The server's VAD detected a pause in the user's speech and started generating
+		// detect if the user speaks over the agent and send an interruption event.
-		// prematurely — the user hasn't finished speaking yet.
+		// The server handles echo filtering and VAD — we just keep streaming audio.
-		//
+		if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
-		// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
+		{
-		// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
+			UE_LOG(LogElevenLabsAgent, Log,
-		// causing it to re-enter "user speaking" state and stall — both sides stuck.
+				TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
-		//
+				T, LastClosedTurnIndex, LatencyFromTurnEnd);
-		// Do NOT send an interrupt here — just let the server's response play out:
+		}
-		//   - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
+		else
-		//   - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
+		{
-		// Either way the state machine recovers and Blueprint can reopen the mic.
+			// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
-		UE_LOG(LogElevenLabsAgent, Log,
+			// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
-			TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
+			// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
-			T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
+			// causing it to re-enter "user speaking" state and stall — both sides stuck.
-		StopListening();
+			UE_LOG(LogElevenLabsAgent, Log,
 				TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
 				T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
 			StopListening();
 		}
 	}
 	UE_LOG(LogElevenLabsAgent, Log,
@@ -615,9 +629,13 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
 	// Echo suppression: skip sending mic audio while the agent is speaking.
 	// This prevents the agent from hearing its own voice through the speakers,
-	// which would confuse the server's VAD and STT. Matches the approach used
+	// which would confuse the server's VAD and STT.
-	// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
+	// In Server VAD + interruption mode, keep sending audio so the server can
-	if (bAgentSpeaking) return;
+	// detect the user speaking over the agent and trigger an interruption.
 	if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
 	{
 		return;
 	}
 	// Convert this callback's samples to int16 bytes and accumulate.
 	// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
@@ -630,7 +648,7 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
 	FScopeLock Lock(&MicSendLock);
 	MicAccumulationBuffer.Append(PCMBytes);
-	if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
+	if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
 	{
 		WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
 		MicAccumulationBuffer.Reset();
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -588,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
 	}
 	// Extract the streaming text fragment and broadcast it.
-	// API structure:
+	// Current API structure (2026):
 	// { "type": "agent_chat_response_part",
 	//   "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
 	// }
 	// Legacy structure (pre-2026):
 	// { "type": "agent_chat_response_part",
 	//   "agent_chat_response_part_event": { "agent_response_part": "partial text" }
 	// }
-	const TSharedPtr<FJsonObject>* PartEvent = nullptr;
+	FString PartText;
-	if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
+	bool bFound = false;
 	// Try current format: text_response_part.text
 	const TSharedPtr<FJsonObject>* TextPart = nullptr;
 	if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
 	{
-		FString PartText;
+		(*TextPart)->TryGetStringField(TEXT("text"), PartText);
-		if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
+		bFound = true;
 	}
 	// Fallback: legacy format agent_chat_response_part_event.agent_response_part
 	if (!bFound)
 	{
 		const TSharedPtr<FJsonObject>* PartEvent = nullptr;
 		if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
 		{
-			OnAgentResponsePart.Broadcast(PartText);
+			(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
 			bFound = true;
 		}
 	}
 	if (bFound && !PartText.IsEmpty())
 	{
 		OnAgentResponsePart.Broadcast(PartText);
 	}
 }
 void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -118,6 +118,18 @@ public:
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
 	bool bSpeculativeTurn = false;
 	/**
 	 * Duration in milliseconds of each microphone audio chunk sent to ElevenLabs.
 	 * WASAPI captures audio every ~5ms, but sending tiny chunks degrades VAD/STT
 	 * accuracy. We accumulate audio and send once this duration is reached.
 	 * - Lower values (50-80ms): less latency, but VAD may be less reliable.
 	 * - Higher values (150-250ms): more reliable VAD, but adds latency.
 	 * Default: 100ms (3200 bytes at 16kHz 16-bit mono).
 	 */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
 		meta = (ClampMin = "20", ClampMax = "500", Units = "ms"))
 	int32 MicChunkDurationMs = 100;
 	/**
 	 * Allow the user to interrupt the agent while it is playing audio (speaking).
 	 * When true, calling StartListening() while the agent is audibly speaking automatically
@@ -405,5 +417,8 @@ private:
 	// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
 	TArray<uint8> MicAccumulationBuffer;
 	FCriticalSection MicSendLock;
-	static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples)
+
 	/**
 	 * Compute the minimum number of accumulated PCM bytes required before a
 	 * microphone chunk is sent, derived from the user-facing MicChunkDurationMs.
 	 *
 	 * Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample
 	 *                = 16000 * ms / 1000 * 2 = 32 * ms   (16kHz, 16-bit mono)
 	 *
 	 * The 20-500ms range on MicChunkDurationMs is declared via ClampMin/ClampMax
 	 * property metadata, which is presumably enforced only for editor input;
 	 * the property is BlueprintReadWrite, so a value outside the range can still
 	 * be assigned at runtime. The duration is therefore re-clamped here: a zero
 	 * or negative duration would yield a threshold <= 0 and cause every capture
 	 * callback to be sent as its own tiny chunk.
 	 *
 	 * @return Chunk threshold in bytes, always within [640, 16000].
 	 */
 	int32 GetMicChunkMinBytes() const
 	{
 		// Keep these bounds in sync with the ClampMin/ClampMax metadata on MicChunkDurationMs.
 		constexpr int32 MinChunkMs = 20;
 		constexpr int32 MaxChunkMs = 500;
 		constexpr int32 BytesPerMs = 32; // 16000 samples/s * 2 bytes/sample / 1000 ms/s

 		int32 DurationMs = MicChunkDurationMs;
 		if (DurationMs < MinChunkMs)
 		{
 			DurationMs = MinChunkMs;
 		}
 		else if (DurationMs > MaxChunkMs)
 		{
 			DurationMs = MaxChunkMs;
 		}
 		return DurationMs * BytesPerMs;
 	}
 };