v1.8.0: Server VAD interruption, partial response fix, configurable chunk size

- Server VAD + interruption: mic stays open while agent speaks, server
  detects user voice and triggers interruption automatically. Echo
  suppression disabled in this mode so audio reaches the server.
- Fix agent_chat_response_part parsing: ElevenLabs API now uses
  text_response_part.text instead of agent_chat_response_part_event.
  Added fallback for legacy format.
- Expose MicChunkDurationMs as UPROPERTY (20-500ms, default 100ms)
  instead of compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 09:13:02 +01:00
parent 152fc6196d
commit 20a6e30377
4 changed files with 82 additions and 28 deletions

View File

@@ -249,7 +249,17 @@ void UElevenLabsConversationalAgentComponent::StartListening()
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
// capture entirely (before resampling) while the agent is speaking.
// In Server VAD + interruption mode, disable echo suppression so the server
// receives the user's voice even during agent playback — the server's own VAD
// handles echo filtering and interruption detection.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
Mic->EchoSuppressFlag = nullptr;
}
else
{
Mic->EchoSuppressFlag = &bAgentSpeaking;
}
Mic->StartCapture();
const double T = TurnStartTime - SessionStartTime;
@@ -459,24 +469,28 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
if (bIsListening)
{
// In Server VAD + interruption mode, keep the mic open so the server can
// detect if the user speaks over the agent and send an interruption event.
// The server handles echo filtering and VAD — we just keep streaming audio.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
}
else
{
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// The server's VAD detected a pause in the user's speech and started generating
// prematurely — the user hasn't finished speaking yet.
//
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening();
}
}
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
@@ -615,9 +629,13 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
// Echo suppression: skip sending mic audio while the agent is speaking.
// This prevents the agent from hearing its own voice through the speakers,
// which would confuse the server's VAD and STT. Matches the approach used
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
if (bAgentSpeaking) return;
// which would confuse the server's VAD and STT.
// In Server VAD + interruption mode, keep sending audio so the server can
// detect the user speaking over the agent and trigger an interruption.
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
{
return;
}
// Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
@@ -630,7 +648,7 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Append(PCMBytes);
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
MicAccumulationBuffer.Reset();

View File

@@ -588,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
}
// Extract the streaming text fragment and broadcast it.
// API structure:
// Current API structure (2026):
// { "type": "agent_chat_response_part",
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
// }
// Legacy structure (pre-2026):
// { "type": "agent_chat_response_part",
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
// }
FString PartText;
bool bFound = false;
// Try current format: text_response_part.text
const TSharedPtr<FJsonObject>* TextPart = nullptr;
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
{
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
bFound = true;
}
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
if (!bFound)
{
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
{
FString PartText;
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
bFound = true;
}
}
if (bFound && !PartText.IsEmpty())
{
OnAgentResponsePart.Broadcast(PartText);
}
}
}
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)

View File

@@ -118,6 +118,18 @@ public:
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
bool bSpeculativeTurn = false;
/**
* Duration in milliseconds of each microphone audio chunk sent to ElevenLabs.
* WASAPI captures audio every ~5ms, but sending tiny chunks degrades VAD/STT
* accuracy. We accumulate audio and send once this duration is reached.
* - Lower values (50-80ms): less latency, but VAD may be less reliable.
* - Higher values (150-250ms): more reliable VAD, but adds latency.
* Default: 100ms (3200 bytes at 16kHz 16-bit mono).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "20", ClampMax = "500", Units = "ms"))
int32 MicChunkDurationMs = 100;
/**
* Allow the user to interrupt the agent while it is playing audio (speaking).
* When true, calling StartListening() while the agent is audibly speaking automatically
@@ -405,5 +417,8 @@ private:
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
TArray<uint8> MicAccumulationBuffer;
FCriticalSection MicSendLock;
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples)
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
int32 GetMicChunkMinBytes() const { return MicChunkDurationMs * 32; }
};