v1.8.0: Server VAD interruption, partial response fix, configurable chunk size

- Server VAD + interruption: mic stays open while agent speaks, server
  detects user voice and triggers interruption automatically. Echo
  suppression disabled in this mode so audio reaches the server.
- Fix agent_chat_response_part parsing: ElevenLabs API now uses
  text_response_part.text instead of agent_chat_response_part_event.
  Added fallback for legacy format.
- Expose MicChunkDurationMs as UPROPERTY (20-500ms, default 100ms)
  instead of compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-22 09:13:02 +01:00
parent 152fc6196d
commit 20a6e30377
4 changed files with 82 additions and 28 deletions

View File

@@ -249,7 +249,17 @@ void UElevenLabsConversationalAgentComponent::StartListening()
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
// capture entirely (before resampling) while the agent is speaking.
// In Server VAD + interruption mode, disable echo suppression so the server
// receives the user's voice even during agent playback — the server's own VAD
// handles echo filtering and interruption detection.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
Mic->EchoSuppressFlag = nullptr;
}
else
{
Mic->EchoSuppressFlag = &bAgentSpeaking;
}
Mic->StartCapture();
const double T = TurnStartTime - SessionStartTime;
@@ -459,24 +469,28 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
if (bIsListening)
{
// In Server VAD + interruption mode, keep the mic open so the server can
// detect if the user speaks over the agent and send an interruption event.
// The server handles echo filtering and VAD — we just keep streaming audio.
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
{
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
T, LastClosedTurnIndex, LatencyFromTurnEnd);
}
else
{
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
// The server's VAD detected a pause in the user's speech and started generating
// prematurely — the user hasn't finished speaking yet.
//
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
StopListening();
}
}
UE_LOG(LogElevenLabsAgent, Log,
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
@@ -615,9 +629,13 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
// Echo suppression: skip sending mic audio while the agent is speaking.
// This prevents the agent from hearing its own voice through the speakers,
// which would confuse the server's VAD and STT. Matches the approach used
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
if (bAgentSpeaking) return;
// which would confuse the server's VAD and STT.
// In Server VAD + interruption mode, keep sending audio so the server can
// detect the user speaking over the agent and trigger an interruption.
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
{
return;
}
// Convert this callback's samples to int16 bytes and accumulate.
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
@@ -630,7 +648,7 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
FScopeLock Lock(&MicSendLock);
MicAccumulationBuffer.Append(PCMBytes);
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
{
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
MicAccumulationBuffer.Reset();

View File

@@ -588,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
}
// Extract the streaming text fragment and broadcast it.
// API structure:
// Current API structure (2026):
// { "type": "agent_chat_response_part",
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
// }
// Legacy structure (pre-2026):
// { "type": "agent_chat_response_part",
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
// }
FString PartText;
bool bFound = false;
// Try current format: text_response_part.text
const TSharedPtr<FJsonObject>* TextPart = nullptr;
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
{
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
bFound = true;
}
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
if (!bFound)
{
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
{
FString PartText;
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
bFound = true;
}
}
if (bFound && !PartText.IsEmpty())
{
OnAgentResponsePart.Broadcast(PartText);
}
}
}
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)

View File

@@ -118,6 +118,18 @@ public:
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
bool bSpeculativeTurn = false;
/**
* Duration in milliseconds of each microphone audio chunk sent to ElevenLabs.
* WASAPI captures audio every ~5ms, but sending tiny chunks degrades VAD/STT
* accuracy. We accumulate audio and send once this duration is reached.
* - Lower values (50-80ms): less latency, but VAD may be less reliable.
* - Higher values (150-250ms): more reliable VAD, but adds latency.
* Default: 100ms (3200 bytes at 16kHz 16-bit mono).
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "20", ClampMax = "500", Units = "ms"))
int32 MicChunkDurationMs = 100;
/**
* Allow the user to interrupt the agent while it is playing audio (speaking).
* When true, calling StartListening() while the agent is audibly speaking automatically
@@ -405,5 +417,8 @@ private:
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
TArray<uint8> MicAccumulationBuffer;
FCriticalSection MicSendLock;
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples)
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
int32 GetMicChunkMinBytes() const { return MicChunkDurationMs * 32; }
};