v1.8.0: Server VAD interruption, partial response fix, configurable chunk size

- Server VAD + interruption: the mic stays open while the agent speaks; the server
  detects the user's voice and triggers an interruption automatically. Echo
  suppression is disabled in this mode so the audio reaches the server.
- Fix agent_chat_response_part parsing: the ElevenLabs API now uses
  text_response_part.text instead of agent_chat_response_part_event. Added a
  fallback for the legacy format.
- Expose MicChunkDurationMs as a UPROPERTY (20–500 ms, default 100 ms) instead of
  a compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in: parent 152fc6196d · commit 20a6e30377
Binary file not shown.
@ -249,7 +249,17 @@ void UElevenLabsConversationalAgentComponent::StartListening()
|
||||
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
|
||||
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
|
||||
// capture entirely (before resampling) while the agent is speaking.
|
||||
// In Server VAD + interruption mode, disable echo suppression so the server
|
||||
// receives the user's voice even during agent playback — the server's own VAD
|
||||
// handles echo filtering and interruption detection.
|
||||
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
|
||||
{
|
||||
Mic->EchoSuppressFlag = nullptr;
|
||||
}
|
||||
else
|
||||
{
|
||||
Mic->EchoSuppressFlag = &bAgentSpeaking;
|
||||
}
|
||||
Mic->StartCapture();
|
||||
|
||||
const double T = TurnStartTime - SessionStartTime;
|
||||
@ -459,24 +469,28 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
|
||||
if (bIsListening)
|
||||
{
|
||||
// In Server VAD + interruption mode, keep the mic open so the server can
|
||||
// detect if the user speaks over the agent and send an interruption event.
|
||||
// The server handles echo filtering and VAD — we just keep streaming audio.
|
||||
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
|
||||
T, LastClosedTurnIndex, LatencyFromTurnEnd);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
||||
// The server's VAD detected a pause in the user's speech and started generating
|
||||
// prematurely — the user hasn't finished speaking yet.
|
||||
//
|
||||
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
|
||||
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
|
||||
// causing it to re-enter "user speaking" state and stall — both sides stuck.
|
||||
//
|
||||
// Do NOT send an interrupt here — just let the server's response play out:
|
||||
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
|
||||
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
|
||||
// Either way the state machine recovers and Blueprint can reopen the mic.
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
||||
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
||||
StopListening();
|
||||
}
|
||||
}
|
||||
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("[T+%.2fs] [Turn %d] Agent generating. (%.2fs after turn end)"),
|
||||
@ -615,9 +629,13 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
||||
|
||||
// Echo suppression: skip sending mic audio while the agent is speaking.
|
||||
// This prevents the agent from hearing its own voice through the speakers,
|
||||
// which would confuse the server's VAD and STT. Matches the approach used
|
||||
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
|
||||
if (bAgentSpeaking) return;
|
||||
// which would confuse the server's VAD and STT.
|
||||
// In Server VAD + interruption mode, keep sending audio so the server can
|
||||
// detect the user speaking over the agent and trigger an interruption.
|
||||
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
// Convert this callback's samples to int16 bytes and accumulate.
|
||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
||||
@ -630,7 +648,7 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
||||
FScopeLock Lock(&MicSendLock);
|
||||
MicAccumulationBuffer.Append(PCMBytes);
|
||||
|
||||
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
|
||||
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
|
||||
{
|
||||
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
||||
MicAccumulationBuffer.Reset();
|
||||
|
||||
@ -588,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
||||
}
|
||||
|
||||
// Extract the streaming text fragment and broadcast it.
|
||||
// API structure:
|
||||
// Current API structure (2026):
|
||||
// { "type": "agent_chat_response_part",
|
||||
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
|
||||
// }
|
||||
// Legacy structure (pre-2026):
|
||||
// { "type": "agent_chat_response_part",
|
||||
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
|
||||
// }
|
||||
FString PartText;
|
||||
bool bFound = false;
|
||||
|
||||
// Try current format: text_response_part.text
|
||||
const TSharedPtr<FJsonObject>* TextPart = nullptr;
|
||||
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
|
||||
{
|
||||
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
|
||||
bFound = true;
|
||||
}
|
||||
|
||||
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
|
||||
if (!bFound)
|
||||
{
|
||||
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
|
||||
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
|
||||
{
|
||||
FString PartText;
|
||||
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
|
||||
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
|
||||
bFound = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (bFound && !PartText.IsEmpty())
|
||||
{
|
||||
OnAgentResponsePart.Broadcast(PartText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
||||
|
||||
@ -118,6 +118,18 @@ public:
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
||||
bool bSpeculativeTurn = false;
|
||||
|
||||
/**
|
||||
* Duration in milliseconds of each microphone audio chunk sent to ElevenLabs.
|
||||
* WASAPI captures audio every ~5ms, but sending tiny chunks degrades VAD/STT
|
||||
* accuracy. We accumulate audio and send once this duration is reached.
|
||||
* - Lower values (50-80ms): less latency, but VAD may be less reliable.
|
||||
* - Higher values (150-250ms): more reliable VAD, but adds latency.
|
||||
* Default: 100ms (3200 bytes at 16kHz 16-bit mono).
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ClampMin = "20", ClampMax = "500", Units = "ms"))
|
||||
int32 MicChunkDurationMs = 100;
|
||||
|
||||
/**
|
||||
* Allow the user to interrupt the agent while it is playing audio (speaking).
|
||||
* When true, calling StartListening() while the agent is audibly speaking automatically
|
||||
@ -405,5 +417,8 @@ private:
|
||||
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
|
||||
TArray<uint8> MicAccumulationBuffer;
|
||||
FCriticalSection MicSendLock;
|
||||
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples)
|
||||
|
||||
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
|
||||
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
|
||||
int32 GetMicChunkMinBytes() const { return MicChunkDurationMs * 32; }
|
||||
};
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user