v1.8.0: Server VAD interruption, partial response fix, configurable chunk size
- Server VAD + interruption: the mic stays open while the agent speaks; the server detects the user's voice and triggers the interruption automatically. Echo suppression is disabled in this mode so audio reaches the server.
- Fix agent_chat_response_part parsing: the ElevenLabs API now uses text_response_part.text instead of agent_chat_response_part_event. Added a fallback for the legacy format.
- Expose MicChunkDurationMs as a UPROPERTY (20-500ms, default 100ms) instead of a compile-time constant.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
152fc6196d
commit
20a6e30377
Binary file not shown.
@ -249,7 +249,17 @@ void UElevenLabsConversationalAgentComponent::StartListening()
|
|||||||
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
|
&UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured);
|
||||||
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
|
// Echo suppression: point the mic at our atomic bAgentSpeaking flag so it skips
|
||||||
// capture entirely (before resampling) while the agent is speaking.
|
// capture entirely (before resampling) while the agent is speaking.
|
||||||
Mic->EchoSuppressFlag = &bAgentSpeaking;
|
// In Server VAD + interruption mode, disable echo suppression so the server
|
||||||
|
// receives the user's voice even during agent playback — the server's own VAD
|
||||||
|
// handles echo filtering and interruption detection.
|
||||||
|
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
|
||||||
|
{
|
||||||
|
Mic->EchoSuppressFlag = nullptr;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
Mic->EchoSuppressFlag = &bAgentSpeaking;
|
||||||
|
}
|
||||||
Mic->StartCapture();
|
Mic->StartCapture();
|
||||||
|
|
||||||
const double T = TurnStartTime - SessionStartTime;
|
const double T = TurnStartTime - SessionStartTime;
|
||||||
@ -460,22 +470,26 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
|
|||||||
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
|
const double LatencyFromTurnEnd = TurnEndTime > 0.0 ? Now - TurnEndTime : 0.0;
|
||||||
if (bIsListening)
|
if (bIsListening)
|
||||||
{
|
{
|
||||||
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
// In Server VAD + interruption mode, keep the mic open so the server can
|
||||||
// The server's VAD detected a pause in the user's speech and started generating
|
// detect if the user speaks over the agent and send an interruption event.
|
||||||
// prematurely — the user hasn't finished speaking yet.
|
// The server handles echo filtering and VAD — we just keep streaming audio.
|
||||||
//
|
if (TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption)
|
||||||
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
|
{
|
||||||
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
// causing it to re-enter "user speaking" state and stall — both sides stuck.
|
TEXT("[T+%.2fs] [Turn %d] Agent generating — mic stays open (Server VAD + interruption). (%.2fs after turn end)"),
|
||||||
//
|
T, LastClosedTurnIndex, LatencyFromTurnEnd);
|
||||||
// Do NOT send an interrupt here — just let the server's response play out:
|
}
|
||||||
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
|
else
|
||||||
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
|
{
|
||||||
// Either way the state machine recovers and Blueprint can reopen the mic.
|
// Collision: server started generating Turn N's response while Turn M (M>N) mic was open.
|
||||||
UE_LOG(LogElevenLabsAgent, Log,
|
// Stop the mic WITHOUT flushing the accumulated audio buffer (see StopListening's
|
||||||
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
|
||||||
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
// causing it to re-enter "user speaking" state and stall — both sides stuck.
|
||||||
StopListening();
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
|
TEXT("[T+%.2fs] [Turn %d → Turn %d collision] Agent generating Turn %d response — mic (Turn %d) was open, stopping. (%.2fs after turn end)"),
|
||||||
|
T, LastClosedTurnIndex, TurnIndex, LastClosedTurnIndex, TurnIndex, LatencyFromTurnEnd);
|
||||||
|
StopListening();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
UE_LOG(LogElevenLabsAgent, Log,
|
UE_LOG(LogElevenLabsAgent, Log,
|
||||||
@ -615,9 +629,13 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
|||||||
|
|
||||||
// Echo suppression: skip sending mic audio while the agent is speaking.
|
// Echo suppression: skip sending mic audio while the agent is speaking.
|
||||||
// This prevents the agent from hearing its own voice through the speakers,
|
// This prevents the agent from hearing its own voice through the speakers,
|
||||||
// which would confuse the server's VAD and STT. Matches the approach used
|
// which would confuse the server's VAD and STT.
|
||||||
// in the official ElevenLabs C++ SDK (outputPlaying_ flag).
|
// In Server VAD + interruption mode, keep sending audio so the server can
|
||||||
if (bAgentSpeaking) return;
|
// detect the user speaking over the agent and trigger an interruption.
|
||||||
|
if (bAgentSpeaking && !(TurnMode == EElevenLabsTurnMode::Server && bAllowInterruption))
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// Convert this callback's samples to int16 bytes and accumulate.
|
// Convert this callback's samples to int16 bytes and accumulate.
|
||||||
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
// WASAPI fires every ~5ms (158 bytes at 16kHz). ElevenLabs needs ≥100ms
|
||||||
@ -630,7 +648,7 @@ void UElevenLabsConversationalAgentComponent::OnMicrophoneDataCaptured(const TAr
|
|||||||
FScopeLock Lock(&MicSendLock);
|
FScopeLock Lock(&MicSendLock);
|
||||||
MicAccumulationBuffer.Append(PCMBytes);
|
MicAccumulationBuffer.Append(PCMBytes);
|
||||||
|
|
||||||
if (MicAccumulationBuffer.Num() >= MicChunkMinBytes)
|
if (MicAccumulationBuffer.Num() >= GetMicChunkMinBytes())
|
||||||
{
|
{
|
||||||
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
WebSocketProxy->SendAudioChunk(MicAccumulationBuffer);
|
||||||
MicAccumulationBuffer.Reset();
|
MicAccumulationBuffer.Reset();
|
||||||
|
|||||||
@ -588,19 +588,40 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Extract the streaming text fragment and broadcast it.
|
// Extract the streaming text fragment and broadcast it.
|
||||||
// API structure:
|
// Current API structure (2026):
|
||||||
|
// { "type": "agent_chat_response_part",
|
||||||
|
// "text_response_part": { "text": "partial text", "type": "part"|"stop", "event_id": N }
|
||||||
|
// }
|
||||||
|
// Legacy structure (pre-2026):
|
||||||
// { "type": "agent_chat_response_part",
|
// { "type": "agent_chat_response_part",
|
||||||
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
|
// "agent_chat_response_part_event": { "agent_response_part": "partial text" }
|
||||||
// }
|
// }
|
||||||
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
|
FString PartText;
|
||||||
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
|
bool bFound = false;
|
||||||
|
|
||||||
|
// Try current format: text_response_part.text
|
||||||
|
const TSharedPtr<FJsonObject>* TextPart = nullptr;
|
||||||
|
if (Root->TryGetObjectField(TEXT("text_response_part"), TextPart) && TextPart)
|
||||||
{
|
{
|
||||||
FString PartText;
|
(*TextPart)->TryGetStringField(TEXT("text"), PartText);
|
||||||
if ((*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText) && !PartText.IsEmpty())
|
bFound = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: legacy format agent_chat_response_part_event.agent_response_part
|
||||||
|
if (!bFound)
|
||||||
|
{
|
||||||
|
const TSharedPtr<FJsonObject>* PartEvent = nullptr;
|
||||||
|
if (Root->TryGetObjectField(TEXT("agent_chat_response_part_event"), PartEvent) && PartEvent)
|
||||||
{
|
{
|
||||||
OnAgentResponsePart.Broadcast(PartText);
|
(*PartEvent)->TryGetStringField(TEXT("agent_response_part"), PartText);
|
||||||
|
bFound = true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (bFound && !PartText.IsEmpty())
|
||||||
|
{
|
||||||
|
OnAgentResponsePart.Broadcast(PartText);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
||||||
|
|||||||
@ -118,6 +118,18 @@ public:
|
|||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
||||||
bool bSpeculativeTurn = false;
|
bool bSpeculativeTurn = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Duration in milliseconds of each microphone audio chunk sent to ElevenLabs.
|
||||||
|
* WASAPI captures audio every ~5ms, but sending tiny chunks degrades VAD/STT
|
||||||
|
* accuracy. We accumulate audio and send once this duration is reached.
|
||||||
|
* - Lower values (50-80ms): less latency, but VAD may be less reliable.
|
||||||
|
* - Higher values (150-250ms): more reliable VAD, but adds latency.
|
||||||
|
* Default: 100ms (3200 bytes at 16kHz 16-bit mono).
|
||||||
|
*/
|
||||||
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||||
|
meta = (ClampMin = "20", ClampMax = "500", Units = "ms"))
|
||||||
|
int32 MicChunkDurationMs = 100;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allow the user to interrupt the agent while it is playing audio (speaking).
|
* Allow the user to interrupt the agent while it is playing audio (speaking).
|
||||||
* When true, calling StartListening() while the agent is audibly speaking automatically
|
* When true, calling StartListening() while the agent is audibly speaking automatically
|
||||||
@ -405,5 +417,8 @@ private:
|
|||||||
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
|
// in OnMicrophoneDataCaptured and from game thread in StopListening flush).
|
||||||
TArray<uint8> MicAccumulationBuffer;
|
TArray<uint8> MicAccumulationBuffer;
|
||||||
FCriticalSection MicSendLock;
|
FCriticalSection MicSendLock;
|
||||||
static constexpr int32 MicChunkMinBytes = 3200; // 100ms @ 16kHz 16-bit mono (1600 samples)
|
|
||||||
|
/** Compute the minimum bytes from the user-facing MicChunkDurationMs.
|
||||||
|
* Formula: bytes = SampleRate * (ms / 1000) * BytesPerSample = 16000 * ms / 1000 * 2 = 32 * ms */
|
||||||
|
int32 GetMicChunkMinBytes() const
{
	// The ClampMin/ClampMax metadata on MicChunkDurationMs only constrains values typed
	// into the editor; a Blueprint or C++ write can still store 0 or a negative number.
	// That would make this threshold <= 0, so the accumulation check would pass on every
	// capture callback and tiny chunks would be sent. Re-apply the 20-500 ms range here.
	const int32 ClampedMs = (MicChunkDurationMs < 20) ? 20
		: (MicChunkDurationMs > 500) ? 500
		: MicChunkDurationMs;

	// 16000 samples/s * 2 bytes/sample / 1000 ms/s = 32 bytes per millisecond.
	return ClampedMs * 32;
}
|
||||||
};
|
};
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user