Debug in progress
This commit is contained in:
parent
9f28ed7457
commit
0dc9d67308
Binary file not shown.
@ -48,9 +48,27 @@ void UElevenLabsConversationalAgentComponent::TickComponent(float DeltaTime, ELe
|
||||
if (AudioQueue.Num() == 0)
|
||||
{
|
||||
SilentTickCount++;
|
||||
if (SilentTickCount >= SilenceThresholdTicks)
|
||||
|
||||
// Wait for agent_response (confirms the full response is done) before
|
||||
// declaring the agent stopped. This prevents premature OnAgentStoppedSpeaking
|
||||
// events when ElevenLabs TTS streams audio in multiple batches with gaps
|
||||
// (e.g. for long responses) — without this guard, the Blueprint's
|
||||
// OnAgentStoppedSpeaking handler reopens the mic mid-response.
|
||||
const bool bResponseConfirmed = bAgentResponseReceived && SilentTickCount >= SilenceThresholdTicks;
|
||||
|
||||
// Hard-timeout fallback: if agent_response never arrives (or is very late),
|
||||
// stop after 2s of silence to avoid leaving the state machine stuck.
|
||||
const bool bHardTimeout = SilentTickCount >= HardSilenceTimeoutTicks;
|
||||
|
||||
if (bResponseConfirmed || bHardTimeout)
|
||||
{
|
||||
if (bHardTimeout && !bAgentResponseReceived)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Warning,
|
||||
TEXT("Agent silence hard-timeout (2s) without agent_response — declaring agent stopped."));
|
||||
}
|
||||
bAgentSpeaking = false;
|
||||
bAgentResponseReceived = false;
|
||||
SilentTickCount = 0;
|
||||
OnAgentStoppedSpeaking.Broadcast();
|
||||
}
|
||||
@ -84,6 +102,8 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
|
||||
&UElevenLabsConversationalAgentComponent::HandleAgentResponse);
|
||||
WebSocketProxy->OnInterrupted.AddDynamic(this,
|
||||
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
|
||||
WebSocketProxy->OnAgentResponseStarted.AddDynamic(this,
|
||||
&UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted);
|
||||
}
|
||||
|
||||
// Pass configuration to the proxy before connecting.
|
||||
@ -114,6 +134,33 @@ void UElevenLabsConversationalAgentComponent::StartListening()
|
||||
}
|
||||
|
||||
if (bIsListening) return;
|
||||
|
||||
// If the agent is currently generating or speaking, decide how to handle the request.
|
||||
//
|
||||
// Interruption (bAllowInterruption) applies ONLY when the agent is already playing audio
|
||||
// (bAgentSpeaking). Pressing T while the agent speaks immediately stops it and opens the mic.
|
||||
//
|
||||
// During the generation phase (bAgentGenerating, no audio yet) we always block silently.
|
||||
// This prevents the Blueprint's OnAgentStartedGenerating handler — which typically calls
|
||||
// StartListening() for bookkeeping — from accidentally sending an interrupt to the server
|
||||
// the moment it starts generating, which would cancel every response before any audio plays.
|
||||
if (bAgentGenerating || bAgentSpeaking)
|
||||
{
|
||||
if (bAgentSpeaking && bAllowInterruption)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log, TEXT("StartListening: interrupting agent (speaking) to allow user to speak."));
|
||||
InterruptAgent();
|
||||
// InterruptAgent → StopAgentAudio clears bAgentSpeaking / bAgentGenerating,
|
||||
// so we fall through and open the microphone immediately.
|
||||
}
|
||||
else
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log, TEXT("StartListening ignored: agent is %s%s — will listen after agent finishes."),
|
||||
bAgentGenerating ? TEXT("generating") : TEXT("speaking"),
|
||||
(bAgentSpeaking && !bAllowInterruption) ? TEXT(" (interruption disabled)") : TEXT(""));
|
||||
return;
|
||||
}
|
||||
}
|
||||
bIsListening = true;
|
||||
|
||||
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||
@ -225,6 +272,8 @@ void UElevenLabsConversationalAgentComponent::HandleDisconnected(int32 StatusCod
|
||||
UE_LOG(LogElevenLabsAgent, Log, TEXT("Agent disconnected. Code=%d Reason=%s"), StatusCode, *Reason);
|
||||
bIsListening = false;
|
||||
bAgentSpeaking = false;
|
||||
bAgentGenerating = false;
|
||||
bAgentResponseReceived = false;
|
||||
MicAccumulationBuffer.Reset();
|
||||
OnAgentDisconnected.Broadcast(StatusCode, Reason);
|
||||
}
|
||||
@ -250,6 +299,11 @@ void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabs
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
|
||||
{
|
||||
// The server sends agent_response when the full text response is complete.
|
||||
// This is our reliable signal that no more TTS audio chunks will follow.
|
||||
// Set the flag so the silence-detection Tick can safely fire OnAgentStoppedSpeaking.
|
||||
bAgentResponseReceived = true;
|
||||
|
||||
if (bEnableAgentTextResponse)
|
||||
{
|
||||
OnAgentTextResponse.Broadcast(ResponseText);
|
||||
@ -262,6 +316,22 @@ void UElevenLabsConversationalAgentComponent::HandleInterrupted()
|
||||
OnAgentInterrupted.Broadcast();
|
||||
}
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
|
||||
{
|
||||
// The server has started generating a response (first agent_chat_response_part).
|
||||
// Set bAgentGenerating BEFORE StopListening so that any StartListening call
|
||||
// triggered by the Blueprint's OnAgentStartedGenerating handler is blocked.
|
||||
bAgentGenerating = true;
|
||||
|
||||
if (bIsListening)
|
||||
{
|
||||
UE_LOG(LogElevenLabsAgent, Log,
|
||||
TEXT("Agent started generating while mic was open — stopping listening to avoid turn collision."));
|
||||
StopListening();
|
||||
}
|
||||
OnAgentStartedGenerating.Broadcast();
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Audio playback
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
@ -314,6 +384,8 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
||||
if (!bAgentSpeaking)
|
||||
{
|
||||
bAgentSpeaking = true;
|
||||
bAgentGenerating = false; // Agent is now speaking — generation phase is over.
|
||||
bAgentResponseReceived = false; // Reset: wait for agent_response before allowing StopSpeaking.
|
||||
SilentTickCount = 0;
|
||||
OnAgentStartedSpeaking.Broadcast();
|
||||
|
||||
@ -334,6 +406,9 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
AudioQueue.Empty();
|
||||
|
||||
bAgentGenerating = false; // Always clear — covers interruptions during generation phase.
|
||||
bAgentResponseReceived = false; // Reset — next response will re-confirm when done.
|
||||
|
||||
if (bAgentSpeaking)
|
||||
{
|
||||
bAgentSpeaking = false;
|
||||
|
||||
@ -104,8 +104,6 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
||||
}
|
||||
if (PCMData.Num() == 0) return;
|
||||
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
|
||||
|
||||
// Track when the last audio chunk was sent for latency measurement.
|
||||
LastAudioChunkSentTime = FPlatformTime::Seconds();
|
||||
|
||||
@ -119,13 +117,8 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
|
||||
// to avoid the pretty-printed writer and to keep the payload minimal.
|
||||
const FString AudioJson = FString::Printf(TEXT("{\"user_audio_chunk\":\"%s\"}"), *Base64Audio);
|
||||
|
||||
// Log first chunk fully for debugging
|
||||
static int32 AudioChunksSent = 0;
|
||||
AudioChunksSent++;
|
||||
if (AudioChunksSent <= 2)
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT(" Audio JSON (first 200 chars): %.200s"), *AudioJson);
|
||||
}
|
||||
// Per-chunk log at Verbose only — Log level is too spammy (10+ lines per second).
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("SendAudioChunk: %d bytes"), PCMData.Num());
|
||||
|
||||
if (WebSocket.IsValid() && WebSocket->IsConnected())
|
||||
{
|
||||
@ -139,7 +132,17 @@ void UElevenLabsWebSocketProxy::SendUserTurnStart()
|
||||
// The server's VAD detects speech from the audio chunks we send.
|
||||
// user_activity is a keep-alive/timeout-reset message and should NOT be
|
||||
// sent here — it would delay the agent's turn after the user stops.
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn started (audio chunks will follow)."));
|
||||
|
||||
// Reset latency tracking so a new turn starts with a clean state.
|
||||
// If the previous turn got no server response (bWaitingForResponse stayed true),
|
||||
// this prevents stale UserTurnEndTime from corrupting latency measurements
|
||||
// and ensures the state machine is consistent for the new turn.
|
||||
bWaitingForResponse = false;
|
||||
bFirstAudioResponseLogged = false;
|
||||
bAgentResponseStartedFired = false;
|
||||
|
||||
const double T = FPlatformTime::Seconds() - SessionStartTime;
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn started — mic open, audio chunks will follow."), T);
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
||||
@ -149,7 +152,13 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
|
||||
UserTurnEndTime = FPlatformTime::Seconds();
|
||||
bWaitingForResponse = true;
|
||||
bFirstAudioResponseLogged = false;
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
|
||||
// NOTE: Do NOT reset bAgentResponseStartedFired here.
|
||||
// StopListening() calls SendUserTurnEnd(), and HandleAgentResponseStarted() calls StopListening().
|
||||
// If we reset the flag here, the next agent_chat_response_part would re-fire OnAgentResponseStarted
|
||||
// in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop.
|
||||
// The flag is only reset in SendUserTurnStart() at the beginning of a new user turn.
|
||||
const double T = UserTurnEndTime - SessionStartTime;
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T);
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
|
||||
@ -171,6 +180,14 @@ void UElevenLabsWebSocketProxy::SendTextMessage(const FString& Text)
|
||||
void UElevenLabsWebSocketProxy::SendInterrupt()
|
||||
{
|
||||
if (!IsConnected()) return;
|
||||
|
||||
// Immediately start discarding in-flight audio and chat response parts from
|
||||
// the generation we are about to interrupt. The server may still send several
|
||||
// frames before it processes our interrupt. We stop ignoring once the server
|
||||
// sends its "interruption" acknowledgement (HandleInterruption).
|
||||
bIgnoreIncomingContent = true;
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks."));
|
||||
|
||||
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
|
||||
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt);
|
||||
SendJsonMessage(Msg);
|
||||
@ -194,7 +211,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
||||
// "type": "conversation_initiation_client_data",
|
||||
// "conversation_config_override": {
|
||||
// "agent": {
|
||||
// "turn": { "turn_timeout": 3, "speculative_turn": true }
|
||||
// "turn": { "turn_timeout": 3 } // speculative_turn removed (caused silent failures after 2 turns)
|
||||
// },
|
||||
// "tts": {
|
||||
// "optimize_streaming_latency": 3
|
||||
@ -211,19 +228,28 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
||||
// In push-to-talk (Client mode) the user controls the mic; the server still
|
||||
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||
// Lower turn_timeout so the agent responds faster after the user stops speaking.
|
||||
// Default is 7s. In push-to-talk (Client mode), the user explicitly signals
|
||||
// end-of-turn by releasing the key, so we can use a very short timeout (1s).
|
||||
// turn_timeout: how long the server waits after VAD detects silence before
|
||||
// processing the user's turn. In push-to-talk (Client) mode this directly adds
|
||||
// latency to every response — the server waits this many seconds of silence
|
||||
// after the user releases T before it begins LLM processing.
|
||||
//
|
||||
// History:
|
||||
// turn_timeout=1 was originally problematic, but ONLY when combined with
|
||||
// speculative_turn=true (which has since been removed). Without speculative_turn,
|
||||
// 1s is safe and halves the per-turn latency vs the 3s we had previously.
|
||||
// Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
|
||||
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||
{
|
||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
||||
}
|
||||
// Speculative turn: start LLM generation during silence before the VAD is
|
||||
// fully confident the user finished speaking. Reduces latency by 200-500ms.
|
||||
if (bSpeculativeTurn)
|
||||
{
|
||||
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
|
||||
}
|
||||
// NOTE: speculative_turn is intentionally NOT sent here.
|
||||
// With speculative_turn=true the server starts LLM generation speculatively
|
||||
// before the VAD is fully confident the user finished speaking. Combined with
|
||||
// the short turn_timeout this put the server's state machine into a state where
|
||||
// it stopped processing user audio after 2 turns — subsequent turns received
|
||||
// only pings and no agent_chat_response_part / audio / user_transcript at all.
|
||||
// Removing it costs ~200-500ms of latency but restores reliable multi-turn
|
||||
// conversation. Re-enable only if ElevenLabs confirms it is stable.
|
||||
|
||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
||||
@ -297,7 +323,15 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
||||
return;
|
||||
}
|
||||
|
||||
// Log every message type received from the server for debugging.
|
||||
// Suppress ping from the visible log — they arrive every ~2s and flood the output.
|
||||
// Handle ping early before the generic type log.
|
||||
if (MsgType == ElevenLabsMessageType::PingEvent)
|
||||
{
|
||||
HandlePing(Root);
|
||||
return;
|
||||
}
|
||||
|
||||
// Log every non-ping message type received from the server.
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Received message type: %s"), *MsgType);
|
||||
|
||||
if (MsgType == ElevenLabsMessageType::ConversationInitiation)
|
||||
@ -310,11 +344,12 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
||||
if (bWaitingForResponse && !bFirstAudioResponseLogged)
|
||||
{
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||
const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsWS, Warning,
|
||||
TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
|
||||
LatencyFromTurnEnd, LatencyFromLastChunk);
|
||||
TEXT("[T+%.2fs] [LATENCY] First audio: %.0f ms after turn end (%.0f ms after last chunk)"),
|
||||
T, LatencyFromTurnEnd, LatencyFromLastChunk);
|
||||
bFirstAudioResponseLogged = true;
|
||||
}
|
||||
HandleAudioResponse(Root);
|
||||
@ -325,10 +360,11 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
||||
if (bWaitingForResponse)
|
||||
{
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsWS, Warning,
|
||||
TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
|
||||
LatencyFromTurnEnd);
|
||||
TEXT("[T+%.2fs] [LATENCY] User transcript: %.0f ms after turn end"),
|
||||
T, LatencyFromTurnEnd);
|
||||
bWaitingForResponse = false;
|
||||
}
|
||||
HandleTranscript(Root);
|
||||
@ -339,26 +375,27 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
|
||||
if (UserTurnEndTime > 0.0)
|
||||
{
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
|
||||
UE_LOG(LogElevenLabsWS, Warning,
|
||||
TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
|
||||
LatencyFromTurnEnd);
|
||||
TEXT("[T+%.2fs] [LATENCY] Agent text response: %.0f ms after turn end"),
|
||||
T, LatencyFromTurnEnd);
|
||||
}
|
||||
HandleAgentResponse(Root);
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::AgentChatResponsePart)
|
||||
{
|
||||
HandleAgentChatResponsePart();
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
|
||||
{
|
||||
// Silently ignore for now — corrected text after interruption.
|
||||
// Silently ignore — corrected text after interruption.
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("agent_response_correction received (ignored)."));
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::InterruptionEvent)
|
||||
{
|
||||
HandleInterruption(Root);
|
||||
}
|
||||
else if (MsgType == ElevenLabsMessageType::PingEvent)
|
||||
{
|
||||
HandlePing(Root);
|
||||
}
|
||||
else
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Unhandled message type: %s"), *MsgType);
|
||||
@ -415,9 +452,17 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size,
|
||||
}
|
||||
|
||||
// Broadcast raw PCM bytes directly to the audio queue.
|
||||
// Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse).
|
||||
TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
|
||||
BinaryFrameBuffer.Reset();
|
||||
OnAudioReceived.Broadcast(PCMData);
|
||||
if (!bIgnoreIncomingContent)
|
||||
{
|
||||
OnAudioReceived.Broadcast(PCMData);
|
||||
}
|
||||
else
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack)."));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -439,13 +484,23 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtr<FJ
|
||||
(*MetaObj)->TryGetStringField(TEXT("conversation_id"), ConversationInfo.ConversationID);
|
||||
}
|
||||
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Conversation initiated. ID=%s"), *ConversationInfo.ConversationID);
|
||||
SessionStartTime = FPlatformTime::Seconds();
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+0.00s] Conversation initiated. ID=%s"), *ConversationInfo.ConversationID);
|
||||
ConnectionState = EElevenLabsConnectionState::Connected;
|
||||
OnConnected.Broadcast(ConversationInfo);
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
|
||||
{
|
||||
// Discard audio that belongs to an interrupted generation.
|
||||
// The server may send several more audio frames after we sent "interrupt" —
|
||||
// they must not restart the speaking state on the client side.
|
||||
if (bIgnoreIncomingContent)
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack)."));
|
||||
return;
|
||||
}
|
||||
|
||||
// Expected structure:
|
||||
// { "type": "audio",
|
||||
// "audio_event": { "audio_base_64": "<base64 PCM>", "event_id": 1 }
|
||||
@ -513,9 +568,41 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject
|
||||
OnAgentResponse.Broadcast(ResponseText);
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart()
|
||||
{
|
||||
// Ignore response parts that belong to a generation we have already interrupted.
|
||||
// Without this guard, old parts arriving after SendInterrupt() would re-trigger
|
||||
// OnAgentResponseStarted (bAgentResponseStartedFired was reset in SendUserTurnStart),
|
||||
// causing the component to stop the newly-opened microphone — creating an infinite loop.
|
||||
if (bIgnoreIncomingContent)
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_chat_response_part (interrupt pending server ack)."));
|
||||
return;
|
||||
}
|
||||
|
||||
// agent_chat_response_part = the server is actively generating a response (LLM token stream).
|
||||
// Fire OnAgentResponseStarted once per turn so the component can auto-stop the microphone
|
||||
// if the Blueprint restarted listening before the server finished processing the previous turn.
|
||||
if (!bAgentResponseStartedFired)
|
||||
{
|
||||
bAgentResponseStartedFired = true;
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;
|
||||
UE_LOG(LogElevenLabsWS, Log,
|
||||
TEXT("[T+%.2fs] Agent started generating (%.0f ms after turn end — includes VAD silence timeout + LLM start)."),
|
||||
T, LatencyFromTurnEnd);
|
||||
OnAgentResponseStarted.Broadcast();
|
||||
}
|
||||
// Subsequent parts logged at Verbose only (can be dozens per response).
|
||||
}
|
||||
|
||||
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted."));
|
||||
// Server has acknowledged the interruption — the old generation is fully stopped.
|
||||
// Resume accepting incoming audio and chat response parts (for the next turn).
|
||||
bIgnoreIncomingContent = false;
|
||||
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing)."));
|
||||
OnInterrupted.Broadcast();
|
||||
}
|
||||
|
||||
|
||||
@ -34,6 +34,15 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedSpeaking);
|
||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStoppedSpeaking);
|
||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentInterrupted);
|
||||
|
||||
/**
|
||||
* Fired when the server sends its first agent_chat_response_part — i.e. the moment
|
||||
* the LLM starts generating, well before audio arrives.
|
||||
* The component automatically calls StopListening() when this fires while the
|
||||
* microphone is open, preventing the user's new audio from being sent to the
|
||||
* server while it is still processing the previous turn.
|
||||
*/
|
||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnAgentStartedGenerating);
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// UElevenLabsConversationalAgentComponent
|
||||
//
|
||||
@ -83,10 +92,27 @@ public:
|
||||
/**
|
||||
* Enable speculative turn: the LLM starts generating a response during
|
||||
* silence before the VAD is fully confident the user has finished speaking.
|
||||
* Reduces latency by 200-500ms but may occasionally produce premature responses.
|
||||
* Reduces latency by 200-500ms but caused the server to silently stop
|
||||
* processing user audio after 2 turns when combined with a short turn_timeout.
|
||||
* Disabled by default until ElevenLabs confirms stability in multi-turn sessions.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
|
||||
bool bSpeculativeTurn = true;
|
||||
bool bSpeculativeTurn = false;
|
||||
|
||||
/**
|
||||
* Allow the user to interrupt the agent while it is playing audio (speaking).
|
||||
* When true, calling StartListening() while the agent is audibly speaking automatically
|
||||
* sends an interruption signal to the server and opens the mic — no Blueprint nodes needed.
|
||||
* When false, StartListening() is silently ignored until the agent finishes speaking.
|
||||
*
|
||||
* NOTE: interruption only applies during the audio-playback phase (bAgentSpeaking).
|
||||
* While the agent is generating but has not yet started speaking, StartListening() is
|
||||
* always blocked regardless of this flag — this prevents Blueprint's OnAgentStartedGenerating
|
||||
* handler (which often calls StartListening for bookkeeping) from accidentally cancelling
|
||||
* the response before any audio plays.
|
||||
*/
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
|
||||
bool bAllowInterruption = true;
|
||||
|
||||
/**
|
||||
* Forward user speech transcripts (user_transcript events) to the
|
||||
@ -131,6 +157,15 @@ public:
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
FOnAgentInterrupted OnAgentInterrupted;
|
||||
|
||||
/**
|
||||
* Fired when the server starts generating a response (before audio).
|
||||
* The component automatically stops the microphone when this fires while listening,
|
||||
* so the Blueprint doesn't need to handle this manually for push-to-talk.
|
||||
* Bind here if you need UI feedback ("agent is thinking...").
|
||||
*/
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
FOnAgentStartedGenerating OnAgentStartedGenerating;
|
||||
|
||||
// ── Control ───────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
@ -219,6 +254,9 @@ private:
|
||||
UFUNCTION()
|
||||
void HandleInterrupted();
|
||||
|
||||
UFUNCTION()
|
||||
void HandleAgentResponseStarted();
|
||||
|
||||
// ── Audio playback ────────────────────────────────────────────────────────
|
||||
void InitAudioPlayback();
|
||||
void EnqueueAgentAudio(const TArray<uint8>& PCMData);
|
||||
@ -244,15 +282,32 @@ private:
|
||||
// ── State ─────────────────────────────────────────────────────────────────
|
||||
bool bIsListening = false;
|
||||
bool bAgentSpeaking = false;
|
||||
// True from the first agent_chat_response_part until the first audio chunk arrives.
|
||||
// Used to block StartListening() while the server is processing the previous turn.
|
||||
bool bAgentGenerating = false;
|
||||
|
||||
// Accumulates incoming PCM bytes until the audio component needs data.
|
||||
TArray<uint8> AudioQueue;
|
||||
FCriticalSection AudioQueueLock;
|
||||
|
||||
// Simple heuristic: if we haven't received audio data for this many ticks,
|
||||
// consider the agent done speaking.
|
||||
// Silence detection: how many consecutive ticks with an empty audio queue.
|
||||
int32 SilentTickCount = 0;
|
||||
static constexpr int32 SilenceThresholdTicks = 30; // ~0.5s at 60fps
|
||||
|
||||
// Primary threshold: fire OnAgentStoppedSpeaking after this many silent ticks
|
||||
// once the server has confirmed the full response (bAgentResponseReceived=true).
|
||||
// 30 ticks ≈ 0.5s at 60fps — enough to bridge brief inter-chunk gaps in the TTS stream.
|
||||
static constexpr int32 SilenceThresholdTicks = 30;
|
||||
|
||||
// Hard-timeout fallback: fire even without agent_response confirmation after 2s
|
||||
// of silence (handles edge cases where agent_response is very late or missing).
|
||||
static constexpr int32 HardSilenceTimeoutTicks = 120; // 2s at 60fps
|
||||
|
||||
// True once the server sends agent_response for the current turn.
|
||||
// The server sends the full text when generation is complete — this is the
|
||||
// reliable signal that no more audio chunks will follow for this utterance.
|
||||
// We wait for this before declaring the agent "stopped speaking" to avoid
|
||||
// premature OnAgentStoppedSpeaking events during multi-chunk TTS streaming.
|
||||
bool bAgentResponseReceived = false;
|
||||
|
||||
// ── Microphone accumulation ───────────────────────────────────────────────
|
||||
// WASAPI fires callbacks every ~5ms (158 bytes at 16kHz 16-bit mono).
|
||||
|
||||
@ -49,8 +49,9 @@ namespace ElevenLabsMessageType
|
||||
static const FString AudioResponse = TEXT("audio");
|
||||
// User speech-to-text transcript (speaker is always the user)
|
||||
static const FString UserTranscript = TEXT("user_transcript");
|
||||
static const FString AgentResponse = TEXT("agent_response");
|
||||
static const FString AgentResponseCorrection= TEXT("agent_response_correction");
|
||||
static const FString AgentResponse = TEXT("agent_response");
|
||||
static const FString AgentChatResponsePart = TEXT("agent_chat_response_part"); // intermediate LLM token stream
|
||||
static const FString AgentResponseCorrection = TEXT("agent_response_correction");
|
||||
static const FString InterruptionEvent = TEXT("interruption");
|
||||
static const FString PingEvent = TEXT("ping");
|
||||
static const FString ClientToolCall = TEXT("client_tool_call");
|
||||
|
||||
@ -36,6 +36,13 @@ DECLARE_DYNAMIC_MULTICAST_DELEGATE_OneParam(FOnElevenLabsAgentResponse,
|
||||
/** Fired when the agent interrupts the user. */
|
||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsInterrupted);
|
||||
|
||||
/**
|
||||
* Fired when the server starts generating a response (first agent_chat_response_part received).
|
||||
* This fires BEFORE audio arrives — useful to detect that the server is processing
|
||||
* the previous turn while the client may have restarted listening (auto-restart scenario).
|
||||
*/
|
||||
DECLARE_DYNAMIC_MULTICAST_DELEGATE(FOnElevenLabsAgentResponseStarted);
|
||||
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// WebSocket Proxy
|
||||
@ -79,6 +86,14 @@ public:
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
FOnElevenLabsInterrupted OnInterrupted;
|
||||
|
||||
/**
|
||||
* Fired on the first agent_chat_response_part per turn — i.e. the moment the server
|
||||
* starts generating. Fires well before audio. The component uses this to stop the
|
||||
* microphone if it was restarted before the server finished processing the previous turn.
|
||||
*/
|
||||
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
|
||||
FOnElevenLabsAgentResponseStarted OnAgentResponseStarted;
|
||||
|
||||
// ── Lifecycle ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
@ -167,6 +182,7 @@ private:
|
||||
void HandleAudioResponse(const TSharedPtr<FJsonObject>& Payload);
|
||||
void HandleTranscript(const TSharedPtr<FJsonObject>& Payload);
|
||||
void HandleAgentResponse(const TSharedPtr<FJsonObject>& Payload);
|
||||
void HandleAgentChatResponsePart();
|
||||
void HandleInterruption(const TSharedPtr<FJsonObject>& Payload);
|
||||
void HandlePing(const TSharedPtr<FJsonObject>& Payload);
|
||||
|
||||
@ -193,6 +209,19 @@ private:
|
||||
bool bWaitingForResponse = false;
|
||||
// Whether we already logged the first audio response latency for this turn.
|
||||
bool bFirstAudioResponseLogged = false;
|
||||
// Whether OnAgentResponseStarted has already been fired for the current turn.
|
||||
// Reset at turn start so only the first agent_chat_response_part fires the event.
|
||||
bool bAgentResponseStartedFired = false;
|
||||
|
||||
// Timestamp when the conversation was initiated (conversation_initiation_metadata received).
|
||||
// Used to compute [T+Xs] session-relative timestamps in all log messages.
|
||||
double SessionStartTime = 0.0;
|
||||
|
||||
// Set to true in SendInterrupt() so that in-flight audio frames and
|
||||
// agent_chat_response_part messages from the interrupted generation are silently
|
||||
// discarded instead of re-triggering the speaking/generating state.
|
||||
// Cleared when the server sends its "interruption" acknowledgement.
|
||||
bool bIgnoreIncomingContent = false;
|
||||
|
||||
public:
|
||||
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
|
||||
|
||||
35
build.bat
Normal file
35
build.bat
Normal file
@ -0,0 +1,35 @@
|
||||
@echo off
rem Builds the PS_AI_Agent editor target through RunUAT and reports the outcome.
rem The script returns the build's exit code so that callers can detect a failure.
setlocal
chcp 65001 >nul
title Build PS_AI_Agent

echo ============================================================
echo PS_AI_Agent - Compilation plugin ElevenLabs (UE 5.5)
echo ============================================================
echo.
echo ATTENTION : Ferme l'Unreal Editor avant de continuer !
echo (Les DLL seraient verrouillees et la compilation echouerait)
echo.
pause

echo.
echo Compilation en cours...
echo (Seuls les .cpp modifies sont recompiles, ~16s)
echo.

rem "exit $LASTEXITCODE" forwards RunUAT's real exit code; without it, powershell.exe -Command
rem only reports 0 or 1 depending on the success state of the last command.
powershell.exe -Command "& 'C:\Program Files\Epic Games\UE_5.5\Engine\Build\BatchFiles\RunUAT.bat' BuildEditor -project='C:\ASTERION\GIT\PS_AI_Agent\Unreal\PS_AI_Agent\PS_AI_Agent.uproject' -notools -noP4 2>&1; exit $LASTEXITCODE"

rem Capture the result immediately, before any later command can overwrite ERRORLEVEL.
set "BUILD_RC=%ERRORLEVEL%"

echo.
rem Parentheses in the failure message are caret-escaped: an unescaped closing parenthesis
rem would terminate the else-block early and print the failure hint after successful builds too.
if "%BUILD_RC%"=="0" (
    echo ============================================================
    echo SUCCES - Compilation terminee sans erreur.
    echo Tu peux relancer l'Unreal Editor.
    echo ============================================================
) else (
    echo ============================================================
    echo ECHEC - Erreur de compilation ^(code %BUILD_RC%^)
    echo Consulte le log ci-dessus pour le detail.
    echo ============================================================
)

echo.
pause
exit /b %BUILD_RC%
|
||||
Loading…
x
Reference in New Issue
Block a user