Roll back to a more functional, though still imperfect, version

This commit is contained in:
j.foucher 2026-02-21 19:49:26 +01:00
parent 1b883f532f
commit d8957625f8
4 changed files with 4 additions and 71 deletions

View File

@@ -459,9 +459,7 @@ void UElevenLabsConversationalAgentComponent::HandleAgentResponseStarted()
// bAgentGenerating guard). Flushing would send audio to a server that is mid-generation,
// causing it to re-enter "user speaking" state and stall — both sides stuck.
//
// Do NOT send an interrupt here: the ElevenLabs server does not always send the
// interruption ack, which would leave bIgnoreIncomingContent=true and silently
// discard all subsequent content. Instead, let the server's response play out:
// Do NOT send an interrupt here — just let the server's response play out:
// - If audio arrives → EnqueueAgentAudio sets bAgentSpeaking, response plays normally.
// - If audio never arrives → generating timeout (10s) clears bAgentGenerating.
// Either way the state machine recovers and Blueprint can reopen the mic.

View File

@@ -158,20 +158,6 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
// in a loop: part arrives → event → StopListening → SendUserTurnEnd → flag reset → part arrives → loop.
// The flag is only reset in SendUserTurnStart() at the beginning of a new user turn.
// Clear the interrupt-ignore flag if it was never cleared by an "interruption" server ack.
// The ElevenLabs server does not always send the "interruption" acknowledgement reliably.
// By the time the user has spoken a full new turn (seconds of audio), any in-flight content
// from the previously interrupted generation has long since arrived — it is safe to resume
// normal content processing so the server's response to this new turn is not silently discarded.
if (bIgnoreIncomingContent)
{
bIgnoreIncomingContent = false;
const double T = UserTurnEndTime - SessionStartTime;
UE_LOG(LogElevenLabsWS, Log,
TEXT("[T+%.2fs] Cleared interrupt-ignore flag at turn end (server 'interruption' ack was not received — resuming content processing)."),
T);
}
const double T = UserTurnEndTime - SessionStartTime;
UE_LOG(LogElevenLabsWS, Log, TEXT("[T+%.2fs] User turn ended — server VAD silence detection started (turn_timeout=1s)."), T);
}
@@ -196,12 +182,7 @@ void UElevenLabsWebSocketProxy::SendInterrupt()
{
if (!IsConnected()) return;
// Immediately start discarding in-flight audio and chat response parts from
// the generation we are about to interrupt. The server may still send several
// frames before it processes our interrupt. We stop ignoring once the server
// sends its "interruption" acknowledgement (HandleInterruption).
bIgnoreIncomingContent = true;
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt — ignoring incoming content until server acks."));
UE_LOG(LogElevenLabsWS, Log, TEXT("Sending interrupt."));
TSharedPtr<FJsonObject> Msg = MakeShareable(new FJsonObject());
Msg->SetStringField(TEXT("type"), ElevenLabsMessageType::Interrupt);
@@ -467,17 +448,9 @@ void UElevenLabsWebSocketProxy::OnWsBinaryMessage(const void* Data, SIZE_T Size,
}
// Broadcast raw PCM bytes directly to the audio queue.
// Discard if we are waiting for an interruption ack (same logic as HandleAudioResponse).
TArray<uint8> PCMData = MoveTemp(BinaryFrameBuffer);
BinaryFrameBuffer.Reset();
if (!bIgnoreIncomingContent)
{
OnAudioReceived.Broadcast(PCMData);
}
else
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding binary audio frame (interrupt pending server ack)."));
}
OnAudioReceived.Broadcast(PCMData);
}
}
@@ -507,15 +480,6 @@ void UElevenLabsWebSocketProxy::HandleConversationInitiation(const TSharedPtr<FJ
void UElevenLabsWebSocketProxy::HandleAudioResponse(const TSharedPtr<FJsonObject>& Root)
{
// Discard audio that belongs to an interrupted generation.
// The server may send several more audio frames after we sent "interrupt" —
// they must not restart the speaking state on the client side.
if (bIgnoreIncomingContent)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding audio frame (interrupt pending server ack)."));
return;
}
// Expected structure:
// { "type": "audio",
// "audio_event": { "audio_base_64": "<base64 PCM>", "event_id": 1 }
@@ -569,16 +533,6 @@ void UElevenLabsWebSocketProxy::HandleTranscript(const TSharedPtr<FJsonObject>&
void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject>& Root)
{
// ISSUE-19: discard agent_response that belongs to an interrupted generation.
// A stale agent_response from the cancelled turn would set bAgentResponseReceived=true
// on the component, allowing the silence-detection Tick to fire OnAgentStoppedSpeaking
// at the wrong time (no audio is currently playing for the new turn yet).
if (bIgnoreIncomingContent)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_response (interrupt pending server ack)."));
return;
}
// ISSUE-22: reset bAgentResponseStartedFired so OnAgentResponseStarted fires again on
// the next turn. In Server VAD mode SendUserTurnStart() is never called — it is the only
// other place that resets this flag — so without this reset, OnAgentResponseStarted fires
@@ -604,16 +558,6 @@ void UElevenLabsWebSocketProxy::HandleAgentResponse(const TSharedPtr<FJsonObject
void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJsonObject>& Root)
{
// Ignore response parts that belong to a generation we have already interrupted.
// Without this guard, old parts arriving after SendInterrupt() would re-trigger
// OnAgentResponseStarted (bAgentResponseStartedFired was reset in SendUserTurnStart),
// causing the component to stop the newly-opened microphone — creating an infinite loop.
if (bIgnoreIncomingContent)
{
UE_LOG(LogElevenLabsWS, Verbose, TEXT("Discarding agent_chat_response_part (interrupt pending server ack)."));
return;
}
// agent_chat_response_part = the server is actively generating a response (LLM token stream).
// Fire OnAgentResponseStarted once per turn so the component can auto-stop the microphone
// if the Blueprint restarted listening before the server finished processing the previous turn.
@@ -647,10 +591,7 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
void UElevenLabsWebSocketProxy::HandleInterruption(const TSharedPtr<FJsonObject>& Root)
{
// Server has acknowledged the interruption — the old generation is fully stopped.
// Resume accepting incoming audio and chat response parts (for the next turn).
bIgnoreIncomingContent = false;
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received — resuming content processing)."));
UE_LOG(LogElevenLabsWS, Log, TEXT("Agent interrupted (server ack received)."));
OnInterrupted.Broadcast();
}

View File

@@ -226,12 +226,6 @@ private:
// Used to compute [T+Xs] session-relative timestamps in all log messages.
double SessionStartTime = 0.0;
// Set to true in SendInterrupt() so that in-flight audio frames and
// agent_chat_response_part messages from the interrupted generation are silently
// discarded instead of re-triggering the speaking/generating state.
// Cleared when the server sends its "interruption" acknowledgement.
bool bIgnoreIncomingContent = false;
public:
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
// Controls turn_timeout in conversation_initiation_client_data.