v1.9.1: Fix audio loss after interruption, instant audio stop, lip sync reset

- Fix event_id filtering bug: reset LastInterruptEventId when new generation
  starts, preventing all audio from being silently dropped after an interruption
- Match C++ sample API config: remove optimize_streaming_latency and
  custom_llm_extra_body overrides, send empty conversation_config_override
  in Server VAD mode (only send turn_timeout in Client mode)
- Instant audio stop on interruption: call ResetAudio() before Stop() to
  flush USoundWaveProcedural's internal ring buffer
- Lip sync reset on interruption/stop: bind OnAgentInterrupted (snap to
  neutral) and OnAgentStoppedSpeaking (clear queues) events
- Revert jitter buffer (replaced by pre-buffer approach, default 2000ms)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
j.foucher 2026-02-24 09:48:56 +01:00
parent c2142f3e6b
commit 6543bc6785
6 changed files with 149 additions and 51 deletions

View File

@ -577,9 +577,9 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
// The AudioPlaybackComponent is still "playing" from the previous turn
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
// keeps firing. Without this guard, the underflow callback would drain
// the AudioQueue immediately, defeating the pre-buffer entirely.
// the AudioQueue immediately, defeating the buffer entirely.
// The ProceduralSoundWave generates silence internally when we return
// nothing — this silence does NOT accumulate, so once bPreBuffering
// nothing — this silence does NOT accumulate, so once buffering
// clears, the buffered data plays immediately.
if (bPreBuffering)
{
@ -703,6 +703,17 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
void UElevenLabsConversationalAgentComponent::StopAgentAudio()
{
// Flush the ProceduralSoundWave's internal buffer BEFORE stopping.
// QueueAudio() pushes data into the wave's internal ring buffer during
// OnProceduralUnderflow. Calling Stop() alone stops the AudioComponent
// but the wave still holds buffered data that would play briefly on the
// next Play() call, causing a delayed/ghostly tail of the interrupted audio.
// ResetAudio() clears that internal buffer for an instant cut.
if (ProceduralSoundWave)
{
ProceduralSoundWave->ResetAudio();
}
if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying())
{
AudioPlaybackComponent->Stop();
@ -713,7 +724,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
// while holding it would block the audio thread for the full Blueprint handler duration.
bool bWasSpeaking = false;
double Now = 0.0;
bPreBuffering = false; // Clear pre-buffer state on stop.
bPreBuffering = false; // Clear pre-buffer state on stop.
{
FScopeLock Lock(&AudioQueueLock);
AudioQueue.Empty();

View File

@ -229,11 +229,18 @@ void UElevenLabsLipSyncComponent::BeginPlay()
Agent->OnAgentTextResponse.AddDynamic(
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
// Bind to interruption/stop events so lip sync resets immediately
// when the agent is cut off or finishes speaking.
Agent->OnAgentInterrupted.AddDynamic(
this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
Agent->OnAgentStoppedSpeaking.AddDynamic(
this, &UElevenLabsLipSyncComponent::OnAgentStopped);
// Enable partial response streaming if not already enabled
Agent->bEnableAgentPartialResponse = true;
UE_LOG(LogElevenLabsLipSync, Log,
TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName());
TEXT("Lip sync bound to agent component on %s (audio + text + interruption)."), *Owner->GetName());
}
else
{
@ -392,6 +399,10 @@ void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReas
this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
AgentComponent->OnAgentTextResponse.RemoveDynamic(
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
AgentComponent->OnAgentInterrupted.RemoveDynamic(
this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
AgentComponent->OnAgentStoppedSpeaking.RemoveDynamic(
this, &UElevenLabsLipSyncComponent::OnAgentStopped);
}
AgentComponent.Reset();
SpectrumAnalyzer.Reset();
@ -413,9 +424,10 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
// We consume one queued frame every 32ms to match the original audio timing.
constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s
// Pre-buffer sync: don't consume viseme queue while the agent component is
// pre-buffering audio. This keeps lip sync in sync with audio playback.
// Without this, the lip sync would start 250ms ahead of the audio.
// Buffer sync: don't consume viseme queue while the agent component is
// pre-buffering audio (delaying playback to accumulate chunks).
// This keeps lip sync in sync with audio playback.
// Without this, the lip sync would run ahead of the audio during buffering.
if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
{
return;
@ -593,6 +605,57 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
}
}
// ─────────────────────────────────────────────────────────────────────────────
// Interruption / stop handlers
// ─────────────────────────────────────────────────────────────────────────────
// Handler bound to the agent component's OnAgentInterrupted delegate in
// BeginPlay (and unbound in EndPlay). Fired when the user barges in and the
// agent's speech is cut off mid-utterance.
// Delegates all state clearing to ResetToNeutral() so the mouth snaps shut
// immediately instead of finishing visemes for audio that will never play.
void UElevenLabsLipSyncComponent::OnAgentInterrupted()
{
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent interrupted — resetting lip sync to neutral."));
ResetToNeutral();
}
// Handler bound to the agent component's OnAgentStoppedSpeaking delegate.
// Fired on a normal end of utterance (not an interruption), so only the
// playback-side state is discarded here.
void UElevenLabsLipSyncComponent::OnAgentStopped()
{
    UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent stopped speaking — clearing lip sync queues."));
    // Reset the playback bookkeeping, then flush the pending frames so the
    // mouth returns to neutral right away. Text-driven state is deliberately
    // left alone: TickComponent's queue-runs-dry path (gated on
    // bFullTextReceived) already finalises it.
    PlaybackTimer = 0.0f;
    bWaitingForText = false;
    VisemeQueue.Reset();
    AmplitudeQueue.Reset();
}
void UElevenLabsLipSyncComponent::ResetToNeutral()
{
// Clear all queued viseme and amplitude data
VisemeQueue.Reset();
AmplitudeQueue.Reset();
PlaybackTimer = 0.0f;
bWaitingForText = false;
// Reset text-driven lip sync state for the interrupted utterance
AccumulatedText.Reset();
TextVisemeSequence.Reset();
bTextVisemesApplied = false;
bFullTextReceived = false;
// Snap all visemes to silence immediately (no smoothing delay)
for (const FName& Name : VisemeNames)
{
TargetVisemes.FindOrAdd(Name) = 0.0f;
SmoothedVisemes.FindOrAdd(Name) = 0.0f;
}
TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f;
// Clear blendshapes so the mouth returns to fully neutral
CurrentBlendshapes.Reset();
PreviousBlendshapes.Reset();
LastConsumedVisemes.Reset();
}
// ─────────────────────────────────────────────────────────────────────────────
// Audio analysis
// ─────────────────────────────────────────────────────────────────────────────

View File

@ -221,54 +221,54 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
// }
// }
// Configure turn-taking behaviour.
// The ElevenLabs API does NOT have a turn.mode field.
// Turn-taking is controlled by the server's VAD and the turn_* parameters.
// In push-to-talk (Client mode) the user controls the mic; the server still
// uses its VAD to detect the end of speech from the audio chunks it receives.
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
// turn_timeout: how long the server waits after VAD detects silence before
// processing the user's turn. In push-to-talk (Client) mode this directly adds
// latency to every response — the server waits this many seconds of silence
// after the user releases T before it begins LLM processing.
// Build conversation_config_override matching the C++ ElevenLabs sample as closely
// as possible. The C++ sample sends: { "conversation_config_override": {} } (all defaults).
// Sending empty = server defaults for TTS chunking, latency, and LLM behaviour.
// This produces smooth continuous audio chunks without the fragmentation caused by
// explicit optimize_streaming_latency or enable_intermediate_response overrides.
//
// History:
// turn_timeout=1 was originally problematic, but ONLY when combined with
// speculative_turn=true (which has since been removed). Without speculative_turn,
// 1s is safe and halves the per-turn latency vs the 3s we had previously.
// Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
// In Server VAD mode, the config override is empty (matches C++ sample exactly).
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
if (TurnMode == EElevenLabsTurnMode::Client)
{
// turn_timeout: how long the server waits after VAD detects silence before
// processing the user's turn. Default is ~3s. In push-to-talk mode this
// directly adds latency — the server waits after the user releases T.
// 1s is safe without speculative_turn (which was removed — see history below).
//
// History:
// turn_timeout=1 was problematic when combined with speculative_turn=true
// (server silently dropped turns 3+). Without speculative_turn, 1s is safe
// and halves the per-turn latency.
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
}
// NOTE: speculative_turn is intentionally NOT sent here.
// With speculative_turn=true the server starts LLM generation speculatively
// before the VAD is fully confident the user finished speaking. Combined with
// the short turn_timeout this put the server's state machine into a state where
// it stopped processing user audio after 2 turns — subsequent turns received
// only pings and no agent_chat_response_part / audio / user_transcript at all.
// Removing it costs ~200-500ms of latency but restores reliable multi-turn
// conversation. Re-enable only if ElevenLabs confirms it is stable.
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
// enable_intermediate_response reduces time-to-first-audio by allowing the agent
// to start speaking before it has finished generating the full response.
TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
//
// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
// the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
// With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
// With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
// Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
//
// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
// before finishing generation → fragmented audio. When omitted (C++ sample), the
// LLM completes its response first → continuous TTS chunks.
//
// - custom_llm_extra_body (empty object): Even an empty object might override the
// agent's configured custom_llm_extra_body with nothing. Omit entirely.
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
// NOTE: We bypass SendJsonMessage() here intentionally.
// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
@ -578,6 +578,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
if (!bAgentResponseStartedFired)
{
bAgentResponseStartedFired = true;
// Reset the interrupt audio filter: a new response generation has started,
// so all subsequent audio belongs to this NEW generation and must not be
// discarded by the stale interrupt event_id from the PREVIOUS generation.
// Without this reset, audio for the new response is silently dropped when
// its event_id <= LastInterruptEventId (which was set during the interruption
// of the previous response).
if (LastInterruptEventId > 0)
{
UE_LOG(LogElevenLabsWS, Log,
TEXT("New generation started — resetting LastInterruptEventId (was %d)."),
LastInterruptEventId);
LastInterruptEventId = 0;
}
const double Now = FPlatformTime::Seconds();
const double T = Now - SessionStartTime;
const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;

View File

@ -137,14 +137,12 @@ public:
bool bEnableAgentPartialResponse = false;
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
* ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
* Pre-buffering delays playback start so the second chunk arrives before the
* first finishes playing, eliminating the audible gap mid-sentence.
* Higher values = fewer gaps but more latency on the first word.
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
* Delays playback start so early TTS chunks can accumulate, preventing
* mid-sentence pauses when the second chunk hasn't arrived yet.
* Set to 0 for immediate playback. */
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
meta = (ClampMin = "0", ClampMax = "4000",
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
int32 AudioPreBufferMs = 2000;
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */

View File

@ -103,6 +103,17 @@ private:
UFUNCTION()
void OnPartialTextReceived(const FString& PartialText);
/** Called when the agent is interrupted — immediately reset lip sync to neutral. */
UFUNCTION()
void OnAgentInterrupted();
/** Called when the agent finishes speaking — reset lip sync state for next utterance. */
UFUNCTION()
void OnAgentStopped();
/** Clear all lip sync queues and reset mouth to neutral pose. */
void ResetToNeutral();
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
void ConvertTextToVisemes(const FString& Text);