v1.9.1: Fix audio loss after interruption, instant audio stop, lip sync reset
- Fix event_id filtering bug: reset LastInterruptEventId when a new generation starts, preventing all audio from being silently dropped after an interruption
- Match C++ sample API config: remove optimize_streaming_latency and custom_llm_extra_body overrides, send empty conversation_config_override in Server VAD mode (only send turn_timeout in Client mode)
- Instant audio stop on interruption: call ResetAudio() before Stop() to flush USoundWaveProcedural's internal ring buffer
- Lip sync reset on interruption/stop: bind OnAgentInterrupted (snap to neutral) and OnAgentStoppedSpeaking (clear queues) events
- Revert jitter buffer (replaced by pre-buffer approach, default 2000ms)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c2142f3e6b
commit
6543bc6785
Binary file not shown.
@ -577,9 +577,9 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
||||
// The AudioPlaybackComponent is still "playing" from the previous turn
|
||||
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
|
||||
// keeps firing. Without this guard, the underflow callback would drain
|
||||
// the AudioQueue immediately, defeating the pre-buffer entirely.
|
||||
// the AudioQueue immediately, defeating the buffer entirely.
|
||||
// The ProceduralSoundWave generates silence internally when we return
|
||||
// nothing — this silence does NOT accumulate, so once bPreBuffering
|
||||
// nothing — this silence does NOT accumulate, so once buffering
|
||||
// clears, the buffered data plays immediately.
|
||||
if (bPreBuffering)
|
||||
{
|
||||
@ -703,6 +703,17 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
||||
|
||||
void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
||||
{
|
||||
// Flush the ProceduralSoundWave's internal buffer BEFORE stopping.
|
||||
// QueueAudio() pushes data into the wave's internal ring buffer during
|
||||
// OnProceduralUnderflow. Calling Stop() alone stops the AudioComponent
|
||||
// but the wave still holds buffered data that would play briefly on the
|
||||
// next Play() call, causing a delayed/ghostly tail of the interrupted audio.
|
||||
// ResetAudio() clears that internal buffer for an instant cut.
|
||||
if (ProceduralSoundWave)
|
||||
{
|
||||
ProceduralSoundWave->ResetAudio();
|
||||
}
|
||||
|
||||
if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying())
|
||||
{
|
||||
AudioPlaybackComponent->Stop();
|
||||
@ -713,7 +724,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
||||
// while holding it would block the audio thread for the full Blueprint handler duration.
|
||||
bool bWasSpeaking = false;
|
||||
double Now = 0.0;
|
||||
bPreBuffering = false; // Clear pre-buffer state on stop.
|
||||
bPreBuffering = false; // Clear pre-buffer state on stop.
|
||||
{
|
||||
FScopeLock Lock(&AudioQueueLock);
|
||||
AudioQueue.Empty();
|
||||
|
||||
@ -229,11 +229,18 @@ void UElevenLabsLipSyncComponent::BeginPlay()
|
||||
Agent->OnAgentTextResponse.AddDynamic(
|
||||
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
|
||||
|
||||
// Bind to interruption/stop events so lip sync resets immediately
|
||||
// when the agent is cut off or finishes speaking.
|
||||
Agent->OnAgentInterrupted.AddDynamic(
|
||||
this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
|
||||
Agent->OnAgentStoppedSpeaking.AddDynamic(
|
||||
this, &UElevenLabsLipSyncComponent::OnAgentStopped);
|
||||
|
||||
// Enable partial response streaming if not already enabled
|
||||
Agent->bEnableAgentPartialResponse = true;
|
||||
|
||||
UE_LOG(LogElevenLabsLipSync, Log,
|
||||
TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName());
|
||||
TEXT("Lip sync bound to agent component on %s (audio + text + interruption)."), *Owner->GetName());
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -392,6 +399,10 @@ void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReas
|
||||
this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
|
||||
AgentComponent->OnAgentTextResponse.RemoveDynamic(
|
||||
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
|
||||
AgentComponent->OnAgentInterrupted.RemoveDynamic(
|
||||
this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
|
||||
AgentComponent->OnAgentStoppedSpeaking.RemoveDynamic(
|
||||
this, &UElevenLabsLipSyncComponent::OnAgentStopped);
|
||||
}
|
||||
AgentComponent.Reset();
|
||||
SpectrumAnalyzer.Reset();
|
||||
@ -413,9 +424,10 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
||||
// We consume one queued frame every 32ms to match the original audio timing.
|
||||
constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s
|
||||
|
||||
// Pre-buffer sync: don't consume viseme queue while the agent component is
|
||||
// pre-buffering audio. This keeps lip sync in sync with audio playback.
|
||||
// Without this, the lip sync would start 250ms ahead of the audio.
|
||||
// Buffer sync: don't consume viseme queue while the agent component is
|
||||
// pre-buffering audio (delaying playback to accumulate chunks).
|
||||
// This keeps lip sync in sync with audio playback.
|
||||
// Without this, the lip sync would run ahead of the audio during buffering.
|
||||
if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
|
||||
{
|
||||
return;
|
||||
@ -593,6 +605,57 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
||||
}
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Interruption / stop handlers
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
void UElevenLabsLipSyncComponent::OnAgentInterrupted()
|
||||
{
|
||||
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent interrupted — resetting lip sync to neutral."));
|
||||
ResetToNeutral();
|
||||
}
|
||||
|
||||
void UElevenLabsLipSyncComponent::OnAgentStopped()
|
||||
{
|
||||
// Don't clear text state here — it's already handled by TickComponent's
|
||||
// "queue runs dry" logic which checks bFullTextReceived.
|
||||
// Just clear the queues so the mouth returns to neutral immediately.
|
||||
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent stopped speaking — clearing lip sync queues."));
|
||||
VisemeQueue.Reset();
|
||||
AmplitudeQueue.Reset();
|
||||
PlaybackTimer = 0.0f;
|
||||
bWaitingForText = false;
|
||||
}
|
||||
|
||||
void UElevenLabsLipSyncComponent::ResetToNeutral()
|
||||
{
|
||||
// Clear all queued viseme and amplitude data
|
||||
VisemeQueue.Reset();
|
||||
AmplitudeQueue.Reset();
|
||||
PlaybackTimer = 0.0f;
|
||||
bWaitingForText = false;
|
||||
|
||||
// Reset text-driven lip sync state for the interrupted utterance
|
||||
AccumulatedText.Reset();
|
||||
TextVisemeSequence.Reset();
|
||||
bTextVisemesApplied = false;
|
||||
bFullTextReceived = false;
|
||||
|
||||
// Snap all visemes to silence immediately (no smoothing delay)
|
||||
for (const FName& Name : VisemeNames)
|
||||
{
|
||||
TargetVisemes.FindOrAdd(Name) = 0.0f;
|
||||
SmoothedVisemes.FindOrAdd(Name) = 0.0f;
|
||||
}
|
||||
TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
|
||||
SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f;
|
||||
|
||||
// Clear blendshapes so the mouth returns to fully neutral
|
||||
CurrentBlendshapes.Reset();
|
||||
PreviousBlendshapes.Reset();
|
||||
LastConsumedVisemes.Reset();
|
||||
}
|
||||
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
// Audio analysis
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@ -221,54 +221,54 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
||||
// }
|
||||
// }
|
||||
|
||||
// Configure turn-taking behaviour.
|
||||
// The ElevenLabs API does NOT have a turn.mode field.
|
||||
// Turn-taking is controlled by the server's VAD and the turn_* parameters.
|
||||
// In push-to-talk (Client mode) the user controls the mic; the server still
|
||||
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||
// turn_timeout: how long the server waits after VAD detects silence before
|
||||
// processing the user's turn. In push-to-talk (Client) mode this directly adds
|
||||
// latency to every response — the server waits this many seconds of silence
|
||||
// after the user releases T before it begins LLM processing.
|
||||
// Build conversation_config_override matching the C++ ElevenLabs sample as closely
|
||||
// as possible. The C++ sample sends: { "conversation_config_override": {} } (all defaults).
|
||||
// Sending empty = server defaults for TTS chunking, latency, and LLM behaviour.
|
||||
// This produces smooth continuous audio chunks without the fragmentation caused by
|
||||
// explicit optimize_streaming_latency or enable_intermediate_response overrides.
|
||||
//
|
||||
// History:
|
||||
// turn_timeout=1 was originally problematic, but ONLY when combined with
|
||||
// speculative_turn=true (which has since been removed). Without speculative_turn,
|
||||
// 1s is safe and halves the per-turn latency vs the 3s we had previously.
|
||||
// Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
|
||||
// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
|
||||
// In Server VAD mode, the config override is empty (matches C++ sample exactly).
|
||||
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
|
||||
|
||||
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||
{
|
||||
// turn_timeout: how long the server waits after VAD detects silence before
|
||||
// processing the user's turn. Default is ~3s. In push-to-talk mode this
|
||||
// directly adds latency — the server waits after the user releases T.
|
||||
// 1s is safe without speculative_turn (which was removed — see history below).
|
||||
//
|
||||
// History:
|
||||
// turn_timeout=1 was problematic when combined with speculative_turn=true
|
||||
// (server silently dropped turns 3+). Without speculative_turn, 1s is safe
|
||||
// and halves the per-turn latency.
|
||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
||||
|
||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
||||
|
||||
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
|
||||
}
|
||||
// NOTE: speculative_turn is intentionally NOT sent here.
|
||||
// With speculative_turn=true the server starts LLM generation speculatively
|
||||
// before the VAD is fully confident the user finished speaking. Combined with
|
||||
// the short turn_timeout this put the server's state machine into a state where
|
||||
// it stopped processing user audio after 2 turns — subsequent turns received
|
||||
// only pings and no agent_chat_response_part / audio / user_transcript at all.
|
||||
// Removing it costs ~200-500ms of latency but restores reliable multi-turn
|
||||
// conversation. Re-enable only if ElevenLabs confirms it is stable.
|
||||
|
||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
||||
|
||||
TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
|
||||
TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
|
||||
|
||||
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
|
||||
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
|
||||
ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
|
||||
|
||||
// enable_intermediate_response reduces time-to-first-audio by allowing the agent
|
||||
// to start speaking before it has finished generating the full response.
|
||||
TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
|
||||
CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
|
||||
// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
|
||||
//
|
||||
// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
|
||||
// the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
|
||||
// With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
|
||||
// With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
|
||||
// Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
|
||||
//
|
||||
// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
|
||||
// before finishing generation → fragmented audio. When omitted (C++ sample), the
|
||||
// LLM completes its response first → continuous TTS chunks.
|
||||
//
|
||||
// - custom_llm_extra_body (empty object): Even an empty object might override the
|
||||
// agent's configured custom_llm_extra_body with nothing. Omit entirely.
|
||||
|
||||
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
|
||||
InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
|
||||
InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
|
||||
InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
|
||||
|
||||
// NOTE: We bypass SendJsonMessage() here intentionally.
|
||||
// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
|
||||
@ -578,6 +578,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
||||
if (!bAgentResponseStartedFired)
|
||||
{
|
||||
bAgentResponseStartedFired = true;
|
||||
|
||||
// Reset the interrupt audio filter: a new response generation has started,
|
||||
// so all subsequent audio belongs to this NEW generation and must not be
|
||||
// discarded by the stale interrupt event_id from the PREVIOUS generation.
|
||||
// Without this reset, audio for the new response is silently dropped when
|
||||
// its event_id <= LastInterruptEventId (which was set during the interruption
|
||||
// of the previous response).
|
||||
if (LastInterruptEventId > 0)
|
||||
{
|
||||
UE_LOG(LogElevenLabsWS, Log,
|
||||
TEXT("New generation started — resetting LastInterruptEventId (was %d)."),
|
||||
LastInterruptEventId);
|
||||
LastInterruptEventId = 0;
|
||||
}
|
||||
|
||||
const double Now = FPlatformTime::Seconds();
|
||||
const double T = Now - SessionStartTime;
|
||||
const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;
|
||||
|
||||
@ -137,14 +137,12 @@ public:
|
||||
bool bEnableAgentPartialResponse = false;
|
||||
|
||||
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
|
||||
* ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
|
||||
* Pre-buffering delays playback start so the second chunk arrives before the
|
||||
* first finishes playing, eliminating the audible gap mid-sentence.
|
||||
* Higher values = fewer gaps but more latency on the first word.
|
||||
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
||||
* Delays playback start so early TTS chunks can accumulate, preventing
|
||||
* mid-sentence pauses when the second chunk hasn't arrived yet.
|
||||
* Set to 0 for immediate playback. */
|
||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||
meta = (ClampMin = "0", ClampMax = "4000",
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
|
||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
|
||||
int32 AudioPreBufferMs = 2000;
|
||||
|
||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||
|
||||
@ -103,6 +103,17 @@ private:
|
||||
UFUNCTION()
|
||||
void OnPartialTextReceived(const FString& PartialText);
|
||||
|
||||
/** Called when the agent is interrupted — immediately reset lip sync to neutral. */
|
||||
UFUNCTION()
|
||||
void OnAgentInterrupted();
|
||||
|
||||
/** Called when the agent finishes speaking — clears the viseme/amplitude queues and playback timer so the mouth returns to neutral. Text-driven state is not reset here; TickComponent's "queue runs dry" logic handles it. */
|
||||
UFUNCTION()
|
||||
void OnAgentStopped();
|
||||
|
||||
/** Clear all lip sync queues and reset mouth to neutral pose. */
|
||||
void ResetToNeutral();
|
||||
|
||||
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
|
||||
void ConvertTextToVisemes(const FString& Text);
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user