v1.9.1: Fix audio loss after interruption, instant audio stop, lip sync reset
- Fix event_id filtering bug: reset LastInterruptEventId when new generation starts, preventing all audio from being silently dropped after an interruption
- Match C++ sample API config: remove optimize_streaming_latency and custom_llm_extra_body overrides, send empty conversation_config_override in Server VAD mode (only send turn_timeout in Client mode)
- Instant audio stop on interruption: call ResetAudio() before Stop() to flush USoundWaveProcedural's internal ring buffer
- Lip sync reset on interruption/stop: bind OnAgentInterrupted (snap to neutral) and OnAgentStoppedSpeaking (clear queues) events
- Revert jitter buffer (replaced by pre-buffer approach, default 2000ms)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
c2142f3e6b
commit
6543bc6785
Binary file not shown.
@ -577,9 +577,9 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
|
|||||||
// The AudioPlaybackComponent is still "playing" from the previous turn
|
// The AudioPlaybackComponent is still "playing" from the previous turn
|
||||||
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
|
// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
|
||||||
// keeps firing. Without this guard, the underflow callback would drain
|
// keeps firing. Without this guard, the underflow callback would drain
|
||||||
// the AudioQueue immediately, defeating the pre-buffer entirely.
|
// the AudioQueue immediately, defeating the buffer entirely.
|
||||||
// The ProceduralSoundWave generates silence internally when we return
|
// The ProceduralSoundWave generates silence internally when we return
|
||||||
// nothing — this silence does NOT accumulate, so once bPreBuffering
|
// nothing — this silence does NOT accumulate, so once buffering
|
||||||
// clears, the buffered data plays immediately.
|
// clears, the buffered data plays immediately.
|
||||||
if (bPreBuffering)
|
if (bPreBuffering)
|
||||||
{
|
{
|
||||||
@ -703,6 +703,17 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
|
|||||||
|
|
||||||
void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
||||||
{
|
{
|
||||||
|
// Flush the ProceduralSoundWave's internal buffer BEFORE stopping.
|
||||||
|
// QueueAudio() pushes data into the wave's internal ring buffer during
|
||||||
|
// OnProceduralUnderflow. Calling Stop() alone stops the AudioComponent
|
||||||
|
// but the wave still holds buffered data that would play briefly on the
|
||||||
|
// next Play() call, causing a delayed/ghostly tail of the interrupted audio.
|
||||||
|
// ResetAudio() clears that internal buffer for an instant cut.
|
||||||
|
if (ProceduralSoundWave)
|
||||||
|
{
|
||||||
|
ProceduralSoundWave->ResetAudio();
|
||||||
|
}
|
||||||
|
|
||||||
if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying())
|
if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying())
|
||||||
{
|
{
|
||||||
AudioPlaybackComponent->Stop();
|
AudioPlaybackComponent->Stop();
|
||||||
@ -713,7 +724,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
|
|||||||
// while holding it would block the audio thread for the full Blueprint handler duration.
|
// while holding it would block the audio thread for the full Blueprint handler duration.
|
||||||
bool bWasSpeaking = false;
|
bool bWasSpeaking = false;
|
||||||
double Now = 0.0;
|
double Now = 0.0;
|
||||||
bPreBuffering = false; // Clear pre-buffer state on stop.
|
bPreBuffering = false; // Clear pre-buffer state on stop.
|
||||||
{
|
{
|
||||||
FScopeLock Lock(&AudioQueueLock);
|
FScopeLock Lock(&AudioQueueLock);
|
||||||
AudioQueue.Empty();
|
AudioQueue.Empty();
|
||||||
|
|||||||
@ -229,11 +229,18 @@ void UElevenLabsLipSyncComponent::BeginPlay()
|
|||||||
Agent->OnAgentTextResponse.AddDynamic(
|
Agent->OnAgentTextResponse.AddDynamic(
|
||||||
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
|
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
|
||||||
|
|
||||||
|
// Bind to interruption/stop events so lip sync resets immediately
|
||||||
|
// when the agent is cut off or finishes speaking.
|
||||||
|
Agent->OnAgentInterrupted.AddDynamic(
|
||||||
|
this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
|
||||||
|
Agent->OnAgentStoppedSpeaking.AddDynamic(
|
||||||
|
this, &UElevenLabsLipSyncComponent::OnAgentStopped);
|
||||||
|
|
||||||
// Enable partial response streaming if not already enabled
|
// Enable partial response streaming if not already enabled
|
||||||
Agent->bEnableAgentPartialResponse = true;
|
Agent->bEnableAgentPartialResponse = true;
|
||||||
|
|
||||||
UE_LOG(LogElevenLabsLipSync, Log,
|
UE_LOG(LogElevenLabsLipSync, Log,
|
||||||
TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName());
|
TEXT("Lip sync bound to agent component on %s (audio + text + interruption)."), *Owner->GetName());
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
@ -392,6 +399,10 @@ void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReas
|
|||||||
this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
|
this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
|
||||||
AgentComponent->OnAgentTextResponse.RemoveDynamic(
|
AgentComponent->OnAgentTextResponse.RemoveDynamic(
|
||||||
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
|
this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
|
||||||
|
AgentComponent->OnAgentInterrupted.RemoveDynamic(
|
||||||
|
this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
|
||||||
|
AgentComponent->OnAgentStoppedSpeaking.RemoveDynamic(
|
||||||
|
this, &UElevenLabsLipSyncComponent::OnAgentStopped);
|
||||||
}
|
}
|
||||||
AgentComponent.Reset();
|
AgentComponent.Reset();
|
||||||
SpectrumAnalyzer.Reset();
|
SpectrumAnalyzer.Reset();
|
||||||
@ -413,9 +424,10 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
|||||||
// We consume one queued frame every 32ms to match the original audio timing.
|
// We consume one queued frame every 32ms to match the original audio timing.
|
||||||
constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s
|
constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s
|
||||||
|
|
||||||
// Pre-buffer sync: don't consume viseme queue while the agent component is
|
// Buffer sync: don't consume viseme queue while the agent component is
|
||||||
// pre-buffering audio. This keeps lip sync in sync with audio playback.
|
// pre-buffering audio (delaying playback to accumulate chunks).
|
||||||
// Without this, the lip sync would start 250ms ahead of the audio.
|
// This keeps lip sync in sync with audio playback.
|
||||||
|
// Without this, the lip sync would run ahead of the audio during buffering.
|
||||||
if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
|
if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
|
||||||
{
|
{
|
||||||
return;
|
return;
|
||||||
@ -593,6 +605,57 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
// Interruption / stop handlers
|
||||||
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::OnAgentInterrupted()
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent interrupted — resetting lip sync to neutral."));
|
||||||
|
ResetToNeutral();
|
||||||
|
}
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::OnAgentStopped()
|
||||||
|
{
|
||||||
|
// Don't clear text state here — it's already handled by TickComponent's
|
||||||
|
// "queue runs dry" logic which checks bFullTextReceived.
|
||||||
|
// Just clear the queues so the mouth returns to neutral immediately.
|
||||||
|
UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent stopped speaking — clearing lip sync queues."));
|
||||||
|
VisemeQueue.Reset();
|
||||||
|
AmplitudeQueue.Reset();
|
||||||
|
PlaybackTimer = 0.0f;
|
||||||
|
bWaitingForText = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
void UElevenLabsLipSyncComponent::ResetToNeutral()
|
||||||
|
{
|
||||||
|
// Clear all queued viseme and amplitude data
|
||||||
|
VisemeQueue.Reset();
|
||||||
|
AmplitudeQueue.Reset();
|
||||||
|
PlaybackTimer = 0.0f;
|
||||||
|
bWaitingForText = false;
|
||||||
|
|
||||||
|
// Reset text-driven lip sync state for the interrupted utterance
|
||||||
|
AccumulatedText.Reset();
|
||||||
|
TextVisemeSequence.Reset();
|
||||||
|
bTextVisemesApplied = false;
|
||||||
|
bFullTextReceived = false;
|
||||||
|
|
||||||
|
// Snap all visemes to silence immediately (no smoothing delay)
|
||||||
|
for (const FName& Name : VisemeNames)
|
||||||
|
{
|
||||||
|
TargetVisemes.FindOrAdd(Name) = 0.0f;
|
||||||
|
SmoothedVisemes.FindOrAdd(Name) = 0.0f;
|
||||||
|
}
|
||||||
|
TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
|
||||||
|
SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f;
|
||||||
|
|
||||||
|
// Clear blendshapes so the mouth returns to fully neutral
|
||||||
|
CurrentBlendshapes.Reset();
|
||||||
|
PreviousBlendshapes.Reset();
|
||||||
|
LastConsumedVisemes.Reset();
|
||||||
|
}
|
||||||
|
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
// Audio analysis
|
// Audio analysis
|
||||||
// ─────────────────────────────────────────────────────────────────────────────
|
// ─────────────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@ -221,54 +221,54 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
|
|||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
|
|
||||||
// Configure turn-taking behaviour.
|
// Build conversation_config_override matching the C++ ElevenLabs sample as closely
|
||||||
// The ElevenLabs API does NOT have a turn.mode field.
|
// as possible. The C++ sample sends: { "conversation_config_override": {} } (all defaults).
|
||||||
// Turn-taking is controlled by the server's VAD and the turn_* parameters.
|
// Sending empty = server defaults for TTS chunking, latency, and LLM behaviour.
|
||||||
// In push-to-talk (Client mode) the user controls the mic; the server still
|
// This produces smooth continuous audio chunks without the fragmentation caused by
|
||||||
// uses its VAD to detect the end of speech from the audio chunks it receives.
|
// explicit optimize_streaming_latency or enable_intermediate_response overrides.
|
||||||
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
|
||||||
// turn_timeout: how long the server waits after VAD detects silence before
|
|
||||||
// processing the user's turn. In push-to-talk (Client) mode this directly adds
|
|
||||||
// latency to every response — the server waits this many seconds of silence
|
|
||||||
// after the user releases T before it begins LLM processing.
|
|
||||||
//
|
//
|
||||||
// History:
|
// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
|
||||||
// turn_timeout=1 was originally problematic, but ONLY when combined with
|
// In Server VAD mode, the config override is empty (matches C++ sample exactly).
|
||||||
// speculative_turn=true (which has since been removed). Without speculative_turn,
|
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
|
||||||
// 1s is safe and halves the per-turn latency vs the 3s we had previously.
|
|
||||||
// Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
|
|
||||||
if (TurnMode == EElevenLabsTurnMode::Client)
|
if (TurnMode == EElevenLabsTurnMode::Client)
|
||||||
{
|
{
|
||||||
|
// turn_timeout: how long the server waits after VAD detects silence before
|
||||||
|
// processing the user's turn. Default is ~3s. In push-to-talk mode this
|
||||||
|
// directly adds latency — the server waits after the user releases T.
|
||||||
|
// 1s is safe without speculative_turn (which was removed — see history below).
|
||||||
|
//
|
||||||
|
// History:
|
||||||
|
// turn_timeout=1 was problematic when combined with speculative_turn=true
|
||||||
|
// (server silently dropped turns 3+). Without speculative_turn, 1s is safe
|
||||||
|
// and halves the per-turn latency.
|
||||||
|
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
|
||||||
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
|
||||||
|
|
||||||
|
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
||||||
|
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
||||||
|
|
||||||
|
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
|
||||||
}
|
}
|
||||||
// NOTE: speculative_turn is intentionally NOT sent here.
|
|
||||||
// With speculative_turn=true the server starts LLM generation speculatively
|
|
||||||
// before the VAD is fully confident the user finished speaking. Combined with
|
|
||||||
// the short turn_timeout this put the server's state machine into a state where
|
|
||||||
// it stopped processing user audio after 2 turns — subsequent turns received
|
|
||||||
// only pings and no agent_chat_response_part / audio / user_transcript at all.
|
|
||||||
// Removing it costs ~200-500ms of latency but restores reliable multi-turn
|
|
||||||
// conversation. Re-enable only if ElevenLabs confirms it is stable.
|
|
||||||
|
|
||||||
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
|
// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
|
||||||
AgentObj->SetObjectField(TEXT("turn"), TurnObj);
|
//
|
||||||
|
// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
|
||||||
TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
|
// the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
|
||||||
TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
|
// With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
|
||||||
|
// With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
|
||||||
TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
|
// Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
|
||||||
ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
|
//
|
||||||
ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
|
// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
|
||||||
|
// before finishing generation → fragmented audio. When omitted (C++ sample), the
|
||||||
// enable_intermediate_response reduces time-to-first-audio by allowing the agent
|
// LLM completes its response first → continuous TTS chunks.
|
||||||
// to start speaking before it has finished generating the full response.
|
//
|
||||||
TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
|
// - custom_llm_extra_body (empty object): Even an empty object might override the
|
||||||
CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
|
// agent's configured custom_llm_extra_body with nothing. Omit entirely.
|
||||||
|
|
||||||
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
|
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
|
||||||
InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
|
InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
|
||||||
InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
|
InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
|
||||||
InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
|
|
||||||
|
|
||||||
// NOTE: We bypass SendJsonMessage() here intentionally.
|
// NOTE: We bypass SendJsonMessage() here intentionally.
|
||||||
// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
|
// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
|
||||||
@ -578,6 +578,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
|
|||||||
if (!bAgentResponseStartedFired)
|
if (!bAgentResponseStartedFired)
|
||||||
{
|
{
|
||||||
bAgentResponseStartedFired = true;
|
bAgentResponseStartedFired = true;
|
||||||
|
|
||||||
|
// Reset the interrupt audio filter: a new response generation has started,
|
||||||
|
// so all subsequent audio belongs to this NEW generation and must not be
|
||||||
|
// discarded by the stale interrupt event_id from the PREVIOUS generation.
|
||||||
|
// Without this reset, audio for the new response is silently dropped when
|
||||||
|
// its event_id <= LastInterruptEventId (which was set during the interruption
|
||||||
|
// of the previous response).
|
||||||
|
if (LastInterruptEventId > 0)
|
||||||
|
{
|
||||||
|
UE_LOG(LogElevenLabsWS, Log,
|
||||||
|
TEXT("New generation started — resetting LastInterruptEventId (was %d)."),
|
||||||
|
LastInterruptEventId);
|
||||||
|
LastInterruptEventId = 0;
|
||||||
|
}
|
||||||
|
|
||||||
const double Now = FPlatformTime::Seconds();
|
const double Now = FPlatformTime::Seconds();
|
||||||
const double T = Now - SessionStartTime;
|
const double T = Now - SessionStartTime;
|
||||||
const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;
|
const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;
|
||||||
|
|||||||
@ -137,14 +137,12 @@ public:
|
|||||||
bool bEnableAgentPartialResponse = false;
|
bool bEnableAgentPartialResponse = false;
|
||||||
|
|
||||||
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
|
/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
|
||||||
* ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
|
* Delays playback start so early TTS chunks can accumulate, preventing
|
||||||
* Pre-buffering delays playback start so the second chunk arrives before the
|
* mid-sentence pauses when the second chunk hasn't arrived yet.
|
||||||
* first finishes playing, eliminating the audible gap mid-sentence.
|
* Set to 0 for immediate playback. */
|
||||||
* Higher values = fewer gaps but more latency on the first word.
|
|
||||||
* Set to 0 for immediate playback (may cause mid-sentence pauses). */
|
|
||||||
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
|
||||||
meta = (ClampMin = "0", ClampMax = "4000",
|
meta = (ClampMin = "0", ClampMax = "4000",
|
||||||
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
|
ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
|
||||||
int32 AudioPreBufferMs = 2000;
|
int32 AudioPreBufferMs = 2000;
|
||||||
|
|
||||||
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
|
||||||
|
|||||||
@ -103,6 +103,17 @@ private:
|
|||||||
UFUNCTION()
|
UFUNCTION()
|
||||||
void OnPartialTextReceived(const FString& PartialText);
|
void OnPartialTextReceived(const FString& PartialText);
|
||||||
|
|
||||||
|
/** Called when the agent is interrupted — immediately reset lip sync to neutral. */
|
||||||
|
UFUNCTION()
|
||||||
|
void OnAgentInterrupted();
|
||||||
|
|
||||||
|
/** Called when the agent finishes speaking — reset lip sync state for next utterance. */
|
||||||
|
UFUNCTION()
|
||||||
|
void OnAgentStopped();
|
||||||
|
|
||||||
|
/** Clear all lip sync queues and reset mouth to neutral pose. */
|
||||||
|
void ResetToNeutral();
|
||||||
|
|
||||||
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
|
/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
|
||||||
void ConvertTextToVisemes(const FString& Text);
|
void ConvertTextToVisemes(const FString& Text);
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user