v1.9.1: Fix audio loss after interruption, instant audio stop, lip sync reset

- Fix event_id filtering bug: reset LastInterruptEventId when new generation starts, preventing all audio from being silently dropped after an interruption
- Match C++ sample API config: remove optimize_streaming_latency and custom_llm_extra_body overrides, send empty conversation_config_override in Server VAD mode (only send turn_timeout in Client mode)
- Instant audio stop on interruption: call ResetAudio() before Stop() to flush USoundWaveProcedural's internal ring buffer
- Lip sync reset on interruption/stop: bind OnAgentInterrupted (snap to neutral) and OnAgentStoppedSpeaking (clear queues) events
- Revert jitter buffer (replaced by pre-buffer approach, default 2000ms)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 09:48:56 +01:00
parent c2142f3e6b
commit 6543bc6785
6 changed files with 149 additions and 51 deletions
--- a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
+++ b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -577,9 +577,9 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
 	// The AudioPlaybackComponent is still "playing" from the previous turn
 	// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
 	// keeps firing. Without this guard, the underflow callback would drain
-	// the AudioQueue immediately, defeating the pre-buffer entirely.
+	// the AudioQueue immediately, defeating the buffer entirely.
 	// The ProceduralSoundWave generates silence internally when we return
-	// nothing — this silence does NOT accumulate, so once bPreBuffering
+	// nothing — this silence does NOT accumulate, so once buffering
 	// clears, the buffered data plays immediately.
 	if (bPreBuffering)
 	{
@@ -703,6 +703,17 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin

 void UElevenLabsConversationalAgentComponent::StopAgentAudio()
 {
+	// Flush the ProceduralSoundWave's internal buffer BEFORE stopping.
+	// QueueAudio() pushes data into the wave's internal ring buffer during
+	// OnProceduralUnderflow. Calling Stop() alone stops the AudioComponent
+	// but the wave still holds buffered data that would play briefly on the
+	// next Play() call, causing a delayed/ghostly tail of the interrupted audio.
+	// ResetAudio() clears that internal buffer for an instant cut.
+	if (ProceduralSoundWave)
+	{
+		ProceduralSoundWave->ResetAudio();
+	}
+
 	if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying())
 	{
 		AudioPlaybackComponent->Stop();
@@ -713,7 +724,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
 	// while holding it would block the audio thread for the full Blueprint handler duration.
 	bool bWasSpeaking = false;
 	double Now = 0.0;
-	bPreBuffering = false; // Clear pre-buffer state on stop.
+	bPreBuffering = false;      // Clear pre-buffer state on stop.
 	{
 		FScopeLock Lock(&AudioQueueLock);
 		AudioQueue.Empty();
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
@@ -229,11 +229,18 @@ void UElevenLabsLipSyncComponent::BeginPlay()
 		Agent->OnAgentTextResponse.AddDynamic(
 			this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);

+		// Bind to interruption/stop events so lip sync resets immediately
+		// when the agent is cut off or finishes speaking.
+		Agent->OnAgentInterrupted.AddDynamic(
+			this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
+		Agent->OnAgentStoppedSpeaking.AddDynamic(
+			this, &UElevenLabsLipSyncComponent::OnAgentStopped);
+
 		// Enable partial response streaming if not already enabled
 		Agent->bEnableAgentPartialResponse = true;

 		UE_LOG(LogElevenLabsLipSync, Log,
-			TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName());
+			TEXT("Lip sync bound to agent component on %s (audio + text + interruption)."), *Owner->GetName());
 	}
 	else
 	{
@@ -392,6 +399,10 @@ void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReas
 			this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
 		AgentComponent->OnAgentTextResponse.RemoveDynamic(
 			this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
+		AgentComponent->OnAgentInterrupted.RemoveDynamic(
+			this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
+		AgentComponent->OnAgentStoppedSpeaking.RemoveDynamic(
+			this, &UElevenLabsLipSyncComponent::OnAgentStopped);
 	}
 	AgentComponent.Reset();
 	SpectrumAnalyzer.Reset();
@@ -413,9 +424,10 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
 	// We consume one queued frame every 32ms to match the original audio timing.
 	constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s

-	// Pre-buffer sync: don't consume viseme queue while the agent component is
-	// pre-buffering audio. This keeps lip sync in sync with audio playback.
-	// Without this, the lip sync would start 250ms ahead of the audio.
+	// Buffer sync: don't consume viseme queue while the agent component is
+	// pre-buffering audio (delaying playback to accumulate chunks).
+	// This keeps lip sync in sync with audio playback.
+	// Without this, the lip sync would run ahead of the audio during buffering.
 	if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
 	{
 		return;
@@ -593,6 +605,57 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
 	}
 }

+// ─────────────────────────────────────────────────────────────────────────────
+// Interruption / stop handlers
+// ─────────────────────────────────────────────────────────────────────────────
+
+void UElevenLabsLipSyncComponent::OnAgentInterrupted() // Handler bound to the agent's OnAgentInterrupted delegate in BeginPlay.
+{
+	UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent interrupted — resetting lip sync to neutral."));
+	ResetToNeutral(); // Drops queued frames and text state, then snaps the viseme weights to "sil" with no smoothing.
+}
+
+void UElevenLabsLipSyncComponent::OnAgentStopped() // Handler bound to the agent's OnAgentStoppedSpeaking delegate in BeginPlay.
+{
+	// Unlike ResetToNeutral(), text state is deliberately left alone here — it is handled by
+	// TickComponent's "queue runs dry" logic, which checks bFullTextReceived. Only the queues are dropped.
+	// NOTE(review): viseme weights are not snapped here; presumably the tick eases the mouth closed — confirm.
+	UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent stopped speaking — clearing lip sync queues."));
+	VisemeQueue.Reset();
+	AmplitudeQueue.Reset();
+	PlaybackTimer = 0.0f;
+	bWaitingForText = false;
+}
+
+void UElevenLabsLipSyncComponent::ResetToNeutral() // Hard reset: clears queues and text state, then snaps the pose to silence.
+{
+	// Drop every queued viseme/amplitude frame and zero the playback timer
+	VisemeQueue.Reset();
+	AmplitudeQueue.Reset();
+	PlaybackTimer = 0.0f;
+	bWaitingForText = false;
+
+	// Forget the text-driven lip sync state of the interrupted utterance
+	AccumulatedText.Reset();
+	TextVisemeSequence.Reset();
+	bTextVisemesApplied = false;
+	bFullTextReceived = false;
+
+	// Zero both target and smoothed weights so the pose snaps at once (no smoothing delay)
+	for (const FName& Name : VisemeNames)
+	{
+		TargetVisemes.FindOrAdd(Name) = 0.0f;
+		SmoothedVisemes.FindOrAdd(Name) = 0.0f;
+	}
+	TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; // Written after the loop so "sil" ends at full weight even if the loop zeroed it.
+	SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f;
+
+	// Empty the blendshape containers and the last-consumed viseme cache so the mouth returns to fully neutral
+	CurrentBlendshapes.Reset();
+	PreviousBlendshapes.Reset();
+	LastConsumedVisemes.Reset();
+}
+
 // ─────────────────────────────────────────────────────────────────────────────
 // Audio analysis
 // ─────────────────────────────────────────────────────────────────────────────
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -221,54 +221,54 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	//   }
 	// }

-	// Configure turn-taking behaviour.
-	// The ElevenLabs API does NOT have a turn.mode field.
-	// Turn-taking is controlled by the server's VAD and the turn_* parameters.
-	// In push-to-talk (Client mode) the user controls the mic; the server still
-	// uses its VAD to detect the end of speech from the audio chunks it receives.
-	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
-	// turn_timeout: how long the server waits after VAD detects silence before
-	// processing the user's turn. In push-to-talk (Client) mode this directly adds
-	// latency to every response — the server waits this many seconds of silence
-	// after the user releases T before it begins LLM processing.
+	// Build conversation_config_override matching the C++ ElevenLabs sample as closely
+	// as possible. The C++ sample sends: { "conversation_config_override": {} } (all defaults).
+	// Sending empty = server defaults for TTS chunking, latency, and LLM behaviour.
+	// This produces smooth continuous audio chunks without the fragmentation caused by
+	// explicit optimize_streaming_latency or enable_intermediate_response overrides.
 	//
-	// History:
-	//   turn_timeout=1 was originally problematic, but ONLY when combined with
-	//   speculative_turn=true (which has since been removed). Without speculative_turn,
-	//   1s is safe and halves the per-turn latency vs the 3s we had previously.
-	//   Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
+	// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
+	// In Server VAD mode, the config override is empty (matches C++ sample exactly).
+	TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
+
 	if (TurnMode == EElevenLabsTurnMode::Client)
 	{
+		// turn_timeout: how long the server waits after VAD detects silence before
+		// processing the user's turn. Default is ~3s. In push-to-talk mode this
+		// directly adds latency — the server waits after the user releases T.
+		// 1s is safe without speculative_turn (which was removed — see history below).
+		//
+		// History:
+		//   turn_timeout=1 was problematic when combined with speculative_turn=true
+		//   (server silently dropped turns 3+). Without speculative_turn, 1s is safe
+		//   and halves the per-turn latency.
+		TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
 		TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
+
+		TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
+		AgentObj->SetObjectField(TEXT("turn"), TurnObj);
+
+		ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
 	}
-	// NOTE: speculative_turn is intentionally NOT sent here.
-	// With speculative_turn=true the server starts LLM generation speculatively
-	// before the VAD is fully confident the user finished speaking.  Combined with
-	// the short turn_timeout this put the server's state machine into a state where
-	// it stopped processing user audio after 2 turns — subsequent turns received
-	// only pings and no agent_chat_response_part / audio / user_transcript at all.
-	// Removing it costs ~200-500ms of latency but restores reliable multi-turn
-	// conversation.  Re-enable only if ElevenLabs confirms it is stable.

-	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
-	AgentObj->SetObjectField(TEXT("turn"), TurnObj);
-
-	TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
-	TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
-
-	TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
-	ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
-	ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
-
-	// enable_intermediate_response reduces time-to-first-audio by allowing the agent
-	// to start speaking before it has finished generating the full response.
-	TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
-	CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
+	// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
+	//
+	// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
+	//   the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
+	//   With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
+	//   With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
+	//   Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
+	//
+	// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
+	//   before finishing generation → fragmented audio. When omitted (C++ sample), the
+	//   LLM completes its response first → continuous TTS chunks.
+	//
+	// - custom_llm_extra_body (empty object): Even an empty object might override the
+	//   agent's configured custom_llm_extra_body with nothing. Omit entirely.

 	TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
 	InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
 	InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
-	InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);

 	// NOTE: We bypass SendJsonMessage() here intentionally.
 	// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
@@ -578,6 +578,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
 	if (!bAgentResponseStartedFired)
 	{
 		bAgentResponseStartedFired = true;
+
+		// Reset the interrupt audio filter: a new response generation has started,
+		// so all subsequent audio belongs to this NEW generation and must not be
+		// discarded by the stale interrupt event_id from the PREVIOUS generation.
+		// Without this reset, audio for the new response is silently dropped when
+		// its event_id <= LastInterruptEventId (which was set during the interruption
+		// of the previous response).
+		if (LastInterruptEventId > 0)
+		{
+			UE_LOG(LogElevenLabsWS, Log,
+				TEXT("New generation started — resetting LastInterruptEventId (was %d)."),
+				LastInterruptEventId);
+			LastInterruptEventId = 0;
+		}
+
 		const double Now = FPlatformTime::Seconds();
 		const double T = Now - SessionStartTime;
 		const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -137,14 +137,12 @@ public:
 	bool bEnableAgentPartialResponse = false;

 	/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
-	 *  ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
-	 *  Pre-buffering delays playback start so the second chunk arrives before the
-	 *  first finishes playing, eliminating the audible gap mid-sentence.
-	 *  Higher values = fewer gaps but more latency on the first word.
-	 *  Set to 0 for immediate playback (may cause mid-sentence pauses). */
+	 *  Delays playback start so early TTS chunks can accumulate, preventing
+	 *  mid-sentence pauses when the second chunk hasn't arrived yet.
+	 *  Set to 0 for immediate playback. */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
 		meta = (ClampMin = "0", ClampMax = "4000",
-		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
+		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
 	int32 AudioPreBufferMs = 2000;

 	/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
@@ -103,6 +103,17 @@ private:
 	UFUNCTION()
 	void OnPartialTextReceived(const FString& PartialText);

+	/** Called when the agent is interrupted — immediately reset lip sync to neutral. */
+	UFUNCTION()
+	void OnAgentInterrupted();
+
+	/** Called when the agent stops speaking — clears the viseme/amplitude queues and playback timer; text state is left to TickComponent. */
+	UFUNCTION()
+	void OnAgentStopped();
+
+	/** Clear all lip sync queues and reset mouth to neutral pose. */
+	void ResetToNeutral();
+
 	/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
 	void ConvertTextToVisemes(const FString& Text);