v1.9.1: Fix audio loss after interruption, instant audio stop, lip sync reset

- Fix event_id filtering bug: reset LastInterruptEventId when a new generation
  starts, preventing all audio from being silently dropped after an interruption.
- Match C++ sample API config: remove the optimize_streaming_latency and
  custom_llm_extra_body overrides; send an empty conversation_config_override in
  Server VAD mode (only send turn_timeout in Client mode).
- Instant audio stop on interruption: call ResetAudio() before Stop() to flush
  USoundWaveProcedural's internal ring buffer.
- Lip sync reset on interruption/stop: bind the OnAgentInterrupted (snap to
  neutral) and OnAgentStoppedSpeaking (clear queues) events.
- Revert jitter buffer (replaced by the pre-buffer approach, default 2000ms).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 09:48:56 +01:00
parent c2142f3e6b
commit 6543bc6785
6 changed files with 149 additions and 51 deletions
--- a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
+++ b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -577,9 +577,9 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow(
 	// The AudioPlaybackComponent is still "playing" from the previous turn
 	// (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow
 	// keeps firing. Without this guard, the underflow callback would drain
-	// the AudioQueue immediately, defeating the pre-buffer entirely.
+	// the AudioQueue immediately, defeating the buffer entirely.
 	// The ProceduralSoundWave generates silence internally when we return
-	// nothing — this silence does NOT accumulate, so once bPreBuffering
+	// nothing — this silence does NOT accumulate, so once buffering
 	// clears, the buffered data plays immediately.
 	if (bPreBuffering)
 	{
@@ -703,6 +703,17 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArray<uin
 void UElevenLabsConversationalAgentComponent::StopAgentAudio()
 {
 	// Flush the ProceduralSoundWave's internal buffer BEFORE stopping.
 	// QueueAudio() pushes data into the wave's internal ring buffer during
 	// OnProceduralUnderflow. Calling Stop() alone stops the AudioComponent
 	// but the wave still holds buffered data that would play briefly on the
 	// next Play() call, causing a delayed/ghostly tail of the interrupted audio.
 	// ResetAudio() clears that internal buffer for an instant cut.
 	if (ProceduralSoundWave)
 	{
 		ProceduralSoundWave->ResetAudio();
 	}
 	if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying())
 	{
 		AudioPlaybackComponent->Stop();
@@ -713,7 +724,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio()
 	// while holding it would block the audio thread for the full Blueprint handler duration.
 	bool bWasSpeaking = false;
 	double Now = 0.0;
-	bPreBuffering = false; // Clear pre-buffer state on stop.
+	bPreBuffering = false;      // Clear pre-buffer state on stop.
 	{
 		FScopeLock Lock(&AudioQueueLock);
 		AudioQueue.Empty();
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp
@@ -229,11 +229,18 @@ void UElevenLabsLipSyncComponent::BeginPlay()
 		Agent->OnAgentTextResponse.AddDynamic(
 			this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
 		// Bind to interruption/stop events so lip sync resets immediately
 		// when the agent is cut off or finishes speaking.
 		Agent->OnAgentInterrupted.AddDynamic(
 			this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
 		Agent->OnAgentStoppedSpeaking.AddDynamic(
 			this, &UElevenLabsLipSyncComponent::OnAgentStopped);
 		// Enable partial response streaming if not already enabled
 		Agent->bEnableAgentPartialResponse = true;
 		UE_LOG(LogElevenLabsLipSync, Log,
-			TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName());
+			TEXT("Lip sync bound to agent component on %s (audio + text + interruption)."), *Owner->GetName());
 	}
 	else
 	{
@@ -392,6 +399,10 @@ void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReas
 			this, &UElevenLabsLipSyncComponent::OnPartialTextReceived);
 		AgentComponent->OnAgentTextResponse.RemoveDynamic(
 			this, &UElevenLabsLipSyncComponent::OnTextResponseReceived);
 		AgentComponent->OnAgentInterrupted.RemoveDynamic(
 			this, &UElevenLabsLipSyncComponent::OnAgentInterrupted);
 		AgentComponent->OnAgentStoppedSpeaking.RemoveDynamic(
 			this, &UElevenLabsLipSyncComponent::OnAgentStopped);
 	}
 	AgentComponent.Reset();
 	SpectrumAnalyzer.Reset();
@@ -413,9 +424,10 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
 	// We consume one queued frame every 32ms to match the original audio timing.
 	constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s
-	// Pre-buffer sync: don't consume viseme queue while the agent component is
+	// Buffer sync: don't consume viseme queue while the agent component is
-	// pre-buffering audio. This keeps lip sync in sync with audio playback.
+	// pre-buffering audio (delaying playback to accumulate chunks).
-	// Without this, the lip sync would start 250ms ahead of the audio.
+	// This keeps lip sync in sync with audio playback.
 	// Without this, the lip sync would run ahead of the audio during buffering.
 	if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering())
 	{
 		return;
@@ -593,6 +605,57 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick
 	}
 }
 // ─────────────────────────────────────────────────────────────────────────────
 // Interruption / stop handlers
 // ─────────────────────────────────────────────────────────────────────────────
 // Handler for the agent component's OnAgentInterrupted event (bound in
 // BeginPlay, unbound in EndPlay). Logs the interruption and performs the
 // full reset implemented by ResetToNeutral(): queues, text-driven state,
 // viseme weights and blendshape caches.
 void UElevenLabsLipSyncComponent::OnAgentInterrupted()
 {
 	UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent interrupted — resetting lip sync to neutral."));
 	// All state clearing lives in ResetToNeutral(); this handler adds only the log line.
 	ResetToNeutral();
 }
 // Handler for the agent component's OnAgentStoppedSpeaking event (bound in
 // BeginPlay, unbound in EndPlay). Lighter than ResetToNeutral(): it drops the
 // queued viseme/amplitude frames, rewinds the playback timer and clears the
 // waiting-for-text flag, but leaves the text-driven state (AccumulatedText,
 // TextVisemeSequence, bTextVisemesApplied, bFullTextReceived) untouched.
 void UElevenLabsLipSyncComponent::OnAgentStopped()
 {
 	// Don't clear text state here — it's already handled by TickComponent's
 	// "queue runs dry" logic which checks bFullTextReceived.
 	// Just clear the queues so the mouth returns to neutral immediately.
 	// NOTE(review): the target/smoothed viseme weights are not modified in this
 	// function, so the return to neutral presumably depends on how TickComponent
 	// treats an empty queue — confirm.
 	UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent stopped speaking — clearing lip sync queues."));
 	VisemeQueue.Reset();
 	AmplitudeQueue.Reset();
 	PlaybackTimer = 0.0f;
 	bWaitingForText = false;
 }
 // Hard reset of the lip sync state, used when the agent is interrupted.
 // In order: empties the viseme/amplitude queues and playback timer, clears the
 // text-driven state of the current utterance, forces every viseme weight to 0
 // with "sil" at 1 in both the target and smoothed maps, and clears the
 // blendshape caches. Takes no parameters and returns nothing.
 void UElevenLabsLipSyncComponent::ResetToNeutral()
 {
 	// Clear all queued viseme and amplitude data
 	VisemeQueue.Reset();
 	AmplitudeQueue.Reset();
 	PlaybackTimer = 0.0f;
 	bWaitingForText = false;
 	// Reset text-driven lip sync state for the interrupted utterance
 	AccumulatedText.Reset();
 	TextVisemeSequence.Reset();
 	bTextVisemesApplied = false;
 	bFullTextReceived = false;
 	// Snap all visemes to silence immediately (no smoothing delay)
 	// Target and smoothed values are written together so the two maps agree
 	// right after this call.
 	for (const FName& Name : VisemeNames)
 	{
 		TargetVisemes.FindOrAdd(Name) = 0.0f;
 		SmoothedVisemes.FindOrAdd(Name) = 0.0f;
 	}
 	// Written after the loop so "sil" ends at 1.0 whether or not it is also
 	// listed in VisemeNames.
 	TargetVisemes.FindOrAdd(FName("sil")) = 1.0f;
 	SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f;
 	// Clear blendshapes so the mouth returns to fully neutral
 	CurrentBlendshapes.Reset();
 	PreviousBlendshapes.Reset();
 	LastConsumedVisemes.Reset();
 }
 // ─────────────────────────────────────────────────────────────────────────────
 // Audio analysis
 // ─────────────────────────────────────────────────────────────────────────────
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -221,54 +221,54 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	//   }
 	// }
-	// Configure turn-taking behaviour.
+	// Build conversation_config_override matching the C++ ElevenLabs sample as closely
-	// The ElevenLabs API does NOT have a turn.mode field.
+	// as possible. The C++ sample sends: { "conversation_config_override": {} } (all defaults).
-	// Turn-taking is controlled by the server's VAD and the turn_* parameters.
+	// Sending empty = server defaults for TTS chunking, latency, and LLM behaviour.
-	// In push-to-talk (Client mode) the user controls the mic; the server still
+	// This produces smooth continuous audio chunks without the fragmentation caused by
-	// uses its VAD to detect the end of speech from the audio chunks it receives.
+	// explicit optimize_streaming_latency or enable_intermediate_response overrides.
 	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
 	// turn_timeout: how long the server waits after VAD detects silence before
 	// processing the user's turn. In push-to-talk (Client) mode this directly adds
 	// latency to every response — the server waits this many seconds of silence
 	// after the user releases T before it begins LLM processing.
 	//
-	// History:
+	// In Client (push-to-talk) mode only, we override turn_timeout to reduce latency.
-	//   turn_timeout=1 was originally problematic, but ONLY when combined with
+	// In Server VAD mode, the config override is empty (matches C++ sample exactly).
-	//   speculative_turn=true (which has since been removed). Without speculative_turn,
+	TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
-	//   1s is safe and halves the per-turn latency vs the 3s we had previously.
+
 	//   Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1.
 	if (TurnMode == EElevenLabsTurnMode::Client)
 	{
 		// turn_timeout: how long the server waits after VAD detects silence before
 		// processing the user's turn. Default is ~3s. In push-to-talk mode this
 		// directly adds latency — the server waits after the user releases T.
 		// 1s is safe without speculative_turn (which was removed — see history below).
 		//
 		// History:
 		//   turn_timeout=1 was problematic when combined with speculative_turn=true
 		//   (server silently dropped turns 3+). Without speculative_turn, 1s is safe
 		//   and halves the per-turn latency.
 		TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
 		TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
 		TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
 		AgentObj->SetObjectField(TEXT("turn"), TurnObj);
 		ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
 	}
 	// NOTE: speculative_turn is intentionally NOT sent here.
 	// With speculative_turn=true the server starts LLM generation speculatively
 	// before the VAD is fully confident the user finished speaking.  Combined with
 	// the short turn_timeout this put the server's state machine into a state where
 	// it stopped processing user audio after 2 turns — subsequent turns received
 	// only pings and no agent_chat_response_part / audio / user_transcript at all.
 	// Removing it costs ~200-500ms of latency but restores reliable multi-turn
 	// conversation.  Re-enable only if ElevenLabs confirms it is stable.
-	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
+	// NOTE: We intentionally do NOT send these overrides (matching C++ sample):
-	AgentObj->SetObjectField(TEXT("turn"), TurnObj);
+	//
-
+	// - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes
-	TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject());
+	//   the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely.
-	TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3);
+	//   With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering).
-
+	//   With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps).
-	TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject());
+	//   Server default (omitted): produces smooth continuous audio (no gaps in C++ sample).
-	ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj);
+	//
-	ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj);
+	// - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks
-
+	//   before finishing generation → fragmented audio. When omitted (C++ sample), the
-	// enable_intermediate_response reduces time-to-first-audio by allowing the agent
+	//   LLM completes its response first → continuous TTS chunks.
-	// to start speaking before it has finished generating the full response.
+	//
-	TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject());
+	// - custom_llm_extra_body (empty object): Even an empty object might override the
-	CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true);
+	//   agent's configured custom_llm_extra_body with nothing. Omit entirely.
 	TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject());
 	InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData);
 	InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride);
 	InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody);
 	// NOTE: We bypass SendJsonMessage() here intentionally.
 	// SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires
@@ -578,6 +578,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr<FJs
 	if (!bAgentResponseStartedFired)
 	{
 		bAgentResponseStartedFired = true;
 		// Reset the interrupt audio filter: a new response generation has started,
 		// so all subsequent audio belongs to this NEW generation and must not be
 		// discarded by the stale interrupt event_id from the PREVIOUS generation.
 		// Without this reset, audio for the new response is silently dropped when
 		// its event_id <= LastInterruptEventId (which was set during the interruption
 		// of the previous response).
 		if (LastInterruptEventId > 0)
 		{
 			UE_LOG(LogElevenLabsWS, Log,
 				TEXT("New generation started — resetting LastInterruptEventId (was %d)."),
 				LastInterruptEventId);
 			LastInterruptEventId = 0;
 		}
 		const double Now = FPlatformTime::Seconds();
 		const double T = Now - SessionStartTime;
 		const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0;
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -137,14 +137,12 @@ public:
 	bool bEnableAgentPartialResponse = false;
 	/** Pre-buffer delay (ms) before starting audio playback on the first chunk.
-	 *  ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them.
+	 *  Delays playback start so early TTS chunks can accumulate, preventing
-	 *  Pre-buffering delays playback start so the second chunk arrives before the
+	 *  mid-sentence pauses when the second chunk hasn't arrived yet.
-	 *  first finishes playing, eliminating the audible gap mid-sentence.
+	 *  Set to 0 for immediate playback. */
 	 *  Higher values = fewer gaps but more latency on the first word.
 	 *  Set to 0 for immediate playback (may cause mid-sentence pauses). */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency",
 		meta = (ClampMin = "0", ClampMax = "4000",
-		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps."))
+		ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback."))
 	int32 AudioPreBufferMs = 2000;
 	/** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h
@@ -103,6 +103,17 @@ private:
 	UFUNCTION()
 	void OnPartialTextReceived(const FString& PartialText);
 	/** Called when the agent is interrupted — immediately reset lip sync to neutral. */
 	UFUNCTION()
 	void OnAgentInterrupted();
 	/** Called when the agent stops speaking — clears the queued viseme/amplitude frames and the playback timer; text-driven state is left untouched. */
 	UFUNCTION()
 	void OnAgentStopped();
 	/** Clear all lip sync queues and reset mouth to neutral pose. */
 	void ResetToNeutral();
 	/** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). */
 	void ConvertTextToVisemes(const FString& Text);