diff --git a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset index ac0738b..af89517 100644 Binary files a/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset and b/Unreal/PS_AI_Agent/Content/MetaHumans/Taro/BP_Taro.uasset differ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index d62721f..1ef314a 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -577,9 +577,9 @@ void UElevenLabsConversationalAgentComponent::OnProceduralUnderflow( // The AudioPlaybackComponent is still "playing" from the previous turn // (INDEFINITELY_LOOPING_DURATION never stops), so OnProceduralUnderflow // keeps firing. Without this guard, the underflow callback would drain - // the AudioQueue immediately, defeating the pre-buffer entirely. + // the AudioQueue immediately, defeating the buffer entirely. // The ProceduralSoundWave generates silence internally when we return - // nothing — this silence does NOT accumulate, so once bPreBuffering + // nothing — this silence does NOT accumulate, so once buffering // clears, the buffered data plays immediately. if (bPreBuffering) { @@ -703,6 +703,17 @@ void UElevenLabsConversationalAgentComponent::EnqueueAgentAudio(const TArrayResetAudio(); + } + if (AudioPlaybackComponent && AudioPlaybackComponent->IsPlaying()) { AudioPlaybackComponent->Stop(); @@ -713,7 +724,7 @@ void UElevenLabsConversationalAgentComponent::StopAgentAudio() // while holding it would block the audio thread for the full Blueprint handler duration. 
bool bWasSpeaking = false; double Now = 0.0; - bPreBuffering = false; // Clear pre-buffer state on stop. + bPreBuffering = false; // Clear pre-buffer state on stop. { FScopeLock Lock(&AudioQueueLock); AudioQueue.Empty(); diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp index 93dd413..2d2b934 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsLipSyncComponent.cpp @@ -229,11 +229,18 @@ void UElevenLabsLipSyncComponent::BeginPlay() Agent->OnAgentTextResponse.AddDynamic( this, &UElevenLabsLipSyncComponent::OnTextResponseReceived); + // Bind to interruption/stop events so lip sync resets immediately + // when the agent is cut off or finishes speaking. 
+ Agent->OnAgentInterrupted.AddDynamic( + this, &UElevenLabsLipSyncComponent::OnAgentInterrupted); + Agent->OnAgentStoppedSpeaking.AddDynamic( + this, &UElevenLabsLipSyncComponent::OnAgentStopped); + // Enable partial response streaming if not already enabled Agent->bEnableAgentPartialResponse = true; UE_LOG(LogElevenLabsLipSync, Log, - TEXT("Lip sync bound to agent component on %s (audio + text)."), *Owner->GetName()); + TEXT("Lip sync bound to agent component on %s (audio + text + interruption)."), *Owner->GetName()); } else { @@ -392,6 +399,10 @@ void UElevenLabsLipSyncComponent::EndPlay(const EEndPlayReason::Type EndPlayReas this, &UElevenLabsLipSyncComponent::OnPartialTextReceived); AgentComponent->OnAgentTextResponse.RemoveDynamic( this, &UElevenLabsLipSyncComponent::OnTextResponseReceived); + AgentComponent->OnAgentInterrupted.RemoveDynamic( + this, &UElevenLabsLipSyncComponent::OnAgentInterrupted); + AgentComponent->OnAgentStoppedSpeaking.RemoveDynamic( + this, &UElevenLabsLipSyncComponent::OnAgentStopped); } AgentComponent.Reset(); SpectrumAnalyzer.Reset(); @@ -413,9 +424,10 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick // We consume one queued frame every 32ms to match the original audio timing. constexpr float WindowDuration = 512.0f / 16000.0f; // ~0.032s - // Pre-buffer sync: don't consume viseme queue while the agent component is - // pre-buffering audio. This keeps lip sync in sync with audio playback. - // Without this, the lip sync would start 250ms ahead of the audio. + // Buffer sync: don't consume viseme queue while the agent component is + // pre-buffering audio (delaying playback to accumulate chunks). + // This keeps lip sync in sync with audio playback. + // Without this, the lip sync would run ahead of the audio during buffering. 
if (AgentComponent.IsValid() && AgentComponent->IsPreBuffering()) { return; @@ -593,6 +605,57 @@ void UElevenLabsLipSyncComponent::TickComponent(float DeltaTime, ELevelTick Tick } } +// ───────────────────────────────────────────────────────────────────────────── +// Interruption / stop handlers +// ───────────────────────────────────────────────────────────────────────────── + +void UElevenLabsLipSyncComponent::OnAgentInterrupted() +{ + UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent interrupted — resetting lip sync to neutral.")); + ResetToNeutral(); +} + +void UElevenLabsLipSyncComponent::OnAgentStopped() +{ + // Don't clear text state here — it's already handled by TickComponent's + // "queue runs dry" logic which checks bFullTextReceived. + // Just clear the queues so the mouth returns to neutral immediately. + UE_LOG(LogElevenLabsLipSync, Log, TEXT("Agent stopped speaking — clearing lip sync queues.")); + VisemeQueue.Reset(); + AmplitudeQueue.Reset(); + PlaybackTimer = 0.0f; + bWaitingForText = false; +} + +void UElevenLabsLipSyncComponent::ResetToNeutral() +{ + // Clear all queued viseme and amplitude data + VisemeQueue.Reset(); + AmplitudeQueue.Reset(); + PlaybackTimer = 0.0f; + bWaitingForText = false; + + // Reset text-driven lip sync state for the interrupted utterance + AccumulatedText.Reset(); + TextVisemeSequence.Reset(); + bTextVisemesApplied = false; + bFullTextReceived = false; + + // Snap all visemes to silence immediately (no smoothing delay) + for (const FName& Name : VisemeNames) + { + TargetVisemes.FindOrAdd(Name) = 0.0f; + SmoothedVisemes.FindOrAdd(Name) = 0.0f; + } + TargetVisemes.FindOrAdd(FName("sil")) = 1.0f; + SmoothedVisemes.FindOrAdd(FName("sil")) = 1.0f; + + // Clear blendshapes so the mouth returns to fully neutral + CurrentBlendshapes.Reset(); + PreviousBlendshapes.Reset(); + LastConsumedVisemes.Reset(); +} + // ───────────────────────────────────────────────────────────────────────────── // Audio analysis // 
───────────────────────────────────────────────────────────────────────────── diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp index 65b86f9..e0d32f0 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp @@ -221,54 +221,54 @@ void UElevenLabsWebSocketProxy::OnWsConnected() // } // } - // Configure turn-taking behaviour. - // The ElevenLabs API does NOT have a turn.mode field. - // Turn-taking is controlled by the server's VAD and the turn_* parameters. - // In push-to-talk (Client mode) the user controls the mic; the server still - // uses its VAD to detect the end of speech from the audio chunks it receives. - TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject()); - // turn_timeout: how long the server waits after VAD detects silence before - // processing the user's turn. In push-to-talk (Client) mode this directly adds - // latency to every response — the server waits this many seconds of silence - // after the user releases T before it begins LLM processing. + // Build conversation_config_override matching the C++ ElevenLabs sample as closely + // as possible. The C++ sample sends: { "conversation_config_override": {} } (all defaults). + // Sending empty = server defaults for TTS chunking, latency, and LLM behaviour. + // This produces smooth continuous audio chunks without the fragmentation caused by + // explicit optimize_streaming_latency or enable_intermediate_response overrides. // - // History: - // turn_timeout=1 was originally problematic, but ONLY when combined with - // speculative_turn=true (which has since been removed). 
Without speculative_turn, - // 1s is safe and halves the per-turn latency vs the 3s we had previously. - // Original failure: server silently dropped turns 3+ with speculative_turn+timeout=1. + // In Client (push-to-talk) mode only, we override turn_timeout to reduce latency. + // In Server VAD mode, the config override is empty (matches C++ sample exactly). + TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject()); + if (TurnMode == EElevenLabsTurnMode::Client) { + // turn_timeout: how long the server waits after VAD detects silence before + // processing the user's turn. Default is ~3s. In push-to-talk mode this + // directly adds latency — the server waits after the user releases T. + // 1s is safe without speculative_turn (which was removed — see history below). + // + // History: + // turn_timeout=1 was problematic when combined with speculative_turn=true + // (server silently dropped turns 3+). Without speculative_turn, 1s is safe + // and halves the per-turn latency. + TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject()); TurnObj->SetNumberField(TEXT("turn_timeout"), 1); + + TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject()); + AgentObj->SetObjectField(TEXT("turn"), TurnObj); + + ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj); } - // NOTE: speculative_turn is intentionally NOT sent here. - // With speculative_turn=true the server starts LLM generation speculatively - // before the VAD is fully confident the user finished speaking. Combined with - // the short turn_timeout this put the server's state machine into a state where - // it stopped processing user audio after 2 turns — subsequent turns received - // only pings and no agent_chat_response_part / audio / user_transcript at all. - // Removing it costs ~200-500ms of latency but restores reliable multi-turn - // conversation. Re-enable only if ElevenLabs confirms it is stable. 
- TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject()); - AgentObj->SetObjectField(TEXT("turn"), TurnObj); - - TSharedPtr<FJsonObject> TtsObj = MakeShareable(new FJsonObject()); - TtsObj->SetNumberField(TEXT("optimize_streaming_latency"), 3); - - TSharedPtr<FJsonObject> ConversationConfigOverride = MakeShareable(new FJsonObject()); - ConversationConfigOverride->SetObjectField(TEXT("agent"), AgentObj); - ConversationConfigOverride->SetObjectField(TEXT("tts"), TtsObj); - - // enable_intermediate_response reduces time-to-first-audio by allowing the agent - // to start speaking before it has finished generating the full response. - TSharedPtr<FJsonObject> CustomLlmExtraBody = MakeShareable(new FJsonObject()); - CustomLlmExtraBody->SetBoolField(TEXT("enable_intermediate_response"), true); + // NOTE: We intentionally do NOT send these overrides (matching C++ sample): + // + // - tts.optimize_streaming_latency: Explicitly sending ANY value (even 0) changes + // the TTS chunking behaviour vs server defaults. The C++ sample omits this entirely. + // With value 3: many tiny chunks with 500ms-2s gaps (requires heavy buffering). + // With value 0: fewer larger chunks but ~3s inter-chunk gaps (still causes gaps). + // Server default (omitted): produces smooth continuous audio (no gaps in C++ sample). + // + // - custom_llm_extra_body.enable_intermediate_response: When true, the LLM speaks + // before finishing generation → fragmented audio. When omitted (C++ sample), the + // LLM completes its response first → continuous TTS chunks. + // + // - custom_llm_extra_body (empty object): Even an empty object might override the + // agent's configured custom_llm_extra_body with nothing. Omit entirely. 
TSharedPtr<FJsonObject> InitMsg = MakeShareable(new FJsonObject()); InitMsg->SetStringField(TEXT("type"), ElevenLabsMessageType::ConversationClientData); InitMsg->SetObjectField(TEXT("conversation_config_override"), ConversationConfigOverride); - InitMsg->SetObjectField(TEXT("custom_llm_extra_body"), CustomLlmExtraBody); // NOTE: We bypass SendJsonMessage() here intentionally. // SendJsonMessage() guards on WebSocket->IsConnected(), but OnWsConnected fires @@ -578,6 +578,21 @@ void UElevenLabsWebSocketProxy::HandleAgentChatResponsePart(const TSharedPtr 0) + { + UE_LOG(LogElevenLabsWS, Log, + TEXT("New generation started — resetting LastInterruptEventId (was %d)."), + LastInterruptEventId); + LastInterruptEventId = 0; + } + const double Now = FPlatformTime::Seconds(); const double T = Now - SessionStartTime; const double LatencyFromTurnEnd = UserTurnEndTime > 0.0 ? (Now - UserTurnEndTime) * 1000.0 : 0.0; diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index 1125f94..a1020ca 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -137,14 +137,12 @@ public: bool bEnableAgentPartialResponse = false; /** Pre-buffer delay (ms) before starting audio playback on the first chunk. - * ElevenLabs TTS splits responses into 2-3 audio chunks with gaps between them. - * Pre-buffering delays playback start so the second chunk arrives before the - * first finishes playing, eliminating the audible gap mid-sentence. - * Higher values = fewer gaps but more latency on the first word. - * Set to 0 for immediate playback (may cause mid-sentence pauses). 
*/ + * Delays playback start so early TTS chunks can accumulate, preventing + * mid-sentence pauses when the second chunk hasn't arrived yet. + * Set to 0 for immediate playback. */ UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency", meta = (ClampMin = "0", ClampMax = "4000", - ToolTip = "Pre-buffer delay in ms before starting audio playback.\nAbsorbs gaps between TTS chunks to prevent mid-sentence pauses.\n0 = immediate playback, 250 = balanced, 500+ = test for server-side gaps.")) + ToolTip = "Pre-buffer delay in ms before starting audio playback.\nHigher values reduce mid-sentence pauses but add initial latency.\n0 = immediate playback.")) int32 AudioPreBufferMs = 2000; /** Safety timeout: if the server does not start generating a response within this many seconds after the user stops speaking, fire OnAgentResponseTimeout. Set to 0 to disable. A normal response starts within 0.1-0.8s. */ diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h index abaf230..3fa6747 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsLipSyncComponent.h @@ -103,6 +103,17 @@ private: UFUNCTION() void OnPartialTextReceived(const FString& PartialText); + /** Called when the agent is interrupted — immediately reset lip sync to neutral. */ + UFUNCTION() + void OnAgentInterrupted(); + + /** Called when the agent finishes speaking — reset lip sync state for next utterance. */ + UFUNCTION() + void OnAgentStopped(); + + /** Clear all lip sync queues and reset mouth to neutral pose. */ + void ResetToNeutral(); + /** Convert text to a sequence of OVR viseme names (grapheme-to-phoneme-to-viseme). 
*/ void ConvertTextToVisemes(const FString& Text);