diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini new file mode 100644 index 0000000..ccebca2 --- /dev/null +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini @@ -0,0 +1,8 @@ +[FilterPlugin] +; This section lists additional files which will be packaged along with your plugin. Paths should be listed relative to the root plugin directory, and +; may include "...", "*", and "?" wildcards to match directories, files, and individual characters respectively. +; +; Examples: +; /README.txt +; /Extras/... +; /Binaries/ThirdParty/*.dll diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp index 7ca14c9..545d40a 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp @@ -86,9 +86,9 @@ void UElevenLabsConversationalAgentComponent::StartConversation() &UElevenLabsConversationalAgentComponent::HandleInterrupted); } - // Pass our TurnMode to the proxy so it sends the correct mode in - // conversation_initiation_client_data and sends user_activity with each audio chunk. + // Pass configuration to the proxy before connecting. 
WebSocketProxy->TurnMode = TurnMode; + WebSocketProxy->bSpeculativeTurn = bSpeculativeTurn; WebSocketProxy->Connect(AgentID); } @@ -242,12 +242,18 @@ void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<uint8>& PCMData) UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num()); + // Track when the last audio chunk was sent for latency measurement. + LastAudioChunkSentTime = FPlatformTime::Seconds(); + // ElevenLabs expects: { "user_audio_chunk": "<base64>" } // The server's VAD detects silence to determine end-of-turn. // Do NOT send user_activity here — it resets the turn timeout timer @@ -143,6 +146,9 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd() { // No explicit "end turn" message exists in the ElevenLabs API. // The server detects end-of-speech via VAD when we stop sending audio chunks. + UserTurnEndTime = FPlatformTime::Seconds(); + bWaitingForResponse = true; + bFirstAudioResponseLogged = false; UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence.")); } @@ -188,7 +194,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected() // "type": "conversation_initiation_client_data", // "conversation_config_override": { // "agent": { - // "turn": { "turn_timeout": 3 } + // "turn": { "turn_timeout": 3, "speculative_turn": true } // }, // "tts": { // "optimize_streaming_latency": 3 @@ -206,10 +212,17 @@ void UElevenLabsWebSocketProxy::OnWsConnected() // uses its VAD to detect the end of speech from the audio chunks it receives. TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject()); // Lower turn_timeout so the agent responds faster after the user stops speaking. - // Default is 7s which feels very slow for push-to-talk. + // Default is 7s. In push-to-talk (Client mode), the user explicitly signals + // end-of-turn by releasing the key, so we can use a very short timeout (1s). 
if (TurnMode == EElevenLabsTurnMode::Client) { - TurnObj->SetNumberField(TEXT("turn_timeout"), 3); + TurnObj->SetNumberField(TEXT("turn_timeout"), 1); + } + // Speculative turn: start LLM generation during silence before the VAD is + // fully confident the user finished speaking. Reduces latency by 200-500ms. + if (bSpeculativeTurn) + { + TurnObj->SetBoolField(TEXT("speculative_turn"), true); + } TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject()); @@ -293,14 +306,44 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message) } else if (MsgType == ElevenLabsMessageType::AudioResponse) { + // Log time-to-first-audio: latency between end of user turn and first agent audio. + if (bWaitingForResponse && !bFirstAudioResponseLogged) + { + const double Now = FPlatformTime::Seconds(); + const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0; + const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0; + UE_LOG(LogElevenLabsWS, Warning, + TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"), + LatencyFromTurnEnd, LatencyFromLastChunk); + bFirstAudioResponseLogged = true; + } HandleAudioResponse(Root); } else if (MsgType == ElevenLabsMessageType::UserTranscript) { + // Log transcription latency. + if (bWaitingForResponse) + { + const double Now = FPlatformTime::Seconds(); + const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0; + UE_LOG(LogElevenLabsWS, Warning, + TEXT("[LATENCY] User transcript received: %.0f ms after turn end"), + LatencyFromTurnEnd); + bWaitingForResponse = false; + } HandleTranscript(Root); } else if (MsgType == ElevenLabsMessageType::AgentResponse) { + // Log agent text response latency. 
+ if (UserTurnEndTime > 0.0) + { + const double Now = FPlatformTime::Seconds(); + const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0; + UE_LOG(LogElevenLabsWS, Warning, + TEXT("[LATENCY] Agent text response: %.0f ms after turn end"), + LatencyFromTurnEnd); + } HandleAgentResponse(Root); } else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection) diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h index de581e5..f972f94 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h @@ -80,6 +80,29 @@ public: UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs") bool bAutoStartListening = true; + /** + * Enable speculative turn: the LLM starts generating a response during + * silence before the VAD is fully confident the user has finished speaking. + * Reduces latency by 200-500ms but may occasionally produce premature responses. + */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency") + bool bSpeculativeTurn = true; + + /** + * Forward user speech transcripts (user_transcript events) to the + * OnAgentTranscript delegate. Disable to reduce overhead if you don't + * need to display what the user said. + */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events") + bool bEnableUserTranscript = true; + + /** + * Forward agent text responses (agent_response events) to the + * OnAgentTextResponse delegate. Disable if you only need audio output. 
+ */ + UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events") + bool bEnableAgentTextResponse = true; + // ── Events ──────────────────────────────────────────────────────────────── UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events") diff --git a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h index ce1f97f..13fb2e8 100644 --- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h +++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h @@ -184,9 +184,21 @@ private: // ElevenLabs sends JSON as binary frames; large messages arrive in fragments. TArray<uint8> BinaryFrameBuffer; + // ── Latency tracking ───────────────────────────────────────────────────── + // Timestamp of the last audio chunk sent (user speech). + double LastAudioChunkSentTime = 0.0; + // Timestamp when user turn ended (StopListening). + double UserTurnEndTime = 0.0; + // Whether we are waiting for the first response after user stopped speaking. + bool bWaitingForResponse = false; + // Whether we already logged the first audio response latency for this turn. + bool bFirstAudioResponseLogged = false; + public: // Set by UElevenLabsConversationalAgentComponent before calling Connect(). - // Controls the turn mode string sent in conversation_initiation_client_data - // AND whether user_activity is sent automatically with each audio chunk. + // Controls turn_timeout in conversation_initiation_client_data. EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server; + + // Speculative turn: start LLM generation during silence before full turn confidence. + bool bSpeculativeTurn = true; };