Working!

2026-02-20 08:24:56 +01:00
parent f7f0b0c45b
commit 9f28ed7457
5 changed files with 101 additions and 9 deletions
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini
@@ -0,0 +1,8 @@
+[FilterPlugin]
+; This section lists additional files which will be packaged along with your plugin. Paths should be listed relative to the root plugin directory, and
+; may include "...", "*", and "?" wildcards to match directories, files, and individual characters respectively.
+;
+; Examples:
+;    /README.txt
+;    /Extras/...
+;    /Binaries/ThirdParty/*.dll
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -86,9 +86,9 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
 			&UElevenLabsConversationalAgentComponent::HandleInterrupted);
 	}

-	// Pass our TurnMode to the proxy so it sends the correct mode in
-	// conversation_initiation_client_data and sends user_activity with each audio chunk.
+	// Pass configuration to the proxy before connecting.
 	WebSocketProxy->TurnMode = TurnMode;
+	WebSocketProxy->bSpeculativeTurn = bSpeculativeTurn;

 	WebSocketProxy->Connect(AgentID);
 }
@@ -242,12 +242,18 @@ void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<u

 void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
 {
-	OnAgentTranscript.Broadcast(Segment);
+	if (bEnableUserTranscript)
+	{
+		OnAgentTranscript.Broadcast(Segment);
+	}
 }

 void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
 {
-	OnAgentTextResponse.Broadcast(ResponseText);
+	if (bEnableAgentTextResponse)
+	{
+		OnAgentTextResponse.Broadcast(ResponseText);
+	}
 }

 void UElevenLabsConversationalAgentComponent::HandleInterrupted()
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -106,6 +106,9 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)

 	UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());

+	// Track when the last audio chunk was sent for latency measurement.
+	LastAudioChunkSentTime = FPlatformTime::Seconds();
+
 	// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
 	// The server's VAD detects silence to determine end-of-turn.
 	// Do NOT send user_activity here — it resets the turn timeout timer
@@ -143,6 +146,9 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
 {
 	// No explicit "end turn" message exists in the ElevenLabs API.
 	// The server detects end-of-speech via VAD when we stop sending audio chunks.
+	UserTurnEndTime = FPlatformTime::Seconds();
+	bWaitingForResponse = true;
+	bFirstAudioResponseLogged = false;
 	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
 }

@@ -188,7 +194,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	//   "type": "conversation_initiation_client_data",
 	//   "conversation_config_override": {
 	//     "agent": {
-	//       "turn": { "turn_timeout": 3 }
+	//       "turn": { "turn_timeout": 3, "speculative_turn": true }
 	//     },
 	//     "tts": {
 	//       "optimize_streaming_latency": 3
@@ -206,10 +212,17 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	// uses its VAD to detect the end of speech from the audio chunks it receives.
 	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
 	// Lower turn_timeout so the agent responds faster after the user stops speaking.
-	// Default is 7s which feels very slow for push-to-talk.
+	// Default is 7s. In push-to-talk (Client mode), the user explicitly signals
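+	// end-of-turn by releasing the key, so we can use a very short timeout (1s).
+	// NOTE(review): confirm against the ElevenLabs docs that turn_timeout governs
+	// end-of-turn latency rather than how long the agent waits before re-engaging
+	// a silent user; if it is the latter, 1s may cause premature agent prompts.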
 	if (TurnMode == EElevenLabsTurnMode::Client)
 	{
-		TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
+		TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
+	}
+	// Speculative turn: start LLM generation during silence before the VAD is
+	// fully confident the user finished speaking. Reduces latency by 200-500ms.
+	if (bSpeculativeTurn)
+	{
+		TurnObj->SetBoolField(TEXT("speculative_turn"), true);
 	}

 	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
@@ -293,14 +306,44 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 	}
 	else if (MsgType == ElevenLabsMessageType::AudioResponse)
 	{
+		// Log time-to-first-audio: latency between end of user turn and first agent audio.
+		if (bWaitingForResponse && !bFirstAudioResponseLogged)
+		{
+			const double Now = FPlatformTime::Seconds();
+			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
+			const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
+			UE_LOG(LogElevenLabsWS, Warning,
+				TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
+				LatencyFromTurnEnd, LatencyFromLastChunk);
+			bFirstAudioResponseLogged = true;
+		}
 		HandleAudioResponse(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::UserTranscript)
 	{
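+		// Log transcription latency once per turn: the first user transcript handled
+		// after SendUserTurnEnd() logs the delay and clears bWaitingForResponse.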
+		if (bWaitingForResponse)
+		{
+			const double Now = FPlatformTime::Seconds();
+			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
+			UE_LOG(LogElevenLabsWS, Warning,
+				TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
+				LatencyFromTurnEnd);
+			bWaitingForResponse = false;
+		}
 		HandleTranscript(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::AgentResponse)
 	{
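+		// Log agent text response latency, measured from the most recent
+		// SendUserTurnEnd(); skipped until a user turn has ended at least once
+		// (UserTurnEndTime is still 0.0).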
+		if (UserTurnEndTime > 0.0)
+		{
+			const double Now = FPlatformTime::Seconds();
+			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
+			UE_LOG(LogElevenLabsWS, Warning,
+				TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
+				LatencyFromTurnEnd);
+		}
 		HandleAgentResponse(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -80,6 +80,29 @@ public:
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
 	bool bAutoStartListening = true;

+	/**
+	 * Enable speculative turn: the LLM starts generating a response during
+	 * silence before the VAD is fully confident the user has finished speaking.
+	 * Reduces latency by 200-500ms but may occasionally produce premature responses.
+	 */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
+	bool bSpeculativeTurn = true;
+
+	/**
+	 * Forward user speech transcripts (user_transcript events) to the
+	 * OnAgentTranscript delegate. Disable to reduce overhead if you don't
+	 * need to display what the user said.
+	 */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
+	bool bEnableUserTranscript = true;
+
+	/**
+	 * Forward agent text responses (agent_response events) to the
+	 * OnAgentTextResponse delegate. Disable if you only need audio output.
+	 */
+	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
+	bool bEnableAgentTextResponse = true;
+
 	// ── Events ────────────────────────────────────────────────────────────────

 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -184,9 +184,21 @@ private:
 	// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
 	TArray<uint8> BinaryFrameBuffer;

+	// ── Latency tracking ─────────────────────────────────────────────────────
+	// Timestamp of the last audio chunk sent (user speech).
+	double LastAudioChunkSentTime = 0.0;
+	// Timestamp (FPlatformTime::Seconds()) recorded by SendUserTurnEnd() when the user turn ends (StopListening).
+	double UserTurnEndTime = 0.0;
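+	// Set by SendUserTurnEnd(); cleared when the first user transcript of the turn is handled.
+	// NOTE(review): the time-to-first-audio log also requires this flag, so that log is
+	// skipped whenever the user transcript is handled before the first audio response.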
+	bool bWaitingForResponse = false;
+	// Whether we already logged the first audio response latency for this turn.
+	bool bFirstAudioResponseLogged = false;
+
 public:
 	// Set by UElevenLabsConversationalAgentComponent before calling Connect().
-	// Controls the turn mode string sent in conversation_initiation_client_data
-	// AND whether user_activity is sent automatically with each audio chunk.
+	// Controls turn_timeout in conversation_initiation_client_data.
 	EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
+
+	// Speculative turn: start LLM generation during silence before full turn confidence.
+	bool bSpeculativeTurn = true;
 };