Working !

2026-02-20 08:24:56 +01:00
parent f7f0b0c45b
commit 9f28ed7457
5 changed files with 101 additions and 9 deletions
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Config/FilterPlugin.ini
@@ -0,0 +1,8 @@
 [FilterPlugin]
 ; This section lists additional files which will be packaged along with your plugin. Paths should be listed relative to the root plugin directory, and
 ; may include "...", "*", and "?" wildcards to match directories, files, and individual characters respectively.
 ;
 ; Examples:
 ;    /README.txt
 ;    /Extras/...
 ;    /Binaries/ThirdParty/*.dll
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsConversationalAgentComponent.cpp
@@ -86,9 +86,9 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
 			&UElevenLabsConversationalAgentComponent::HandleInterrupted);
 	}
-	// Pass our TurnMode to the proxy so it sends the correct mode in
+	// Pass configuration to the proxy before connecting.
 	// TurnMode and bSpeculativeTurn are read when the proxy builds conversation_initiation_client_data.
 	WebSocketProxy->TurnMode = TurnMode;
 	WebSocketProxy->bSpeculativeTurn = bSpeculativeTurn;
 	WebSocketProxy->Connect(AgentID);
 }
@@ -241,14 +241,20 @@ void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<u
 }
 // Relays a transcript segment to OnAgentTranscript listeners.
 // Does nothing when bEnableUserTranscript is false.
 void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
 {
 	if (!bEnableUserTranscript)
 	{
 		// Transcript forwarding is switched off on this component.
 		return;
 	}

 	OnAgentTranscript.Broadcast(Segment);
 }
 // Relays the agent's text response to OnAgentTextResponse listeners.
 // Does nothing when bEnableAgentTextResponse is false.
 void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
 {
 	if (!bEnableAgentTextResponse)
 	{
 		// Text-response forwarding is switched off on this component.
 		return;
 	}

 	OnAgentTextResponse.Broadcast(ResponseText);
 }
 void UElevenLabsConversationalAgentComponent::HandleInterrupted()
 {
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Private/ElevenLabsWebSocketProxy.cpp
@@ -106,6 +106,9 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
 	UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
 	// Track when the last audio chunk was sent for latency measurement.
 	LastAudioChunkSentTime = FPlatformTime::Seconds();
 	// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
 	// The server's VAD detects silence to determine end-of-turn.
 	// Do NOT send user_activity here — it resets the turn timeout timer
@@ -143,6 +146,9 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
 {
 	// No explicit "end turn" message exists in the ElevenLabs API.
 	// The server detects end-of-speech via VAD when we stop sending audio chunks.
 	// Nothing is transmitted from here; this only updates local latency-tracking state.

 	// Timestamp the end of the user's turn; the [LATENCY] logs in the message
 	// handler are measured relative to this value.
 	UserTurnEndTime = FPlatformTime::Seconds();
 	// Arm the per-turn latency logging: mark that a response is pending and
 	// allow the first-audio latency line to be logged again for this turn.
 	bWaitingForResponse = true;
 	bFirstAudioResponseLogged = false;
 	UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
 }
@@ -188,7 +194,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	//   "type": "conversation_initiation_client_data",
 	//   "conversation_config_override": {
 	//     "agent": {
-	//       "turn": { "turn_timeout": 3 }
+	//       "turn": { "turn_timeout": 1, "speculative_turn": true }
 	//     },
 	//     "tts": {
 	//       "optimize_streaming_latency": 3
@@ -206,10 +212,17 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
 	// uses its VAD to detect the end of speech from the audio chunks it receives.
 	TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
 	// Lower turn_timeout so the agent responds faster after the user stops speaking.
-	// Default is 7s which feels very slow for push-to-talk.
+	// Default is 7s. In push-to-talk (Client mode), the user explicitly signals
 	// end-of-turn by releasing the key, so we can use a very short timeout (1s).
 	if (TurnMode == EElevenLabsTurnMode::Client)
 	{
-		TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
+		TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
 	}
 	// Speculative turn: start LLM generation during silence before the VAD is
 	// fully confident the user finished speaking. Reduces latency by 200-500ms.
 	if (bSpeculativeTurn)
 	{
 		TurnObj->SetBoolField(TEXT("speculative_turn"), true);
 	}
 	TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
@@ -293,14 +306,44 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
 	}
 	else if (MsgType == ElevenLabsMessageType::AudioResponse)
 	{
 		// Log time-to-first-audio: latency between end of user turn and first agent audio.
 		if (bWaitingForResponse && !bFirstAudioResponseLogged)
 		{
 			const double Now = FPlatformTime::Seconds();
 			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
 			const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
 			UE_LOG(LogElevenLabsWS, Warning,
 				TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
 				LatencyFromTurnEnd, LatencyFromLastChunk);
 			bFirstAudioResponseLogged = true;
 		}
 		HandleAudioResponse(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::UserTranscript)
 	{
 		// Log transcription latency.
 		if (bWaitingForResponse)
 		{
 			const double Now = FPlatformTime::Seconds();
 			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
 			UE_LOG(LogElevenLabsWS, Warning,
 				TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
 				LatencyFromTurnEnd);
 			bWaitingForResponse = false;
 		}
 		HandleTranscript(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::AgentResponse)
 	{
 		// Log agent text response latency.
 		if (UserTurnEndTime > 0.0)
 		{
 			const double Now = FPlatformTime::Seconds();
 			const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
 			UE_LOG(LogElevenLabsWS, Warning,
 				TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
 				LatencyFromTurnEnd);
 		}
 		HandleAgentResponse(Root);
 	}
 	else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsConversationalAgentComponent.h
@@ -80,6 +80,29 @@ public:
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
 	bool bAutoStartListening = true;
 	/**
 	 * Enable speculative turn: the LLM starts generating a response during
 	 * silence before the VAD is fully confident the user has finished speaking.
 	 * Reduces latency by 200-500ms but may occasionally produce premature responses.
 	 */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
 	bool bSpeculativeTurn = true;
 	/**
 	 * Forward user speech transcripts (user_transcript events) to the
 	 * OnAgentTranscript delegate. Disable to reduce overhead if you don't
 	 * need to display what the user said.
 	 */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
 	bool bEnableUserTranscript = true;
 	/**
 	 * Forward agent text responses (agent_response events) to the
 	 * OnAgentTextResponse delegate. Disable if you only need audio output.
 	 */
 	UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
 	bool bEnableAgentTextResponse = true;
 	// ── Events ────────────────────────────────────────────────────────────────
 	UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")
--- a/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
+++ b/Unreal/PS_AI_Agent/Plugins/PS_AI_Agent_ElevenLabs/Source/PS_AI_Agent_ElevenLabs/Public/ElevenLabsWebSocketProxy.h
@@ -184,9 +184,21 @@ private:
 	// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
 	TArray<uint8> BinaryFrameBuffer;
 	// ── Latency tracking ─────────────────────────────────────────────────────
 	// Timestamp of the last audio chunk sent (user speech).
 	double LastAudioChunkSentTime = 0.0;
 	// Timestamp when user turn ended (StopListening).
 	double UserTurnEndTime = 0.0;
 	// Whether we are waiting for the first response after user stopped speaking.
 	bool bWaitingForResponse = false;
 	// Whether we already logged the first audio response latency for this turn.
 	bool bFirstAudioResponseLogged = false;
 public:
 	// Set by UElevenLabsConversationalAgentComponent before calling Connect().
-	// Controls the turn mode string sent in conversation_initiation_client_data
+	// Controls turn_timeout in conversation_initiation_client_data.
 	// (user_activity is NOT sent with audio chunks — see SendAudioChunk — because it resets the turn timeout timer.)
 	EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
 	// Speculative turn: start LLM generation during silence before full turn confidence.
 	bool bSpeculativeTurn = true;
 };