Working !

This commit is contained in:
j.foucher 2026-02-20 08:24:56 +01:00
parent f7f0b0c45b
commit 9f28ed7457
5 changed files with 101 additions and 9 deletions

View File

@ -0,0 +1,8 @@
[FilterPlugin]
; This section lists additional files which will be packaged along with your plugin. Paths should be listed relative to the root plugin directory, and
; may include "...", "*", and "?" wildcards to match directories, files, and individual characters respectively.
;
; Examples:
; /README.txt
; /Extras/...
; /Binaries/ThirdParty/*.dll

View File

@ -86,9 +86,9 @@ void UElevenLabsConversationalAgentComponent::StartConversation()
&UElevenLabsConversationalAgentComponent::HandleInterrupted);
}
// Pass configuration (TurnMode, bSpeculativeTurn) to the proxy before
// connecting, so it sends the correct settings in conversation_initiation_client_data.
WebSocketProxy->TurnMode = TurnMode;
WebSocketProxy->bSpeculativeTurn = bSpeculativeTurn;
WebSocketProxy->Connect(AgentID);
}
@ -242,12 +242,18 @@ void UElevenLabsConversationalAgentComponent::HandleAudioReceived(const TArray<u
void UElevenLabsConversationalAgentComponent::HandleTranscript(const FElevenLabsTranscriptSegment& Segment)
{
	// Forward the user-speech transcript segment to Blueprint/C++ listeners.
	// Gated on bEnableUserTranscript so projects that don't display user
	// transcripts can skip the broadcast entirely.
	// (Fix: the previous text broadcast unconditionally AND again inside the
	// flag check, firing the delegate twice and making the flag ineffective.)
	if (bEnableUserTranscript)
	{
		OnAgentTranscript.Broadcast(Segment);
	}
}
void UElevenLabsConversationalAgentComponent::HandleAgentResponse(const FString& ResponseText)
{
	// Forward the agent's text response to Blueprint/C++ listeners.
	// Gated on bEnableAgentTextResponse so audio-only projects can opt out.
	// (Fix: the previous text broadcast unconditionally AND again inside the
	// flag check, firing the delegate twice and making the flag ineffective.)
	if (bEnableAgentTextResponse)
	{
		OnAgentTextResponse.Broadcast(ResponseText);
	}
}
void UElevenLabsConversationalAgentComponent::HandleInterrupted()

View File

@ -106,6 +106,9 @@ void UElevenLabsWebSocketProxy::SendAudioChunk(const TArray<uint8>& PCMData)
UE_LOG(LogElevenLabsWS, Log, TEXT("SendAudioChunk: %d bytes (PCM int16 LE @ 16kHz mono)"), PCMData.Num());
// Track when the last audio chunk was sent for latency measurement.
LastAudioChunkSentTime = FPlatformTime::Seconds();
// ElevenLabs expects: { "user_audio_chunk": "<base64 PCM>" }
// The server's VAD detects silence to determine end-of-turn.
// Do NOT send user_activity here — it resets the turn timeout timer
@ -143,6 +146,9 @@ void UElevenLabsWebSocketProxy::SendUserTurnEnd()
{
// No explicit "end turn" message exists in the ElevenLabs API.
// The server detects end-of-speech via VAD when we stop sending audio chunks.
UserTurnEndTime = FPlatformTime::Seconds();
bWaitingForResponse = true;
bFirstAudioResponseLogged = false;
UE_LOG(LogElevenLabsWS, Log, TEXT("User turn ended — stopped sending audio chunks. Server VAD will detect silence."));
}
@ -188,7 +194,7 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
// "type": "conversation_initiation_client_data",
// "conversation_config_override": {
// "agent": {
// "turn": { "turn_timeout": 3, "speculative_turn": true }
// },
// "tts": {
// "optimize_streaming_latency": 3
@ -206,10 +212,17 @@ void UElevenLabsWebSocketProxy::OnWsConnected()
// uses its VAD to detect the end of speech from the audio chunks it receives.
TSharedPtr<FJsonObject> TurnObj = MakeShareable(new FJsonObject());
// Lower turn_timeout so the agent responds faster after the user stops speaking.
// Default is 7s, which feels very slow. In push-to-talk (Client mode), the user
// explicitly signals end-of-turn by releasing the key, so we can use a very
// short timeout (1s).
if (TurnMode == EElevenLabsTurnMode::Client)
{
TurnObj->SetNumberField(TEXT("turn_timeout"), 3);
TurnObj->SetNumberField(TEXT("turn_timeout"), 1);
}
// Speculative turn: start LLM generation during silence before the VAD is
// fully confident the user finished speaking. Reduces latency by 200-500ms.
if (bSpeculativeTurn)
{
TurnObj->SetBoolField(TEXT("speculative_turn"), true);
}
TSharedPtr<FJsonObject> AgentObj = MakeShareable(new FJsonObject());
@ -293,14 +306,44 @@ void UElevenLabsWebSocketProxy::OnWsMessage(const FString& Message)
}
else if (MsgType == ElevenLabsMessageType::AudioResponse)
{
// Log time-to-first-audio: latency between end of user turn and first agent audio.
if (bWaitingForResponse && !bFirstAudioResponseLogged)
{
const double Now = FPlatformTime::Seconds();
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
const double LatencyFromLastChunk = (Now - LastAudioChunkSentTime) * 1000.0;
UE_LOG(LogElevenLabsWS, Warning,
TEXT("[LATENCY] Time-to-first-audio: %.0f ms (from turn end), %.0f ms (from last chunk sent)"),
LatencyFromTurnEnd, LatencyFromLastChunk);
bFirstAudioResponseLogged = true;
}
HandleAudioResponse(Root);
}
else if (MsgType == ElevenLabsMessageType::UserTranscript)
{
// Log transcription latency.
if (bWaitingForResponse)
{
const double Now = FPlatformTime::Seconds();
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
UE_LOG(LogElevenLabsWS, Warning,
TEXT("[LATENCY] User transcript received: %.0f ms after turn end"),
LatencyFromTurnEnd);
bWaitingForResponse = false;
}
HandleTranscript(Root);
}
else if (MsgType == ElevenLabsMessageType::AgentResponse)
{
// Log agent text response latency.
if (UserTurnEndTime > 0.0)
{
const double Now = FPlatformTime::Seconds();
const double LatencyFromTurnEnd = (Now - UserTurnEndTime) * 1000.0;
UE_LOG(LogElevenLabsWS, Warning,
TEXT("[LATENCY] Agent text response: %.0f ms after turn end"),
LatencyFromTurnEnd);
}
HandleAgentResponse(Root);
}
else if (MsgType == ElevenLabsMessageType::AgentResponseCorrection)

View File

@ -80,6 +80,29 @@ public:
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs")
bool bAutoStartListening = true;
/**
* Enable speculative turn: the LLM starts generating a response during
* silence before the VAD is fully confident the user has finished speaking.
* Reduces latency by 200-500ms but may occasionally produce premature responses.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Latency")
bool bSpeculativeTurn = true;
/**
* Forward user speech transcripts (user_transcript events) to the
* OnAgentTranscript delegate. Disable to reduce overhead if you don't
* need to display what the user said.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableUserTranscript = true;
/**
* Forward agent text responses (agent_response events) to the
* OnAgentTextResponse delegate. Disable if you only need audio output.
*/
UPROPERTY(EditAnywhere, BlueprintReadWrite, Category = "ElevenLabs|Events")
bool bEnableAgentTextResponse = true;
// ── Events ────────────────────────────────────────────────────────────────
UPROPERTY(BlueprintAssignable, Category = "ElevenLabs|Events")

View File

@ -184,9 +184,21 @@ private:
// ElevenLabs sends JSON as binary frames; large messages arrive in fragments.
TArray<uint8> BinaryFrameBuffer;
// ── Latency tracking ─────────────────────────────────────────────────────
// Timestamp of the last audio chunk sent (user speech).
double LastAudioChunkSentTime = 0.0;
// Timestamp when user turn ended (StopListening).
double UserTurnEndTime = 0.0;
// Whether we are waiting for the first response after user stopped speaking.
bool bWaitingForResponse = false;
// Whether we already logged the first audio response latency for this turn.
bool bFirstAudioResponseLogged = false;
public:
// Set by UElevenLabsConversationalAgentComponent before calling Connect().
// Controls the turn_timeout value sent in conversation_initiation_client_data
// (a short timeout is used in Client/push-to-talk mode; otherwise the server
// default applies).
EElevenLabsTurnMode TurnMode = EElevenLabsTurnMode::Server;
// Speculative turn: start LLM generation during silence before full turn confidence.
bool bSpeculativeTurn = true;
};